提交 9bc05a38 authored 作者: Tim Cooijmans's avatar Tim Cooijmans 提交者: Reyhane Askari

define and use with_stack_trace

上级 592e7c75
...@@ -2948,6 +2948,34 @@ def copy_stack_trace(from_var, to_var): ...@@ -2948,6 +2948,34 @@ def copy_stack_trace(from_var, to_var):
to_var.tag.trace = getattr(to_var.tag, 'trace', []) + tr to_var.tag.trace = getattr(to_var.tag, 'trace', []) + tr
def with_stack_trace(from_var, to_var):
    """
    Copies the stack trace from one or more tensor variables to
    one or more tensor variables and returns the destination variables.

    Parameters
    ----------
    from_var
        Tensor variable or list of tensor variables to copy stack traces from.
    to_var
        Tensor variable or list of tensor variables to copy stack traces to.

    Returns
    -------
    tensor variable or list of tensor variables
        `to_var`, augmented with the stack traces from `from_var`.

    Notes
    -----
    The stacktrace is assumed to be of the form of a list of lists
    of tuples. Each tuple contains the filename, line number, function name
    and so on. Each list of tuples contains the tuples belonging to a
    particular variable.
    """
    # Delegate the actual tag.trace merging to copy_stack_trace; returning
    # to_var lets callers wrap an expression inline, e.g.
    # `return with_stack_trace(old, new_expr)`.
    copy_stack_trace(from_var, to_var)
    return to_var
def check_stack_trace(f_or_fgraph, ops_to_check='last', bug_print='raise'): def check_stack_trace(f_or_fgraph, ops_to_check='last', bug_print='raise'):
""" """
This function checks if the outputs of specific ops of a compiled graph This function checks if the outputs of specific ops of a compiled graph
......
...@@ -15,6 +15,7 @@ from theano.tensor.basic import ( ...@@ -15,6 +15,7 @@ from theano.tensor.basic import (
from theano.gof import HideC, COp, ParamsType from theano.gof import HideC, COp, ParamsType
from theano.gof.utils import MethodNotDefined from theano.gof.utils import MethodNotDefined
from theano.gof.opt import with_stack_trace
from collections import deque from collections import deque
...@@ -75,11 +76,11 @@ def as_gpuarray_variable(x, context_name): ...@@ -75,11 +76,11 @@ def as_gpuarray_variable(x, context_name):
# If we couldn't deal with transfers, then maybe it's a tensor # If we couldn't deal with transfers, then maybe it's a tensor
if isinstance(x.type, tensor.TensorType): if isinstance(x.type, tensor.TensorType):
return GpuFromHost(context_name)(x) return with_stack_trace(x, GpuFromHost(context_name)(x))
# Try _as_GpuArrayVariable if possible # Try _as_GpuArrayVariable if possible
if hasattr(x, '_as_GpuArrayVariable'): if hasattr(x, '_as_GpuArrayVariable'):
return x._as_GpuArrayVariable(context_name) return with_stack_trace(x, x._as_GpuArrayVariable(context_name))
# If it didn't work try for a constant # If it didn't work try for a constant
ctx = get_context(context_name) ctx = get_context(context_name)
...@@ -88,13 +89,13 @@ def as_gpuarray_variable(x, context_name): ...@@ -88,13 +89,13 @@ def as_gpuarray_variable(x, context_name):
if x.context.ptr != ctx.ptr: if x.context.ptr != ctx.ptr:
x = x.transfer(ctx) x = x.transfer(ctx)
x = gpuarray.asarray(x, context=ctx) x = with_stack_trace(x, gpuarray.asarray(x, context=ctx))
bcast = [(s == 1) for s in x.shape] bcast = [(s == 1) for s in x.shape]
return GpuArrayConstant(GpuArrayType(dtype=x.dtype, return with_stack_trace(x, GpuArrayConstant(GpuArrayType(dtype=x.dtype,
broadcastable=bcast, broadcastable=bcast,
context_name=context_name), context_name=context_name),
x) x))
def infer_context_name(*vars): def infer_context_name(*vars):
......
...@@ -15,7 +15,7 @@ from theano.compile.ops import shape_i ...@@ -15,7 +15,7 @@ from theano.compile.ops import shape_i
from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer, from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
LocalGroupDB, LocalGroupDB,
SequenceDB, Optimizer, DB, toolbox, graph) SequenceDB, Optimizer, DB, toolbox, graph)
from theano.gof.opt import LocalMetaOptimizer, copy_stack_trace from theano.gof.opt import LocalMetaOptimizer, copy_stack_trace, with_stack_trace
from theano.ifelse import IfElse from theano.ifelse import IfElse
from theano.misc.ordered_set import OrderedSet from theano.misc.ordered_set import OrderedSet
...@@ -421,6 +421,8 @@ class GraphToGPU(Optimizer): ...@@ -421,6 +421,8 @@ class GraphToGPU(Optimizer):
if isinstance(new_ops, theano.Op): if isinstance(new_ops, theano.Op):
outputs = new_ops(*[mapping[i] for i in node.inputs], return_list=True) outputs = new_ops(*[mapping[i] for i in node.inputs], return_list=True)
for old_output, new_output in zip(node.outputs, outputs):
copy_stack_trace(old_output, new_output)
elif not new_ops: elif not new_ops:
newnode = node.clone_with_new_inputs([mapping.get(i) for i in node.inputs]) newnode = node.clone_with_new_inputs([mapping.get(i) for i in node.inputs])
outputs = newnode.outputs outputs = newnode.outputs
...@@ -461,7 +463,7 @@ class GraphToGPU(Optimizer): ...@@ -461,7 +463,7 @@ class GraphToGPU(Optimizer):
new_o.owner.inputs[0].type == o.type): new_o.owner.inputs[0].type == o.type):
new_o = new_o.owner.inputs[0] new_o = new_o.owner.inputs[0]
else: else:
new_o = safe_to_cpu(new_o) new_o = with_stack_trace(o, safe_to_cpu(new_o))
new_nodes.append(new_o) new_nodes.append(new_o)
fgraph.replace_all_validate(zip(fgraph.outputs, new_nodes), fgraph.replace_all_validate(zip(fgraph.outputs, new_nodes),
reason=self.__class__.__name__) reason=self.__class__.__name__)
...@@ -692,8 +694,6 @@ def local_gpu_contiguous_gpu_contiguous(node): ...@@ -692,8 +694,6 @@ def local_gpu_contiguous_gpu_contiguous(node):
if isinstance(node.op, GpuContiguous): if isinstance(node.op, GpuContiguous):
inp = node.inputs[0] inp = node.inputs[0]
if inp.owner and isinstance(inp.owner.op, GpuContiguous): if inp.owner and isinstance(inp.owner.op, GpuContiguous):
if not getattr(inp.tag, 'trace', None):
copy_stack_trace(node.outputs[0], inp)
return [inp] return [inp]
...@@ -1220,7 +1220,7 @@ def local_gpua_careduce(op, context_name, inputs, outputs): ...@@ -1220,7 +1220,7 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
op.scalar_op, axis=op.axis, op.scalar_op, axis=op.axis,
dtype=odtype, dtype=odtype,
acc_dtype=adtype) acc_dtype=adtype)
gvar = greduce(x) gvar = with_stack_trace(outputs, greduce(x))
# We need to have the make node called, otherwise the mask can # We need to have the make node called, otherwise the mask can
# be None # be None
if (op2 is GpuCAReduceCPY or if (op2 is GpuCAReduceCPY or
...@@ -1260,22 +1260,27 @@ def local_gpua_careduce(op, context_name, inputs, outputs): ...@@ -1260,22 +1260,27 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
dtype=getattr(op, 'dtype', outputs[0].dtype), dtype=getattr(op, 'dtype', outputs[0].dtype),
acc_dtype=getattr(op, 'acc_dtype', None)) acc_dtype=getattr(op, 'acc_dtype', None))
reshaped_x = x.reshape(tensor.stack(new_in_shp)) reshaped_x = with_stack_trace(
gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name) outputs, x.reshape(tensor.stack(new_in_shp)))
gvar = greduce(gpu_reshaped_x) gpu_reshaped_x = with_stack_trace(
outputs, as_gpuarray_variable(reshaped_x, context_name))
gvar = with_stack_trace(outputs, greduce(gpu_reshaped_x))
# We need to have the make node called, otherwise the mask can # We need to have the make node called, otherwise the mask can
# be None # be None
reshaped_gpu_inputs = [gpu_reshaped_x] reshaped_gpu_inputs = [gpu_reshaped_x]
if greduce.supports_c_code(reshaped_gpu_inputs): if greduce.supports_c_code(reshaped_gpu_inputs):
reduce_reshaped_x = greduce(gpu_reshaped_x) reduce_reshaped_x = with_stack_trace(
outputs, greduce(gpu_reshaped_x))
if reduce_reshaped_x.ndim != outputs[0].ndim: if reduce_reshaped_x.ndim != outputs[0].ndim:
out_shp = [] out_shp = []
for i in range(x.ndim): for i in range(x.ndim):
if i not in op.axis: if i not in op.axis:
out_shp.append(shape_i(x, i)) out_shp.append(shape_i(x, i))
unreshaped_reduce = GpuReshape(len(out_shp))(reduce_reshaped_x, unreshaped_reduce = with_stack_trace(
tensor.stack(out_shp)) outputs, GpuReshape(len(out_shp))(
reduce_reshaped_x,
tensor.stack(out_shp)))
else: else:
unreshaped_reduce = reduce_reshaped_x unreshaped_reduce = reduce_reshaped_x
return [unreshaped_reduce] return [unreshaped_reduce]
...@@ -2398,7 +2403,8 @@ def local_gpu_elemwise_careduce(node): ...@@ -2398,7 +2403,8 @@ def local_gpu_elemwise_careduce(node):
props = node.op._props_dict() props = node.op._props_dict()
props["pre_scalar_op"] = scalar.basic.sqr props["pre_scalar_op"] = scalar.basic.sqr
out = GpuCAReduceCuda(**props)(inp) out = GpuCAReduceCuda(**props)(inp)
return [out] return with_stack_trace(
node.outputs, out)
@local_optimizer(None) @local_optimizer(None)
......
...@@ -14,7 +14,7 @@ import theano.gpuarray ...@@ -14,7 +14,7 @@ import theano.gpuarray
from .. import basic_ops from .. import basic_ops
from ..type import GpuArrayType, gpuarray_shared_constructor, get_context from ..type import GpuArrayType, gpuarray_shared_constructor, get_context
from ..basic_ops import ( from ..basic_ops import (
GpuAlloc, GpuAllocEmpty, GpuReshape, GpuFromHost, host_from_gpu) GpuAlloc, GpuAllocEmpty, GpuReshape, GpuFromHost, HostFromGpu, host_from_gpu)
from ..blas import GpuGemm from ..blas import GpuGemm
from ..elemwise import ( from ..elemwise import (
GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise, Elemwise, max_inputs_to_GpuElemwise) GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise, Elemwise, max_inputs_to_GpuElemwise)
...@@ -28,6 +28,16 @@ from theano.tensor.nnet import abstract_conv ...@@ -28,6 +28,16 @@ from theano.tensor.nnet import abstract_conv
from theano.gpuarray import dnn, blas from theano.gpuarray import dnn, blas
def _check_stack_trace(thing):
    """
    Run check_stack_trace on `thing`, skipping ops that are not
    expected to carry stack traces in these tests (shape inference,
    ifelse, host/GPU transfers and elemwise).
    """
    # Ops excluded from the stack-trace check.
    skipped_op_types = (theano.compile.ops.Shape_i,
                        theano.ifelse.IfElse,
                        GpuFromHost, HostFromGpu,
                        GpuElemwise)

    def _ops_to_check(node_or_op):
        # Accept either an Op or an Apply node; unwrap the node's op.
        op = (node_or_op if isinstance(node_or_op, theano.gof.Op)
              else node_or_op.op)
        return not isinstance(op, skipped_op_types)

    return check_stack_trace(thing, ops_to_check=_ops_to_check)
def test_local_assert(): def test_local_assert():
x = theano.tensor.fmatrix() x = theano.tensor.fmatrix()
a = theano.tensor.opt.assert_op(x, theano.tensor.eq(x, 0).any()) a = theano.tensor.opt.assert_op(x, theano.tensor.eq(x, 0).any())
...@@ -71,8 +81,8 @@ def test_local_gpu_contiguous_gpu_contiguous(): ...@@ -71,8 +81,8 @@ def test_local_gpu_contiguous_gpu_contiguous():
if isinstance(node.op, basic_ops.GpuContiguous)]) if isinstance(node.op, basic_ops.GpuContiguous)])
assert 1 == len([node for node in f2.maker.fgraph.toposort() assert 1 == len([node for node in f2.maker.fgraph.toposort()
if isinstance(node.op, basic_ops.GpuContiguous)]) if isinstance(node.op, basic_ops.GpuContiguous)])
assert check_stack_trace(f1, ops_to_check='all') assert _check_stack_trace(f1)
assert check_stack_trace(f2, ops_to_check='all') assert _check_stack_trace(f2)
def test_local_gpu_contiguous(): def test_local_gpu_contiguous():
...@@ -82,7 +92,7 @@ def test_local_gpu_contiguous(): ...@@ -82,7 +92,7 @@ def test_local_gpu_contiguous():
assert 1 == len([node for node in f.maker.fgraph.toposort() assert 1 == len([node for node in f.maker.fgraph.toposort()
if isinstance(node.op, basic_ops.GpuContiguous)]) if isinstance(node.op, basic_ops.GpuContiguous)])
f([[2.]]) f([[2.]])
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
def test_flatten(): def test_flatten():
...@@ -100,7 +110,7 @@ def test_flatten(): ...@@ -100,7 +110,7 @@ def test_flatten():
assert res.shape == val.flatten().shape assert res.shape == val.flatten().shape
assert GpuReshape in [type(node.op) assert GpuReshape in [type(node.op)
for node in f.maker.fgraph.toposort()] for node in f.maker.fgraph.toposort()]
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
f = theano.function([m], m.flatten(ndim=2), f = theano.function([m], m.flatten(ndim=2),
mode=mode_with_gpu.excluding("local_useless_reshape")) mode=mode_with_gpu.excluding("local_useless_reshape"))
...@@ -110,7 +120,7 @@ def test_flatten(): ...@@ -110,7 +120,7 @@ def test_flatten():
assert res.shape == val.shape assert res.shape == val.shape
assert GpuReshape in [type(node.op) assert GpuReshape in [type(node.op)
for node in f.maker.fgraph.toposort()] for node in f.maker.fgraph.toposort()]
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
m = theano.tensor.tensor3() m = theano.tensor.tensor3()
f = theano.function([m], m.flatten(ndim=2), mode=mode_with_gpu) f = theano.function([m], m.flatten(ndim=2), mode=mode_with_gpu)
...@@ -120,7 +130,7 @@ def test_flatten(): ...@@ -120,7 +130,7 @@ def test_flatten():
assert res.shape == val.reshape(10, -1).shape assert res.shape == val.reshape(10, -1).shape
assert GpuReshape in [type(node.op) assert GpuReshape in [type(node.op)
for node in f.maker.fgraph.toposort()] for node in f.maker.fgraph.toposort()]
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
def test_reduce(): def test_reduce():
...@@ -133,7 +143,7 @@ def test_reduce(): ...@@ -133,7 +143,7 @@ def test_reduce():
f = theano.function([m], getattr(m, method)(axis=0, f = theano.function([m], getattr(m, method)(axis=0,
**param), **param),
mode=mode_with_gpu) mode=mode_with_gpu)
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
val = np.random.rand(10, 11).astype("float32") val = np.random.rand(10, 11).astype("float32")
res = f(val) res = f(val)
utt.assert_allclose(res, getattr(val, method)(axis=0)) utt.assert_allclose(res, getattr(val, method)(axis=0))
...@@ -165,7 +175,7 @@ def test_local_gpualloc_memset_0(): ...@@ -165,7 +175,7 @@ def test_local_gpualloc_memset_0():
assert len(topo) == 1 assert len(topo) == 1
assert isinstance(topo[0].op, theano.tensor.Alloc) assert isinstance(topo[0].op, theano.tensor.Alloc)
assert (np.asarray(f(6)) == 0).all() assert (np.asarray(f(6)) == 0).all()
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
# Test with 0 from CPU op. # Test with 0 from CPU op.
# Should be transfered as it is used by another op. # Should be transfered as it is used by another op.
...@@ -175,7 +185,7 @@ def test_local_gpualloc_memset_0(): ...@@ -175,7 +185,7 @@ def test_local_gpualloc_memset_0():
assert len(topo) == 3 assert len(topo) == 3
assert isinstance(topo[0].op, GpuAlloc) assert isinstance(topo[0].op, GpuAlloc)
assert (np.asarray(f(6)) == 0).all() assert (np.asarray(f(6)) == 0).all()
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
# Test with 0 # Test with 0
a = GpuAlloc(test_ctx_name)(z, i) a = GpuAlloc(test_ctx_name)(z, i)
...@@ -184,7 +194,7 @@ def test_local_gpualloc_memset_0(): ...@@ -184,7 +194,7 @@ def test_local_gpualloc_memset_0():
assert len(topo) == 1 assert len(topo) == 1
assert isinstance(topo[0].op, GpuAlloc) and topo[0].op.memset_0 assert isinstance(topo[0].op, GpuAlloc) and topo[0].op.memset_0
assert (np.asarray(f(6)) == 0).all() assert (np.asarray(f(6)) == 0).all()
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
# Test with 1 # Test with 1
a = GpuAlloc(test_ctx_name)(o, i) a = GpuAlloc(test_ctx_name)(o, i)
...@@ -194,7 +204,7 @@ def test_local_gpualloc_memset_0(): ...@@ -194,7 +204,7 @@ def test_local_gpualloc_memset_0():
assert isinstance(topo[0].op, GpuAlloc) assert isinstance(topo[0].op, GpuAlloc)
assert not topo[0].op.memset_0 assert not topo[0].op.memset_0
assert (np.asarray(f(6)) == 1).all() assert (np.asarray(f(6)) == 1).all()
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
# Test with 1, 1 # Test with 1, 1
a = GpuAlloc(test_ctx_name)(ones, i) a = GpuAlloc(test_ctx_name)(ones, i)
...@@ -204,7 +214,7 @@ def test_local_gpualloc_memset_0(): ...@@ -204,7 +214,7 @@ def test_local_gpualloc_memset_0():
assert isinstance(topo[0].op, GpuAlloc) assert isinstance(topo[0].op, GpuAlloc)
assert not topo[0].op.memset_0 assert not topo[0].op.memset_0
assert (np.asarray(f(2)) == 1).all() assert (np.asarray(f(2)) == 1).all()
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
def test_local_gpualloc_empty(): def test_local_gpualloc_empty():
...@@ -220,7 +230,7 @@ def test_local_gpualloc_empty(): ...@@ -220,7 +230,7 @@ def test_local_gpualloc_empty():
assert isinstance(topo[0].op, theano.tensor.AllocEmpty) assert isinstance(topo[0].op, theano.tensor.AllocEmpty)
# This return not initilized data, so we can only check the shape # This return not initilized data, so we can only check the shape
assert f(3).shape == (3,) assert f(3).shape == (3,)
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
# Test with vector # Test with vector
# Should be moved # Should be moved
...@@ -231,7 +241,7 @@ def test_local_gpualloc_empty(): ...@@ -231,7 +241,7 @@ def test_local_gpualloc_empty():
assert isinstance(topo[0].op, GpuAllocEmpty) assert isinstance(topo[0].op, GpuAllocEmpty)
# This return not initilized data, so we can only check the shape # This return not initilized data, so we can only check the shape
assert f(3).shape == (3,) assert f(3).shape == (3,)
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
# Test with matrix # Test with matrix
a = tensor.AllocEmpty('float32')(i, ii) a = tensor.AllocEmpty('float32')(i, ii)
...@@ -241,7 +251,7 @@ def test_local_gpualloc_empty(): ...@@ -241,7 +251,7 @@ def test_local_gpualloc_empty():
assert isinstance(topo[0].op, GpuAllocEmpty) assert isinstance(topo[0].op, GpuAllocEmpty)
# This return not initilized data, so we can only check the shape # This return not initilized data, so we can only check the shape
assert f(3, 4).shape == (3, 4) assert f(3, 4).shape == (3, 4)
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
def test_rebroadcast(): def test_rebroadcast():
...@@ -259,7 +269,7 @@ def test_rebroadcast(): ...@@ -259,7 +269,7 @@ def test_rebroadcast():
assert isinstance(rebr.inputs[0].type, GpuArrayType) assert isinstance(rebr.inputs[0].type, GpuArrayType)
assert isinstance(rebr.outputs[0].type, GpuArrayType) assert isinstance(rebr.outputs[0].type, GpuArrayType)
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
class TestSpecifyShape(test_basic.TestSpecifyShape): class TestSpecifyShape(test_basic.TestSpecifyShape):
mode = mode_with_gpu mode = mode_with_gpu
...@@ -284,7 +294,7 @@ class test_gpu_ifelse(test_ifelse.test_ifelse): ...@@ -284,7 +294,7 @@ class test_gpu_ifelse(test_ifelse.test_ifelse):
theano.ifelse.ifelse(cond, x.mean(), x.sum()), theano.ifelse.ifelse(cond, x.mean(), x.sum()),
mode=mode_with_gpu) mode=mode_with_gpu)
assert f(np.float32([1, 2, 3]), 0) == 6 assert f(np.float32([1, 2, 3]), 0) == 6
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
x = tensor.vector() x = tensor.vector()
cond = tensor.scalar() cond = tensor.scalar()
...@@ -292,7 +302,7 @@ class test_gpu_ifelse(test_ifelse.test_ifelse): ...@@ -292,7 +302,7 @@ class test_gpu_ifelse(test_ifelse.test_ifelse):
theano.ifelse.ifelse(cond, x.mean(), x.sum()), theano.ifelse.ifelse(cond, x.mean(), x.sum()),
mode=mode_with_gpu) mode=mode_with_gpu)
assert f(np.float32([1, 2, 3]), 0) == 6 assert f(np.float32([1, 2, 3]), 0) == 6
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
def test_lifter_with_shared_var(self): def test_lifter_with_shared_var(self):
x = tensor.lscalar('x') x = tensor.lscalar('x')
...@@ -315,7 +325,7 @@ def test_print_op(): ...@@ -315,7 +325,7 @@ def test_print_op():
assert isinstance(topo[1].op, theano.printing.Print) assert isinstance(topo[1].op, theano.printing.Print)
assert isinstance(topo[2].op, GpuElemwise) assert isinstance(topo[2].op, GpuElemwise)
assert topo[3].op == host_from_gpu assert topo[3].op == host_from_gpu
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
f(np.random.random((5, 5)).astype('float32')) f(np.random.random((5, 5)).astype('float32'))
...@@ -336,7 +346,7 @@ def test_pdbbreakpoint_op(): ...@@ -336,7 +346,7 @@ def test_pdbbreakpoint_op():
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert isinstance(topo[-2].op, GpuElemwise) assert isinstance(topo[-2].op, GpuElemwise)
assert topo[-1].op == host_from_gpu assert topo[-1].op == host_from_gpu
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
def test_local_gpu_elemwise_careduce(): def test_local_gpu_elemwise_careduce():
...@@ -346,7 +356,7 @@ def test_local_gpu_elemwise_careduce(): ...@@ -346,7 +356,7 @@ def test_local_gpu_elemwise_careduce():
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert len(topo) == 3 assert len(topo) == 3
assert topo[1].op.pre_scalar_op == theano.scalar.sqr assert topo[1].op.pre_scalar_op == theano.scalar.sqr
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
data = np.random.rand(3, 4).astype(theano.config.floatX) data = np.random.rand(3, 4).astype(theano.config.floatX)
utt.assert_allclose(f(data), (data * data).sum()) utt.assert_allclose(f(data), (data * data).sum())
...@@ -355,7 +365,7 @@ def test_local_gpu_elemwise_careduce(): ...@@ -355,7 +365,7 @@ def test_local_gpu_elemwise_careduce():
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert len(topo) == 3 assert len(topo) == 3
assert topo[1].op.pre_scalar_op == theano.scalar.sqr assert topo[1].op.pre_scalar_op == theano.scalar.sqr
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
utt.assert_allclose(f(data), (data * data).sum(axis=1)) utt.assert_allclose(f(data), (data * data).sum(axis=1))
...@@ -374,7 +384,7 @@ def test_local_lift_dot22scalar(): ...@@ -374,7 +384,7 @@ def test_local_lift_dot22scalar():
y_val = np.random.random((3, 4)).astype(theano.config.floatX) y_val = np.random.random((3, 4)).astype(theano.config.floatX)
a_val = 0.5 a_val = 0.5
utt.assert_allclose(f_cpu(x_val, y_val, a_val), f_gpu(x_val, y_val, a_val)) utt.assert_allclose(f_cpu(x_val, y_val, a_val), f_gpu(x_val, y_val, a_val))
assert check_stack_trace(f_gpu, ops_to_check='all') assert _check_stack_trace(f_gpu)
def test_local_gpu_subtensor(): def test_local_gpu_subtensor():
...@@ -384,7 +394,7 @@ def test_local_gpu_subtensor(): ...@@ -384,7 +394,7 @@ def test_local_gpu_subtensor():
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert any([type(node.op) is tensor.Subtensor for node in topo]) assert any([type(node.op) is tensor.Subtensor for node in topo])
assert not any([isinstance(node.op, GpuSubtensor) for node in topo]) assert not any([isinstance(node.op, GpuSubtensor) for node in topo])
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
# Test graph input. # Test graph input.
t = tensor.fmatrix() t = tensor.fmatrix()
...@@ -392,7 +402,7 @@ def test_local_gpu_subtensor(): ...@@ -392,7 +402,7 @@ def test_local_gpu_subtensor():
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert any([type(node.op) is tensor.Subtensor for node in topo]) assert any([type(node.op) is tensor.Subtensor for node in topo])
assert not any([isinstance(node.op, GpuSubtensor) for node in topo]) assert not any([isinstance(node.op, GpuSubtensor) for node in topo])
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
# Test multiple use of the input # Test multiple use of the input
# We want the subtensor to be on the GPU to prevent multiple transfer. # We want the subtensor to be on the GPU to prevent multiple transfer.
...@@ -401,7 +411,7 @@ def test_local_gpu_subtensor(): ...@@ -401,7 +411,7 @@ def test_local_gpu_subtensor():
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert not any([type(node.op) is tensor.Subtensor for node in topo]) assert not any([type(node.op) is tensor.Subtensor for node in topo])
assert any([isinstance(node.op, GpuSubtensor) for node in topo]) assert any([isinstance(node.op, GpuSubtensor) for node in topo])
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
# Test multiple use of the input + input as output # Test multiple use of the input + input as output
# We want the subtensor to be on the GPU to prevent multiple transfer. # We want the subtensor to be on the GPU to prevent multiple transfer.
...@@ -410,7 +420,7 @@ def test_local_gpu_subtensor(): ...@@ -410,7 +420,7 @@ def test_local_gpu_subtensor():
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert not any([type(node.op) is tensor.Subtensor for node in topo]) assert not any([type(node.op) is tensor.Subtensor for node in topo])
assert any([isinstance(node.op, GpuSubtensor) for node in topo]) assert any([isinstance(node.op, GpuSubtensor) for node in topo])
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
# Test shared forced on CPU end we do computation on the output of # Test shared forced on CPU end we do computation on the output of
# the subtensor. # the subtensor.
...@@ -423,7 +433,7 @@ def test_local_gpu_subtensor(): ...@@ -423,7 +433,7 @@ def test_local_gpu_subtensor():
# If it where just a little bit smarter, it could wrongly move it to the GPU. # If it where just a little bit smarter, it could wrongly move it to the GPU.
# If it where super smart, it would know it should not move it to the GPU. # If it where super smart, it would know it should not move it to the GPU.
assert any([isinstance(node.op, tensor.Elemwise) for node in topo]) assert any([isinstance(node.op, tensor.Elemwise) for node in topo])
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
def test_local_gpu_elemwise(): def test_local_gpu_elemwise():
...@@ -445,7 +455,7 @@ def test_local_gpu_elemwise(): ...@@ -445,7 +455,7 @@ def test_local_gpu_elemwise():
assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1 assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0 assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0
utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v) utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
# Now test with the composite already on the cpu before we move it # Now test with the composite already on the cpu before we move it
# to the gpu # to the gpu
...@@ -459,7 +469,7 @@ def test_local_gpu_elemwise(): ...@@ -459,7 +469,7 @@ def test_local_gpu_elemwise():
assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1 assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0 assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0
utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v) utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
return # Not yet implemeted return # Not yet implemeted
# Test multiple output # Test multiple output
...@@ -477,7 +487,7 @@ def test_local_gpu_elemwise(): ...@@ -477,7 +487,7 @@ def test_local_gpu_elemwise():
utt.assert_allclose(out[0], a_v) utt.assert_allclose(out[0], a_v)
utt.assert_allclose(out[1], c_v) utt.assert_allclose(out[1], c_v)
utt.assert_allclose(out[2], b_v) utt.assert_allclose(out[2], b_v)
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
# Test multiple output # Test multiple output
out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * b_s]) out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * b_s])
...@@ -489,7 +499,7 @@ def test_local_gpu_elemwise(): ...@@ -489,7 +499,7 @@ def test_local_gpu_elemwise():
out = f(a_v, b_v, c_v) out = f(a_v, b_v, c_v)
utt.assert_allclose(out[0], a_v + b_v) utt.assert_allclose(out[0], a_v + b_v)
utt.assert_allclose(out[1], a_v * c_v) utt.assert_allclose(out[1], a_v * c_v)
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
# Test non-contiguous input # Test non-contiguous input
c = gpuarray_shared_constructor(np.asarray(c_v, dtype='float32')) c = gpuarray_shared_constructor(np.asarray(c_v, dtype='float32'))
...@@ -498,7 +508,7 @@ def test_local_gpu_elemwise(): ...@@ -498,7 +508,7 @@ def test_local_gpu_elemwise():
out = f(a_v, b_v) out = f(a_v, b_v)
utt.assert_allclose(out[0], a_v[::2] + b_v[::2]) utt.assert_allclose(out[0], a_v[::2] + b_v[::2])
utt.assert_allclose(out[1], a_v[::2] * c_v[::2]) utt.assert_allclose(out[1], a_v[::2] * c_v[::2])
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
def test_many_arg_elemwise(): def test_many_arg_elemwise():
...@@ -575,7 +585,7 @@ def test_local_lift_abstractconv_gpu_shape(): ...@@ -575,7 +585,7 @@ def test_local_lift_abstractconv_gpu_shape():
b = tensor.ftensor4() b = tensor.ftensor4()
c = tensor.nnet.abstract_conv.AbstractConv2d_gradWeights()(a, b, s) c = tensor.nnet.abstract_conv.AbstractConv2d_gradWeights()(a, b, s)
f = theano.function([s, a, b], c, mode=mode_with_gpu) f = theano.function([s, a, b], c, mode=mode_with_gpu)
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
finally: finally:
theano.config.on_opt_error = prev theano.config.on_opt_error = prev
...@@ -606,7 +616,7 @@ def test_local_assert_no_cpu_op(): ...@@ -606,7 +616,7 @@ def test_local_assert_no_cpu_op():
try: try:
theano.config.assert_no_cpu_op = 'ignore' theano.config.assert_no_cpu_op = 'ignore'
f = theano.function([], out, mode=mode_local_assert) f = theano.function([], out, mode=mode_local_assert)
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
finally: finally:
theano.config.assert_no_cpu_op = old theano.config.assert_no_cpu_op = old
...@@ -618,7 +628,7 @@ def test_no_complex(): ...@@ -618,7 +628,7 @@ def test_no_complex():
stft_out = tensor.exp(width_var * freq_var) * signal_var stft_out = tensor.exp(width_var * freq_var) * signal_var
f = theano.function([width_var, freq_var, signal_var], stft_out, f = theano.function([width_var, freq_var, signal_var], stft_out,
mode=mode_with_gpu) mode=mode_with_gpu)
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
@utt.assertFailure_fast @utt.assertFailure_fast
...@@ -637,7 +647,7 @@ def test_local_lift_solve(): ...@@ -637,7 +647,7 @@ def test_local_lift_solve():
A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32") A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
b_val = np.random.uniform(-0.4, 0.4, (5, 3)).astype("float32") b_val = np.random.uniform(-0.4, 0.4, (5, 3)).astype("float32")
utt.assert_allclose(f_cpu(A_val, b_val), f_gpu(A_val, b_val)) utt.assert_allclose(f_cpu(A_val, b_val), f_gpu(A_val, b_val))
assert check_stack_trace(f_gpu, ops_to_check='all') assert _check_stack_trace(f_gpu)
def test_gpu_solve_not_inplace(): def test_gpu_solve_not_inplace():
...@@ -703,7 +713,7 @@ def test_local_gpua_advanced_incsubtensor(): ...@@ -703,7 +713,7 @@ def test_local_gpua_advanced_incsubtensor():
w = tensor.set_subtensor(w[tensor.eq(y, 1.0).nonzero()], 100) w = tensor.set_subtensor(w[tensor.eq(y, 1.0).nonzero()], 100)
w = tensor.set_subtensor(w[tensor.eq(y, -1.0).nonzero()], 0) w = tensor.set_subtensor(w[tensor.eq(y, -1.0).nonzero()], 0)
f = theano.function([target], w) f = theano.function([target], w)
assert check_stack_trace(f, ops_to_check='all') assert _check_stack_trace(f)
def test_batched_dot_lifter(): def test_batched_dot_lifter():
......
...@@ -43,6 +43,7 @@ from theano.tensor import DimShuffle, Subtensor ...@@ -43,6 +43,7 @@ from theano.tensor import DimShuffle, Subtensor
from theano.tensor.opt import register_uncanonicalize from theano.tensor.opt import register_uncanonicalize
from theano import scalar as scal from theano import scalar as scal
from theano.gof.opt import copy_stack_trace, with_stack_trace
_logger = logging.getLogger('theano.tensor.opt') _logger = logging.getLogger('theano.tensor.opt')
...@@ -57,10 +58,13 @@ def local_max_and_argmax(node): ...@@ -57,10 +58,13 @@ def local_max_and_argmax(node):
axis = node.op.get_params(node) axis = node.op.get_params(node)
if len(node.outputs[1].clients) == 0: if len(node.outputs[1].clients) == 0:
new = CAReduce(scal.maximum, axis)(node.inputs[0]) new = CAReduce(scal.maximum, axis)(node.inputs[0])
copy_stack_trace(node.outputs[0], new)
return [new, None] return [new, None]
if len(node.outputs[0].clients) == 0: if len(node.outputs[0].clients) == 0:
return [None, T.Argmax(axis)(node.inputs[0])] new = T.Argmax(axis)(node.inputs[0])
copy_stack_trace(node.outputs[0], new)
return [None, new]
@register_uncanonicalize @register_uncanonicalize
...@@ -84,8 +88,8 @@ def local_max_to_min(node): ...@@ -84,8 +88,8 @@ def local_max_to_min(node):
max.owner.op.scalar_op == scal.maximum): max.owner.op.scalar_op == scal.maximum):
neg = max.owner.inputs[0] neg = max.owner.inputs[0]
if neg.owner and neg.owner.op == T.neg: if neg.owner and neg.owner.op == T.neg:
return [CAReduce(scal.minimum, new = CAReduce(scal.minimum, max.owner.op.axis)(neg.owner.inputs[0])
max.owner.op.axis)(neg.owner.inputs[0])] return [with_stack_trace(node.outputs[0], new)]
return False return False
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论