Commit 7a6d676f authored by Frédéric Bastien, committed by GitHub

Merge pull request #5688 from cooijmanstim/gpuarray-stack-trace

gpuarray: keep stack trace
......@@ -4,6 +4,7 @@ Node classes (`Apply`, `Variable`) and expression graph algorithms.
from __future__ import absolute_import, print_function, division
from collections import deque
import contextlib
from copy import copy
from itertools import count
......@@ -390,6 +391,8 @@ class Variable(Node):
self.name = name
self.auto_name = 'auto_' + str(next(self.__count__))
Variable.notify_construction_observers(self)
def __str__(self):
"""Return a str representation of the Variable.
......@@ -536,6 +539,22 @@ class Variable(Node):
d["tag"] = t
return d
# refer to doc in nodes_constructed.
construction_observers = []
@classmethod
def append_construction_observer(cls, observer):
    """Register *observer*, a callable invoked with each newly
    constructed Variable (see notify_construction_observers)."""
    cls.construction_observers.append(observer)
@classmethod
def remove_construction_observer(cls, observer):
    """Deregister a previously appended construction observer.

    Raises ValueError if *observer* was never registered
    (list.remove semantics).
    """
    cls.construction_observers.remove(observer)
@classmethod
def notify_construction_observers(cls, instance):
    """Call every registered construction observer with *instance*.

    Called during Variable construction so that observers (e.g. the
    one installed by nodes_constructed) see each new Variable.
    """
    for observer in cls.construction_observers:
        observer(instance)
class Constant(Variable):
"""
......@@ -1426,3 +1445,38 @@ def is_in_ancestors(l_node, f_node):
todo.append(cur)
todo.extend(i.owner for i in cur.inputs if i.owner)
return False
@contextlib.contextmanager
def nodes_constructed():
    """Context manager that tracks every Variable constructed in its body.

    Used by ``inherit_stack_trace`` to collect all the variable nodes
    created inside an optimization.  A ``new_nodes`` list is yielded
    immediately and filled lazily: a local ``observer`` function is
    registered on ``Variable``, and
    ``Variable.notify_construction_observers`` calls it for every
    Variable instantiated while the context is active, appending the new
    node to the list.

    Yields
    ------
    list
        The (initially empty) list of Variable nodes constructed inside
        the ``with`` block.
    """
    new_nodes = []

    def observer(node):
        new_nodes.append(node)

    Variable.append_construction_observer(observer)
    try:
        yield new_nodes
    finally:
        # Always deregister, even if the managed body raises; otherwise
        # the observer would leak and keep recording every Variable
        # constructed afterwards (and keep `new_nodes` alive) for the
        # rest of the process.
        Variable.remove_construction_observer(observer)
......@@ -6,6 +6,7 @@ amount of useful generic optimization tools.
from __future__ import absolute_import, print_function, division
from collections import deque, defaultdict, OrderedDict
import contextlib
import copy
import inspect
import logging
......@@ -2902,7 +2903,7 @@ def pre_greedy_local_optimizer(list_optimizations, out):
def copy_stack_trace(from_var, to_var):
"""
Copies the stack trace from one or more tensor variables to
one or more tensor variables.
one or more tensor variables and returns the destination variables.
Parameters
----------
......@@ -2946,6 +2947,25 @@ def copy_stack_trace(from_var, to_var):
# Copy over stack traces from from_var to each variable to
# to_var, including the stack_trace of the to_var before
to_var.tag.trace = getattr(to_var.tag, 'trace', []) + tr
return to_var
@contextlib.contextmanager
def inherit_stack_trace(from_var):
    """
    Context manager that propagates the stack trace of `from_var` onto
    every variable node created inside its body.

    The nodes constructed while the body runs are collected by
    graph.nodes_constructed(); once the body finishes, copy_stack_trace
    is applied to each of them.

    Parameters
    ----------
    from_var
        Variable node, or list of variable nodes, whose stack traces are
        copied to the newly constructed nodes.
    """
    tracker = graph.nodes_constructed()
    with tracker as created_nodes:
        yield
        copy_stack_trace(from_var, created_nodes)
def check_stack_trace(f_or_fgraph, ops_to_check='last', bug_print='raise'):
......
......@@ -15,6 +15,7 @@ from theano.tensor.basic import (
from theano.gof import HideC, COp, ParamsType
from theano.gof.utils import MethodNotDefined
from theano.gof.opt import copy_stack_trace
from collections import deque
......@@ -75,11 +76,11 @@ def as_gpuarray_variable(x, context_name):
# If we couldn't deal with transfers, then maybe it's a tensor
if isinstance(x.type, tensor.TensorType):
return GpuFromHost(context_name)(x)
return copy_stack_trace(x, GpuFromHost(context_name)(x))
# Try _as_GpuArrayVariable if possible
if hasattr(x, '_as_GpuArrayVariable'):
return x._as_GpuArrayVariable(context_name)
return copy_stack_trace(x, x._as_GpuArrayVariable(context_name))
# If it didn't work try for a constant
ctx = get_context(context_name)
......
......@@ -18,6 +18,7 @@ from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp, ParamsType, EnumList
from theano.gof.cmodule import GCC_compiler
from theano.gof.type import CDataType, Generic
from theano.gof.opt import inherit_stack_trace
from theano.compile import optdb
from theano.compile.ops import shape_i, shape_i_op
from theano.tensor.nnet import LogSoftmax, SoftmaxGrad
......@@ -3127,9 +3128,11 @@ def local_abstractconv_cudnn(node):
if node.op.unshared:
return None
if isinstance(node.op, AbstractConv2d):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
with inherit_stack_trace(node.outputs):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
with inherit_stack_trace(node.outputs):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs])
......@@ -3352,9 +3355,11 @@ def local_abstractconv_gw_cudnn(node):
if node.op.unshared:
return None
if isinstance(node.op, AbstractConv2d_gradWeights):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
with inherit_stack_trace(node.outputs):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d_gradWeights):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
with inherit_stack_trace(node.outputs):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
@local_optimizer([AbstractConv2d_gradInputs, AbstractConv3d_gradInputs])
......@@ -3365,9 +3370,11 @@ def local_abstractconv_gi_cudnn(node):
if node.op.unshared:
return None
if isinstance(node.op, AbstractConv2d_gradInputs):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
with inherit_stack_trace(node.outputs):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d_gradInputs):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
with inherit_stack_trace(node.outputs):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
@inplace_allocempty(GpuDnnConv, 2)
......@@ -3384,7 +3391,6 @@ def local_dnn_convgw_inplace(node, inputs):
def local_dnn_convgi_inplace(node, inputs):
return [GpuDnnConvGradI(algo=node.op.algo, inplace=True, num_groups=node.op.num_groups)(*inputs)]
optdb.register('local_dnna_conv_inplace',
tensor.opt.in2out(local_dnn_conv_inplace,
local_dnn_convgw_inplace,
......@@ -3654,11 +3660,12 @@ def local_dnn_reduction(node):
if not cudnn.cudnnReduceTensorOp_t.has_alias(node.op.scalar_op.name):
return
return (GpuDnnReduction(node.op.scalar_op.name,
node.op.axis,
node.op.acc_dtype,
node.op.dtype,
False)(node.inputs[0]),)
with inherit_stack_trace(node.outputs):
return (GpuDnnReduction(node.op.scalar_op.name,
node.op.axis,
node.op.acc_dtype,
node.op.dtype,
False)(node.inputs[0]),)
@register_opt('cudnn')
......
......@@ -15,7 +15,8 @@ from theano.compile.ops import shape_i
from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
LocalGroupDB,
SequenceDB, Optimizer, DB, toolbox, graph)
from theano.gof.opt import LocalMetaOptimizer
from theano.gof.opt import (LocalMetaOptimizer, copy_stack_trace,
inherit_stack_trace)
from theano.ifelse import IfElse
from theano.misc.ordered_set import OrderedSet
......@@ -252,12 +253,25 @@ def op_lifter(OP, cuda_only=False):
# This is needed as sometimes new_op inherits from OP.
if new_op and new_op != node.op:
if isinstance(new_op, theano.Op):
return [safe_to_cpu(o) for o in
new_op(*node.inputs, return_list=True)]
new_outputs = new_op(*node.inputs, return_list=True)
to_cpu_fn = safe_to_cpu
elif isinstance(new_op, (tuple, list)):
return [safe_to_cpu(o) for o in new_op]
new_outputs = new_op
to_cpu_fn = safe_to_cpu
else: # suppose it is a variable on the GPU
return [new_op.transfer('cpu')]
new_outputs = [new_op]
def to_cpu_fn(x):
return x.transfer('cpu')
# copy stack traces onto gpu outputs
# also copy the stack traces onto HostFromGpu outputs
on_cpu = []
for old_output, new_output in zip(node.outputs, new_outputs):
copy_stack_trace(old_output, new_output)
cpu = to_cpu_fn(new_output)
on_cpu.append(cpu)
copy_stack_trace(old_output, cpu)
return on_cpu
return False
local_opt.__name__ = maker.__name__
return local_optimizer(OP)(local_opt)
......@@ -419,6 +433,9 @@ class GraphToGPU(Optimizer):
elif isinstance(new_ops, theano.Variable):
outputs = [new_ops]
for old_output, new_output in zip(node.outputs, outputs):
copy_stack_trace(old_output, new_output)
if new_ops:
node_created[lopt] += len(graph.ops([mapping[i] for i in node.inputs], outputs))
if any([getattr(old_o, 'dtype', None) != getattr(new_o, 'dtype', None)
......@@ -451,7 +468,7 @@ class GraphToGPU(Optimizer):
new_o.owner.inputs[0].type == o.type):
new_o = new_o.owner.inputs[0]
else:
new_o = safe_to_cpu(new_o)
new_o = copy_stack_trace(o, safe_to_cpu(new_o))
new_nodes.append(new_o)
fgraph.replace_all_validate(zip(fgraph.outputs, new_nodes),
reason=self.__class__.__name__)
......@@ -650,7 +667,8 @@ def local_gpualloc_memset_0(node):
inp.data.size == 1 and
(np.asarray(inp.data) == 0).all()):
new_op = GpuAlloc(node.op.context_name, memset_0=True)
return [new_op(*node.inputs)]
with inherit_stack_trace(node.outputs):
return new_op(*node.inputs, return_list=True)
# Don't register by default.
......@@ -659,10 +677,9 @@ def local_gpua_alloc_empty_to_zeros(node):
if isinstance(node.op, GpuAllocEmpty):
context_name = infer_context_name(*node.inputs)
z = np.asarray(0, dtype=node.outputs[0].dtype)
return [GpuAlloc(context_name)(as_gpuarray_variable(z, context_name),
*node.inputs)]
with inherit_stack_trace(node.outputs):
return [GpuAlloc(context_name)(
as_gpuarray_variable(z, context_name), *node.inputs)]
optdb.register('local_gpua_alloc_empty_to_zeros',
theano.tensor.opt.in2out(local_gpua_alloc_empty_to_zeros),
# After move to gpu and merge2, before inplace.
......@@ -1206,7 +1223,8 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
op.scalar_op, axis=op.axis,
dtype=odtype,
acc_dtype=adtype)
gvar = greduce(x)
with inherit_stack_trace(outputs):
gvar = greduce(x)
# We need to have the make node called, otherwise the mask can
# be None
if (op2 is GpuCAReduceCPY or
......@@ -1246,25 +1264,27 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
dtype=getattr(op, 'dtype', outputs[0].dtype),
acc_dtype=getattr(op, 'acc_dtype', None))
reshaped_x = x.reshape(tensor.stack(new_in_shp))
gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name)
gvar = greduce(gpu_reshaped_x)
# We need to have the make node called, otherwise the mask can
# be None
reshaped_gpu_inputs = [gpu_reshaped_x]
if greduce.supports_c_code(reshaped_gpu_inputs):
reduce_reshaped_x = greduce(gpu_reshaped_x)
if reduce_reshaped_x.ndim != outputs[0].ndim:
out_shp = []
for i in range(x.ndim):
if i not in op.axis:
out_shp.append(shape_i(x, i))
unreshaped_reduce = GpuReshape(len(out_shp))(reduce_reshaped_x,
tensor.stack(out_shp))
else:
unreshaped_reduce = reduce_reshaped_x
return [unreshaped_reduce]
with inherit_stack_trace(outputs):
reshaped_x = x.reshape(tensor.stack(new_in_shp))
gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name)
# We need to have the make node called, otherwise the mask can
# be None
gvar = greduce(gpu_reshaped_x)
reshaped_gpu_inputs = [gpu_reshaped_x]
if greduce.supports_c_code(reshaped_gpu_inputs):
reduce_reshaped_x = greduce(gpu_reshaped_x)
if reduce_reshaped_x.ndim != outputs[0].ndim:
out_shp = []
for i in range(x.ndim):
if i not in op.axis:
out_shp.append(shape_i(x, i))
unreshaped_reduce = GpuReshape(len(out_shp))(
reduce_reshaped_x,
tensor.stack(out_shp))
else:
unreshaped_reduce = reduce_reshaped_x
return [unreshaped_reduce]
@register_opt('fast_compile')
......@@ -1305,33 +1325,34 @@ def local_gpua_gemm(op, context_name, inputs, outputs):
def local_gpua_gemmbatch(op, context_name, inputs, outputs):
if inputs[0].dtype not in ['float16', 'float32', 'float64']:
return
a, b = inputs
# Since GpuGemmBatch only supports 3D inputs and output,
# we need to add broadcastable dims to the inputs, and drop
# them from outputs
output_dims = [0, 1, 2]
if a.ndim == 2:
a = GpuDimShuffle(a.broadcastable, (0, 'x', 1))(a)
del output_dims[1]
if b.ndim == 2:
b = GpuDimShuffle(b.broadcastable, (0, 1, 'x'))(b)
del output_dims[-1]
# In case of mismatched dtypes, we also have to upcast
out_dtype = outputs[0].dtype
if a.dtype != out_dtype or b.dtype != out_dtype:
gpu_cast_op = GpuElemwise(Cast(Scalar(out_dtype)))
if a.dtype != out_dtype:
a = gpu_cast_op(a)
if b.dtype != out_dtype:
b = gpu_cast_op(b)
c = GpuAllocEmpty(out_dtype, context_name)(
a.shape[0], a.shape[1], b.shape[2])
out = gpugemmbatch_no_inplace(c, np.asarray(1.0, dtype=out_dtype),
a, b, np.asarray(0.0, dtype=out_dtype))
if len(output_dims) != 3:
out = GpuDimShuffle(out.broadcastable, output_dims)(out)
return out
with inherit_stack_trace(outputs):
a, b = inputs
# Since GpuGemmBatch only supports 3D inputs and output,
# we need to add broadcastable dims to the inputs, and drop
# them from outputs
output_dims = [0, 1, 2]
if a.ndim == 2:
a = GpuDimShuffle(a.broadcastable, (0, 'x', 1))(a)
del output_dims[1]
if b.ndim == 2:
b = GpuDimShuffle(b.broadcastable, (0, 1, 'x'))(b)
del output_dims[-1]
# In case of mismatched dtypes, we also have to upcast
out_dtype = outputs[0].dtype
if a.dtype != out_dtype or b.dtype != out_dtype:
gpu_cast_op = GpuElemwise(Cast(Scalar(out_dtype)))
if a.dtype != out_dtype:
a = gpu_cast_op(a)
if b.dtype != out_dtype:
b = gpu_cast_op(b)
c = GpuAllocEmpty(out_dtype, context_name)(
a.shape[0], a.shape[1], b.shape[2])
out = gpugemmbatch_no_inplace(c, np.asarray(1.0, dtype=out_dtype),
a, b, np.asarray(0.0, dtype=out_dtype))
if len(output_dims) != 3:
out = GpuDimShuffle(out.broadcastable, output_dims)(out)
return out
@register_opt()
......@@ -1378,11 +1399,12 @@ def local_gpua_dot22(op, context_name, inputs, outputs):
@op_lifter([tensor.blas.Dot22Scalar])
@register_opt2([tensor.blas.Dot22Scalar], 'fast_compile')
def local_gpua_dot22scalar(op, context_name, inputs, outputs):
x, y, a = inputs
x = as_gpuarray_variable(x, context_name)
y = as_gpuarray_variable(y, context_name)
z = GpuAllocEmpty(x.dtype, context_name)(x.shape[0], y.shape[1])
return [gpugemm_no_inplace(z, a, x, y, 0)]
with inherit_stack_trace(outputs):
x, y, a = inputs
x = as_gpuarray_variable(x, context_name)
y = as_gpuarray_variable(y, context_name)
z = GpuAllocEmpty(x.dtype, context_name)(x.shape[0], y.shape[1])
return [gpugemm_no_inplace(z, a, x, y, 0)]
@register_opt('fast_compile')
......@@ -2392,7 +2414,8 @@ def local_gpu_elemwise_careduce(node):
props = node.op._props_dict()
props["pre_scalar_op"] = scalar.basic.sqr
out = GpuCAReduceCuda(**props)(inp)
return [out]
with inherit_stack_trace(node.outputs):
return [out]
@local_optimizer(None)
......@@ -2583,8 +2606,9 @@ def local_gpu_solve(op, context_name, inputs, outputs):
@local_optimizer([GpuCusolverSolve], inplace=True)
def local_inplace_gpu_solve(node):
if isinstance(node.op, GpuCusolverSolve) and not node.op.inplace:
return [GpuCusolverSolve(A_structure=node.op.A_structure, trans=node.op.trans,
inplace=True)(*node.inputs)]
with inherit_stack_trace(node.outputs):
return [GpuCusolverSolve(A_structure=node.op.A_structure, trans=node.op.trans,
inplace=True)(*node.inputs)]
# Cholesky decomposition
......@@ -2622,7 +2646,8 @@ register_opt2([slinalg.Solve], 'fast_compile', name='matrix_ops_db2')(matrix_ops
@local_optimizer([GpuCholesky], inplace=True)
def local_inplace_gpu_cholesky(node):
if isinstance(node.op, GpuCholesky) and not node.op.inplace:
return [node.op.clone_inplace()(*node.inputs)]
with inherit_stack_trace(node.outputs):
return [node.op.clone_inplace()(*node.inputs)]
def local_gpu_magma_cholesky(op, context_name, inputs, outputs):
......@@ -2705,7 +2730,8 @@ def local_gpu_magma_matrix_inverse(op, context_name, inputs, outputs):
@local_optimizer([GpuMagmaMatrixInverse])
def local_inplace_gpu_magma_matrix_inverse(node):
if isinstance(node.op, GpuMagmaMatrixInverse) and not node.op.inplace:
return [node.op.clone_inplace()(*node.inputs)]
with inherit_stack_trace(node.outputs):
return [node.op.clone_inplace()(*node.inputs)]
# Eigen decomposition of a symmetric matrix
......
......@@ -5,6 +5,7 @@ import numpy as np
from theano import tensor, scalar as scal, Constant
from theano.gof import local_optimizer
from theano.gof.opt import inherit_stack_trace
from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError)
......@@ -184,7 +185,8 @@ def alpha_merge(cls, alpha_in, beta_in):
except NotScalarConstantError:
inputs[alpha_in] = lr * targ.inputs[alpha_in]
inputs[beta_in] = lr * targ.inputs[beta_in]
return maker(targ, *inputs)
with inherit_stack_trace(node.outputs):
return maker(targ, *inputs)
return opt
return wrapper
......@@ -272,7 +274,8 @@ def output_merge(cls, alpha_in, beta_in, out_in):
inputs = list(targ.inputs)
inputs[out_in] = W
inputs[beta_in] = _one.clone()
return maker(targ, *inputs)
with inherit_stack_trace(node.outputs):
return maker(targ, *inputs)
return opt
return wrapper
......@@ -326,7 +329,8 @@ def inplace_allocempty(op, idx):
len(alloc.clients) > 1):
alloc_op = GpuAllocEmpty(alloc.owner.op.dtype, alloc.owner.op.context_name)
inputs[idx] = alloc_op(*alloc.owner.inputs)
return maker(node, inputs)
with inherit_stack_trace(node.outputs):
return maker(node, inputs)
return opt
return wrapper
......
......@@ -8,12 +8,13 @@ import theano.tensor.slinalg as slinalg
from theano.tests.breakpoint import PdbBreakpoint
from theano.tests import unittest_tools as utt, test_ifelse
from theano.tensor.tests import test_basic
from theano.gof.opt import check_stack_trace
import theano.gpuarray
from .. import basic_ops
from ..type import GpuArrayType, gpuarray_shared_constructor, get_context
from ..basic_ops import (
GpuAlloc, GpuAllocEmpty, GpuReshape, GpuFromHost, host_from_gpu)
GpuAlloc, GpuAllocEmpty, GpuReshape, GpuFromHost, HostFromGpu, host_from_gpu)
from ..blas import GpuGemm
from ..elemwise import (
GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise, Elemwise, max_inputs_to_GpuElemwise)
......@@ -27,6 +28,28 @@ from theano.tensor.nnet import abstract_conv
from theano.gpuarray import dnn, blas
def _check_stack_trace(thing):
    """Run check_stack_trace on *thing*, ignoring ops that are known in
    these tests to carry no (or an empty) stack trace."""
    # Ops whose outputs legitimately lack a stack trace here.
    exempt_ops = (theano.compile.ops.Shape_i,
                  theano.compile.ops.Shape,
                  theano.compile.ops.DeepCopyOp,
                  theano.tensor.opt.MakeVector,
                  theano.tensor.subtensor.Subtensor,
                  theano.tensor.elemwise.Elemwise,
                  theano.ifelse.IfElse,
                  GpuFromHost, HostFromGpu,
                  GpuCAReduceCuda,
                  basic_ops.GpuContiguous,
                  GpuElemwise,
                  theano.printing.Print,
                  PdbBreakpoint,
                  )

    def _should_check(op):
        if not isinstance(op, theano.gof.Op):
            # Not an Op: assume an apply node was passed and inspect its op.
            op = op.op
        return not isinstance(op, exempt_ops)

    return check_stack_trace(thing, ops_to_check=_should_check,
                             bug_print="ignore")
def test_local_assert():
x = theano.tensor.fmatrix()
a = theano.tensor.opt.assert_op(x, theano.tensor.eq(x, 0).any())
......@@ -70,6 +93,8 @@ def test_local_gpu_contiguous_gpu_contiguous():
if isinstance(node.op, basic_ops.GpuContiguous)])
assert 1 == len([node for node in f2.maker.fgraph.toposort()
if isinstance(node.op, basic_ops.GpuContiguous)])
assert _check_stack_trace(f1)
assert _check_stack_trace(f2)
def test_local_gpu_contiguous():
......@@ -79,6 +104,7 @@ def test_local_gpu_contiguous():
assert 1 == len([node for node in f.maker.fgraph.toposort()
if isinstance(node.op, basic_ops.GpuContiguous)])
f([[2.]])
assert _check_stack_trace(f)
def test_flatten():
......@@ -96,6 +122,7 @@ def test_flatten():
assert res.shape == val.flatten().shape
assert GpuReshape in [type(node.op)
for node in f.maker.fgraph.toposort()]
assert _check_stack_trace(f)
f = theano.function([m], m.flatten(ndim=2),
mode=mode_with_gpu.excluding("local_useless_reshape"))
......@@ -105,6 +132,7 @@ def test_flatten():
assert res.shape == val.shape
assert GpuReshape in [type(node.op)
for node in f.maker.fgraph.toposort()]
assert _check_stack_trace(f)
m = theano.tensor.tensor3()
f = theano.function([m], m.flatten(ndim=2), mode=mode_with_gpu)
......@@ -114,6 +142,7 @@ def test_flatten():
assert res.shape == val.reshape(10, -1).shape
assert GpuReshape in [type(node.op)
for node in f.maker.fgraph.toposort()]
assert _check_stack_trace(f)
def test_reduce():
......@@ -126,6 +155,9 @@ def test_reduce():
f = theano.function([m], getattr(m, method)(axis=0,
**param),
mode=mode_with_gpu)
# assert _check_stack_trace(f) this op is ok but since
# it is using GpuCAReduceCuda that has an empty stack
# trace, this assertion gives error.
val = np.random.rand(10, 11).astype("float32")
res = f(val)
utt.assert_allclose(res, getattr(val, method)(axis=0))
......@@ -157,6 +189,7 @@ def test_local_gpualloc_memset_0():
assert len(topo) == 1
assert isinstance(topo[0].op, theano.tensor.Alloc)
assert (np.asarray(f(6)) == 0).all()
assert _check_stack_trace(f)
# Test with 0 from CPU op.
# Should be transferred as it is used by another op.
......@@ -166,6 +199,7 @@ def test_local_gpualloc_memset_0():
assert len(topo) == 3
assert isinstance(topo[0].op, GpuAlloc)
assert (np.asarray(f(6)) == 0).all()
assert _check_stack_trace(f)
# Test with 0
a = GpuAlloc(test_ctx_name)(z, i)
......@@ -174,6 +208,7 @@ def test_local_gpualloc_memset_0():
assert len(topo) == 1
assert isinstance(topo[0].op, GpuAlloc) and topo[0].op.memset_0
assert (np.asarray(f(6)) == 0).all()
assert _check_stack_trace(f)
# Test with 1
a = GpuAlloc(test_ctx_name)(o, i)
......@@ -183,6 +218,7 @@ def test_local_gpualloc_memset_0():
assert isinstance(topo[0].op, GpuAlloc)
assert not topo[0].op.memset_0
assert (np.asarray(f(6)) == 1).all()
assert _check_stack_trace(f)
# Test with 1, 1
a = GpuAlloc(test_ctx_name)(ones, i)
......@@ -192,6 +228,7 @@ def test_local_gpualloc_memset_0():
assert isinstance(topo[0].op, GpuAlloc)
assert not topo[0].op.memset_0
assert (np.asarray(f(2)) == 1).all()
assert _check_stack_trace(f)
def test_local_gpualloc_empty():
......@@ -207,6 +244,7 @@ def test_local_gpualloc_empty():
assert isinstance(topo[0].op, theano.tensor.AllocEmpty)
# This returns uninitialized data, so we can only check the shape
assert f(3).shape == (3,)
assert _check_stack_trace(f)
# Test with vector
# Should be moved
......@@ -217,6 +255,7 @@ def test_local_gpualloc_empty():
assert isinstance(topo[0].op, GpuAllocEmpty)
# This returns uninitialized data, so we can only check the shape
assert f(3).shape == (3,)
assert _check_stack_trace(f)
# Test with matrix
a = tensor.AllocEmpty('float32')(i, ii)
......@@ -226,6 +265,7 @@ def test_local_gpualloc_empty():
assert isinstance(topo[0].op, GpuAllocEmpty)
# This returns uninitialized data, so we can only check the shape
assert f(3, 4).shape == (3, 4)
assert _check_stack_trace(f)
def test_rebroadcast():
......@@ -243,6 +283,7 @@ def test_rebroadcast():
assert isinstance(rebr.inputs[0].type, GpuArrayType)
assert isinstance(rebr.outputs[0].type, GpuArrayType)
assert _check_stack_trace(f)
class TestSpecifyShape(test_basic.TestSpecifyShape):
......@@ -268,6 +309,7 @@ class test_gpu_ifelse(test_ifelse.test_ifelse):
theano.ifelse.ifelse(cond, x.mean(), x.sum()),
mode=mode_with_gpu)
assert f(np.float32([1, 2, 3]), 0) == 6
assert _check_stack_trace(f)
x = tensor.vector()
cond = tensor.scalar()
......@@ -275,6 +317,7 @@ class test_gpu_ifelse(test_ifelse.test_ifelse):
theano.ifelse.ifelse(cond, x.mean(), x.sum()),
mode=mode_with_gpu)
assert f(np.float32([1, 2, 3]), 0) == 6
assert _check_stack_trace(f)
def test_lifter_with_shared_var(self):
x = tensor.lscalar('x')
......@@ -297,6 +340,7 @@ def test_print_op():
assert isinstance(topo[1].op, theano.printing.Print)
assert isinstance(topo[2].op, GpuElemwise)
assert topo[3].op == host_from_gpu
assert _check_stack_trace(f)
f(np.random.random((5, 5)).astype('float32'))
......@@ -317,6 +361,7 @@ def test_pdbbreakpoint_op():
topo = f.maker.fgraph.toposort()
assert isinstance(topo[-2].op, GpuElemwise)
assert topo[-1].op == host_from_gpu
assert _check_stack_trace(f)
def test_local_gpu_elemwise_careduce():
......@@ -326,6 +371,7 @@ def test_local_gpu_elemwise_careduce():
topo = f.maker.fgraph.toposort()
assert len(topo) == 3
assert topo[1].op.pre_scalar_op == theano.scalar.sqr
assert _check_stack_trace(f)
data = np.random.rand(3, 4).astype(theano.config.floatX)
utt.assert_allclose(f(data), (data * data).sum())
......@@ -334,6 +380,7 @@ def test_local_gpu_elemwise_careduce():
topo = f.maker.fgraph.toposort()
assert len(topo) == 3
assert topo[1].op.pre_scalar_op == theano.scalar.sqr
assert _check_stack_trace(f)
utt.assert_allclose(f(data), (data * data).sum(axis=1))
......@@ -352,6 +399,7 @@ def test_local_lift_dot22scalar():
y_val = np.random.random((3, 4)).astype(theano.config.floatX)
a_val = 0.5
utt.assert_allclose(f_cpu(x_val, y_val, a_val), f_gpu(x_val, y_val, a_val))
assert _check_stack_trace(f_gpu)
def test_local_gpu_subtensor():
......@@ -361,6 +409,7 @@ def test_local_gpu_subtensor():
topo = f.maker.fgraph.toposort()
assert any([type(node.op) is tensor.Subtensor for node in topo])
assert not any([isinstance(node.op, GpuSubtensor) for node in topo])
assert _check_stack_trace(f)
# Test graph input.
t = tensor.fmatrix()
......@@ -368,6 +417,7 @@ def test_local_gpu_subtensor():
topo = f.maker.fgraph.toposort()
assert any([type(node.op) is tensor.Subtensor for node in topo])
assert not any([isinstance(node.op, GpuSubtensor) for node in topo])
assert _check_stack_trace(f)
# Test multiple use of the input
# We want the subtensor to be on the GPU to prevent multiple transfer.
......@@ -376,6 +426,7 @@ def test_local_gpu_subtensor():
topo = f.maker.fgraph.toposort()
assert not any([type(node.op) is tensor.Subtensor for node in topo])
assert any([isinstance(node.op, GpuSubtensor) for node in topo])
assert _check_stack_trace(f)
# Test multiple use of the input + input as output
# We want the subtensor to be on the GPU to prevent multiple transfer.
......@@ -384,6 +435,7 @@ def test_local_gpu_subtensor():
topo = f.maker.fgraph.toposort()
assert not any([type(node.op) is tensor.Subtensor for node in topo])
assert any([isinstance(node.op, GpuSubtensor) for node in topo])
assert _check_stack_trace(f)
# Test shared forced on CPU end we do computation on the output of
# the subtensor.
......@@ -396,6 +448,7 @@ def test_local_gpu_subtensor():
# If it were just a little bit smarter, it could wrongly move it to the GPU.
# If it were super smart, it would know it should not move it to the GPU.
assert any([isinstance(node.op, tensor.Elemwise) for node in topo])
assert _check_stack_trace(f)
def test_local_gpu_elemwise():
......@@ -417,6 +470,7 @@ def test_local_gpu_elemwise():
assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0
utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)
assert _check_stack_trace(f)
# Now test with the composite already on the cpu before we move it
# to the gpu
......@@ -430,6 +484,7 @@ def test_local_gpu_elemwise():
assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0
utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)
assert _check_stack_trace(f)
return # Not yet implemeted
# Test multiple output
......@@ -447,6 +502,7 @@ def test_local_gpu_elemwise():
utt.assert_allclose(out[0], a_v)
utt.assert_allclose(out[1], c_v)
utt.assert_allclose(out[2], b_v)
assert _check_stack_trace(f)
# Test multiple output
out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * b_s])
......@@ -458,6 +514,7 @@ def test_local_gpu_elemwise():
out = f(a_v, b_v, c_v)
utt.assert_allclose(out[0], a_v + b_v)
utt.assert_allclose(out[1], a_v * c_v)
assert _check_stack_trace(f)
# Test non-contiguous input
c = gpuarray_shared_constructor(np.asarray(c_v, dtype='float32'))
......@@ -466,6 +523,7 @@ def test_local_gpu_elemwise():
out = f(a_v, b_v)
utt.assert_allclose(out[0], a_v[::2] + b_v[::2])
utt.assert_allclose(out[1], a_v[::2] * c_v[::2])
assert _check_stack_trace(f)
def test_many_arg_elemwise():
......@@ -541,7 +599,8 @@ def test_local_lift_abstractconv_gpu_shape():
a = tensor.ftensor4()
b = tensor.ftensor4()
c = tensor.nnet.abstract_conv.AbstractConv2d_gradWeights()(a, b, s)
theano.function([s, a, b], c, mode=mode_with_gpu)
f = theano.function([s, a, b], c, mode=mode_with_gpu)
assert _check_stack_trace(f)
finally:
theano.config.on_opt_error = prev
......@@ -571,7 +630,8 @@ def test_local_assert_no_cpu_op():
# If the flag is ignore
try:
theano.config.assert_no_cpu_op = 'ignore'
theano.function([], out, mode=mode_local_assert)
f = theano.function([], out, mode=mode_local_assert)
assert _check_stack_trace(f)
finally:
theano.config.assert_no_cpu_op = old
......@@ -581,8 +641,9 @@ def test_no_complex():
freq_var = tensor.fscalar()
signal_var = tensor.fscalar()
stft_out = tensor.exp(width_var * freq_var) * signal_var
theano.function([width_var, freq_var, signal_var], stft_out,
mode=mode_with_gpu)
f = theano.function([width_var, freq_var, signal_var], stft_out,
mode=mode_with_gpu)
assert _check_stack_trace(f)
@utt.assertFailure_fast
......@@ -601,6 +662,7 @@ def test_local_lift_solve():
A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
b_val = np.random.uniform(-0.4, 0.4, (5, 3)).astype("float32")
utt.assert_allclose(f_cpu(A_val, b_val), f_gpu(A_val, b_val))
assert _check_stack_trace(f_gpu)
def test_gpu_solve_not_inplace():
......@@ -665,7 +727,8 @@ def test_local_gpua_advanced_incsubtensor():
w = tensor.ones_like(y)
w = tensor.set_subtensor(w[tensor.eq(y, 1.0).nonzero()], 100)
w = tensor.set_subtensor(w[tensor.eq(y, -1.0).nonzero()], 0)
theano.function([target], w)
f = theano.function([target], w)
assert _check_stack_trace(f)
def test_batched_dot_lifter():
......@@ -690,6 +753,7 @@ def test_batched_dot_lifter():
z = tensor.batched_dot(x, y)
f = theano.function([x, y], z, mode=mode_with_gpu)
f(x_val, y_val)
assert check_stack_trace(f, ops_to_check='all')
def test_crossentropycategorical1hot_lifter():
......
......@@ -146,6 +146,7 @@ from theano.gof import (utils, Op, view_roots,
EquilibriumOptimizer, Apply,
ReplacementDidntRemovedError)
from theano.gof.params_type import ParamsType
from theano.gof.opt import inherit_stack_trace
from theano.printing import pprint, FunctionPrinter, debugprint
from theano.compile.mode import optdb
import theano.scalar
......@@ -1625,19 +1626,16 @@ def local_dot_to_dot22(node):
return
if y.type.dtype in ['float16', 'float32', 'float64', 'complex64', 'complex128']:
if x.ndim == 2 and y.ndim == 2:
# print "local_dot_to_dot22: MM"
return [_dot22(*node.inputs)]
if x.ndim == 2 and y.ndim == 1:
# print "local_dot_to_dot22: MV"
return [_dot22(x, y.dimshuffle(0, 'x')).dimshuffle(0)]
if x.ndim == 1 and y.ndim == 2:
# print "local_dot_to_dot22: VM"
return [_dot22(x.dimshuffle('x', 0), y).dimshuffle(1)]
if x.ndim == 1 and y.ndim == 1:
# print "local_dot_to_dot22: VV"
return [_dot22(x.dimshuffle('x', 0),
y.dimshuffle(0, 'x')).dimshuffle()]
with inherit_stack_trace(node.outputs):
if x.ndim == 2 and y.ndim == 2:
return [_dot22(*node.inputs)]
if x.ndim == 2 and y.ndim == 1:
return [_dot22(x, y.dimshuffle(0, 'x')).dimshuffle(0)]
if x.ndim == 1 and y.ndim == 2:
return [_dot22(x.dimshuffle('x', 0), y).dimshuffle(1)]
if x.ndim == 1 and y.ndim == 1:
return [_dot22(x.dimshuffle('x', 0),
y.dimshuffle(0, 'x')).dimshuffle()]
_logger.info('Not optimizing dot with inputs %s %s %s %s',
x, y, x.type, y.type)
......@@ -1646,19 +1644,22 @@ def local_dot_to_dot22(node):
@local_optimizer([gemm_no_inplace], inplace=True)
def local_inplace_gemm(node):
    """Replace an out-of-place GEMM with its destructive (in-place) variant.

    Runs inside ``inherit_stack_trace`` so the replacement output keeps
    the stack trace of the node it rewrites, and error reports still
    point at the user's code.
    """
    if node.op == gemm_no_inplace:
        with inherit_stack_trace(node.outputs):
            return [gemm_inplace(*node.inputs)]
@local_optimizer([gemv_no_inplace], inplace=True)
def local_inplace_gemv(node):
    """Replace an out-of-place GEMV with its destructive (in-place) variant.

    Runs inside ``inherit_stack_trace`` so the replacement output keeps
    the stack trace of the node it rewrites.
    """
    if node.op == gemv_no_inplace:
        with inherit_stack_trace(node.outputs):
            return [gemv_inplace(*node.inputs)]
@local_optimizer([ger], inplace=True)
def local_inplace_ger(node):
    """Replace GER with its destructive (in-place) variant.

    Runs inside ``inherit_stack_trace`` so the replacement output keeps
    the stack trace of the node it rewrites.
    """
    if node.op == ger:
        with inherit_stack_trace(node.outputs):
            return [ger_destructive(*node.inputs)]
@local_optimizer([gemm_no_inplace])
def local_gemm_to_gemv(node):
    """GEMM acting on row or column matrices -> GEMV."""
    if node.op == gemm_no_inplace:
        z, a, x, y, b = node.inputs
        # Build the replacement under inherit_stack_trace so the new
        # outputs carry the rewritten node's stack trace.
        with inherit_stack_trace(node.outputs):
            if z.broadcastable == x.broadcastable == (True, False):
                # x (and z) is a single row: compute a^T . y with GEMV,
                # then restore the leading broadcastable dimension.
                r = gemv_no_inplace(z.dimshuffle(1), a, y.T, x.dimshuffle(1), b)
                return [r.dimshuffle('x', 0)]
            if z.broadcastable == y.broadcastable == (False, True):
                # y (and z) is a single column: GEMV on x, then restore
                # the trailing broadcastable dimension.
                r = gemv_no_inplace(z.dimshuffle(0), a, x, y.dimshuffle(0), b)
                return [r.dimshuffle(0, 'x')]
@local_optimizer([gemm_no_inplace])
def local_gemm_to_ger(node):
    """GEMM computing an outer product of two vectors -> GER.

    Only fires when ``b`` is the constant 1 or 0; any other value means
    GEMM is doing useful pre-scaling of ``z`` and GER does not apply.
    """
    if node.op == gemm_no_inplace:
        z, a, x, y, b = node.inputs
        if x.broadcastable[1] and y.broadcastable[0]:
            with inherit_stack_trace(node.outputs):
                # x and y are both vectors so this might qualify for a GER
                xv = x.dimshuffle(0)
                yv = y.dimshuffle(1)
                try:
                    bval = T.get_scalar_constant_value(b)
                except T.NotScalarConstantError:
                    # b isn't a constant, GEMM is doing useful pre-scaling
                    return
                if bval == 1:  # best case a natural GER
                    rval = ger(z, a, xv, yv)
                    return [rval]
                elif bval == 0:  # GER on zeros_like should be faster than GEMM
                    zeros = T.zeros([x.shape[0], y.shape[1]], x.dtype)
                    rval = ger(zeros, a, xv, yv)
                    return [rval]
                else:
                    # if bval is another constant, then z is being usefully
                    # pre-scaled and GER isn't really the right tool for the job.
                    return
# TODO: delete this optimization when we have the proper dot->gemm->ger pipeline
......@@ -1708,37 +1711,38 @@ def local_gemm_to_ger(node):
def local_dot22_to_ger_or_gemv(node):
    """dot22 computing an outer-product -> GER.

    Also rewrites dot22 on vector operands into GEMV (or a GEMV-based
    dot) since Theano has no dedicated sdot/ddot Op.
    """
    if node.op == _dot22:
        # Build replacements under inherit_stack_trace so the new
        # outputs carry the rewritten node's stack trace.
        with inherit_stack_trace(node.outputs):
            x, y = node.inputs
            xb = x.broadcastable
            yb = y.broadcastable
            one = T.as_tensor_variable(np.asarray(1, dtype=x.dtype))
            zero = T.as_tensor_variable(np.asarray(0, dtype=x.dtype))
            if xb[1] and yb[0]:
                # x and y are both vectors so this might qualify for a GER
                xv = x.dimshuffle(0)
                yv = y.dimshuffle(1)
                zeros = T.zeros([x.shape[0], y.shape[1]], dtype=x.dtype)
                rval = ger(zeros, one, xv, yv)
                return [rval]
            if xb[0] and yb[1]:
                # x and y are both vectors so this qualifies for a sdot / ddot
                # TODO: Theano doesn't have a sdot, but gemv is better than _dot22
                xv = x.dimshuffle(1)
                zeros = T.AllocEmpty(x.dtype)(1)
                rval = gemv_no_inplace(zeros, one, y.T, xv, zero)
                return [rval.dimshuffle('x', 0)]
            if xb[0] and not yb[0] and not yb[1]:
                # x is vector, y is matrix so try gemv
                xv = x.dimshuffle(1)
                zeros = T.AllocEmpty(x.dtype)(y.shape[1])
                rval = gemv_no_inplace(zeros, one, y.T, xv, zero)
                return [rval.dimshuffle('x', 0)]
            if not xb[0] and not xb[1] and yb[1]:
                # x is matrix, y is vector, try gemv
                yv = y.dimshuffle(0)
                zeros = T.AllocEmpty(x.dtype)(x.shape[0])
                rval = gemv_no_inplace(zeros, one, x, yv, zero)
                return [rval.dimshuffle(0, 'x')]
#################################
......
......@@ -43,6 +43,7 @@ from theano.tensor import DimShuffle, Subtensor
from theano.tensor.opt import register_uncanonicalize
from theano import scalar as scal
from theano.gof.opt import copy_stack_trace
_logger = logging.getLogger('theano.tensor.opt')
......@@ -57,10 +58,13 @@ def local_max_and_argmax(node):
axis = node.op.get_params(node)
if len(node.outputs[1].clients) == 0:
new = CAReduce(scal.maximum, axis)(node.inputs[0])
copy_stack_trace(node.outputs[0], new)
return [new, None]
if len(node.outputs[0].clients) == 0:
return [None, T.Argmax(axis)(node.inputs[0])]
new = T.Argmax(axis)(node.inputs[0])
copy_stack_trace(node.outputs[0], new)
return [None, new]
@register_uncanonicalize
......@@ -84,8 +88,8 @@ def local_max_to_min(node):
max.owner.op.scalar_op == scal.maximum):
neg = max.owner.inputs[0]
if neg.owner and neg.owner.op == T.neg:
return [CAReduce(scal.minimum,
max.owner.op.axis)(neg.owner.inputs[0])]
new = CAReduce(scal.minimum, max.owner.op.axis)(neg.owner.inputs[0])
return [copy_stack_trace(node.outputs[0], new)]
return False
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论