提交 96cbce45 authored 作者: Tim Cooijmans's avatar Tim Cooijmans 提交者: Reyhane Askari

more stack trace copying

上级 1b101ffc
...@@ -1323,33 +1323,34 @@ def local_gpua_gemm(op, context_name, inputs, outputs): ...@@ -1323,33 +1323,34 @@ def local_gpua_gemm(op, context_name, inputs, outputs):
def local_gpua_gemmbatch(op, context_name, inputs, outputs):
    """Lift a batched-gemm graph onto the GPU as a GpuGemmBatch apply.

    Parameters
    ----------
    op : Op
        The host op being replaced (unused directly here).
    context_name : str or None
        GPU context the new allocation should live in.
    inputs : list of Variable
        The two matmul operands ``a`` and ``b`` (2D or 3D).
    outputs : list of Variable
        Outputs of the original apply; used for dtype and for
        propagating the user-visible stack trace.

    Returns
    -------
    Variable or None
        The GPU result variable, or None (no replacement) for
        unsupported dtypes.
    """
    # GpuGemmBatch only handles real float dtypes; bail out otherwise.
    if inputs[0].dtype not in ['float16', 'float32', 'float64']:
        return
    # Copy the original outputs' stack traces onto every variable
    # created inside this block, so errors still point at user code.
    with inherit_stack_trace(outputs):
        a, b = inputs
        # Since GpuGemmBatch only supports 3D inputs and output,
        # we need to add broadcastable dims to the inputs, and drop
        # them from outputs
        output_dims = [0, 1, 2]
        if a.ndim == 2:
            a = GpuDimShuffle(a.broadcastable, (0, 'x', 1))(a)
            del output_dims[1]
        if b.ndim == 2:
            b = GpuDimShuffle(b.broadcastable, (0, 1, 'x'))(b)
            del output_dims[-1]
        # In case of mismatched dtypes, we also have to upcast
        out_dtype = outputs[0].dtype
        if a.dtype != out_dtype or b.dtype != out_dtype:
            gpu_cast_op = GpuElemwise(Cast(Scalar(out_dtype)))
            if a.dtype != out_dtype:
                a = gpu_cast_op(a)
            if b.dtype != out_dtype:
                b = gpu_cast_op(b)

        # Pre-allocate the (batch, rows, cols) output buffer, then run
        # out = 1.0 * a @ b + 0.0 * c via the non-inplace batched gemm.
        c = GpuAllocEmpty(out_dtype, context_name)(
            a.shape[0], a.shape[1], b.shape[2])
        out = gpugemmbatch_no_inplace(c, np.asarray(1.0, dtype=out_dtype),
                                      a, b, np.asarray(0.0, dtype=out_dtype))
        # Drop the broadcastable dims we inserted above so the result
        # matches the original output's ndim.
        if len(output_dims) != 3:
            out = GpuDimShuffle(out.broadcastable, output_dims)(out)
        return out
@register_opt() @register_opt()
...@@ -2599,8 +2600,9 @@ def local_gpu_solve(op, context_name, inputs, outputs): ...@@ -2599,8 +2600,9 @@ def local_gpu_solve(op, context_name, inputs, outputs):
@local_optimizer([GpuCusolverSolve], inplace=True)
def local_inplace_gpu_solve(node):
    """Swap a non-inplace GpuCusolverSolve for its inplace variant.

    Rebuilds the op with ``inplace=True`` (keeping ``A_structure`` and
    ``trans``) and re-applies it to the same inputs.  Returns the new
    output list, or None implicitly when the node does not match.
    """
    if isinstance(node.op, GpuCusolverSolve) and not node.op.inplace:
        # Propagate the original outputs' stack traces to the
        # replacement so debugging info survives the optimization.
        with inherit_stack_trace(node.outputs):
            return [GpuCusolverSolve(A_structure=node.op.A_structure,
                                     trans=node.op.trans,
                                     inplace=True)(*node.inputs)]
# Cholesky decomposition # Cholesky decomposition
...@@ -2638,7 +2640,8 @@ register_opt2([slinalg.Solve], 'fast_compile', name='matrix_ops_db2')(matrix_ops ...@@ -2638,7 +2640,8 @@ register_opt2([slinalg.Solve], 'fast_compile', name='matrix_ops_db2')(matrix_ops
@local_optimizer([GpuCholesky], inplace=True)
def local_inplace_gpu_cholesky(node):
    """Swap a non-inplace GpuCholesky for its inplace clone.

    Uses ``clone_inplace()`` so every other op property is preserved.
    Returns the replacement output list, or None implicitly when the
    node does not match.
    """
    if isinstance(node.op, GpuCholesky) and not node.op.inplace:
        # Keep the user-visible stack trace on the replacement output.
        with inherit_stack_trace(node.outputs):
            return [node.op.clone_inplace()(*node.inputs)]
def local_gpu_magma_cholesky(op, context_name, inputs, outputs): def local_gpu_magma_cholesky(op, context_name, inputs, outputs):
...@@ -2721,7 +2724,8 @@ def local_gpu_magma_matrix_inverse(op, context_name, inputs, outputs): ...@@ -2721,7 +2724,8 @@ def local_gpu_magma_matrix_inverse(op, context_name, inputs, outputs):
@local_optimizer([GpuMagmaMatrixInverse])
def local_inplace_gpu_magma_matrix_inverse(node):
    """Swap a non-inplace GpuMagmaMatrixInverse for its inplace clone.

    Mirrors ``local_inplace_gpu_cholesky``: clone the op with
    ``inplace=True`` and re-apply it to the same inputs.  Returns the
    replacement output list, or None implicitly when the node does not
    match.
    """
    if isinstance(node.op, GpuMagmaMatrixInverse) and not node.op.inplace:
        # Keep the user-visible stack trace on the replacement output.
        with inherit_stack_trace(node.outputs):
            return [node.op.clone_inplace()(*node.inputs)]
# Eigen decomposition of a symmetric matrix # Eigen decomposition of a symmetric matrix
......
...@@ -41,6 +41,7 @@ def _check_stack_trace(thing): ...@@ -41,6 +41,7 @@ def _check_stack_trace(thing):
theano.ifelse.IfElse, theano.ifelse.IfElse,
GpuFromHost, HostFromGpu, GpuFromHost, HostFromGpu,
GpuCAReduceCuda, GpuCAReduceCuda,
basic_ops.GpuContiguous,
GpuElemwise, GpuElemwise,
theano.printing.Print, theano.printing.Print,
PdbBreakpoint, PdbBreakpoint,
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论