Commit 7a6d676f authored by Frédéric Bastien, committed by GitHub

Merge pull request #5688 from cooijmanstim/gpuarray-stack-trace

gpuarray: keep stack trace
......@@ -4,6 +4,7 @@ Node classes (`Apply`, `Variable`) and expression graph algorithms.
from __future__ import absolute_import, print_function, division
from collections import deque
import contextlib
from copy import copy
from itertools import count
......@@ -390,6 +391,8 @@ class Variable(Node):
self.name = name
self.auto_name = 'auto_' + str(next(self.__count__))
Variable.notify_construction_observers(self)
def __str__(self):
"""Return a str representation of the Variable.
......@@ -536,6 +539,22 @@ class Variable(Node):
d["tag"] = t
return d
# refer to doc in nodes_constructed.
construction_observers = []
@classmethod
def append_construction_observer(cls, observer):
    """Register *observer*, a callable invoked with each new Variable.

    The registered callables are run by notify_construction_observers
    every time a Variable is instantiated.
    """
    cls.construction_observers.append(observer)
@classmethod
def remove_construction_observer(cls, observer):
    """Deregister a previously appended construction observer.

    Raises ValueError (from list.remove) if *observer* was never
    registered.
    """
    cls.construction_observers.remove(observer)
@classmethod
def notify_construction_observers(cls, instance):
    """Call every registered construction observer with *instance*.

    Called at the end of Variable construction so observers see each
    newly created variable node.
    """
    for callback in cls.construction_observers:
        callback(instance)
class Constant(Variable):
"""
......@@ -1426,3 +1445,38 @@ def is_in_ancestors(l_node, f_node):
todo.append(cur)
todo.extend(i.owner for i in cur.inputs if i.owner)
return False
@contextlib.contextmanager
def nodes_constructed():
    """Context manager tracking every Variable created inside its body.

    Used by ``inherit_stack_trace`` to keep track of all the variable
    nodes newly created inside an optimization.  An (initially empty)
    list is yielded and filled in a lazy manner: a local ``observer``
    function is registered on the ``Variable`` class and appends each
    newly constructed variable when
    ``Variable.notify_construction_observers`` is called at
    construction time.

    Yields
    ------
    list
        The list of variable nodes constructed while the context is
        active; it is populated in place as nodes are created.
    """
    new_nodes = []

    def observer(node):
        new_nodes.append(node)

    Variable.append_construction_observer(observer)
    try:
        yield new_nodes
    finally:
        # Always deregister, even when the managed body raises:
        # otherwise the observer would leak on the Variable class and
        # keep recording (and keep new_nodes alive) for every future
        # Variable construction in the process.
        Variable.remove_construction_observer(observer)
......@@ -6,6 +6,7 @@ amount of useful generic optimization tools.
from __future__ import absolute_import, print_function, division
from collections import deque, defaultdict, OrderedDict
import contextlib
import copy
import inspect
import logging
......@@ -2902,7 +2903,7 @@ def pre_greedy_local_optimizer(list_optimizations, out):
def copy_stack_trace(from_var, to_var):
"""
Copies the stack trace from one or more tensor variables to
one or more tensor variables.
one or more tensor variables and returns the destination variables.
Parameters
----------
......@@ -2946,6 +2947,25 @@ def copy_stack_trace(from_var, to_var):
# Copy over stack traces from from_var to each variable to
# to_var, including the stack_trace of the to_var before
to_var.tag.trace = getattr(to_var.tag, 'trace', []) + tr
return to_var
@contextlib.contextmanager
def inherit_stack_trace(from_var):
    """Copy stack traces from ``from_var`` onto nodes built in the body.

    Every variable node constructed while the context is active (as
    recorded by ``graph.nodes_constructed``) receives the stack trace
    of ``from_var`` once the body finishes.

    Parameters
    ----------
    from_var
        Variable node, or list of variable nodes, to copy stack traces
        from.
    """
    with graph.nodes_constructed() as created:
        yield
        copy_stack_trace(from_var, created)
def check_stack_trace(f_or_fgraph, ops_to_check='last', bug_print='raise'):
......
......@@ -15,6 +15,7 @@ from theano.tensor.basic import (
from theano.gof import HideC, COp, ParamsType
from theano.gof.utils import MethodNotDefined
from theano.gof.opt import copy_stack_trace
from collections import deque
......@@ -75,11 +76,11 @@ def as_gpuarray_variable(x, context_name):
# If we couldn't deal with transfers, then maybe it's a tensor
if isinstance(x.type, tensor.TensorType):
return GpuFromHost(context_name)(x)
return copy_stack_trace(x, GpuFromHost(context_name)(x))
# Try _as_GpuArrayVariable if possible
if hasattr(x, '_as_GpuArrayVariable'):
return x._as_GpuArrayVariable(context_name)
return copy_stack_trace(x, x._as_GpuArrayVariable(context_name))
# If it didn't work try for a constant
ctx = get_context(context_name)
......
......@@ -18,6 +18,7 @@ from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp, ParamsType, EnumList
from theano.gof.cmodule import GCC_compiler
from theano.gof.type import CDataType, Generic
from theano.gof.opt import inherit_stack_trace
from theano.compile import optdb
from theano.compile.ops import shape_i, shape_i_op
from theano.tensor.nnet import LogSoftmax, SoftmaxGrad
......@@ -3127,8 +3128,10 @@ def local_abstractconv_cudnn(node):
if node.op.unshared:
return None
if isinstance(node.op, AbstractConv2d):
with inherit_stack_trace(node.outputs):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d):
with inherit_stack_trace(node.outputs):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
......@@ -3352,8 +3355,10 @@ def local_abstractconv_gw_cudnn(node):
if node.op.unshared:
return None
if isinstance(node.op, AbstractConv2d_gradWeights):
with inherit_stack_trace(node.outputs):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d_gradWeights):
with inherit_stack_trace(node.outputs):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
......@@ -3365,8 +3370,10 @@ def local_abstractconv_gi_cudnn(node):
if node.op.unshared:
return None
if isinstance(node.op, AbstractConv2d_gradInputs):
with inherit_stack_trace(node.outputs):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d_gradInputs):
with inherit_stack_trace(node.outputs):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
......@@ -3384,7 +3391,6 @@ def local_dnn_convgw_inplace(node, inputs):
def local_dnn_convgi_inplace(node, inputs):
return [GpuDnnConvGradI(algo=node.op.algo, inplace=True, num_groups=node.op.num_groups)(*inputs)]
optdb.register('local_dnna_conv_inplace',
tensor.opt.in2out(local_dnn_conv_inplace,
local_dnn_convgw_inplace,
......@@ -3654,6 +3660,7 @@ def local_dnn_reduction(node):
if not cudnn.cudnnReduceTensorOp_t.has_alias(node.op.scalar_op.name):
return
with inherit_stack_trace(node.outputs):
return (GpuDnnReduction(node.op.scalar_op.name,
node.op.axis,
node.op.acc_dtype,
......
......@@ -15,7 +15,8 @@ from theano.compile.ops import shape_i
from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
LocalGroupDB,
SequenceDB, Optimizer, DB, toolbox, graph)
from theano.gof.opt import LocalMetaOptimizer
from theano.gof.opt import (LocalMetaOptimizer, copy_stack_trace,
inherit_stack_trace)
from theano.ifelse import IfElse
from theano.misc.ordered_set import OrderedSet
......@@ -252,12 +253,25 @@ def op_lifter(OP, cuda_only=False):
# This is needed as sometimes new_op inherits from OP.
if new_op and new_op != node.op:
if isinstance(new_op, theano.Op):
return [safe_to_cpu(o) for o in
new_op(*node.inputs, return_list=True)]
new_outputs = new_op(*node.inputs, return_list=True)
to_cpu_fn = safe_to_cpu
elif isinstance(new_op, (tuple, list)):
return [safe_to_cpu(o) for o in new_op]
new_outputs = new_op
to_cpu_fn = safe_to_cpu
else: # suppose it is a variable on the GPU
return [new_op.transfer('cpu')]
new_outputs = [new_op]
def to_cpu_fn(x):
return x.transfer('cpu')
# copy stack traces onto gpu outputs
# also copy the stack traces onto HostFromGpu outputs
on_cpu = []
for old_output, new_output in zip(node.outputs, new_outputs):
copy_stack_trace(old_output, new_output)
cpu = to_cpu_fn(new_output)
on_cpu.append(cpu)
copy_stack_trace(old_output, cpu)
return on_cpu
return False
local_opt.__name__ = maker.__name__
return local_optimizer(OP)(local_opt)
......@@ -419,6 +433,9 @@ class GraphToGPU(Optimizer):
elif isinstance(new_ops, theano.Variable):
outputs = [new_ops]
for old_output, new_output in zip(node.outputs, outputs):
copy_stack_trace(old_output, new_output)
if new_ops:
node_created[lopt] += len(graph.ops([mapping[i] for i in node.inputs], outputs))
if any([getattr(old_o, 'dtype', None) != getattr(new_o, 'dtype', None)
......@@ -451,7 +468,7 @@ class GraphToGPU(Optimizer):
new_o.owner.inputs[0].type == o.type):
new_o = new_o.owner.inputs[0]
else:
new_o = safe_to_cpu(new_o)
new_o = copy_stack_trace(o, safe_to_cpu(new_o))
new_nodes.append(new_o)
fgraph.replace_all_validate(zip(fgraph.outputs, new_nodes),
reason=self.__class__.__name__)
......@@ -650,7 +667,8 @@ def local_gpualloc_memset_0(node):
inp.data.size == 1 and
(np.asarray(inp.data) == 0).all()):
new_op = GpuAlloc(node.op.context_name, memset_0=True)
return [new_op(*node.inputs)]
with inherit_stack_trace(node.outputs):
return new_op(*node.inputs, return_list=True)
# Don't register by default.
......@@ -659,10 +677,9 @@ def local_gpua_alloc_empty_to_zeros(node):
if isinstance(node.op, GpuAllocEmpty):
context_name = infer_context_name(*node.inputs)
z = np.asarray(0, dtype=node.outputs[0].dtype)
return [GpuAlloc(context_name)(as_gpuarray_variable(z, context_name),
*node.inputs)]
with inherit_stack_trace(node.outputs):
return [GpuAlloc(context_name)(
as_gpuarray_variable(z, context_name), *node.inputs)]
optdb.register('local_gpua_alloc_empty_to_zeros',
theano.tensor.opt.in2out(local_gpua_alloc_empty_to_zeros),
# After move to gpu and merge2, before inplace.
......@@ -1206,6 +1223,7 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
op.scalar_op, axis=op.axis,
dtype=odtype,
acc_dtype=adtype)
with inherit_stack_trace(outputs):
gvar = greduce(x)
# We need to have the make node called, otherwise the mask can
# be None
......@@ -1246,11 +1264,12 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
dtype=getattr(op, 'dtype', outputs[0].dtype),
acc_dtype=getattr(op, 'acc_dtype', None))
with inherit_stack_trace(outputs):
reshaped_x = x.reshape(tensor.stack(new_in_shp))
gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name)
gvar = greduce(gpu_reshaped_x)
# We need to have the make node called, otherwise the mask can
# be None
gvar = greduce(gpu_reshaped_x)
reshaped_gpu_inputs = [gpu_reshaped_x]
if greduce.supports_c_code(reshaped_gpu_inputs):
reduce_reshaped_x = greduce(gpu_reshaped_x)
......@@ -1260,7 +1279,8 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
for i in range(x.ndim):
if i not in op.axis:
out_shp.append(shape_i(x, i))
unreshaped_reduce = GpuReshape(len(out_shp))(reduce_reshaped_x,
unreshaped_reduce = GpuReshape(len(out_shp))(
reduce_reshaped_x,
tensor.stack(out_shp))
else:
unreshaped_reduce = reduce_reshaped_x
......@@ -1305,6 +1325,7 @@ def local_gpua_gemm(op, context_name, inputs, outputs):
def local_gpua_gemmbatch(op, context_name, inputs, outputs):
if inputs[0].dtype not in ['float16', 'float32', 'float64']:
return
with inherit_stack_trace(outputs):
a, b = inputs
# Since GpuGemmBatch only supports 3D inputs and output,
# we need to add broadcastable dims to the inputs, and drop
......@@ -1378,6 +1399,7 @@ def local_gpua_dot22(op, context_name, inputs, outputs):
@op_lifter([tensor.blas.Dot22Scalar])
@register_opt2([tensor.blas.Dot22Scalar], 'fast_compile')
def local_gpua_dot22scalar(op, context_name, inputs, outputs):
with inherit_stack_trace(outputs):
x, y, a = inputs
x = as_gpuarray_variable(x, context_name)
y = as_gpuarray_variable(y, context_name)
......@@ -2392,6 +2414,7 @@ def local_gpu_elemwise_careduce(node):
props = node.op._props_dict()
props["pre_scalar_op"] = scalar.basic.sqr
out = GpuCAReduceCuda(**props)(inp)
with inherit_stack_trace(node.outputs):
return [out]
......@@ -2583,6 +2606,7 @@ def local_gpu_solve(op, context_name, inputs, outputs):
@local_optimizer([GpuCusolverSolve], inplace=True)
def local_inplace_gpu_solve(node):
if isinstance(node.op, GpuCusolverSolve) and not node.op.inplace:
with inherit_stack_trace(node.outputs):
return [GpuCusolverSolve(A_structure=node.op.A_structure, trans=node.op.trans,
inplace=True)(*node.inputs)]
......@@ -2622,6 +2646,7 @@ register_opt2([slinalg.Solve], 'fast_compile', name='matrix_ops_db2')(matrix_ops
@local_optimizer([GpuCholesky], inplace=True)
def local_inplace_gpu_cholesky(node):
if isinstance(node.op, GpuCholesky) and not node.op.inplace:
with inherit_stack_trace(node.outputs):
return [node.op.clone_inplace()(*node.inputs)]
......@@ -2705,6 +2730,7 @@ def local_gpu_magma_matrix_inverse(op, context_name, inputs, outputs):
@local_optimizer([GpuMagmaMatrixInverse])
def local_inplace_gpu_magma_matrix_inverse(node):
if isinstance(node.op, GpuMagmaMatrixInverse) and not node.op.inplace:
with inherit_stack_trace(node.outputs):
return [node.op.clone_inplace()(*node.inputs)]
......
......@@ -5,6 +5,7 @@ import numpy as np
from theano import tensor, scalar as scal, Constant
from theano.gof import local_optimizer
from theano.gof.opt import inherit_stack_trace
from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError)
......@@ -184,6 +185,7 @@ def alpha_merge(cls, alpha_in, beta_in):
except NotScalarConstantError:
inputs[alpha_in] = lr * targ.inputs[alpha_in]
inputs[beta_in] = lr * targ.inputs[beta_in]
with inherit_stack_trace(node.outputs):
return maker(targ, *inputs)
return opt
return wrapper
......@@ -272,6 +274,7 @@ def output_merge(cls, alpha_in, beta_in, out_in):
inputs = list(targ.inputs)
inputs[out_in] = W
inputs[beta_in] = _one.clone()
with inherit_stack_trace(node.outputs):
return maker(targ, *inputs)
return opt
return wrapper
......@@ -326,6 +329,7 @@ def inplace_allocempty(op, idx):
len(alloc.clients) > 1):
alloc_op = GpuAllocEmpty(alloc.owner.op.dtype, alloc.owner.op.context_name)
inputs[idx] = alloc_op(*alloc.owner.inputs)
with inherit_stack_trace(node.outputs):
return maker(node, inputs)
return opt
return wrapper
......
......@@ -146,6 +146,7 @@ from theano.gof import (utils, Op, view_roots,
EquilibriumOptimizer, Apply,
ReplacementDidntRemovedError)
from theano.gof.params_type import ParamsType
from theano.gof.opt import inherit_stack_trace
from theano.printing import pprint, FunctionPrinter, debugprint
from theano.compile.mode import optdb
import theano.scalar
......@@ -1625,17 +1626,14 @@ def local_dot_to_dot22(node):
return
if y.type.dtype in ['float16', 'float32', 'float64', 'complex64', 'complex128']:
with inherit_stack_trace(node.outputs):
if x.ndim == 2 and y.ndim == 2:
# print "local_dot_to_dot22: MM"
return [_dot22(*node.inputs)]
if x.ndim == 2 and y.ndim == 1:
# print "local_dot_to_dot22: MV"
return [_dot22(x, y.dimshuffle(0, 'x')).dimshuffle(0)]
if x.ndim == 1 and y.ndim == 2:
# print "local_dot_to_dot22: VM"
return [_dot22(x.dimshuffle('x', 0), y).dimshuffle(1)]
if x.ndim == 1 and y.ndim == 1:
# print "local_dot_to_dot22: VV"
return [_dot22(x.dimshuffle('x', 0),
y.dimshuffle(0, 'x')).dimshuffle()]
......@@ -1646,18 +1644,21 @@ def local_dot_to_dot22(node):
@local_optimizer([gemm_no_inplace], inplace=True)
def local_inplace_gemm(node):
    """Swap a non-destructive gemm node for its inplace counterpart.

    The replacement output inherits the stack trace of the node it
    replaces.
    """
    if node.op == gemm_no_inplace:
        with inherit_stack_trace(node.outputs):
            new_out = gemm_inplace(*node.inputs)
            return [new_out]
@local_optimizer([gemv_no_inplace], inplace=True)
def local_inplace_gemv(node):
    """Swap a non-destructive gemv node for its inplace counterpart.

    The replacement output inherits the stack trace of the node it
    replaces.
    """
    if node.op == gemv_no_inplace:
        with inherit_stack_trace(node.outputs):
            new_out = gemv_inplace(*node.inputs)
            return [new_out]
@local_optimizer([ger], inplace=True)
def local_inplace_ger(node):
    """Swap a ger node for its destructive (inplace) counterpart.

    The replacement output inherits the stack trace of the node it
    replaces.
    """
    if node.op == ger:
        with inherit_stack_trace(node.outputs):
            new_out = ger_destructive(*node.inputs)
            return [new_out]
......@@ -1666,6 +1667,7 @@ def local_gemm_to_gemv(node):
"""GEMM acting on row or column matrices -> GEMV."""
if node.op == gemm_no_inplace:
z, a, x, y, b = node.inputs
with inherit_stack_trace(node.outputs):
if z.broadcastable == x.broadcastable == (True, False):
r = gemv_no_inplace(z.dimshuffle(1), a, y.T, x.dimshuffle(1), b)
return [r.dimshuffle('x', 0)]
......@@ -1680,6 +1682,7 @@ def local_gemm_to_ger(node):
if node.op == gemm_no_inplace:
z, a, x, y, b = node.inputs
if x.broadcastable[1] and y.broadcastable[0]:
with inherit_stack_trace(node.outputs):
# x and y are both vectors so this might qualifies for a GER
xv = x.dimshuffle(0)
yv = y.dimshuffle(1)
......@@ -1708,6 +1711,7 @@ def local_gemm_to_ger(node):
def local_dot22_to_ger_or_gemv(node):
"""dot22 computing an outer-product -> GER."""
if node.op == _dot22:
with inherit_stack_trace(node.outputs):
x, y = node.inputs
xb = x.broadcastable
yb = y.broadcastable
......
......@@ -43,6 +43,7 @@ from theano.tensor import DimShuffle, Subtensor
from theano.tensor.opt import register_uncanonicalize
from theano import scalar as scal
from theano.gof.opt import copy_stack_trace
_logger = logging.getLogger('theano.tensor.opt')
......@@ -57,10 +58,13 @@ def local_max_and_argmax(node):
axis = node.op.get_params(node)
if len(node.outputs[1].clients) == 0:
new = CAReduce(scal.maximum, axis)(node.inputs[0])
copy_stack_trace(node.outputs[0], new)
return [new, None]
if len(node.outputs[0].clients) == 0:
return [None, T.Argmax(axis)(node.inputs[0])]
new = T.Argmax(axis)(node.inputs[0])
copy_stack_trace(node.outputs[0], new)
return [None, new]
@register_uncanonicalize
......@@ -84,8 +88,8 @@ def local_max_to_min(node):
max.owner.op.scalar_op == scal.maximum):
neg = max.owner.inputs[0]
if neg.owner and neg.owner.op == T.neg:
return [CAReduce(scal.minimum,
max.owner.op.axis)(neg.owner.inputs[0])]
new = CAReduce(scal.minimum, max.owner.op.axis)(neg.owner.inputs[0])
return [copy_stack_trace(node.outputs[0], new)]
return False
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment