Commit ff9e2b38 authored by Tim Cooijmans, committed by Reyhane Askari

more stack trace copying

Parent 39269c13
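Throughout the diff, local optimizers that used to copy stack traces by hand (copy_stack_trace, with_stack_trace) are switched to the inherit_stack_trace context manager from theano.gof.opt. A minimal sketch of the before/after pattern, assuming inherit_stack_trace copies the stack trace of the variables passed to it onto every variable constructed inside its block; the rewrite_* helpers below are hypothetical and only illustrate the shape of the change:

    # Hypothetical helpers illustrating the pattern; assumes inherit_stack_trace
    # tags every variable created inside the block with the trace of the given outputs.
    from theano.gof.opt import copy_stack_trace, inherit_stack_trace

    def rewrite_old_style(node, new_op):
        # Before: build the replacement, then copy the trace explicitly.
        new_output = new_op(*node.inputs)
        copy_stack_trace(node.outputs[0], new_output)
        return [new_output]

    def rewrite_new_style(node, new_op):
        # After: anything constructed in this block inherits node.outputs' trace.
        with inherit_stack_trace(node.outputs):
            return [new_op(*node.inputs)]

The optimizer's return value is unchanged; only the tag.trace bookkeeping moves into the context manager.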
@@ -89,13 +89,13 @@ def as_gpuarray_variable(x, context_name):
         if x.context.ptr != ctx.ptr:
             x = x.transfer(ctx)
-        x = with_stack_trace(x, gpuarray.asarray(x, context=ctx))
+        x = gpuarray.asarray(x, context=ctx)
     bcast = [(s == 1) for s in x.shape]
-    return with_stack_trace(x, GpuArrayConstant(GpuArrayType(dtype=x.dtype,
+    return GpuArrayConstant(GpuArrayType(dtype=x.dtype,
                                          broadcastable=bcast,
                                          context_name=context_name),
-                                                x))
+                            x)


 def infer_context_name(*vars):
...
@@ -7,6 +7,7 @@ from theano import Apply, Op
 from theano.compile import optdb
 from theano.gof import LocalOptGroup, ParamsType
 from theano.scalar import bool as bool_t
+from theano.gof.opt import inherit_stack_trace
 from theano.tensor.basic import as_tensor_variable
 from theano.tensor.opt import in2out

@@ -1830,16 +1831,19 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
 @inplace_allocempty(GpuGemv, 0)
 def local_inplace_gpuagemv(node, inputs):
+    with inherit_stack_trace(node.outputs):
         return [gpugemv_inplace(*inputs)]

 @inplace_allocempty(GpuGemm, 0)
 def local_inplace_gpuagemm(node, inputs):
+    with inherit_stack_trace(node.outputs):
         return [gpugemm_inplace(*inputs)]

 @inplace_allocempty(GpuGer, 0)
 def local_inplace_gpuager(node, inputs):
+    with inherit_stack_trace(node.outputs):
         return [gpuger_inplace(*inputs)]
...
@@ -18,6 +18,7 @@ from theano.gradient import DisconnectedType, grad_not_implemented
 from theano.gof import Optimizer, local_optimizer, COp, ParamsType, EnumList
 from theano.gof.cmodule import GCC_compiler
 from theano.gof.type import CDataType, Generic
+from theano.gof.opt import inherit_stack_trace
 from theano.compile import optdb
 from theano.compile.ops import shape_i, shape_i_op
 from theano.tensor.nnet import LogSoftmax, SoftmaxGrad

@@ -3132,6 +3133,7 @@ def local_abstractconv_cudnn(node):
     ctx = infer_context_name(*node.inputs)
     if not isinstance(node.inputs[0].type, GpuArrayType):
         return
+    with inherit_stack_trace(node.outputs):
         if node.op.unshared:
             return None
         if isinstance(node.op, AbstractConv2d):

@@ -3356,6 +3358,7 @@ def local_abstractconv_gw_cudnn(node):
     ctx = infer_context_name(*node.inputs)
     if not isinstance(node.inputs[0].type, GpuArrayType):
         return
+    with inherit_stack_trace(node.outputs):
         if node.op.unshared:
             return None
         if isinstance(node.op, AbstractConv2d_gradWeights):

@@ -3369,6 +3372,7 @@ def local_abstractconv_gi_cudnn(node):
     ctx = infer_context_name(*node.inputs)
     if not isinstance(node.inputs[0].type, GpuArrayType):
         return
+    with inherit_stack_trace(node.outputs):
         if node.op.unshared:
             return None
         if isinstance(node.op, AbstractConv2d_gradInputs):

@@ -3379,19 +3383,21 @@ def local_abstractconv_gi_cudnn(node):
 @inplace_allocempty(GpuDnnConv, 2)
 def local_dnn_conv_inplace(node, inputs):
+    with inherit_stack_trace(node.outputs):
         return [GpuDnnConv(algo=node.op.algo, inplace=True, num_groups=node.op.num_groups)(*inputs)]

 @inplace_allocempty(GpuDnnConvGradW, 2)
 def local_dnn_convgw_inplace(node, inputs):
+    with inherit_stack_trace(node.outputs):
         return [GpuDnnConvGradW(algo=node.op.algo, inplace=True, num_groups=node.op.num_groups)(*inputs)]

 @inplace_allocempty(GpuDnnConvGradI, 2)
 def local_dnn_convgi_inplace(node, inputs):
+    with inherit_stack_trace(node.outputs):
         return [GpuDnnConvGradI(algo=node.op.algo, inplace=True, num_groups=node.op.num_groups)(*inputs)]

 optdb.register('local_dnna_conv_inplace',
                tensor.opt.in2out(local_dnn_conv_inplace,
                                  local_dnn_convgw_inplace,

@@ -3403,24 +3409,25 @@ optdb.register('local_dnna_conv_inplace',
 @register_opt('cudnn')
 @alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
 def local_dnn_conv_alpha_merge(node, *inputs):
+    with inherit_stack_trace(node.outputs):
         return [GpuDnnConv(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)]

 @register_opt('cudnn')
 @alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
 def local_dnn_convw_alpha_merge(node, *inputs):
+    with inherit_stack_trace(node.outputs):
         return [GpuDnnConvGradW(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)]

 @register_opt('cudnn')
 @alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5)
 def local_dnn_convi_alpha_merge(node, *inputs):
+    with inherit_stack_trace(node.outputs):
         return [GpuDnnConvGradI(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)]

 @register_opt('cudnn')
 @output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2)
 def local_dnn_conv_output_merge(node, *inputs):
+    with inherit_stack_trace(node.outputs):
         inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
         return [GpuDnnConv(algo=node.op.algo)(*inputs)]

@@ -3428,6 +3435,7 @@ def local_dnn_conv_output_merge(node, *inputs):
 @register_opt('cudnn')
 @output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2)
 def local_dnn_convw_output_merge(node, *inputs):
+    with inherit_stack_trace(node.outputs):
         inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
         return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]

@@ -3435,6 +3443,7 @@ def local_dnn_convw_output_merge(node, *inputs):
 @register_opt('cudnn')
 @output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2)
 def local_dnn_convi_output_merge(node, *inputs):
+    with inherit_stack_trace(node.outputs):
         inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
         return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
...
@@ -15,7 +15,8 @@ from theano.compile.ops import shape_i
 from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
                         LocalGroupDB,
                         SequenceDB, Optimizer, DB, toolbox, graph)
-from theano.gof.opt import LocalMetaOptimizer, copy_stack_trace, with_stack_trace
+from theano.gof.opt import (LocalMetaOptimizer, copy_stack_trace,
+                            with_stack_trace, inherit_stack_trace)
 from theano.ifelse import IfElse
 from theano.misc.ordered_set import OrderedSet

@@ -421,8 +422,6 @@ class GraphToGPU(Optimizer):
             if isinstance(new_ops, theano.Op):
                 outputs = new_ops(*[mapping[i] for i in node.inputs], return_list=True)
-                for old_output, new_output in zip(node.outputs, outputs):
-                    copy_stack_trace(old_output, new_output)
             elif not new_ops:
                 newnode = node.clone_with_new_inputs([mapping.get(i) for i in node.inputs])
                 outputs = newnode.outputs

@@ -431,6 +430,10 @@ class GraphToGPU(Optimizer):
             elif isinstance(new_ops, theano.Variable):
                 outputs = [new_ops]

+            for old_output, new_output in zip(node.outputs, outputs):
+                copy_stack_trace(old_output, new_output)
+                new_output.tag.tracefrom = old_output
+
             if new_ops:
                 node_created[lopt] += len(graph.ops([mapping[i] for i in node.inputs], outputs))
                 if any([getattr(old_o, 'dtype', None) != getattr(new_o, 'dtype', None)
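The copy loop now runs after every branch that can produce outputs (new Op, cloned node, or plain Variable) instead of only the Op branch, and it also records a tag.tracefrom back-reference to the variable the trace came from. A hedged illustration of what copy_stack_trace itself does, assuming variables keep their creation traceback in var.tag.trace as theano.gof.opt uses:

    # Hedged illustration; assumes traces live in var.tag.trace and that
    # copy_stack_trace appends the source variable's trace to the destination's.
    import theano.tensor as T
    from theano.gof.opt import copy_stack_trace

    old_out = T.vector('old_out')       # tag.trace records this creation site
    new_out = old_out * 2               # stand-in for an optimizer's replacement
    copy_stack_trace(old_out, new_out)  # new_out now also reports old_out's origin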
@@ -662,9 +665,8 @@ def local_gpualloc_memset_0(node):
                 inp.data.size == 1 and
                 (np.asarray(inp.data) == 0).all()):
             new_op = GpuAlloc(node.op.context_name, memset_0=True)
-            new_output = new_op(*node.inputs)
-            copy_stack_trace(node.outputs[0], new_output)
-            return [new_output]
+            with inherit_stack_trace(node.outputs):
+                return new_op(*node.inputs, return_list=True)

 # Don't register by default.

@@ -673,10 +675,9 @@ def local_gpua_alloc_empty_to_zeros(node):
     if isinstance(node.op, GpuAllocEmpty):
         context_name = infer_context_name(*node.inputs)
         z = np.asarray(0, dtype=node.outputs[0].dtype)
+        with inherit_stack_trace(node.outputs):
             return [GpuAlloc(context_name)(as_gpuarray_variable(z, context_name),
                                            *node.inputs)]

 optdb.register('local_gpua_alloc_empty_to_zeros',
                theano.tensor.opt.in2out(local_gpua_alloc_empty_to_zeros),
                # After move to gpu and merge2, before inplace.

@@ -1220,7 +1221,8 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
                          op.scalar_op, axis=op.axis,
                          dtype=odtype,
                          acc_dtype=adtype)
-        gvar = with_stack_trace(outputs, greduce(x))
+        with inherit_stack_trace(outputs):
+            gvar = greduce(x)
             # We need to have the make node called, otherwise the mask can
             # be None
             if (op2 is GpuCAReduceCPY or

@@ -1260,27 +1262,24 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
                      dtype=getattr(op, 'dtype', outputs[0].dtype),
                      acc_dtype=getattr(op, 'acc_dtype', None))
-        reshaped_x = with_stack_trace(
-            outputs, x.reshape(tensor.stack(new_in_shp)))
-        gpu_reshaped_x = with_stack_trace(
-            outputs, as_gpuarray_variable(reshaped_x, context_name))
-        gvar = with_stack_trace(outputs, greduce(gpu_reshaped_x))
+        with inherit_stack_trace(outputs):
+            reshaped_x = x.reshape(tensor.stack(new_in_shp))
+            gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name)
             # We need to have the make node called, otherwise the mask can
             # be None
+            gvar = greduce(gpu_reshaped_x)
             reshaped_gpu_inputs = [gpu_reshaped_x]
             if greduce.supports_c_code(reshaped_gpu_inputs):
-                reduce_reshaped_x = with_stack_trace(
-                    outputs, greduce(gpu_reshaped_x))
+                reduce_reshaped_x = greduce(gpu_reshaped_x)
                 if reduce_reshaped_x.ndim != outputs[0].ndim:
                     out_shp = []
                     for i in range(x.ndim):
                         if i not in op.axis:
                             out_shp.append(shape_i(x, i))
-                unreshaped_reduce = with_stack_trace(
-                    outputs, GpuReshape(len(out_shp))(
+                    unreshaped_reduce = GpuReshape(len(out_shp))(
                         reduce_reshaped_x,
-                        tensor.stack(out_shp)))
+                        tensor.stack(out_shp))
                 else:
                     unreshaped_reduce = reduce_reshaped_x
                 return [unreshaped_reduce]

@@ -1356,24 +1355,28 @@ def local_gpua_gemmbatch(op, context_name, inputs, outputs):
 @register_opt()
 @alpha_merge(GpuGemm, alpha_in=1, beta_in=4)
 def local_gpua_gemm_alpha_merge(node, *inputs):
+    with inherit_stack_trace(node.outputs):
         return [gpugemm_no_inplace(*inputs)]

 @register_opt()
 @output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0)
 def local_gpua_gemm_output_merge(node, *inputs):
+    with inherit_stack_trace(node.outputs):
         return [gpugemm_no_inplace(*inputs)]

 @register_opt()
 @alpha_merge(GpuGemmBatch, alpha_in=1, beta_in=4)
 def local_gpua_gemmbatch_alpha_merge(node, *inputs):
+    with inherit_stack_trace(node.outputs):
         return [gpugemmbatch_no_inplace(*inputs)]

 @register_opt()
 @output_merge(GpuGemmBatch, alpha_in=1, beta_in=4, out_in=0)
 def local_gpua_gemmbatch_output_merge(node, *inputs):
+    with inherit_stack_trace(node.outputs):
         return [gpugemmbatch_no_inplace(*inputs)]

@@ -2403,8 +2406,8 @@ def local_gpu_elemwise_careduce(node):
         props = node.op._props_dict()
         props["pre_scalar_op"] = scalar.basic.sqr
         out = GpuCAReduceCuda(**props)(inp)
-        return with_stack_trace(
-            node.outputs, out)
+        with inherit_stack_trace(node.outputs):
+            return out

 @local_optimizer(None)
...
@@ -33,10 +33,20 @@ def _check_stack_trace(thing):
         if not isinstance(op, theano.gof.Op):
             op = op.op  # assume node
         return not isinstance(op, (theano.compile.ops.Shape_i,
+                                   theano.compile.ops.Shape,
+                                   theano.compile.ops.DeepCopyOp,
+                                   theano.tensor.opt.MakeVector,
+                                   theano.tensor.subtensor.Subtensor,
+                                   theano.tensor.elemwise.Elemwise,
                                    theano.ifelse.IfElse,
                                    GpuFromHost, HostFromGpu,
-                                   GpuElemwise))
-    return check_stack_trace(thing, ops_to_check=_ops_to_check)
+                                   GpuCAReduceCuda,
+                                   GpuElemwise,
+                                   theano.printing.Print,
+                                   PdbBreakpoint,
+                                   ))
+    return check_stack_trace(thing, ops_to_check=_ops_to_check,
+                             bug_print="ignore")

 def test_local_assert():
     x = theano.tensor.fmatrix()
...
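The _ops_to_check predicate above exempts the listed op types from the check (their outputs may lack a trace) and requires a trace on everything else. A hedged usage sketch, assuming check_stack_trace accepts a compiled function plus a callable ops_to_check (as the call above does), that bug_print="ignore" suppresses per-op error reporting so the helper just returns a boolean, and that mode_with_gpu is the GPU test mode these tests already import:

    # Hedged usage sketch; mode_with_gpu and _check_stack_trace are assumed to be
    # available exactly as in the surrounding test module.
    import theano
    import theano.tensor as T

    x = T.fmatrix('x')
    f = theano.function([x], T.exp(x).sum(), mode=mode_with_gpu)
    assert _check_stack_trace(f)  # ops not exempted by _ops_to_check carry a trace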
@@ -146,6 +146,7 @@ from theano.gof import (utils, Op, view_roots,
                         EquilibriumOptimizer, Apply,
                         ReplacementDidntRemovedError)
 from theano.gof.params_type import ParamsType
+from theano.gof.opt import inherit_stack_trace
 from theano.printing import pprint, FunctionPrinter, debugprint
 from theano.compile.mode import optdb
 import theano.scalar

@@ -1625,6 +1626,7 @@ def local_dot_to_dot22(node):
         return
     if y.type.dtype in ['float16', 'float32', 'float64', 'complex64', 'complex128']:
+        with inherit_stack_trace(node.outputs):
             if x.ndim == 2 and y.ndim == 2:
                 # print "local_dot_to_dot22: MM"
                 return [_dot22(*node.inputs)]

@@ -1646,18 +1648,21 @@ def local_dot_to_dot22(node):
 @local_optimizer([gemm_no_inplace], inplace=True)
 def local_inplace_gemm(node):
     if node.op == gemm_no_inplace:
+        with inherit_stack_trace(node.outputs):
             return [gemm_inplace(*node.inputs)]

 @local_optimizer([gemv_no_inplace], inplace=True)
 def local_inplace_gemv(node):
     if node.op == gemv_no_inplace:
+        with inherit_stack_trace(node.outputs):
             return [gemv_inplace(*node.inputs)]

 @local_optimizer([ger], inplace=True)
 def local_inplace_ger(node):
     if node.op == ger:
+        with inherit_stack_trace(node.outputs):
             return [ger_destructive(*node.inputs)]

@@ -1666,6 +1671,7 @@ def local_gemm_to_gemv(node):
     """GEMM acting on row or column matrices -> GEMV."""
     if node.op == gemm_no_inplace:
         z, a, x, y, b = node.inputs
+        with inherit_stack_trace(node.outputs):
             if z.broadcastable == x.broadcastable == (True, False):
                 r = gemv_no_inplace(z.dimshuffle(1), a, y.T, x.dimshuffle(1), b)
                 return [r.dimshuffle('x', 0)]

@@ -1680,6 +1686,7 @@ def local_gemm_to_ger(node):
     if node.op == gemm_no_inplace:
         z, a, x, y, b = node.inputs
         if x.broadcastable[1] and y.broadcastable[0]:
+            with inherit_stack_trace(node.outputs):
                 # x and y are both vectors so this might qualifies for a GER
                 xv = x.dimshuffle(0)
                 yv = y.dimshuffle(1)

@@ -1708,6 +1715,7 @@ def local_gemm_to_ger(node):
 def local_dot22_to_ger_or_gemv(node):
     """dot22 computing an outer-product -> GER."""
     if node.op == _dot22:
+        with inherit_stack_trace(node.outputs):
             x, y = node.inputs
             xb = x.broadcastable
             yb = y.broadcastable
...