Commit ff9e2b38 authored by Tim Cooijmans, committed by Reyhane Askari

more stack trace copying

Parent: 39269c13
......@@ -89,13 +89,13 @@ def as_gpuarray_variable(x, context_name):
if x.context.ptr != ctx.ptr:
x = x.transfer(ctx)
x = with_stack_trace(x, gpuarray.asarray(x, context=ctx))
x = gpuarray.asarray(x, context=ctx)
bcast = [(s == 1) for s in x.shape]
return with_stack_trace(x, GpuArrayConstant(GpuArrayType(dtype=x.dtype,
broadcastable=bcast,
context_name=context_name),
x))
return GpuArrayConstant(GpuArrayType(dtype=x.dtype,
broadcastable=bcast,
context_name=context_name),
x)
def infer_context_name(*vars):
......
......@@ -7,6 +7,7 @@ from theano import Apply, Op
from theano.compile import optdb
from theano.gof import LocalOptGroup, ParamsType
from theano.scalar import bool as bool_t
from theano.gof.opt import inherit_stack_trace
from theano.tensor.basic import as_tensor_variable
from theano.tensor.opt import in2out
......@@ -1830,17 +1831,20 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
@inplace_allocempty(GpuGemv, 0)
def local_inplace_gpuagemv(node, inputs):
    """Replace a GpuGemv node by its in-place variant.

    The new outputs inherit the stack trace of the node being replaced,
    so debugging information survives the optimization.
    """
    with inherit_stack_trace(node.outputs):
        return [gpugemv_inplace(*inputs)]
@inplace_allocempty(GpuGemm, 0)
def local_inplace_gpuagemm(node, inputs):
    """Replace a GpuGemm node by its in-place variant.

    Stack traces of the replaced outputs are copied onto the new ones.
    """
    with inherit_stack_trace(node.outputs):
        return [gpugemm_inplace(*inputs)]
@inplace_allocempty(GpuGer, 0)
def local_inplace_gpuager(node, inputs):
    """Replace a GpuGer node by its in-place variant.

    Stack traces of the replaced outputs are copied onto the new ones.
    """
    with inherit_stack_trace(node.outputs):
        return [gpuger_inplace(*inputs)]
@inplace_allocempty(GpuGemmBatch, 0)
......
......@@ -18,6 +18,7 @@ from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp, ParamsType, EnumList
from theano.gof.cmodule import GCC_compiler
from theano.gof.type import CDataType, Generic
from theano.gof.opt import inherit_stack_trace
from theano.compile import optdb
from theano.compile.ops import shape_i, shape_i_op
from theano.tensor.nnet import LogSoftmax, SoftmaxGrad
......@@ -3132,12 +3133,13 @@ def local_abstractconv_cudnn(node):
ctx = infer_context_name(*node.inputs)
if not isinstance(node.inputs[0].type, GpuArrayType):
return
if node.op.unshared:
return None
if isinstance(node.op, AbstractConv2d):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
with inherit_stack_trace(node.outputs):
if node.op.unshared:
return None
if isinstance(node.op, AbstractConv2d):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs])
......@@ -3356,12 +3358,13 @@ def local_abstractconv_gw_cudnn(node):
ctx = infer_context_name(*node.inputs)
if not isinstance(node.inputs[0].type, GpuArrayType):
return
if node.op.unshared:
return None
if isinstance(node.op, AbstractConv2d_gradWeights):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d_gradWeights):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
with inherit_stack_trace(node.outputs):
if node.op.unshared:
return None
if isinstance(node.op, AbstractConv2d_gradWeights):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d_gradWeights):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
@local_optimizer([AbstractConv2d_gradInputs, AbstractConv3d_gradInputs])
......@@ -3369,28 +3372,31 @@ def local_abstractconv_gi_cudnn(node):
ctx = infer_context_name(*node.inputs)
if not isinstance(node.inputs[0].type, GpuArrayType):
return
if node.op.unshared:
return None
if isinstance(node.op, AbstractConv2d_gradInputs):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d_gradInputs):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
with inherit_stack_trace(node.outputs):
if node.op.unshared:
return None
if isinstance(node.op, AbstractConv2d_gradInputs):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d_gradInputs):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
@inplace_allocempty(GpuDnnConv, 2)
def local_dnn_conv_inplace(node, inputs):
    """Make GpuDnnConv work in-place on its output buffer (input 2).

    Rebuilds the op with ``inplace=True``, preserving the algorithm and
    group count, and copies the stack trace to the new outputs.
    """
    with inherit_stack_trace(node.outputs):
        return [GpuDnnConv(algo=node.op.algo, inplace=True,
                           num_groups=node.op.num_groups)(*inputs)]
@inplace_allocempty(GpuDnnConvGradW, 2)
def local_dnn_convgw_inplace(node, inputs):
    """Make GpuDnnConvGradW work in-place on its output buffer (input 2).

    Preserves algorithm and group count; copies the stack trace.
    """
    with inherit_stack_trace(node.outputs):
        return [GpuDnnConvGradW(algo=node.op.algo, inplace=True,
                                num_groups=node.op.num_groups)(*inputs)]
@inplace_allocempty(GpuDnnConvGradI, 2)
def local_dnn_convgi_inplace(node, inputs):
    """Make GpuDnnConvGradI work in-place on its output buffer (input 2).

    Preserves algorithm and group count; copies the stack trace.
    """
    with inherit_stack_trace(node.outputs):
        return [GpuDnnConvGradI(algo=node.op.algo, inplace=True,
                                num_groups=node.op.num_groups)(*inputs)]
optdb.register('local_dnna_conv_inplace',
tensor.opt.in2out(local_dnn_conv_inplace,
......@@ -3403,40 +3409,43 @@ optdb.register('local_dnna_conv_inplace',
@register_opt('cudnn')
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
def local_dnn_conv_alpha_merge(node, *inputs):
    """Merge scalar alpha/beta factors into the GpuDnnConv call itself.

    Stack traces are inherited by the rebuilt outputs.
    """
    with inherit_stack_trace(node.outputs):
        return [GpuDnnConv(algo=node.op.algo,
                           num_groups=node.op.num_groups)(*inputs)]
@register_opt('cudnn')
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
def local_dnn_convw_alpha_merge(node, *inputs):
    """Merge scalar alpha/beta factors into GpuDnnConvGradW.

    Stack traces are inherited by the rebuilt outputs.
    """
    with inherit_stack_trace(node.outputs):
        return [GpuDnnConvGradW(algo=node.op.algo,
                                num_groups=node.op.num_groups)(*inputs)]
@register_opt('cudnn')
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5)
def local_dnn_convi_alpha_merge(node, *inputs):
    """Merge scalar alpha/beta factors into GpuDnnConvGradI.

    Stack traces are inherited by the rebuilt outputs.
    """
    with inherit_stack_trace(node.outputs):
        return [GpuDnnConvGradI(algo=node.op.algo,
                                num_groups=node.op.num_groups)(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_conv_output_merge(node, *inputs):
    """Merge an addition with the output buffer into GpuDnnConv.

    The output buffer (input 2) is made contiguous, as cuDNN requires.
    Stack traces are inherited by the rebuilt outputs.
    """
    with inherit_stack_trace(node.outputs):
        inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
        return [GpuDnnConv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convw_output_merge(node, *inputs):
    """Merge an addition with the output buffer into GpuDnnConvGradW.

    The output buffer (input 2) is made contiguous, as cuDNN requires.
    Stack traces are inherited by the rebuilt outputs.
    """
    with inherit_stack_trace(node.outputs):
        inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
        return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convi_output_merge(node, *inputs):
    """Merge an addition with the output buffer into GpuDnnConvGradI.

    The output buffer (input 2) is made contiguous, as cuDNN requires.
    Stack traces are inherited by the rebuilt outputs.
    """
    with inherit_stack_trace(node.outputs):
        inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
        return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
def local_gpua_pool_dnn_alternative(op, ctx_name, inputs, outputs):
......
......@@ -15,7 +15,8 @@ from theano.compile.ops import shape_i
from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
LocalGroupDB,
SequenceDB, Optimizer, DB, toolbox, graph)
from theano.gof.opt import LocalMetaOptimizer, copy_stack_trace, with_stack_trace
from theano.gof.opt import (LocalMetaOptimizer, copy_stack_trace,
with_stack_trace, inherit_stack_trace)
from theano.ifelse import IfElse
from theano.misc.ordered_set import OrderedSet
......@@ -421,8 +422,6 @@ class GraphToGPU(Optimizer):
if isinstance(new_ops, theano.Op):
outputs = new_ops(*[mapping[i] for i in node.inputs], return_list=True)
for old_output, new_output in zip(node.outputs, outputs):
copy_stack_trace(old_output, new_output)
elif not new_ops:
newnode = node.clone_with_new_inputs([mapping.get(i) for i in node.inputs])
outputs = newnode.outputs
......@@ -431,6 +430,10 @@ class GraphToGPU(Optimizer):
elif isinstance(new_ops, theano.Variable):
outputs = [new_ops]
for old_output, new_output in zip(node.outputs, outputs):
copy_stack_trace(old_output, new_output)
new_output.tag.tracefrom = old_output
if new_ops:
node_created[lopt] += len(graph.ops([mapping[i] for i in node.inputs], outputs))
if any([getattr(old_o, 'dtype', None) != getattr(new_o, 'dtype', None)
......@@ -662,9 +665,8 @@ def local_gpualloc_memset_0(node):
inp.data.size == 1 and
(np.asarray(inp.data) == 0).all()):
new_op = GpuAlloc(node.op.context_name, memset_0=True)
new_output = new_op(*node.inputs)
copy_stack_trace(node.outputs[0], new_output)
return [new_output]
with inherit_stack_trace(node.outputs):
return new_op(*node.inputs, return_list=True)
# Don't register by default.
......@@ -673,10 +675,9 @@ def local_gpua_alloc_empty_to_zeros(node):
if isinstance(node.op, GpuAllocEmpty):
context_name = infer_context_name(*node.inputs)
z = np.asarray(0, dtype=node.outputs[0].dtype)
return [GpuAlloc(context_name)(as_gpuarray_variable(z, context_name),
*node.inputs)]
with inherit_stack_trace(node.outputs):
return [GpuAlloc(context_name)(as_gpuarray_variable(z, context_name),
*node.inputs)]
optdb.register('local_gpua_alloc_empty_to_zeros',
theano.tensor.opt.in2out(local_gpua_alloc_empty_to_zeros),
# After move to gpu and merge2, before inplace.
......@@ -1220,7 +1221,8 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
op.scalar_op, axis=op.axis,
dtype=odtype,
acc_dtype=adtype)
gvar = with_stack_trace(outputs, greduce(x))
with inherit_stack_trace(outputs):
gvar = greduce(x)
# We need to have the make node called, otherwise the mask can
# be None
if (op2 is GpuCAReduceCPY or
......@@ -1260,30 +1262,27 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
dtype=getattr(op, 'dtype', outputs[0].dtype),
acc_dtype=getattr(op, 'acc_dtype', None))
reshaped_x = with_stack_trace(
outputs, x.reshape(tensor.stack(new_in_shp)))
gpu_reshaped_x = with_stack_trace(
outputs, as_gpuarray_variable(reshaped_x, context_name))
gvar = with_stack_trace(outputs, greduce(gpu_reshaped_x))
# We need to have the make node called, otherwise the mask can
# be None
reshaped_gpu_inputs = [gpu_reshaped_x]
if greduce.supports_c_code(reshaped_gpu_inputs):
reduce_reshaped_x = with_stack_trace(
outputs, greduce(gpu_reshaped_x))
if reduce_reshaped_x.ndim != outputs[0].ndim:
out_shp = []
for i in range(x.ndim):
if i not in op.axis:
out_shp.append(shape_i(x, i))
unreshaped_reduce = with_stack_trace(
outputs, GpuReshape(len(out_shp))(
with inherit_stack_trace(outputs):
reshaped_x = x.reshape(tensor.stack(new_in_shp))
gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name)
# We need to have the make node called, otherwise the mask can
# be None
gvar = greduce(gpu_reshaped_x)
reshaped_gpu_inputs = [gpu_reshaped_x]
if greduce.supports_c_code(reshaped_gpu_inputs):
reduce_reshaped_x = greduce(gpu_reshaped_x)
if reduce_reshaped_x.ndim != outputs[0].ndim:
out_shp = []
for i in range(x.ndim):
if i not in op.axis:
out_shp.append(shape_i(x, i))
unreshaped_reduce = GpuReshape(len(out_shp))(
reduce_reshaped_x,
tensor.stack(out_shp)))
else:
unreshaped_reduce = reduce_reshaped_x
return [unreshaped_reduce]
tensor.stack(out_shp))
else:
unreshaped_reduce = reduce_reshaped_x
return [unreshaped_reduce]
@register_opt('fast_compile')
......@@ -1356,25 +1355,29 @@ def local_gpua_gemmbatch(op, context_name, inputs, outputs):
@register_opt()
@alpha_merge(GpuGemm, alpha_in=1, beta_in=4)
def local_gpua_gemm_alpha_merge(node, *inputs):
    """Merge scalar alpha/beta factors into a GpuGemm call.

    Stack traces are inherited by the rebuilt outputs.
    """
    with inherit_stack_trace(node.outputs):
        return [gpugemm_no_inplace(*inputs)]
@register_opt()
@output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0)
def local_gpua_gemm_output_merge(node, *inputs):
    """Merge an addition with the output (input 0) into a GpuGemm call.

    Stack traces are inherited by the rebuilt outputs.
    """
    with inherit_stack_trace(node.outputs):
        return [gpugemm_no_inplace(*inputs)]
@register_opt()
@alpha_merge(GpuGemmBatch, alpha_in=1, beta_in=4)
def local_gpua_gemmbatch_alpha_merge(node, *inputs):
    """Merge scalar alpha/beta factors into a GpuGemmBatch call.

    Stack traces are inherited by the rebuilt outputs.
    """
    with inherit_stack_trace(node.outputs):
        return [gpugemmbatch_no_inplace(*inputs)]
@register_opt()
@output_merge(GpuGemmBatch, alpha_in=1, beta_in=4, out_in=0)
def local_gpua_gemmbatch_output_merge(node, *inputs):
    """Merge an addition with the output (input 0) into a GpuGemmBatch call.

    Stack traces are inherited by the rebuilt outputs.
    """
    with inherit_stack_trace(node.outputs):
        return [gpugemmbatch_no_inplace(*inputs)]
@register_opt('fast_compile')
......@@ -2403,8 +2406,8 @@ def local_gpu_elemwise_careduce(node):
props = node.op._props_dict()
props["pre_scalar_op"] = scalar.basic.sqr
out = GpuCAReduceCuda(**props)(inp)
return with_stack_trace(
node.outputs, out)
with inherit_stack_trace(node.outputs):
return out
@local_optimizer(None)
......
......@@ -33,10 +33,20 @@ def _check_stack_trace(thing):
if not isinstance(op, theano.gof.Op):
op = op.op # assume node
return not isinstance(op, (theano.compile.ops.Shape_i,
theano.compile.ops.Shape,
theano.compile.ops.DeepCopyOp,
theano.tensor.opt.MakeVector,
theano.tensor.subtensor.Subtensor,
theano.tensor.elemwise.Elemwise,
theano.ifelse.IfElse,
GpuFromHost, HostFromGpu,
GpuElemwise))
return check_stack_trace(thing, ops_to_check=_ops_to_check)
GpuCAReduceCuda,
GpuElemwise,
theano.printing.Print,
PdbBreakpoint,
))
return check_stack_trace(thing, ops_to_check=_ops_to_check,
bug_print="ignore")
def test_local_assert():
x = theano.tensor.fmatrix()
......
......@@ -146,6 +146,7 @@ from theano.gof import (utils, Op, view_roots,
EquilibriumOptimizer, Apply,
ReplacementDidntRemovedError)
from theano.gof.params_type import ParamsType
from theano.gof.opt import inherit_stack_trace
from theano.printing import pprint, FunctionPrinter, debugprint
from theano.compile.mode import optdb
import theano.scalar
......@@ -1625,19 +1626,20 @@ def local_dot_to_dot22(node):
return
if y.type.dtype in ['float16', 'float32', 'float64', 'complex64', 'complex128']:
if x.ndim == 2 and y.ndim == 2:
# print "local_dot_to_dot22: MM"
return [_dot22(*node.inputs)]
if x.ndim == 2 and y.ndim == 1:
# print "local_dot_to_dot22: MV"
return [_dot22(x, y.dimshuffle(0, 'x')).dimshuffle(0)]
if x.ndim == 1 and y.ndim == 2:
# print "local_dot_to_dot22: VM"
return [_dot22(x.dimshuffle('x', 0), y).dimshuffle(1)]
if x.ndim == 1 and y.ndim == 1:
# print "local_dot_to_dot22: VV"
return [_dot22(x.dimshuffle('x', 0),
y.dimshuffle(0, 'x')).dimshuffle()]
with inherit_stack_trace(node.outputs):
if x.ndim == 2 and y.ndim == 2:
# print "local_dot_to_dot22: MM"
return [_dot22(*node.inputs)]
if x.ndim == 2 and y.ndim == 1:
# print "local_dot_to_dot22: MV"
return [_dot22(x, y.dimshuffle(0, 'x')).dimshuffle(0)]
if x.ndim == 1 and y.ndim == 2:
# print "local_dot_to_dot22: VM"
return [_dot22(x.dimshuffle('x', 0), y).dimshuffle(1)]
if x.ndim == 1 and y.ndim == 1:
# print "local_dot_to_dot22: VV"
return [_dot22(x.dimshuffle('x', 0),
y.dimshuffle(0, 'x')).dimshuffle()]
_logger.info('Not optimizing dot with inputs %s %s %s %s',
x, y, x.type, y.type)
......@@ -1646,19 +1648,22 @@ def local_dot_to_dot22(node):
@local_optimizer([gemm_no_inplace], inplace=True)
def local_inplace_gemm(node):
    """Replace gemm_no_inplace by the destructive gemm_inplace.

    Stack traces are copied to the replacement outputs. Returns None
    implicitly when the node does not match.
    """
    if node.op == gemm_no_inplace:
        with inherit_stack_trace(node.outputs):
            return [gemm_inplace(*node.inputs)]
@local_optimizer([gemv_no_inplace], inplace=True)
def local_inplace_gemv(node):
    """Replace gemv_no_inplace by the destructive gemv_inplace.

    Stack traces are copied to the replacement outputs. Returns None
    implicitly when the node does not match.
    """
    if node.op == gemv_no_inplace:
        with inherit_stack_trace(node.outputs):
            return [gemv_inplace(*node.inputs)]
@local_optimizer([ger], inplace=True)
def local_inplace_ger(node):
    """Replace ger by the destructive ger_destructive.

    Stack traces are copied to the replacement outputs. Returns None
    implicitly when the node does not match.
    """
    if node.op == ger:
        with inherit_stack_trace(node.outputs):
            return [ger_destructive(*node.inputs)]
@local_optimizer([gemm_no_inplace])
......@@ -1666,12 +1671,13 @@ def local_gemm_to_gemv(node):
"""GEMM acting on row or column matrices -> GEMV."""
if node.op == gemm_no_inplace:
z, a, x, y, b = node.inputs
if z.broadcastable == x.broadcastable == (True, False):
r = gemv_no_inplace(z.dimshuffle(1), a, y.T, x.dimshuffle(1), b)
return [r.dimshuffle('x', 0)]
if z.broadcastable == y.broadcastable == (False, True):
r = gemv_no_inplace(z.dimshuffle(0), a, x, y.dimshuffle(0), b)
return [r.dimshuffle(0, 'x')]
with inherit_stack_trace(node.outputs):
if z.broadcastable == x.broadcastable == (True, False):
r = gemv_no_inplace(z.dimshuffle(1), a, y.T, x.dimshuffle(1), b)
return [r.dimshuffle('x', 0)]
if z.broadcastable == y.broadcastable == (False, True):
r = gemv_no_inplace(z.dimshuffle(0), a, x, y.dimshuffle(0), b)
return [r.dimshuffle(0, 'x')]
@local_optimizer([gemm_no_inplace])
......@@ -1680,26 +1686,27 @@ def local_gemm_to_ger(node):
if node.op == gemm_no_inplace:
z, a, x, y, b = node.inputs
if x.broadcastable[1] and y.broadcastable[0]:
# x and y are both vectors so this might qualifies for a GER
xv = x.dimshuffle(0)
yv = y.dimshuffle(1)
try:
bval = T.get_scalar_constant_value(b)
except T.NotScalarConstantError:
# b isn't a constant, GEMM is doing useful pre-scaling
return
if bval == 1: # best case a natural GER
rval = ger(z, a, xv, yv)
return [rval]
elif bval == 0: # GER on zeros_like should be faster than GEMM
zeros = T.zeros([x.shape[0], y.shape[1]], x.dtype)
rval = ger(zeros, a, xv, yv)
return [rval]
else:
# if bval is another constant, then z is being usefully
# pre-scaled and GER isn't really the right tool for the job.
return
with inherit_stack_trace(node.outputs):
# x and y are both vectors so this might qualifies for a GER
xv = x.dimshuffle(0)
yv = y.dimshuffle(1)
try:
bval = T.get_scalar_constant_value(b)
except T.NotScalarConstantError:
# b isn't a constant, GEMM is doing useful pre-scaling
return
if bval == 1: # best case a natural GER
rval = ger(z, a, xv, yv)
return [rval]
elif bval == 0: # GER on zeros_like should be faster than GEMM
zeros = T.zeros([x.shape[0], y.shape[1]], x.dtype)
rval = ger(zeros, a, xv, yv)
return [rval]
else:
# if bval is another constant, then z is being usefully
# pre-scaled and GER isn't really the right tool for the job.
return
# TODO: delete this optimization when we have the proper dot->gemm->ger pipeline
......@@ -1708,37 +1715,38 @@ def local_gemm_to_ger(node):
def local_dot22_to_ger_or_gemv(node):
    """Rewrite a _dot22 computing an outer product or a matrix-vector
    product into GER / GEMV, which are cheaper BLAS calls.

    Which rewrite applies is decided from the broadcastable pattern of
    the two inputs; if none matches, the node is left alone (implicit
    None return). All new outputs inherit the original stack traces.
    """
    if node.op == _dot22:
        with inherit_stack_trace(node.outputs):
            x, y = node.inputs
            xb = x.broadcastable
            yb = y.broadcastable
            one = T.as_tensor_variable(np.asarray(1, dtype=x.dtype))
            zero = T.as_tensor_variable(np.asarray(0, dtype=x.dtype))
            if xb[1] and yb[0]:
                # x and y are both vectors so this might qualify for a GER
                xv = x.dimshuffle(0)
                yv = y.dimshuffle(1)
                zeros = T.zeros([x.shape[0], y.shape[1]], dtype=x.dtype)
                rval = ger(zeros, one, xv, yv)
                return [rval]
            if xb[0] and yb[1]:
                # x and y are both vectors so this qualifies for a sdot / ddot
                # TODO: Theano doesn't have a sdot, but gemv is better than _dot22
                xv = x.dimshuffle(1)
                zeros = T.AllocEmpty(x.dtype)(1)
                rval = gemv_no_inplace(zeros, one, y.T, xv, zero)
                return [rval.dimshuffle('x', 0)]
            if xb[0] and not yb[0] and not yb[1]:
                # x is vector, y is matrix so try gemv
                xv = x.dimshuffle(1)
                zeros = T.AllocEmpty(x.dtype)(y.shape[1])
                rval = gemv_no_inplace(zeros, one, y.T, xv, zero)
                return [rval.dimshuffle('x', 0)]
            if not xb[0] and not xb[1] and yb[1]:
                # x is matrix, y is vector, try gemv
                yv = y.dimshuffle(0)
                zeros = T.AllocEmpty(x.dtype)(x.shape[0])
                rval = gemv_no_inplace(zeros, one, x, yv, zero)
                return [rval.dimshuffle(0, 'x')]
#################################
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to comment