Commit 8f038cc6, authored by Pascal Lamblin

Merge pull request #3404 from kelvinxu/stack_trace

Add stack trace for nnet
...@@ -21,8 +21,8 @@ from theano import gof ...@@ -21,8 +21,8 @@ from theano import gof
from theano import scalar from theano import scalar
from theano.tensor import basic as tensor from theano.tensor import basic as tensor
from theano.tensor import subtensor from theano.tensor import subtensor
from theano.tensor import elemwise
from theano.tensor import opt from theano.tensor import opt
from theano.tensor.opt import copy_stack_trace
from theano.compile import optdb from theano.compile import optdb
from theano.gof import Apply from theano.gof import Apply
...@@ -31,6 +31,7 @@ from theano.gradient import DisconnectedType ...@@ -31,6 +31,7 @@ from theano.gradient import DisconnectedType
from theano.gradient import grad_not_implemented from theano.gradient import grad_not_implemented
from theano.tensor.type import values_eq_approx_remove_nan from theano.tensor.type import values_eq_approx_remove_nan
############ ############
# #
# TENSOR OPS # TENSOR OPS
...@@ -113,7 +114,8 @@ class SoftmaxWithBias(gof.Op): ...@@ -113,7 +114,8 @@ class SoftmaxWithBias(gof.Op):
# TODO: set error messages for failures in this code # TODO: set error messages for failures in this code
# TODO: use this to accept float32 and int32: node.inputs[0].type.dtype_specs()[1] # TODO: use this to accept float32 and int32:
# node.inputs[0].type.dtype_specs()[1]
init_decl = """ init_decl = """
npy_intp* Nx = PyArray_DIMS(%(x)s); npy_intp* Nx = PyArray_DIMS(%(x)s);
npy_intp Sx = 0; npy_intp Sx = 0;
...@@ -634,16 +636,19 @@ def local_softmax_with_bias(node): ...@@ -634,16 +636,19 @@ def local_softmax_with_bias(node):
# we're in business... # we're in business...
if len(vectors) > 1: if len(vectors) > 1:
vector_sum = tensor.add(*vectors) vector_sum = tensor.add(*vectors)
copy_stack_trace(x_in, vector_sum)
else: else:
vector_sum = vectors[0] vector_sum = vectors[0]
if len(non_vectors) > 1: if len(non_vectors) > 1:
non_vector_sum = tensor.add(*non_vectors) non_vector_sum = tensor.add(*non_vectors)
copy_stack_trace(x_in, non_vector_sum)
else: else:
non_vector_sum = non_vectors[0] non_vector_sum = non_vectors[0]
try: try:
sm_bias = softmax_with_bias(non_vector_sum, vector_sum) sm_bias = softmax_with_bias(non_vector_sum, vector_sum)
copy_stack_trace(node.outputs[0], sm_bias)
except Exception: except Exception:
# if our arguments have the wrong types, then # if our arguments have the wrong types, then
# forget about it # forget about it
...@@ -692,114 +697,6 @@ def softmax_simplifier(numerators, denominators): ...@@ -692,114 +697,6 @@ def softmax_simplifier(numerators, denominators):
return numerators, denominators return numerators, denominators
opt.local_mul_canonizer.add_simplifier(softmax_simplifier, 'softmax_simplifier') opt.local_mul_canonizer.add_simplifier(softmax_simplifier, 'softmax_simplifier')
# NOTE(review): this entire optimizer is disabled by `if 0:` -- it is dead
# code kept for reference only. The decorators below never execute, so the
# optimization is never registered.
if 0:
    @opt.register_specialize
    @gof.local_optimizer([tensor.add])
    def local_softmax_grad(node):
        '''dy*sm - DimShuffle{0,'x'}(sum{1}(dy*sm))*sm -> softmax_grad(dy,sm)

        Pattern-matching graph optimizer: recognize the explicit symbolic
        form of the softmax gradient (an `add` of `dy*sm` and the negated,
        broadcast row-sum of `dy*sm` times `sm`) and rewrite it into a
        single fused `softmax_grad(dy, sm)` node, re-adding any remaining
        terms of the sum.

        Returns a one-element list with the replacement variable, or None
        (implicitly) when the pattern does not match.
        '''
        # TODO what if the signs are changed?
        # TODO and if a scalar is distributed before each of the terms?
        # TODO 'dy' could also be a product
        if node.op == tensor.add and node.out.ndim == 2:
            add_inputs = node.inputs
            # Trying to locate two nodes in the sum:
            # dy * sm, prod_term
            # - DimShuffle{0,'x'}(sum{1}(dy*sm))*sm
            prod_term = None
            other_terms = []
            # First, prod_term: the first `mul` of exactly two 2-d inputs
            # found among the addends. Everything else goes to other_terms.
            for add_in in add_inputs:
                if (add_in.owner and
                        add_in.owner.op == tensor.mul and
                        prod_term is None):
                    mul_inputs = add_in.owner.inputs
                    if (len(mul_inputs) == 2 and
                            all([mul_in.ndim == 2 for mul_in in mul_inputs])):
                        prod_term = add_in
                    else:
                        other_terms.append(add_in)
                else:
                    other_terms.append(add_in)
            if prod_term is None:
                # print 'no prod_term'
                return
            assert len(other_terms) == len(add_inputs) - 1
            # Second pass: among the remaining addends, find the term
            # -(DimShuffle{0,'x'}(Sum{axis=1}(prod_term)) * sm).
            ds_term = None
            rest = []
            for add_in in other_terms:
                if add_in.owner and add_in.owner.op == tensor.neg:
                    neg_input = add_in.owner.inputs[0]
                    if neg_input.owner and neg_input.owner.op == tensor.mul:
                        mul2_inputs = neg_input.owner.inputs
                        if len(mul2_inputs) != 2:
                            rest.append(add_in)
                            # print 'len(mul2_inputs) =', len(mul2_inputs)
                            continue
                        # Try and find DimShuffle(Sum)
                        maybe_ds = None
                        for i, mul2_in in enumerate(mul2_inputs):
                            if mul2_in.owner and isinstance(mul2_in.owner.op,
                                                            elemwise.DimShuffle):
                                maybe_ds = mul2_in
                                maybe_sm = mul2_inputs[1 - i]  # The other one
                        # If maybe_ds is None, short-circuit keeps the
                        # possibly-unbound maybe_sm from being evaluated.
                        if (maybe_ds is None or
                                maybe_ds.ndim != 2 or
                                maybe_sm.ndim != 2):
                            rest.append(add_in)
                            # print 'maybe_ds =', maybe_ds
                            # if maybe_ds:
                            #     print 'maybe_ds.ndim =', maybe_ds.ndim, ', maybe_sm.ndim =', maybe_sm.ndim
                            continue
                        # maybe_sm must be one of prod_term's two factors;
                        # the other factor is then taken to be dy.
                        if maybe_sm is mul_inputs[0]:
                            maybe_dy = mul_inputs[1]
                        elif maybe_sm is mul_inputs[1]:
                            maybe_dy = mul_inputs[0]
                        else:
                            rest.append(add_in)
                            # print 'maybe_sm, maybe_dy =', maybe_sm, maybe_dy
                            # print 'mul_inputs =', mul_inputs
                            continue
                        ds_order = maybe_ds.owner.op.new_order
                        ds_input = maybe_ds.owner.inputs[0]
                        axis = None
                        if ds_input.owner and isinstance(ds_input.owner.op,
                                                         elemwise.Sum):
                            axis = ds_input.owner.op.axis
                            sum_input = ds_input.owner.inputs[0]
                        # If axis stayed None, `axis != (1,)` short-circuits
                        # before the possibly-unbound sum_input is read.
                        if ((ds_order != (0, 'x')) or
                                (axis != (1,)) or
                                (sum_input is not prod_term)):
                            rest.append(add_in)
                            # print 'ds_order =', ds_order
                            # print 'axis =', axis
                            # if axis is not None:
                            #     print 'sum_input =', sum_input, ', prod_term =', prod_term
                            # else:
                            #     print 'ds_input.owner =', ds_input.owner
                            # print 'add_in =', add_in
                            continue
                        # NOTE(review): ds_term is recorded here, but
                        # maybe_dy/maybe_sm are plain loop locals -- a later
                        # iteration that matches partway and then falls into
                        # `rest` can rebind them before the final return
                        # below uses them. Latent bug; moot while `if 0:`.
                        ds_term = add_in
                    else:
                        # print 'neg_input.owner =', neg_input.owner
                        rest.append(add_in)
                else:
                    # print 'add_in.owner =', add_in.owner
                    rest.append(add_in)
            if ds_term is None:
                # print 'no ds_term'
                return
            # Full pattern matched: replace with the fused gradient op,
            # adding back any leftover addends that were not part of it.
            if len(rest) == 0:
                return [softmax_grad(maybe_dy, maybe_sm)]
            else:
                return [tensor.add(softmax_grad(maybe_dy, maybe_sm), *rest)]
class CrossentropySoftmaxArgmax1HotWithBias(gof.Op): class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
""" """
...@@ -1457,6 +1354,7 @@ def local_softmax_grad_to_crossentropy_with_softmax_grad(node): ...@@ -1457,6 +1354,7 @@ def local_softmax_grad_to_crossentropy_with_softmax_grad(node):
g_nll, coding_dist, true_one_of_n = g_coding_dist.owner.inputs g_nll, coding_dist, true_one_of_n = g_coding_dist.owner.inputs
dx = crossentropy_softmax_1hot_with_bias_dx(g_nll, coding_dist, dx = crossentropy_softmax_1hot_with_bias_dx(g_nll, coding_dist,
true_one_of_n) true_one_of_n)
copy_stack_trace(node.outputs[0], dx)
return [dx] return [dx]
...@@ -1485,13 +1383,18 @@ def local_argmax_pushdown(node): ...@@ -1485,13 +1383,18 @@ def local_argmax_pushdown(node):
if x.owner and x.owner.op in (softmax_op, softplus, tensor.exp, if x.owner and x.owner.op in (softmax_op, softplus, tensor.exp,
tensor.log, tensor.tanh, sigmoid): tensor.log, tensor.tanh, sigmoid):
pre_x, = x.owner.inputs pre_x, = x.owner.inputs
return tensor._max_and_argmax(pre_x, axis) ret = tensor._max_and_argmax(pre_x, axis)
copy_stack_trace(x_max, ret)
return ret
if x.owner and x.owner.op == softmax_with_bias: if x.owner and x.owner.op == softmax_with_bias:
pre_x, pre_bias = x.owner.inputs pre_x, pre_bias = x.owner.inputs
return tensor._max_and_argmax(pre_x + ret = tensor._max_and_argmax(pre_x +
tensor.DimShuffle( tensor.DimShuffle(
pre_bias.broadcastable, pre_bias.broadcastable,
('x', 0))(pre_bias), axis) ('x', 0))(pre_bias), axis)
# copy both stack traces
copy_stack_trace(x_max, ret)
return ret
# Utility function used by the two next optimizations # Utility function used by the two next optimizations
...@@ -1585,9 +1488,12 @@ def local_advanced_indexing_crossentropy_onehot(node): ...@@ -1585,9 +1488,12 @@ def local_advanced_indexing_crossentropy_onehot(node):
# Check that rows == arange(labels.shape[0]) # Check that rows == arange(labels.shape[0])
if _check_rows_is_arange_len_labels(rows, labels): if _check_rows_is_arange_len_labels(rows, labels):
if labels.ndim == 1 and x_var.ndim == 2: if labels.ndim == 1 and x_var.ndim == 2:
return [-crossentropy_softmax_argmax_1hot_with_bias(x_var, minus_ret = crossentropy_softmax_argmax_1hot_with_bias(x_var,
b_var, b_var,
labels)[0]] labels)[0]
ret = -minus_ret
copy_stack_trace(node.outputs[0], [minus_ret, ret])
return [ret]
@opt.register_specialize('fast_compile_gpu') @opt.register_specialize('fast_compile_gpu')
...@@ -1809,7 +1715,11 @@ def local_advanced_indexing_crossentropy_onehot_grad(node): ...@@ -1809,7 +1715,11 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
# Dimension check before substitution # Dimension check before substitution
if labels.ndim == 1 and x_var.ndim == 2: if labels.ndim == 1 and x_var.ndim == 2:
return [crossentropy_softmax_1hot_with_bias_dx(out_grad, sm, labels)] ret = crossentropy_softmax_1hot_with_bias_dx(out_grad, sm, labels)
# The stack trace is not added to output_grad, sm and labels at
# the moment but may need to be added at a future point
copy_stack_trace(node.outputs[0], ret)
return [ret]
else: else:
return return
...@@ -1825,6 +1735,7 @@ def graph_merge_softmax_with_crossentropy_softmax(node): ...@@ -1825,6 +1735,7 @@ def graph_merge_softmax_with_crossentropy_softmax(node):
if big_client in [b_client[0] for b_client in b.clients]: if big_client in [b_client[0] for b_client in b.clients]:
xx, bb, ll = big_client.inputs xx, bb, ll = big_client.inputs
mergeable_client = big_client.op(x, b, ll) mergeable_client = big_client.op(x, b, ll)
copy_stack_trace(node.outputs[0], mergeable_client[1])
return [mergeable_client[1]] return [mergeable_client[1]]
...@@ -1885,7 +1796,9 @@ def local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc(node): ...@@ -1885,7 +1796,9 @@ def local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc(node):
msg = '`sm` and `dy` do not have the same shape.' msg = '`sm` and `dy` do not have the same shape.'
dz = opt.Assert(msg)(dz, cond) dz = opt.Assert(msg)(dz, cond)
return [node.op(dz, sm, y_idx)] ret = node.op(dz, sm, y_idx)
copy_stack_trace(node.outputs[0], ret)
return [ret]
def binary_crossentropy(output, target): def binary_crossentropy(output, target):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论