提交 83ea3e8f authored 作者: Harm de Vries's avatar Harm de Vries

Gradient of Softmaxgrad

上级 b22ae136
......@@ -77,11 +77,16 @@ class SoftmaxWithBias(gof.Op):
if b.shape[0] != x.shape[1]:
raise ValueError('b must have same number of columns as x')
sm = numpy.zeros_like(x)
for i in xrange(sm.shape[0]):
row = x[i] + b
sm[i] = numpy.exp(row - numpy.max(row))
sm[i] *= 1.0 / numpy.sum(sm[i])
# sm = numpy.zeros_like(x)
# for i in xrange(sm.shape[0]):
# row = x[i] + b
# sm[i] = numpy.exp(row - numpy.max(row))
# sm[i] *= 1.0 / numpy.sum(sm[i])
# output_storage[0][0] = sm
x_plus_b = x + b[None, :]
e_x = numpy.exp(x_plus_b - x_plus_b.max(axis=1)[:, None])
sm = e_x / e_x.sum(axis=1)[:, None]
output_storage[0][0] = sm
def grad(self, inp, grads):
......@@ -303,8 +308,17 @@ class SoftmaxGrad(gof.Op):
dx[i] = dy_times_sm_i - sum(dy_times_sm_i) * sm[i]
output_storage[0][0] = dx
def grad(self, *args):
raise NotImplementedError()
def grad(self, inp, grads):
    """Second-order gradient: differentiate SoftmaxGrad's output with
    respect to its two inputs.

    SoftmaxGrad(dy, sm) computes, row-wise: dx = (dy - sum(dy*sm)) * sm.
    Given `g`, the gradient flowing back onto dx, this returns
    (d cost/d dy, d cost/d sm).
    """
    dy, sm = inp
    g, = grads

    # tmp = g - sum(g*sm) per row; the dimshuffle re-adds the reduced
    # column axis as broadcastable so the row sums broadcast over columns.
    tmp = g + tensor.neg(tensor.sum(g*sm, axis=1).dimshuffle((0, 'x')))
    # d/d(dy): sm * (g - sum(g*sm))
    g_dy = tmp * sm

    # tmp2 = row-wise sum(dy*sm), broadcast over columns.
    tmp2 = tensor.sum(dy*sm, axis=1).dimshuffle((0, 'x'))
    # d/d(sm): dy * (g - sum(g*sm)) - g * sum(dy*sm)
    g_sm = tmp*dy - g *tmp2
    return g_dy, g_sm
def infer_shape(self, node, shape):
    """The output gradient has the same shape as the softmax input."""
    dy_shape, sm_shape = shape
    return [sm_shape]
......@@ -573,9 +587,7 @@ def softmax_graph(c):
return tensor.exp(c) / tensor.exp(c).sum(axis=-1, keepdims=True)
def softmax(c):
if c.ndim == 1:
c = tensor.shape_padleft(c, n_ones=1)
return softmax_graph(c)
return softmax_op(c)
@opt.register_specialize('fast_compile_gpu')
@gof.local_optimizer([softmax_op])
......@@ -733,7 +745,7 @@ if 0:
rest.append(add_in)
# print 'maybe_ds =', maybe_ds
# if maybe_ds:
# print 'maybe_ds.ndim =', maybe_ds.ndim, ', maybe_sm.ndim =', maybe_sm.ndim
# print 'maybe_ds.ndim =', maybe_ds.ndim, ', maybe_sm.ndim =', maybe_sm.ndim
continue
if maybe_sm is mul_inputs[0]:
......
......@@ -1011,7 +1011,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
try:
g = theano.function([x, b, y], T.grad(expr, x), mode=mode)
finally:
config.warn.sum_div_dimshuffle_bug = backup
config.warn.sum_div_dimshuffle_bug = backup
if verbose:
printing.debugprint(g)
......@@ -1026,7 +1026,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
theano.printing.debugprint(g)
raise
def test_scale_cost(self):
def test_crossentropy_softmax_1hot_with_bias_dxcale_cost(self):
# TODO: add the optimization in FAST_COMPILE?
# In the mean time, run it as 'FAST_RUN' instead
mode = theano.compile.mode.get_default_mode()
......@@ -1071,25 +1071,25 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
# Cases to test
expressions = [
a * T.sum(-T.log(softmax_graph(x)[T.arange(y.shape[0]), y])),
-a * T.sum(T.log(softmax_graph(x)[T.arange(y.shape[0]), y])),
a * (-T.sum(T.log(softmax_graph(x)[T.arange(y.shape[0]), y]))),
a * T.sum(T.log(softmax_graph(x)[T.arange(y.shape[0]), y])),
a * T.sum(-T.log(softmax_graph(x))[T.arange(y.shape[0]), y]),
-a * T.sum(T.log(softmax_graph(x))[T.arange(y.shape[0]), y]),
a * (-T.sum(T.log(softmax_graph(x))[T.arange(y.shape[0]), y])),
a * T.sum(T.log(softmax_graph(x))[T.arange(y.shape[0]), y]),
a * T.mean(-T.log(softmax_graph(x)[T.arange(y.shape[0]), y])),
-a * T.mean(T.log(softmax_graph(x)[T.arange(y.shape[0]), y])),
a * (-T.mean(T.log(softmax_graph(x)[T.arange(y.shape[0]), y]))),
a * T.mean(T.log(softmax_graph(x)[T.arange(y.shape[0]), y])),
a * T.mean(-T.log(softmax_graph(x))[T.arange(y.shape[0]), y]),
-a * T.mean(T.log(softmax_graph(x))[T.arange(y.shape[0]), y]),
a * (-T.mean(T.log(softmax_graph(x))[T.arange(y.shape[0]), y])),
a * T.mean(T.log(softmax_graph(x))[T.arange(y.shape[0]), y]),
a * T.sum(-T.log(softmax(x)[T.arange(y.shape[0]), y])),
-a * T.sum(T.log(softmax(x)[T.arange(y.shape[0]), y])),
a * (-T.sum(T.log(softmax(x)[T.arange(y.shape[0]), y]))),
a * T.sum(T.log(softmax(x)[T.arange(y.shape[0]), y])),
a * T.sum(-T.log(softmax(x))[T.arange(y.shape[0]), y]),
-a * T.sum(T.log(softmax(x))[T.arange(y.shape[0]), y]),
a * (-T.sum(T.log(softmax(x))[T.arange(y.shape[0]), y])),
a * T.sum(T.log(softmax(x))[T.arange(y.shape[0]), y]),
a * T.mean(-T.log(softmax(x)[T.arange(y.shape[0]), y])),
-a * T.mean(T.log(softmax(x)[T.arange(y.shape[0]), y])),
a * (-T.mean(T.log(softmax(x)[T.arange(y.shape[0]), y]))),
a * T.mean(T.log(softmax(x)[T.arange(y.shape[0]), y])),
a * T.mean(-T.log(softmax(x))[T.arange(y.shape[0]), y]),
-a * T.mean(T.log(softmax(x))[T.arange(y.shape[0]), y]),
a * (-T.mean(T.log(softmax(x))[T.arange(y.shape[0]), y])),
a * T.mean(T.log(softmax(x))[T.arange(y.shape[0]), y]),
]
for expr in expressions:
......@@ -1130,7 +1130,7 @@ def test_argmax_pushdown():
# test that the max_and_argmax is pushed down if the max is not used
out = tensor.max_and_argmax(
softmax_graph(tensor.exp(tensor.tanh(sigmoid(x)))),
softmax(tensor.exp(tensor.tanh(sigmoid(x)))),
axis=-1)[1]
fgraph = gof.FunctionGraph(
[x],
......@@ -1147,7 +1147,7 @@ def test_argmax_pushdown():
x = tensor.matrix()
# test that the max_and_argmax is not pushed down if the max is used
out = tensor.max_and_argmax(
softmax_graph(tensor.exp(tensor.tanh(sigmoid(x)))),
softmax(tensor.exp(tensor.tanh(sigmoid(x)))),
axis=-1)[0]
fgraph = gof.FunctionGraph(
[x],
......@@ -1157,7 +1157,7 @@ def test_argmax_pushdown():
config.warn.argmax_pushdown_bug = False
try:
theano.compile.mode.optdb.query(
theano.compile.mode.OPT_FAST_RUN).optimize(fgraph)
theano.compile.mode.OPT_FAST_RUN).optimize(fgraph)
finally:
config.warn.argmax_pushdown_bug = backup
......@@ -1236,7 +1236,7 @@ def test_asymptotic_32():
x2 = tensor.dvector()
y = tensor.lvector()
c = categorical_crossentropy(softmax_graph(x + x2), y)
c = categorical_crossentropy(softmax(x + x2), y)
f = theano.function([x, y, x2], [c.sum(),
tensor.grad(c.sum(), x)], mode='FAST_RUN')
if 0:
......@@ -1374,23 +1374,34 @@ class Test_softmax_opt:
# REPEAT 3 CASES in presence of log(softmax) with the advanced indexing
# etc.
def test_softmax_graph():
rng = numpy.random.RandomState(utt.fetch_seed())
x = theano.shared(rng.normal(size=(3, 4)))
def f(inputs):
y = softmax_graph(x)
z = (y**2).mean()
return theano.grad(z, x, known_grads={y: inputs})
utt.verify_grad(f, [numpy.random.rand(3, 4)])
return theano.grad(None, x, known_grads={y: inputs})
utt.verify_grad(f, [rng.rand(3, 4)])
def test_grad_softmax_grad():
    """Exercise SoftmaxGrad.grad by feeding a known gradient through
    theano.grad's known_grads mechanism and running verify_grad on it."""
    seed = utt.fetch_seed()
    rng = numpy.random.RandomState(seed)
    x = theano.shared(rng.normal(size=(3, 4)))

    def cost_fn(inputs):
        # `inputs` becomes the incoming gradient on the softmax output,
        # so differentiating w.r.t. x goes through SoftmaxGrad's grad.
        y = softmax_op(x)
        return theano.grad(None, x, known_grads={y: inputs})

    utt.verify_grad(cost_fn, [rng.rand(3, 4)])
def test_stabilize_log_softmax():
mode = theano.compile.mode.get_default_mode()
mode = mode.including('local_log_softmax', 'specialize')
x = matrix()
y = theano.tensor.nnet.softmax_graph(x)
y = softmax(x)
z = theano.tensor.log(y)
f = theano.function([x], z, mode=mode)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论