提交 83ea3e8f 作者: Harm de Vries

Gradient of SoftmaxGrad

上级 b22ae136
...@@ -77,11 +77,16 @@ class SoftmaxWithBias(gof.Op): ...@@ -77,11 +77,16 @@ class SoftmaxWithBias(gof.Op):
if b.shape[0] != x.shape[1]: if b.shape[0] != x.shape[1]:
raise ValueError('b must have same number of columns as x') raise ValueError('b must have same number of columns as x')
sm = numpy.zeros_like(x) # sm = numpy.zeros_like(x)
for i in xrange(sm.shape[0]): # for i in xrange(sm.shape[0]):
row = x[i] + b # row = x[i] + b
sm[i] = numpy.exp(row - numpy.max(row)) # sm[i] = numpy.exp(row - numpy.max(row))
sm[i] *= 1.0 / numpy.sum(sm[i]) # sm[i] *= 1.0 / numpy.sum(sm[i])
# output_storage[0][0] = sm
x_plus_b = x + b[None, :]
e_x = numpy.exp(x_plus_b - x_plus_b.max(axis=1)[:, None])
sm = e_x / e_x.sum(axis=1)[:, None]
output_storage[0][0] = sm output_storage[0][0] = sm
def grad(self, inp, grads): def grad(self, inp, grads):
...@@ -303,8 +308,17 @@ class SoftmaxGrad(gof.Op): ...@@ -303,8 +308,17 @@ class SoftmaxGrad(gof.Op):
dx[i] = dy_times_sm_i - sum(dy_times_sm_i) * sm[i] dx[i] = dy_times_sm_i - sum(dy_times_sm_i) * sm[i]
output_storage[0][0] = dx output_storage[0][0] = dx
def grad(self, *args): def grad(self, inp, grads):
raise NotImplementedError() dy, sm = inp
g, = grads
tmp = g + tensor.neg(tensor.sum(g*sm, axis=1).dimshuffle((0, 'x')))
g_dy = tmp * sm
tmp2 = tensor.sum(dy*sm, axis=1).dimshuffle((0, 'x'))
g_sm = tmp*dy - g *tmp2
return g_dy, g_sm
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return [shape[1]] return [shape[1]]
...@@ -573,9 +587,7 @@ def softmax_graph(c): ...@@ -573,9 +587,7 @@ def softmax_graph(c):
return tensor.exp(c) / tensor.exp(c).sum(axis=-1, keepdims=True) return tensor.exp(c) / tensor.exp(c).sum(axis=-1, keepdims=True)
def softmax(c): def softmax(c):
if c.ndim == 1: return softmax_op(c)
c = tensor.shape_padleft(c, n_ones=1)
return softmax_graph(c)
@opt.register_specialize('fast_compile_gpu') @opt.register_specialize('fast_compile_gpu')
@gof.local_optimizer([softmax_op]) @gof.local_optimizer([softmax_op])
...@@ -733,7 +745,7 @@ if 0: ...@@ -733,7 +745,7 @@ if 0:
rest.append(add_in) rest.append(add_in)
# print 'maybe_ds =', maybe_ds # print 'maybe_ds =', maybe_ds
# if maybe_ds: # if maybe_ds:
#I will make a plot with the average over many realizations. # print 'maybe_ds.ndim =', maybe_ds.ndim, ', maybe_sm.ndim =', maybe_sm.ndim # print 'maybe_ds.ndim =', maybe_ds.ndim, ', maybe_sm.ndim =', maybe_sm.ndim
continue continue
if maybe_sm is mul_inputs[0]: if maybe_sm is mul_inputs[0]:
......
...@@ -1011,7 +1011,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester): ...@@ -1011,7 +1011,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
try: try:
g = theano.function([x, b, y], T.grad(expr, x), mode=mode) g = theano.function([x, b, y], T.grad(expr, x), mode=mode)
finally: finally:
config.warn.sum_div_dimshuffle_bug = backup config.warn.sum_div_dimshuffle_bug = backup
if verbose: if verbose:
printing.debugprint(g) printing.debugprint(g)
...@@ -1026,7 +1026,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester): ...@@ -1026,7 +1026,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
theano.printing.debugprint(g) theano.printing.debugprint(g)
raise raise
def test_scale_cost(self): def test_scale_cost(self):
# TODO: add the optimization in FAST_COMPILE? # TODO: add the optimization in FAST_COMPILE?
# In the mean time, run it as 'FAST_RUN' instead # In the mean time, run it as 'FAST_RUN' instead
mode = theano.compile.mode.get_default_mode() mode = theano.compile.mode.get_default_mode()
...@@ -1071,25 +1071,25 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester): ...@@ -1071,25 +1071,25 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
# Cases to test # Cases to test
expressions = [ expressions = [
a * T.sum(-T.log(softmax_graph(x)[T.arange(y.shape[0]), y])), a * T.sum(-T.log(softmax(x)[T.arange(y.shape[0]), y])),
-a * T.sum(T.log(softmax_graph(x)[T.arange(y.shape[0]), y])), -a * T.sum(T.log(softmax(x)[T.arange(y.shape[0]), y])),
a * (-T.sum(T.log(softmax_graph(x)[T.arange(y.shape[0]), y]))), a * (-T.sum(T.log(softmax(x)[T.arange(y.shape[0]), y]))),
a * T.sum(T.log(softmax_graph(x)[T.arange(y.shape[0]), y])), a * T.sum(T.log(softmax(x)[T.arange(y.shape[0]), y])),
a * T.sum(-T.log(softmax_graph(x))[T.arange(y.shape[0]), y]), a * T.sum(-T.log(softmax(x))[T.arange(y.shape[0]), y]),
-a * T.sum(T.log(softmax_graph(x))[T.arange(y.shape[0]), y]), -a * T.sum(T.log(softmax(x))[T.arange(y.shape[0]), y]),
a * (-T.sum(T.log(softmax_graph(x))[T.arange(y.shape[0]), y])), a * (-T.sum(T.log(softmax(x))[T.arange(y.shape[0]), y])),
a * T.sum(T.log(softmax_graph(x))[T.arange(y.shape[0]), y]), a * T.sum(T.log(softmax(x))[T.arange(y.shape[0]), y]),
a * T.mean(-T.log(softmax_graph(x)[T.arange(y.shape[0]), y])), a * T.mean(-T.log(softmax(x)[T.arange(y.shape[0]), y])),
-a * T.mean(T.log(softmax_graph(x)[T.arange(y.shape[0]), y])), -a * T.mean(T.log(softmax(x)[T.arange(y.shape[0]), y])),
a * (-T.mean(T.log(softmax_graph(x)[T.arange(y.shape[0]), y]))), a * (-T.mean(T.log(softmax(x)[T.arange(y.shape[0]), y]))),
a * T.mean(T.log(softmax_graph(x)[T.arange(y.shape[0]), y])), a * T.mean(T.log(softmax(x)[T.arange(y.shape[0]), y])),
a * T.mean(-T.log(softmax_graph(x))[T.arange(y.shape[0]), y]), a * T.mean(-T.log(softmax(x))[T.arange(y.shape[0]), y]),
-a * T.mean(T.log(softmax_graph(x))[T.arange(y.shape[0]), y]), -a * T.mean(T.log(softmax(x))[T.arange(y.shape[0]), y]),
a * (-T.mean(T.log(softmax_graph(x))[T.arange(y.shape[0]), y])), a * (-T.mean(T.log(softmax(x))[T.arange(y.shape[0]), y])),
a * T.mean(T.log(softmax_graph(x))[T.arange(y.shape[0]), y]), a * T.mean(T.log(softmax(x))[T.arange(y.shape[0]), y]),
] ]
for expr in expressions: for expr in expressions:
...@@ -1130,7 +1130,7 @@ def test_argmax_pushdown(): ...@@ -1130,7 +1130,7 @@ def test_argmax_pushdown():
# test that the max_and_argmax is pushed down if the max is not used # test that the max_and_argmax is pushed down if the max is not used
out = tensor.max_and_argmax( out = tensor.max_and_argmax(
softmax_graph(tensor.exp(tensor.tanh(sigmoid(x)))), softmax(tensor.exp(tensor.tanh(sigmoid(x)))),
axis=-1)[1] axis=-1)[1]
fgraph = gof.FunctionGraph( fgraph = gof.FunctionGraph(
[x], [x],
...@@ -1147,7 +1147,7 @@ def test_argmax_pushdown(): ...@@ -1147,7 +1147,7 @@ def test_argmax_pushdown():
x = tensor.matrix() x = tensor.matrix()
# test that the max_and_argmax is not pushed down if the max is used # test that the max_and_argmax is not pushed down if the max is used
out = tensor.max_and_argmax( out = tensor.max_and_argmax(
softmax_graph(tensor.exp(tensor.tanh(sigmoid(x)))), softmax(tensor.exp(tensor.tanh(sigmoid(x)))),
axis=-1)[0] axis=-1)[0]
fgraph = gof.FunctionGraph( fgraph = gof.FunctionGraph(
[x], [x],
...@@ -1157,7 +1157,7 @@ def test_argmax_pushdown(): ...@@ -1157,7 +1157,7 @@ def test_argmax_pushdown():
config.warn.argmax_pushdown_bug = False config.warn.argmax_pushdown_bug = False
try: try:
theano.compile.mode.optdb.query( theano.compile.mode.optdb.query(
theano.compile.mode.OPT_FAST_RUN).optimize(fgraph) theano.compile.mode.OPT_FAST_RUN).optimize(fgraph)
finally: finally:
config.warn.argmax_pushdown_bug = backup config.warn.argmax_pushdown_bug = backup
...@@ -1236,7 +1236,7 @@ def test_asymptotic_32(): ...@@ -1236,7 +1236,7 @@ def test_asymptotic_32():
x2 = tensor.dvector() x2 = tensor.dvector()
y = tensor.lvector() y = tensor.lvector()
c = categorical_crossentropy(softmax_graph(x + x2), y) c = categorical_crossentropy(softmax(x + x2), y)
f = theano.function([x, y, x2], [c.sum(), f = theano.function([x, y, x2], [c.sum(),
tensor.grad(c.sum(), x)], mode='FAST_RUN') tensor.grad(c.sum(), x)], mode='FAST_RUN')
if 0: if 0:
...@@ -1374,23 +1374,34 @@ class Test_softmax_opt: ...@@ -1374,23 +1374,34 @@ class Test_softmax_opt:
# REPEAT 3 CASES in presence of log(softmax) with the advanced indexing # REPEAT 3 CASES in presence of log(softmax) with the advanced indexing
# etc. # etc.
def test_softmax_graph(): def test_softmax_graph():
rng = numpy.random.RandomState(utt.fetch_seed()) rng = numpy.random.RandomState(utt.fetch_seed())
x = theano.shared(rng.normal(size=(3, 4))) x = theano.shared(rng.normal(size=(3, 4)))
def f(inputs): def f(inputs):
y = softmax_graph(x) y = softmax_graph(x)
z = (y**2).mean() return theano.grad(None, x, known_grads={y: inputs})
return theano.grad(z, x, known_grads={y: inputs})
utt.verify_grad(f, [rng.rand(3, 4)])
utt.verify_grad(f, [numpy.random.rand(3, 4)])
def test_grad_softmax_grad():
rng = numpy.random.RandomState(utt.fetch_seed())
x = theano.shared(rng.normal(size=(3, 4)))
def f(inputs):
y = softmax_op(x)
return theano.grad(None, x, known_grads={y: inputs})
utt.verify_grad(f, [rng.rand(3, 4)])
def test_stabilize_log_softmax(): def test_stabilize_log_softmax():
mode = theano.compile.mode.get_default_mode() mode = theano.compile.mode.get_default_mode()
mode = mode.including('local_log_softmax', 'specialize') mode = mode.including('local_log_softmax', 'specialize')
x = matrix() x = matrix()
y = theano.tensor.nnet.softmax_graph(x) y = softmax(x)
z = theano.tensor.log(y) z = theano.tensor.log(y)
f = theano.function([x], z, mode=mode) f = theano.function([x], z, mode=mode)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论