Commit 3dcba54d authored by abergeron

Merge pull request #3363 from fvisin/logsoftmax

LogSoftmax
from .nnet import ( from .nnet import (
CrossentropyCategorical1Hot, CrossentropyCategorical1HotGrad, CrossentropyCategorical1Hot, CrossentropyCategorical1HotGrad,
CrossentropySoftmax1HotWithBiasDx, CrossentropySoftmaxArgmax1HotWithBias, CrossentropySoftmax1HotWithBiasDx, CrossentropySoftmaxArgmax1HotWithBias,
Prepend_scalar_constant_to_each_row, Prepend_scalar_to_each_row, Softmax, LogSoftmax, Prepend_scalar_constant_to_each_row,
Prepend_scalar_to_each_row, Softmax,
SoftmaxGrad, SoftmaxWithBias, binary_crossentropy, SoftmaxGrad, SoftmaxWithBias, binary_crossentropy,
categorical_crossentropy, crossentropy_categorical_1hot, categorical_crossentropy, crossentropy_categorical_1hot,
crossentropy_categorical_1hot_grad, crossentropy_softmax_1hot, crossentropy_categorical_1hot_grad, crossentropy_softmax_1hot,
...@@ -13,12 +14,7 @@ from .nnet import ( ...@@ -13,12 +14,7 @@ from .nnet import (
crossentropy_to_crossentropy_with_softmax, crossentropy_to_crossentropy_with_softmax,
crossentropy_to_crossentropy_with_softmax_with_bias, crossentropy_to_crossentropy_with_softmax_with_bias,
graph_merge_softmax_with_crossentropy_softmax, h_softmax, graph_merge_softmax_with_crossentropy_softmax, h_softmax,
local_advanced_indexing_crossentropy_onehot, logsoftmax, logsoftmax_op, prepend_0_to_each_row, prepend_1_to_each_row,
local_advanced_indexing_crossentropy_onehot_grad, local_argmax_pushdown,
local_log_softmax, local_softmax_grad_to_crossentropy_with_softmax_grad,
local_softmax_with_bias,
local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc,
make_out_pattern, prepend_0_to_each_row, prepend_1_to_each_row,
prepend_scalar_to_each_row, relu, softmax, softmax_grad, softmax_graph, prepend_scalar_to_each_row, relu, softmax, softmax_grad, softmax_graph,
softmax_op, softmax_simplifier, softmax_with_bias) softmax_op, softmax_simplifier, softmax_with_bias)
from . import opt from . import opt
......
...@@ -30,7 +30,6 @@ from theano.tensor.nnet.sigm import sigmoid, softplus ...@@ -30,7 +30,6 @@ from theano.tensor.nnet.sigm import sigmoid, softplus
from theano.gradient import DisconnectedType from theano.gradient import DisconnectedType
from theano.gradient import grad_not_implemented from theano.gradient import grad_not_implemented
from theano.tensor.nnet.blocksparse import sparse_block_dot from theano.tensor.nnet.blocksparse import sparse_block_dot
from theano.tensor.type import values_eq_approx_remove_nan
############ ############
...@@ -190,7 +189,6 @@ class SoftmaxWithBias(gof.Op): ...@@ -190,7 +189,6 @@ class SoftmaxWithBias(gof.Op):
{ {
size_t j; size_t j;
double sum = 0.0; double sum = 0.0;
bool discount_max = false;
const dtype_%(x)s* __restrict__ x_i = (dtype_%(x)s*)(PyArray_BYTES(%(x)s) + PyArray_STRIDES(%(x)s)[0] * i); const dtype_%(x)s* __restrict__ x_i = (dtype_%(x)s*)(PyArray_BYTES(%(x)s) + PyArray_STRIDES(%(x)s)[0] * i);
const dtype_%(b)s* __restrict__ b_i = (dtype_%(b)s*)(PyArray_BYTES(%(b)s)); const dtype_%(b)s* __restrict__ b_i = (dtype_%(b)s*)(PyArray_BYTES(%(b)s));
...@@ -431,6 +429,7 @@ class Softmax(gof.Op): ...@@ -431,6 +429,7 @@ class Softmax(gof.Op):
x.type) x.type)
if x.ndim == 1: if x.ndim == 1:
x = tensor.shape_padleft(x, n_ones=1) x = tensor.shape_padleft(x, n_ones=1)
return Apply(self, [x], [x.type()]) return Apply(self, [x], [x.type()])
def perform(self, node, input_storage, output_storage): def perform(self, node, input_storage, output_storage):
...@@ -508,12 +507,10 @@ class Softmax(gof.Op): ...@@ -508,12 +507,10 @@ class Softmax(gof.Op):
{ {
size_t j; size_t j;
double sum = 0.0; double sum = 0.0;
bool discount_max = false;
const dtype_%(x)s* __restrict__ x_i = (dtype_%(x)s*)(PyArray_BYTES(%(x)s) + PyArray_STRIDES(%(x)s)[0] * i); const dtype_%(x)s* __restrict__ x_i = (dtype_%(x)s*)(PyArray_BYTES(%(x)s) + PyArray_STRIDES(%(x)s)[0] * i);
dtype_%(sm) s* __restrict__ sm_i = (dtype_%(sm)s*)(PyArray_BYTES(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i); dtype_%(sm) s* __restrict__ sm_i = (dtype_%(sm)s*)(PyArray_BYTES(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i);
size_t row_max_j=0;
dtype_%(sm)s row_max = x_i[0]; dtype_%(sm)s row_max = x_i[0];
//std::cout << "0 " << row_max << "\\n"; //std::cout << "0 " << row_max << "\\n";
// Get the maximum value of the row // Get the maximum value of the row
...@@ -521,7 +518,6 @@ class Softmax(gof.Op): ...@@ -521,7 +518,6 @@ class Softmax(gof.Op):
{ {
dtype_%(sm)s row_ij = x_i[j * Sx1] ; dtype_%(sm)s row_ij = x_i[j * Sx1] ;
//std::cout << "1 " << row_ij << "\\n"; //std::cout << "1 " << row_ij << "\\n";
row_max_j = (row_ij > row_max) ? j : row_max_j;
row_max = (row_ij > row_max) ? row_ij : row_max; row_max = (row_ij > row_max) ? row_ij : row_max;
} }
...@@ -599,6 +595,198 @@ class Softmax(gof.Op): ...@@ -599,6 +595,198 @@ class Softmax(gof.Op):
softmax_op = Softmax() softmax_op = Softmax()
class LogSoftmax(gof.Op):
    """
    LogSoftmax activation function.

    .. math::

        \\varphi(\\mathbf{x})_j =
            \\mathbf{x}_j - \\log \\sum_{k=1}^K e^{\\mathbf{x}_k}

    where :math:`K` is the total number of neurons in the layer. This
    activation function gets applied row-wise.
    """

    # No hyper-parameters: two LogSoftmax instances are always equal.
    __props__ = ()

    def make_node(self, x):
        """Validate `x` (1-d or 2-d float tensor) and build the Apply node.

        A 1-d input is promoted to a 1-row matrix so that perform() and
        c_code() can always assume a 2-d array.
        """
        x = tensor.as_tensor_variable(x)
        if x.type.ndim not in (1, 2) \
                or x.type.dtype not in tensor.float_dtypes:
            raise ValueError('x must be 1-d or 2-d tensor of floats. Got %s' %
                             x.type)
        if x.ndim == 1:
            x = tensor.shape_padleft(x, n_ones=1)
        return Apply(self, [x], [x.type()])

    def perform(self, node, input_storage, output_storage):
        """Numpy fallback: numerically stable log(softmax(x)) per row."""
        x, = input_storage
        # Subtract the row-wise max before exponentiating so exp() cannot
        # overflow; this does not change the result of log-softmax.
        xdev = x - x.max(axis=1)[:, None]
        lsm = xdev - numpy.log(numpy.sum(numpy.exp(xdev), axis=1,
                                         keepdims=True))
        output_storage[0][0] = lsm

    def grad(self, inp, grads):
        """Symbolic gradient: g - sum(g, axis=1, keepdims) * softmax(x)."""
        x, = inp
        sm = softmax_op(x)
        return [grads[0] - tensor.sum(grads[0], axis=1, keepdims=True) * sm]

    def R_op(self, inputs, eval_points):
        # I think the Jacobian is symmetric so the R_op
        # is the same as the grad
        if None in eval_points:
            return [None]
        return self.grad(inputs, eval_points)

    def infer_shape(self, node, shape):
        # Output has exactly the shape of the (possibly padded) input.
        return shape

    def c_headers(self):
        # exp()/log() used in the generated C code.
        return ['<cmath>']

    @staticmethod
    def c_code_template(dtype):
        """Return the 4 C fragments (declarations, row-loop header, row
        body, row-loop footer) that c_code() concatenates and fills in.
        """
        # Validate the input and (re)allocate the output array %(sm)s.
        init_decl = """
        npy_intp* Nx = PyArray_DIMS(%(x)s);
        npy_intp Sx1 = 0;
        npy_intp Ssm1 = 0;

        if (PyArray_NDIM(%(x)s) != 2)
        {
            PyErr_SetString(PyExc_ValueError, "not a 2d tensor");
            %(fail)s;
        }
        if ((PyArray_TYPE(%(x)s) != NPY_DOUBLE) &&
            (PyArray_TYPE(%(x)s) != NPY_FLOAT))
        {
            PyErr_SetString(PyExc_TypeError, "not a float");
            %(fail)s;
        }

        if ((NULL == %(sm)s)
            || (PyArray_DIMS(%(sm)s)[0] != PyArray_DIMS(%(x)s)[0])
            || (PyArray_DIMS(%(sm)s)[1] != PyArray_DIMS(%(x)s)[1]))
        {
            Py_XDECREF(%(sm)s);
            %(sm)s = (PyArrayObject*)PyArray_SimpleNew(
                2, PyArray_DIMS(%(x)s),
                PyArray_TYPE(%(x)s));
            if(!%(sm)s) {
                PyErr_SetString(PyExc_MemoryError,
                     "failed to alloc sm output");
                %(fail)s
            }
        }

        Sx1 = PyArray_STRIDES(%(x)s)[1]/sizeof(dtype_%(x)s);
        Ssm1 = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s);
        """

        # Open the per-row loop and find the row maximum (for stability).
        begin_row_loop = """
        // minibatch loop
        for (size_t i = 0; i < Nx[0]; ++i)
        {
            size_t j;
            double sum = 0.0;

            const dtype_%(x)s* __restrict__ x_i = (dtype_%(x)s*)(
                PyArray_BYTES(%(x)s) + PyArray_STRIDES(%(x)s)[0] * i);
            dtype_%(sm)s* __restrict__ sm_i = (dtype_%(sm)s*)(
                PyArray_BYTES(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i);

            dtype_%(sm)s row_max = x_i[0];
            // Get the maximum value of the row
            for (j = 1; j < Nx[1]; ++j)
            {
                dtype_%(sm)s x_ij = x_i[j * Sx1] ;
                row_max = (x_ij > row_max) ? x_ij : row_max;
            }
        """

        # Two passes: store xdev in sm while accumulating sum(exp(xdev)),
        # then subtract log of that sum in place.
        inside_row_loop = """
            // Compute xdev and sum(exp(xdev), axis=1)
            double xdev_exp_row_sum = 0.0;
            for (j = 0; j < Nx[1]; j++)
            {
                // use sm_i to temporary store xdev
                sm_i[j * Ssm1] = (dtype_%(sm)s) (x_i[j * Sx1] - row_max);
                xdev_exp_row_sum += exp(sm_i[j * Ssm1]);
            }

            // Write sm = xdev - log(sum(exp(xdev), axis=1))
            xdev_exp_row_sum = log(xdev_exp_row_sum);
            for (j = 0; j < Nx[1]; ++j)
            {
                sm_i[j * Ssm1] -= (dtype_%(sm)s) xdev_exp_row_sum;
            }
        """

        end_row_loop = """
        }
        """
        return (init_decl, begin_row_loop, inside_row_loop, end_row_loop)

    def c_code(self, node, name, inp, out, sub):
        """Assemble the C implementation by filling the template with the
        input/output names and the failure code from `sub`."""
        x, = inp
        sm, = out
        code_template = ''.join(self.c_code_template(
            node.inputs[0].type.dtype_specs()[1]))
        return code_template % dict(locals(), **sub)

    @staticmethod
    def c_code_cache_version():
        return (0,)


# Singleton instance used throughout the graph optimizations below.
logsoftmax_op = LogSoftmax()
@opt.register_specialize('stabilize', 'fast_compile')
@gof.local_optimizer([tensor.Elemwise])
def local_logsoftmax(node):
    """
    Detect Log(Softmax(x)) and replace it with LogSoftmax(x)

    Note: only forward pass is affected
    """
    # Bail out (implicitly returning None) unless the node is an
    # elementwise Log applied directly to the output of a Softmax op.
    if not isinstance(node.op, tensor.Elemwise):
        return
    if not isinstance(node.op.scalar_op, scalar.basic.Log):
        return
    if len(node.inputs) != 1:
        return
    softmax_apply = node.inputs[0].owner
    if softmax_apply is None:
        return
    if not isinstance(softmax_apply.op, Softmax):
        return
    # Rebuild the graph as LogSoftmax applied to Softmax's own input.
    softmax_input = softmax_apply.inputs[0]
    return [LogSoftmax()(softmax_input)]
@opt.register_specialize('stabilize', 'fast_compile')
@gof.local_optimizer([SoftmaxGrad])
def local_logsoftmax_grad(node):
    """
    Detect Log(Softmax(x))'s grad and replace it with LogSoftmax(x)'s grad

    Note: only grad is affected
    """
    # The unstable gradient of log(softmax(x)) appears in the graph as
    # SoftmaxGrad(dy / sm, sm) where sm = softmax(x).  Match that exact
    # shape, except when the graph will instead be handled by
    # local_advanced_indexing_crossentropy_onehot_grad (see the
    # AdvancedIncSubtensor exclusion below).
    if (isinstance(node.op, SoftmaxGrad) and
        len(node.inputs) == 2 and
        node.inputs[0].owner is not None and
        isinstance(node.inputs[0].owner.op, tensor.Elemwise) and
        len(node.inputs[0].owner.inputs) >= 2 and
        node.inputs[0].owner.inputs[1].owner is not None and
        node.inputs[0].owner.inputs[1].owner.op == softmax_op and
        node.inputs[1] == node.inputs[0].owner.inputs[1] and
        not (
            # skip if it will be optimized by
            # local_advanced_indexing_crossentropy_onehot_grad
            node.inputs[0].owner.op == tensor.true_div and
            node.inputs[0].owner.inputs[0].owner is not None and
            isinstance(node.inputs[0].owner.inputs[0].owner.op,
                       subtensor.AdvancedIncSubtensor))):

        # get parameters from unoptimized op
        sm = node.inputs[0].owner.inputs[1]
        # sm_input = node.inputs[1].owner.inputs[0]
        grads = node.inputs[0].owner.inputs[0]
        # Broadcast grads up to sm's column count when needed so the
        # elementwise expression below is well-formed.
        if grads.broadcastable[1] and not sm.broadcastable[1]:
            grads = tensor.alloc(grads, grads.shape[0], sm.shape[1])
        # Numerically stable form of LogSoftmax's gradient.
        return [grads - tensor.sum(grads, axis=1, keepdims=True) * sm]
def softmax_graph(c): def softmax_graph(c):
return tensor.exp(c) / tensor.exp(c).sum(axis=-1, keepdims=True) return tensor.exp(c) / tensor.exp(c).sum(axis=-1, keepdims=True)
...@@ -607,6 +795,10 @@ def softmax(c): ...@@ -607,6 +795,10 @@ def softmax(c):
return softmax_op(c) return softmax_op(c)
def logsoftmax(c):
    """Return the row-wise log-softmax of `c`.

    Numerically stable equivalent of ``log(softmax(c))``.
    """
    return logsoftmax_op(c)
@opt.register_specialize('fast_compile_gpu') @opt.register_specialize('fast_compile_gpu')
@gof.local_optimizer([softmax_op]) @gof.local_optimizer([softmax_op])
def local_softmax_with_bias(node): def local_softmax_with_bias(node):
...@@ -1472,7 +1664,7 @@ def local_advanced_indexing_crossentropy_onehot(node): ...@@ -1472,7 +1664,7 @@ def local_advanced_indexing_crossentropy_onehot(node):
sm = log.owner.inputs[0] sm = log.owner.inputs[0]
# Second case: log(softmax(x)[rows, labels]) # Second case: log(softmax(x)[rows, labels])
if node.op == tensor.log: elif node.op == tensor.log:
pre_log = node.inputs[0].owner pre_log = node.inputs[0].owner
if pre_log and isinstance(pre_log.op, subtensor.AdvancedSubtensor): if pre_log and isinstance(pre_log.op, subtensor.AdvancedSubtensor):
try: try:
...@@ -1982,27 +2174,6 @@ prepend_0_to_each_row = Prepend_scalar_constant_to_each_row(0.) ...@@ -1982,27 +2174,6 @@ prepend_0_to_each_row = Prepend_scalar_constant_to_each_row(0.)
prepend_1_to_each_row = Prepend_scalar_constant_to_each_row(1.) prepend_1_to_each_row = Prepend_scalar_constant_to_each_row(1.)
# numerically stabilize log softmax (X)
# as X-X.max(axis=1).dimshuffle(0,'x') - log(exp(X-X.max(axis=1).dimshuffle(0,'x')).sum(axis=1)).dimshuffle(0,'x)
def make_out_pattern(X):
    """Build the numerically-stable replacement graph for log(softmax(X))."""
    stabilized_X = X - X.max(axis=1).dimshuffle(0, 'x')
    out_var = stabilized_X - tensor.log(tensor.exp(stabilized_X).sum(
        axis=1)).dimshuffle(0, 'x')
    # tell DEBUG_MODE that it's OK if the original graph produced NaN and the optimized graph does not
    out_var.values_eq_approx = values_eq_approx_remove_nan
    return out_var


# Pattern substitution: rewrite log(softmax(x)) into the stabilized graph.
local_log_softmax = gof.PatternSub(in_pattern=(tensor.log, (softmax_op, 'x')),
                                   out_pattern=(make_out_pattern, 'x'),
                                   allow_multiple_clients=True)

# don't do register_stabilize, this is to make local_log_softmax run
# only after another more specific optimization that stabilizes cross entropy
# opt.register_stabilize(local_log_softmax, name = 'local_log_softmax')
opt.register_specialize(local_log_softmax, 'fast_compile_gpu', name='local_log_softmax')
def relu(x, alpha=0): def relu(x, alpha=0):
""" """
Compute the element-wise rectified linear activation function. Compute the element-wise rectified linear activation function.
......
...@@ -24,8 +24,8 @@ from theano.tensor.nnet import (categorical_crossentropy, ...@@ -24,8 +24,8 @@ from theano.tensor.nnet import (categorical_crossentropy,
CrossentropyCategorical1HotGrad, CrossentropyCategorical1HotGrad,
sigmoid, softplus, Softmax, softmax, sigmoid, softplus, Softmax, softmax,
softmax_op, softmax_graph, SoftmaxWithBias, softmax_op, softmax_graph, SoftmaxWithBias,
softmax_grad, softmax_with_bias, LogSoftmax, logsoftmax_op,
softmax_with_bias, SoftmaxGrad, softmax_grad, SoftmaxGrad,
Prepend_scalar_constant_to_each_row, Prepend_scalar_constant_to_each_row,
Prepend_scalar_to_each_row, Prepend_scalar_to_each_row,
relu, relu,
...@@ -122,9 +122,9 @@ class T_SoftmaxWithBias(utt.InferShapeTester): ...@@ -122,9 +122,9 @@ class T_SoftmaxWithBias(utt.InferShapeTester):
# test that we don't raise an error during optimization for no good # test that we don't raise an error during optimization for no good
# reason as softmax_with_bias don't support correctly some/all # reason as softmax_with_bias don't support correctly some/all
# broadcasted inputs pattern # broadcasted inputs pattern
initial_W = numpy.asarray([[0.1, 0.1, 0.1], \ initial_W = numpy.asarray([[0.1, 0.1, 0.1],
[0.1, 0.1, 0.1], \ [0.1, 0.1, 0.1],
[0.1, 0.1, 0.1]], \ [0.1, 0.1, 0.1]],
dtype=theano.config.floatX) dtype=theano.config.floatX)
W = theano.shared(value=initial_W, name='W') W = theano.shared(value=initial_W, name='W')
vbias = theano.shared(value=0.1, name='vbias') # 0.01 vbias = theano.shared(value=0.1, name='vbias') # 0.01
...@@ -148,6 +148,118 @@ class T_SoftmaxWithBias(utt.InferShapeTester): ...@@ -148,6 +148,118 @@ class T_SoftmaxWithBias(utt.InferShapeTester):
[admat_val, advec_val], SoftmaxWithBias) [admat_val, advec_val], SoftmaxWithBias)
class T_LogSoftmax(utt.InferShapeTester):
    """Tests for the LogSoftmax op, its gradient, numerical stability,
    and the log(softmax) graph substitutions."""

    def test0(self):
        # Gradient through a single output column (column 0).
        def f(a):
            return logsoftmax_op(a)[:, 0]
        utt.verify_grad(f, [numpy.random.rand(3, 4)])

    def test1(self):
        # Gradient through column 1.
        def f(a):
            return logsoftmax_op(a)[:, 1]
        utt.verify_grad(f, [numpy.random.rand(3, 4)])

    def test2(self):
        # Gradient through column 2.
        def f(a):
            return logsoftmax_op(a)[:, 2]
        utt.verify_grad(f, [numpy.random.rand(3, 4)])

    def test3(self):
        # Gradient through column 3.
        def f(a):
            return logsoftmax_op(a)[:, 3]
        utt.verify_grad(f, [numpy.random.rand(3, 4)])

    def test_matrix(self):
        # Gradient of the full matrix output.
        def f(a):
            return logsoftmax_op(a)
        utt.verify_grad(f, [numpy.random.rand(3, 4)])

    def test_vector(self):
        # A 1-d input must match log(exp(x) / sum(exp(x))).
        x = T.vector()
        f = theano.function([x], logsoftmax_op(x))

        xv = numpy.random.randn(6).astype(config.floatX)
        assert numpy.allclose(f(xv),
                              numpy.log(numpy.exp(xv) / numpy.exp(xv).sum()))

    def test_vector_grad(self):
        # Gradient also works for 1-d input (padded to a 1-row matrix).
        def f(a):
            return logsoftmax_op(a)
        utt.verify_grad(f, [numpy.random.rand(4)])

    def test_allclose(self):
        # Compare the stable log-softmax path against the naive
        # softmax/crossentropy path on large inputs.
        x, y = tensor.matrices('xy')

        # regular softmax and crossentropy
        sm = tensor.nnet.softmax(x)
        cm = tensor.nnet.categorical_crossentropy(sm, y)

        # numerically stable log-softmax with crossentropy
        logsm = tensor.nnet.logsoftmax(x)
        sm2 = tensor.exp(logsm)  # just used to show equivalence with sm
        cm2 = -tensor.sum(y*logsm, axis=1)
        grad = tensor.grad(cm2.mean(), x)

        # create some inputs into a softmax that are large and labels
        a = numpy.exp(10*numpy.random.rand(5, 10).astype(theano.config.floatX))
        # create some one-hot coded labels
        b = numpy.eye(5, 10).astype(theano.config.floatX)

        # show equivalence of softmax and exponentiated numerically stable
        # log-softmax
        f1 = theano.function([x], [sm, sm2])
        sm_, sm2_ = f1(a)
        utt.assert_allclose(sm_, sm2_)

        # now show that the two versions result in the same crossentropy cost
        # this indicates that the forward function does provide some numerical
        # stability
        f2 = theano.function([x, y], [cm, cm2])
        cm_, cm2_ = f2(a, b)
        utt.assert_allclose(cm_, cm2_)

        # now, show that in the standard softmax case the gradients blow up
        # while in the log-softmax case they don't
        f3 = theano.function([x, y], [grad])
        grad_ = f3(a, b)
        assert numpy.all(numpy.isnan(grad_) == False)

    def test_isclose(self):
        # NOTE(review): this test defines `f` but never calls or asserts
        # anything — it is a no-op as written; presumably incomplete.
        # TODO: confirm intent and add the missing assertion.
        def f(a):
            return logsoftmax_op(a)

    def test_local_softmax_optimization(self):
        """Test the Logsoftmax substitution

        Check that Log(Softmax(x)) is substituted with Logsoftmax(x). Note that
        only the forward pass is checked (i.e., doesn't check the gradient)
        """
        x, y = tensor.matrices('xy')
        sm = tensor.nnet.softmax(x)
        logsm = tensor.log(sm)
        f = theano.function([x], logsm)
        # After compilation the output node must be the fused LogSoftmax op.
        assert isinstance(f.maker.fgraph.outputs[0].owner.op,
                          theano.tensor.nnet.nnet.LogSoftmax)

    def test_local_softmax_grad_optimization_and_big_input(self):
        """Test the Logsoftmax's grad substitution.

        Check that Log(Softmax(x))'s grad is substituted with Logsoftmax(x)'s
        grad and that the new operation does not explode for big inputs.
        Note that only the grad is checked.
        """
        # some inputs that are large to make the gradient explode in the non
        # optimized case
        a = numpy.exp(10*numpy.random.rand(5, 10).astype(theano.config.floatX))

        def myfunc(x):
            sm = tensor.nnet.softmax(x)
            logsm = tensor.log(sm)
            return logsm

        # We set step to 0.1 because for big values we need a big epsilon
        utt.verify_grad(myfunc, [a], eps=0.1)
class T_SoftmaxGrad(utt.InferShapeTester): class T_SoftmaxGrad(utt.InferShapeTester):
def test_infer_shape(self): def test_infer_shape(self):
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment