Commit 430561ad, authored by Iban Harlouchet

numpydoc for theano/tensor/nnet/nnet.py

上级 58586211
"""Provides neural-network specific Ops. """
Provides neural-network specific Ops.
:note: TODO: factor this out into a neural-network toolbox. Notes
-----
TODO: factor this out into a neural-network toolbox.
:note: We register all optimizations with the gpu tag as we don't We register all optimizations with the gpu tag as we don't
implement all the intermediate cases on the GPU (in particular implement all the intermediate cases on the GPU (in particular
AdvancedSubtensor). So to make sure it runs well on the gpu with AdvancedSubtensor). So to make sure it runs well on the gpu with
fast_compile, we register them as needed for the GPU. This can be fast_compile, we register them as needed for the GPU. This can be
revisited later when all the intermediate parts are on the GPU. revisited later when all the intermediate parts are on the GPU.
""" """
import logging import logging
...@@ -38,13 +41,16 @@ class SoftmaxWithBias(gof.Op): ...@@ -38,13 +41,16 @@ class SoftmaxWithBias(gof.Op):
""" """
An L{Op} for the output of neural-net multiclass classifiers. An L{Op} for the output of neural-net multiclass classifiers.
@type x: is a matrix of floats (32 or 64) Attributes
@type b: is a [row] vector of floats (32 or 64), ----------
length is number of cols in x x : a matrix of floats (32 or 64)
b : a [row] vector of floats (32 or 64), length is number of cols in x
This L{Op}'s output is softmax(x+b). This L{Op}'s output is softmax(x+b).
softmax(x[i]) is the i'th distribution over len(x[i]) options. softmax(x[i]) is the i'th distribution over len(x[i]) options.
""" """
nin = 2 nin = 2
nout = 1 nout = 1
__props__ = () __props__ = ()
...@@ -270,7 +276,11 @@ softmax_with_bias = SoftmaxWithBias() ...@@ -270,7 +276,11 @@ softmax_with_bias = SoftmaxWithBias()
class SoftmaxGrad(gof.Op): class SoftmaxGrad(gof.Op):
"""Gradient wrt x of the Softmax Op""" """
Gradient wrt x of the Softmax Op.
"""
nin = 2 nin = 2
nout = 1 nout = 1
__props__ = () __props__ = ()
...@@ -391,6 +401,7 @@ class Softmax(gof.Op): ...@@ -391,6 +401,7 @@ class Softmax(gof.Op):
\\frac{e^{\mathbf{x}_j}}{\sum_{k=1}^K e^{\mathbf{x}_k}}` \\frac{e^{\mathbf{x}_j}}{\sum_{k=1}^K e^{\mathbf{x}_k}}`
where :math:`K` is the total number of neurons in the layer. This where :math:`K` is the total number of neurons in the layer. This
activation function gets applied row-wise. activation function gets applied row-wise.
""" """
nin = 1 nin = 1
...@@ -584,7 +595,9 @@ def softmax(c): ...@@ -584,7 +595,9 @@ def softmax(c):
@opt.register_specialize('fast_compile_gpu') @opt.register_specialize('fast_compile_gpu')
@gof.local_optimizer([softmax_op]) @gof.local_optimizer([softmax_op])
def local_softmax_with_bias(node): def local_softmax_with_bias(node):
"""Try to turn softmax(sum_of_stuff) -> softmax_w_bias(matrix, bias) """
Try to turn softmax(sum_of_stuff) -> softmax_w_bias(matrix, bias).
""" """
if node.op == softmax_op: if node.op == softmax_op:
x, = node.inputs x, = node.inputs
...@@ -789,15 +802,18 @@ if 0: ...@@ -789,15 +802,18 @@ if 0:
class CrossentropySoftmaxArgmax1HotWithBias(gof.Op): class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
"""A special compound L{Op} for the output of neural-net classifiers. """
A special compound L{Op} for the output of neural-net classifiers.
:type x: is a matrix of floats (32 or 64) Attributes
:type b: is a [row] vector of floats (32 or 64), ----------
length is number of cols in x x : a matrix of floats (32 or 64)
:type y_idx: a [column] vector of int (32 or 64), b : a [row] vector of floats (32 or 64), length is number of cols in x
length is number of rows in x y_idx : a [column] vector of int (32 or 64), length is number of rows in x
:returns: row-wise NLL, softmax(x+b), row-wise argmax of (x+b) Returns
-------
row-wise NLL, softmax(x+b), row-wise argmax of (x+b).
@precondition: every entry in y_idx is a valid (non-negative) @precondition: every entry in y_idx is a valid (non-negative)
column index into x column index into x
...@@ -816,6 +832,7 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op): ...@@ -816,6 +832,7 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
i'th example. i'th example.
""" """
nin = 3 nin = 3
nout = 3 nout = 3
__props__ = () __props__ = ()
...@@ -846,7 +863,8 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op): ...@@ -846,7 +863,8 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
return Apply(self, [x, b, y_idx], [nll, sm, am]) return Apply(self, [x, b, y_idx], [nll, sm, am])
def perform(self, node, input_storage, output_storage): def perform(self, node, input_storage, output_storage):
"""The math, where x is an input vector, and t is a target index: """
The math, where x is an input vector, and t is a target index:
softmax(x)[i] = exp(x[i]) / sum_j(exp(x[j])) softmax(x)[i] = exp(x[i]) / sum_j(exp(x[j]))
nll(x,t) = -log(softmax(x)[t]) nll(x,t) = -log(softmax(x)[t])
...@@ -1037,12 +1055,15 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op): ...@@ -1037,12 +1055,15 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
class CrossentropySoftmax1HotWithBiasDx(gof.Op): class CrossentropySoftmax1HotWithBiasDx(gof.Op):
"""
Gradient wrt x of the CrossentropySoftmaxArgmax1HotWithBias Op.
"""
nin = 3 nin = 3
nout = 1 nout = 1
__props__ = () __props__ = ()
"""Gradient wrt x of the CrossentropySoftmaxArgmax1HotWithBias Op"""
def make_node(self, dy, sm, y_idx, **kwargs): def make_node(self, dy, sm, y_idx, **kwargs):
dy = tensor.as_tensor_variable(dy) dy = tensor.as_tensor_variable(dy)
sm = tensor.as_tensor_variable(sm) sm = tensor.as_tensor_variable(sm)
...@@ -1217,15 +1238,18 @@ def crossentropy_softmax_1hot(x, y_idx, **kwargs): ...@@ -1217,15 +1238,18 @@ def crossentropy_softmax_1hot(x, y_idx, **kwargs):
def crossentropy_softmax_max_and_argmax_1hot_with_bias(x, b, y_idx, **kwargs): def crossentropy_softmax_max_and_argmax_1hot_with_bias(x, b, y_idx, **kwargs):
""" """
@return: The cross-entropy, the softmax output, the max probability, Returns
and the argmax index -------
The cross-entropy, the softmax output, the max probability,
and the argmax index.
@todo: Since we are recomputing the argmax, TODO: Since we are recomputing the argmax,
we might as well assert that it is correct. we might as well assert that it is correct.
@todo: Is this entire function TODO: Is this entire function
unnecessary? e.g. CrossentropySoftmaxArgmax1HotWithBias should return unnecessary? e.g. CrossentropySoftmaxArgmax1HotWithBias should return
the appropriate information (i.e. the max probability)? the appropriate information (i.e. the max probability)?
""" """
(xent, softmax) = crossentropy_softmax_1hot_with_bias(x, b, y_idx, **kwargs) (xent, softmax) = crossentropy_softmax_1hot_with_bias(x, b, y_idx, **kwargs)
(max_pr, argmax) = tensor.max_and_argmax(softmax, axis=-1) (max_pr, argmax) = tensor.max_and_argmax(softmax, axis=-1)
...@@ -1262,29 +1286,34 @@ crossentropy_categorical_1hot_grad = CrossentropyCategorical1HotGrad() ...@@ -1262,29 +1286,34 @@ crossentropy_categorical_1hot_grad = CrossentropyCategorical1HotGrad()
class CrossentropyCategorical1Hot(gof.Op): class CrossentropyCategorical1Hot(gof.Op):
"""
"""Compute the cross entropy between a coding distribution and Compute the cross entropy between a coding distribution and
a true distribution of the form [0, 0, ... 0, 1, 0, ..., 0] a true distribution of the form [0, 0, ... 0, 1, 0, ..., 0].
.. math:: .. math::
y[i] = - \log(coding_dist[i, one_of_n[i]]) y[i] = - \log(coding_dist[i, one_of_n[i]])
Notes
:note: In the case that the coding distribution is the output of a -----
softmax, an application of this Op will probably be optimized In the case that the coding distribution is the output of a
away in favour of one with a C implementation. softmax, an application of this Op will probably be optimized
away in favour of one with a C implementation.
""" """
__props__ = () __props__ = ()
def make_node(self, coding_dist, true_one_of_n): def make_node(self, coding_dist, true_one_of_n):
""" """
:type coding_dist: dense matrix Parameters
----------
coding_dist : dense matrix
true_one_of_n : lvector
:type true_one_of_n: lvector Returns
-------
dvector
:rtype: dvector
""" """
_coding_dist = tensor.as_tensor_variable(coding_dist) _coding_dist = tensor.as_tensor_variable(coding_dist)
_true_one_of_n = tensor.as_tensor_variable(true_one_of_n) _true_one_of_n = tensor.as_tensor_variable(true_one_of_n)
...@@ -1332,10 +1361,13 @@ crossentropy_categorical_1hot = CrossentropyCategorical1Hot() ...@@ -1332,10 +1361,13 @@ crossentropy_categorical_1hot = CrossentropyCategorical1Hot()
@opt.register_specialize('fast_compile_gpu') @opt.register_specialize('fast_compile_gpu')
@gof.optimizer @gof.optimizer
def crossentropy_to_crossentropy_with_softmax_with_bias(fgraph): def crossentropy_to_crossentropy_with_softmax_with_bias(fgraph):
"""This is a stabilization optimization """
This is a stabilization optimization.
:note: not a local optimization because we are replacing outputs Notes
from several nodes at once -----
Not a local optimization because we are replacing outputs
from several nodes at once.
""" """
...@@ -1362,16 +1394,19 @@ def crossentropy_to_crossentropy_with_softmax_with_bias(fgraph): ...@@ -1362,16 +1394,19 @@ def crossentropy_to_crossentropy_with_softmax_with_bias(fgraph):
@gof.optimizer @gof.optimizer
def crossentropy_to_crossentropy_with_softmax(fgraph): def crossentropy_to_crossentropy_with_softmax(fgraph):
"""This is a stabilization optimization that is more general then """
crossentropy_to_crossentropy_with_softmax_with_bias This is a stabilization optimization that is more general than
crossentropy_to_crossentropy_with_softmax_with_bias.
It must be executed after local_softmax_with_bias optimization in It must be executed after local_softmax_with_bias optimization in
specialize specialize.
:todo: This is a stabilization optimization! How can this be done more cleanly? TODO : This is a stabilization optimization! How can this be done more cleanly?
:note: not a local optimization because we are replacing outputs Notes
from several nodes at once -----
Not a local optimization because we are replacing outputs from several
nodes at once.
""" """
...@@ -1460,11 +1495,13 @@ def local_argmax_pushdown(node): ...@@ -1460,11 +1495,13 @@ def local_argmax_pushdown(node):
def _check_rows_is_arange_len_labels(rows, labels): def _check_rows_is_arange_len_labels(rows, labels):
'''Check that 'rows' is the same node as T.arange(labels.shape[0]) """
Check that 'rows' is the same node as T.arange(labels.shape[0]).
Also considers the case where labels.shape[0] is constant and equal Also considers the case where labels.shape[0] is constant and equal
to 1, and T.arange(labels.shape[0]) has been constant-folded into 0. to 1, and T.arange(labels.shape[0]) has been constant-folded into 0.
'''
"""
if labels.owner and hasattr(labels.owner.fgraph, 'shape_feature'): if labels.owner and hasattr(labels.owner.fgraph, 'shape_feature'):
shape_of = labels.owner.fgraph.shape_feature.shape_of shape_of = labels.owner.fgraph.shape_feature.shape_of
...@@ -1795,10 +1832,11 @@ def graph_merge_softmax_with_crossentropy_softmax(node): ...@@ -1795,10 +1832,11 @@ def graph_merge_softmax_with_crossentropy_softmax(node):
@gof.local_optimizer([CrossentropySoftmax1HotWithBiasDx]) @gof.local_optimizer([CrossentropySoftmax1HotWithBiasDx])
def local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc(node): def local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc(node):
""" """
Replaces a CrossentropySoftmax1HotWithBiasDx op, whose incoming gradient is Replace a CrossentropySoftmax1HotWithBiasDx op, whose incoming gradient is
an `alloc` of a scalar variable or one that has either broadcastable or an `alloc` of a scalar variable or one that has either broadcastable or
matching dimensions with the output variable, by one that skips the matching dimensions with the output variable, by one that skips the
intermediate `alloc`. intermediate `alloc`.
""" """
if isinstance(node.op, CrossentropySoftmax1HotWithBiasDx): if isinstance(node.op, CrossentropySoftmax1HotWithBiasDx):
dy, sm, y_idx = node.inputs dy, sm, y_idx = node.inputs
...@@ -1850,30 +1888,38 @@ def local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc(node): ...@@ -1850,30 +1888,38 @@ def local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc(node):
def binary_crossentropy(output, target): def binary_crossentropy(output, target):
""" """
Compute the crossentropy of binary random variables Compute the crossentropy of binary random variables.
output and target are each expectations of binary random
Output and target are each expectations of binary random
variables; target may be exactly 0 or 1 but output must variables; target may be exactly 0 or 1 but output must
lie strictly between 0 and 1. lie strictly between 0 and 1.
@note: we could use the x log y op to support output=0
@ and output=1. The gradient would still be undefined though. Notes
@note: We do not sum, crossentropy is computed by component. -----
@todo: Rewrite as a scalar, and then broadcast to tensor. We could use the x log y op to support output=0 and output=1.
The gradient would still be undefined though.
We do not sum, crossentropy is computed by component.
TODO : Rewrite as a scalar, and then broadcast to tensor.
""" """
return -(target * tensor.log(output) + (1.0 - target) * tensor.log(1.0 - output)) return -(target * tensor.log(output) + (1.0 - target) * tensor.log(1.0 - output))
def categorical_crossentropy(coding_dist, true_dist): def categorical_crossentropy(coding_dist, true_dist):
""" """
WARNING: THIS FUNCTION IS UNNECESSARILY POLYMORPHIC. Return the cross-entropy between an approximating distribution and a true
We ultimately don't want the polymorphism, and will move this function to pylearn.algorithms.cost. distribution.
The 1hot version will be removed.
The length of the documentation here is a form of code smell.
Return the cross-entropy between an approximating distribution and a true distribution .. warning:: THIS FUNCTION IS UNNECESSARILY POLYMORPHIC.
We ultimately don't want the polymorphism, and will move this function
to pylearn.algorithms.cost. The 1hot version will be removed.
The length of the documentation here is a form of code smell.
The cross entropy between two probability distributions measures the average number of bits The cross entropy between two probability distributions measures the average
needed to identify an event from a set of possibilities, if a coding scheme is used based number of bits needed to identify an event from a set of possibilities, if a
on a given probability distribution q, rather than the "true" distribution p. coding scheme is used based on a given probability distribution q, rather
than the "true" distribution p.
Mathematically it is defined as follows: Mathematically it is defined as follows:
...@@ -1881,20 +1927,25 @@ def categorical_crossentropy(coding_dist, true_dist): ...@@ -1881,20 +1927,25 @@ def categorical_crossentropy(coding_dist, true_dist):
H(p,q) = - \sum_x p(x) \log(q(x)) H(p,q) = - \sum_x p(x) \log(q(x))
:type coding_dist: a dense matrix. Parameters
:param coding_dist: Each slice along axis represents one distribution. ----------
coding_dist : a dense matrix
:type true_dist: a dense matrix or sparse matrix or integer vector. Each slice along axis represents one distribution.
:param coding_dist: In the case of a matrix argument, each slice along axis represents one true_dist : a dense matrix or sparse matrix or integer vector
distribution. In the case of an integer vector argument, each element represents the In the case of a matrix argument, each slice along axis represents one
position of the '1' in a 1-of-N encoding. distribution. In the case of an integer vector argument, each element
represents the position of the '1' in a 1-of-N encoding.
:type axis: int
:param axis: the dimension over which each distribution runs. (1 for row distributions, 0 Returns
for column distributions) -------
tensor of rank one-less-than `coding_dist`
:rtype: tensor of rank one-less-than `coding_dist` The cross entropy between each coding and true distribution.
:returns: the cross entropy between each coding and true distribution.
Notes
-----
axis : int
The dimension over which each distribution runs
(1 for row distributions, 0 for column distributions).
""" """
if true_dist.ndim == coding_dist.ndim: if true_dist.ndim == coding_dist.ndim:
...@@ -2036,23 +2087,27 @@ def relu(x, alpha=0): ...@@ -2036,23 +2087,27 @@ def relu(x, alpha=0):
""" """
Compute the element-wise rectified linear activation function. Compute the element-wise rectified linear activation function.
:type x: symbolic tensor Parameters
:param x: Tensor to compute the activation function for. ----------
x : symbolic tensor
:type alpha: scalar or tensor, optional Tensor to compute the activation function for.
:param alpha: Slope for negative input, usually between 0 and 1. The alpha : scalar or tensor, optional
default value of 0 will lead to the standard rectifier, 1 will lead to Slope for negative input, usually between 0 and 1. The default value
of 0 will lead to the standard rectifier, 1 will lead to
a linear activation function, and any value in between will give a a linear activation function, and any value in between will give a
leaky rectifier. A shared variable (broadcastable against `x`) will leaky rectifier. A shared variable (broadcastable against `x`) will
result in a parameterized rectifier with learnable slope(s). result in a parameterized rectifier with learnable slope(s).
:rtype: symbolic tensor Returns
:return: element-wise rectifier applied to `x` -------
symbolic tensor
Element-wise rectifier applied to `x`.
.. note:: This is numerically equivalent to Notes
``T.switch(x > 0, x, alpha * x)`` -----
(or ``T.maximum(x, alpha * x)`` for ``alpha < 1``), but uses a faster This is numerically equivalent to ``T.switch(x > 0, x, alpha * x)``
formulation or an optimized Op, so we encourage using this function. (or ``T.maximum(x, alpha * x)`` for ``alpha < 1``), but uses a faster
formulation or an optimized Op, so we encourage using this function.
""" """
# This is probably the fastest implementation for GPUs. Both the forward # This is probably the fastest implementation for GPUs. Both the forward
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论