提交 a85a44fc authored 作者: Xavier Bouthillier's avatar Xavier Bouthillier

Merge pull request #3361 from adbrebs/h_softmax

Two-layer hierarchical softmax
.. _libdoc_blocksparse:
===========================================================================
:mod:`sandbox.blocksparse` -- Block sparse dot operations (gemv and outer)
===========================================================================
===============================================================================
:mod:`blocksparse` -- Block sparse dot operations (gemv and outer)
===============================================================================
.. module:: sandbox.blocksparse
.. module:: tensor.nnet.blocksparse
:platform: Unix, Windows
:synopsis: Block sparse dot
.. moduleauthor:: LISA
API
===
.. automodule:: theano.sandbox.blocksparse
.. automodule:: theano.tensor.nnet.blocksparse
:members:
......@@ -20,3 +20,4 @@ and ops which are particular to neural networks and deep learning.
nnet
neighbours
bn
blocksparse
......@@ -21,6 +21,7 @@
- :func:`relu() <theano.tensor.nnet.relu>`
- :func:`binary_crossentropy`
- :func:`.categorical_crossentropy`
- :func:`h_softmax() <theano.tensor.nnet.h_softmax>`
.. function:: sigmoid(x)
......@@ -204,3 +205,6 @@
y = T.nnet.softmax(T.dot(W, x) + b)
cost = T.nnet.categorical_crossentropy(y, o)
# o is either the above-mentioned 1-of-N vector or 2D tensor
.. autofunction:: theano.tensor.nnet.h_softmax
import numpy
import theano
from theano import Op, Apply
from theano import tensor
from theano.tensor import discrete_dtypes
from theano.gradient import grad_undefined
class SparseBlockGemv(Op):
    """
    This op computes the dot product of specified pieces of vectors
    and matrices, returning pieces of vectors::

        for b in range(batch_size):
            for j in range(o.shape[1]):
                for i in range(h.shape[1]):
                    o[b, j, :] += numpy.dot(h[b, i], W[iIdx[b, i], oIdx[b, j]])

    where b, h, W, o, iIdx, oIdx are defined in the docstring of make_node.

    .. image:: ../../images/blocksparse.png
        :scale: 50 %

    """
    registered_opts = []

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            # Output 0 is computed by destroying (updating) input 0.
            self.destroy_map = {0: [0]}

    def make_node(self, o, W, h, inputIdx, outputIdx):
        """
        Compute the dot product of the specified pieces of vectors
        and matrices.

        The parameter types are actually their expected shapes
        relative to each other.

        Parameters
        ----------
        o : batch, oWin, oSize
            output vector
        W : iBlocks, oBlocks, iSize, oSize
            weight matrix
        h : batch, iWin, iSize
            input from lower layer (sparse)
        inputIdx : batch, iWin
            indexes of the input blocks
        outputIdx : batch, oWin
            indexes of the output blocks

        Returns
        -------
        (batch, oWin, oSize)
            dot(W[i, j], h[i]) + o[j]

        Notes
        -----
        - `batch` is the number of examples in a minibatch (batch size).
        - `iBlocks` is the total number of blocks in the input (from lower
          layer).
        - `iSize` is the size of each of these input blocks.
        - `iWin` is the number of blocks that will be used as inputs. Which
          blocks will be used is specified in `inputIdx`.
        - `oBlocks` is the number of possible output blocks.
        - `oSize` is the size of each of these output blocks.
        - `oWin` is the number of output blocks that will actually be
          computed.  Which blocks will be computed is specified in
          `outputIdx`.

        """
        o = theano.tensor.as_tensor_variable(o)
        W = theano.tensor.as_tensor_variable(W)
        h = theano.tensor.as_tensor_variable(h)
        inputIdx = theano.tensor.as_tensor_variable(inputIdx)
        outputIdx = theano.tensor.as_tensor_variable(outputIdx)

        # NOTE: the check below is ndim != 3, so the message must say 3D
        # (it previously, incorrectly, said 2D).
        if o.ndim != 3:
            raise TypeError('The output o must be a 3D tensor')
        if W.ndim != 4:
            raise TypeError('The weight matrix W must be a 4D tensor')
        if h.ndim != 3:
            raise TypeError('The input h must be a 3D tensor')
        if inputIdx.ndim != 2:
            raise TypeError('The input indices inputIdx must be a 2D tensor')
        if outputIdx.ndim != 2:
            raise TypeError('The output indices outputIdx must be a 2D tensor')

        assert inputIdx.type.dtype in discrete_dtypes
        assert outputIdx.type.dtype in discrete_dtypes

        # Same dtype as o, but with no broadcastable dimension.
        output = o.type.__class__(dtype=o.type.dtype,
                                  broadcastable=(False,) * o.ndim)()
        return Apply(self, [o, W, h, inputIdx, outputIdx], [output])

    def perform(self, node, inp, out_):
        o, W, h, iIdx, oIdx = inp[:5]

        if not self.inplace:
            # Work on a copy so the original input is preserved.
            o = o.copy()

        # Reference (slow) implementation of the block-sparse gemv.
        for b in range(o.shape[0]):
            for j in range(o.shape[1]):
                outputIdx = oIdx[b, j]
                for i in range(h.shape[1]):
                    inputIdx = iIdx[b, i]
                    w = W[inputIdx, outputIdx]
                    o[b, j, :] += numpy.dot(h[b, i], w)
        out_[0][0] = o

    def grad(self, inputs, grads):
        o, W, h, inputIdx, outputIdx = inputs
        go = grads[0]

        outer_fun = SparseBlockOuter(self.inplace)
        gemv_fun = SparseBlockGemv(self.inplace)

        # dW is an outer product of h and the output gradient; dh is a gemv
        # with the transposed blocks of W.
        Wgrad = outer_fun(W.zeros_like(), h, go, inputIdx, outputIdx)
        hgrad = gemv_fun(h.zeros_like(), W.dimshuffle((1, 0, 3, 2)),
                         go, outputIdx, inputIdx)
        return [go, Wgrad, hgrad,
                grad_undefined(self, 3, inputIdx,
                               "grad of inputIdx makes no sense"),
                grad_undefined(self, 4, outputIdx,
                               "grad of outputIdx makes no sense")]
class SparseBlockOuter(Op):
    """
    This computes the outer product of two sets of pieces of vectors
    updating a full matrix with the results::

        for b in range(batch_size):
            o[xIdx[b, i], yIdx[b, j]] += (alpha * outer(x[b, i], y[b, j]))

    This op is involved in the gradient of SparseBlockGemv.

    """
    registered_opts = []

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            # Output 0 is computed by destroying (updating) input 0.
            self.destroy_map = {0: [0]}

    def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
        """
        Compute the outer product of the specified pieces of vectors
        and matrices.

        The parameter types are actually their expected shapes
        relative to each other.

        Parameters
        ----------
        o : xBlocks, yBlocks, xSize, ySize
        x : batch, xWin, xSize
        y : batch, yWin, ySize
        xIdx : batch, xWin
            indexes of the x blocks
        yIdx : batch, yWin
            indexes of the y blocks
        alpha : scalar, optional
            scale factor applied to the outer products (defaults to 1.0)

        Returns
        -------
        (xBlocks, yBlocks, xSize, ySize)
            alpha * outer(x[i], y[j]) + o[i, j]

        Notes
        -----
        - `batch` is the number of examples in a minibatch (batch size).
        - `xBlocks` is the total number of blocks in x.
        - `xSize` is the size of each of these x blocks.
        - `xWin` is the number of blocks that will be used as x. Which blocks
          will be used is specified in `xIdx`.
        - `yBlocks` is the number of possible y blocks.
        - `ySize` is the size of each of these y blocks.
        - `yWin` is the number of y blocks that will actually be computed.
          Which blocks will be computed is specified in `yIdx`.

        """
        one = tensor.constant(numpy.asarray(1.0, dtype='float32'))
        o = theano.tensor.as_tensor_variable(o)
        x = theano.tensor.as_tensor_variable(x)
        y = theano.tensor.as_tensor_variable(y)
        # Accept raw index arrays too, for consistency with SparseBlockGemv.
        xIdx = theano.tensor.as_tensor_variable(xIdx)
        yIdx = theano.tensor.as_tensor_variable(yIdx)

        if alpha is None:
            alpha = one

        output = o.type.__class__(dtype=o.type.dtype,
                                  broadcastable=(False,) * o.ndim)()

        return Apply(self, [o, x, y, xIdx, yIdx, alpha],
                     [output])

    def perform(self, node, inp, out_):
        o, x, y, xIdx, yIdx, alpha = inp[:6]

        if not self.inplace:
            o = o.copy()

        for b in range(x.shape[0]):
            for i in range(xIdx.shape[1]):
                for j in range(yIdx.shape[1]):
                    # Scale by alpha as documented in the class docstring
                    # (alpha was previously accepted but silently ignored;
                    # the default alpha of 1.0 keeps old behavior unchanged).
                    o[xIdx[b, i], yIdx[b, j]] += alpha * numpy.outer(
                        x[b, i], y[b, j, :])
        out_[0][0] = o
# Pre-built op instances.  The non-inplace versions are safe to use directly;
# the inplace variants destroy their first input (see destroy_map) and are
# normally introduced only by graph optimizations.
sparse_block_gemv = SparseBlockGemv(False)
sparse_block_gemv_inplace = SparseBlockGemv(True)
sparse_block_outer = SparseBlockOuter(False)
sparse_block_outer_inplace = SparseBlockOuter(True)
def sparse_block_dot(W, h, inputIdx, b, outputIdx):
    """
    Compute the dot product (plus bias) of the specified pieces of vectors
    and matrices. See SparseBlockGemv to get more information.

    The parameter types are actually their expected shapes relative to
    each other.

    Parameters
    ----------
    W : iBlocks, oBlocks, iSize, oSize
        weight matrix
    h : batch, iWin, iSize
        input from lower layer (sparse)
    inputIdx : batch, iWin
        indexes of the input blocks
    b : oBlocks, oSize
        bias vector
    outputIdx : batch, oWin
        indexes of the output blocks

    Returns
    -------
    (batch, oWin, oSize)
        dot(W[i, j], h[i]) + b[j] but b[j] is only added once

    Notes
    -----
    - `batch` is the number of examples in a minibatch (batch size).
    - `iBlocks` is the total number of blocks in the input (from lower layer).
    - `iSize` is the size of each of these input blocks.
    - `iWin` is the number of blocks that will be used as inputs. Which blocks
      will be used is specified in `inputIdx`.
    - `oBlocks` is the number of possible output blocks.
    - `oSize` is the size of each of these output blocks.
    - `oWin` is the number of output blocks that will actually be computed.
      Which blocks will be computed is specified in `outputIdx`.

    """
    assert inputIdx.ndim == h.ndim - 1
    assert outputIdx.ndim == inputIdx.ndim

    # Accept single-example (2D) inputs by adding a broadcastable batch axis.
    if h.ndim == 2:
        h = h.dimshuffle('x', 0, 1)
        inputIdx = inputIdx.dimshuffle('x', 0)
        outputIdx = outputIdx.dimshuffle('x', 0)

    # Reuse the module-level non-inplace instance instead of building a new
    # op object on every call.
    return sparse_block_gemv(b.take(outputIdx, axis=0), W, h,
                             inputIdx, outputIdx)
import warnings

from theano.tensor.nnet.blocksparse import (
    SparseBlockGemv, SparseBlockOuter, sparse_block_dot, sparse_block_gemv,
    sparse_block_gemv_inplace, sparse_block_outer, sparse_block_outer_inplace)

# __all__ must contain *names* (strings), not the objects themselves;
# a non-string entry makes `from theano.sandbox.blocksparse import *` fail.
__all__ = ['SparseBlockGemv', 'SparseBlockOuter', 'sparse_block_dot',
           'sparse_block_gemv', 'sparse_block_gemv_inplace',
           'sparse_block_outer', 'sparse_block_outer_inplace']

# Emitted once at import time; note the trailing space so the two string
# literals do not run together ("anymore,it").
warnings.warn("DEPRECATION: theano.sandbox.blocksparse does not exist anymore, "
              "it has been moved to theano.tensor.nnet.blocksparse.",
              category=DeprecationWarning)
......@@ -46,7 +46,7 @@ from theano.sandbox.cuda.blas import (
GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad,
GpuDownsampleFactorMaxGradGrad)
from theano.sandbox.blocksparse import SparseBlockGemv, SparseBlockOuter
from theano.tensor.nnet.blocksparse import SparseBlockGemv, SparseBlockOuter
from theano.sandbox.cuda.blocksparse import (
GpuSparseBlockGemv,
GpuSparseBlockOuter,
......
......@@ -4,7 +4,7 @@ from nose.plugins.skip import SkipTest
import theano
from theano import tensor
import theano.tests.unittest_tools as utt
import theano.sandbox.tests.test_blocksparse
import theano.tensor.nnet.tests.test_blocksparse
import theano.sandbox.cuda as cuda_ndarray
from theano.sandbox.cuda.blocksparse import (GpuSparseBlockOuter,
......@@ -22,7 +22,7 @@ else:
class BlockSparse_Gemv_and_Outer(
theano.sandbox.tests.test_blocksparse.BlockSparse_Gemv_and_Outer):
theano.tensor.nnet.tests.test_blocksparse.BlockSparse_Gemv_and_Outer):
def setUp(self):
utt.seed_rng()
self.mode = mode_with_gpu.excluding('constant_folding')
......
......@@ -29,7 +29,7 @@ from theano.sandbox.cuda import basic_ops
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.scalar.basic_scipy import erfinv
from theano.sandbox.blocksparse import sparse_block_dot
from theano.tensor.nnet.blocksparse import sparse_block_dot
from theano.sandbox.cuda.blocksparse import GpuSparseBlockGemv, GpuSparseBlockOuter
......
from .nnet import *
from .nnet import (
CrossentropyCategorical1Hot, CrossentropyCategorical1HotGrad,
CrossentropySoftmax1HotWithBiasDx, CrossentropySoftmaxArgmax1HotWithBias,
Prepend_scalar_constant_to_each_row, Prepend_scalar_to_each_row, Softmax,
SoftmaxGrad, SoftmaxWithBias, binary_crossentropy,
categorical_crossentropy, crossentropy_categorical_1hot,
crossentropy_categorical_1hot_grad, crossentropy_softmax_1hot,
crossentropy_softmax_1hot_with_bias,
crossentropy_softmax_1hot_with_bias_dx,
crossentropy_softmax_argmax_1hot_with_bias,
crossentropy_softmax_max_and_argmax_1hot,
crossentropy_softmax_max_and_argmax_1hot_with_bias,
crossentropy_to_crossentropy_with_softmax,
crossentropy_to_crossentropy_with_softmax_with_bias,
graph_merge_softmax_with_crossentropy_softmax, h_softmax,
local_advanced_indexing_crossentropy_onehot,
local_advanced_indexing_crossentropy_onehot_grad, local_argmax_pushdown,
local_log_softmax, local_softmax_grad_to_crossentropy_with_softmax_grad,
local_softmax_with_bias,
local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc,
make_out_pattern, prepend_0_to_each_row, prepend_1_to_each_row,
prepend_scalar_to_each_row, relu, softmax, softmax_grad, softmax_graph,
softmax_op, softmax_simplifier, softmax_with_bias)
from . import opt
from .conv import conv2d, ConvOp
from .Conv3D import *
from .ConvGrad3D import *
......
import numpy
import theano
from theano import Op, Apply
from theano.tensor import discrete_dtypes
from theano.gradient import grad_undefined
class SparseBlockGemv(Op):
    """
    This op computes the dot product of specified pieces of vectors
    and matrices, returning pieces of vectors::

        for b in range(batch_size):
            for j in range(o.shape[1]):
                for i in range(h.shape[1]):
                    o[b, j, :] += numpy.dot(h[b, i], W[iIdx[b, i], oIdx[b, j]])

    where b, h, W, o, iIdx, oIdx are defined in the docstring of make_node.

    .. image:: ../../../images/blocksparse.png
        :scale: 50 %

    """
    registered_opts = []

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            # Output 0 is computed by destroying (updating) input 0.
            self.destroy_map = {0: [0]}

    def make_node(self, o, W, h, inputIdx, outputIdx):
        """
        Compute the dot product of the specified pieces of vectors
        and matrices.

        The parameter types are actually their expected shapes
        relative to each other.

        Parameters
        ----------
        o : batch, oWin, oSize
            output vector
        W : iBlocks, oBlocks, iSize, oSize
            weight matrix
        h : batch, iWin, iSize
            input from lower layer (sparse)
        inputIdx : batch, iWin
            indexes of the input blocks
        outputIdx : batch, oWin
            indexes of the output blocks

        Returns
        -------
        (batch, oWin, oSize)
            dot(W[i, j], h[i]) + o[j]

        Notes
        -----
        - `batch` is the number of examples in a minibatch (batch size).
        - `iBlocks` is the total number of blocks in the input (from lower
          layer).
        - `iSize` is the size of each of these input blocks.
        - `iWin` is the number of blocks that will be used as inputs. Which
          blocks will be used is specified in `inputIdx`.
        - `oBlocks` is the number of possible output blocks.
        - `oSize` is the size of each of these output blocks.
        - `oWin` is the number of output blocks that will actually be
          computed.  Which blocks will be computed is specified in
          `outputIdx`.

        """
        o = theano.tensor.as_tensor_variable(o)
        W = theano.tensor.as_tensor_variable(W)
        h = theano.tensor.as_tensor_variable(h)
        inputIdx = theano.tensor.as_tensor_variable(inputIdx)
        outputIdx = theano.tensor.as_tensor_variable(outputIdx)

        # NOTE: the check below is ndim != 3, so the message must say 3D
        # (it previously, incorrectly, said 2D).
        if o.ndim != 3:
            raise TypeError('The output o must be a 3D tensor')
        if W.ndim != 4:
            raise TypeError('The weight matrix W must be a 4D tensor')
        if h.ndim != 3:
            raise TypeError('The input h must be a 3D tensor')
        if inputIdx.ndim != 2:
            raise TypeError('The input indices inputIdx must be a 2D tensor')
        if outputIdx.ndim != 2:
            raise TypeError('The output indices outputIdx must be a 2D tensor')

        assert inputIdx.type.dtype in discrete_dtypes
        assert outputIdx.type.dtype in discrete_dtypes

        # Same dtype as o, but with no broadcastable dimension.
        output = o.type.__class__(dtype=o.type.dtype,
                                  broadcastable=(False,) * o.ndim)()
        return Apply(self, [o, W, h, inputIdx, outputIdx], [output])

    def perform(self, node, inp, out_):
        o, W, h, iIdx, oIdx = inp[:5]

        if not self.inplace:
            # Work on a copy so the original input is preserved.
            o = o.copy()

        # Reference (slow) implementation of the block-sparse gemv.
        for b in range(o.shape[0]):
            for j in range(o.shape[1]):
                outputIdx = oIdx[b, j]
                for i in range(h.shape[1]):
                    inputIdx = iIdx[b, i]
                    w = W[inputIdx, outputIdx]
                    o[b, j, :] += numpy.dot(h[b, i], w)
        out_[0][0] = o

    def grad(self, inputs, grads):
        o, W, h, inputIdx, outputIdx = inputs
        go = grads[0]

        outer_fun = SparseBlockOuter(self.inplace)
        gemv_fun = SparseBlockGemv(self.inplace)

        # dW is an outer product of h and the output gradient; dh is a gemv
        # with the transposed blocks of W.
        Wgrad = outer_fun(W.zeros_like(), h, go, inputIdx, outputIdx)
        hgrad = gemv_fun(h.zeros_like(), W.dimshuffle((1, 0, 3, 2)),
                         go, outputIdx, inputIdx)
        return [go, Wgrad, hgrad,
                grad_undefined(self, 3, inputIdx,
                               "grad of inputIdx makes no sense"),
                grad_undefined(self, 4, outputIdx,
                               "grad of outputIdx makes no sense")]
class SparseBlockOuter(Op):
    """
    This computes the outer product of two sets of pieces of vectors
    updating a full matrix with the results::

        for b in range(batch_size):
            o[xIdx[b, i], yIdx[b, j]] += (alpha * outer(x[b, i], y[b, j]))

    This op is involved in the gradient of SparseBlockGemv.

    """
    registered_opts = []

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            # Output 0 is computed by destroying (updating) input 0.
            self.destroy_map = {0: [0]}

    def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
        """
        Compute the outer product of the specified pieces of vectors
        and matrices.

        The parameter types are actually their expected shapes
        relative to each other.

        Parameters
        ----------
        o : xBlocks, yBlocks, xSize, ySize
        x : batch, xWin, xSize
        y : batch, yWin, ySize
        xIdx : batch, xWin
            indexes of the x blocks
        yIdx : batch, yWin
            indexes of the y blocks
        alpha : scalar, optional
            scale factor applied to the outer products (defaults to 1.0)

        Returns
        -------
        (xBlocks, yBlocks, xSize, ySize)
            alpha * outer(x[i], y[j]) + o[i, j]

        Notes
        -----
        - `batch` is the number of examples in a minibatch (batch size).
        - `xBlocks` is the total number of blocks in x.
        - `xSize` is the size of each of these x blocks.
        - `xWin` is the number of blocks that will be used as x. Which blocks
          will be used is specified in `xIdx`.
        - `yBlocks` is the number of possible y blocks.
        - `ySize` is the size of each of these y blocks.
        - `yWin` is the number of y blocks that will actually be computed.
          Which blocks will be computed is specified in `yIdx`.

        """
        one = theano.tensor.constant(numpy.asarray(1.0, dtype='float32'))
        o = theano.tensor.as_tensor_variable(o)
        x = theano.tensor.as_tensor_variable(x)
        y = theano.tensor.as_tensor_variable(y)
        # Accept raw index arrays too, for consistency with SparseBlockGemv.
        xIdx = theano.tensor.as_tensor_variable(xIdx)
        yIdx = theano.tensor.as_tensor_variable(yIdx)

        if alpha is None:
            alpha = one

        output = o.type.__class__(dtype=o.type.dtype,
                                  broadcastable=(False,) * o.ndim)()

        return Apply(self, [o, x, y, xIdx, yIdx, alpha],
                     [output])

    def perform(self, node, inp, out_):
        o, x, y, xIdx, yIdx, alpha = inp[:6]

        if not self.inplace:
            o = o.copy()

        for b in range(x.shape[0]):
            for i in range(xIdx.shape[1]):
                for j in range(yIdx.shape[1]):
                    # Scale by alpha as documented in the class docstring
                    # (alpha was previously accepted but silently ignored;
                    # the default alpha of 1.0 keeps old behavior unchanged).
                    o[xIdx[b, i], yIdx[b, j]] += alpha * numpy.outer(
                        x[b, i], y[b, j, :])
        out_[0][0] = o
# Pre-built op instances.  The non-inplace versions are safe to use directly;
# the inplace variants destroy their first input (see destroy_map) and are
# normally introduced only by graph optimizations.
sparse_block_gemv = SparseBlockGemv(False)
sparse_block_gemv_inplace = SparseBlockGemv(True)
sparse_block_outer = SparseBlockOuter(False)
sparse_block_outer_inplace = SparseBlockOuter(True)
def sparse_block_dot(W, h, inputIdx, b, outputIdx):
    """
    Compute the dot product (plus bias) of the specified pieces of vectors
    and matrices. See SparseBlockGemv to get more information.

    The parameter types are actually their expected shapes relative to
    each other.

    Parameters
    ----------
    W : iBlocks, oBlocks, iSize, oSize
        weight matrix
    h : batch, iWin, iSize
        input from lower layer (sparse)
    inputIdx : batch, iWin
        indexes of the input blocks
    b : oBlocks, oSize
        bias vector
    outputIdx : batch, oWin
        indexes of the output blocks

    Returns
    -------
    (batch, oWin, oSize)
        dot(W[i, j], h[i]) + b[j] but b[j] is only added once

    Notes
    -----
    - `batch` is the number of examples in a minibatch (batch size).
    - `iBlocks` is the total number of blocks in the input (from lower layer).
    - `iSize` is the size of each of these input blocks.
    - `iWin` is the number of blocks that will be used as inputs. Which blocks
      will be used is specified in `inputIdx`.
    - `oBlocks` is the number of possible output blocks.
    - `oSize` is the size of each of these output blocks.
    - `oWin` is the number of output blocks that will actually be computed.
      Which blocks will be computed is specified in `outputIdx`.

    """
    assert inputIdx.ndim == h.ndim - 1
    assert outputIdx.ndim == inputIdx.ndim

    # Accept single-example (2D) inputs by adding a broadcastable batch axis.
    if h.ndim == 2:
        h = h.dimshuffle('x', 0, 1)
        inputIdx = inputIdx.dimshuffle('x', 0)
        outputIdx = outputIdx.dimshuffle('x', 0)

    # Reuse the module-level non-inplace instance instead of building a new
    # op object on every call.
    return sparse_block_gemv(b.take(outputIdx, axis=0), W, h,
                             inputIdx, outputIdx)
......@@ -29,6 +29,7 @@ from theano.gof import Apply
from theano.tensor.nnet.sigm import sigmoid, softplus
from theano.gradient import DisconnectedType
from theano.gradient import grad_not_implemented
from theano.tensor.nnet.blocksparse import sparse_block_dot
from theano.tensor.type import values_eq_approx_remove_nan
......@@ -2041,3 +2042,125 @@ def relu(x, alpha=0):
f1 = 0.5 * (1 + alpha)
f2 = 0.5 * (1 - alpha)
return f1 * x + f2 * abs(x)
def h_softmax(x, batch_size, n_outputs, n_classes, n_outputs_per_class,
              W1, b1, W2, b2, target=None):
    """ Two-level hierarchical softmax.

    The architecture is composed of two softmax layers: the first predicts the
    class of the input x while the second predicts the output of the input x in
    the predicted class.

    More explanations can be found in the original paper [1]_.

    If target is specified, it will only compute the outputs of the
    corresponding targets. Otherwise, if target is None, it will compute all
    the outputs.

    The outputs are grouped in the same order as they are initially defined.

    .. versionadded:: 0.7.1

    Parameters
    ----------
    x: tensor of shape (batch_size, number of features)
        the minibatch input of the two-layer hierarchical softmax.
    batch_size: int
        the size of the minibatch input x.
    n_outputs: int
        the number of outputs.
    n_classes: int
        the number of classes of the two-layer hierarchical softmax. It
        corresponds to the number of outputs of the first softmax. See note at
        the end.
    n_outputs_per_class: int
        the number of outputs per class. See note at the end.
    W1: tensor of shape (number of features of the input x, n_classes)
        the weight matrix of the first softmax, which maps the input x to the
        probabilities of the classes.
    b1: tensor of shape (n_classes,)
        the bias vector of the first softmax layer.
    W2: tensor of shape (n_classes, number of features of the input x,
        n_outputs_per_class)
        the weight matrix of the second softmax, which maps the input x to
        the probabilities of the outputs.
    b2: tensor of shape (n_classes, n_outputs_per_class)
        the bias vector of the second softmax layer.
    target: tensor of shape either (batch_size,) or (batch_size, 1)
        (optional, default None)
        contains the indices of the targets for the minibatch
        input x. For each input, the function computes the output for its
        corresponding target. If target is None, then all the outputs are
        computed for each input.

    Returns
    -------
    output_probs: tensor of shape (batch_size, n_outputs) or (batch_size,)
        Output of the two-layer hierarchical softmax for input x. If target is
        not specified (None), then all the outputs are computed and the
        returned tensor has shape (batch_size, n_outputs). Otherwise, when
        target is specified, only the corresponding outputs are computed and
        the returned tensor has thus shape (batch_size,), one probability per
        example.

    Notes
    -----
    The product of n_outputs_per_class and n_classes has to be greater or
    equal to n_outputs. If it is strictly greater, then the irrelevant outputs
    will be ignored.

    n_outputs_per_class and n_classes have to be the same as the corresponding
    dimensions of the tensors of W1, b1, W2 and b2.

    The most computationally efficient configuration is when
    n_outputs_per_class and n_classes are equal to the square root of
    n_outputs.

    References
    ----------
    .. [1] J. Goodman, "Classes for Fast Maximum Entropy Training,"
        ICASSP, 2001, http://arxiv.org/abs/cs/0108006

    """
    # First softmax that computes the probabilities of belonging to each class
    class_probs = theano.tensor.nnet.softmax(tensor.dot(x, W1) + b1)

    if target is None:  # Computes the probabilities of all the outputs
        # Every example uses every class: tile [0..n_classes) per row.
        class_ids = tensor.tile(
            tensor.arange(n_classes, dtype="int32")[None, :], (batch_size, 1))

        # Second softmax that computes the output probabilities.
        # W2/x get a singleton block axis; the zero input indices select that
        # single input block, class_ids select the output blocks.
        activations = sparse_block_dot(
            W2[None, :, :, :], x[:, None, :],
            tensor.zeros((batch_size, 1), dtype='int32'), b2, class_ids)

        output_probs = theano.tensor.nnet.softmax(
            activations.reshape((-1, n_outputs_per_class)))
        output_probs = output_probs.reshape((batch_size, n_classes, -1))
        # Joint probability = P(class) * P(output | class).
        output_probs = class_probs[:, :, None] * output_probs
        output_probs = output_probs.reshape((batch_size, -1))
        # output_probs.shape[1] is n_classes * n_outputs_per_class, which might
        # be greater than n_outputs, so we ignore the potential irrelevant
        # outputs with the next line:
        output_probs = output_probs[:, :n_outputs]

    else:  # Computes the probabilities of the outputs specified by the targets
        # Accept (batch_size,) or (batch_size, 1) targets.
        target = target.flatten()

        # Classes to which belong each target
        target_classes = target // n_outputs_per_class

        # Outputs to which belong each target inside a class
        target_outputs_in_class = target % n_outputs_per_class

        # Second softmax that computes the output probabilities, restricted
        # to each example's target class.
        activations = sparse_block_dot(
            W2[None, :, :, :], x[:, None, :],
            tensor.zeros((batch_size, 1), dtype='int32'), b2,
            target_classes[:, None])

        output_probs = theano.tensor.nnet.softmax(activations[:, 0, :])
        target_class_probs = class_probs[tensor.arange(batch_size),
                                         target_classes]
        # Fancy indexing with two 1-D index vectors yields a 1-D result of
        # shape (batch_size,).
        output_probs = output_probs[tensor.arange(batch_size),
                                    target_outputs_in_class]
        output_probs = target_class_probs * output_probs

    return output_probs
"""
Optimizations addressing the ops in sandbox root directory
Optimizations addressing the ops in nnet root directory
"""
from theano import compile # to register the optimizer built by this file
from theano import gof
from theano.sandbox.blocksparse import (
from theano.tensor.nnet.blocksparse import (
SparseBlockGemv,
SparseBlockOuter,
sparse_block_gemv_inplace,
......
......@@ -10,7 +10,7 @@ import theano
from theano import tensor
import theano.tests.unittest_tools as utt
from theano.sandbox.blocksparse import sparse_block_dot, \
from theano.tensor.nnet.blocksparse import sparse_block_dot, \
sparse_block_gemv, sparse_block_outer
......
......@@ -28,7 +28,8 @@ from theano.tensor.nnet import (categorical_crossentropy,
softmax_with_bias, SoftmaxGrad,
Prepend_scalar_constant_to_each_row,
Prepend_scalar_to_each_row,
relu)
relu,
h_softmax)
from theano.tensor import matrix, vector, lvector, scalar
......@@ -1437,5 +1438,70 @@ def test_relu():
assert numpy.allclose(y, numpy.where(X > 0, X, A * X), rtol=3e-5)
if __name__ == '__main__':
unittest.main()
def test_h_softmax():
    """
    Tests the output dimensions of the h_softmax when a target is provided or
    not.
    """
    # ---- Configuration of the toy model ----
    n_features = 4
    n_examples = 2
    n_level1 = 5  # outputs of the first (class) softmax
    n_level2 = 3  # outputs per class in the second softmax
    n_outputs = n_level1 * n_level2

    floatX = theano.config.floatX

    # ---- Shared parameters ----
    # First level of h_softmax.
    level1_w = theano.shared(numpy.asarray(
        numpy.random.normal(size=(n_features, n_level1)), dtype=floatX))
    level1_b = theano.shared(numpy.zeros((n_level1,), dtype=floatX))

    # Second level of h_softmax.
    level2_w = theano.shared(numpy.asarray(
        numpy.random.normal(size=(n_level1, n_features, n_level2)),
        dtype=floatX))
    level2_b = theano.shared(numpy.zeros((n_level1, n_level2), dtype=floatX))

    # ---- Symbolic graph ----
    x = tensor.matrix('x')
    y = tensor.ivector('y')

    # Restricted output: only the probability of each example's target.
    y_hat_tg = h_softmax(x, n_examples, n_outputs, n_level1, n_level2,
                         level1_w, level1_b, level2_w, level2_b, y)

    # Full output: probabilities of every output.
    y_hat_all = h_softmax(x, n_examples, n_outputs, n_level1, n_level2,
                          level1_w, level1_b, level2_w, level2_b)

    # ---- Compiled functions ----
    fun_output_tg = theano.function([x, y], y_hat_tg)
    fun_output = theano.function([x], y_hat_all)

    # ---- Shape checks ----
    x_mat = numpy.random.normal(
        size=(n_examples, n_features)).astype(floatX)
    y_mat = numpy.random.randint(0, n_outputs, n_examples).astype('int32')

    assert fun_output_tg(x_mat, y_mat).shape == (n_examples,)
    assert fun_output(x_mat).shape == (n_examples, n_outputs)
import theano
from theano import tensor
from theano.sandbox.blocksparse import sparse_block_dot
from theano.tensor.nnet.blocksparse import sparse_block_dot
def test_blocksparse_inplace_gemv_opt():
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论