提交 a85a44fc authored 作者: Xavier Bouthillier's avatar Xavier Bouthillier

Merge pull request #3361 from adbrebs/h_softmax

Two-layer hierarchical softmax
.. _libdoc_blocksparse:
===========================================================================
:mod:`sandbox.blocksparse` -- Block sparse dot operations (gemv and outer)
===========================================================================
===============================================================================
:mod:`blocksparse` -- Block sparse dot operations (gemv and outer)
===============================================================================
.. module:: sandbox.blocksparse
.. module:: tensor.nnet.blocksparse
:platform: Unix, Windows
:synopsis: Block sparse dot
.. moduleauthor:: LISA
API
===
.. automodule:: theano.sandbox.blocksparse
.. automodule:: theano.tensor.nnet.blocksparse
:members:
......@@ -20,3 +20,4 @@ and ops which are particular to neural networks and deep learning.
nnet
neighbours
bn
blocksparse
......@@ -21,6 +21,7 @@
- :func:`relu() <theano.tensor.nnet.relu>`
- :func:`binary_crossentropy`
- :func:`.categorical_crossentropy`
- :func:`h_softmax() <theano.tensor.nnet.h_softmax>`
.. function:: sigmoid(x)
......@@ -204,3 +205,6 @@
y = T.nnet.softmax(T.dot(W, x) + b)
cost = T.nnet.categorical_crossentropy(y, o)
# o is either the above-mentioned 1-of-N vector or 2D tensor
.. autofunction:: theano.tensor.nnet.h_softmax
import numpy
import theano
from theano import Op, Apply
from theano import tensor
from theano.tensor import discrete_dtypes
from theano.gradient import grad_undefined
class SparseBlockGemv(Op):
    """
    This op computes the dot product of specified pieces of vectors
    and matrices, returning pieces of vectors::

        for b in range(batch_size):
            for j in range(o.shape[1]):
                for i in range(h.shape[1]):
                    o[b, j, :] += numpy.dot(h[b, i], W[iIdx[b, i], oIdx[b, j]])

    where b, h, W, o, iIdx, oIdx are defined in the docstring of make_node.

    .. image:: ../../images/blocksparse.png
        :scale: 50 %

    """
    registered_opts = []

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            # Output 0 is computed by destroying (updating) input 0.
            self.destroy_map = {0: [0]}

    def make_node(self, o, W, h, inputIdx, outputIdx):
        """
        Compute the dot product of the specified pieces of vectors
        and matrices.

        The parameter types are actually their expected shapes
        relative to each other.

        Parameters
        ----------
        o : batch, oWin, oSize
            output vector
        W : iBlocks, oBlocks, iSize, oSize
            weight matrix
        h : batch, iWin, iSize
            input from lower layer (sparse)
        inputIdx : batch, iWin
            indexes of the input blocks
        outputIdx : batch, oWin
            indexes of the output blocks

        Returns
        -------
        (batch, oWin, oSize)
            dot(W[i, j], h[i]) + o[j]

        Notes
        -----
        - `batch` is the number of examples in a minibatch (batch size).
        - `iBlocks` is the total number of blocks in the input (from lower
          layer).
        - `iSize` is the size of each of these input blocks.
        - `iWin` is the number of blocks that will be used as inputs. Which
          blocks will be used is specified in `inputIdx`.
        - `oBlocks` is the number of possible output blocks.
        - `oSize` is the size of each of these output blocks.
        - `oWin` is the number of output blocks that will actually be
          computed.  Which blocks will be computed is specified in
          `outputIdx`.

        """
        o = theano.tensor.as_tensor_variable(o)
        W = theano.tensor.as_tensor_variable(W)
        h = theano.tensor.as_tensor_variable(h)
        inputIdx = theano.tensor.as_tensor_variable(inputIdx)
        outputIdx = theano.tensor.as_tensor_variable(outputIdx)

        # NOTE: the check below is ndim != 3, so the message must say 3D
        # (it previously, incorrectly, said 2D).
        if o.ndim != 3:
            raise TypeError('The output o must be a 3D tensor')
        if W.ndim != 4:
            raise TypeError('The weight matrix W must be a 4D tensor')
        if h.ndim != 3:
            raise TypeError('The input h must be a 3D tensor')
        if inputIdx.ndim != 2:
            raise TypeError('The input indices inputIdx must be a 2D tensor')
        if outputIdx.ndim != 2:
            raise TypeError('The output indices outputIdx must be a 2D tensor')

        assert inputIdx.type.dtype in discrete_dtypes
        assert outputIdx.type.dtype in discrete_dtypes

        # Same dtype as o, but with no broadcastable dimension.
        output = o.type.__class__(dtype=o.type.dtype,
                                  broadcastable=(False,) * o.ndim)()
        return Apply(self, [o, W, h, inputIdx, outputIdx], [output])

    def perform(self, node, inp, out_):
        o, W, h, iIdx, oIdx = inp[:5]

        if not self.inplace:
            # Work on a copy so the original input is preserved.
            o = o.copy()

        # Reference (slow) implementation of the block-sparse gemv.
        for b in range(o.shape[0]):
            for j in range(o.shape[1]):
                outputIdx = oIdx[b, j]
                for i in range(h.shape[1]):
                    inputIdx = iIdx[b, i]
                    w = W[inputIdx, outputIdx]
                    o[b, j, :] += numpy.dot(h[b, i], w)
        out_[0][0] = o

    def grad(self, inputs, grads):
        o, W, h, inputIdx, outputIdx = inputs
        go = grads[0]

        outer_fun = SparseBlockOuter(self.inplace)
        gemv_fun = SparseBlockGemv(self.inplace)

        # dW is an outer product of h and the output gradient; dh is a gemv
        # with the transposed blocks of W.
        Wgrad = outer_fun(W.zeros_like(), h, go, inputIdx, outputIdx)
        hgrad = gemv_fun(h.zeros_like(), W.dimshuffle((1, 0, 3, 2)),
                         go, outputIdx, inputIdx)
        return [go, Wgrad, hgrad,
                grad_undefined(self, 3, inputIdx,
                               "grad of inputIdx makes no sense"),
                grad_undefined(self, 4, outputIdx,
                               "grad of outputIdx makes no sense")]
class SparseBlockOuter(Op):
    """
    This computes the outer product of two sets of pieces of vectors
    updating a full matrix with the results::

        for b in range(batch_size):
            o[xIdx[b, i], yIdx[b, j]] += (alpha * outer(x[b, i], y[b, j]))

    This op is involved in the gradient of SparseBlockGemv.

    """
    registered_opts = []

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            # Output 0 is computed by destroying (updating) input 0.
            self.destroy_map = {0: [0]}

    def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
        """
        Compute the outer product of the specified pieces of vectors
        and matrices.

        The parameter types are actually their expected shapes
        relative to each other.

        Parameters
        ----------
        o : xBlocks, yBlocks, xSize, ySize
        x : batch, xWin, xSize
        y : batch, yWin, ySize
        xIdx : batch, xWin
            indexes of the x blocks
        yIdx : batch, yWin
            indexes of the y blocks
        alpha : scalar, optional
            scale factor applied to the outer products (defaults to 1.0)

        Returns
        -------
        (xBlocks, yBlocks, xSize, ySize)
            alpha * outer(x[i], y[j]) + o[i, j]

        Notes
        -----
        - `batch` is the number of examples in a minibatch (batch size).
        - `xBlocks` is the total number of blocks in x.
        - `xSize` is the size of each of these x blocks.
        - `xWin` is the number of blocks that will be used as x. Which blocks
          will be used is specified in `xIdx`.
        - `yBlocks` is the number of possible y blocks.
        - `ySize` is the size of each of these y blocks.
        - `yWin` is the number of y blocks that will actually be computed.
          Which blocks will be computed is specified in `yIdx`.

        """
        one = tensor.constant(numpy.asarray(1.0, dtype='float32'))
        o = theano.tensor.as_tensor_variable(o)
        x = theano.tensor.as_tensor_variable(x)
        y = theano.tensor.as_tensor_variable(y)
        # Accept raw index arrays too, for consistency with SparseBlockGemv.
        xIdx = theano.tensor.as_tensor_variable(xIdx)
        yIdx = theano.tensor.as_tensor_variable(yIdx)

        if alpha is None:
            alpha = one

        output = o.type.__class__(dtype=o.type.dtype,
                                  broadcastable=(False,) * o.ndim)()

        return Apply(self, [o, x, y, xIdx, yIdx, alpha],
                     [output])

    def perform(self, node, inp, out_):
        o, x, y, xIdx, yIdx, alpha = inp[:6]

        if not self.inplace:
            o = o.copy()

        for b in range(x.shape[0]):
            for i in range(xIdx.shape[1]):
                for j in range(yIdx.shape[1]):
                    # Scale by alpha as documented in the class docstring
                    # (alpha was previously accepted but silently ignored;
                    # the default alpha of 1.0 keeps old behavior unchanged).
                    o[xIdx[b, i], yIdx[b, j]] += alpha * numpy.outer(
                        x[b, i], y[b, j, :])
        out_[0][0] = o
# Pre-built op instances.  The non-inplace versions are safe to use directly;
# the inplace variants destroy their first input (see destroy_map) and are
# normally introduced only by graph optimizations.
sparse_block_gemv = SparseBlockGemv(False)
sparse_block_gemv_inplace = SparseBlockGemv(True)
sparse_block_outer = SparseBlockOuter(False)
sparse_block_outer_inplace = SparseBlockOuter(True)
def sparse_block_dot(W, h, inputIdx, b, outputIdx):
    """
    Compute the dot product (plus bias) of the specified pieces of vectors
    and matrices. See SparseBlockGemv to get more information.

    The parameter types are actually their expected shapes relative to
    each other.

    Parameters
    ----------
    W : iBlocks, oBlocks, iSize, oSize
        weight matrix
    h : batch, iWin, iSize
        input from lower layer (sparse)
    inputIdx : batch, iWin
        indexes of the input blocks
    b : oBlocks, oSize
        bias vector
    outputIdx : batch, oWin
        indexes of the output blocks

    Returns
    -------
    (batch, oWin, oSize)
        dot(W[i, j], h[i]) + b[j] but b[j] is only added once

    Notes
    -----
    - `batch` is the number of examples in a minibatch (batch size).
    - `iBlocks` is the total number of blocks in the input (from lower layer).
    - `iSize` is the size of each of these input blocks.
    - `iWin` is the number of blocks that will be used as inputs. Which blocks
      will be used is specified in `inputIdx`.
    - `oBlocks` is the number of possible output blocks.
    - `oSize` is the size of each of these output blocks.
    - `oWin` is the number of output blocks that will actually be computed.
      Which blocks will be computed is specified in `outputIdx`.

    """
    assert inputIdx.ndim == h.ndim - 1
    assert outputIdx.ndim == inputIdx.ndim

    # Accept single-example (2D) inputs by adding a broadcastable batch axis.
    if h.ndim == 2:
        h = h.dimshuffle('x', 0, 1)
        inputIdx = inputIdx.dimshuffle('x', 0)
        outputIdx = outputIdx.dimshuffle('x', 0)

    # Reuse the module-level non-inplace instance instead of building a new
    # op object on every call.
    return sparse_block_gemv(b.take(outputIdx, axis=0), W, h,
                             inputIdx, outputIdx)
import warnings

from theano.tensor.nnet.blocksparse import (
    SparseBlockGemv, SparseBlockOuter, sparse_block_dot, sparse_block_gemv,
    sparse_block_gemv_inplace, sparse_block_outer, sparse_block_outer_inplace)

# __all__ must contain *names* (strings), not the objects themselves;
# a non-string entry makes `from theano.sandbox.blocksparse import *` fail.
__all__ = ['SparseBlockGemv', 'SparseBlockOuter', 'sparse_block_dot',
           'sparse_block_gemv', 'sparse_block_gemv_inplace',
           'sparse_block_outer', 'sparse_block_outer_inplace']

# Emitted once at import time; note the trailing space so the two string
# literals do not run together ("anymore,it").
warnings.warn("DEPRECATION: theano.sandbox.blocksparse does not exist anymore, "
              "it has been moved to theano.tensor.nnet.blocksparse.",
              category=DeprecationWarning)
......@@ -46,7 +46,7 @@ from theano.sandbox.cuda.blas import (
GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad,
GpuDownsampleFactorMaxGradGrad)
from theano.sandbox.blocksparse import SparseBlockGemv, SparseBlockOuter
from theano.tensor.nnet.blocksparse import SparseBlockGemv, SparseBlockOuter
from theano.sandbox.cuda.blocksparse import (
GpuSparseBlockGemv,
GpuSparseBlockOuter,
......
......@@ -4,7 +4,7 @@ from nose.plugins.skip import SkipTest
import theano
from theano import tensor
import theano.tests.unittest_tools as utt
import theano.sandbox.tests.test_blocksparse
import theano.tensor.nnet.tests.test_blocksparse
import theano.sandbox.cuda as cuda_ndarray
from theano.sandbox.cuda.blocksparse import (GpuSparseBlockOuter,
......@@ -22,7 +22,7 @@ else:
class BlockSparse_Gemv_and_Outer(
theano.sandbox.tests.test_blocksparse.BlockSparse_Gemv_and_Outer):
theano.tensor.nnet.tests.test_blocksparse.BlockSparse_Gemv_and_Outer):
def setUp(self):
utt.seed_rng()
self.mode = mode_with_gpu.excluding('constant_folding')
......
......@@ -29,7 +29,7 @@ from theano.sandbox.cuda import basic_ops
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.scalar.basic_scipy import erfinv
from theano.sandbox.blocksparse import sparse_block_dot
from theano.tensor.nnet.blocksparse import sparse_block_dot
from theano.sandbox.cuda.blocksparse import GpuSparseBlockGemv, GpuSparseBlockOuter
......
from .nnet import *
from .nnet import (
CrossentropyCategorical1Hot, CrossentropyCategorical1HotGrad,
CrossentropySoftmax1HotWithBiasDx, CrossentropySoftmaxArgmax1HotWithBias,
Prepend_scalar_constant_to_each_row, Prepend_scalar_to_each_row, Softmax,
SoftmaxGrad, SoftmaxWithBias, binary_crossentropy,
categorical_crossentropy, crossentropy_categorical_1hot,
crossentropy_categorical_1hot_grad, crossentropy_softmax_1hot,
crossentropy_softmax_1hot_with_bias,
crossentropy_softmax_1hot_with_bias_dx,
crossentropy_softmax_argmax_1hot_with_bias,
crossentropy_softmax_max_and_argmax_1hot,
crossentropy_softmax_max_and_argmax_1hot_with_bias,
crossentropy_to_crossentropy_with_softmax,
crossentropy_to_crossentropy_with_softmax_with_bias,
graph_merge_softmax_with_crossentropy_softmax, h_softmax,
local_advanced_indexing_crossentropy_onehot,
local_advanced_indexing_crossentropy_onehot_grad, local_argmax_pushdown,
local_log_softmax, local_softmax_grad_to_crossentropy_with_softmax_grad,
local_softmax_with_bias,
local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc,
make_out_pattern, prepend_0_to_each_row, prepend_1_to_each_row,
prepend_scalar_to_each_row, relu, softmax, softmax_grad, softmax_graph,
softmax_op, softmax_simplifier, softmax_with_bias)
from . import opt
from .conv import conv2d, ConvOp
from .Conv3D import *
from .ConvGrad3D import *
......
import numpy
import theano
from theano import Op, Apply
from theano.tensor import discrete_dtypes
from theano.gradient import grad_undefined
class SparseBlockGemv(Op):
    """
    This op computes the dot product of specified pieces of vectors
    and matrices, returning pieces of vectors::

        for b in range(batch_size):
            for j in range(o.shape[1]):
                for i in range(h.shape[1]):
                    o[b, j, :] += numpy.dot(h[b, i], W[iIdx[b, i], oIdx[b, j]])

    where b, h, W, o, iIdx, oIdx are defined in the docstring of make_node.

    .. image:: ../../../images/blocksparse.png
        :scale: 50 %

    """
    registered_opts = []

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            # Output 0 is computed by destroying (updating) input 0.
            self.destroy_map = {0: [0]}

    def make_node(self, o, W, h, inputIdx, outputIdx):
        """
        Compute the dot product of the specified pieces of vectors
        and matrices.

        The parameter types are actually their expected shapes
        relative to each other.

        Parameters
        ----------
        o : batch, oWin, oSize
            output vector
        W : iBlocks, oBlocks, iSize, oSize
            weight matrix
        h : batch, iWin, iSize
            input from lower layer (sparse)
        inputIdx : batch, iWin
            indexes of the input blocks
        outputIdx : batch, oWin
            indexes of the output blocks

        Returns
        -------
        (batch, oWin, oSize)
            dot(W[i, j], h[i]) + o[j]

        Notes
        -----
        - `batch` is the number of examples in a minibatch (batch size).
        - `iBlocks` is the total number of blocks in the input (from lower
          layer).
        - `iSize` is the size of each of these input blocks.
        - `iWin` is the number of blocks that will be used as inputs. Which
          blocks will be used is specified in `inputIdx`.
        - `oBlocks` is the number of possible output blocks.
        - `oSize` is the size of each of these output blocks.
        - `oWin` is the number of output blocks that will actually be
          computed.  Which blocks will be computed is specified in
          `outputIdx`.

        """
        o = theano.tensor.as_tensor_variable(o)
        W = theano.tensor.as_tensor_variable(W)
        h = theano.tensor.as_tensor_variable(h)
        inputIdx = theano.tensor.as_tensor_variable(inputIdx)
        outputIdx = theano.tensor.as_tensor_variable(outputIdx)

        # NOTE: the check below is ndim != 3, so the message must say 3D
        # (it previously, incorrectly, said 2D).
        if o.ndim != 3:
            raise TypeError('The output o must be a 3D tensor')
        if W.ndim != 4:
            raise TypeError('The weight matrix W must be a 4D tensor')
        if h.ndim != 3:
            raise TypeError('The input h must be a 3D tensor')
        if inputIdx.ndim != 2:
            raise TypeError('The input indices inputIdx must be a 2D tensor')
        if outputIdx.ndim != 2:
            raise TypeError('The output indices outputIdx must be a 2D tensor')

        assert inputIdx.type.dtype in discrete_dtypes
        assert outputIdx.type.dtype in discrete_dtypes

        # Same dtype as o, but with no broadcastable dimension.
        output = o.type.__class__(dtype=o.type.dtype,
                                  broadcastable=(False,) * o.ndim)()
        return Apply(self, [o, W, h, inputIdx, outputIdx], [output])

    def perform(self, node, inp, out_):
        o, W, h, iIdx, oIdx = inp[:5]

        if not self.inplace:
            # Work on a copy so the original input is preserved.
            o = o.copy()

        # Reference (slow) implementation of the block-sparse gemv.
        for b in range(o.shape[0]):
            for j in range(o.shape[1]):
                outputIdx = oIdx[b, j]
                for i in range(h.shape[1]):
                    inputIdx = iIdx[b, i]
                    w = W[inputIdx, outputIdx]
                    o[b, j, :] += numpy.dot(h[b, i], w)
        out_[0][0] = o

    def grad(self, inputs, grads):
        o, W, h, inputIdx, outputIdx = inputs
        go = grads[0]

        outer_fun = SparseBlockOuter(self.inplace)
        gemv_fun = SparseBlockGemv(self.inplace)

        # dW is an outer product of h and the output gradient; dh is a gemv
        # with the transposed blocks of W.
        Wgrad = outer_fun(W.zeros_like(), h, go, inputIdx, outputIdx)
        hgrad = gemv_fun(h.zeros_like(), W.dimshuffle((1, 0, 3, 2)),
                         go, outputIdx, inputIdx)
        return [go, Wgrad, hgrad,
                grad_undefined(self, 3, inputIdx,
                               "grad of inputIdx makes no sense"),
                grad_undefined(self, 4, outputIdx,
                               "grad of outputIdx makes no sense")]
class SparseBlockOuter(Op):
    """
    This computes the outer product of two sets of pieces of vectors
    updating a full matrix with the results::

        for b in range(batch_size):
            o[xIdx[b, i], yIdx[b, j]] += (alpha * outer(x[b, i], y[b, j]))

    This op is involved in the gradient of SparseBlockGemv.

    """
    registered_opts = []

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            # Output 0 is computed by destroying (updating) input 0.
            self.destroy_map = {0: [0]}

    def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
        """
        Compute the outer product of the specified pieces of vectors
        and matrices.

        The parameter types are actually their expected shapes
        relative to each other.

        Parameters
        ----------
        o : xBlocks, yBlocks, xSize, ySize
        x : batch, xWin, xSize
        y : batch, yWin, ySize
        xIdx : batch, xWin
            indexes of the x blocks
        yIdx : batch, yWin
            indexes of the y blocks
        alpha : scalar, optional
            scale factor applied to the outer products (defaults to 1.0)

        Returns
        -------
        (xBlocks, yBlocks, xSize, ySize)
            alpha * outer(x[i], y[j]) + o[i, j]

        Notes
        -----
        - `batch` is the number of examples in a minibatch (batch size).
        - `xBlocks` is the total number of blocks in x.
        - `xSize` is the size of each of these x blocks.
        - `xWin` is the number of blocks that will be used as x. Which blocks
          will be used is specified in `xIdx`.
        - `yBlocks` is the number of possible y blocks.
        - `ySize` is the size of each of these y blocks.
        - `yWin` is the number of y blocks that will actually be computed.
          Which blocks will be computed is specified in `yIdx`.

        """
        one = theano.tensor.constant(numpy.asarray(1.0, dtype='float32'))
        o = theano.tensor.as_tensor_variable(o)
        x = theano.tensor.as_tensor_variable(x)
        y = theano.tensor.as_tensor_variable(y)
        # Accept raw index arrays too, for consistency with SparseBlockGemv.
        xIdx = theano.tensor.as_tensor_variable(xIdx)
        yIdx = theano.tensor.as_tensor_variable(yIdx)

        if alpha is None:
            alpha = one

        output = o.type.__class__(dtype=o.type.dtype,
                                  broadcastable=(False,) * o.ndim)()

        return Apply(self, [o, x, y, xIdx, yIdx, alpha],
                     [output])

    def perform(self, node, inp, out_):
        o, x, y, xIdx, yIdx, alpha = inp[:6]

        if not self.inplace:
            o = o.copy()

        for b in range(x.shape[0]):
            for i in range(xIdx.shape[1]):
                for j in range(yIdx.shape[1]):
                    # Scale by alpha as documented in the class docstring
                    # (alpha was previously accepted but silently ignored;
                    # the default alpha of 1.0 keeps old behavior unchanged).
                    o[xIdx[b, i], yIdx[b, j]] += alpha * numpy.outer(
                        x[b, i], y[b, j, :])
        out_[0][0] = o
# Pre-built op instances.  The non-inplace versions are safe to use directly;
# the inplace variants destroy their first input (see destroy_map) and are
# normally introduced only by graph optimizations.
sparse_block_gemv = SparseBlockGemv(False)
sparse_block_gemv_inplace = SparseBlockGemv(True)
sparse_block_outer = SparseBlockOuter(False)
sparse_block_outer_inplace = SparseBlockOuter(True)
def sparse_block_dot(W, h, inputIdx, b, outputIdx):
    """
    Compute the dot product (plus bias) of the specified pieces of vectors
    and matrices. See SparseBlockGemv to get more information.

    The parameter types are actually their expected shapes relative to
    each other.

    Parameters
    ----------
    W : iBlocks, oBlocks, iSize, oSize
        weight matrix
    h : batch, iWin, iSize
        input from lower layer (sparse)
    inputIdx : batch, iWin
        indexes of the input blocks
    b : oBlocks, oSize
        bias vector
    outputIdx : batch, oWin
        indexes of the output blocks

    Returns
    -------
    (batch, oWin, oSize)
        dot(W[i, j], h[i]) + b[j] but b[j] is only added once

    Notes
    -----
    - `batch` is the number of examples in a minibatch (batch size).
    - `iBlocks` is the total number of blocks in the input (from lower layer).
    - `iSize` is the size of each of these input blocks.
    - `iWin` is the number of blocks that will be used as inputs. Which blocks
      will be used is specified in `inputIdx`.
    - `oBlocks` is the number of possible output blocks.
    - `oSize` is the size of each of these output blocks.
    - `oWin` is the number of output blocks that will actually be computed.
      Which blocks will be computed is specified in `outputIdx`.

    """
    assert inputIdx.ndim == h.ndim - 1
    assert outputIdx.ndim == inputIdx.ndim

    # Accept single-example (2D) inputs by adding a broadcastable batch axis.
    if h.ndim == 2:
        h = h.dimshuffle('x', 0, 1)
        inputIdx = inputIdx.dimshuffle('x', 0)
        outputIdx = outputIdx.dimshuffle('x', 0)

    # Reuse the module-level non-inplace instance instead of building a new
    # op object on every call.
    return sparse_block_gemv(b.take(outputIdx, axis=0), W, h,
                             inputIdx, outputIdx)
......@@ -29,6 +29,7 @@ from theano.gof import Apply
from theano.tensor.nnet.sigm import sigmoid, softplus
from theano.gradient import DisconnectedType
from theano.gradient import grad_not_implemented
from theano.tensor.nnet.blocksparse import sparse_block_dot
from theano.tensor.type import values_eq_approx_remove_nan
......@@ -2041,3 +2042,125 @@ def relu(x, alpha=0):
f1 = 0.5 * (1 + alpha)
f2 = 0.5 * (1 - alpha)
return f1 * x + f2 * abs(x)
def h_softmax(x, batch_size, n_outputs, n_classes, n_outputs_per_class,
              W1, b1, W2, b2, target=None):
    """ Two-level hierarchical softmax.

    The architecture is composed of two softmax layers: the first predicts the
    class of the input x while the second predicts the output of the input x in
    the predicted class.

    More explanations can be found in the original paper [1]_.

    If target is specified, it will only compute the outputs of the
    corresponding targets. Otherwise, if target is None, it will compute all
    the outputs.

    The outputs are grouped in the same order as they are initially defined.

    .. versionadded:: 0.7.1

    Parameters
    ----------
    x: tensor of shape (batch_size, number of features)
        the minibatch input of the two-layer hierarchical softmax.
    batch_size: int
        the size of the minibatch input x.
    n_outputs: int
        the number of outputs.
    n_classes: int
        the number of classes of the two-layer hierarchical softmax. It
        corresponds to the number of outputs of the first softmax. See note at
        the end.
    n_outputs_per_class: int
        the number of outputs per class. See note at the end.
    W1: tensor of shape (number of features of the input x, n_classes)
        the weight matrix of the first softmax, which maps the input x to the
        probabilities of the classes.
    b1: tensor of shape (n_classes,)
        the bias vector of the first softmax layer.
    W2: tensor of shape (n_classes, number of features of the input x,
        n_outputs_per_class)
        the weight matrix of the second softmax, which maps the input x to
        the probabilities of the outputs.
    b2: tensor of shape (n_classes, n_outputs_per_class)
        the bias vector of the second softmax layer.
    target: tensor of shape either (batch_size,) or (batch_size, 1)
        (optional, default None)
        contains the indices of the targets for the minibatch
        input x. For each input, the function computes the output for its
        corresponding target. If target is None, then all the outputs are
        computed for each input.

    Returns
    -------
    output_probs: tensor of shape (batch_size, n_outputs) or (batch_size,)
        Output of the two-layer hierarchical softmax for input x. If target is
        not specified (None), then all the outputs are computed and the
        returned tensor has shape (batch_size, n_outputs). Otherwise, when
        target is specified, only the corresponding outputs are computed and
        the returned tensor has thus shape (batch_size,), one probability per
        example.

    Notes
    -----
    The product of n_outputs_per_class and n_classes has to be greater or
    equal to n_outputs. If it is strictly greater, then the irrelevant outputs
    will be ignored.

    n_outputs_per_class and n_classes have to be the same as the corresponding
    dimensions of the tensors of W1, b1, W2 and b2.

    The most computationally efficient configuration is when
    n_outputs_per_class and n_classes are equal to the square root of
    n_outputs.

    References
    ----------
    .. [1] J. Goodman, "Classes for Fast Maximum Entropy Training,"
        ICASSP, 2001, http://arxiv.org/abs/cs/0108006

    """
    # First softmax that computes the probabilities of belonging to each class
    class_probs = theano.tensor.nnet.softmax(tensor.dot(x, W1) + b1)

    if target is None:  # Computes the probabilities of all the outputs
        # Every example uses every class: tile [0..n_classes) per row.
        class_ids = tensor.tile(
            tensor.arange(n_classes, dtype="int32")[None, :], (batch_size, 1))

        # Second softmax that computes the output probabilities.
        # W2/x get a singleton block axis; the zero input indices select that
        # single input block, class_ids select the output blocks.
        activations = sparse_block_dot(
            W2[None, :, :, :], x[:, None, :],
            tensor.zeros((batch_size, 1), dtype='int32'), b2, class_ids)

        output_probs = theano.tensor.nnet.softmax(
            activations.reshape((-1, n_outputs_per_class)))
        output_probs = output_probs.reshape((batch_size, n_classes, -1))
        # Joint probability = P(class) * P(output | class).
        output_probs = class_probs[:, :, None] * output_probs
        output_probs = output_probs.reshape((batch_size, -1))
        # output_probs.shape[1] is n_classes * n_outputs_per_class, which might
        # be greater than n_outputs, so we ignore the potential irrelevant
        # outputs with the next line:
        output_probs = output_probs[:, :n_outputs]

    else:  # Computes the probabilities of the outputs specified by the targets
        # Accept (batch_size,) or (batch_size, 1) targets.
        target = target.flatten()

        # Classes to which belong each target
        target_classes = target // n_outputs_per_class

        # Outputs to which belong each target inside a class
        target_outputs_in_class = target % n_outputs_per_class

        # Second softmax that computes the output probabilities, restricted
        # to each example's target class.
        activations = sparse_block_dot(
            W2[None, :, :, :], x[:, None, :],
            tensor.zeros((batch_size, 1), dtype='int32'), b2,
            target_classes[:, None])

        output_probs = theano.tensor.nnet.softmax(activations[:, 0, :])
        target_class_probs = class_probs[tensor.arange(batch_size),
                                         target_classes]
        # Fancy indexing with two 1-D index vectors yields a 1-D result of
        # shape (batch_size,).
        output_probs = output_probs[tensor.arange(batch_size),
                                    target_outputs_in_class]
        output_probs = target_class_probs * output_probs

    return output_probs
"""
Optimizations addressing the ops in sandbox root directory
Optimizations addressing the ops in nnet root directory
"""
from theano import compile # to register the optimizer built by this file
from theano import gof
from theano.sandbox.blocksparse import (
from theano.tensor.nnet.blocksparse import (
SparseBlockGemv,
SparseBlockOuter,
sparse_block_gemv_inplace,
......
......@@ -10,7 +10,7 @@ import theano
from theano import tensor
import theano.tests.unittest_tools as utt
from theano.sandbox.blocksparse import sparse_block_dot, \
from theano.tensor.nnet.blocksparse import sparse_block_dot, \
sparse_block_gemv, sparse_block_outer
......
......@@ -28,7 +28,8 @@ from theano.tensor.nnet import (categorical_crossentropy,
softmax_with_bias, SoftmaxGrad,
Prepend_scalar_constant_to_each_row,
Prepend_scalar_to_each_row,
relu)
relu,
h_softmax)
from theano.tensor import matrix, vector, lvector, scalar
......@@ -1437,5 +1438,70 @@ def test_relu():
assert numpy.allclose(y, numpy.where(X > 0, X, A * X), rtol=3e-5)
if __name__ == '__main__':
unittest.main()
def test_h_softmax():
    """
    Tests the output dimensions of the h_softmax when a target is provided or
    not.
    """
    # ---- Configuration of the toy model ----
    n_features = 4
    n_examples = 2
    n_level1 = 5  # outputs of the first (class) softmax
    n_level2 = 3  # outputs per class in the second softmax
    n_outputs = n_level1 * n_level2

    floatX = theano.config.floatX

    # ---- Shared parameters ----
    # First level of h_softmax.
    level1_w = theano.shared(numpy.asarray(
        numpy.random.normal(size=(n_features, n_level1)), dtype=floatX))
    level1_b = theano.shared(numpy.zeros((n_level1,), dtype=floatX))

    # Second level of h_softmax.
    level2_w = theano.shared(numpy.asarray(
        numpy.random.normal(size=(n_level1, n_features, n_level2)),
        dtype=floatX))
    level2_b = theano.shared(numpy.zeros((n_level1, n_level2), dtype=floatX))

    # ---- Symbolic graph ----
    x = tensor.matrix('x')
    y = tensor.ivector('y')

    # Restricted output: only the probability of each example's target.
    y_hat_tg = h_softmax(x, n_examples, n_outputs, n_level1, n_level2,
                         level1_w, level1_b, level2_w, level2_b, y)

    # Full output: probabilities of every output.
    y_hat_all = h_softmax(x, n_examples, n_outputs, n_level1, n_level2,
                          level1_w, level1_b, level2_w, level2_b)

    # ---- Compiled functions ----
    fun_output_tg = theano.function([x, y], y_hat_tg)
    fun_output = theano.function([x], y_hat_all)

    # ---- Shape checks ----
    x_mat = numpy.random.normal(
        size=(n_examples, n_features)).astype(floatX)
    y_mat = numpy.random.randint(0, n_outputs, n_examples).astype('int32')

    assert fun_output_tg(x_mat, y_mat).shape == (n_examples,)
    assert fun_output(x_mat).shape == (n_examples, n_outputs)
import theano
from theano import tensor
from theano.sandbox.blocksparse import sparse_block_dot
from theano.tensor.nnet.blocksparse import sparse_block_dot
def test_blocksparse_inplace_gemv_opt():
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论