Commit 8e1612a7 authored by AdeB

Add doc to h_softmax

Parent 02482e46
@@ -20,3 +20,4 @@ and ops which are particular to neural networks and deep learning.
     nnet
     neighbours
     bn
+    blocksparse
@@ -21,6 +21,7 @@
 - :func:`relu() <theano.tensor.nnet.relu>`
 - :func:`binary_crossentropy`
 - :func:`.categorical_crossentropy`
+- :func:`h_softmax() <theano.tensor.nnet.h_softmax>`

 .. function:: sigmoid(x)
@@ -204,3 +205,6 @@
     y = T.nnet.softmax(T.dot(W, x) + b)
     cost = T.nnet.categorical_crossentropy(y, o)
     # o is either the above-mentioned 1-of-N vector or 2D tensor
+
+.. autofunction:: theano.tensor.nnet.h_softmax
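The snippet in the hunk above pairs a softmax output layer with `categorical_crossentropy`, where the target `o` may be either a vector of class indices or a 2D tensor of target distributions. As a hedged illustration (plain NumPy, not Theano's actual implementation; the function names merely mirror the documented ones), the computation amounts to:

```python
import numpy as np

# NumPy sketch of softmax + categorical cross-entropy as documented above.
# The target o is either a 1D vector of integer class indices or a 2D
# tensor of per-row target distributions (e.g. one-hot rows).
def softmax(z):
    e = np.exp(z - z.max(axis=1, keepdims=True))  # shift for stability
    return e / e.sum(axis=1, keepdims=True)

def categorical_crossentropy(y, o):
    if o.ndim == 1:                       # vector of class indices
        return -np.log(y[np.arange(y.shape[0]), o])
    return -(o * np.log(y)).sum(axis=1)   # 2D tensor of distributions

y = softmax(np.array([[2.0, 1.0, 0.1],
                      [0.5, 2.5, 0.0]]))
idx_cost = categorical_crossentropy(y, np.array([0, 1]))
onehot_cost = categorical_crossentropy(y, np.eye(3)[[0, 1]])
```

Both target encodings give the same per-example cost, which is why the comment in the diff says `o` can take either form.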
@@ -46,7 +46,7 @@ from theano.sandbox.cuda.blas import (
     GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad,
     GpuDownsampleFactorMaxGradGrad)
-from theano.sandbox.blocksparse import SparseBlockGemv, SparseBlockOuter
+from theano.tensor.nnet.blocksparse import SparseBlockGemv, SparseBlockOuter
 from theano.sandbox.cuda.blocksparse import (
     GpuSparseBlockGemv,
     GpuSparseBlockOuter,
...
@@ -22,7 +22,7 @@ else:
     class BlockSparse_Gemv_and_Outer(
-            theano.sandbox.tests.test_blocksparse.BlockSparse_Gemv_and_Outer):
+            theano.tensor.nnet.tests.test_blocksparse.BlockSparse_Gemv_and_Outer):
         def setUp(self):
             utt.seed_rng()
             self.mode = mode_with_gpu.excluding('constant_folding')
...
@@ -2068,7 +2068,9 @@ def h_softmax(x, batch_size, n_outputs, n_classes, n_outputs_per_class,
     The outputs are grouped in the same order as they are initially defined.

-    Arguments:
+    .. versionadded:: 0.7.1
+
+    Parameters
     ----------
     x: tensor of shape (batch_size, number of features)
         the minibatch input of the two-layer hierarchical softmax.
@@ -2087,19 +2089,18 @@ def h_softmax(x, batch_size, n_outputs, n_classes, n_outputs_per_class,
     n_outputs_per_class: int
         the number of outputs per class. See note at the end.
-    W1: tensor of shape (number of features of the input x, number of classes)
+    W1: tensor of shape (number of features of the input x, n_classes)
         the weight matrix of the first softmax, which maps the input x to the
         probabilities of the classes.
-    b1: tensor of shape (number of classes,)
+    b1: tensor of shape (n_classes,)
         the bias vector of the first softmax layer.
-    W2: tensor of shape (number of classes, number of features of the input x,
-        number of outputs per class)
+    W2: tensor of shape (n_classes, number of features of the input x, n_outputs_per_class)
         the weight matrix of the second softmax, which maps the input x to
         the probabilities of the outputs.
-    b2: tensor of shape (number of classes, number of outputs per class)
+    b2: tensor of shape (n_classes, n_outputs_per_class)
         the bias vector of the second softmax layer.
     target: tensor of shape either (batch_size,) or (batch_size, 1)
@@ -2109,7 +2110,16 @@ def h_softmax(x, batch_size, n_outputs, n_classes, n_outputs_per_class,
         corresponding target. If target is None, then all the outputs are
         computed for each input.

-    Notes
+    Returns
+    -------
+    tensor of shape (batch_size, n_outputs) or (batch_size, 1)
+        Output of the two-layer hierarchical softmax for input x. If target
+        is not specified (None), then all the outputs are computed and the
+        returned tensor has shape (batch_size, n_outputs). Otherwise, when
+        target is specified, only the corresponding outputs are computed and
+        the returned tensor has shape (batch_size, 1).
+
+    Notes
     -----
     The product of n_outputs_per_class and n_classes has to be greater or equal
     to n_outputs. If it is strictly greater, then the irrelevant outputs will
...
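The docstring being edited above fully determines the shapes involved, so the two-layer computation can be sketched in plain NumPy. This is an illustrative reimplementation, not Theano's code: `h_softmax_np` and `softmax` are hypothetical helper names, and the parameter names (`W1`, `b1`, `W2`, `b2`, `target`) simply follow the documented ones.

```python
import numpy as np

def softmax(z):
    # Numerically stable softmax over the last axis.
    e = np.exp(z - z.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def h_softmax_np(x, W1, b1, W2, b2, target=None):
    # First softmax: class probabilities, shape (batch_size, n_classes).
    class_probs = softmax(x @ W1 + b1)
    # Second softmax: per-class output probabilities,
    # shape (batch_size, n_classes, n_outputs_per_class).
    output_probs = softmax(np.einsum('bf,cfo->bco', x, W2) + b2)
    # Joint probability of (class, output-within-class), flattened so the
    # outputs are grouped in the order the classes are defined.
    probs = (class_probs[:, :, None] * output_probs).reshape(x.shape[0], -1)
    if target is None:
        return probs                                   # (batch_size, n_outputs)
    target = np.asarray(target).ravel()
    return probs[np.arange(x.shape[0]), target][:, None]  # (batch_size, 1)

rng = np.random.default_rng(0)
batch, feats, n_classes, n_per_class = 4, 5, 3, 2
x = rng.normal(size=(batch, feats))
W1 = rng.normal(size=(feats, n_classes));            b1 = np.zeros(n_classes)
W2 = rng.normal(size=(n_classes, feats, n_per_class)); b2 = np.zeros((n_classes, n_per_class))
all_probs = h_softmax_np(x, W1, b1, W2, b2)                     # (4, 6)
one_prob = h_softmax_np(x, W1, b1, W2, b2, target=[0, 1, 2, 3]) # (4, 1)
```

Here `n_outputs_per_class * n_classes` equals `n_outputs` exactly; as the Notes section says, it only has to be greater or equal, in which case the trailing irrelevant outputs would be ignored.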