Merge pull request #1392 from nouiz/sigmoid

Add hard_sigmoid, faster, but more approx.

Merge pull request #1392 from nouiz/sigmoid
0ff016bc · lamblin · 00fbaf1d · aba531c0 · 0ff016bc · 0ff016bc
--- a/doc/library/tensor/nnet/nnet.txt
+++ b/doc/library/tensor/nnet/nnet.txt
@@ -15,59 +15,92 @@
    :Parameters: *x* - symbolic Tensor (or compatible)
    :Return type: same as x
    :Returns: element-wise sigmoid: :math:`sigmoid(x) = \frac{1}{1 + \exp(-x)}`.
-    :note: see :func:`ultra_fast_sigmoid` for a faster version
+    :note: see :func:`ultra_fast_sigmoid` or :func:`hard_sigmoid` for faster versions.
+        Speed comparison for 100M float64 elements on a Core2 Duo @ 3.16 GHz.

-Example:
+          - hard_sigmoid: 1.1s
+          - ultra_fast_sigmoid: 1.4s
+          - sigmoid (with amdlibm): 2.3s
+          - sigmoid (without amdlibm): 3.7s

-.. code-block:: python
+        Precision: sigmoid(with or without amdlibm) > ultra_fast_sigmoid > hard_sigmoid.

-    x,y,b = T.dvectors('x','y','b')
-    W = T.dmatrix('W')
-    y = T.nnet.sigmoid(T.dot(W,x) + b)
+   .. image:: sigmoid_prec.png

-.. note:: The underlying code will return an exact 0 or 1 if an element of x is too small or too big.
+   Example:
+
+   .. code-block:: python
+
+       x,y,b = T.dvectors('x','y','b')
+       W = T.dmatrix('W')
+       y = T.nnet.sigmoid(T.dot(W,x) + b)
+
+   .. note:: The underlying code will return an exact 0 or 1 if an
+      element of x is too small or too big.

 .. function:: ultra_fast_sigmoid(x)

-   Returns the standard sigmoid nonlinearity applied to x
+   Returns the *approximated* standard :func:`sigmoid` nonlinearity applied to x.
    :Parameters: *x* - symbolic Tensor (or compatible)
    :Return type: same as x
    :Returns: approximated element-wise sigmoid: :math:`sigmoid(x) = \frac{1}{1 + \exp(-x)}`.
-    :note: To automatically change all sigmoid op to this version, use
+    :note: To automatically change all :func:`sigmoid` op to this version, use
      the Theano optimization ``local_ultra_fast_sigmoid``. This can be done
      with the Theano flag ``optimizer_including=local_ultra_fast_sigmoid``.
      This optimization is done late, so it shouldn't affect
      stabilization optimization.

+   .. note:: The underlying code will return 0.00247262315663 as the
+       minimum value and 0.997527376843 as the maximum value. So it
+       never returns 0 or 1.
+
+
+
+
+.. function:: hard_sigmoid(x)
+
+   Returns the *approximated* standard :func:`sigmoid` nonlinearity applied to x.
+    :Parameters: *x* - symbolic Tensor (or compatible)
+    :Return type: same as x
+    :Returns: approximated element-wise sigmoid: :math:`sigmoid(x) = \frac{1}{1 + \exp(-x)}`.
+    :note: To automatically change all :func:`sigmoid` op to this version, use
+      the Theano optimization ``local_hard_sigmoid``. This can be done
+      with the Theano flag ``optimizer_including=local_hard_sigmoid``.
+      This optimization is done late, so it shouldn't affect
+      stabilization optimization.
+
+   .. note:: The underlying code will return an exact 0 or 1 if an
+      element of x is too small or too big.
+
 .. function:: softplus(x)

   Returns the softplus nonlinearity applied to x
    :Parameter: *x* - symbolic Tensor (or compatible)
    :Return type: same as x
-    :Returns: elementwise softplus: :math:`softplus(x) = \log_e{\left(1 + \exp(x)\right)}`. 
+    :Returns: elementwise softplus: :math:`softplus(x) = \log_e{\left(1 + \exp(x)\right)}`.

-.. note:: The underlying code will return an exact 0 if an element of x is too small.
+   .. note:: The underlying code will return an exact 0 if an element of x is too small.

-.. code-block:: python
+   .. code-block:: python

-    x,y,b = T.dvectors('x','y','b')
-    W = T.dmatrix('W')
-    y = T.nnet.softplus(T.dot(W,x) + b)
+       x,y,b = T.dvectors('x','y','b')
+       W = T.dmatrix('W')
+       y = T.nnet.softplus(T.dot(W,x) + b)

 .. function:: softmax(x)

   Returns the softmax function of x:
-    :Parameter: *x* symbolic **2D** Tensor (or compatible). 
+    :Parameter: *x* symbolic **2D** Tensor (or compatible).
    :Return type: same as x
-    :Returns: a symbolic 2D tensor whose ijth element is  :math:`softmax_{ij}(x) = \frac{\exp{x_{ij}}}{\sum_k\exp(x_{ik})}`. 
+    :Returns: a symbolic 2D tensor whose ijth element is  :math:`softmax_{ij}(x) = \frac{\exp{x_{ij}}}{\sum_k\exp(x_{ik})}`.

-The softmax function will, when applied to a matrix, compute the softmax values row-wise. 
+   The softmax function will, when applied to a matrix, compute the softmax values row-wise.

-.. code-block:: python
+   .. code-block:: python

-    x,y,b = T.dvectors('x','y','b')
-    W = T.dmatrix('W')
-    y = T.nnet.softmax(T.dot(W,x) + b)
+       x,y,b = T.dvectors('x','y','b')
+       W = T.dmatrix('W')
+       y = T.nnet.softmax(T.dot(W,x) + b)

 .. function:: binary_crossentropy(output,target)

@@ -78,27 +111,27 @@ The softmax function will, when applied to a matrix, compute the softmax values
       * *output* - symbolic Tensor (or compatible)

    :Return type: same as target
-    :Returns: a symbolic tensor, where the following is applied elementwise :math:`crossentropy(t,o) = -(t\cdot log(o) + (1 - t) \cdot log(1 - o))`. 
+    :Returns: a symbolic tensor, where the following is applied elementwise :math:`crossentropy(t,o) = -(t\cdot log(o) + (1 - t) \cdot log(1 - o))`.

-The following block implements a simple auto-associator with a sigmoid
-nonlinearity and a reconstruction error which corresponds to the binary
-cross-entropy (note that this assumes that x will contain values between 0 and
-1):
+   The following block implements a simple auto-associator with a
+   sigmoid nonlinearity and a reconstruction error which corresponds
+   to the binary cross-entropy (note that this assumes that x will
+   contain values between 0 and 1):

-.. code-block:: python
+   .. code-block:: python

-    x, y, b = T.dvectors('x', 'y', 'b')
-    W = T.dmatrix('W') 
-    h = T.nnet.sigmoid(T.dot(W, x) + b)
-    x_recons = T.nnet.sigmoid(T.dot(V, h) + c)
-    recon_cost = T.nnet.binary_crossentropy(x_recons, x).mean()
+       x, y, b = T.dvectors('x', 'y', 'b')
+       W = T.dmatrix('W')
+       h = T.nnet.sigmoid(T.dot(W, x) + b)
+       x_recons = T.nnet.sigmoid(T.dot(V, h) + c)
+       recon_cost = T.nnet.binary_crossentropy(x_recons, x).mean()

 .. function:: categorical_crossentropy(coding_dist,true_dist)

-    Return the cross-entropy between an approximating distribution and a true distribution. 
+    Return the cross-entropy between an approximating distribution and a true distribution.
    The cross entropy between two probability distributions measures the average number of bits
    needed to identify an event from a set of possibilities, if a coding scheme is used based
-    on a given probability distribution q, rather than the "true" distribution p. Mathematically, this 
+    on a given probability distribution q, rather than the "true" distribution p. Mathematically, this
    function computes :math:`H(p,q) = - \sum_x p(x) \log(q(x))`, where
    p=true_dist and q=coding_dist.

@@ -112,15 +145,17 @@ cross-entropy (note that this assumes that x will contain values between 0 and

    :Return type: tensor of rank one-less-than `coding_dist`

-.. note:: An application of the scenario where *true_dist* has a 1-of-N representation
-    is in classification with softmax outputs. If `coding_dist` is the output of
-    the softmax and `true_dist` is a vector of correct labels, then the function
-    will compute ``y_i = - \log(coding_dist[i, one_of_n[i]])``, which corresponds
-    to computing the neg-log-probability of the correct class (which is typically
-    the training criterion in classification settings).
+   .. note:: An application of the scenario where *true_dist* has a
+       1-of-N representation is in classification with softmax
+       outputs. If `coding_dist` is the output of the softmax and
+       `true_dist` is a vector of correct labels, then the function
+       will compute ``y_i = - \log(coding_dist[i, one_of_n[i]])``,
+       which corresponds to computing the neg-log-probability of the
+       correct class (which is typically the training criterion in
+       classification settings).

-.. code-block:: python
+   .. code-block:: python

-    y = T.nnet.softmax(T.dot(W, x) + b)
-    cost = T.nnet.categorical_crossentropy(y, o)
-    # o is either the above-mentioned 1-of-N vector or 2D tensor
+       y = T.nnet.softmax(T.dot(W, x) + b)
+       cost = T.nnet.categorical_crossentropy(y, o)
+       # o is either the above-mentioned 1-of-N vector or 2D tensor
--- a/doc/library/tensor/nnet/sigmoid_prec.png
+++ b/doc/library/tensor/nnet/sigmoid_prec.png
--- a/theano/tensor/nnet/__init__.py
+++ b/theano/tensor/nnet/__init__.py
@@ -4,4 +4,5 @@ from Conv3D import *
 from ConvGrad3D import *
 from ConvTransp3D import *
 from sigm import (softplus, sigmoid, sigmoid_inplace,
-                  scalar_sigmoid, ultra_fast_sigmoid)
+                  scalar_sigmoid, ultra_fast_sigmoid,
+                  hard_sigmoid)
--- a/theano/tensor/nnet/sigm.py
+++ b/theano/tensor/nnet/sigm.py
@@ -112,6 +112,42 @@ for i in xrange(750):
        """ % locals()
        raise theano.gof.utils.MethodNotDefined()

+    @staticmethod
+    def gen_graph():
+        """
+        Generate the graph sigmoid_prec.png used in the doc.
+
+        It plots sigmoid, ultra_fast_sigmoid and hard_sigmoid from -15
+        to 15 with a step of 0.1, saves the picture in the doc tree and
+        prints the max and min values of ultra_fast_sigmoid.
+        """
+        # matplotlib is only needed here, so it is imported locally.
+        import matplotlib.pyplot as plt
+        import os
+
+        data = numpy.arange(-15, 15, .1)
+        val = 1/(1+numpy.exp(-data))
+        val_hard = theano.tensor.nnet.hard_sigmoid(data).eval()
+        val_ultra = theano.tensor.nnet.ultra_fast_sigmoid(data).eval()
+
+        fig = plt.figure()
+        ax = fig.add_subplot(111)
+        ax.plot(data, val)
+        ax.plot(data, val_ultra)
+        ax.plot(data, val_hard)
+        ax.grid(True)
+        # Give loc by keyword: a second positional argument is not
+        # interpreted as the location by every matplotlib version.
+        ax.legend(("sigmoid", "ultra_fast", "hard"), loc="upper left")
+        fname = os.path.join(os.path.dirname(theano.__file__), '..',
+                             'doc', 'library', 'tensor', 'nnet',
+                             'sigmoid_prec.png')
+        plt.savefig(fname)
+        print "New picture saved at", fname
+        print val_ultra.max()
+        print val_ultra.min()
+
+
 scalar_sigmoid = ScalarSigmoid(scalar.upgrade_to_float, name='scalar_sigmoid')
 sigmoid = elemwise.Elemwise(scalar_sigmoid, name='sigmoid')

@@ -210,7 +246,6 @@ def local_ultra_fast_sigmoid(node):
    if (isinstance(node.op, tensor.Elemwise) and
            node.op.scalar_op == scalar_sigmoid):
        out = ultra_fast_sigmoid(node.inputs[0])
-        out2 = ultra_fast_sigmoid(node.inputs[0])

        def values_eq_approx_remove_low_prec(a, b):
            # atol is found by trial/error.
@@ -223,6 +258,41 @@ theano.compile.optdb['uncanonicalize'].register("local_ultra_fast_sigmoid",
                                                local_ultra_fast_sigmoid)


+def hard_sigmoid(x):
+    """An approximation of sigmoid.
+
+    More approximate and faster than ultra_fast_sigmoid.
+
+    Approx in 3 parts: 0, scaled linear, 1
+
+    Removing the slope and shift does not make it faster.
+
+    """
+    slope = 0.2
+    shift = 0.5
+    x = (x * slope) + shift
+    x = tensor.clip(x, 0, 1)
+    return x
+
+
+#@opt.register_uncanonicalize
+@gof.local_optimizer([sigmoid])
+def local_hard_sigmoid(node):
+    """Swap an Elemwise sigmoid node for hard_sigmoid of its input."""
+    op = node.op
+    if not (isinstance(op, tensor.Elemwise) and
+            op.scalar_op == scalar_sigmoid):
+        return None
+    def values_eq_approx_remove_low_prec(a, b):
+        # Loose comparison for DebugMode; atol found by trial/error.
+        return tensor.TensorType.values_eq_approx(a, b, atol=0.1)
+    approx = hard_sigmoid(node.inputs[0])
+    approx.values_eq_approx = values_eq_approx_remove_low_prec
+    return [approx]
+theano.compile.optdb['uncanonicalize'].register("local_hard_sigmoid",
+                                                local_hard_sigmoid)
+
+
 class ScalarSoftplus(scalar.UnaryScalarOp):
    @staticmethod
    def static_impl(x):

--- a/theano/tensor/nnet/tests/test_sigm.py
+++ b/theano/tensor/nnet/tests/test_sigm.py
@@ -9,7 +9,7 @@ from theano import tensor as T
 from theano import config
 from theano.tests import unittest_tools as utt
 from theano.tensor.nnet import (sigmoid, sigmoid_inplace,
-                                softplus, ultra_fast_sigmoid)
+                                softplus, ultra_fast_sigmoid, hard_sigmoid)
 from theano.tensor.nnet.sigm import (
    compute_mul, is_1pexp, parse_mul_tree, perform_sigm_times_exp,
    register_local_1msigmoid, simplify_mul,
@@ -46,6 +46,16 @@ UltraFastSigmoidTester = makeBroadcastTester(
 # This is an approx of the sigmoid. That is why we raise eps
    eps=5e-2)

+HardSigmoidTester = makeBroadcastTester(
+    op=hard_sigmoid,
+    expected=lambda inputs: check_floatX(
+        inputs, 1/(1+numpy.exp(-inputs))),
+    good=_good_broadcast_unary_normal_no_complex,
+    #grad=_grad_broadcast_unary_normal,
+    name='HardSigmoidTester',
+# hard_sigmoid is only an approx of the sigmoid, so eps is raised.
+    eps=1e-1)
+

 SoftplusTester = makeBroadcastTester(
    op=softplus,
@@ -295,11 +305,32 @@ class T_sigmoid_opts(unittest.TestCase):

        mode = self.get_mode('local_ultra_fast_sigmoid')
        f = theano.function([x], s, mode=mode)
-        assert f.maker.fgraph.toposort()[0].op == sigmoid
+        topo = f.maker.fgraph.toposort()
+        assert len(topo) == 1
+        assert topo[0].op == sigmoid

        mode = self.get_mode().including('local_ultra_fast_sigmoid')
        f = theano.function([x], s, mode=mode)
-        assert f.maker.fgraph.toposort()[0].op == ultra_fast_sigmoid
+        topo = f.maker.fgraph.toposort()
+        assert topo[0].op == ultra_fast_sigmoid
+        assert len(topo) == 1
+        ux_v = f([[-50, -10, -4, -1, 0, 1, 4, 10, 50]])
+
+    def test_local_hard_sigmoid(self):
+        inp = tensor.matrix('x')
+        out = sigmoid(inp)
+        # Mode from get_mode('local_hard_sigmoid'): sigmoid is kept.
+        kept = self.get_mode('local_hard_sigmoid')
+        f = theano.function([inp], out, mode=kept)
+        nodes = f.maker.fgraph.toposort()
+        assert nodes[0].op == sigmoid
+        assert len(nodes) == 1
+        # Opt explicitly included: several nodes, none is sigmoid.
+        swapped = self.get_mode().including('local_hard_sigmoid')
+        f = theano.function([inp], out, mode=swapped)
+        nodes = f.maker.fgraph.toposort()
+        assert len(nodes) > 1
+        assert not any(n.op == sigmoid for n in nodes)
         ux_v = f([[-50, -10, -4, -1, 0, 1, 4, 10, 50]])