Moved sigmoid() and softplus() to a new file (sigm.py); added related optimizations and tests.

60a89d0b · James Bergstra · 525000a6 · 60a89d0b · 60a89d0b · 60a89d0b
--- a/theano/tensor/nnet/__init__.py
+++ b/theano/tensor/nnet/__init__.py
 from nnet import *
+from sigm import softplus, sigmoid, sigmoid_inplace, scalar_sigmoid
--- a/theano/tensor/nnet/nnet.py
+++ b/theano/tensor/nnet/nnet.py
@@ -4,89 +4,14 @@
 """

 from theano import gof
-from theano import scalar
 from theano import printing
-from theano.printing import pprint
 from theano.tensor import basic as tensor
 from theano.tensor import elemwise
 from theano.tensor import opt
 from theano.compile import optdb
 import numpy

-############
-#
-# SCALAR OPS
-#
-
-class ScalarSigmoid(scalar.UnaryScalarOp):
-    @staticmethod
-    def st_impl(x):
-        if x < -30.0:
-            return 0.0
-        if x > 30.0:
-            return 1.0 
-        return 1.0 / (1.0 + numpy.exp(-x))
-    def impl(self, x):
-        return ScalarSigmoid.st_impl(x)
-    def grad(self, (x,), (gz,)):
-        y = scalar_sigmoid(x)
-        return [gz * y * (1.0 - y)]
-    def c_code(self, node, name, (x,), (z,), sub):
-        if node.inputs[0].type == scalar.float32:
-            # These constants were obtained by looking at the output of python commands like:
-            #  for i in xrange(750):
-            #      print i, repr( theano._asarray(1.0, dtype=dt) / (theano._asarray(1.0, dtype=dt) + numpy.exp(-theano._asarray([i,-i], dtype=dt))))
-            # the boundary checks prevent us from generating inf
-            return """%(z)s = %(x)s < -88.0f ? 0.0 : %(x)s > 15.0f ? 1.0f : 1.0f /(1.0f + exp(-%(x)s));""" % locals()
-        elif node.inputs[0].type == scalar.float64:
-            return """%(z)s = %(x)s < -709.0 ? 0.0 : %(x)s > 19.0 ? 1.0 : 1.0 /(1.0+exp(-%(x)s));""" % locals()
-        else:
-            raise NotImplementedError('only floatingpoint is implemented')
-    def c_code_cache_version(self):
-        v = super(ScalarSigmoid, self).c_code_cache_version()
-        if v:
-            return (2,) + v
-        else:
-            return v
-scalar_sigmoid = ScalarSigmoid(scalar.upgrade_to_float, name='scalar_sigmoid')
-sigmoid = elemwise.Elemwise(scalar_sigmoid, name='sigmoid')
-
-pprint.assign(sigmoid, printing.FunctionPrinter('sigmoid'))
-
-
-class ScalarSoftplus(scalar.UnaryScalarOp):
-    @staticmethod
-    def static_impl(x):
-        if x < -30.0:
-            return 0.0
-        if x > 30.0:
-            return x
-        return numpy.log1p(numpy.exp(x))
-    def impl(self, x):
-        return ScalarSoftplus.static_impl(x)
-    def grad(self, (x,), (gz,)):
-        return [gz * scalar_sigmoid(x)]
-    def c_code(self, node, name, (x,), (z,), sub):
-        if node.inputs[0].type == scalar.float32:
-            # These constants were obtained by looking at the output of python commands like:
-            #  for i in xrange(750):
-            #      print i, repr( numpy.log1p(numpy.exp(theano._asarray([i,-i], dtype=dt))))
-            # the boundary checks prevent us from generating inf
-            return """%(z)s = %(x)s < -103.0f ? 0.0 : %(x)s > 14.0f ? %(x)s : log1p(exp(%(x)s));""" % locals()
-        elif node.inputs[0].type == scalar.float64:
-            return """%(z)s = %(x)s < -745.0 ? 0.0 : %(x)s > 16.0 ? %(x)s : log1p(exp(%(x)s));""" % locals()
-        else:
-            raise NotImplementedError('only floatingpoint is implemented')
-    def c_code_cache_version(self):
-        v = super(ScalarSoftplus, self).c_code_cache_version()
-        if v:
-            return (2,) + v
-        else:
-            return v
-scalar_softplus = ScalarSoftplus(scalar.upgrade_to_float, name='scalar_softplus')
-softplus = elemwise.Elemwise(scalar_softplus, name='softplus')
-
-pprint.assign(softplus, printing.FunctionPrinter('softplus'))
+from .sigm import sigmoid


 ############
@@ -1351,6 +1276,7 @@ def categorical_crossentropy(coding_dist, true_dist):
        raise TypeError('rank mismatch between coding and true distributions')


+from theano import scalar

 class Prepend_scalar_constant_to_each_row(gof.Op):
    def __init__(self, val = 0):
@@ -1440,14 +1366,3 @@ prepend_scalar_to_each_row = Prepend_scalar_to_each_row()
 prepend_0_to_each_row = Prepend_scalar_constant_to_each_row(0.)
 prepend_1_to_each_row = Prepend_scalar_constant_to_each_row(1.)

-logsigm_to_softplus = gof.PatternSub(
-    (tensor.log, (sigmoid, 'x')),
-    (tensor.neg, (softplus, (tensor.neg, 'x'))),
-    allow_multiple_clients = True)
-log1msigm_to_softplus = gof.PatternSub(
-    (tensor.log, (tensor.sub, tensor.constant([[1.0]]), (sigmoid, 'x'))),
-    (tensor.neg, (softplus, 'x')),
-    allow_multiple_clients = True)
-
-opt.register_specialize(logsigm_to_softplus, name = 'logsigm_to_softplus')
-opt.register_specialize(log1msigm_to_softplus, name = 'log1msigm_to_softplus')
--- a/theano/tensor/nnet/sigm.py
+++ b/theano/tensor/nnet/sigm.py
+"""Ops and optimizations: sigmoid, softplus
+
+These functions implement special cases of exp and log to improve numerical stability.
+"""
+import numpy
+
+from theano import gof
+from theano import scalar
+from theano import printing
+from theano.tensor import basic as tensor
+from theano.printing import pprint
+from theano.tensor import elemwise
+from theano.tensor import opt
+from theano.compile import optdb
+
+
+############
+#
+# SCALAR OPS
+#
+
+class ScalarSigmoid(scalar.UnaryScalarOp):
+    """Scalar logistic sigmoid, 1 / (1 + exp(-x)), with saturation guards.
+
+    Both the Python and the C implementation return 0 or 1 directly for
+    inputs beyond fixed thresholds instead of evaluating exp().
+    """
+    @staticmethod
+    def st_impl(x):
+        """Return sigmoid(x), saturating to 0.0 below -30 and 1.0 above 30."""
+        if x < -30.0:
+            return 0.0
+        if x > 30.0:
+            return 1.0 
+        return 1.0 / (1.0 + numpy.exp(-x))
+    def impl(self, x):
+        """Python implementation of the op; delegates to st_impl."""
+        return ScalarSigmoid.st_impl(x)
+    def grad(self, (x,), (gz,)):
+        """Return [gz * y * (1 - y)] with y = scalar_sigmoid(x)."""
+        y = scalar_sigmoid(x)
+        return [gz * y * (1.0 - y)]
+    def c_code(self, node, name, (x,), (z,), sub):
+        """Return the C statement assigning sigmoid(x) to z.
+
+        Only float32 and float64 inputs are supported; any other input type
+        raises NotImplementedError.
+        """
+        if node.inputs[0].type == scalar.float32:
+            # These constants were obtained by looking at the output of python commands like:
+            #  for i in xrange(750):
+            #      print i, repr( theano._asarray(1.0, dtype=dt) / (theano._asarray(1.0, dtype=dt) + numpy.exp(-theano._asarray([i,-i], dtype=dt))))
+            # the boundary checks prevent us from generating inf
+            return """%(z)s = %(x)s < -88.0f ? 0.0 : %(x)s > 15.0f ? 1.0f : 1.0f /(1.0f + exp(-%(x)s));""" % locals()
+        elif node.inputs[0].type == scalar.float64:
+            return """%(z)s = %(x)s < -709.0 ? 0.0 : %(x)s > 19.0 ? 1.0 : 1.0 /(1.0+exp(-%(x)s));""" % locals()
+        else:
+            raise NotImplementedError('only floatingpoint is implemented')
+    def c_code_cache_version(self):
+        """Return the base-class cache version prefixed with this op's own (2,).
+
+        A falsy base-class version is returned unchanged.
+        """
+        v = super(ScalarSigmoid, self).c_code_cache_version()
+        if v:
+            return (2,) + v
+        else:
+            return v
+# Scalar op instance and the elementwise op built from it.
+scalar_sigmoid = ScalarSigmoid(scalar.upgrade_to_float, name='scalar_sigmoid')
+sigmoid = elemwise.Elemwise(scalar_sigmoid, name='sigmoid')
+
+# Elementwise sigmoid declared with inplace_pattern={0:0}.
+# NOTE(review): presumably output 0 reuses the storage of input 0 -- confirm
+# against the Elemwise documentation.
+sigmoid_inplace = elemwise.Elemwise(
+        ScalarSigmoid(scalar.transfer_type(0)),
+        inplace_pattern={0:0},
+        name='sigmoid_inplace',
+        )
+
+# Register a FunctionPrinter named 'sigmoid' for the sigmoid op.
+pprint.assign(sigmoid, printing.FunctionPrinter('sigmoid'))
+
+
+class ScalarSoftplus(scalar.UnaryScalarOp):
+    """Scalar softplus, log(1 + exp(x)), with saturation guards.
+
+    Both the Python and the C implementation return 0 or x directly for
+    inputs beyond fixed thresholds instead of evaluating exp().
+    """
+    @staticmethod
+    def static_impl(x):
+        """Return log1p(exp(x)); 0.0 below -30 and x itself above 30."""
+        if x < -30.0:
+            return 0.0
+        if x > 30.0:
+            return x
+        return numpy.log1p(numpy.exp(x))
+    def impl(self, x):
+        """Python implementation of the op; delegates to static_impl."""
+        return ScalarSoftplus.static_impl(x)
+    def grad(self, (x,), (gz,)):
+        """Return [gz * scalar_sigmoid(x)]."""
+        return [gz * scalar_sigmoid(x)]
+    def c_code(self, node, name, (x,), (z,), sub):
+        """Return the C statement assigning softplus(x) to z.
+
+        Only float32 and float64 inputs are supported; any other input type
+        raises NotImplementedError.
+        """
+        if node.inputs[0].type == scalar.float32:
+            # These constants were obtained by looking at the output of python commands like:
+            #  for i in xrange(750):
+            #      print i, repr( numpy.log1p(numpy.exp(theano._asarray([i,-i], dtype=dt))))
+            # the boundary checks prevent us from generating inf
+            return """%(z)s = %(x)s < -103.0f ? 0.0 : %(x)s > 14.0f ? %(x)s : log1p(exp(%(x)s));""" % locals()
+        elif node.inputs[0].type == scalar.float64:
+            return """%(z)s = %(x)s < -745.0 ? 0.0 : %(x)s > 16.0 ? %(x)s : log1p(exp(%(x)s));""" % locals()
+        else:
+            raise NotImplementedError('only floatingpoint is implemented')
+    def c_code_cache_version(self):
+        """Return the base-class cache version prefixed with this op's own (2,).
+
+        A falsy base-class version is returned unchanged.
+        """
+        v = super(ScalarSoftplus, self).c_code_cache_version()
+        if v:
+            return (2,) + v
+        else:
+            return v
+# Scalar op instance and the elementwise op built from it.
+scalar_softplus = ScalarSoftplus(scalar.upgrade_to_float, name='scalar_softplus')
+softplus = elemwise.Elemwise(scalar_softplus, name='softplus')
+
+# Register a FunctionPrinter named 'softplus' for the softplus op.
+pprint.assign(softplus, printing.FunctionPrinter('softplus'))
+
+# Pattern rewrite: log(sigmoid(x)) -> -softplus(-x)
+logsigm_to_softplus = gof.PatternSub(
+    (tensor.log, (sigmoid, 'x')),
+    (tensor.neg, (softplus, (tensor.neg, 'x'))),
+    allow_multiple_clients = True)
+
+# Pattern rewrite: log(1 - sigmoid(x)) -> -softplus(x)
+# NOTE(review): the 1 is matched through tensor.constant([[1.0]]); presumably
+# only that exact constant form is recognized -- confirm.
+log1msigm_to_softplus = gof.PatternSub(
+    (tensor.log, (tensor.sub, tensor.constant([[1.0]]), (sigmoid, 'x'))),
+    (tensor.neg, (softplus, 'x')),
+    allow_multiple_clients = True)
+
+# Both rewrites are registered through opt.register_specialize.
+opt.register_specialize(logsigm_to_softplus, name = 'logsigm_to_softplus')
+opt.register_specialize(log1msigm_to_softplus, name = 'log1msigm_to_softplus')
+
+def is_1pexp(t):
+    """Recognize a term of the form ``1 + exp(x)``.
+
+    :param t: the variable to inspect.
+    :returns: ``(False, x)`` when `t` is an addition whose constant terms
+        sum to 1 and whose only non-constant term is ``exp(x)``; None
+        otherwise.  The leading False is the "negated" flag unpacked by
+        partition_num_or_denom; this matcher never reports a negation.
+    """
+    if t.owner and t.owner.op == tensor.add:
+        scalars, scalar_inputs, nonconsts = \
+                opt.scalarconsts_rest(t.owner.inputs)
+        # scalar_inputs are potentially dimshuffled and fill'd scalars
+        if len(nonconsts) == 1:
+            maybe_exp = nonconsts[0]
+            if maybe_exp.owner and maybe_exp.owner.op == tensor.exp:
+                # The constant part must really be 1.  Without this check
+                # any c + exp(x) (e.g. 2 + exp(x)) would be reported as
+                # 1 + exp(x) and wrongly rewritten into a sigmoid.  This is
+                # the same test local_inv_1_plus_exp applies.
+                if scalars and numpy.allclose(numpy.sum(scalars), 1):
+                    return False, maybe_exp.owner.inputs[0]
+    return None
+
+def is_exp(t):
+    """Recognize ``exp(x)`` or ``-exp(x)``.
+
+    :returns: ``(negated, x)`` where `negated` is True when the exp was
+        wrapped in a single neg, or None when `t` has neither form.
+    """
+    negated = False
+    node = t.owner
+    if node and node.op == tensor.neg:
+        # look through one negation and remember the sign
+        negated = True
+        node = node.inputs[0].owner
+    if node and node.op == tensor.exp:
+        return negated, node.inputs[0]
+    return None
+
+def partition_num_or_denom(r, f):
+    """Split the factors of `r` according to the matcher `f`.
+
+    `r` is treated as a product when it is the output of tensor.mul, and as
+    a single factor otherwise.  `f` is called on each factor and must return
+    None (no match) or a pair ``(negated, arg)``.
+
+    :returns: ``(matched_args, unmatched_factors, negated)`` where `negated`
+        is the xor of the flags reported for the matched factors.
+    """
+    if r.owner and r.owner.op == tensor.mul:
+        factors = r.owner.inputs
+    else:
+        factors = [r]
+
+    matched_args = []
+    unmatched = []
+    negated = False
+    for factor in factors:
+        hit = f(factor)
+        if hit is not None:
+            flip, arg = hit
+            matched_args.append(arg)
+            negated ^= flip # toggles the sign when flip is true
+            continue
+        unmatched.append(factor)
+    return matched_args, unmatched, negated
+
+
+@opt.register_specialize
+@opt.register_canonicalize
+@gof.local_optimizer([tensor.true_div])
+def local_exp_over_1_plus_exp(node):
+    """Rewrite divisions by (1+exp(x)) in terms of sigmoid.
+
+    exp(x)/(1+exp(x)) -> sigm(x)
+    c/(1+exp(x)) -> c*sigm(-x)
+    """
+    # applied for numerical stability, so client counts are not checked
+    if not (node.op == tensor.true_div):
+        return
+
+    numerator, denominator = node.inputs
+    # exp() factors of the numerator, (1+exp()) factors of the denominator
+    exp_args, num_others, num_negated = partition_num_or_denom(numerator, is_exp)
+    onepexp_args, denom_others, denom_negated = partition_num_or_denom(denominator, is_1pexp)
+
+    sigm_terms = []
+    for arg in onepexp_args:
+        if arg in exp_args:
+            # exp(x) / (1+exp(x)): consume the matching numerator factor
+            sigm_terms.append(sigmoid(arg))
+            exp_args.remove(arg)
+        else:
+            # 1 / (1+exp(x))
+            sigm_terms.append(sigmoid(-arg))
+
+    if not sigm_terms:
+        # no (1+exp(x)) factor in the denominator: leave the node alone
+        return
+
+    # rebuild the numerator: sigmoids, leftover exps, then everything else
+    factors = sigm_terms + [tensor.exp(arg) for arg in exp_args] + num_others
+    if len(factors) == 1:
+        result = factors[0]
+    else:
+        result = tensor.mul(*factors)
+
+    if num_negated ^ denom_negated:
+        result = -result
+
+    # divide by whatever denominator factors were not absorbed
+    if not denom_others:
+        return [result]
+    if len(denom_others) == 1:
+        return [result / denom_others[0]]
+    return [result / tensor.mul(*denom_others)]
+
+@opt.register_specialize
+@opt.register_canonicalize
+@gof.local_optimizer([tensor.mul])
+def local_sigm_times_exp(node):
+    """Fuse exp() and sigmoid() factors of a product into a single sigmoid.
+
+    exp(x)*sigm(-x) -> sigm(x)
+    exp(-x)*sigm(x) -> sigm(-x)
+
+    Returns a one-element list holding the replacement when at least one
+    pair was fused, and None otherwise.
+    """
+    # this is a numerical stability thing, so we dont check clients
+    if node.op == tensor.mul:
+        exp_x = []          # x for each factor exp(x)
+        exp_minus_x = []    # x for each factor exp(-x)
+        sigm_x = []         # x for each factor sigm(x)
+        sigm_minus_x = []   # x for each factor sigm(-x)
+        other = []          # every remaining factor
+        neg = False         # overall sign collected from stripped negs
+        for i in node.inputs:
+            while i.owner and i.owner.op == tensor.neg:
+                neg ^= True
+                i = i.owner.inputs[0]
+            if i.owner and i.owner.op == tensor.exp:
+                exp_arg = i.owner.inputs[0]
+                if exp_arg.owner and exp_arg.owner.op == tensor.neg:
+                    exp_minus_x.append(exp_arg.owner.inputs[0])
+                else:
+                    exp_x.append(exp_arg)
+            elif i.owner and i.owner.op == sigmoid:
+                sigm_arg = i.owner.inputs[0]
+                if sigm_arg.owner and sigm_arg.owner.op == tensor.neg:
+                    sigm_minus_x.append(sigm_arg.owner.inputs[0])
+                else:
+                    sigm_x.append(sigm_arg)
+            else:
+                other.append(i)
+
+        # remove matched pairs in exp_x and sigm_minus_x
+        did_something = False
+        for i in exp_x:
+            if i in sigm_minus_x:
+                sigm_minus_x.remove(i)
+                other.append(sigmoid(i))
+                did_something = True
+            else:
+                # unmatched: put the original factor exp(x) back, not the
+                # bare argument x
+                other.append(tensor.exp(i))
+
+        # remove matched pairs in exp_minus_x and sigm_x
+        for i in exp_minus_x:
+            if i in sigm_x:
+                sigm_x.remove(i)
+                other.append(sigmoid(-i))
+                did_something = True
+            else:
+                # unmatched: put the original factor exp(-x) back
+                other.append(tensor.exp(-i))
+        if did_something:
+            terms = other + [sigmoid(x) for x in sigm_x] \
+                    + [sigmoid(-x) for x in sigm_minus_x]
+            if len(terms)>1:
+                rval = tensor.mul(*terms)
+            else:
+                rval = terms[0]
+
+            if neg:
+                return [-rval]
+            else:
+                return [rval]
+
+
+@opt.register_specialize
+@opt.register_canonicalize
+@gof.local_optimizer([tensor.inv])
+def local_inv_1_plus_exp(node):
+    """
+    1/(1+exp(x)) -> sigm(-x)
+    """
+    # numerical-stability rewrite: applied regardless of client counts
+    if not (node.op == tensor.inv):
+        return
+    denom = node.inputs[0]
+    if not (denom.owner and denom.owner.op == tensor.add):
+        return
+    scalars, scalar_inputs, nonconsts = \
+            opt.scalarconsts_rest(denom.owner.inputs)
+    # scalar_inputs are potentially dimshuffled and fill'd scalars
+    if len(nonconsts) != 1:
+        return
+    exp_node = nonconsts[0].owner
+    if not (exp_node and exp_node.op == tensor.exp):
+        return
+    # the constant terms of the sum must add up to 1
+    if scalars and numpy.allclose(numpy.sum(scalars), 1):
+        return opt._fill_chain(
+                sigmoid(tensor.neg(exp_node.inputs[0])),
+                scalar_inputs)
+
+@opt.register_specialize
+@gof.local_optimizer([tensor.sub])
+def local_1msigmoid(node):
+    """
+    1-sigm(x) -> sigm(-x)
+    """
+    # speed-only rewrite, so the client count of the sigmoid is checked
+    if not (node.op == tensor.sub):
+        return
+    minuend, subtrahend = node.inputs
+    if len(subtrahend.clients) > 1:
+        return # we probably need both sigm and 1-sigm
+    sigm_node = subtrahend.owner
+    if not (sigm_node and sigm_node.op == sigmoid):
+        return
+    try:
+        left_value = opt.get_constant_value(minuend)
+    except Exception:
+        # get_constant_value failed; presumably the left operand is not a
+        # constant, so there is nothing to rewrite
+        return
+    # NOTE(review): the test is on the *sum* of the constant's entries --
+    # confirm that is intended for non-scalar constants.
+    if numpy.allclose(numpy.sum(left_value), 1):
+        return [sigmoid(-sigm_node.inputs[0])]
+
--- a/theano/tensor/nnet/tests/test_sigm.py
+++ b/theano/tensor/nnet/tests/test_sigm.py
+import unittest
+import theano
+from theano import tensor as T
+from theano import gof
+import numpy
+from theano.tests import unittest_tools as utt
+from theano.tensor.tests import test_basic as TT
+
+from theano.tensor.nnet import *
+
+
+class T_sigmoid(unittest.TestCase):
+    # Gradient check for the elementwise sigmoid op.
+    def setUp(self):
+        utt.seed_rng()
+    def test_elemwise(self):
+        # verify_grad is run on one random 3x4 input
+        sample = numpy.random.rand(3,4)
+        utt.verify_grad(sigmoid, [sample])
+
+class T_softplus(unittest.TestCase):
+    # Gradient check for the elementwise softplus op.
+    def setUp(self):
+        utt.seed_rng()
+    def test_elemwise(self):
+        # verify_grad is run on one random 3x4 input
+        sample = numpy.random.rand(3,4)
+        utt.verify_grad(softplus, [sample])
+
+
+class T_sigmoid_opts(unittest.TestCase):
+    # Checks that the sigmoid graph rewrites produce the expected ops in
+    # the compiled function.
+    def test_exp_over_1_plus_exp(self):
+        # FAST_COMPILE is swapped for FAST_RUN before compiling
+        m = theano.config.mode
+        if m == 'FAST_COMPILE':
+            m = 'FAST_RUN'
+
+        x = T.dvector()
+
+        # tests exp_over_1_plus_exp
+        f = theano.function([x], T.exp(x)/(1+T.exp(x)), mode=m)
+        #theano.printing.debugprint(f)
+        assert [node.op for node in f.maker.env.toposort()] == [sigmoid]
+
+        # tests inv_1_plus_exp
+        f = theano.function([x], T.fill(x,1.0) / (1+T.exp(-x)), mode=m)
+        #theano.printing.debugprint(f)
+        assert [node.op for node in f.maker.env.toposort()] == [sigmoid]
+
+        # tests inv_1_plus_exp with neg
+        f = theano.function([x], T.fill(x,-1.0) / (1+T.exp(-x)), mode=m)
+        #theano.printing.debugprint(f)
+        assert [node.op for node in f.maker.env.toposort()] == [sigmoid, 
+                T.inplace.neg_inplace]
+
+        # tests double inv_1_plus_exp with neg
+        f = theano.function([x], (T.fill(x,-1.0)*T.exp(x)) / ((1+T.exp(x))*(1+T.exp(-x))), mode=m)
+        #theano.printing.debugprint(f)
+        assert [node.op for node in f.maker.env.toposort()] == [sigmoid, 
+                T.mul]
+
+    def test_1msigmoid(self):
+        # FAST_COMPILE is swapped for FAST_RUN before compiling
+        m = theano.config.mode
+        if m == 'FAST_COMPILE':
+            m = 'FAST_RUN'
+
+        x = T.fmatrix()
+
+        # tests 1msigmoid applied on top of exp_over_1_plus_exp
+        f = theano.function([x], 1 - T.exp(x)/(1+T.exp(x)), mode=m)
+        #theano.printing.debugprint(f)
+        # T.neg is used explicitly: this module imports theano.tensor as T
+        # and should not depend on a `tensor` name leaked by the star-import
+        assert [node.op for node in f.maker.env.toposort()] == [T.neg, sigmoid_inplace]
+
+        # tests 1msigmoid applied on top of inv_1_plus_exp
+        f = theano.function([x], 1 - T.fill(x,1.0) / (1+T.exp(-x)), mode=m)
+        #theano.printing.debugprint(f)
+        assert [node.op for node in f.maker.env.toposort()] == [T.neg, 
+                sigmoid_inplace]
+
+
+
+