Merge pull request #2228 from lamblin/fix_float16

Prevent computations in float16 in scalar and elemwise

Merge pull request #2228 from lamblin/fix_float16
52cb8ec7 · Frédéric Bastien · d7071622 · 81369296 · 52cb8ec7 · 52cb8ec7
--- a/theano/scalar/basic.py
+++ b/theano/scalar/basic.py
--- a/theano/scalar/tests/test_basic.py
+++ b/theano/scalar/tests/test_basic.py
@@ -10,6 +10,7 @@ If you do want to rewrite these tests, bear in mind:
 """

 import unittest
+import numpy as np

 import theano
 from theano.gof import FunctionGraph
@@ -20,8 +21,12 @@ from theano.scalar.basic import (floats, float32, float64,
                                 ints, int8, int32, complex64,
                                 ComplexError, IntDiv, TrueDiv,
                                 Composite, add, div_proxy, clip,
-                                 and_, eq, neq, invert, mul)
-import numpy
+                                 and_, eq, neq, invert, mul, Scalar)
+from theano.scalar.basic import (
+    true_div, inv, log, log2, log10, log1p, exp, exp2, expm1, sqrt, deg2rad,
+    rad2deg, cos, arccos, sin, arcsin, tan, arctan, arctan2, cosh, arccosh,
+    sinh, arcsinh, tanh, arctanh)
+

 def inputs():
    return floats('xyz')
@@ -75,7 +80,7 @@ class test_ScalarOps(unittest.TestCase):
        g3 = theano.gradient.grad(a3, x)
        fn3 = gof.DualLinker().accept(FunctionGraph([x], [g3])).make_function()

-        rng = numpy.random.RandomState(utt.fetch_seed())
+        rng = np.random.RandomState(utt.fetch_seed())

        ntests = 50
        for i in xrange(ntests):
@@ -235,6 +240,128 @@ class test_logical(unittest.TestCase):
            self.assertTrue(fn(a,b) == ~a, (a,))


+# This class does not inherit from unittest.TestCase, because that would
+# interfere with the "yield" mechanism that automatically generates tests; see
+# http://stackoverflow.com/questions/6689537/nose-test-generators-inside-class
+# Therefore, it needs to be named "test_..." or "Test_...", so nose can pick
+# it up by name, otherwise the tests would not be executed.
+class test_upgrade_to_float(object):
+    # Test for Ops whose output has to be floating point, even when all
+    # inputs are ints. In particular, when the inputs are int8, the output
+    # should be at least float32, not float16.
+    # Each entry below is an op followed by the int values to feed it.
+
+    unary_ops_vals = [
+        (inv, range(-127, 0) + range(1, 128)),
+        (sqrt, range(0, 128)),
+        (log, range(1, 128)),
+        (log2, range(1, 128)),
+        (log10, range(1, 128)),
+        (log1p, range(0, 128)),
+        (exp, range(-127, 89)),
+        (exp2, range(-127, 89)),
+        (expm1, range(-127, 89)),
+        (deg2rad, range(-127, 128)),
+        (rad2deg, range(-127, 128)),
+        (cos, range(-127, 128)),
+        (arccos, range(-1, 2)),
+        (cosh, range(-89, 90)),
+        (arccosh, range(1, 128)),
+        (sin, range(-127, 128)),
+        (arcsin, range(-1, 2)),
+        (sinh, range(-89, 90)),
+        (arcsinh, range(-127, 128)),
+        (tan, range(-3, 4)),
+        (arctan, range(-127, 128)),
+        (tanh, range(-127, 128)),
+        (arctanh, [0])]
+
+    binary_ops_vals = [
+        (arctan2, range(-127, 128), range(-127, 128))]
+
+    @staticmethod
+    def _test_unary(unary_op, x_range):
+        # Apply `unary_op` to an int8 input and to a float32 input, and
+        # check that both compiled functions agree on dtype and value.
+        xi = int8('xi')
+        xf = float32('xf')
+        ei = unary_op(xi)
+        fi = theano.function([xi], ei)
+        ef = unary_op(xf)
+        ff = theano.function([xf], ef)
+
+        for x_val in x_range:
+            outi = fi(x_val)
+            outf = ff(x_val)
+
+            assert outi.dtype == outf.dtype, 'incorrect dtype'
+            assert np.allclose(outi, outf), 'insufficient precision'
+
+    @staticmethod
+    def _test_binary(binary_op, x_range, y_range):
+        # Same check as for unary ops, over every (x, y) pair: the int8
+        # graph must agree with the float32 graph on dtype and value.
+        xi = int8('xi')
+        yi = int8('yi')
+        xf = float32('xf')
+        yf = float32('yf')
+        ei = binary_op(xi, yi)
+        fi = theano.function([xi, yi], ei)
+        ef = binary_op(xf, yf)
+        ff = theano.function([xf, yf], ef)
+
+        for x_val in x_range:
+            for y_val in y_range:
+                outi = fi(x_val, y_val)
+                outf = ff(x_val, y_val)
+
+                assert outi.dtype == outf.dtype, 'incorrect dtype'
+                assert np.allclose(outi, outf), 'insufficient precision'
+
+    def test_true_div(self):
+        # true_div's upcast policy is not exactly "upgrade_to_float",
+        # so the reference graph uses floatX inputs instead of float32.
+        x_range = range(-127, 128)
+        # Every non-zero divisor (zero is left out of the range).
+        y_range = range(-127, 0) + range(1, 128)
+        xi = int8('xi')
+        yi = int8('yi')
+        xf = Scalar(theano.config.floatX)('xf')
+        yf = Scalar(theano.config.floatX)('yf')
+
+        ei = true_div(xi, yi)
+        fi = theano.function([xi, yi], ei)
+
+        ef = true_div(xf, yf)
+        ff = theano.function([xf, yf], ef)
+
+        for x_val in x_range:
+            for y_val in y_range:
+                outi = fi(x_val, y_val)
+                outf = ff(x_val, y_val)
+
+                assert outi.dtype == outf.dtype, 'incorrect dtype'
+                assert np.allclose(outi, outf), 'insufficient precision'
+
+    def test_unary(self):
+        # Yield one named test per unary op. Loop variables are bound as
+        # lambda defaults so each test keeps its own op and range.
+        for unary_op, x_range in self.unary_ops_vals:
+            test = (lambda op=unary_op, xs=x_range:
+                    self._test_unary(op, xs))
+            test.description = 'test_%s' % unary_op.name
+            yield test
+
+    def test_binary(self):
+        # Yield one named test per binary op. Loop variables are bound as
+        # lambda defaults so each test keeps its own op and ranges.
+        for binary_op, x_range, y_range in self.binary_ops_vals:
+            test = (lambda op=binary_op, xs=x_range, ys=y_range:
+                    self._test_binary(op, xs, ys))
+            test.description = 'test_%s' % binary_op.name
+            yield test
+
+
 class test_complex_mod(unittest.TestCase):
    """Make sure % fails on complex numbers."""


--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -1812,7 +1812,7 @@ def round(a, mode="half_away_from_zero"):
        raise Exception("round mode %s is not implemented." % mode)


-@_scal_elemwise_with_nfunc('around', 1, -1)
+@_scal_elemwise_with_nfunc('around', 1, 1)
 def round_half_to_even(a):
    """round_half_to_even(a)"""

@@ -1952,20 +1952,20 @@ def chi2sf(x, k):


 #numpy.real(float32) return a view on the inputs.
-#@_scal_elemwise_with_nfunc('real', 1, -1)
+#@_scal_elemwise_with_nfunc('real', 1, 1)
 @_scal_elemwise
 def real(z):
    """Return real component of complex-valued tensor `z`"""
 _tensor_py_operators.real = property(real)


-@_scal_elemwise_with_nfunc('imag', 1, -1)
+@_scal_elemwise_with_nfunc('imag', 1, 1)
 def imag(z):
    """Return imaginary component of complex-valued tensor `z`"""
 _tensor_py_operators.imag = property(imag)


-@_scal_elemwise_with_nfunc('angle', 1, -1)
+@_scal_elemwise_with_nfunc('angle', 1, 1)
 def angle(z):
    """Return polar-coordinate angle of complex-valued tensor `z`"""

@@ -1975,7 +1975,7 @@ def complex(real, imag):
    """Return complex-valued tensor with `real` and `imag` components"""


-@_scal_elemwise_with_nfunc('conj', 1, -1)
+@_scal_elemwise_with_nfunc('conj', 1, 1)
 def conj(z):
    """Return the complex conjugate of `z`."""


--- a/theano/tensor/elemwise.py
+++ b/theano/tensor/elemwise.py
@@ -18,9 +18,10 @@ from theano.tensor import elemwise_cgen as cgen

 config = theano.config

-# We cannot import discrete_dtypes from tensor.basic yet,
+# We cannot import discrete_dtypes or float_dtypes from tensor.basic yet,
 # so we redefine them here
 discrete_dtypes = map(str, scalar.discrete_types)
+float_dtypes = map(str, scalar.float_types)


 # tensor depends on elemwise to provide definitions for several ops
@@ -472,14 +473,11 @@ class Elemwise(OpenMPOp):
            the input's storage. (Just like destroymap, but without the lists.)
        * nfunc_spec: either None or a tuple of three elements,
            (nfunc_name, nin, nout) such that getattr(numpy, nfunc_name)
-            implements this operation, takes nin inputs and abs(nout) outputs
-            (nout < 0 if the numpy function does not provide the option of
-            providing a numpy array to store the results in). Note that nin
-            cannot always be inferred from the scalar op's own nin field
-            because that value is sometimes 0 (meaning a variable number of
-            inputs), whereas the numpy function may not have varargs.
-            NOTE: as of now, the sign of the nout field is ignored (some work
-            needs to be done to resize the destinations when needed).
+            implements this operation, takes nin inputs and nout outputs.
+            Note that nin cannot always be inferred from the scalar op's
+            own nin field because that value is sometimes 0 (meaning a
+            variable number of inputs), whereas the numpy function may
+            not have varargs.
        """
        if inplace_pattern is None:
            inplace_pattern = {}
@@ -819,43 +817,24 @@ class Elemwise(OpenMPOp):
                out_shape.append(max(values))
        out_shape = tuple(out_shape)

-        # Commented as we don't reuse outputs now.
-        #
-        # if not self.inplace_pattern:
-        #     for output, storage in izip(node.outputs, output_storage):
-        #         odat = storage[0]
-        #         if odat is not None:
-        #             if odat.shape != out_shape:
-        #                 # It is unsafe to try to resize odat,
-        #                 # we have to allocate output storage.
-        #                 odat = None
-        #         if odat is None:
-        #             odat = numpy.ndarray(out_shape, dtype=output.type.dtype)
-        #         storage[0] = odat
-        # else:
-        #     for i, (output, storage) in enumerate(
-        #             izip(node.outputs, output_storage)):
-        #         #i is an output idx
-        #         if i in self.inplace_pattern:
-        #             odat = inputs[self.inplace_pattern[i]]
-        #         else:
-        #             odat = storage[0]
-        #             if odat is not None:
-        #                 if odat.shape != out_shape:
-        #                     # It is unsafe to try to resize odat,
-        #                     # we have to allocate output storage.
-        #                     odat = None
-        #             if odat is None:
-        #                 odat = numpy.ndarray(out_shape,
-        #                         dtype=output.type.dtype)
-        #         storage[0] = odat
-
-        ufunc_args = inputs  # + output_storage
+        ufunc_args = inputs
+        ufunc_kwargs = {}
        if self.nfunc and len(inputs) == self.nfunc_spec[1]:
            ufunc = self.nfunc
            nout = self.nfunc_spec[2]
-            if nout < 0:
-                nout = -nout
+            # Numpy ufuncs will sometimes perform operations in
+            # float16, in particular when the input is int8.
+            # This is not something that we want, and we do not
+            # do it in the C code, so we specify that the computation
+            # should be carried out in the returned dtype.
+            # This is done via the "sig" kwarg of the ufunc, its value
+            # should be something like "ff->f", where the characters
+            # represent the dtype of the inputs and outputs.
+            out_dtype = node.outputs[0].dtype
+            if out_dtype in float_dtypes and isinstance(ufunc, numpy.ufunc):
+                char = numpy.sctype2char(out_dtype)
+                sig = char * node.nin + '->' + char * node.nout
+                ufunc_kwargs['sig'] = sig
            # Unfortunately, the else case does not allow us to
            # directly feed the destination arguments to the nfunc
            # since it sometimes requires resizing. Doing this
@@ -869,7 +848,7 @@ class Elemwise(OpenMPOp):
                                      self.scalar_op.nout))
            nout = ufunc.nout

-        variables = ufunc(*ufunc_args)
+        variables = ufunc(*ufunc_args, **ufunc_kwargs)

        if nout == 1:
            variables = [variables]

--- a/theano/tensor/nnet/sigm.py
+++ b/theano/tensor/nnet/sigm.py
@@ -31,6 +31,11 @@ class ScalarSigmoid(scalar.UnaryScalarOp):
            return 0.0
        if x > 30.0:
            return 1.0
+        # If x is an int8 or uint8, numpy.exp will compute the result in
+        # half-precision (float16), where we want float32.
+        x_dtype = str(getattr(x, 'dtype', ''))
+        if x_dtype in ('int8', 'uint8'):
+            return 1.0 / (1.0 + numpy.exp(-x, sig='f'))
        return 1.0 / (1.0 + numpy.exp(-x))

    def impl(self, x):
@@ -268,8 +273,11 @@ def hard_sigmoid(x):
    Removing the slope and shift does not make it faster.

    """
-    slope = 0.2
-    shift = 0.5
+    # Use the same dtype as determined by "upgrade_to_float",
+    # and perform computation in that dtype.
+    out_dtype = scalar.upgrade_to_float(scalar.Scalar(dtype=x.dtype))[0].dtype
+    slope = tensor.constant(0.2, dtype=out_dtype)
+    shift = tensor.constant(0.5, dtype=out_dtype)
    x = (x * slope) + shift
    x = tensor.clip(x, 0, 1)
    return x
@@ -300,6 +308,11 @@ class ScalarSoftplus(scalar.UnaryScalarOp):
            return 0.0
        if x > 30.0:
            return x
+        # If x is an int8 or uint8, numpy.exp will compute the result in
+        # half-precision (float16), where we want float32.
+        x_dtype = str(getattr(x, 'dtype', ''))
+        if x_dtype in ('int8', 'uint8'):
+            return numpy.log1p(numpy.exp(x, sig='f'))
        return numpy.log1p(numpy.exp(x))

    def impl(self, x):

--- a/theano/tensor/nnet/tests/test_sigm.py
+++ b/theano/tensor/nnet/tests/test_sigm.py
@@ -16,7 +16,7 @@ from theano.tensor.nnet.sigm import (
    register_local_1msigmoid, simplify_mul,
 )
 from theano.tensor.tests.test_basic import (makeBroadcastTester, rand,
-                                            check_floatX,
+                                            check_floatX, upcast_int8_nfunc,
                                            _good_broadcast_unary_normal_no_complex)


@@ -30,8 +30,8 @@ class T_sigmoid(unittest.TestCase):

 SigmoidTester = makeBroadcastTester(
    op=sigmoid,
-    expected=lambda inputs: check_floatX(
-        inputs, 1/(1+numpy.exp(-inputs))),
+    expected=upcast_int8_nfunc(lambda inputs: check_floatX(
+        inputs, 1 / (1 + numpy.exp(-inputs)))),
    good=_good_broadcast_unary_normal_no_complex,
    #grad=_grad_broadcast_unary_normal,
    name='SigmoidTester',
@@ -39,8 +39,8 @@ SigmoidTester = makeBroadcastTester(

 UltraFastSigmoidTester = makeBroadcastTester(
    op=ultra_fast_sigmoid,
-    expected=lambda inputs: check_floatX(
-        inputs, 1/(1+numpy.exp(-inputs))),
+    expected=upcast_int8_nfunc(lambda inputs: check_floatX(
+        inputs, 1 / (1 + numpy.exp(-inputs)))),
    good=_good_broadcast_unary_normal_no_complex,
    #grad=_grad_broadcast_unary_normal,
    name='UltraFastSigmoidTester',
@@ -49,20 +49,21 @@ UltraFastSigmoidTester = makeBroadcastTester(

 HardSigmoidTester = makeBroadcastTester(
    op=hard_sigmoid,
-    expected=lambda inputs: check_floatX(
-        inputs, 1/(1+numpy.exp(-inputs))),
+    expected=upcast_int8_nfunc(lambda inputs: check_floatX(
+        inputs, 1 / (1 + numpy.exp(-inputs)))),
    good=_good_broadcast_unary_normal_no_complex,
    #grad=_grad_broadcast_unary_normal,
-    name='UltraFastSigmoidTester',
+    name='HardSigmoidTester',
 # This is an approx of the sigmoid. That is why we raise eps
    eps=1e-1)


 SoftplusTester = makeBroadcastTester(
    op=softplus,
-    expected=lambda inputs: check_floatX(
-        inputs, numpy.log1p(numpy.exp(inputs))),
-    good=_good_broadcast_unary_normal_no_complex,
+    expected=upcast_int8_nfunc(lambda inputs: check_floatX(
+        inputs, numpy.log1p(numpy.exp(inputs)))),
+    good=dict(_good_broadcast_unary_normal_no_complex,
+              int8=[numpy.arange(-127, 89, dtype='int8')]),
    #grad=_grad_broadcast_unary_normal,
    name='SoftplusTester',
 )

--- a/theano/tensor/tests/test_basic.py
+++ b/theano/tensor/tests/test_basic.py