提交 4b634d24 作者: abergeron 提交者: GitHub

Merge pull request #6097 from lamblin/mrg_uniform_f16

Make sure MRG uniform in float16 do not return 0
......@@ -61,18 +61,21 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
otype = 'ga_half'
# limit the values of the state that we use.
mask = '& 0x7fff'
NORM = '3.0518e-05f' # numpy.float16(1.0/(2**15+8))
offset = '+ 1'
NORM = '3.0458e-05f' # numpy.float16(1.0/(2**15+33))
# this was determined by finding the biggest number such that
# numpy.float16(number * (M1 & 0x7fff)) < 1.0
# numpy.float16(number * ((M1 & 0x7fff) + 1)) < 1.0
elif self.output_type.dtype == 'float32':
otype = 'float'
mask = ''
offset = ''
NORM = '4.6566126e-10f' # numpy.float32(1.0/(2**31+65))
# this was determined by finding the biggest number such that
# numpy.float32(number * M1) < 1.0
elif self.output_type.dtype == 'float64':
otype = 'double'
mask = ''
offset = ''
NORM = '4.656612873077392578125e-10'
else:
raise ValueError('Unsupported data type for output',
......@@ -143,11 +146,11 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
x21 = y2;
if (x11 <= x21) {
sample_data[i] = %(write)s(((x11 - x21 + M1) %(mask)s) * %(NORM)s);
sample_data[i] = %(write)s((((x11 - x21 + M1) %(mask)s) %(offset)s) * %(NORM)s);
}
else
{
sample_data[i] = %(write)s(((x11 - x21) %(mask)s) * %(NORM)s);
sample_data[i] = %(write)s((((x11 - x21) %(mask)s) %(offset)s) * %(NORM)s);
}
}
......@@ -299,7 +302,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
""" % dict(fail=sub['fail']))
def c_code_cache_version(self):
    """Return the cache version tag for the generated C code.

    Bumped from (15,) to (16,) because the kernel code changed (the
    float16 path now adds an offset before scaling by NORM, so uniform
    samples can no longer round down to exactly 0).  The diff residue
    left two return statements; the second was unreachable — keep only
    the post-change version.
    """
    return (16,)
@register_opt2([mrg_uniform], 'fast_compile')
......
......@@ -9,6 +9,7 @@ from theano.configparser import change_flags
from theano.sandbox import rng_mrg
from theano.sandbox.rng_mrg import MRG_RandomStreams
from theano.sandbox.tests.test_rng_mrg import java_samples, rng_mrg_overflow
from theano.sandbox.tests.test_rng_mrg import test_f16_nonzero as cpu_f16_nonzero
from theano.tests import unittest_tools as utt
from .config import mode_with_gpu as mode
......@@ -162,3 +163,7 @@ def test_validate_input_types_gpuarray_backend():
rstate = np.zeros((7, 6), dtype="int32")
rstate = gpuarray_shared_constructor(rstate)
rng_mrg.mrg_uniform.new(rstate, ndim=None, dtype="float32", size=(3,))
def test_f16_nonzero():
    """GPU variant of the float16 non-zero check.

    Delegates to the shared CPU test, but compiles in the GPU mode and
    verifies the graph actually contains a ``GPUA_mrg_uniform`` node.
    """
    cpu_f16_nonzero(mode=mode, op_to_check=GPUA_mrg_uniform)
......@@ -234,7 +234,7 @@ def ff_2p72(rstate):
return multMatVect(rstate, A1p72, M1, A2p72, M2)
def mrg_next_value(rstate, new_rstate):
def mrg_next_value(rstate, new_rstate, NORM, mask, offset):
# TODO : need description for method, parameter and return
x11, x12, x13, x21, x22, x23 = rstate
assert type(x11) == np.int32
......@@ -279,9 +279,9 @@ def mrg_next_value(rstate, new_rstate):
new_rstate[...] = [x11, x12, x13, x21, x22, x23]
assert new_rstate.dtype == np.int32
if (x11 <= x21):
return (x11 - x21 + M1) * NORM
return (((x11 - x21 + M1) & mask) + offset) * NORM
else:
return (x11 - x21) * NORM
return (((x11 - x21) & mask) + offset) * NORM
class mrg_uniform_base(Op):
......@@ -330,6 +330,7 @@ class mrg_uniform_base(Op):
class mrg_uniform(mrg_uniform_base):
# CPU VERSION
_f16_ok = True
def make_node(self, rstate, size):
# error checking slightly redundant here, since
......@@ -374,12 +375,25 @@ class mrg_uniform(mrg_uniform_base):
n_streams, _ = rstate.shape
rval = np.zeros(n_elements, dtype=self.output_type.dtype)
if rval.dtype == 'float16':
mask = 0x7fff
offset = 1
NORM = np.float16(3.0458e-05)
elif rval.dtype == 'float32':
mask = 0xffffffff
offset = 0
NORM = np.float32(4.6566126e-10)
elif rval.dtype == 'float64':
mask = 0xffffffff
offset = 0
NORM = 4.656612873077392578125e-10 # 1./2^31
err_orig = np.seterr(over='ignore')
try:
for i in xrange(n_elements):
sample = mrg_next_value(rstate[i % n_streams],
rstate[i % n_streams])
rstate[i % n_streams],
NORM=NORM, mask=mask, offset=offset)
rval[i] = sample
finally:
np.seterr(**err_orig)
......@@ -476,6 +490,9 @@ class mrg_uniform(mrg_uniform_base):
# TensorType, something is wrong (likely one of the GPU ops
# not defining C code correctly).
assert isinstance(node.inputs[0].type, TensorType)
if self.output_type.dtype == 'float16':
# C code is not tested, fall back to Python
super(mrg_uniform, self).c_code(node, name, inp, out, sub)
return """
//////// <code generated by mrg_uniform>
npy_int64 odims_i;
......@@ -592,7 +609,7 @@ class mrg_uniform(mrg_uniform_base):
""" % dict(fail=sub['fail']))
def c_code_cache_version(self):
    """Return the cache version tag for the generated C code.

    Bumped from (9,) to (10,) alongside the float16 sampling change
    (float16 now falls back to the Python implementation so uniform
    samples cannot be exactly 0).  The diff residue left two return
    statements; the second was unreachable — keep only the post-change
    version.
    """
    return (10,)
def guess_n_streams(size, warn=False):
......
......@@ -742,6 +742,16 @@ def test_undefined_grad():
(avg, std))
def test_f16_nonzero(mode=None, op_to_check=rng_mrg.mrg_uniform):
    """Check that float16 MRG uniform samples lie strictly in (0, 1).

    Parameters
    ----------
    mode : theano compilation mode, optional
        Mode to compile the sampling function with (e.g. a GPU mode).
    op_to_check : Op class
        The uniform-sampling Op expected to appear in the compiled graph.
    """
    stream = MRG_RandomStreams(seed=utt.fetch_seed())
    sample = stream.uniform(size=(1000, 1000), dtype='float16')
    assert sample.dtype == 'float16', sample.type

    fn = theano.function([], sample, mode=mode)
    # Make sure the expected Op was not optimized away / replaced.
    nodes = fn.maker.fgraph.apply_nodes
    assert any(isinstance(node.op, op_to_check) for node in nodes)

    values = fn()
    # Strict bounds: the whole point of the fix is that 0.0 never appears.
    assert np.all(0 < values)
    assert np.all(values < 1)
if __name__ == "__main__":
rng = MRG_RandomStreams(np.random.randint(2147462579))
print(theano.__file__)
......
Markdown 格式
0%
您即将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论