Make sure MRG uniform in float16 does not return 0

Also update Python code to be consistent.

Make sure MRG uniform in float16 does not return 0
5647b421 · Pascal Lamblin · 9df6ce4e · 5647b421 · 5647b421
--- a/theano/gpuarray/rng_mrg.py
+++ b/theano/gpuarray/rng_mrg.py
@@ -61,18 +61,21 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
            otype = 'ga_half'
            # limit the values of the state that we use.
            mask = '& 0x7fff'
-            NORM = '3.0518e-05f'  # numpy.float16(1.0/(2**15+8))
+            offset = '+ 1'
+            NORM = '3.0458e-05f'  # numpy.float16(1.0/(2**15+33))
            # this was determined by finding the biggest number such that
-            # numpy.float16(number * (M1 & 0x7fff)) < 1.0
+            # numpy.float16(number * ((M1 & 0x7fff) + 1)) < 1.0
        elif self.output_type.dtype == 'float32':
            otype = 'float'
            mask = ''
+            offset = ''
            NORM = '4.6566126e-10f'  # numpy.float32(1.0/(2**31+65))
            # this was determined by finding the biggest number such that
            # numpy.float32(number * M1) < 1.0
        elif self.output_type.dtype == 'float64':
            otype = 'double'
            mask = ''
+            offset = ''
            NORM = '4.656612873077392578125e-10'
        else:
            raise ValueError('Unsupported data type for output',
@@ -143,11 +146,11 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
                x21 = y2;

                if (x11 <= x21) {
-                    sample_data[i] = %(write)s(((x11 - x21 + M1) %(mask)s) * %(NORM)s);
+                    sample_data[i] = %(write)s((((x11 - x21 + M1) %(mask)s) %(offset)s) * %(NORM)s);
                }
                else
                {
-                    sample_data[i] = %(write)s(((x11 - x21) %(mask)s) * %(NORM)s);
+                    sample_data[i] = %(write)s((((x11 - x21) %(mask)s) %(offset)s) * %(NORM)s);
                }
            }

@@ -299,7 +302,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
                   """ % dict(fail=sub['fail']))

    def c_code_cache_version(self):
-        return (15,)
+        return (16,)


 @register_opt2([mrg_uniform], 'fast_compile')

--- a/theano/sandbox/rng_mrg.py
+++ b/theano/sandbox/rng_mrg.py
@@ -234,7 +234,7 @@ def ff_2p72(rstate):
    return multMatVect(rstate, A1p72, M1, A2p72, M2)


-def mrg_next_value(rstate, new_rstate):
+def mrg_next_value(rstate, new_rstate, NORM, mask, offset):
    # TODO : need description for method, parameter and return
    x11, x12, x13, x21, x22, x23 = rstate
    assert type(x11) == np.int32
@@ -279,9 +279,9 @@ def mrg_next_value(rstate, new_rstate):
    new_rstate[...] = [x11, x12, x13, x21, x22, x23]
    assert new_rstate.dtype == np.int32
    if (x11 <= x21):
-        return (x11 - x21 + M1) * NORM
+        return (((x11 - x21 + M1) & mask) + offset) * NORM
    else:
-        return (x11 - x21) * NORM
+        return (((x11 - x21) & mask) + offset) * NORM


 class mrg_uniform_base(Op):
@@ -330,6 +330,7 @@ class mrg_uniform_base(Op):

 class mrg_uniform(mrg_uniform_base):
    # CPU VERSION
+    _f16_ok = True

    def make_node(self, rstate, size):
        # error checking slightly redundant here, since
@@ -374,12 +375,25 @@ class mrg_uniform(mrg_uniform_base):
        n_streams, _ = rstate.shape

        rval = np.zeros(n_elements, dtype=self.output_type.dtype)
+        if rval.dtype == 'float16':
+            mask = 0x7fff
+            offset = 1
+            NORM = np.float16(3.0458e-05)
+        elif rval.dtype == 'float32':
+            mask = 0xffffffff
+            offset = 0
+            NORM = np.float32(4.6566126e-10)
+        elif rval.dtype == 'float64':
+            mask = 0xffffffff
+            offset = 0
+            NORM = 4.656612873077392578125e-10  # 1./2^31

        err_orig = np.seterr(over='ignore')
        try:
            for i in xrange(n_elements):
                sample = mrg_next_value(rstate[i % n_streams],
-                                        rstate[i % n_streams])
+                                        rstate[i % n_streams],
+                                        NORM=NORM, mask=mask, offset=offset)
                rval[i] = sample
        finally:
            np.seterr(**err_orig)
@@ -476,6 +490,9 @@ class mrg_uniform(mrg_uniform_base):
        # TensorType, something is wrong (likely one of the GPU ops
        # not defining C code correctly).
        assert isinstance(node.inputs[0].type, TensorType)
+        if self.output_type.dtype == 'float16':
+            # C code is not tested, fall back to Python
+            super(mrg_uniform, self).c_code(node, name, inp, out, sub)
        return """
        //////// <code generated by mrg_uniform>
        npy_int64 odims_i;
@@ -592,7 +609,7 @@ class mrg_uniform(mrg_uniform_base):
                   """ % dict(fail=sub['fail']))

    def c_code_cache_version(self):
-        return (9,)
+        return (10,)


 def guess_n_streams(size, warn=False):