Merge pull request #6401 from mrTsjolder/master

Implement truncated normal with box-muller

Merge pull request #6401 from mrTsjolder/master
25dfa312 · Frédéric Bastien · GitHub · e3c95974 · e29586c9 · 25dfa312
--- a/theano/gpuarray/multinomial.py
+++ b/theano/gpuarray/multinomial.py
@@ -39,11 +39,11 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op):
        return [gpuarray_helper_inc_dir()]

    def make_node(self, pvals, unis):
-        assert unis.dtype == pvals.dtype
-        assert pvals.dtype in ['float32', 'float16', 'float64']
        ctx_name = infer_context_name(pvals, unis)
        pvals = as_gpuarray_variable(pvals, ctx_name)
        unis = as_gpuarray_variable(unis, ctx_name)
+        assert pvals.dtype in ['float32', 'float16', 'float64']
+        assert unis.dtype in ['float32', 'float16', 'float64']

        if pvals.ndim != 2:
            raise NotImplementedError('pvals ndim should be 2', pvals.ndim)
@@ -62,7 +62,8 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op):

    def gpu_kernels(self, node, name):
        out_ctype = pygpu.gpuarray.dtype_to_ctype(node.outputs[0].dtype)
-        in_ctype = pygpu.gpuarray.dtype_to_ctype(node.inputs[0].dtype)
+        pvals_ctype = pygpu.gpuarray.dtype_to_ctype(node.inputs[0].dtype)
+        unis_ctype = pygpu.gpuarray.dtype_to_ctype(node.inputs[1].dtype)
        work_ctype = pygpu.gpuarray.dtype_to_ctype(work_dtype(node.inputs[0].dtype))
        write_out_ctype = write_w(node.outputs[0].dtype)
        load_in_ctype = load_w(node.inputs[0].dtype)
@@ -71,11 +72,11 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op):
 KERNEL void k_multi_warp_multinomial(
    const ga_size nb_multi,
    const ga_size nb_outcomes,
-    GLOBAL_MEM %(in_ctype)s *global_pvals,
+    GLOBAL_MEM %(pvals_ctype)s *global_pvals,
    const ga_size global_pvals_offset,
    const ga_ssize pvals_row_stride,
    const ga_ssize pvals_col_stride,
-    GLOBAL_MEM %(in_ctype)s *global_unis,
+    GLOBAL_MEM %(unis_ctype)s *global_unis,
    const ga_size global_unis_offset,
    const ga_ssize unis_stride,
    GLOBAL_MEM %(out_ctype)s *global_outs,
@@ -84,8 +85,8 @@ KERNEL void k_multi_warp_multinomial(
    const ga_ssize outs_col_stride
 )
 {
-    global_pvals = (GLOBAL_MEM %(in_ctype)s *)(((GLOBAL_MEM char *)global_pvals) + global_pvals_offset);
-    global_unis = (GLOBAL_MEM %(in_ctype)s *)(((GLOBAL_MEM char *)global_unis) + global_unis_offset);
+    global_pvals = (GLOBAL_MEM %(pvals_ctype)s *)(((GLOBAL_MEM char *)global_pvals) + global_pvals_offset);
+    global_unis = (GLOBAL_MEM %(unis_ctype)s *)(((GLOBAL_MEM char *)global_unis) + global_unis_offset);
    global_outs = (GLOBAL_MEM %(out_ctype)s *)(((GLOBAL_MEM char *)global_outs) + global_outs_offset);
    // each thread takes care of one multinomial draw
    int n = LDIM_0*GID_0 + LID_0;
@@ -113,7 +114,8 @@ KERNEL void k_multi_warp_multinomial(
    }
 }
 """ % dict(out_ctype=out_ctype, write_out_ctype=write_out_ctype,
-           work_ctype=work_ctype, in_ctype=in_ctype, load_in_ctype=load_in_ctype)
+           work_ctype=work_ctype, pvals_ctype=pvals_ctype,
+           unis_ctype=unis_ctype, load_in_ctype=load_in_ctype)
        return [Kernel(
            code=code, name="k_multi_warp_multinomial",
            params=[pygpu.gpuarray.SIZE,
@@ -139,7 +141,8 @@ KERNEL void k_multi_warp_multinomial(
        ctx = sub['params']
        kname = self.gpu_kernels(node, name)[0].objvar
        out_typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
-        in_typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
+        pvals_typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
+        unis_typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[1].dtype)
        s = """
        PyGpuArrayObject * pvals = %(pvals)s;
        PyGpuArrayObject * unis = %(unis)s;
@@ -201,7 +204,15 @@ KERNEL void k_multi_warp_multinomial(

        assert(nb_blocks*nb_threads >= nb_multi);

-        int err = k_multi_warp_multinomial_call(1, &nb_blocks, &nb_threads, 0,  PyGpuArray_DIMS(out)[1], PyGpuArray_DIMS(out)[0], pvals->ga.data, pvals->ga.offset, PyGpuArray_STRIDES(pvals)[0]/gpuarray_get_elsize(%(in_typecode)s), PyGpuArray_STRIDES(pvals)[1]/gpuarray_get_elsize(%(in_typecode)s), unis->ga.data, unis->ga.offset, PyGpuArray_STRIDES(unis)[0]/gpuarray_get_elsize(%(in_typecode)s), out->ga.data, out->ga.offset, PyGpuArray_STRIDES(out)[0]/gpuarray_get_elsize(%(out_typecode)s), PyGpuArray_STRIDES(out)[1]/gpuarray_get_elsize(%(out_typecode)s));
+        int err = k_multi_warp_multinomial_call(
+          1, &nb_blocks, &nb_threads, 0,
+          PyGpuArray_DIMS(out)[1], PyGpuArray_DIMS(out)[0], pvals->ga.data, pvals->ga.offset,
+          PyGpuArray_STRIDES(pvals)[0]/gpuarray_get_elsize(%(pvals_typecode)s),
+          PyGpuArray_STRIDES(pvals)[1]/gpuarray_get_elsize(%(pvals_typecode)s),
+          unis->ga.data, unis->ga.offset,
+          PyGpuArray_STRIDES(unis)[0]/gpuarray_get_elsize(%(unis_typecode)s), out->ga.data,
+          out->ga.offset, PyGpuArray_STRIDES(out)[0]/gpuarray_get_elsize(%(out_typecode)s),
+          PyGpuArray_STRIDES(out)[1]/gpuarray_get_elsize(%(out_typecode)s));

        if (err != GA_NO_ERROR) {
           PyErr_Format(
@@ -218,7 +229,7 @@ KERNEL void k_multi_warp_multinomial(
        return s

    def c_code_cache_version(self):
-        return (6,)
+        return (7,)


 class GPUAChoiceFromUniform(GpuKernelBase, Op):

--- a/theano/gpuarray/rng_mrg.py
+++ b/theano/gpuarray/rng_mrg.py
@@ -271,7 +271,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
        if (n_streams > n_elements)
          n_streams = n_elements;

-        {
+        if (n_elements > 0){
          size_t ls = 0, gs = 0;
          int err = GpuKernel_sched(&%(kname)s, n_streams, &ls, &gs);
          if (err != GA_NO_ERROR) {
@@ -303,7 +303,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
                   """ % dict(fail=sub['fail']))

    def c_code_cache_version(self):
-        return (16,)
+        return (17,)


 @register_opt2([mrg_uniform], 'fast_compile')

--- a/theano/sandbox/rng_mrg.py
+++ b/theano/sandbox/rng_mrg.py
--- a/theano/sandbox/tests/test_rng_mrg.py
+++ b/theano/sandbox/tests/test_rng_mrg.py
@@ -298,7 +298,8 @@ def test_broadcastable():
    pvals_2 = R.uniform(size=size2)
    pvals_2 = pvals_2 / tensor.sum(pvals_2)

-    for distribution in [R.uniform, R.binomial, R.multinomial, R.multinomial_wo_replacement, R.normal]:
+    for distribution in [R.uniform, R.normal, R.truncated_normal,
+                         R.binomial, R.multinomial, R.multinomial_wo_replacement]:
        # multinomial or multinomial_wo_replacement does not support "size" argument,
        # the sizes of them are implicitly defined with "pvals" argument.
        if distribution in [R.multinomial, R.multinomial_wo_replacement]:
@@ -378,7 +379,6 @@ def t_binomial(mean, size, const_size, var_input, input, steps, rtol):

 @attr('slow')
 def test_normal0():
-
    steps = 50
    std = 2.
    if (config.mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE'] or
@@ -391,7 +391,7 @@ def test_normal0():
    sample_size_odd = (sample_size[0], sample_size[1] - 1)
    x = tensor.matrix()

-    for size, const_size, var_input, input, avg, rtol, std_tol in [
+    test_cases = [
        (sample_size, sample_size, [], [], -5., default_rtol, default_rtol),
        (x.shape, sample_size, [x],
         [np.zeros(sample_size, dtype=config.floatX)],
@@ -409,8 +409,9 @@ def test_normal0():
        # test with few samples at the same time
        ((1,), (1,), [], [], -5., default_rtol, 0.02),
        ((3,), (3,), [], [], -5., default_rtol, 0.02),
-            ]:
+    ]

+    for size, const_size, var_input, input, avg, rtol, std_tol in test_cases:
        R = MRG_RandomStreams(234)
        # Note: we specify `nstreams` to avoid a warning.
        n = R.normal(size=size, avg=avg, std=std,
@@ -438,6 +439,126 @@ def test_normal0():
                  prefix='numpy ', allow_01=True, inputs=input, mean_rtol=rtol)


+@attr('slow')
+def test_normal_truncation():
+    # just a copy of test_normal0 with extra bound check
+    steps = 50
+    std = 2.
+    # standard deviation is slightly less than for a regular Gaussian
+    # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.)
+    target_std = .87962566103423978 * std
+
+    if (config.mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE'] or
+            config.mode == 'Mode' and config.linker in ['py']):
+        sample_size = (25, 30)
+        default_rtol = .02
+    else:
+        sample_size = (999, 50)
+        default_rtol = .01
+    sample_size_odd = (sample_size[0], sample_size[1] - 1)
+    x = tensor.matrix()
+
+    test_cases = [
+        (sample_size, sample_size, [], [], -5., default_rtol, default_rtol),
+        (x.shape, sample_size, [x],
+         [np.zeros(sample_size, dtype=config.floatX)],
+         -5., default_rtol, default_rtol),
+        # test odd value
+        (x.shape, sample_size_odd, [x],
+         [np.zeros(sample_size_odd, dtype=config.floatX)],
+         -5., default_rtol, default_rtol),
+        (sample_size, sample_size, [], [],
+         np.arange(np.prod(sample_size),
+                   dtype='float32').reshape(sample_size),
+         10. * std / np.sqrt(steps), default_rtol),
+        # test empty size (scalar)
+        ((), (), [], [], -5., default_rtol, 0.02),
+        # test with few samples at the same time
+        ((1,), (1,), [], [], -5., default_rtol, 0.02),
+        ((3,), (3,), [], [], -5., default_rtol, 0.02),
+    ]
+
+    for size, const_size, var_input, input, avg, rtol, std_tol in test_cases:
+        R = MRG_RandomStreams(234)
+        # Note: we specify `nstreams` to avoid a warning.
+        n = R.normal(size=size, avg=avg, std=std, truncate=True,
+                     nstreams=rng_mrg.guess_n_streams(size, warn=False))
+        f = theano.function(var_input, n)
+
+        # check if truncated at 2*std
+        samples = f(*input)
+        assert np.all(avg + 2 * std - samples >= 0), \
+            ("bad upper bound? %s %s" % (samples, avg + 2 * std))
+        assert np.all(samples - (avg - 2 * std) >= 0), \
+            ("bad lower bound? %s %s" % (samples, avg - 2 * std))
+
+        # Increase the number of steps if size implies only a few samples
+        if np.prod(const_size) < 10:
+            steps_ = steps * 50
+        else:
+            steps_ = steps
+        basictest(f, steps_, const_size, target_avg=avg, target_std=target_std,
+                  prefix='mrg ', allow_01=True, inputs=input,
+                  mean_rtol=rtol, std_tol=std_tol)
+
+        sys.stdout.flush()
+
+
+@attr('slow')
+def test_truncated_normal():
+    # just a copy of test_normal0 for truncated normal
+    steps = 50
+    std = 2.
+
+    if (config.mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE'] or
+            config.mode == 'Mode' and config.linker in ['py']):
+        sample_size = (25, 30)
+        default_rtol = .02
+    else:
+        sample_size = (999, 50)
+        default_rtol = .01
+    sample_size_odd = (sample_size[0], sample_size[1] - 1)
+    x = tensor.matrix()
+
+    test_cases = [
+        (sample_size, sample_size, [], [], -5., default_rtol, default_rtol),
+        (x.shape, sample_size, [x],
+         [np.zeros(sample_size, dtype=config.floatX)],
+         -5., default_rtol, default_rtol),
+        # test odd value
+        (x.shape, sample_size_odd, [x],
+         [np.zeros(sample_size_odd, dtype=config.floatX)],
+         -5., default_rtol, default_rtol),
+        (sample_size, sample_size, [], [],
+         np.arange(np.prod(sample_size),
+                   dtype='float32').reshape(sample_size),
+         10. * std / np.sqrt(steps), default_rtol),
+        # test empty size (scalar)
+        ((), (), [], [], -5., default_rtol, 0.02),
+        # test with few samples at the same time
+        ((1,), (1,), [], [], -5., default_rtol, 0.02),
+        ((3,), (3,), [], [], -5., default_rtol, 0.02),
+    ]
+
+    for size, const_size, var_input, input, avg, rtol, std_tol in test_cases:
+        R = MRG_RandomStreams(234)
+        # Note: we specify `nstreams` to avoid a warning.
+        n = R.truncated_normal(size=size, avg=avg, std=std,
+                               nstreams=rng_mrg.guess_n_streams(size, warn=False))
+        f = theano.function(var_input, n)
+
+        # Increase the number of steps if size implies only a few samples
+        if np.prod(const_size) < 10:
+            steps_ = steps * 60
+        else:
+            steps_ = steps
+        basictest(f, steps_, const_size, target_avg=avg, target_std=std,
+                  prefix='mrg ', allow_01=True, inputs=input,
+                  mean_rtol=rtol, std_tol=std_tol)
+
+        sys.stdout.flush()
+
+
 def basic_multinomialtest(f, steps, sample_size, target_pvals, n_samples,
                          prefix="", mean_rtol=0.04):

@@ -519,6 +640,7 @@ class T_MRG(unittest.TestCase):
            self.assertRaises(ValueError, R.binomial, size)
            self.assertRaises(ValueError, R.multinomial, size, 1, [])
            self.assertRaises(ValueError, R.normal, size)
+            self.assertRaises(ValueError, R.truncated_normal, size)


 def test_multiple_rng_aliasing():
@@ -734,6 +856,19 @@ def test_undefined_grad():
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out,
                  (avg, std))

+    # checking truncated normal distribution
+    avg = tensor.scalar()
+    out = srng.truncated_normal((), avg=avg)
+    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, avg)
+
+    std = tensor.scalar()
+    out = srng.truncated_normal((), avg=0, std=std)
+    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, std)
+
+    out = srng.truncated_normal((), avg=avg, std=std)
+    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out,
+                  (avg, std))
+

 def test_f16_nonzero(mode=None, op_to_check=rng_mrg.mrg_uniform):
    srng = MRG_RandomStreams(seed=utt.fetch_seed())
@@ -755,6 +890,8 @@ def test_target_parameter():
        assert isinstance(f(), np.ndarray)

    basic_target_parameter_test(srng.uniform((3, 2), target='cpu'))
+    basic_target_parameter_test(srng.normal((3, 2), target='cpu'))
+    basic_target_parameter_test(srng.truncated_normal((3, 2), target='cpu'))
    basic_target_parameter_test(srng.binomial((3, 2), target='cpu'))
    basic_target_parameter_test(srng.multinomial(pvals=pvals.astype('float32'), target='cpu'))
    basic_target_parameter_test(srng.choice(p=pvals.astype('float32'), replace=False, target='cpu'))