Merge pull request #6401 from mrTsjolder/master

Implement truncated normal with box-muller

Merge pull request #6401 from mrTsjolder/master
25dfa312 · Frédéric Bastien · GitHub · e3c95974 · e29586c9 · 25dfa312
--- a/theano/gpuarray/multinomial.py
+++ b/theano/gpuarray/multinomial.py
@@ -39,11 +39,11 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op):
        return [gpuarray_helper_inc_dir()]
    def make_node(self, pvals, unis):
-        assert unis.dtype == pvals.dtype
-        assert pvals.dtype in ['float32', 'float16', 'float64']
        ctx_name = infer_context_name(pvals, unis)
        pvals = as_gpuarray_variable(pvals, ctx_name)
        unis = as_gpuarray_variable(unis, ctx_name)
+        assert pvals.dtype in ['float32', 'float16', 'float64']
+        assert unis.dtype in ['float32', 'float16', 'float64']
        if pvals.ndim != 2:
            raise NotImplementedError('pvals ndim should be 2', pvals.ndim)
@@ -62,7 +62,8 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op):
    def gpu_kernels(self, node, name):
        out_ctype = pygpu.gpuarray.dtype_to_ctype(node.outputs[0].dtype)
-        in_ctype = pygpu.gpuarray.dtype_to_ctype(node.inputs[0].dtype)
+        pvals_ctype = pygpu.gpuarray.dtype_to_ctype(node.inputs[0].dtype)
+        unis_ctype = pygpu.gpuarray.dtype_to_ctype(node.inputs[1].dtype)
        work_ctype = pygpu.gpuarray.dtype_to_ctype(work_dtype(node.inputs[0].dtype))
        write_out_ctype = write_w(node.outputs[0].dtype)
        load_in_ctype = load_w(node.inputs[0].dtype)
@@ -71,11 +72,11 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op):
 KERNEL void k_multi_warp_multinomial(
    const ga_size nb_multi,
    const ga_size nb_outcomes,
-    GLOBAL_MEM %(in_ctype)s *global_pvals,
+    GLOBAL_MEM %(pvals_ctype)s *global_pvals,
    const ga_size global_pvals_offset,
    const ga_ssize pvals_row_stride,
    const ga_ssize pvals_col_stride,
-    GLOBAL_MEM %(in_ctype)s *global_unis,
+    GLOBAL_MEM %(unis_ctype)s *global_unis,
    const ga_size global_unis_offset,
    const ga_ssize unis_stride,
    GLOBAL_MEM %(out_ctype)s *global_outs,
@@ -84,8 +85,8 @@ KERNEL void k_multi_warp_multinomial(
    const ga_ssize outs_col_stride
 )
 {
-    global_pvals = (GLOBAL_MEM %(in_ctype)s *)(((GLOBAL_MEM char *)global_pvals) + global_pvals_offset);
+    global_pvals = (GLOBAL_MEM %(pvals_ctype)s *)(((GLOBAL_MEM char *)global_pvals) + global_pvals_offset);
-    global_unis = (GLOBAL_MEM %(in_ctype)s *)(((GLOBAL_MEM char *)global_unis) + global_unis_offset);
+    global_unis = (GLOBAL_MEM %(unis_ctype)s *)(((GLOBAL_MEM char *)global_unis) + global_unis_offset);
    global_outs = (GLOBAL_MEM %(out_ctype)s *)(((GLOBAL_MEM char *)global_outs) + global_outs_offset);
    // each thread takes care of one multinomial draw
    int n = LDIM_0*GID_0 + LID_0;
@@ -113,7 +114,8 @@ KERNEL void k_multi_warp_multinomial(
    }
 }
 """ % dict(out_ctype=out_ctype, write_out_ctype=write_out_ctype,
-           work_ctype=work_ctype, in_ctype=in_ctype, load_in_ctype=load_in_ctype)
+           work_ctype=work_ctype, pvals_ctype=pvals_ctype,
+           unis_ctype=unis_ctype, load_in_ctype=load_in_ctype)
        return [Kernel(
            code=code, name="k_multi_warp_multinomial",
            params=[pygpu.gpuarray.SIZE,
@@ -139,7 +141,8 @@ KERNEL void k_multi_warp_multinomial(
        ctx = sub['params']
        kname = self.gpu_kernels(node, name)[0].objvar
        out_typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
-        in_typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
+        pvals_typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
+        unis_typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[1].dtype)
        s = """
        PyGpuArrayObject * pvals = %(pvals)s;
        PyGpuArrayObject * unis = %(unis)s;
@@ -201,7 +204,15 @@ KERNEL void k_multi_warp_multinomial(
        assert(nb_blocks*nb_threads >= nb_multi);
-        int err = k_multi_warp_multinomial_call(1, &nb_blocks, &nb_threads, 0,  PyGpuArray_DIMS(out)[1], PyGpuArray_DIMS(out)[0], pvals->ga.data, pvals->ga.offset, PyGpuArray_STRIDES(pvals)[0]/gpuarray_get_elsize(%(in_typecode)s), PyGpuArray_STRIDES(pvals)[1]/gpuarray_get_elsize(%(in_typecode)s), unis->ga.data, unis->ga.offset, PyGpuArray_STRIDES(unis)[0]/gpuarray_get_elsize(%(in_typecode)s), out->ga.data, out->ga.offset, PyGpuArray_STRIDES(out)[0]/gpuarray_get_elsize(%(out_typecode)s), PyGpuArray_STRIDES(out)[1]/gpuarray_get_elsize(%(out_typecode)s));
+        int err = k_multi_warp_multinomial_call(
+          1, &nb_blocks, &nb_threads, 0,
+          PyGpuArray_DIMS(out)[1], PyGpuArray_DIMS(out)[0], pvals->ga.data, pvals->ga.offset,
+          PyGpuArray_STRIDES(pvals)[0]/gpuarray_get_elsize(%(pvals_typecode)s),
+          PyGpuArray_STRIDES(pvals)[1]/gpuarray_get_elsize(%(pvals_typecode)s),
+          unis->ga.data, unis->ga.offset,
+          PyGpuArray_STRIDES(unis)[0]/gpuarray_get_elsize(%(unis_typecode)s), out->ga.data,
+          out->ga.offset, PyGpuArray_STRIDES(out)[0]/gpuarray_get_elsize(%(out_typecode)s),
+          PyGpuArray_STRIDES(out)[1]/gpuarray_get_elsize(%(out_typecode)s));
        if (err != GA_NO_ERROR) {
           PyErr_Format(
@@ -218,7 +229,7 @@ KERNEL void k_multi_warp_multinomial(
        return s
    def c_code_cache_version(self):
-        return (6,)
+        return (7,)
 class GPUAChoiceFromUniform(GpuKernelBase, Op):

--- a/theano/gpuarray/rng_mrg.py
+++ b/theano/gpuarray/rng_mrg.py
@@ -271,7 +271,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
        if (n_streams > n_elements)
          n_streams = n_elements;
-        {
+        if (n_elements > 0){
          size_t ls = 0, gs = 0;
          int err = GpuKernel_sched(&%(kname)s, n_streams, &ls, &gs);
          if (err != GA_NO_ERROR) {
@@ -303,7 +303,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
                   """ % dict(fail=sub['fail']))
    def c_code_cache_version(self):
-        return (16,)
+        return (17,)
 @register_opt2([mrg_uniform], 'fast_compile')

--- a/theano/sandbox/rng_mrg.py
+++ b/theano/sandbox/rng_mrg.py
@@ -19,7 +19,6 @@ from theano.gradient import undefined_grad
 from theano import tensor
 from theano.tensor import (TensorType, as_tensor_variable, get_vector_length,
                           cast, opt, scal)
-from theano.tensor import sqrt, log, sin, cos, join, prod
 from theano.compile import optdb
 from theano.gof import local_optimizer, ParamsType
 from theano.scalar import bool as bool_t, int32 as int_t
@@ -1029,93 +1028,183 @@ class MRG_RandomStreams(object):
        return self.choice(size=n, a=None, replace=False, p=pvals,
                           dtype=dtype, nstreams=nstreams, ndim=ndim, **kwargs)
-    def normal(self, size, avg=0.0, std=1.0, ndim=None,
+    def normal(self, size, avg=0.0, std=1.0, ndim=None, dtype=None,
-               dtype=None, nstreams=None):
+               nstreams=None, truncate=False, **kwargs):
-        # TODO : need description for method
        """
+        Sample a tensor of values from a normal distribution.
        Parameters
        ----------
-        size
+        size : int_vector_like
-            Can be a list of integers or Theano variables (ex: the shape
+            Array dimensions for the output tensor.
-            of another Theano Variable).
+        avg : float_like, optional
-        dtype
+            The mean value of the normal distribution to sample from (defaults to 0.0).
-            The output data type. If dtype is not specified, it will be
+        std : float_like, optional
-            inferred from the dtype of low and high, but will be at
+            The standard deviation of the normal distribution to sample from (defaults to 1.0).
-            least as precise as floatX.
+        truncate : bool, optional
-        nstreams
+            Truncates the normal distribution at 2 standard deviations if True (defaults to False).
-            Number of streams.
+            When this flag is set, the standard deviation of the result will be less than the one specified.
+        ndim : int, optional
+            The number of dimensions for the output tensor (defaults to None).
+            This argument is necessary if the size argument is ambiguous on the number of dimensions.
+        dtype : str, optional
+            The data-type for the output tensor. If not specified,
+            the dtype is inferred from avg and std, but it is at least as precise as floatX.
+        kwargs
+            Other keyword arguments for random number generation (see uniform).
+        Returns
+        -------
+        samples : TensorVariable
+            A Theano tensor of samples randomly drawn from a normal distribution.
        """
-        # We need an even number of ]0,1[ samples. Then we split them
+        size = _check_size(size)
-        # in two halves. First half becomes our U1's for Box-Muller,
+        avg = undefined_grad(as_tensor_variable(avg))
-        # second half our U2's. See Wikipedia page:
+        std = undefined_grad(as_tensor_variable(std))
-        # http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform
-        avg = as_tensor_variable(avg)
-        avg = undefined_grad(avg)
-        std = as_tensor_variable(std)
-        std = undefined_grad(std)
        if dtype is None:
            dtype = scal.upcast(config.floatX, avg.dtype, std.dtype)
-        avg = cast(avg, dtype)
+        avg = tensor.cast(avg, dtype=dtype)
-        std = cast(std, dtype)
+        std = tensor.cast(std, dtype=dtype)
-        evened = False
+        # generate even number of uniform samples
-        constant = False
+        # Do manual constant folding to lower optimizer work.
-        if (isinstance(size, tuple) and
+        if isinstance(size, theano.Constant):
-                all([isinstance(i, (np.integer, integer_types)) for i in size])):
+            n_odd_samples = size.prod(dtype='int64')
-            constant = True
-            # Force dtype because it defaults to float when size is empty
-            n_samples = np.prod(size, dtype='int64')
-            if n_samples % 2 == 1:
-                n_samples += 1
-                evened = True
        else:
-            # if even, don't change, if odd, +1
+            n_odd_samples = tensor.prod(size, dtype='int64')
-            n_samples = prod(size) + (prod(size) % 2)
+        n_even_samples = n_odd_samples + n_odd_samples % 2
-        flattened = self.uniform(size=(n_samples,), dtype=dtype,
+        uniform = self.uniform((n_even_samples, ), low=0., high=1.,
-                                 nstreams=nstreams)
+                               ndim=1, dtype=dtype, nstreams=nstreams, **kwargs)
-        if constant:
+        # box-muller transform
-            U1 = flattened[:n_samples // 2]
+        u1 = uniform[:n_even_samples // 2]
-            U2 = flattened[n_samples // 2:]
+        u2 = uniform[n_even_samples // 2:]
+        r = tensor.sqrt(-2.0 * tensor.log(u1))
+        theta = np.array(2.0 * np.pi, dtype=dtype) * u2
+        cos_theta, sin_theta = tensor.cos(theta), tensor.sin(theta)
+        z0 = r * cos_theta
+        z1 = r * sin_theta
+        if truncate:
+            # use valid samples
+            to_fix0 = (z0 < -2.) | (z0 > 2.)
+            to_fix1 = (z1 < -2.) | (z1 > 2.)
+            z0_valid = z0[tensor.nonzero(~to_fix0)]
+            z1_valid = z1[tensor.nonzero(~to_fix1)]
+            # re-sample invalid samples
+            to_fix0 = tensor.nonzero(to_fix0)[0]
+            to_fix1 = tensor.nonzero(to_fix1)[0]
+            n_fix_samples = to_fix0.size + to_fix1.size
+            lower = tensor.constant(1. / np.e**2, dtype=dtype)
+            u_fix = self.uniform((n_fix_samples, ), low=lower, high=1.,
+                                 ndim=1, dtype=dtype, nstreams=nstreams, **kwargs)
+            r_fix = tensor.sqrt(-2. * tensor.log(u_fix))
+            z0_fixed = r_fix[:to_fix0.size] * cos_theta[to_fix0]
+            z1_fixed = r_fix[to_fix0.size:] * sin_theta[to_fix1]
+            # pack everything together to a useful result
+            norm_samples = tensor.join(0, z0_valid, z0_fixed, z1_valid, z1_fixed)
        else:
-            U1 = flattened[:prod(flattened.shape) // 2]
+            norm_samples = tensor.join(0, z0, z1)
-            U2 = flattened[prod(flattened.shape) // 2:]
+        if isinstance(n_odd_samples, theano.Variable):
+            samples = norm_samples[:n_odd_samples]
-        # normal_samples = zeros_like(flattened)
+        elif n_odd_samples % 2 == 1:
-        sqrt_ln_U1 = sqrt(-2.0 * log(U1))
+            samples = norm_samples[:-1]
-        # TypeError: 'TensorVariable' object does not support item assignment
-        # so this doesn't work...
-        # normal_samples[:n_samples/2] = sqrt_ln_U1 * cos(2.0*np.pi*U2)
-        # normal_samples[n_samples/2:] = sqrt_ln_U1 * sin(2.0*np.pi*U2)
-        # so trying this instead
-        first_half = sqrt_ln_U1 * cos(
-            np.array(2.0 * np.pi, dtype=dtype) * U2)
-        second_half = sqrt_ln_U1 * sin(
-            np.array(2.0 * np.pi, dtype=dtype) * U2)
-        normal_samples = join(0, first_half, second_half)
-        final_samples = None
-        if evened:
-            final_samples = normal_samples[:-1]
-        elif constant:
-            final_samples = normal_samples
        else:
-            final_samples = normal_samples[:prod(size)]
+            samples = norm_samples
+        samples = tensor.reshape(samples, newshape=size, ndim=ndim)
+        samples *= std
+        samples += avg
+        return samples
+    def truncated_normal(self, size, avg=0.0, std=1.0,
+                         ndim=None, dtype=None, nstreams=None, **kwargs):
+        """
+        Sample a tensor of values from a symmetrically truncated normal distribution.
+        Parameters
+        ----------
+        size : int_vector_like
+            Array dimensions for the output tensor.
+        avg : float_like, optional
+            The mean value for the truncated normal to sample from (defaults to 0.0).
+        std : float_like, optional
+            The standard deviation for the truncated normal to sample from (defaults to 1.0).
+        ndim : int, optional
+            The number of dimensions for the output tensor (defaults to None).
+            This argument is necessary if the size argument is ambiguous on the number of dimensions.
+        dtype : str, optional
+            The data-type for the output tensor. If not specified,
+            the dtype is inferred from avg and std, but it is at least as precise as floatX.
+        kwargs
+            Other keyword arguments for random number generation (see uniform).
+        Returns
+        -------
+        samples : TensorVariable
+            A Theano tensor of samples randomly drawn from a truncated normal distribution.
+        See Also
+        --------
+        normal
+        """
+        # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.)
+        std = std / tensor.constant(.87962566103423978)
+        return self.normal(size=size, avg=avg, std=std, truncate=True,
+                           ndim=ndim, dtype=dtype, nstreams=nstreams, **kwargs)
-        if not size:
-            # Force the dtype to be int64, otherwise reshape complains
-            size = tensor.constant(size, dtype='int64')
-        final_samples = final_samples.reshape(size)
-        final_samples = avg + std * final_samples
+def _check_size(size):
+    """
+    Canonicalise inputs to get valid output sizes for Theano tensors.
+    Parameters
+    ----------
+    size : int_vector_like
+        Some variable that could serve as the shape for a Theano tensor.
+        This can be an int, a tuple of ints, a list of ints
+        or a Theano Variable with similar properties.
+    Returns
+    -------
+    size_var : int_vector
+        A one-dimensional Theano variable encapsulating the given size.
+    Raises
+    ------
+    ValueError
+        If this function cannot build a valid size from the input.
+    """
+    # non-tuple checks and scalar-to-tuple transform
+    if isinstance(size, theano.Variable):
+        if size.ndim == 1:
+            return size
+        elif size.ndim == 0:
+            return tensor.stack([size], ndim=1)
+        else:
+            raise ValueError("Theano variable must have 1 dimension to be a valid size.", size)
+    elif isinstance(size, (np.integer, integer_types)):
+        return tensor.constant([size], ndim=1)
+    elif not isinstance(size, (tuple, list)):
+        raise ValueError("Size must be a int, tuple, list or Theano variable.", size)
+    # check entries of list or tuple
+    for i in size:
+        if isinstance(i, theano.Variable):
+            if i.ndim != 0:
+                raise ValueError("Non-scalar Theano variable in size", size, i)
+        elif isinstance(i, (np.integer, integer_types)):
+            if i <= 0:
+                raise ValueError("Non-positive dimensions not allowed in size.", size, i)
+        else:
+            raise ValueError("Only Theano variables and integers are allowed in a size-tuple.", size, i)
-        assert final_samples.dtype == dtype
+    return tensor.as_tensor_variable(size, ndim=1)
-        return final_samples
 @local_optimizer((mrg_uniform_base,))

--- a/theano/sandbox/tests/test_rng_mrg.py
+++ b/theano/sandbox/tests/test_rng_mrg.py
@@ -298,7 +298,8 @@ def test_broadcastable():
    pvals_2 = R.uniform(size=size2)
    pvals_2 = pvals_2 / tensor.sum(pvals_2)
-    for distribution in [R.uniform, R.binomial, R.multinomial, R.multinomial_wo_replacement, R.normal]:
+    for distribution in [R.uniform, R.normal, R.truncated_normal,
+                         R.binomial, R.multinomial, R.multinomial_wo_replacement]:
        # multinomial or multinomial_wo_replacement does not support "size" argument,
        # the sizes of them are implicitly defined with "pvals" argument.
        if distribution in [R.multinomial, R.multinomial_wo_replacement]:
@@ -378,7 +379,6 @@ def t_binomial(mean, size, const_size, var_input, input, steps, rtol):
 @attr('slow')
 def test_normal0():
    steps = 50
    std = 2.
    if (config.mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE'] or
@@ -391,7 +391,7 @@ def test_normal0():
    sample_size_odd = (sample_size[0], sample_size[1] - 1)
    x = tensor.matrix()
-    for size, const_size, var_input, input, avg, rtol, std_tol in [
+    test_cases = [
        (sample_size, sample_size, [], [], -5., default_rtol, default_rtol),
        (x.shape, sample_size, [x],
         [np.zeros(sample_size, dtype=config.floatX)],
@@ -409,8 +409,9 @@ def test_normal0():
        # test with few samples at the same time
        ((1,), (1,), [], [], -5., default_rtol, 0.02),
        ((3,), (3,), [], [], -5., default_rtol, 0.02),
-            ]:
+    ]
+    for size, const_size, var_input, input, avg, rtol, std_tol in test_cases:
        R = MRG_RandomStreams(234)
        # Note: we specify `nstreams` to avoid a warning.
        n = R.normal(size=size, avg=avg, std=std,
@@ -438,6 +439,126 @@ def test_normal0():
                  prefix='numpy ', allow_01=True, inputs=input, mean_rtol=rtol)
+@attr('slow')
+def test_normal_truncation():
+    # just a copy of test_normal0 with extra bound check
+    steps = 50
+    std = 2.
+    # standard deviation is slightly less than for a regular Gaussian
+    # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.)
+    target_std = .87962566103423978 * std
+    if (config.mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE'] or
+            config.mode == 'Mode' and config.linker in ['py']):
+        sample_size = (25, 30)
+        default_rtol = .02
+    else:
+        sample_size = (999, 50)
+        default_rtol = .01
+    sample_size_odd = (sample_size[0], sample_size[1] - 1)
+    x = tensor.matrix()
+    test_cases = [
+        (sample_size, sample_size, [], [], -5., default_rtol, default_rtol),
+        (x.shape, sample_size, [x],
+         [np.zeros(sample_size, dtype=config.floatX)],
+         -5., default_rtol, default_rtol),
+        # test odd value
+        (x.shape, sample_size_odd, [x],
+         [np.zeros(sample_size_odd, dtype=config.floatX)],
+         -5., default_rtol, default_rtol),
+        (sample_size, sample_size, [], [],
+         np.arange(np.prod(sample_size),
+                   dtype='float32').reshape(sample_size),
+         10. * std / np.sqrt(steps), default_rtol),
+        # test empty size (scalar)
+        ((), (), [], [], -5., default_rtol, 0.02),
+        # test with few samples at the same time
+        ((1,), (1,), [], [], -5., default_rtol, 0.02),
+        ((3,), (3,), [], [], -5., default_rtol, 0.02),
+    ]
+    for size, const_size, var_input, input, avg, rtol, std_tol in test_cases:
+        R = MRG_RandomStreams(234)
+        # Note: we specify `nstreams` to avoid a warning.
+        n = R.normal(size=size, avg=avg, std=std, truncate=True,
+                     nstreams=rng_mrg.guess_n_streams(size, warn=False))
+        f = theano.function(var_input, n)
+        # check if truncated at 2*std
+        samples = f(*input)
+        assert np.all(avg + 2 * std - samples >= 0), \
+            ("bad upper bound? %s %s" % (samples, avg + 2 * std))
+        assert np.all(samples - (avg - 2 * std) >= 0), \
+            ("bad lower bound? %s %s" % (samples, avg - 2 * std))
+        # Increase the number of steps if size implies only a few samples
+        if np.prod(const_size) < 10:
+            steps_ = steps * 50
+        else:
+            steps_ = steps
+        basictest(f, steps_, const_size, target_avg=avg, target_std=target_std,
+                  prefix='mrg ', allow_01=True, inputs=input,
+                  mean_rtol=rtol, std_tol=std_tol)
+        sys.stdout.flush()
+@attr('slow')
+def test_truncated_normal():
+    # just a copy of test_normal0 for truncated normal
+    steps = 50
+    std = 2.
+    if (config.mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE'] or
+            config.mode == 'Mode' and config.linker in ['py']):
+        sample_size = (25, 30)
+        default_rtol = .02
+    else:
+        sample_size = (999, 50)
+        default_rtol = .01
+    sample_size_odd = (sample_size[0], sample_size[1] - 1)
+    x = tensor.matrix()
+    test_cases = [
+        (sample_size, sample_size, [], [], -5., default_rtol, default_rtol),
+        (x.shape, sample_size, [x],
+         [np.zeros(sample_size, dtype=config.floatX)],
+         -5., default_rtol, default_rtol),
+        # test odd value
+        (x.shape, sample_size_odd, [x],
+         [np.zeros(sample_size_odd, dtype=config.floatX)],
+         -5., default_rtol, default_rtol),
+        (sample_size, sample_size, [], [],
+         np.arange(np.prod(sample_size),
+                   dtype='float32').reshape(sample_size),
+         10. * std / np.sqrt(steps), default_rtol),
+        # test empty size (scalar)
+        ((), (), [], [], -5., default_rtol, 0.02),
+        # test with few samples at the same time
+        ((1,), (1,), [], [], -5., default_rtol, 0.02),
+        ((3,), (3,), [], [], -5., default_rtol, 0.02),
+    ]
+    for size, const_size, var_input, input, avg, rtol, std_tol in test_cases:
+        R = MRG_RandomStreams(234)
+        # Note: we specify `nstreams` to avoid a warning.
+        n = R.truncated_normal(size=size, avg=avg, std=std,
+                               nstreams=rng_mrg.guess_n_streams(size, warn=False))
+        f = theano.function(var_input, n)
+        # Increase the number of steps if size implies only a few samples
+        if np.prod(const_size) < 10:
+            steps_ = steps * 60
+        else:
+            steps_ = steps
+        basictest(f, steps_, const_size, target_avg=avg, target_std=std,
+                  prefix='mrg ', allow_01=True, inputs=input,
+                  mean_rtol=rtol, std_tol=std_tol)
+        sys.stdout.flush()
 def basic_multinomialtest(f, steps, sample_size, target_pvals, n_samples,
                          prefix="", mean_rtol=0.04):
@@ -519,6 +640,7 @@ class T_MRG(unittest.TestCase):
            self.assertRaises(ValueError, R.binomial, size)
            self.assertRaises(ValueError, R.multinomial, size, 1, [])
            self.assertRaises(ValueError, R.normal, size)
+            self.assertRaises(ValueError, R.truncated_normal, size)
 def test_multiple_rng_aliasing():
@@ -734,6 +856,19 @@ def test_undefined_grad():
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out,
                  (avg, std))
+    # checking truncated normal distribution
+    avg = tensor.scalar()
+    out = srng.truncated_normal((), avg=avg)
+    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, avg)
+    std = tensor.scalar()
+    out = srng.truncated_normal((), avg=0, std=std)
+    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, std)
+    out = srng.truncated_normal((), avg=avg, std=std)
+    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out,
+                  (avg, std))
 def test_f16_nonzero(mode=None, op_to_check=rng_mrg.mrg_uniform):
    srng = MRG_RandomStreams(seed=utt.fetch_seed())
@@ -755,6 +890,8 @@ def test_target_parameter():
        assert isinstance(f(), np.ndarray)
    basic_target_parameter_test(srng.uniform((3, 2), target='cpu'))
+    basic_target_parameter_test(srng.normal((3, 2), target='cpu'))
+    basic_target_parameter_test(srng.truncated_normal((3, 2), target='cpu'))
    basic_target_parameter_test(srng.binomial((3, 2), target='cpu'))
    basic_target_parameter_test(srng.multinomial(pvals=pvals.astype('float32'), target='cpu'))
    basic_target_parameter_test(srng.choice(p=pvals.astype('float32'), replace=False, target='cpu'))