Merge pull request #4064 from yaoli/rng_mrg_overflow

Fix rng_mrg int32 overflow: raise a ValueError when the requested number of samples exceeds 2**31 - 1.

Merge pull request #4064 from yaoli/rng_mrg_overflow
e6e88ce2 · Xavier Bouthillier · e8d2e8c9 · 9f46fb07 · e6e88ce2 · e6e88ce2
--- a/theano/sandbox/rng_mrg.py
+++ b/theano/sandbox/rng_mrg.py
@@ -338,7 +338,7 @@ class mrg_uniform(mrg_uniform_base):
        if ndim is None:
            ndim = get_vector_length(v_size)
        op = cls(TensorType(dtype, (False,) * ndim))
-        return op(rstate, cast(v_size, 'int32'))
+        return op(rstate, v_size)

    def perform(self, node, inp, out):
        rstate, size = inp
@@ -396,9 +396,8 @@ class mrg_uniform(mrg_uniform_base):
            NORM = '4.656612873077392578125e-10'
        return """
        //////// <code generated by mrg_uniform>
-
-        npy_intp odims[%(ndim)s];
-        int n_elements = 1;
+        npy_int64 odims[%(ndim)s];
+        npy_int64 n_elements = 1;
        int n_streams = 0;
        int must_alloc_sample = ((NULL == %(o_sample)s)
                                 || (PyArray_NDIM(%(o_sample)s) != %(ndim)s)
@@ -432,18 +431,24 @@ class mrg_uniform(mrg_uniform_base):
                %(ndim)s, int(PyArray_DIMS(%(size)s)[0]));
            %(fail)s
        }
-        if (PyArray_DESCR(%(size)s)->type_num != NPY_INT32)
-        {
-            PyErr_SetString(PyExc_ValueError, "size must be int32");
-            %(fail)s
-        }
+
        for (int i = 0; i < %(ndim)s; ++i)
        {
-            odims[i] = ((npy_int32*)(PyArray_BYTES(%(size)s) + PyArray_STRIDES(%(size)s)[0] * i))[0];
+            odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i);
            n_elements *= odims[i];
            must_alloc_sample = must_alloc_sample || (PyArray_DIMS(%(o_sample)s)[i] != odims[i]);
            //fprintf(stderr, "size %%i %%i\\n", i, (int)odims[i]);
+            //printf("%%li", n_elements);
+        }
+        //fprintf(stderr, "n_elements %%lld\\n", (long long)n_elements);
+        if (n_elements > M1)
+        {
+            PyErr_SetString(
+                PyExc_ValueError,
+                "rng_mrg cpu-implementation does not support more than (2**31 -1) samples");
+            %(fail)s
        }
+
        if (must_alloc_sample)
        {
            Py_XDECREF(%(o_sample)s);
@@ -537,7 +542,7 @@ class mrg_uniform(mrg_uniform_base):
        """ % locals()

    def c_code_cache_version(self):
-        return (3,)
+        return (7, )


 class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
@@ -549,7 +554,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
        if ndim is None:
            ndim = get_vector_length(v_size)
        op = cls(CudaNdarrayType((False,) * ndim))
-        return op(rstate, cast(v_size, 'int32'))
+        return op(rstate, v_size)

    def c_support_code_apply(self, node, nodename):
        if self.output_type.dtype == 'float32':
@@ -661,9 +666,9 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
        SYNC = "CNDA_THREAD_SYNC"
        return """
        //////// <code generated by mrg_uniform>
-
-        int odims[%(ndim)s];
-        int n_elements = 1;
+        npy_int64 M1 = 2147483647;      //2^31 - 1
+        npy_int64 odims[%(ndim)s];
+        npy_int64 n_elements = 1;
        int n_streams, n_streams_used_in_this_call;
        int must_alloc_sample = ((NULL == %(o_sample)s)
                || !CudaNdarray_Check((PyObject*)%(o_sample)s)
@@ -681,18 +686,23 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
                %(ndim)s, PyArray_DIMS(%(size)s)[0]);
            %(fail)s
        }
-        if (PyArray_DESCR(%(size)s)->type_num != NPY_INT32)
-        {
-            PyErr_SetString(PyExc_ValueError, "size must be int32");
-            %(fail)s
-        }
+
        for (int i = 0; i < %(ndim)s; ++i)
        {
-            odims[i] = ((npy_int32*)(PyArray_BYTES(%(size)s) + PyArray_STRIDES(%(size)s)[0] * i))[0];
+            odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i);
            n_elements *= odims[i];
            must_alloc_sample = (must_alloc_sample
                    || CudaNdarray_HOST_DIMS(%(o_sample)s)[i] != odims[i]);
        }
+
+        if (n_elements > M1)
+        {
+            PyErr_SetString(
+                PyExc_ValueError,
+                "rng_mrg gpu implementation does not support more than (2**31 -1) samples");
+            %(fail)s
+        }
+
        if (must_alloc_sample)
        {
            Py_XDECREF(%(o_sample)s);
@@ -735,7 +745,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
            %(fail)s;
        }
        n_streams = CudaNdarray_HOST_DIMS(%(o_rstate)s)[0]/6;
-        n_streams_used_in_this_call = std::min(n_streams, n_elements);
+        n_streams_used_in_this_call = std::min(n_streams, (int)n_elements);

        {
            unsigned int threads_per_block = std::min((unsigned int)n_streams_used_in_this_call, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
@@ -775,7 +785,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
        """ % locals()

    def c_code_cache_version(self):
-        return (10,)
+        return (12,)


 class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
@@ -791,7 +801,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
        if ndim is None:
            ndim = get_vector_length(v_size)
        op = cls(GpuArrayType(dtype, (False,) * ndim))
-        return op(rstate, cast(v_size, 'int32'))
+        return op(rstate, v_size)

    def c_headers(self):
        return super(GPUA_mrg_uniform, self).c_headers() + ['numpy_compat.h']
@@ -920,8 +930,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
        otypecode = str(self.output_type.typecode)

        return """
+        npy_int64 M1 = 2147483647;      //2^31 - 1
        size_t odims[%(ndim)s];
-        unsigned int n_elements = 1;
+        size_t n_elements = 1;
        unsigned int n_streams;
        int must_alloc_sample = ((NULL == %(o_sample)s)
                || !pygpu_GpuArray_Check((PyObject*)%(o_sample)s)
@@ -939,18 +950,22 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
                %(ndim)s, PyArray_DIMS(%(size)s)[0]);
            %(fail)s
        }
-        if (PyArray_DESCR(%(size)s)->type_num != NPY_INT32)
-        {
-            PyErr_SetString(PyExc_ValueError, "size must be int32");
-            %(fail)s
-        }
+
        for (int i = 0; i < %(ndim)s; ++i)
        {
-            odims[i] = ((npy_int32 *)(PyArray_BYTES(%(size)s) + PyArray_STRIDES(%(size)s)[0] * i))[0];
+            odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i);
            n_elements *= odims[i];
            must_alloc_sample = (must_alloc_sample
                    || PyGpuArray_DIMS(%(o_sample)s)[i] != odims[i]);
        }
+
+        if (n_elements > M1)
+        {
+            PyErr_SetString(
+                PyExc_ValueError,
+                "rng_mrg gpu implementation does not support more than (2**31 -1) samples");
+            %(fail)s
+        }
        if (must_alloc_sample)
        {
            Py_XDECREF(%(o_sample)s);
@@ -1026,7 +1041,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
        """ % locals()

    def c_code_cache_version(self):
-        return (8,)
+        return (11,)


 def guess_n_streams(size, warn=False):

--- a/theano/sandbox/tests/test_rng_mrg.py
+++ b/theano/sandbox/tests/test_rng_mrg.py
@@ -4,6 +4,7 @@ import os
 import sys
 import time
 import unittest
+import functools

 from nose.plugins.skip import SkipTest
 from nose.tools import assert_raises
@@ -1042,6 +1043,80 @@ def test_seed_fn():
            assert numpy.allclose(fn3_val1, fn3_val3) == same


+def rng_mrg_overflow(sizes, fct, mode, should_raise_error):
+    """Compile and run ``fct`` per size; require ValueError iff asked."""
+    for shape in sizes:
+        sampler = theano.function([], fct(size=shape), mode=mode)
+        theano.printing.debugprint(sampler)
+        if not should_raise_error:
+            sampler()
+            continue
+        assert_raises(ValueError, sampler)
+
+
+def test_overflow_cpu():
+    """Uniform sampling under ``config.mode``: 2**31+ samples must raise."""
+    # run with THEANO_FLAGS=mode=FAST_RUN,device=cpu,floatX=float32
+    uniform = MRG_RandomStreams(numpy.random.randint(1234)).uniform
+    i32 = numpy.int32
+    # every shape here holds at least 2**31 elements: ValueError expected
+    too_big = [(2**31, ), (2**32, ), (2**15, 2**16,), (2, 2**15, 2**15)]
+    rng_mrg_overflow(too_big, uniform, config.mode, should_raise_error=True)
+    # small shapes must be sampled without any error
+    small = [(2**5, ), (2**5, 2**5), (2**5, 2**5, 2**5)]
+    rng_mrg_overflow(small, uniform, config.mode, should_raise_error=False)
+    # sizes given as numpy.int32 scalars must be accepted as well
+    int32s = [(i32(2**10), ), (i32(2), i32(2**10), i32(2**10))]
+    rng_mrg_overflow(int32s, uniform, config.mode, should_raise_error=False)
+
+
+def test_overflow_gpu_old_backend():
+    """Uniform sampling with ``use_cuda=True``: 2**31+ samples must raise."""
+    # run with THEANO_FLAGS=mode=FAST_RUN,init_gpu_device=gpu1,device=cpu
+    if not cuda_available:
+        raise SkipTest('Optional package cuda not available')
+    gpu_mode = mode_with_gpu
+    uniform = MRG_RandomStreams(seed=12345, use_cuda=True).uniform
+    i32 = numpy.int32
+    # every shape here holds at least 2**31 elements, so running the
+    # compiled function is expected to raise ValueError
+    too_big = [(2**31, ), (2**32, ), (2**15, 2**16,), (2, 2**15, 2**15)]
+    rng_mrg_overflow(too_big, uniform, gpu_mode, should_raise_error=True)
+    # small shapes must be sampled without any error
+    small = [(2**5, ), (2**5, 2**5), (2**5, 2**5, 2**5)]
+    rng_mrg_overflow(small, uniform, gpu_mode, should_raise_error=False)
+    # sizes given as numpy.int32 scalars must be accepted as well
+    int32s = [(i32(2**10), ), (i32(2), i32(2**10), i32(2**10))]
+    rng_mrg_overflow(int32s, uniform, gpu_mode, should_raise_error=False)
+
+
+def test_overflow_gpu_new_backend():
+    """GPUA_mrg_uniform: 2**31+ samples must raise, small sizes must run."""
+    # run with THEANO_FLAGS=mode=FAST_RUN,init_gpu_device=cuda1,device=cpu
+    from theano.sandbox.gpuarray.tests.test_basic_ops import \
+        mode_with_gpu as mode
+    from theano.sandbox.gpuarray.type import gpuarray_shared_constructor
+    seed, n_substreams = 12345, 7
+    first_state = numpy.array([seed] * 6, dtype='int32')
+    # one row of 6 integers per substream, each derived from the previous
+    states = [first_state.copy()]
+    for _ in range(n_substreams - 1):
+        states.append(rng_mrg.ff_2p72(states[-1]))
+    rstate = gpuarray_shared_constructor(numpy.asarray(states))
+    uniform = functools.partial(rng_mrg.GPUA_mrg_uniform.new, rstate,
+                                ndim=None, dtype='float32')
+    i32 = numpy.int32
+    # every shape here holds at least 2**31 elements: ValueError expected
+    too_big = [(2**31, ), (2**32, ), (2**15, 2**16,), (2, 2**15, 2**15)]
+    rng_mrg_overflow(too_big, uniform, mode, should_raise_error=True)
+    # small shapes must be sampled without any error
+    small = [(2**5, ), (2**5, 2**5), (2**5, 2**5, 2**5)]
+    rng_mrg_overflow(small, uniform, mode, should_raise_error=False)
+    # sizes given as numpy.int32 scalars must be accepted as well
+    int32s = [(i32(2**10), ), (i32(2), i32(2**10), i32(2**10))]
+    rng_mrg_overflow(int32s, uniform, mode, should_raise_error=False)
+
+
 if __name__ == "__main__":
    rng = MRG_RandomStreams(numpy.random.randint(2147462579))
    print(theano.__file__)