Commit bfe0057f authored by Li Yao

fixed rng_mrg int32 overflow, just throw out the error when it overflows

Details: widened the size handling from int32 to int64 and raise a ValueError when the requested number of samples overflows 2**31 - 1; fixed a casting problem in the generated C code (failure in test_multinomial.py:test_n_samples_compatibility); applied the fix to both the old and the new GPU backend; reduced test memory usage; refactored the tests; assorted PEP8 cleanups (including a --select=E121 failure).
Parent 35254935
...@@ -337,8 +337,8 @@ class mrg_uniform(mrg_uniform_base): ...@@ -337,8 +337,8 @@ class mrg_uniform(mrg_uniform_base):
v_size = as_tensor_variable(size) v_size = as_tensor_variable(size)
if ndim is None: if ndim is None:
ndim = get_vector_length(v_size) ndim = get_vector_length(v_size)
op = cls(TensorType(dtype, (False,) * ndim)) op = cls(TensorType(dtype, (False,)*ndim))
return op(rstate, cast(v_size, 'int32')) return op(rstate, v_size)
def perform(self, node, inp, out): def perform(self, node, inp, out):
rstate, size = inp rstate, size = inp
...@@ -396,9 +396,8 @@ class mrg_uniform(mrg_uniform_base): ...@@ -396,9 +396,8 @@ class mrg_uniform(mrg_uniform_base):
NORM = '4.656612873077392578125e-10' NORM = '4.656612873077392578125e-10'
return """ return """
//////// <code generated by mrg_uniform> //////// <code generated by mrg_uniform>
npy_int64 odims[%(ndim)s];
npy_intp odims[%(ndim)s]; npy_int64 n_elements = 1;
int n_elements = 1;
int n_streams = 0; int n_streams = 0;
int must_alloc_sample = ((NULL == %(o_sample)s) int must_alloc_sample = ((NULL == %(o_sample)s)
|| (PyArray_NDIM(%(o_sample)s) != %(ndim)s) || (PyArray_NDIM(%(o_sample)s) != %(ndim)s)
...@@ -432,18 +431,24 @@ class mrg_uniform(mrg_uniform_base): ...@@ -432,18 +431,24 @@ class mrg_uniform(mrg_uniform_base):
%(ndim)s, int(PyArray_DIMS(%(size)s)[0])); %(ndim)s, int(PyArray_DIMS(%(size)s)[0]));
%(fail)s %(fail)s
} }
if (PyArray_DESCR(%(size)s)->type_num != NPY_INT32)
{
PyErr_SetString(PyExc_ValueError, "size must be int32");
%(fail)s
}
for (int i = 0; i < %(ndim)s; ++i) for (int i = 0; i < %(ndim)s; ++i)
{ {
odims[i] = ((npy_int32*)(PyArray_BYTES(%(size)s) + PyArray_STRIDES(%(size)s)[0] * i))[0]; odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i);
n_elements *= odims[i]; n_elements *= odims[i];
must_alloc_sample = must_alloc_sample || (PyArray_DIMS(%(o_sample)s)[i] != odims[i]); must_alloc_sample = must_alloc_sample || (PyArray_DIMS(%(o_sample)s)[i] != odims[i]);
//fprintf(stderr, "size %%i %%i\\n", i, (int)odims[i]); //fprintf(stderr, "size %%i %%i\\n", i, (int)odims[i]);
//printf("%%li", n_elements);
}
//fprintf(stderr, "n_elements %%lld\\n", (long long)n_elements);
if (n_elements > M1)
{
PyErr_SetString(
PyExc_ValueError,
"rng_mrg cpu-implementation does not support more than (2**31 -1) samples");
%(fail)s
} }
if (must_alloc_sample) if (must_alloc_sample)
{ {
Py_XDECREF(%(o_sample)s); Py_XDECREF(%(o_sample)s);
...@@ -537,7 +542,7 @@ class mrg_uniform(mrg_uniform_base): ...@@ -537,7 +542,7 @@ class mrg_uniform(mrg_uniform_base):
""" % locals() """ % locals()
def c_code_cache_version(self): def c_code_cache_version(self):
return (3,) return (7, )
class GPU_mrg_uniform(mrg_uniform_base, GpuOp): class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
...@@ -548,8 +553,8 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp): ...@@ -548,8 +553,8 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
v_size = as_tensor_variable(size) v_size = as_tensor_variable(size)
if ndim is None: if ndim is None:
ndim = get_vector_length(v_size) ndim = get_vector_length(v_size)
op = cls(CudaNdarrayType((False,) * ndim)) op = cls(CudaNdarrayType((False,)*ndim))
return op(rstate, cast(v_size, 'int32')) return op(rstate, v_size)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
if self.output_type.dtype == 'float32': if self.output_type.dtype == 'float32':
...@@ -661,9 +666,9 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp): ...@@ -661,9 +666,9 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
SYNC = "CNDA_THREAD_SYNC" SYNC = "CNDA_THREAD_SYNC"
return """ return """
//////// <code generated by mrg_uniform> //////// <code generated by mrg_uniform>
npy_int64 M1 = 2147483647; //2^31 - 1
int odims[%(ndim)s]; npy_int64 odims[%(ndim)s];
int n_elements = 1; npy_int64 n_elements = 1;
int n_streams, n_streams_used_in_this_call; int n_streams, n_streams_used_in_this_call;
int must_alloc_sample = ((NULL == %(o_sample)s) int must_alloc_sample = ((NULL == %(o_sample)s)
|| !CudaNdarray_Check((PyObject*)%(o_sample)s) || !CudaNdarray_Check((PyObject*)%(o_sample)s)
...@@ -681,18 +686,23 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp): ...@@ -681,18 +686,23 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
%(ndim)s, PyArray_DIMS(%(size)s)[0]); %(ndim)s, PyArray_DIMS(%(size)s)[0]);
%(fail)s %(fail)s
} }
if (PyArray_DESCR(%(size)s)->type_num != NPY_INT32)
{
PyErr_SetString(PyExc_ValueError, "size must be int32");
%(fail)s
}
for (int i = 0; i < %(ndim)s; ++i) for (int i = 0; i < %(ndim)s; ++i)
{ {
odims[i] = ((npy_int32*)(PyArray_BYTES(%(size)s) + PyArray_STRIDES(%(size)s)[0] * i))[0]; odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i);
n_elements *= odims[i]; n_elements *= odims[i];
must_alloc_sample = (must_alloc_sample must_alloc_sample = (must_alloc_sample
|| CudaNdarray_HOST_DIMS(%(o_sample)s)[i] != odims[i]); || CudaNdarray_HOST_DIMS(%(o_sample)s)[i] != odims[i]);
} }
if (n_elements > M1)
{
PyErr_SetString(
PyExc_ValueError,
"rng_mrg gpu implementation does not support more than (2**31 -1) samples");
%(fail)s
}
if (must_alloc_sample) if (must_alloc_sample)
{ {
Py_XDECREF(%(o_sample)s); Py_XDECREF(%(o_sample)s);
...@@ -735,7 +745,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp): ...@@ -735,7 +745,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
%(fail)s; %(fail)s;
} }
n_streams = CudaNdarray_HOST_DIMS(%(o_rstate)s)[0]/6; n_streams = CudaNdarray_HOST_DIMS(%(o_rstate)s)[0]/6;
n_streams_used_in_this_call = std::min(n_streams, n_elements); n_streams_used_in_this_call = std::min(n_streams, (int)n_elements);
{ {
unsigned int threads_per_block = std::min((unsigned int)n_streams_used_in_this_call, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); unsigned int threads_per_block = std::min((unsigned int)n_streams_used_in_this_call, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
...@@ -775,7 +785,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp): ...@@ -775,7 +785,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
""" % locals() """ % locals()
def c_code_cache_version(self): def c_code_cache_version(self):
return (10,) return (12,)
class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
...@@ -790,8 +800,8 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -790,8 +800,8 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
v_size = as_tensor_variable(size) v_size = as_tensor_variable(size)
if ndim is None: if ndim is None:
ndim = get_vector_length(v_size) ndim = get_vector_length(v_size)
op = cls(GpuArrayType(dtype, (False,) * ndim)) op = cls(GpuArrayType(dtype, (False,)*ndim))
return op(rstate, cast(v_size, 'int32')) return op(rstate, v_size)
def c_headers(self): def c_headers(self):
return super(GPUA_mrg_uniform, self).c_headers() + ['numpy_compat.h'] return super(GPUA_mrg_uniform, self).c_headers() + ['numpy_compat.h']
...@@ -920,8 +930,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -920,8 +930,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
otypecode = str(self.output_type.typecode) otypecode = str(self.output_type.typecode)
return """ return """
npy_int64 M1 = 2147483647; //2^31 - 1
size_t odims[%(ndim)s]; size_t odims[%(ndim)s];
unsigned int n_elements = 1; size_t n_elements = 1;
unsigned int n_streams; unsigned int n_streams;
int must_alloc_sample = ((NULL == %(o_sample)s) int must_alloc_sample = ((NULL == %(o_sample)s)
|| !pygpu_GpuArray_Check((PyObject*)%(o_sample)s) || !pygpu_GpuArray_Check((PyObject*)%(o_sample)s)
...@@ -939,18 +950,22 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -939,18 +950,22 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
%(ndim)s, PyArray_DIMS(%(size)s)[0]); %(ndim)s, PyArray_DIMS(%(size)s)[0]);
%(fail)s %(fail)s
} }
if (PyArray_DESCR(%(size)s)->type_num != NPY_INT32)
{
PyErr_SetString(PyExc_ValueError, "size must be int32");
%(fail)s
}
for (int i = 0; i < %(ndim)s; ++i) for (int i = 0; i < %(ndim)s; ++i)
{ {
odims[i] = ((npy_int32 *)(PyArray_BYTES(%(size)s) + PyArray_STRIDES(%(size)s)[0] * i))[0]; odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i);
n_elements *= odims[i]; n_elements *= odims[i];
must_alloc_sample = (must_alloc_sample must_alloc_sample = (must_alloc_sample
|| PyGpuArray_DIMS(%(o_sample)s)[i] != odims[i]); || PyGpuArray_DIMS(%(o_sample)s)[i] != odims[i]);
} }
if (n_elements > M1)
{
PyErr_SetString(
PyExc_ValueError,
"rng_mrg gpu implementation does not support more than (2**31 -1) samples");
%(fail)s
}
if (must_alloc_sample) if (must_alloc_sample)
{ {
Py_XDECREF(%(o_sample)s); Py_XDECREF(%(o_sample)s);
...@@ -1026,7 +1041,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -1026,7 +1041,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
""" % locals() """ % locals()
def c_code_cache_version(self): def c_code_cache_version(self):
return (8,) return (11,)
def guess_n_streams(size, warn=False): def guess_n_streams(size, warn=False):
......
...@@ -4,6 +4,7 @@ import os ...@@ -4,6 +4,7 @@ import os
import sys import sys
import time import time
import unittest import unittest
import functools
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
from nose.tools import assert_raises from nose.tools import assert_raises
...@@ -1042,6 +1043,80 @@ def test_seed_fn(): ...@@ -1042,6 +1043,80 @@ def test_seed_fn():
assert numpy.allclose(fn3_val1, fn3_val3) == same assert numpy.allclose(fn3_val1, fn3_val3) == same
def rng_mrg_overflow(sizes, fct, mode, should_raise_error):
    """Compile and run a sampler for each requested size.

    For every shape in *sizes*, build a graph with ``fct(size=shape)``,
    compile it under *mode*, and either assert that calling it raises
    ``ValueError`` (when *should_raise_error* is true) or simply call it.
    """
    for shape in sizes:
        sample = fct(size=shape)
        fn = theano.function([], sample, mode=mode)
        theano.printing.debugprint(fn)
        if should_raise_error:
            assert_raises(ValueError, fn)
        else:
            fn()
def test_overflow_cpu():
    """CPU overflow check for MRG uniform sampling.

    Sizes whose element count exceeds 2**31 - 1 must raise ValueError;
    small sizes and numpy.int32 sizes must work.
    """
    # run with THEANO_FLAGS=mode=FAST_RUN,device=cpu,floatX=float32
    rng = MRG_RandomStreams(numpy.random.randint(1234))
    fct = rng.uniform
    # These element counts overflow int32, so they must raise.
    overflowing = [(2 ** 31, ), (2 ** 32, ),
                   (2 ** 15, 2 ** 16,), (2, 2 ** 15, 2 ** 15)]
    rng_mrg_overflow(overflowing, fct, config.mode, should_raise_error=True)
    # Small shapes must sample without error.
    small = [(2 ** 5, ), (2 ** 5, 2 ** 5), (2 ** 5, 2 ** 5, 2 ** 5)]
    rng_mrg_overflow(small, fct, config.mode, should_raise_error=False)
    # numpy.int32 entries in the size tuple must be accepted.
    int32_sizes = [(numpy.int32(2 ** 10), ),
                   (numpy.int32(2), numpy.int32(2 ** 10),
                    numpy.int32(2 ** 10))]
    rng_mrg_overflow(int32_sizes, fct, config.mode, should_raise_error=False)
def test_overflow_gpu_old_backend():
    """Overflow check for the old CUDA backend's MRG uniform sampler.

    Skipped when the optional cuda package is unavailable.
    """
    # run with THEANO_FLAGS=mode=FAST_RUN,init_gpu_device=gpu1,device=cpu
    if not cuda_available:
        raise SkipTest('Optional package cuda not available')
    mode = mode_with_gpu
    seed = 12345
    rng = MRG_RandomStreams(seed=seed, use_cuda=True)
    fct = rng.uniform
    # These element counts overflow int32, so they must raise.
    overflowing = [(2 ** 31, ), (2 ** 32, ),
                   (2 ** 15, 2 ** 16,), (2, 2 ** 15, 2 ** 15)]
    rng_mrg_overflow(overflowing, fct, mode, should_raise_error=True)
    # Small shapes must sample without error.
    small = [(2 ** 5, ), (2 ** 5, 2 ** 5), (2 ** 5, 2 ** 5, 2 ** 5)]
    rng_mrg_overflow(small, fct, mode, should_raise_error=False)
    # numpy.int32 entries in the size tuple must be accepted.
    int32_sizes = [(numpy.int32(2 ** 10), ),
                   (numpy.int32(2), numpy.int32(2 ** 10),
                    numpy.int32(2 ** 10))]
    rng_mrg_overflow(int32_sizes, fct, mode, should_raise_error=False)
def test_overflow_gpu_new_backend():
    """Overflow check for the new gpuarray backend's MRG uniform sampler.

    Builds the multi-stream rstate by hand, wraps it in a gpuarray shared
    variable, and drives GPUA_mrg_uniform.new through rng_mrg_overflow.
    """
    # run with THEANO_FLAGS=mode=FAST_RUN,init_gpu_device=cuda1,device=cpu
    from theano.sandbox.gpuarray.tests.test_basic_ops import \
        mode_with_gpu as mode
    from theano.sandbox.gpuarray.type import gpuarray_shared_constructor
    seed = 12345
    n_substreams = 7
    # First substream state: six copies of the seed, as int32.
    base_state = numpy.array([seed] * 6, dtype='int32')
    states = [base_state.copy()]
    # Each further substream is obtained by jumping ahead 2**72 steps.
    while len(states) < n_substreams:
        states.append(rng_mrg.ff_2p72(states[-1]))
    rstate = gpuarray_shared_constructor(numpy.asarray(states))
    fct = functools.partial(rng_mrg.GPUA_mrg_uniform.new, rstate,
                            ndim=None, dtype='float32')
    # These element counts overflow int32, so they must raise.
    overflowing = [(2 ** 31, ), (2 ** 32, ),
                   (2 ** 15, 2 ** 16,), (2, 2 ** 15, 2 ** 15)]
    rng_mrg_overflow(overflowing, fct, mode, should_raise_error=True)
    # Small shapes must sample without error.
    small = [(2 ** 5, ), (2 ** 5, 2 ** 5), (2 ** 5, 2 ** 5, 2 ** 5)]
    rng_mrg_overflow(small, fct, mode, should_raise_error=False)
    # numpy.int32 entries in the size tuple must be accepted.
    int32_sizes = [(numpy.int32(2 ** 10), ),
                   (numpy.int32(2), numpy.int32(2 ** 10),
                    numpy.int32(2 ** 10))]
    rng_mrg_overflow(int32_sizes, fct, mode, should_raise_error=False)
if __name__ == "__main__": if __name__ == "__main__":
rng = MRG_RandomStreams(numpy.random.randint(2147462579)) rng = MRG_RandomStreams(numpy.random.randint(2147462579))
print(theano.__file__) print(theano.__file__)
......
Markdown is supported
0%
You are adding 0 people to this discussion. Please proceed with caution.
Finish editing this comment first!
Register or sign in to comment