提交 e6e88ce2 authored 作者: Xavier Bouthillier's avatar Xavier Bouthillier

Merge pull request #4064 from yaoli/rng_mrg_overflow

Fix rng_mrg int32 overflow: raise an error when the requested size overflows
......@@ -338,7 +338,7 @@ class mrg_uniform(mrg_uniform_base):
if ndim is None:
ndim = get_vector_length(v_size)
op = cls(TensorType(dtype, (False,) * ndim))
return op(rstate, cast(v_size, 'int32'))
return op(rstate, v_size)
def perform(self, node, inp, out):
rstate, size = inp
......@@ -396,9 +396,8 @@ class mrg_uniform(mrg_uniform_base):
NORM = '4.656612873077392578125e-10'
return """
//////// <code generated by mrg_uniform>
npy_intp odims[%(ndim)s];
int n_elements = 1;
npy_int64 odims[%(ndim)s];
npy_int64 n_elements = 1;
int n_streams = 0;
int must_alloc_sample = ((NULL == %(o_sample)s)
|| (PyArray_NDIM(%(o_sample)s) != %(ndim)s)
......@@ -432,18 +431,24 @@ class mrg_uniform(mrg_uniform_base):
%(ndim)s, int(PyArray_DIMS(%(size)s)[0]));
%(fail)s
}
if (PyArray_DESCR(%(size)s)->type_num != NPY_INT32)
{
PyErr_SetString(PyExc_ValueError, "size must be int32");
%(fail)s
}
for (int i = 0; i < %(ndim)s; ++i)
{
odims[i] = ((npy_int32*)(PyArray_BYTES(%(size)s) + PyArray_STRIDES(%(size)s)[0] * i))[0];
odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i);
n_elements *= odims[i];
must_alloc_sample = must_alloc_sample || (PyArray_DIMS(%(o_sample)s)[i] != odims[i]);
//fprintf(stderr, "size %%i %%i\\n", i, (int)odims[i]);
//printf("%%li", n_elements);
}
//fprintf(stderr, "n_elements %%lld\\n", (long long)n_elements);
if (n_elements > M1)
{
PyErr_SetString(
PyExc_ValueError,
"rng_mrg cpu-implementation does not support more than (2**31 -1) samples");
%(fail)s
}
if (must_alloc_sample)
{
Py_XDECREF(%(o_sample)s);
......@@ -537,7 +542,7 @@ class mrg_uniform(mrg_uniform_base):
""" % locals()
def c_code_cache_version(self):
return (3,)
return (7, )
class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
......@@ -549,7 +554,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
if ndim is None:
ndim = get_vector_length(v_size)
op = cls(CudaNdarrayType((False,) * ndim))
return op(rstate, cast(v_size, 'int32'))
return op(rstate, v_size)
def c_support_code_apply(self, node, nodename):
if self.output_type.dtype == 'float32':
......@@ -661,9 +666,9 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
SYNC = "CNDA_THREAD_SYNC"
return """
//////// <code generated by mrg_uniform>
int odims[%(ndim)s];
int n_elements = 1;
npy_int64 M1 = 2147483647; //2^31 - 1
npy_int64 odims[%(ndim)s];
npy_int64 n_elements = 1;
int n_streams, n_streams_used_in_this_call;
int must_alloc_sample = ((NULL == %(o_sample)s)
|| !CudaNdarray_Check((PyObject*)%(o_sample)s)
......@@ -681,18 +686,23 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
%(ndim)s, PyArray_DIMS(%(size)s)[0]);
%(fail)s
}
if (PyArray_DESCR(%(size)s)->type_num != NPY_INT32)
{
PyErr_SetString(PyExc_ValueError, "size must be int32");
%(fail)s
}
for (int i = 0; i < %(ndim)s; ++i)
{
odims[i] = ((npy_int32*)(PyArray_BYTES(%(size)s) + PyArray_STRIDES(%(size)s)[0] * i))[0];
odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i);
n_elements *= odims[i];
must_alloc_sample = (must_alloc_sample
|| CudaNdarray_HOST_DIMS(%(o_sample)s)[i] != odims[i]);
}
if (n_elements > M1)
{
PyErr_SetString(
PyExc_ValueError,
"rng_mrg gpu implementation does not support more than (2**31 -1) samples");
%(fail)s
}
if (must_alloc_sample)
{
Py_XDECREF(%(o_sample)s);
......@@ -735,7 +745,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
%(fail)s;
}
n_streams = CudaNdarray_HOST_DIMS(%(o_rstate)s)[0]/6;
n_streams_used_in_this_call = std::min(n_streams, n_elements);
n_streams_used_in_this_call = std::min(n_streams, (int)n_elements);
{
unsigned int threads_per_block = std::min((unsigned int)n_streams_used_in_this_call, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
......@@ -775,7 +785,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
""" % locals()
def c_code_cache_version(self):
return (10,)
return (12,)
class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
......@@ -791,7 +801,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
if ndim is None:
ndim = get_vector_length(v_size)
op = cls(GpuArrayType(dtype, (False,) * ndim))
return op(rstate, cast(v_size, 'int32'))
return op(rstate, v_size)
def c_headers(self):
return super(GPUA_mrg_uniform, self).c_headers() + ['numpy_compat.h']
......@@ -920,8 +930,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
otypecode = str(self.output_type.typecode)
return """
npy_int64 M1 = 2147483647; //2^31 - 1
size_t odims[%(ndim)s];
unsigned int n_elements = 1;
size_t n_elements = 1;
unsigned int n_streams;
int must_alloc_sample = ((NULL == %(o_sample)s)
|| !pygpu_GpuArray_Check((PyObject*)%(o_sample)s)
......@@ -939,18 +950,22 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
%(ndim)s, PyArray_DIMS(%(size)s)[0]);
%(fail)s
}
if (PyArray_DESCR(%(size)s)->type_num != NPY_INT32)
{
PyErr_SetString(PyExc_ValueError, "size must be int32");
%(fail)s
}
for (int i = 0; i < %(ndim)s; ++i)
{
odims[i] = ((npy_int32 *)(PyArray_BYTES(%(size)s) + PyArray_STRIDES(%(size)s)[0] * i))[0];
odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i);
n_elements *= odims[i];
must_alloc_sample = (must_alloc_sample
|| PyGpuArray_DIMS(%(o_sample)s)[i] != odims[i]);
}
if (n_elements > M1)
{
PyErr_SetString(
PyExc_ValueError,
"rng_mrg gpu implementation does not support more than (2**31 -1) samples");
%(fail)s
}
if (must_alloc_sample)
{
Py_XDECREF(%(o_sample)s);
......@@ -1026,7 +1041,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
""" % locals()
def c_code_cache_version(self):
return (8,)
return (11,)
def guess_n_streams(size, warn=False):
......
......@@ -4,6 +4,7 @@ import os
import sys
import time
import unittest
import functools
from nose.plugins.skip import SkipTest
from nose.tools import assert_raises
......@@ -1042,6 +1043,80 @@ def test_seed_fn():
assert numpy.allclose(fn3_val1, fn3_val3) == same
def rng_mrg_overflow(sizes, fct, mode, should_raise_error):
    """Compile and run ``fct(size=...)`` for every entry of ``sizes``.

    Parameters
    ----------
    sizes
        Iterable of size arguments, each forwarded to ``fct`` as ``size=``.
    fct
        Callable that builds the graph to compile; it is called with the
        single keyword argument ``size``.
    mode
        Compilation mode handed to ``theano.function``.
    should_raise_error
        When true, calling the compiled function must raise ``ValueError``;
        otherwise the compiled function is simply called once.
    """
    for shape in sizes:
        sample = fct(size=shape)
        compiled = theano.function([], sample, mode=mode)
        # Dump the compiled graph so that a failing run shows which op was used.
        theano.printing.debugprint(compiled)
        if not should_raise_error:
            compiled()
            continue
        assert_raises(ValueError, compiled)
def test_overflow_cpu():
    """Check the overflow handling of ``MRG_RandomStreams.uniform``.

    Uses ``config.mode`` as the compilation mode.
    """
    # run with THEANO_FLAGS=mode=FAST_RUN,device=cpu,floatX=float32
    streams = MRG_RandomStreams(numpy.random.randint(1234))
    sampler = streams.uniform

    # should raise error as the size overflows
    overflowing = [(2**31, ), (2**32, ), (2**15, 2**16,), (2, 2**15, 2**15)]
    # should not raise error
    small = [(2**5, ), (2**5, 2**5), (2**5, 2**5, 2**5)]
    # should support int32 sizes
    int32_sizes = [(numpy.int32(2**10), ),
                   (numpy.int32(2), numpy.int32(2**10), numpy.int32(2**10))]

    cases = ((overflowing, True), (small, False), (int32_sizes, False))
    for sizes, expect_error in cases:
        rng_mrg_overflow(sizes, sampler, config.mode,
                         should_raise_error=expect_error)
def test_overflow_gpu_old_backend():
    """Check overflow handling of ``uniform`` built with ``use_cuda=True``.

    Raises ``SkipTest`` when ``cuda_available`` is false. Functions are
    compiled with ``mode_with_gpu``.
    """
    # run with THEANO_FLAGS=mode=FAST_RUN,init_gpu_device=gpu1,device=cpu
    if not cuda_available:
        raise SkipTest('Optional package cuda not available')
    mode = mode_with_gpu
    seed = 12345
    sampler = MRG_RandomStreams(seed=seed, use_cuda=True).uniform

    # should raise error as the size overflows
    overflowing = [(2**31, ), (2**32, ), (2**15, 2**16,), (2, 2**15, 2**15)]
    # should not raise error
    small = [(2**5, ), (2**5, 2**5), (2**5, 2**5, 2**5)]
    # should support int32 sizes
    int32_sizes = [(numpy.int32(2**10), ),
                   (numpy.int32(2), numpy.int32(2**10), numpy.int32(2**10))]

    cases = ((overflowing, True), (small, False), (int32_sizes, False))
    for sizes, expect_error in cases:
        rng_mrg_overflow(sizes, sampler, mode,
                         should_raise_error=expect_error)
def test_overflow_gpu_new_backend():
    """Check overflow handling of ``GPUA_mrg_uniform`` (gpuarray backend).

    Builds a 7-row substream state, each row derived from the previous one
    with ``rng_mrg.ff_2p72``, stores it with ``gpuarray_shared_constructor``
    and calls ``GPUA_mrg_uniform.new`` directly on it.
    """
    # run with THEANO_FLAGS=mode=FAST_RUN,init_gpu_device=cuda1,device=cpu
    from theano.sandbox.gpuarray.tests.test_basic_ops import \
        mode_with_gpu as mode
    from theano.sandbox.gpuarray.type import gpuarray_shared_constructor
    seed = 12345
    n_substreams = 7

    # First row is the seed repeated six times; every following row is
    # derived from the row before it.
    states = [numpy.array([seed] * 6, dtype='int32')]
    while len(states) < n_substreams:
        states.append(rng_mrg.ff_2p72(states[-1]))
    rstate = gpuarray_shared_constructor(numpy.asarray(states))

    sampler = functools.partial(rng_mrg.GPUA_mrg_uniform.new, rstate,
                                ndim=None, dtype='float32')

    # should raise error as the size overflows
    overflowing = [(2**31, ), (2**32, ), (2**15, 2**16,), (2, 2**15, 2**15)]
    # should not raise error
    small = [(2**5, ), (2**5, 2**5), (2**5, 2**5, 2**5)]
    # should support int32 sizes
    int32_sizes = [(numpy.int32(2**10), ),
                   (numpy.int32(2), numpy.int32(2**10), numpy.int32(2**10))]

    cases = ((overflowing, True), (small, False), (int32_sizes, False))
    for sizes, expect_error in cases:
        rng_mrg_overflow(sizes, sampler, mode,
                         should_raise_error=expect_error)
if __name__ == "__main__":
rng = MRG_RandomStreams(numpy.random.randint(2147462579))
print(theano.__file__)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论