Commit bfe0057f authored by Li Yao

fixed rng_mrg int32 overflow, just throw out the error when it overflows

Details: widened the size handling from int32 to int64 and raise a ValueError when the requested number of samples overflows 2**31 - 1; fixed a casting problem in the generated C code (failure in test_multinomial.py:test_n_samples_compatibility); applied the fix to both the old and the new GPU backend; reduced test memory usage; refactored the tests; assorted PEP8 cleanups (including a --select=E121 failure).
Parent 35254935
...@@ -337,8 +337,8 @@ class mrg_uniform(mrg_uniform_base): ...@@ -337,8 +337,8 @@ class mrg_uniform(mrg_uniform_base):
v_size = as_tensor_variable(size) v_size = as_tensor_variable(size)
if ndim is None: if ndim is None:
ndim = get_vector_length(v_size) ndim = get_vector_length(v_size)
op = cls(TensorType(dtype, (False,) * ndim)) op = cls(TensorType(dtype, (False,)*ndim))
return op(rstate, cast(v_size, 'int32')) return op(rstate, v_size)
def perform(self, node, inp, out): def perform(self, node, inp, out):
rstate, size = inp rstate, size = inp
...@@ -396,9 +396,8 @@ class mrg_uniform(mrg_uniform_base): ...@@ -396,9 +396,8 @@ class mrg_uniform(mrg_uniform_base):
NORM = '4.656612873077392578125e-10' NORM = '4.656612873077392578125e-10'
return """ return """
//////// <code generated by mrg_uniform> //////// <code generated by mrg_uniform>
npy_int64 odims[%(ndim)s];
npy_intp odims[%(ndim)s]; npy_int64 n_elements = 1;
int n_elements = 1;
int n_streams = 0; int n_streams = 0;
int must_alloc_sample = ((NULL == %(o_sample)s) int must_alloc_sample = ((NULL == %(o_sample)s)
|| (PyArray_NDIM(%(o_sample)s) != %(ndim)s) || (PyArray_NDIM(%(o_sample)s) != %(ndim)s)
...@@ -432,18 +431,24 @@ class mrg_uniform(mrg_uniform_base): ...@@ -432,18 +431,24 @@ class mrg_uniform(mrg_uniform_base):
%(ndim)s, int(PyArray_DIMS(%(size)s)[0])); %(ndim)s, int(PyArray_DIMS(%(size)s)[0]));
%(fail)s %(fail)s
} }
if (PyArray_DESCR(%(size)s)->type_num != NPY_INT32)
{
PyErr_SetString(PyExc_ValueError, "size must be int32");
%(fail)s
}
for (int i = 0; i < %(ndim)s; ++i) for (int i = 0; i < %(ndim)s; ++i)
{ {
odims[i] = ((npy_int32*)(PyArray_BYTES(%(size)s) + PyArray_STRIDES(%(size)s)[0] * i))[0]; odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i);
n_elements *= odims[i]; n_elements *= odims[i];
must_alloc_sample = must_alloc_sample || (PyArray_DIMS(%(o_sample)s)[i] != odims[i]); must_alloc_sample = must_alloc_sample || (PyArray_DIMS(%(o_sample)s)[i] != odims[i]);
//fprintf(stderr, "size %%i %%i\\n", i, (int)odims[i]); //fprintf(stderr, "size %%i %%i\\n", i, (int)odims[i]);
//printf("%%li", n_elements);
}
//fprintf(stderr, "n_elements %%lld\\n", (long long)n_elements);
if (n_elements > M1)
{
PyErr_SetString(
PyExc_ValueError,
"rng_mrg cpu-implementation does not support more than (2**31 -1) samples");
%(fail)s
} }
if (must_alloc_sample) if (must_alloc_sample)
{ {
Py_XDECREF(%(o_sample)s); Py_XDECREF(%(o_sample)s);
...@@ -537,7 +542,7 @@ class mrg_uniform(mrg_uniform_base): ...@@ -537,7 +542,7 @@ class mrg_uniform(mrg_uniform_base):
""" % locals() """ % locals()
def c_code_cache_version(self): def c_code_cache_version(self):
return (3,) return (7, )
class GPU_mrg_uniform(mrg_uniform_base, GpuOp): class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
...@@ -548,8 +553,8 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp): ...@@ -548,8 +553,8 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
v_size = as_tensor_variable(size) v_size = as_tensor_variable(size)
if ndim is None: if ndim is None:
ndim = get_vector_length(v_size) ndim = get_vector_length(v_size)
op = cls(CudaNdarrayType((False,) * ndim)) op = cls(CudaNdarrayType((False,)*ndim))
return op(rstate, cast(v_size, 'int32')) return op(rstate, v_size)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
if self.output_type.dtype == 'float32': if self.output_type.dtype == 'float32':
...@@ -661,9 +666,9 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp): ...@@ -661,9 +666,9 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
SYNC = "CNDA_THREAD_SYNC" SYNC = "CNDA_THREAD_SYNC"
return """ return """
//////// <code generated by mrg_uniform> //////// <code generated by mrg_uniform>
npy_int64 M1 = 2147483647; //2^31 - 1
int odims[%(ndim)s]; npy_int64 odims[%(ndim)s];
int n_elements = 1; npy_int64 n_elements = 1;
int n_streams, n_streams_used_in_this_call; int n_streams, n_streams_used_in_this_call;
int must_alloc_sample = ((NULL == %(o_sample)s) int must_alloc_sample = ((NULL == %(o_sample)s)
|| !CudaNdarray_Check((PyObject*)%(o_sample)s) || !CudaNdarray_Check((PyObject*)%(o_sample)s)
...@@ -681,18 +686,23 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp): ...@@ -681,18 +686,23 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
%(ndim)s, PyArray_DIMS(%(size)s)[0]); %(ndim)s, PyArray_DIMS(%(size)s)[0]);
%(fail)s %(fail)s
} }
if (PyArray_DESCR(%(size)s)->type_num != NPY_INT32)
{
PyErr_SetString(PyExc_ValueError, "size must be int32");
%(fail)s
}
for (int i = 0; i < %(ndim)s; ++i) for (int i = 0; i < %(ndim)s; ++i)
{ {
odims[i] = ((npy_int32*)(PyArray_BYTES(%(size)s) + PyArray_STRIDES(%(size)s)[0] * i))[0]; odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i);
n_elements *= odims[i]; n_elements *= odims[i];
must_alloc_sample = (must_alloc_sample must_alloc_sample = (must_alloc_sample
|| CudaNdarray_HOST_DIMS(%(o_sample)s)[i] != odims[i]); || CudaNdarray_HOST_DIMS(%(o_sample)s)[i] != odims[i]);
} }
if (n_elements > M1)
{
PyErr_SetString(
PyExc_ValueError,
"rng_mrg gpu implementation does not support more than (2**31 -1) samples");
%(fail)s
}
if (must_alloc_sample) if (must_alloc_sample)
{ {
Py_XDECREF(%(o_sample)s); Py_XDECREF(%(o_sample)s);
...@@ -735,7 +745,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp): ...@@ -735,7 +745,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
%(fail)s; %(fail)s;
} }
n_streams = CudaNdarray_HOST_DIMS(%(o_rstate)s)[0]/6; n_streams = CudaNdarray_HOST_DIMS(%(o_rstate)s)[0]/6;
n_streams_used_in_this_call = std::min(n_streams, n_elements); n_streams_used_in_this_call = std::min(n_streams, (int)n_elements);
{ {
unsigned int threads_per_block = std::min((unsigned int)n_streams_used_in_this_call, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); unsigned int threads_per_block = std::min((unsigned int)n_streams_used_in_this_call, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
...@@ -775,7 +785,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp): ...@@ -775,7 +785,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
""" % locals() """ % locals()
def c_code_cache_version(self): def c_code_cache_version(self):
return (10,) return (12,)
class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
...@@ -790,8 +800,8 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -790,8 +800,8 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
v_size = as_tensor_variable(size) v_size = as_tensor_variable(size)
if ndim is None: if ndim is None:
ndim = get_vector_length(v_size) ndim = get_vector_length(v_size)
op = cls(GpuArrayType(dtype, (False,) * ndim)) op = cls(GpuArrayType(dtype, (False,)*ndim))
return op(rstate, cast(v_size, 'int32')) return op(rstate, v_size)
def c_headers(self): def c_headers(self):
return super(GPUA_mrg_uniform, self).c_headers() + ['numpy_compat.h'] return super(GPUA_mrg_uniform, self).c_headers() + ['numpy_compat.h']
...@@ -920,8 +930,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -920,8 +930,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
otypecode = str(self.output_type.typecode) otypecode = str(self.output_type.typecode)
return """ return """
npy_int64 M1 = 2147483647; //2^31 - 1
size_t odims[%(ndim)s]; size_t odims[%(ndim)s];
unsigned int n_elements = 1; size_t n_elements = 1;
unsigned int n_streams; unsigned int n_streams;
int must_alloc_sample = ((NULL == %(o_sample)s) int must_alloc_sample = ((NULL == %(o_sample)s)
|| !pygpu_GpuArray_Check((PyObject*)%(o_sample)s) || !pygpu_GpuArray_Check((PyObject*)%(o_sample)s)
...@@ -939,18 +950,22 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -939,18 +950,22 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
%(ndim)s, PyArray_DIMS(%(size)s)[0]); %(ndim)s, PyArray_DIMS(%(size)s)[0]);
%(fail)s %(fail)s
} }
if (PyArray_DESCR(%(size)s)->type_num != NPY_INT32)
{
PyErr_SetString(PyExc_ValueError, "size must be int32");
%(fail)s
}
for (int i = 0; i < %(ndim)s; ++i) for (int i = 0; i < %(ndim)s; ++i)
{ {
odims[i] = ((npy_int32 *)(PyArray_BYTES(%(size)s) + PyArray_STRIDES(%(size)s)[0] * i))[0]; odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i);
n_elements *= odims[i]; n_elements *= odims[i];
must_alloc_sample = (must_alloc_sample must_alloc_sample = (must_alloc_sample
|| PyGpuArray_DIMS(%(o_sample)s)[i] != odims[i]); || PyGpuArray_DIMS(%(o_sample)s)[i] != odims[i]);
} }
if (n_elements > M1)
{
PyErr_SetString(
PyExc_ValueError,
"rng_mrg gpu implementation does not support more than (2**31 -1) samples");
%(fail)s
}
if (must_alloc_sample) if (must_alloc_sample)
{ {
Py_XDECREF(%(o_sample)s); Py_XDECREF(%(o_sample)s);
...@@ -1026,7 +1041,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -1026,7 +1041,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
""" % locals() """ % locals()
def c_code_cache_version(self): def c_code_cache_version(self):
return (8,) return (11,)
def guess_n_streams(size, warn=False): def guess_n_streams(size, warn=False):
......
...@@ -4,6 +4,7 @@ import os ...@@ -4,6 +4,7 @@ import os
import sys import sys
import time import time
import unittest import unittest
import functools
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
from nose.tools import assert_raises from nose.tools import assert_raises
...@@ -1042,6 +1043,80 @@ def test_seed_fn(): ...@@ -1042,6 +1043,80 @@ def test_seed_fn():
assert numpy.allclose(fn3_val1, fn3_val3) == same assert numpy.allclose(fn3_val1, fn3_val3) == same
def rng_mrg_overflow(sizes, fct, mode, should_raise_error):
    """Compile and run a sampler for each requested size.

    For every shape in *sizes*, build a graph with ``fct(size=shape)``,
    compile it under *mode*, and either assert that calling it raises
    ``ValueError`` (when *should_raise_error* is true) or simply call it.
    """
    for shape in sizes:
        sample = fct(size=shape)
        fn = theano.function([], sample, mode=mode)
        theano.printing.debugprint(fn)
        if should_raise_error:
            assert_raises(ValueError, fn)
        else:
            fn()
def test_overflow_cpu():
    """CPU overflow check for MRG uniform sampling.

    Sizes whose element count exceeds 2**31 - 1 must raise ValueError;
    small sizes and numpy.int32 sizes must work.
    """
    # run with THEANO_FLAGS=mode=FAST_RUN,device=cpu,floatX=float32
    rng = MRG_RandomStreams(numpy.random.randint(1234))
    fct = rng.uniform
    # These element counts overflow int32, so they must raise.
    overflowing = [(2 ** 31, ), (2 ** 32, ),
                   (2 ** 15, 2 ** 16,), (2, 2 ** 15, 2 ** 15)]
    rng_mrg_overflow(overflowing, fct, config.mode, should_raise_error=True)
    # Small shapes must sample without error.
    small = [(2 ** 5, ), (2 ** 5, 2 ** 5), (2 ** 5, 2 ** 5, 2 ** 5)]
    rng_mrg_overflow(small, fct, config.mode, should_raise_error=False)
    # numpy.int32 entries in the size tuple must be accepted.
    int32_sizes = [(numpy.int32(2 ** 10), ),
                   (numpy.int32(2), numpy.int32(2 ** 10),
                    numpy.int32(2 ** 10))]
    rng_mrg_overflow(int32_sizes, fct, config.mode, should_raise_error=False)
def test_overflow_gpu_old_backend():
    """Overflow check for the old CUDA backend's MRG uniform sampler.

    Skipped when the optional cuda package is unavailable.
    """
    # run with THEANO_FLAGS=mode=FAST_RUN,init_gpu_device=gpu1,device=cpu
    if not cuda_available:
        raise SkipTest('Optional package cuda not available')
    mode = mode_with_gpu
    seed = 12345
    rng = MRG_RandomStreams(seed=seed, use_cuda=True)
    fct = rng.uniform
    # These element counts overflow int32, so they must raise.
    overflowing = [(2 ** 31, ), (2 ** 32, ),
                   (2 ** 15, 2 ** 16,), (2, 2 ** 15, 2 ** 15)]
    rng_mrg_overflow(overflowing, fct, mode, should_raise_error=True)
    # Small shapes must sample without error.
    small = [(2 ** 5, ), (2 ** 5, 2 ** 5), (2 ** 5, 2 ** 5, 2 ** 5)]
    rng_mrg_overflow(small, fct, mode, should_raise_error=False)
    # numpy.int32 entries in the size tuple must be accepted.
    int32_sizes = [(numpy.int32(2 ** 10), ),
                   (numpy.int32(2), numpy.int32(2 ** 10),
                    numpy.int32(2 ** 10))]
    rng_mrg_overflow(int32_sizes, fct, mode, should_raise_error=False)
def test_overflow_gpu_new_backend():
    """Overflow check for the new gpuarray backend's MRG uniform sampler.

    Builds the multi-stream rstate by hand, wraps it in a gpuarray shared
    variable, and drives GPUA_mrg_uniform.new through rng_mrg_overflow.
    """
    # run with THEANO_FLAGS=mode=FAST_RUN,init_gpu_device=cuda1,device=cpu
    from theano.sandbox.gpuarray.tests.test_basic_ops import \
        mode_with_gpu as mode
    from theano.sandbox.gpuarray.type import gpuarray_shared_constructor
    seed = 12345
    n_substreams = 7
    # First substream state: six copies of the seed, as int32.
    base_state = numpy.array([seed] * 6, dtype='int32')
    states = [base_state.copy()]
    # Each further substream is obtained by jumping ahead 2**72 steps.
    while len(states) < n_substreams:
        states.append(rng_mrg.ff_2p72(states[-1]))
    rstate = gpuarray_shared_constructor(numpy.asarray(states))
    fct = functools.partial(rng_mrg.GPUA_mrg_uniform.new, rstate,
                            ndim=None, dtype='float32')
    # These element counts overflow int32, so they must raise.
    overflowing = [(2 ** 31, ), (2 ** 32, ),
                   (2 ** 15, 2 ** 16,), (2, 2 ** 15, 2 ** 15)]
    rng_mrg_overflow(overflowing, fct, mode, should_raise_error=True)
    # Small shapes must sample without error.
    small = [(2 ** 5, ), (2 ** 5, 2 ** 5), (2 ** 5, 2 ** 5, 2 ** 5)]
    rng_mrg_overflow(small, fct, mode, should_raise_error=False)
    # numpy.int32 entries in the size tuple must be accepted.
    int32_sizes = [(numpy.int32(2 ** 10), ),
                   (numpy.int32(2), numpy.int32(2 ** 10),
                    numpy.int32(2 ** 10))]
    rng_mrg_overflow(int32_sizes, fct, mode, should_raise_error=False)
if __name__ == "__main__": if __name__ == "__main__":
rng = MRG_RandomStreams(numpy.random.randint(2147462579)) rng = MRG_RandomStreams(numpy.random.randint(2147462579))
print(theano.__file__) print(theano.__file__)
......
Markdown is supported
0%
You are adding 0 people to this discussion. Please proceed with caution.
Finish editing this comment first!
Register or sign in to comment