Commit a388d94d authored by Arnaud Bergeron

Remove tentacles in sandbox

Parent 9dcf3f4c
......@@ -10,12 +10,6 @@ from theano.tensor import NotScalarConstantError, get_scalar_constant_value
from theano.scalar import as_scalar
import copy
from theano.sandbox.cuda import cuda_available, GpuOp, register_opt
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
class MultinomialFromUniform(Op):
# TODO : need description for parameter 'odtype'
"""
......@@ -403,232 +397,6 @@ class ChoiceFromUniform(MultinomialFromUniform):
break
class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp):
    """
    GPU (old CUDA backend) implementation of MultinomialFromUniform.

    The output is transposed compared to MultinomialFromUniform.
    We must insert a Transpose op after it.
    The optimization that moves it to the gpu does it.
    """

    def make_node(self, pvals, unis):
        # Both inputs must already be float32 CudaNdarrays on the GPU;
        # the host<->GPU transfers are inserted by the optimizer
        # (local_gpu_multinomial), not here.
        assert pvals.dtype == 'float32'
        assert unis.dtype == 'float32'
        if not isinstance(pvals.type, CudaNdarrayType):
            raise TypeError('pvals must be cudandarray', pvals)
        if not isinstance(unis.type, CudaNdarrayType):
            raise TypeError('unis must be cudandarray', unis)
        if self.odtype == 'auto':
            odtype = pvals.dtype
        else:
            odtype = self.odtype
        # The CUDA kernel writes its output in-place in pvals' dtype, so
        # any other requested output dtype is not supported here.
        if odtype != pvals.dtype:
            raise NotImplementedError(
                'GpuMultinomialFromUniform works only if '
                'self.odtype == pvals.dtype', odtype, pvals.dtype)
        # The output broadcastable pattern is the transpose of pvals'
        # pattern: the kernel writes the result transposed (see the
        # class docstring).
        br = (pvals.broadcastable[1], pvals.broadcastable[0])
        out = CudaNdarrayType(broadcastable=br)()
        return Apply(self, [pvals, unis], [out])

    def perform(self, node, ins, outs):
        # The perform from parent don't work with CudaNdarray. We
        # don't need it as DebugMode will test again it as an
        # optimization insert the GPU op.
        return Op.perform(self, node, ins, outs)

    def c_code_cache_version(self):
        # Bump whenever the generated C/CUDA code below changes.
        return (9,)

    def c_support_code_apply(self, node, nodename):
        # CUDA kernel: one thread per multinomial draw (one row of
        # pvals).  Each thread accumulates the outcome probabilities
        # until the running sum exceeds its uniform sample, then writes
        # a one-hot result row -- transposed -- into global_outs.
        return """
        static __global__ void k_multi_warp_%(nodename)s(
            const int nb_multi,
            const int nb_outcomes,
            float * global_pvals,
            const int pvals_row_stride,
            const int pvals_col_stride,
            float * global_unis,
            const int unis_stride,
            float * global_outs,
            const int outs_row_stride,
            const int outs_col_stride
        )
        {
            // each thread takes care of one multinomial draw
            int n = blockDim.x*blockIdx.x + threadIdx.x;
            if (n < nb_multi)
            {
                float cummul = 0.;
                bool done = false;
                const float unis_n = global_unis[n*unis_stride];
                for (int m = 0; m < nb_outcomes; ++m)
                {
                    float current_out = 0.;
                    if (!done)
                    {
                        cummul += global_pvals[m * pvals_col_stride + n * pvals_row_stride];
                        if (unis_n < cummul)
                        {
                            current_out = 1.;
                            done = true;
                        }
                    }
                    //write out transposed for speed.
                    global_outs[n * outs_col_stride + m * outs_row_stride] = current_out;
                }
            }
        }
        """ % locals()

    def c_code(self, node, name, ins, outs, sub):
        # Host-side C code: rank/shape checks, (re)allocation of the
        # transposed output, kernel launch configuration, launch and
        # error check.
        (pvals, unis) = ins
        (z,) = outs
        fail = sub['fail']
        return """
        if (CudaNdarray_NDIM(%(pvals)s) != 2)
        {
            PyErr_Format(PyExc_TypeError, "pvals wrong rank");
            %(fail)s;
        }
        if (CudaNdarray_NDIM(%(unis)s) != 1)
        {
            PyErr_Format(PyExc_TypeError, "unis wrong rank");
            %(fail)s;
        }
        if (CudaNdarray_HOST_DIMS(%(unis)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[0])
        {
            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0]");
            %(fail)s;
        }
        //N.B. that the output is TRANSPOSED compared with pvals
        if ((NULL == %(z)s)
            || (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])
            || (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(pvals)s)[0]))
        {
            Py_XDECREF(%(z)s);
            npy_intp dims[2];
            dims[0] = (CudaNdarray_HOST_DIMS(%(pvals)s)[1]);
            dims[1] = (CudaNdarray_HOST_DIMS(%(pvals)s)[0]);
            %(z)s = (CudaNdarray*)CudaNdarray_NewDims(2, dims);
            if (!%(z)s)
            {
                PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
                %(fail)s;
            }
        }
        { // NESTED SCOPE
        int nb_multi = CudaNdarray_HOST_DIMS(%(pvals)s)[0];
        int nb_outcomes = CudaNdarray_HOST_DIMS(%(pvals)s)[1];
        //TODO : change this for a beautiful constant
        int max_nb_blocks = 2<<15 - 1;
        int nb_blocks = max_nb_blocks + 1;
        int nb_threads=16; // so it really starts at 32, because of the *2
        do
        {
            nb_threads*=2;
            if (nb_multi %% nb_threads == 0)
                nb_blocks = nb_multi/nb_threads;
            else
                nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
        } while (nb_blocks > max_nb_blocks);
        //printf("\\nN=%%i b=%%i t=%%i t*b=%%i", nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);
        // TODO : next line is a bit hardcoded...
        if (nb_threads > 512)
        {
            PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for so many rows in the matrix (%%i)", nb_multi);
            %(fail)s;
        }
        dim3 n_blocks(nb_blocks,1,1);
        dim3 n_threads(nb_threads,1,1);
        int n_shared = 0;
        assert(nb_blocks*nb_threads >= nb_multi);
        k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
            CudaNdarray_HOST_DIMS(%(z)s)[1],
            CudaNdarray_HOST_DIMS(%(z)s)[0],
            CudaNdarray_DEV_DATA(%(pvals)s),
            CudaNdarray_HOST_STRIDES(%(pvals)s)[0],
            CudaNdarray_HOST_STRIDES(%(pvals)s)[1],
            CudaNdarray_DEV_DATA(%(unis)s),
            CudaNdarray_HOST_STRIDES(%(unis)s)[0],
            CudaNdarray_DEV_DATA(%(z)s),
            CudaNdarray_HOST_STRIDES(%(z)s)[0],
            CudaNdarray_HOST_STRIDES(%(z)s)[1]
        );
        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess != sts)
        {
            PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i; shared: %%i)\\n",
                "k_multi_warp_%(name)s",
                cudaGetErrorString(sts),
                n_blocks.x,
                n_blocks.y,
                n_threads.x,
                n_threads.y,
                n_threads.z,
                n_shared);
            %(fail)s;
        }
        } // END NESTED SCOPE
        """ % locals()
@register_opt()
@local_optimizer([MultinomialFromUniform])
def local_gpu_multinomial(node):
    """Move a float32 MultinomialFromUniform to the GPU.

    Two graph patterns are handled:

    1. A ``MultinomialFromUniform`` whose inputs come from the GPU (one
       of them is the output of a ``HostFromGpu``): replace it with
       ``GpuMultinomialFromUniform`` and transfer the result back.
    2. ``GpuFromHost(MultinomialFromUniform(...))``: fuse the transfer
       into the GPU op directly.

    Only the single-sample case (``n_samples == 1``) is supported.
    Returns a one-element list with the replacement output, or ``None``
    when the node does not match.
    """
    if type(node.op) is MultinomialFromUniform:
        if len(node.inputs) == 2:
            p, u = node.inputs
            n_samples = 1
        else:
            p, u, n_samples = node.inputs
        try:
            if get_scalar_constant_value(n_samples) != 1:
                return None
        except NotScalarConstantError:
            return None
        m, = node.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32' and
            any([i.owner and isinstance(i.owner.op,
                                        theano.sandbox.cuda.HostFromGpu)
                 for i in node.inputs])):
            gpu_op = GpuMultinomialFromUniform(node.op.odtype)
            # The GPU op returns its result transposed; .T restores the
            # expected orientation on the CPU side.
            return [host_from_gpu(gpu_op(*[gpu_from_host(i)
                                           for i in [p, u]])).T]
    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost) and
            node.inputs[0].owner and
            type(node.inputs[0].owner.op) is MultinomialFromUniform):
        multi = node.inputs[0].owner
        # BUG FIX: inspect the inputs of the multinomial node, not of
        # the GpuFromHost node.  GpuFromHost always has exactly one
        # input, so the previous `node.inputs` unpacking could never
        # succeed (it always raised ValueError in the else branch).
        if len(multi.inputs) == 2:
            p, u = multi.inputs
            n_samples = 1
        else:
            p, u, n_samples = multi.inputs
        try:
            if get_scalar_constant_value(n_samples) != 1:
                return None
        except NotScalarConstantError:
            return None
        m, = multi.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32'):
            gpu_op = GpuMultinomialFromUniform(multi.op.odtype)
            ret = gpu_op(*[gpu_from_host(i) for i in [p, u]]).T
            # The dimshuffle is on the cpu, but will be moved to the
            # gpu by an opt.
            return [gpu_from_host(ret)]
class MultinomialWOReplacementFromUniform(ChoiceFromUniform):
def __init__(self, *args, **kwargs):
warnings.warn("MultinomialWOReplacementFromUniform is deprecated, "
......
......@@ -12,6 +12,7 @@ import numpy as np
from six import integer_types
from six.moves import xrange
import theano
from theano import Op, Apply, shared, config, Variable
from theano import gradient, function
from theano import tensor
......@@ -22,17 +23,11 @@ from theano.compile import optdb
from theano.gof import local_optimizer
from . import multinomial
import theano.sandbox.cuda
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable
from theano.gpuarray.basic_ops import GpuKernelBase, Kernel, infer_context_name, as_gpuarray_variable
from theano.gpuarray.type import GpuArrayType
from theano.gpuarray.fp16_help import write_w
from theano.gpuarray.opt import (register_opt as register_gpua,
register_opt2)
if theano.sandbox.cuda.cuda_available:
from theano.sandbox.cuda import (CudaNdarrayType,
float32_shared_constructor)
def matVecModM(A, s, m):
......@@ -562,264 +557,6 @@ class mrg_uniform(mrg_uniform_base):
return (8, )
class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
    # GPU VERSION
    # MRG31k3p uniform sampler for the old CUDA backend (CudaNdarray).
    # The `rstate` input packs per-stream generator states, 6 int32
    # words per stream; on the device those words travel inside a
    # float32 CudaNdarray (reinterpret-cast hack -- presumably set up
    # by MRG_RandomStreams; verify against the caller).

    def make_node(self, rstate, size):
        # error checking slightly redundant here, since
        # this op should not be called directly.
        #
        # call through MRG_RandomStreams instead.
        broad = []
        for i in range(self.output_type.ndim):
            # A dimension is broadcastable iff its requested size is
            # the constant 1.
            broad.append(tensor.extract_constant(size[i]) == 1)
        output_type = self.output_type.clone(broadcastable=broad)()
        rstate = as_cuda_ndarray_variable(rstate)
        # Outputs: the updated rstate and the sampled array.
        return Apply(self,
                     [rstate, size],
                     [rstate.type(), output_type])

    @classmethod
    def new(cls, rstate, ndim, dtype, size):
        # Alternate constructor: infer ndim from `size` when not given,
        # then build the op for a CudaNdarray output of that rank.
        v_size = as_tensor_variable(size)
        if ndim is None:
            ndim = get_vector_length(v_size)
        op = cls(CudaNdarrayType((False,) * ndim))
        return op(rstate, v_size)

    def c_support_code_apply(self, node, nodename):
        # Emit the CUDA kernel implementing the MRG31k3p recurrence.
        # Each thread owns one stream (6 words of state) and produces
        # every Nstreams_used-th sample of the output.
        if self.output_type.dtype == 'float32':
            otype = 'float'
            NORM = '4.6566126e-10f'  # np.float32(1.0/(2**31+65))
            # this was determined by finding the biggest number such that
            # np.float32(number * M1) < 1.0
        else:
            otype = 'double'
            NORM = '4.656612873077392578125e-10'
        return """
        // FB: I disable the printing of the warning, as we
        //receive too much email about this and this don't help
        //people. I'm not even sure if the "fix" to give the info about
        //the shape statically give a speed up. So I consider this
        //warning as useless until proved it can speed the user code.
        static int %(nodename)s_printed_warning = 1;

        static __global__ void %(nodename)s_mrg_uniform(
                %(otype)s*sample_data,
                npy_int32*state_data,
                const int Nsamples,
                const int Nstreams_used)
        {
            const npy_int32 i0 = 0;
            const npy_int32 i7 = 7;
            const npy_int32 i9 = 9;
            const npy_int32 i15 = 15;
            const npy_int32 i16 = 16;
            const npy_int32 i22 = 22;
            const npy_int32 i24 = 24;

            const npy_int32 M1 = 2147483647;      //2^31 - 1
            const npy_int32 M2 = 2147462579;      //2^31 - 21069
            const npy_int32 MASK12 = 511;         //2^9 - 1
            const npy_int32 MASK13 = 16777215;    //2^24 - 1
            const npy_int32 MASK2 = 65535;        //2^16 - 1
            const npy_int32 MULT2 = 21069;

            const unsigned int numThreads = blockDim.x * gridDim.x;
            const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
            npy_int32 y1, y2, x11, x12, x13, x21, x22, x23;

            if (idx < Nstreams_used)
            {
                x11 = state_data[idx*6+0];
                x12 = state_data[idx*6+1];
                x13 = state_data[idx*6+2];
                x21 = state_data[idx*6+3];
                x22 = state_data[idx*6+4];
                x23 = state_data[idx*6+5];

                for (int i = idx; i < Nsamples; i += Nstreams_used)
                {
                    y1 = ((x12 & MASK12) << i22) + (x12 >> i9) + ((x13 & MASK13) << i7) + (x13 >> i24);
                    y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0;
                    y1 += x13;
                    y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0;
                    x13 = x12;
                    x12 = x11;
                    x11 = y1;

                    y1 = ((x21 & MASK2) << i15) + (MULT2 * (x21 >> i16));
                    y1 -= (y1 < 0 || y1 >= M2) ? M2 : 0;
                    y2 = ((x23 & MASK2) << i15) + (MULT2 * (x23 >> i16));
                    y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;
                    y2 += x23;
                    y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;
                    y2 += y1;
                    y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;

                    x23 = x22;
                    x22 = x21;
                    x21 = y2;

                    if (x11 <= x21) {
                        sample_data[i] = (x11 - x21 + M1) * %(NORM)s;
                    }
                    else
                    {
                        sample_data[i] = (x11 - x21) * %(NORM)s;
                    }
                }
                state_data[idx*6+0]= x11;
                state_data[idx*6+1]= x12;
                state_data[idx*6+2]= x13;
                state_data[idx*6+3]= x21;
                state_data[idx*6+4]= x22;
                state_data[idx*6+5]= x23;
            }
        }

        """ % locals()

    def c_code(self, node, nodename, inp, out, sub):
        # Host-side C code: validate `size`, (re)allocate the sample
        # output, copy or alias the rstate (depending on self.inplace),
        # configure and launch the kernel, then check for CUDA errors.
        rstate, size = inp
        o_rstate, o_sample = out
        inplace = int(self.inplace)
        ndim = self.output_type.ndim
        o_type_num = np.asarray(0, dtype=self.output_type.dtype).dtype.num
        fail = sub['fail']
        if self.output_type.dtype == 'float32':
            otype = 'float'
        else:
            otype = 'double'
        SYNC = "CNDA_THREAD_SYNC"
        return """
        //////// <code generated by mrg_uniform>
        npy_int64 M1 = 2147483647;      //2^31 - 1
        // The +1 is to avoid odims[0] which fails on windows
        npy_int64 odims[%(ndim)s+1];
        npy_int64 n_elements = 1;
        int n_streams, n_streams_used_in_this_call;
        int must_alloc_sample = ((NULL == %(o_sample)s)
                || !CudaNdarray_Check((PyObject*)%(o_sample)s)
                || !CudaNdarray_is_c_contiguous(%(o_sample)s)
                || (CudaNdarray_NDIM(%(o_sample)s) != %(ndim)s));

        if (PyArray_NDIM(%(size)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "size must be vector");
            %(fail)s
        }
        if (PyArray_DIMS(%(size)s)[0] != %(ndim)s)
        {
            PyErr_Format(PyExc_ValueError, "size must have length %%i (not %%i)",
                %(ndim)s, PyArray_DIMS(%(size)s)[0]);
            %(fail)s
        }

        for (int i = 0; i < %(ndim)s; ++i)
        {
            odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i);
            n_elements *= odims[i];
            must_alloc_sample = (must_alloc_sample
                    || CudaNdarray_HOST_DIMS(%(o_sample)s)[i] != odims[i]);
        }
        if (n_elements > M1)
        {
            PyErr_SetString(
                PyExc_ValueError,
                "rng_mrg gpu implementation does not support more than (2**31 -1) samples");
            %(fail)s
        }

        if (must_alloc_sample)
        {
            Py_XDECREF(%(o_sample)s);
            %(o_sample)s = (CudaNdarray*)CudaNdarray_NewDims(%(ndim)s, odims);
            if(!%(o_sample)s)
            {
                %(fail)s;
            }
        }
        if (!CudaNdarray_Check((PyObject*)%(rstate)s))
        {
            PyErr_Format(PyExc_ValueError, "rstate must be cudandarray");
            %(fail)s;
        }

        Py_XDECREF(%(o_rstate)s);
        if (%(inplace)s)
        {
            Py_INCREF(%(rstate)s);
            %(o_rstate)s = %(rstate)s;
        }
        else
        {
            %(o_rstate)s = (CudaNdarray*)CudaNdarray_Copy(%(rstate)s);
            if (!%(o_rstate)s) {
                PyErr_SetString(PyExc_RuntimeError, "GPU_mrg_uniform: "
                                "could not copy rstate");
                %(fail)s
            }
        }

        if (CudaNdarray_NDIM(%(o_rstate)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "rstate must be vector");
            %(fail)s;
        }
        if (CudaNdarray_HOST_DIMS(%(o_rstate)s)[0] %% 6)
        {
            PyErr_Format(PyExc_ValueError, "rstate len must be multiple of 6");
            %(fail)s;
        }
        n_streams = CudaNdarray_HOST_DIMS(%(o_rstate)s)[0]/6;
        n_streams_used_in_this_call = std::min(n_streams, (int)n_elements);

        {
            unsigned int threads_per_block = std::min((unsigned int)n_streams_used_in_this_call, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
            unsigned int n_blocks = std::min(ceil_intdiv((unsigned int)n_streams_used_in_this_call, threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);
            if (n_streams > (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK * (unsigned int)NUM_VECTOR_OP_BLOCKS)
            {
                PyErr_Format(PyExc_ValueError, "On GPU, n_streams should be at most %%u",
                    (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK * (unsigned int)NUM_VECTOR_OP_BLOCKS);
                %(fail)s;
            }
            if (threads_per_block * n_blocks < n_streams)
            {
                if (! %(nodename)s_printed_warning)
                  fprintf(stderr, "WARNING: unused streams above %%i (Tune GPU_mrg get_n_streams)\\n", threads_per_block * n_blocks );
                %(nodename)s_printed_warning = 1;
            }

            %(nodename)s_mrg_uniform<<<n_blocks,threads_per_block>>>(
                CudaNdarray_DEV_DATA(%(o_sample)s),
                (npy_int32*)CudaNdarray_DEV_DATA(%(o_rstate)s),
                n_elements, n_streams_used_in_this_call);
        }

        %(SYNC)s;

        {
            cudaError_t err = cudaGetLastError();
            if( cudaSuccess != err)
            {
                PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "mrg_uniform", cudaGetErrorString(err));
                %(fail)s;
            }
        }

        //////// </ code generated by mrg_uniform>
        """ % locals()

    def c_code_cache_version(self):
        # Bump whenever the generated C/CUDA code above changes.
        return (12,)
class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
# GpuArray version
_f16_ok = True
......@@ -1131,7 +868,6 @@ def guess_n_streams(size, warn=False):
class MRG_RandomStreams(object):
# TODO : need description for parameter 'use_cuda'
"""
Module component with similar interface to numpy.random
(numpy.random.RandomState).
......@@ -1151,7 +887,7 @@ class MRG_RandomStreams(object):
# TODO : need description for method and return
return list(self.state_updates)
def __init__(self, seed=12345, use_cuda=None):
def __init__(self, seed=12345):
# A list of pairs of the form (input_r, output_r), representing the
# update rules of all the random states generated
# by this RandomStreams.
......@@ -1164,11 +900,6 @@ class MRG_RandomStreams(object):
self.set_rstate(seed)
if use_cuda is None:
self.use_cuda = theano.sandbox.cuda.cuda_enabled
else:
self.use_cuda = use_cuda
def set_rstate(self, seed):
# TODO : need description for method, parameter
if isinstance(seed, integer_types):
......@@ -1271,15 +1002,6 @@ class MRG_RandomStreams(object):
if inc_rstate:
self.inc_rstate()
if self.use_cuda and dtype == 'float32':
rval = rval.flatten()
# HACK - we use fact that int32 and float32 have same size to
# sneak ints into the CudaNdarray type.
# these *SHOULD NEVER BE USED AS FLOATS*
tmp_float_buf = np.frombuffer(rval.data, dtype='float32')
assert tmp_float_buf.shape == rval.shape
assert (tmp_float_buf.view('int32') == rval).all()
rval = tmp_float_buf
return rval
......@@ -1352,25 +1074,11 @@ class MRG_RandomStreams(object):
nstreams = self.n_streams(size)
rstates = self.get_substream_rstates(nstreams, dtype)
if self.use_cuda and dtype == 'float32':
node_rstate = float32_shared_constructor(rstates)
assert isinstance(node_rstate.type, CudaNdarrayType)
# we can't use the normal mrg_uniform constructor + later
# optimization
# because of the tmp_float_buf hack above. There is
# currently no Theano node that will do a frombuffer
# reinterpretation.
u = self.pretty_return(node_rstate,
*GPU_mrg_uniform.new(node_rstate,
ndim, dtype, size),
size=size, nstreams=orig_nstreams)
else:
node_rstate = shared(rstates)
u = self.pretty_return(node_rstate,
*mrg_uniform.new(node_rstate,
ndim, dtype, size),
size=size, nstreams=orig_nstreams)
node_rstate = shared(rstates)
u = self.pretty_return(node_rstate,
*mrg_uniform.new(node_rstate,
ndim, dtype, size),
size=size, nstreams=orig_nstreams)
# Add a reference to distinguish from other shared variables
node_rstate.tag.is_rng = True
r = u * (high - low) + low
......@@ -1387,10 +1095,7 @@ class MRG_RandomStreams(object):
nstreams=None):
# TODO : need description for method, parameter and return
if n == 1:
if dtype == 'float32' and self.use_cuda:
x = self.uniform(size=size, dtype=dtype, nstreams=nstreams)
else:
x = self.uniform(size=size, nstreams=nstreams)
x = self.uniform(size=size, nstreams=nstreams)
return cast(x < p, dtype)
else:
raise NotImplementedError("MRG_RandomStreams.binomial with n > 1")
......@@ -1630,7 +1335,7 @@ def local_gpua_mrg(node):
return local_gpua_mrg_graph(node.op, context_name, node.inputs, node.outputs)
MRG_RNGs = (mrg_uniform, GPU_mrg_uniform, GPUA_mrg_uniform)
MRG_RNGs = (mrg_uniform, GPUA_mrg_uniform)
@local_optimizer(MRG_RNGs)
......
......@@ -10,28 +10,11 @@ import theano
from theano import config, function, tensor
from theano.sandbox import multinomial
from theano.compile.mode import get_default_mode
import theano.sandbox.cuda as cuda
import theano.tests.unittest_tools as utt
from theano.compat import PY3
from theano.misc.pkl_utils import CompatUnpickler
def get_mode(gpu):
    """Return the compilation mode to use in these tests.

    FAST_COMPILE is upgraded to FAST_RUN so the C implementations get
    exercised; when ``gpu`` is true, the optimizations required to move
    the multinomial op to the GPU are additionally enabled.
    """
    chosen = get_default_mode()
    if theano.config.mode == 'FAST_COMPILE':
        chosen = theano.compile.get_mode('FAST_RUN')
    if not gpu:
        return chosen
    return chosen.including('gpu', 'gpu_local_optimizations',
                            'local_cut_gpu_host_gpu',
                            'local_gpu_multinomial')
def run_with_c(f, gpu=False):
    """Invoke test body ``f`` with the mode matching ``gpu``."""
    f(get_mode(gpu), gpu)
def test_n_samples_1():
p = tensor.fmatrix()
u = tensor.fvector()
......@@ -117,69 +100,52 @@ def test_multinomial_0():
m = multinomial.MultinomialFromUniform('auto')(p, u)
def body(mode, gpu):
# the m*2 allows the multinomial to reuse output
f = function([p, u], m * 2, allow_input_downcast=True, mode=mode)
# the m*2 allows the multinomial to reuse output
f = function([p, u], m * 2, allow_input_downcast=True)
if gpu:
assert any([type(node.op) is multinomial.GpuMultinomialFromUniform
for node in f.maker.fgraph.toposort()])
# test that both first and second samples can be drawn
utt.assert_allclose(f([[1, 0], [0, 1]], [.1, .1]),
[[2, 0], [0, 2]])
# test that both first and second samples can be drawn
utt.assert_allclose(f([[1, 0], [0, 1]], [.1, .1]),
[[2, 0], [0, 2]])
# test that both second labels can be drawn
r = f([[.2, .8], [.3, .7]], [.31, .31])
utt.assert_allclose(r, [[0, 2], [0, 2]])
# test that both second labels can be drawn
r = f([[.2, .8], [.3, .7]], [.31, .31])
utt.assert_allclose(r, [[0, 2], [0, 2]])
# test that both first labels can be drawn
r = f([[.2, .8], [.3, .7]], [.21, .21])
utt.assert_allclose(r, [[0, 2], [2, 0]])
# test that both first labels can be drawn
r = f([[.2, .8], [.3, .7]], [.21, .21])
utt.assert_allclose(r, [[0, 2], [2, 0]])
# change the size to make sure output gets reallocated ok
# and also make sure that the GPU version doesn't screw up the
# transposed-ness
r = f([[.2, .8]], [.25])
utt.assert_allclose(r, [[0, 2]])
run_with_c(body)
if cuda.cuda_available:
run_with_c(body, True)
# change the size to make sure output gets reallocated ok
# and also make sure that the GPU version doesn't screw up the
# transposed-ness
r = f([[.2, .8]], [.25])
utt.assert_allclose(r, [[0, 2]])
# TODO: check a bigger example (make sure blocking on GPU is handled correctly)
def test_multinomial_large():
# DEBUG_MODE will test this on GPU
def body(mode, gpu):
p = tensor.fmatrix()
u = tensor.fvector()
m = multinomial.MultinomialFromUniform('auto')(p, u)
f = function([p, u], m * 2, allow_input_downcast=True, mode=mode)
if gpu:
assert any([type(node.op) is multinomial.GpuMultinomialFromUniform
for node in f.maker.fgraph.toposort()])
pval = np.arange(10000 * 4, dtype='float32').reshape((10000, 4)) + 0.1
pval = pval / pval.sum(axis=1)[:, None]
uval = np.ones_like(pval[:, 0]) * 0.5
mval = f(pval, uval)
assert mval.shape == pval.shape
if config.cast_policy == 'custom':
assert mval.dtype == pval.dtype
elif config.cast_policy == 'numpy+floatX':
assert mval.dtype == config.floatX
elif config.cast_policy == 'numpy':
assert mval.dtype == 'float64'
else:
raise NotImplementedError(config.cast_policy)
utt.assert_allclose(mval.sum(axis=1), 2)
asdf = np.asarray([0, 0, 2, 0]) + 0 * pval
utt.assert_allclose(mval, asdf) # broadcast over all rows
run_with_c(body)
if cuda.cuda_available:
run_with_c(body, True)
p = tensor.fmatrix()
u = tensor.fvector()
m = multinomial.MultinomialFromUniform('auto')(p, u)
f = function([p, u], m * 2, allow_input_downcast=True, mode=mode)
pval = np.arange(10000 * 4, dtype='float32').reshape((10000, 4)) + 0.1
pval = pval / pval.sum(axis=1)[:, None]
uval = np.ones_like(pval[:, 0]) * 0.5
mval = f(pval, uval)
assert mval.shape == pval.shape
if config.cast_policy == 'custom':
assert mval.dtype == pval.dtype
elif config.cast_policy == 'numpy+floatX':
assert mval.dtype == config.floatX
elif config.cast_policy == 'numpy':
assert mval.dtype == 'float64'
else:
raise NotImplementedError(config.cast_policy)
utt.assert_allclose(mval.sum(axis=1), 2)
asdf = np.asarray([0, 0, 2, 0]) + 0 * pval
utt.assert_allclose(mval, asdf) # broadcast over all rows
def test_multinomial_dtypes():
......@@ -197,40 +163,3 @@ def test_multinomial_dtypes():
u = tensor.fvector()
m = multinomial.MultinomialFromUniform('float64')(p, u)
assert m.dtype == 'float64', m.dtype
def test_gpu_opt():
    # We test the case where we put the op on the gpu when the output
    # is moved to the gpu.
    if not cuda.cuda_available:
        # Skip test if cuda_ndarray is not available.
        from nose.plugins.skip import SkipTest
        raise SkipTest('Optional package cuda not available')

    def check(prob_var, n_rows):
        # Build the graph, push the output to the GPU, and verify the
        # optimizer substituted the GPU multinomial op before running.
        uni = tensor.fvector()
        samples = multinomial.MultinomialFromUniform('auto')(prob_var, uni)
        assert samples.dtype == 'float32', samples.dtype
        fn = function([prob_var, uni], cuda.gpu_from_host(samples),
                      allow_input_downcast=True, mode=get_mode(True))
        assert any(type(node.op) is multinomial.GpuMultinomialFromUniform
                   for node in fn.maker.fgraph.toposort())
        probs = np.arange(n_rows * 4,
                          dtype='float32').reshape((n_rows, 4)) + 0.1
        probs = probs / probs.sum(axis=1)[:, None]
        fn(probs, np.ones_like(probs[:, 0]) * 0.5)

    check(tensor.fmatrix(), 10000)
    # Test with a row, it was failing in the past.
    check(tensor.frow(), 1)
......@@ -15,28 +15,15 @@ import theano
from theano import tensor, config
from theano.sandbox import rng_mrg
from theano.sandbox.rng_mrg import MRG_RandomStreams
from theano.sandbox.cuda import cuda_available
from theano.tests import unittest_tools as utt
from theano.tests.unittest_tools import attr
import theano.gpuarray.tests.config
if cuda_available:
from theano.sandbox.cuda import float32_shared_constructor
# TODO: test gpu
# Done in test_consistency_GPU_{serial,parallel}
# TODO: test MRG_RandomStreams
# Partly done in test_consistency_randomstreams
# TODO: test optimizer mrg_random_make_inplace
# TODO: make tests work when no flags gived. Now need:
# THEANO_FLAGS=device=gpu0,floatX=float32
# Partly done, in test_consistency_GPU_{serial,parallel}
mode = config.mode
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
utt.seed_rng()
# Results generated by Java code using L'Ecuyer et al.'s code, with:
......@@ -53,61 +40,46 @@ def test_deterministic():
seed = utt.fetch_seed()
sample_size = (10, 20)
test_use_cuda = [False]
if cuda_available:
test_use_cuda.append(True)
for use_cuda in test_use_cuda:
# print 'use_cuda =', use_cuda
R = MRG_RandomStreams(seed=seed, use_cuda=use_cuda)
u = R.uniform(size=sample_size)
f = theano.function([], u)
R = MRG_RandomStreams(seed=seed)
u = R.uniform(size=sample_size)
f = theano.function([], u)
fsample1 = f()
fsample2 = f()
assert not np.allclose(fsample1, fsample2)
fsample1 = f()
fsample2 = f()
assert not np.allclose(fsample1, fsample2)
R2 = MRG_RandomStreams(seed=seed, use_cuda=use_cuda)
u2 = R2.uniform(size=sample_size)
g = theano.function([], u2)
gsample1 = g()
gsample2 = g()
assert np.allclose(fsample1, gsample1)
assert np.allclose(fsample2, gsample2)
R2 = MRG_RandomStreams(seed=seed)
u2 = R2.uniform(size=sample_size)
g = theano.function([], u2)
gsample1 = g()
gsample2 = g()
assert np.allclose(fsample1, gsample1)
assert np.allclose(fsample2, gsample2)
def test_consistency_randomstreams():
"""
Verify that the random numbers generated by MRG_RandomStreams
are the same as the reference (Java) implementation by L'Ecuyer et al.
"""
# Verify that the random numbers generated by MRG_RandomStreams
# are the same as the reference (Java) implementation by L'Ecuyer et al.
seed = 12345
n_samples = 5
n_streams = 12
n_substreams = 7
test_use_cuda = [False]
if cuda_available:
test_use_cuda.append(True)
for use_cuda in test_use_cuda:
# print 'use_cuda =', use_cuda
samples = []
rng = MRG_RandomStreams(seed=seed, use_cuda=use_cuda)
for i in range(n_streams):
stream_samples = []
u = rng.uniform(size=(n_substreams,), nstreams=n_substreams)
f = theano.function([], u)
for j in range(n_samples):
s = f()
stream_samples.append(s)
stream_samples = np.array(stream_samples)
stream_samples = stream_samples.T.flatten()
samples.append(stream_samples)
samples = []
rng = MRG_RandomStreams(seed=seed)
for i in range(n_streams):
stream_samples = []
u = rng.uniform(size=(n_substreams,), nstreams=n_substreams)
f = theano.function([], u)
for j in range(n_samples):
s = f()
stream_samples.append(s)
stream_samples = np.array(stream_samples)
stream_samples = stream_samples.T.flatten()
samples.append(stream_samples)
samples = np.array(samples).flatten()
assert(np.allclose(samples, java_samples))
samples = np.array(samples).flatten()
assert(np.allclose(samples, java_samples))
def test_get_substream_rstates():
......@@ -214,153 +186,6 @@ def test_consistency_cpu_parallel():
assert(np.allclose(samples, java_samples))
def test_consistency_GPU_serial():
    """
    Verify that the random numbers generated by GPU_mrg_uniform, serially,
    are the same as the reference (Java) implementation by L'Ecuyer et al.

    One substream at a time is advanced on the GPU (size=(1,)), drawing
    n_samples values from each of n_streams x n_substreams substreams,
    and the concatenation is compared against ``java_samples``.
    """
    if not cuda_available:
        raise SkipTest('Optional package cuda not available')
    if config.mode == 'FAST_COMPILE':
        mode = 'FAST_RUN'
    else:
        mode = config.mode

    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7

    samples = []
    # The MRG31k3p state is 6 int32 words; start from a constant seed.
    curr_rstate = np.array([seed] * 6, dtype='int32')

    for i in range(n_streams):
        stream_rstate = curr_rstate.copy()
        for j in range(n_substreams):
            substream_rstate = np.array(stream_rstate.copy(), dtype='int32')
            # HACK - we transfer these int32 to the GPU memory as float32
            # (reinterpret_cast)
            tmp_float_buf = np.frombuffer(substream_rstate.data,
                                          dtype='float32')
            # Transfer to device
            rstate = float32_shared_constructor(tmp_float_buf)

            new_rstate, sample = rng_mrg.GPU_mrg_uniform.new(rstate, ndim=None,
                                                             dtype='float32',
                                                             size=(1,))
            # Each call advances the generator state in place.
            rstate.default_update = new_rstate

            # Not really necessary, just mimicking
            # rng_mrg.MRG_RandomStreams' behavior
            sample.rstate = rstate
            sample.update = (rstate, new_rstate)

            # We need the sample back in the main memory
            cpu_sample = tensor.as_tensor_variable(sample)
            f = theano.function([], cpu_sample, mode=mode)

            for k in range(n_samples):
                s = f()
                samples.append(s)

            # next substream
            stream_rstate = rng_mrg.ff_2p72(stream_rstate)

        # next stream
        curr_rstate = rng_mrg.ff_2p134(curr_rstate)

    samples = np.array(samples).flatten()
    assert(np.allclose(samples, java_samples))
def test_consistency_GPU_parallel():
    """
    Verify that the random numbers generated by GPU_mrg_uniform, in
    parallel, are the same as the reference (Java) implementation by
    L'Ecuyer et al.

    Unlike the serial test, all n_substreams substreams of one stream
    are packed into a single rstate vector and sampled in one kernel
    call (size=(n_substreams,)).
    """
    if not cuda_available:
        raise SkipTest('Optional package cuda not available')
    if config.mode == 'FAST_COMPILE':
        mode = 'FAST_RUN'
    else:
        mode = config.mode

    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7  # 7 samples will be drawn in parallel

    samples = []
    # The MRG31k3p state is 6 int32 words; start from a constant seed.
    curr_rstate = np.array([seed] * 6, dtype='int32')

    for i in range(n_streams):
        stream_samples = []
        # Build the states of all substreams of this stream up front.
        rstate = [curr_rstate.copy()]
        for j in range(1, n_substreams):
            rstate.append(rng_mrg.ff_2p72(rstate[-1]))
        rstate = np.asarray(rstate).flatten()
        # HACK - transfer these int32 to the GPU memory as float32
        # (reinterpret_cast)
        tmp_float_buf = np.frombuffer(rstate.data, dtype='float32')
        # Transfer to device
        rstate = float32_shared_constructor(tmp_float_buf)

        new_rstate, sample = rng_mrg.GPU_mrg_uniform.new(rstate, ndim=None,
                                                         dtype='float32',
                                                         size=(n_substreams,))
        rstate.default_update = new_rstate

        # Not really necessary, just mimicking
        # rng_mrg.MRG_RandomStreams' behavior
        sample.rstate = rstate
        sample.update = (rstate, new_rstate)

        # We need the sample back in the main memory
        cpu_sample = tensor.as_tensor_variable(sample)
        f = theano.function([], cpu_sample, mode=mode)
        for k in range(n_samples):
            s = f()
            stream_samples.append(s)

        # Transpose so samples are grouped per substream, matching the
        # ordering of the reference sequence.
        samples.append(np.array(stream_samples).T.flatten())

        # next stream
        curr_rstate = rng_mrg.ff_2p134(curr_rstate)

    samples = np.array(samples).flatten()
    assert(np.allclose(samples, java_samples))
def test_GPU_nstreams_limit():
    """
    Verify that a ValueError is raised when n_streams
    is greater than 2**20 on GPU. This is the value of
    (NUM_VECTOR_OP_THREADS_PER_BLOCK * NUM_VECTOR_OP_BLOCKS).
    """
    if not cuda_available:
        raise SkipTest('Optional package cuda not available')

    seed = 12345
    R = MRG_RandomStreams(seed=seed, use_cuda=True)

    def eval_uniform(size, nstreams):
        # Compile and evaluate a uniform draw with the given number of
        # streams; the GPU C code enforces the stream-count limit.
        if theano.config.mode == "FAST_COMPILE":
            mode = "FAST_RUN"
        else:
            mode = copy.copy(theano.compile.get_default_mode())
            # Disable the Python fallback so the GPU C code path is the
            # one being exercised.
            mode.check_py_code = False
        out = R.uniform(size=size, nstreams=nstreams, dtype='float32')
        f = theano.function([], out, mode=mode)
        return f()

    # At the limit: should succeed.  One past the limit: ValueError.
    eval_uniform((10,), 2**20)
    assert_raises(ValueError, eval_uniform, (10,), 2**20 + 1)
def test_consistency_GPUA_serial():
# Verify that the random numbers generated by GPUA_mrg_uniform, serially,
# are the same as the reference (Java) implementation by L'Ecuyer et al.
......@@ -470,7 +295,7 @@ def test_GPUA_full_fill():
# This needs to be large to trigger the problem on GPU
size = (10, 1000)
R = MRG_RandomStreams(234, use_cuda=False)
R = MRG_RandomStreams(234)
uni = R.uniform(size, nstreams=60 * 256)
f_cpu = theano.function([], uni)
......@@ -568,7 +393,7 @@ def test_uniform():
# print ''
# print 'ON CPU with size=(%s):' % str(size)
x = tensor.matrix()
R = MRG_RandomStreams(234, use_cuda=False)
R = MRG_RandomStreams(234)
# Note: we specify `nstreams` to avoid a warning.
# TODO Look for all occurrences of `guess_n_streams` and `30 * 256`
# for such situations: it would be better to instead filter the
......@@ -592,31 +417,6 @@ def test_uniform():
steps_ = steps
basictest(f, steps_, const_size, prefix='mrg cpu', inputs=input)
if mode != 'FAST_COMPILE' and cuda_available:
# print ''
# print 'ON GPU with size=(%s):' % str(size)
R = MRG_RandomStreams(234, use_cuda=True)
u = R.uniform(size=size, dtype='float32',
nstreams=rng_mrg.guess_n_streams(size, warn=False))
# well, it's really that this test w GPU doesn't make sense otw
assert u.dtype == 'float32'
f = theano.function(var_input, theano.Out(
theano.sandbox.cuda.basic_ops.gpu_from_host(u),
borrow=True), mode=mode_with_gpu)
assert any([isinstance(node.op,
theano.sandbox.rng_mrg.GPU_mrg_uniform)
for node in f.maker.fgraph.toposort()])
# theano.printing.debugprint(f)
gpu_out = np.asarray(f(*input))
# print 'GPU: random?[:10], random?[-10:]'
# print gpu_out[0, 0:10]
# print gpu_out[-1, -10:]
basictest(f, steps_, const_size, prefix='mrg gpu', inputs=input)
np.testing.assert_array_almost_equal(cpu_out, gpu_out,
decimal=6)
# print ''
# print 'ON CPU w Numpy with size=(%s):' % str(size)
RR = theano.tensor.shared_randomstreams.RandomStreams(234)
......@@ -629,7 +429,7 @@ def test_uniform():
def test_broadcastable():
R = MRG_RandomStreams(234, use_cuda=False)
R = MRG_RandomStreams(234)
x = tensor.matrix()
size1 = (10, 1)
size2 = (x.shape[0], 1)
......@@ -695,7 +495,7 @@ def test_binomial():
def t_binomial(mean, size, const_size, var_input, input, steps, rtol):
R = MRG_RandomStreams(234, use_cuda=False)
R = MRG_RandomStreams(234)
u = R.binomial(size=size, p=mean)
f = theano.function(var_input, u, mode=mode)
out = f(*input)
......@@ -709,22 +509,6 @@ def t_binomial(mean, size, const_size, var_input, input, steps, rtol):
inputs=input, allow_01=True,
target_avg=mean, mean_rtol=rtol)
if mode != 'FAST_COMPILE' and cuda_available:
R = MRG_RandomStreams(234, use_cuda=True)
u = R.binomial(size=size, p=mean, dtype='float32')
# well, it's really that this test w GPU doesn't make sense otw
assert u.dtype == 'float32'
f = theano.function(var_input, theano.Out(
theano.sandbox.cuda.basic_ops.gpu_from_host(u),
borrow=True), mode=mode_with_gpu)
gpu_out = np.asarray(f(*input))
basictest(f, steps_, const_size, prefix='mrg gpu',
inputs=input, allow_01=True,
target_avg=mean, mean_rtol=rtol)
np.testing.assert_array_almost_equal(out, gpu_out,
decimal=6)
RR = theano.tensor.shared_randomstreams.RandomStreams(234)
uu = RR.binomial(size=size, p=mean)
......@@ -778,7 +562,7 @@ def test_normal0():
# print ''
# print 'ON CPU:'
R = MRG_RandomStreams(234, use_cuda=False)
R = MRG_RandomStreams(234)
# Note: we specify `nstreams` to avoid a warning.
n = R.normal(size=size, avg=avg, std=std,
nstreams=rng_mrg.guess_n_streams(size, warn=False))
......@@ -798,31 +582,6 @@ def test_normal0():
sys.stdout.flush()
if mode != 'FAST_COMPILE' and cuda_available:
# print ''
# print 'ON GPU:'
R = MRG_RandomStreams(234, use_cuda=True)
n = R.normal(size=size, avg=avg, std=std, dtype='float32',
nstreams=rng_mrg.guess_n_streams(size, warn=False))
# well, it's really that this test w GPU doesn't make sense otw
assert n.dtype == 'float32'
f = theano.function(var_input, theano.Out(
theano.sandbox.cuda.basic_ops.gpu_from_host(n),
borrow=True), mode=mode_with_gpu)
# theano.printing.debugprint(f)
sys.stdout.flush()
gpu_out = np.asarray(f(*input))
# print 'random?[:10]\n', gpu_out[0, 0:10]
# print '----'
sys.stdout.flush()
basictest(f, steps_, const_size, target_avg=avg, target_std=std,
prefix='gpu mrg ', allow_01=True, inputs=input,
mean_rtol=rtol, std_tol=std_tol)
# Need to allow some rounding error as their is float
# computation that are done on the gpu vs cpu
assert np.allclose(out, gpu_out, rtol=5e-6, atol=5e-6)
# print ''
# print 'ON CPU w NUMPY:'
RR = theano.tensor.shared_randomstreams.RandomStreams(234)
......@@ -877,7 +636,7 @@ def test_multinomial():
pvals = np.asarray(np.random.uniform(size=sample_size))
pvals = np.apply_along_axis(lambda row: row / np.sum(row), 1, pvals)
R = MRG_RandomStreams(234, use_cuda=False)
R = MRG_RandomStreams(234)
# Note: we specify `nstreams` to avoid a warning.
m = R.multinomial(pvals=pvals, dtype=config.floatX, nstreams=30 * 256)
f = theano.function([], m, mode=mode_)
......@@ -886,29 +645,6 @@ def test_multinomial():
basic_multinomialtest(f, steps, sample_size, pvals, n_samples=1,
prefix='mrg ')
sys.stdout.flush()
if mode != 'FAST_COMPILE' and cuda_available:
# print ''
# print 'ON GPU:'
R = MRG_RandomStreams(234, use_cuda=True)
pvals = np.asarray(pvals, dtype='float32')
# We give the number of streams to avoid a warning.
n = R.multinomial(pvals=pvals, dtype='float32', nstreams=30 * 256)
# well, it's really that this test w GPU doesn't make sense otw
assert n.dtype == 'float32'
f = theano.function(
[],
theano.sandbox.cuda.basic_ops.gpu_from_host(n),
mode=mode_.including('gpu'))
# theano.printing.debugprint(f)
gpu_out = f()
sys.stdout.flush()
basic_multinomialtest(f, steps, sample_size, pvals, n_samples=1,
prefix='gpu mrg ')
np.testing.assert_array_almost_equal(out, gpu_out, decimal=6)
def test_multinomial_n_samples():
mode_ = mode
......@@ -924,7 +660,7 @@ def test_multinomial_n_samples():
pvals = np.asarray(np.random.uniform(size=sample_size))
pvals = np.apply_along_axis(lambda row: row / np.sum(row), 1, pvals)
R = MRG_RandomStreams(234, use_cuda=False)
R = MRG_RandomStreams(234)
for n_samples, steps in zip([5, 10, 100, 1000], [20, 10, 1, 1]):
m = R.multinomial(pvals=pvals, n=n_samples,
......@@ -934,26 +670,11 @@ def test_multinomial_n_samples():
n_samples, prefix='mrg ')
sys.stdout.flush()
if mode != 'FAST_COMPILE' and cuda_available:
R = MRG_RandomStreams(234, use_cuda=True)
pvals = np.asarray(pvals, dtype='float32')
n = R.multinomial(pvals=pvals, n=n_samples,
dtype='float32', nstreams=30 * 256)
assert n.dtype == 'float32'
f = theano.function(
[],
theano.sandbox.cuda.basic_ops.gpu_from_host(n),
mode=mode_.including('gpu'))
sys.stdout.flush()
basic_multinomialtest(f, steps, sample_size, pvals,
n_samples, prefix='gpu mrg ')
class T_MRG(unittest.TestCase):
def test_bad_size(self):
R = MRG_RandomStreams(234, use_cuda=False)
R = MRG_RandomStreams(234)
for size in [
(0, 100),
......@@ -1055,54 +776,43 @@ def test_multMatVect():
def test_seed_fn():
    """Check that ``MRG_RandomStreams.seed`` deterministically re-seeds.

    Diff-residue fix: this span contained both the pre-change body (which
    looped over ``use_cuda`` in [False, True]) and the post-change body
    concatenated by the diff rendering; only the post-change, CPU-only
    body is kept here, re-indented.

    For each candidate seed we build three functions drawing uniforms of
    different shapes/stream counts, sample twice, re-seed, sample twice
    more, and assert the re-seeded draws match the originals exactly when
    (and only when) the new seed equals the original seed 234.
    ``new_seed=None`` re-applies the constructor seed, so it also matches.
    """
    idx = tensor.ivector()
    for new_seed, same in [(234, True), (None, True), (23, False)]:
        random = MRG_RandomStreams(234)
        fn1 = theano.function([], random.uniform((2, 2), dtype='float32'))
        fn2 = theano.function([], random.uniform((3, 3), nstreams=2,
                                                 dtype='float32'))
        fn3 = theano.function([idx],
                              random.uniform(idx, nstreams=3, ndim=1,
                                             dtype='float32'))

        # Consecutive calls must produce fresh (different) samples.
        fn1_val0 = fn1()
        fn1_val1 = fn1()
        assert not np.allclose(fn1_val0, fn1_val1)
        fn2_val0 = fn2()
        fn2_val1 = fn2()
        assert not np.allclose(fn2_val0, fn2_val1)
        fn3_val0 = fn3([4])
        fn3_val1 = fn3([4])
        assert not np.allclose(fn3_val0, fn3_val1)
        assert fn1_val0.size == 4
        assert fn2_val0.size == 9

        random.seed(new_seed)

        fn1_val2 = fn1()
        fn1_val3 = fn1()
        fn2_val2 = fn2()
        fn2_val3 = fn2()
        fn3_val2 = fn3([4])
        fn3_val3 = fn3([4])
        # After re-seeding, draws repeat iff the seed is effectively the same.
        assert np.allclose(fn1_val0, fn1_val2) == same
        assert np.allclose(fn1_val1, fn1_val3) == same
        assert np.allclose(fn2_val0, fn2_val2) == same
        assert np.allclose(fn2_val1, fn2_val3) == same
        assert np.allclose(fn3_val0, fn3_val2) == same
        assert np.allclose(fn3_val1, fn3_val3) == same
def rng_mrg_overflow(sizes, fct, mode, should_raise_error):
......@@ -1132,28 +842,7 @@ def test_overflow_cpu():
rng_mrg_overflow(sizes, fct, config.mode, should_raise_error=False)
def test_overflow_gpu_old_backend():
    # Run with THEANO_FLAGS=mode=FAST_RUN,init_gpu_device=gpu1,device=cpu
    """Size-overflow handling of ``uniform`` on the old CUDA backend.

    Shapes whose total element count overflows must raise an error;
    small shapes and shapes given as ``np.int32`` must be accepted.
    """
    if not cuda_available:
        raise SkipTest('Optional package cuda not available')
    gpu_mode = mode_with_gpu
    sampler = MRG_RandomStreams(seed=12345, use_cuda=True).uniform
    cases = [
        # (list of sizes, should_raise_error)
        ([(2 ** 31, ), (2 ** 32, ), (2 ** 15, 2 ** 16,),
          (2, 2 ** 15, 2 ** 15)], True),      # element count overflows
        ([(2 ** 5, ), (2 ** 5, 2 ** 5),
          (2 ** 5, 2 ** 5, 2 ** 5)], False),  # comfortably in range
        ([(np.int32(2 ** 10), ),
          (np.int32(2), np.int32(2 ** 10),
           np.int32(2 ** 10))], False),       # int32 sizes must be supported
    ]
    for sizes, should_raise in cases:
        rng_mrg_overflow(sizes, sampler, gpu_mode,
                         should_raise_error=should_raise)
def test_overflow_gpu_new_backend():
# run with THEANO_FLAGS=mode=FAST_RUN,init_gpu_device=cuda1,device=cpu
from theano.gpuarray.tests.test_basic_ops import \
mode_with_gpu as mode
from theano.gpuarray.type import gpuarray_shared_constructor
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论