Commit a388d94d authored by Arnaud Bergeron

Remove tentacles in sandbox

Parent 9dcf3f4c
...@@ -10,12 +10,6 @@ from theano.tensor import NotScalarConstantError, get_scalar_constant_value ...@@ -10,12 +10,6 @@ from theano.tensor import NotScalarConstantError, get_scalar_constant_value
from theano.scalar import as_scalar from theano.scalar import as_scalar
import copy import copy
from theano.sandbox.cuda import cuda_available, GpuOp, register_opt
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
class MultinomialFromUniform(Op): class MultinomialFromUniform(Op):
# TODO : need description for parameter 'odtype' # TODO : need description for parameter 'odtype'
""" """
...@@ -403,232 +397,6 @@ class ChoiceFromUniform(MultinomialFromUniform): ...@@ -403,232 +397,6 @@ class ChoiceFromUniform(MultinomialFromUniform):
break break
class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp):
    """
    CUDA-backend Op: draw one multinomial sample per row of `pvals`
    using the matching row of `unis` as the uniform deviate.

    The output is transposed compared to MultinomialFromUniform.
    We must insert a Transpose op after it.
    The optimization that moves it to the gpu does it.
    """

    def make_node(self, pvals, unis):
        # Both inputs must be float32 tensors already living on the GPU.
        assert pvals.dtype == 'float32'
        assert unis.dtype == 'float32'
        if not isinstance(pvals.type, CudaNdarrayType):
            raise TypeError('pvals must be cudandarray', pvals)
        if not isinstance(unis.type, CudaNdarrayType):
            raise TypeError('unis must be cudandarray', unis)
        if self.odtype == 'auto':
            odtype = pvals.dtype
        else:
            odtype = self.odtype
        if odtype != pvals.dtype:
            raise NotImplementedError(
                'GpuMultinomialFromUniform works only if '
                'self.odtype == pvals.dtype', odtype, pvals.dtype)
        # The kernel writes its result transposed, so the output type
        # swaps the broadcastable pattern of pvals.
        br = (pvals.broadcastable[1], pvals.broadcastable[0])
        out = CudaNdarrayType(broadcastable=br)()
        return Apply(self, [pvals, unis], [out])

    def perform(self, node, ins, outs):
        # The perform from parent don't work with CudaNdarray. We
        # don't need it as DebugMode will test again it as an
        # optimization insert the GPU op.
        return Op.perform(self, node, ins, outs)

    def c_code_cache_version(self):
        # Bump this when the generated C/CUDA code below changes.
        return (9,)

    def c_support_code_apply(self, node, nodename):
        # CUDA kernel: one thread per multinomial draw (one row of
        # pvals).  Each thread scans the outcomes, accumulating the
        # probabilities until the cumulative sum exceeds its uniform
        # sample, and writes a one-hot result (transposed).
        return """
        static __global__ void k_multi_warp_%(nodename)s(
            const int nb_multi,
            const int nb_outcomes,
            float * global_pvals,
            const int pvals_row_stride,
            const int pvals_col_stride,
            float * global_unis,
            const int unis_stride,
            float * global_outs,
            const int outs_row_stride,
            const int outs_col_stride
        )
        {
            // each thread takes care of one multinomial draw
            int n = blockDim.x*blockIdx.x + threadIdx.x;
            if (n < nb_multi)
            {
                float cummul = 0.;
                bool done = false;
                const float unis_n = global_unis[n*unis_stride];
                for (int m = 0; m < nb_outcomes; ++m)
                {
                    float current_out = 0.;
                    if (!done)
                    {
                        cummul += global_pvals[m * pvals_col_stride + n * pvals_row_stride];
                        if (unis_n < cummul)
                        {
                            current_out = 1.;
                            done = true;
                        }
                    }
                    //write out transposed for speed.
                    global_outs[n * outs_col_stride + m * outs_row_stride] = current_out;
                }
            }
        }
        """ % locals()

    def c_code(self, node, name, ins, outs, sub):
        # Host-side C: validate ranks and shapes, (re)allocate the
        # TRANSPOSED output if needed, pick a launch configuration, and
        # run the kernel defined in c_support_code_apply.
        # NOTE(review): in the generated C, `2<<15 - 1` parses as
        # `2 << (15 - 1)` == 32768 because of C operator precedence; the
        # original TODO comment already flags this constant as suspect.
        (pvals, unis) = ins
        (z,) = outs
        fail = sub['fail']
        return """
        if (CudaNdarray_NDIM(%(pvals)s) != 2)
        {
            PyErr_Format(PyExc_TypeError, "pvals wrong rank");
            %(fail)s;
        }
        if (CudaNdarray_NDIM(%(unis)s) != 1)
        {
            PyErr_Format(PyExc_TypeError, "unis wrong rank");
            %(fail)s;
        }
        if (CudaNdarray_HOST_DIMS(%(unis)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[0])
        {
            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0]");
            %(fail)s;
        }

        //N.B. that the output is TRANSPOSED compared with pvals
        if ((NULL == %(z)s)
            || (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])
            || (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(pvals)s)[0]))
        {
            Py_XDECREF(%(z)s);
            npy_intp dims[2];
            dims[0] = (CudaNdarray_HOST_DIMS(%(pvals)s)[1]);
            dims[1] = (CudaNdarray_HOST_DIMS(%(pvals)s)[0]);
            %(z)s = (CudaNdarray*)CudaNdarray_NewDims(2, dims);
            if (!%(z)s)
            {
                PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
                %(fail)s;
            }
        }

        { // NESTED SCOPE
            int nb_multi = CudaNdarray_HOST_DIMS(%(pvals)s)[0];
            int nb_outcomes = CudaNdarray_HOST_DIMS(%(pvals)s)[1];
            //TODO : change this for a beautiful constant
            int max_nb_blocks = 2<<15 - 1;
            int nb_blocks = max_nb_blocks + 1;
            int nb_threads=16; // so it really starts at 32, because of the *2
            do
            {
                nb_threads*=2;
                if (nb_multi %% nb_threads == 0)
                    nb_blocks = nb_multi/nb_threads;
                else
                    nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
            } while (nb_blocks > max_nb_blocks);

            //printf("\\nN=%%i b=%%i t=%%i t*b=%%i", nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);

            // TODO : next line is a bit hardcoded...
            if (nb_threads > 512)
            {
                PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for so many rows in the matrix (%%i)", nb_multi);
                %(fail)s;
            }
            dim3 n_blocks(nb_blocks,1,1);
            dim3 n_threads(nb_threads,1,1);
            int n_shared = 0;

            assert(nb_blocks*nb_threads >= nb_multi);

            k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
                CudaNdarray_HOST_DIMS(%(z)s)[1],
                CudaNdarray_HOST_DIMS(%(z)s)[0],
                CudaNdarray_DEV_DATA(%(pvals)s),
                CudaNdarray_HOST_STRIDES(%(pvals)s)[0],
                CudaNdarray_HOST_STRIDES(%(pvals)s)[1],
                CudaNdarray_DEV_DATA(%(unis)s),
                CudaNdarray_HOST_STRIDES(%(unis)s)[0],
                CudaNdarray_DEV_DATA(%(z)s),
                CudaNdarray_HOST_STRIDES(%(z)s)[0],
                CudaNdarray_HOST_STRIDES(%(z)s)[1]
            );
            CNDA_THREAD_SYNC;
            cudaError_t sts = cudaGetLastError();
            if (cudaSuccess != sts)
            {
                PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i; shared: %%i)\\n",
                    "k_multi_warp_%(name)s",
                    cudaGetErrorString(sts),
                    n_blocks.x,
                    n_blocks.y,
                    n_threads.x,
                    n_threads.y,
                    n_threads.z,
                    n_shared);
                %(fail)s;
            }
        } // END NESTED SCOPE
        """ % locals()
@register_opt()
@local_optimizer([MultinomialFromUniform])
def local_gpu_multinomial(node):
    """Graph optimization moving single-sample MultinomialFromUniform
    nodes to the GPU.

    Two patterns are rewritten:
      * a MultinomialFromUniform whose inputs come from the GPU
        (through HostFromGpu);
      * gpu_from_host(multinomial(...)), i.e. the output is being
        transferred to the GPU.

    Returns a replacement list, or None when the node does not match
    (more than one sample requested, or non-float32 dtypes).
    """
    if type(node.op) is MultinomialFromUniform:
        if len(node.inputs) == 2:
            p, u = node.inputs
            n_samples = 1
        else:
            p, u, n_samples = node.inputs
        try:
            # The GPU op only supports drawing a single sample.
            if get_scalar_constant_value(n_samples) != 1:
                return None
        except NotScalarConstantError:
            return None
        m, = node.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32' and
            any([i.owner and isinstance(i.owner.op,
                                        theano.sandbox.cuda.HostFromGpu)
                 for i in node.inputs])):
            gpu_op = GpuMultinomialFromUniform(node.op.odtype)
            # The GPU op produces a transposed result; `.T` restores the
            # layout callers expect.
            return [host_from_gpu(gpu_op(*[gpu_from_host(i)
                                           for i in [p, u]])).T]
    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost) and
            node.inputs[0].owner and
            type(node.inputs[0].owner.op) is MultinomialFromUniform):
        multi = node.inputs[0].owner
        # BUG FIX: a GpuFromHost node has exactly one input, so the
        # previous code that unpacked `node.inputs` into (p, u) or
        # (p, u, n_samples) here could never succeed.  The inputs of the
        # multinomial node itself must be inspected instead.
        if len(multi.inputs) == 2:
            p, u = multi.inputs
            n_samples = 1
        else:
            p, u, n_samples = multi.inputs
        try:
            if get_scalar_constant_value(n_samples) != 1:
                return None
        except NotScalarConstantError:
            return None
        m, = multi.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32'):
            gpu_op = GpuMultinomialFromUniform(multi.op.odtype)
            ret = gpu_op(*[gpu_from_host(i) for i in [p, u]]).T
            # The dimshuffle is on the cpu, but will be moved to the
            # gpu by an opt.
            return [gpu_from_host(ret)]
class MultinomialWOReplacementFromUniform(ChoiceFromUniform): class MultinomialWOReplacementFromUniform(ChoiceFromUniform):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
warnings.warn("MultinomialWOReplacementFromUniform is deprecated, " warnings.warn("MultinomialWOReplacementFromUniform is deprecated, "
......
...@@ -12,6 +12,7 @@ import numpy as np ...@@ -12,6 +12,7 @@ import numpy as np
from six import integer_types from six import integer_types
from six.moves import xrange from six.moves import xrange
import theano
from theano import Op, Apply, shared, config, Variable from theano import Op, Apply, shared, config, Variable
from theano import gradient, function from theano import gradient, function
from theano import tensor from theano import tensor
...@@ -22,17 +23,11 @@ from theano.compile import optdb ...@@ -22,17 +23,11 @@ from theano.compile import optdb
from theano.gof import local_optimizer from theano.gof import local_optimizer
from . import multinomial from . import multinomial
import theano.sandbox.cuda
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable
from theano.gpuarray.basic_ops import GpuKernelBase, Kernel, infer_context_name, as_gpuarray_variable from theano.gpuarray.basic_ops import GpuKernelBase, Kernel, infer_context_name, as_gpuarray_variable
from theano.gpuarray.type import GpuArrayType from theano.gpuarray.type import GpuArrayType
from theano.gpuarray.fp16_help import write_w from theano.gpuarray.fp16_help import write_w
from theano.gpuarray.opt import (register_opt as register_gpua, from theano.gpuarray.opt import (register_opt as register_gpua,
register_opt2) register_opt2)
if theano.sandbox.cuda.cuda_available:
from theano.sandbox.cuda import (CudaNdarrayType,
float32_shared_constructor)
def matVecModM(A, s, m): def matVecModM(A, s, m):
...@@ -562,264 +557,6 @@ class mrg_uniform(mrg_uniform_base): ...@@ -562,264 +557,6 @@ class mrg_uniform(mrg_uniform_base):
return (8, ) return (8, )
class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
    # GPU VERSION (old CUDA backend).
    # Draws uniform samples with the MRG31k3p generator; the 6-word
    # recurrence states live on the device as an int32 vector that is
    # stored (reinterpret-cast) inside a float32 CudaNdarray.

    def make_node(self, rstate, size):
        # error checking slightly redundant here, since
        # this op should not be called directly.
        #
        # call through MRG_RandomStreams instead.
        broad = []
        for i in range(self.output_type.ndim):
            # A dimension whose requested size is the constant 1 is
            # marked broadcastable.
            broad.append(tensor.extract_constant(size[i]) == 1)
        output_type = self.output_type.clone(broadcastable=broad)()
        rstate = as_cuda_ndarray_variable(rstate)
        return Apply(self,
                     [rstate, size],
                     [rstate.type(), output_type])

    @classmethod
    def new(cls, rstate, ndim, dtype, size):
        # Alternate constructor: infer ndim from `size` when not given,
        # then build the op and apply it.
        v_size = as_tensor_variable(size)
        if ndim is None:
            ndim = get_vector_length(v_size)
        op = cls(CudaNdarrayType((False,) * ndim))
        return op(rstate, v_size)

    def c_support_code_apply(self, node, nodename):
        if self.output_type.dtype == 'float32':
            otype = 'float'
            NORM = '4.6566126e-10f'  # np.float32(1.0/(2**31+65))
            # this was determined by finding the biggest number such that
            # np.float32(number * M1) < 1.0
        else:
            otype = 'double'
            NORM = '4.656612873077392578125e-10'
        # CUDA kernel implementing the MRG31k3p double recurrence.
        # Each thread owns one 6-word stream state and produces every
        # Nstreams_used-th sample, writing the updated state back at
        # the end.
        return """
        // FB: I disable the printing of the warning, as we
        //receive too much email about this and this don't help
        //people. I'm not even sure if the "fix" to give the info about
        //the shape statically give a speed up. So I consider this
        //warning as useless until proved it can speed the user code.
        static int %(nodename)s_printed_warning = 1;

        static __global__ void %(nodename)s_mrg_uniform(
                %(otype)s*sample_data,
                npy_int32*state_data,
                const int Nsamples,
                const int Nstreams_used)
        {
            const npy_int32 i0 = 0;
            const npy_int32 i7 = 7;
            const npy_int32 i9 = 9;
            const npy_int32 i15 = 15;
            const npy_int32 i16 = 16;
            const npy_int32 i22 = 22;
            const npy_int32 i24 = 24;

            const npy_int32 M1 = 2147483647;      //2^31 - 1
            const npy_int32 M2 = 2147462579;      //2^31 - 21069
            const npy_int32 MASK12 = 511;         //2^9 - 1
            const npy_int32 MASK13 = 16777215;    //2^24 - 1
            const npy_int32 MASK2 = 65535;        //2^16 - 1
            const npy_int32 MULT2 = 21069;

            const unsigned int numThreads = blockDim.x * gridDim.x;
            const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
            npy_int32 y1, y2, x11, x12, x13, x21, x22, x23;

            if (idx < Nstreams_used)
            {
                x11 = state_data[idx*6+0];
                x12 = state_data[idx*6+1];
                x13 = state_data[idx*6+2];
                x21 = state_data[idx*6+3];
                x22 = state_data[idx*6+4];
                x23 = state_data[idx*6+5];

                for (int i = idx; i < Nsamples; i += Nstreams_used)
                {
                    y1 = ((x12 & MASK12) << i22) + (x12 >> i9) + ((x13 & MASK13) << i7) + (x13 >> i24);
                    y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0;
                    y1 += x13;
                    y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0;
                    x13 = x12;
                    x12 = x11;
                    x11 = y1;

                    y1 = ((x21 & MASK2) << i15) + (MULT2 * (x21 >> i16));
                    y1 -= (y1 < 0 || y1 >= M2) ? M2 : 0;
                    y2 = ((x23 & MASK2) << i15) + (MULT2 * (x23 >> i16));
                    y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;
                    y2 += x23;
                    y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;
                    y2 += y1;
                    y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;

                    x23 = x22;
                    x22 = x21;
                    x21 = y2;

                    if (x11 <= x21) {
                        sample_data[i] = (x11 - x21 + M1) * %(NORM)s;
                    }
                    else
                    {
                        sample_data[i] = (x11 - x21) * %(NORM)s;
                    }
                }

                state_data[idx*6+0]= x11;
                state_data[idx*6+1]= x12;
                state_data[idx*6+2]= x13;
                state_data[idx*6+3]= x21;
                state_data[idx*6+4]= x22;
                state_data[idx*6+5]= x23;
            }
        }
        """ % locals()

    def c_code(self, node, nodename, inp, out, sub):
        # Host-side C: validate `size`, (re)allocate the sample output
        # if shape/contiguity changed, copy the rstate unless running
        # inplace, choose a launch configuration and run the kernel.
        rstate, size = inp
        o_rstate, o_sample = out
        inplace = int(self.inplace)
        ndim = self.output_type.ndim
        o_type_num = np.asarray(0, dtype=self.output_type.dtype).dtype.num
        fail = sub['fail']
        if self.output_type.dtype == 'float32':
            otype = 'float'
        else:
            otype = 'double'
        SYNC = "CNDA_THREAD_SYNC"
        return """
        //////// <code generated by mrg_uniform>
        npy_int64 M1 = 2147483647;      //2^31 - 1
        // The +1 is to avoid odims[0] which fails on windows
        npy_int64 odims[%(ndim)s+1];
        npy_int64 n_elements = 1;
        int n_streams, n_streams_used_in_this_call;
        int must_alloc_sample = ((NULL == %(o_sample)s)
                || !CudaNdarray_Check((PyObject*)%(o_sample)s)
                || !CudaNdarray_is_c_contiguous(%(o_sample)s)
                || (CudaNdarray_NDIM(%(o_sample)s) != %(ndim)s));

        if (PyArray_NDIM(%(size)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "size must be vector");
            %(fail)s
        }
        if (PyArray_DIMS(%(size)s)[0] != %(ndim)s)
        {
            PyErr_Format(PyExc_ValueError, "size must have length %%i (not %%i)",
                %(ndim)s, PyArray_DIMS(%(size)s)[0]);
            %(fail)s
        }

        for (int i = 0; i < %(ndim)s; ++i)
        {
            odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i);
            n_elements *= odims[i];
            must_alloc_sample = (must_alloc_sample
                    || CudaNdarray_HOST_DIMS(%(o_sample)s)[i] != odims[i]);
        }
        if (n_elements > M1)
        {
            PyErr_SetString(
                PyExc_ValueError,
                "rng_mrg gpu implementation does not support more than (2**31 -1) samples");
            %(fail)s
        }

        if (must_alloc_sample)
        {
            Py_XDECREF(%(o_sample)s);
            %(o_sample)s = (CudaNdarray*)CudaNdarray_NewDims(%(ndim)s, odims);
            if(!%(o_sample)s)
            {
                %(fail)s;
            }
        }
        if (!CudaNdarray_Check((PyObject*)%(rstate)s))
        {
            PyErr_Format(PyExc_ValueError, "rstate must be cudandarray");
            %(fail)s;
        }

        Py_XDECREF(%(o_rstate)s);
        if (%(inplace)s)
        {
            Py_INCREF(%(rstate)s);
            %(o_rstate)s = %(rstate)s;
        }
        else
        {
            %(o_rstate)s = (CudaNdarray*)CudaNdarray_Copy(%(rstate)s);
            if (!%(o_rstate)s) {
                PyErr_SetString(PyExc_RuntimeError, "GPU_mrg_uniform: "
                                "could not copy rstate");
                %(fail)s
            }
        }

        if (CudaNdarray_NDIM(%(o_rstate)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "rstate must be vector");
            %(fail)s;
        }
        if (CudaNdarray_HOST_DIMS(%(o_rstate)s)[0] %% 6)
        {
            PyErr_Format(PyExc_ValueError, "rstate len must be multiple of 6");
            %(fail)s;
        }
        n_streams = CudaNdarray_HOST_DIMS(%(o_rstate)s)[0]/6;
        n_streams_used_in_this_call = std::min(n_streams, (int)n_elements);

        {
            unsigned int threads_per_block = std::min((unsigned int)n_streams_used_in_this_call, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
            unsigned int n_blocks = std::min(ceil_intdiv((unsigned int)n_streams_used_in_this_call, threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);
            if (n_streams > (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK * (unsigned int)NUM_VECTOR_OP_BLOCKS)
            {
                PyErr_Format(PyExc_ValueError, "On GPU, n_streams should be at most %%u",
                    (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK * (unsigned int)NUM_VECTOR_OP_BLOCKS);
                %(fail)s;
            }
            if (threads_per_block * n_blocks < n_streams)
            {
                if (! %(nodename)s_printed_warning)
                    fprintf(stderr, "WARNING: unused streams above %%i (Tune GPU_mrg get_n_streams)\\n", threads_per_block * n_blocks );
                %(nodename)s_printed_warning = 1;
            }
            %(nodename)s_mrg_uniform<<<n_blocks,threads_per_block>>>(
                    CudaNdarray_DEV_DATA(%(o_sample)s),
                    (npy_int32*)CudaNdarray_DEV_DATA(%(o_rstate)s),
                    n_elements, n_streams_used_in_this_call);
        }

        %(SYNC)s;

        {
            cudaError_t err = cudaGetLastError();
            if( cudaSuccess != err)
            {
                PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "mrg_uniform", cudaGetErrorString(err));
                %(fail)s;
            }
        }

        //////// </ code generated by mrg_uniform>
        """ % locals()

    def c_code_cache_version(self):
        # Bump this when the generated C/CUDA code above changes.
        return (12,)
class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
# GpuArray version # GpuArray version
_f16_ok = True _f16_ok = True
...@@ -1131,7 +868,6 @@ def guess_n_streams(size, warn=False): ...@@ -1131,7 +868,6 @@ def guess_n_streams(size, warn=False):
class MRG_RandomStreams(object): class MRG_RandomStreams(object):
# TODO : need description for parameter 'use_cuda'
""" """
Module component with similar interface to numpy.random Module component with similar interface to numpy.random
(numpy.random.RandomState). (numpy.random.RandomState).
...@@ -1151,7 +887,7 @@ class MRG_RandomStreams(object): ...@@ -1151,7 +887,7 @@ class MRG_RandomStreams(object):
# TODO : need description for method and return # TODO : need description for method and return
return list(self.state_updates) return list(self.state_updates)
def __init__(self, seed=12345, use_cuda=None): def __init__(self, seed=12345):
# A list of pairs of the form (input_r, output_r), representing the # A list of pairs of the form (input_r, output_r), representing the
# update rules of all the random states generated # update rules of all the random states generated
# by this RandomStreams. # by this RandomStreams.
...@@ -1164,11 +900,6 @@ class MRG_RandomStreams(object): ...@@ -1164,11 +900,6 @@ class MRG_RandomStreams(object):
self.set_rstate(seed) self.set_rstate(seed)
if use_cuda is None:
self.use_cuda = theano.sandbox.cuda.cuda_enabled
else:
self.use_cuda = use_cuda
def set_rstate(self, seed): def set_rstate(self, seed):
# TODO : need description for method, parameter # TODO : need description for method, parameter
if isinstance(seed, integer_types): if isinstance(seed, integer_types):
...@@ -1271,15 +1002,6 @@ class MRG_RandomStreams(object): ...@@ -1271,15 +1002,6 @@ class MRG_RandomStreams(object):
if inc_rstate: if inc_rstate:
self.inc_rstate() self.inc_rstate()
if self.use_cuda and dtype == 'float32':
rval = rval.flatten()
# HACK - we use fact that int32 and float32 have same size to
# sneak ints into the CudaNdarray type.
# these *SHOULD NEVER BE USED AS FLOATS*
tmp_float_buf = np.frombuffer(rval.data, dtype='float32')
assert tmp_float_buf.shape == rval.shape
assert (tmp_float_buf.view('int32') == rval).all()
rval = tmp_float_buf
return rval return rval
...@@ -1352,25 +1074,11 @@ class MRG_RandomStreams(object): ...@@ -1352,25 +1074,11 @@ class MRG_RandomStreams(object):
nstreams = self.n_streams(size) nstreams = self.n_streams(size)
rstates = self.get_substream_rstates(nstreams, dtype) rstates = self.get_substream_rstates(nstreams, dtype)
if self.use_cuda and dtype == 'float32': node_rstate = shared(rstates)
node_rstate = float32_shared_constructor(rstates) u = self.pretty_return(node_rstate,
assert isinstance(node_rstate.type, CudaNdarrayType) *mrg_uniform.new(node_rstate,
ndim, dtype, size),
# we can't use the normal mrg_uniform constructor + later size=size, nstreams=orig_nstreams)
# optimization
# because of the tmp_float_buf hack above. There is
# currently no Theano node that will do a frombuffer
# reinterpretation.
u = self.pretty_return(node_rstate,
*GPU_mrg_uniform.new(node_rstate,
ndim, dtype, size),
size=size, nstreams=orig_nstreams)
else:
node_rstate = shared(rstates)
u = self.pretty_return(node_rstate,
*mrg_uniform.new(node_rstate,
ndim, dtype, size),
size=size, nstreams=orig_nstreams)
# Add a reference to distinguish from other shared variables # Add a reference to distinguish from other shared variables
node_rstate.tag.is_rng = True node_rstate.tag.is_rng = True
r = u * (high - low) + low r = u * (high - low) + low
...@@ -1387,10 +1095,7 @@ class MRG_RandomStreams(object): ...@@ -1387,10 +1095,7 @@ class MRG_RandomStreams(object):
nstreams=None): nstreams=None):
# TODO : need description for method, parameter and return # TODO : need description for method, parameter and return
if n == 1: if n == 1:
if dtype == 'float32' and self.use_cuda: x = self.uniform(size=size, nstreams=nstreams)
x = self.uniform(size=size, dtype=dtype, nstreams=nstreams)
else:
x = self.uniform(size=size, nstreams=nstreams)
return cast(x < p, dtype) return cast(x < p, dtype)
else: else:
raise NotImplementedError("MRG_RandomStreams.binomial with n > 1") raise NotImplementedError("MRG_RandomStreams.binomial with n > 1")
...@@ -1630,7 +1335,7 @@ def local_gpua_mrg(node): ...@@ -1630,7 +1335,7 @@ def local_gpua_mrg(node):
return local_gpua_mrg_graph(node.op, context_name, node.inputs, node.outputs) return local_gpua_mrg_graph(node.op, context_name, node.inputs, node.outputs)
MRG_RNGs = (mrg_uniform, GPU_mrg_uniform, GPUA_mrg_uniform) MRG_RNGs = (mrg_uniform, GPUA_mrg_uniform)
@local_optimizer(MRG_RNGs) @local_optimizer(MRG_RNGs)
......
...@@ -10,28 +10,11 @@ import theano ...@@ -10,28 +10,11 @@ import theano
from theano import config, function, tensor from theano import config, function, tensor
from theano.sandbox import multinomial from theano.sandbox import multinomial
from theano.compile.mode import get_default_mode from theano.compile.mode import get_default_mode
import theano.sandbox.cuda as cuda
import theano.tests.unittest_tools as utt import theano.tests.unittest_tools as utt
from theano.compat import PY3 from theano.compat import PY3
from theano.misc.pkl_utils import CompatUnpickler from theano.misc.pkl_utils import CompatUnpickler
def get_mode(gpu):
    """Pick the compilation mode for the multinomial tests.

    FAST_COMPILE is upgraded to FAST_RUN so the C implementations are
    exercised; when `gpu` is true, the GPU optimization passes needed to
    move the multinomial op to the device are enabled.
    """
    if theano.config.mode == 'FAST_COMPILE':
        mode = theano.compile.get_mode('FAST_RUN')
    else:
        mode = get_default_mode()
    if gpu:
        mode = mode.including('gpu', 'gpu_local_optimizations',
                              'local_cut_gpu_host_gpu',
                              'local_gpu_multinomial')
    return mode
def run_with_c(f, gpu=False):
    """Invoke test body `f(mode, gpu)` with the mode chosen for `gpu`."""
    chosen_mode = get_mode(gpu)
    f(chosen_mode, gpu)
def test_n_samples_1(): def test_n_samples_1():
p = tensor.fmatrix() p = tensor.fmatrix()
u = tensor.fvector() u = tensor.fvector()
...@@ -117,69 +100,52 @@ def test_multinomial_0(): ...@@ -117,69 +100,52 @@ def test_multinomial_0():
m = multinomial.MultinomialFromUniform('auto')(p, u) m = multinomial.MultinomialFromUniform('auto')(p, u)
def body(mode, gpu): # the m*2 allows the multinomial to reuse output
# the m*2 allows the multinomial to reuse output f = function([p, u], m * 2, allow_input_downcast=True)
f = function([p, u], m * 2, allow_input_downcast=True, mode=mode)
if gpu: # test that both first and second samples can be drawn
assert any([type(node.op) is multinomial.GpuMultinomialFromUniform utt.assert_allclose(f([[1, 0], [0, 1]], [.1, .1]),
for node in f.maker.fgraph.toposort()]) [[2, 0], [0, 2]])
# test that both first and second samples can be drawn # test that both second labels can be drawn
utt.assert_allclose(f([[1, 0], [0, 1]], [.1, .1]), r = f([[.2, .8], [.3, .7]], [.31, .31])
[[2, 0], [0, 2]]) utt.assert_allclose(r, [[0, 2], [0, 2]])
# test that both second labels can be drawn # test that both first labels can be drawn
r = f([[.2, .8], [.3, .7]], [.31, .31]) r = f([[.2, .8], [.3, .7]], [.21, .21])
utt.assert_allclose(r, [[0, 2], [0, 2]]) utt.assert_allclose(r, [[0, 2], [2, 0]])
# test that both first labels can be drawn # change the size to make sure output gets reallocated ok
r = f([[.2, .8], [.3, .7]], [.21, .21]) # and also make sure that the GPU version doesn't screw up the
utt.assert_allclose(r, [[0, 2], [2, 0]]) # transposed-ness
r = f([[.2, .8]], [.25])
# change the size to make sure output gets reallocated ok utt.assert_allclose(r, [[0, 2]])
# and also make sure that the GPU version doesn't screw up the
# transposed-ness
r = f([[.2, .8]], [.25])
utt.assert_allclose(r, [[0, 2]])
run_with_c(body)
if cuda.cuda_available:
run_with_c(body, True)
# TODO: check a bigger example (make sure blocking on GPU is handled correctly) # TODO: check a bigger example (make sure blocking on GPU is handled correctly)
def test_multinomial_large(): def test_multinomial_large():
# DEBUG_MODE will test this on GPU p = tensor.fmatrix()
def body(mode, gpu): u = tensor.fvector()
p = tensor.fmatrix() m = multinomial.MultinomialFromUniform('auto')(p, u)
u = tensor.fvector() f = function([p, u], m * 2, allow_input_downcast=True, mode=mode)
m = multinomial.MultinomialFromUniform('auto')(p, u)
f = function([p, u], m * 2, allow_input_downcast=True, mode=mode) pval = np.arange(10000 * 4, dtype='float32').reshape((10000, 4)) + 0.1
if gpu: pval = pval / pval.sum(axis=1)[:, None]
assert any([type(node.op) is multinomial.GpuMultinomialFromUniform uval = np.ones_like(pval[:, 0]) * 0.5
for node in f.maker.fgraph.toposort()]) mval = f(pval, uval)
pval = np.arange(10000 * 4, dtype='float32').reshape((10000, 4)) + 0.1 assert mval.shape == pval.shape
pval = pval / pval.sum(axis=1)[:, None] if config.cast_policy == 'custom':
uval = np.ones_like(pval[:, 0]) * 0.5 assert mval.dtype == pval.dtype
mval = f(pval, uval) elif config.cast_policy == 'numpy+floatX':
assert mval.dtype == config.floatX
assert mval.shape == pval.shape elif config.cast_policy == 'numpy':
if config.cast_policy == 'custom': assert mval.dtype == 'float64'
assert mval.dtype == pval.dtype else:
elif config.cast_policy == 'numpy+floatX': raise NotImplementedError(config.cast_policy)
assert mval.dtype == config.floatX utt.assert_allclose(mval.sum(axis=1), 2)
elif config.cast_policy == 'numpy': asdf = np.asarray([0, 0, 2, 0]) + 0 * pval
assert mval.dtype == 'float64' utt.assert_allclose(mval, asdf) # broadcast over all rows
else:
raise NotImplementedError(config.cast_policy)
utt.assert_allclose(mval.sum(axis=1), 2)
asdf = np.asarray([0, 0, 2, 0]) + 0 * pval
utt.assert_allclose(mval, asdf) # broadcast over all rows
run_with_c(body)
if cuda.cuda_available:
run_with_c(body, True)
def test_multinomial_dtypes(): def test_multinomial_dtypes():
...@@ -197,40 +163,3 @@ def test_multinomial_dtypes(): ...@@ -197,40 +163,3 @@ def test_multinomial_dtypes():
u = tensor.fvector() u = tensor.fvector()
m = multinomial.MultinomialFromUniform('float64')(p, u) m = multinomial.MultinomialFromUniform('float64')(p, u)
assert m.dtype == 'float64', m.dtype assert m.dtype == 'float64', m.dtype
def test_gpu_opt():
    """Check that MultinomialFromUniform is moved to the GPU when its
    output is transferred there, for both a matrix and a single-row
    input (the row case used to fail)."""
    if not cuda.cuda_available:
        # Skip test if cuda_ndarray is not available.
        from nose.plugins.skip import SkipTest
        raise SkipTest('Optional package cuda not available')

    def check(make_pvals, n_rows):
        # Build the graph with the output explicitly moved to the GPU
        # and verify the op was replaced by its GPU version.
        p = make_pvals()
        u = tensor.fvector()
        m = multinomial.MultinomialFromUniform('auto')(p, u)
        assert m.dtype == 'float32', m.dtype
        m_gpu = cuda.gpu_from_host(m)

        f = function([p, u], m_gpu, allow_input_downcast=True,
                     mode=get_mode(True))
        assert any(type(node.op) is multinomial.GpuMultinomialFromUniform
                   for node in f.maker.fgraph.toposort())
        pval = np.arange(n_rows * 4,
                         dtype='float32').reshape((n_rows, 4)) + 0.1
        pval = pval / pval.sum(axis=1)[:, None]
        uval = np.ones_like(pval[:, 0]) * 0.5
        f(pval, uval)

    # We test the case where we put the op on the gpu when the output
    # is moved to the gpu.
    check(tensor.fmatrix, 10000)
    # Test with a row, it was failing in the past.
    check(tensor.frow, 1)
...@@ -15,28 +15,15 @@ import theano ...@@ -15,28 +15,15 @@ import theano
from theano import tensor, config from theano import tensor, config
from theano.sandbox import rng_mrg from theano.sandbox import rng_mrg
from theano.sandbox.rng_mrg import MRG_RandomStreams from theano.sandbox.rng_mrg import MRG_RandomStreams
from theano.sandbox.cuda import cuda_available
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from theano.tests.unittest_tools import attr from theano.tests.unittest_tools import attr
import theano.gpuarray.tests.config import theano.gpuarray.tests.config
if cuda_available:
from theano.sandbox.cuda import float32_shared_constructor
# TODO: test gpu
# Done in test_consistency_GPU_{serial,parallel}
# TODO: test MRG_RandomStreams # TODO: test MRG_RandomStreams
# Partly done in test_consistency_randomstreams # Partly done in test_consistency_randomstreams
# TODO: test optimizer mrg_random_make_inplace # TODO: test optimizer mrg_random_make_inplace
# TODO: make tests work when no flags gived. Now need:
# THEANO_FLAGS=device=gpu0,floatX=float32
# Partly done, in test_consistency_GPU_{serial,parallel}
mode = config.mode
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
utt.seed_rng() utt.seed_rng()
# Results generated by Java code using L'Ecuyer et al.'s code, with: # Results generated by Java code using L'Ecuyer et al.'s code, with:
...@@ -53,61 +40,46 @@ def test_deterministic(): ...@@ -53,61 +40,46 @@ def test_deterministic():
seed = utt.fetch_seed() seed = utt.fetch_seed()
sample_size = (10, 20) sample_size = (10, 20)
test_use_cuda = [False] R = MRG_RandomStreams(seed=seed)
if cuda_available: u = R.uniform(size=sample_size)
test_use_cuda.append(True) f = theano.function([], u)
for use_cuda in test_use_cuda:
# print 'use_cuda =', use_cuda
R = MRG_RandomStreams(seed=seed, use_cuda=use_cuda)
u = R.uniform(size=sample_size)
f = theano.function([], u)
fsample1 = f() fsample1 = f()
fsample2 = f() fsample2 = f()
assert not np.allclose(fsample1, fsample2) assert not np.allclose(fsample1, fsample2)
R2 = MRG_RandomStreams(seed=seed, use_cuda=use_cuda) R2 = MRG_RandomStreams(seed=seed)
u2 = R2.uniform(size=sample_size) u2 = R2.uniform(size=sample_size)
g = theano.function([], u2) g = theano.function([], u2)
gsample1 = g() gsample1 = g()
gsample2 = g() gsample2 = g()
assert np.allclose(fsample1, gsample1) assert np.allclose(fsample1, gsample1)
assert np.allclose(fsample2, gsample2) assert np.allclose(fsample2, gsample2)
def test_consistency_randomstreams(): def test_consistency_randomstreams():
""" # Verify that the random numbers generated by MRG_RandomStreams
Verify that the random numbers generated by MRG_RandomStreams # are the same as the reference (Java) implementation by L'Ecuyer et al.
are the same as the reference (Java) implementation by L'Ecuyer et al.
"""
seed = 12345 seed = 12345
n_samples = 5 n_samples = 5
n_streams = 12 n_streams = 12
n_substreams = 7 n_substreams = 7
test_use_cuda = [False] samples = []
if cuda_available: rng = MRG_RandomStreams(seed=seed)
test_use_cuda.append(True) for i in range(n_streams):
stream_samples = []
for use_cuda in test_use_cuda: u = rng.uniform(size=(n_substreams,), nstreams=n_substreams)
# print 'use_cuda =', use_cuda f = theano.function([], u)
samples = [] for j in range(n_samples):
rng = MRG_RandomStreams(seed=seed, use_cuda=use_cuda) s = f()
for i in range(n_streams): stream_samples.append(s)
stream_samples = [] stream_samples = np.array(stream_samples)
u = rng.uniform(size=(n_substreams,), nstreams=n_substreams) stream_samples = stream_samples.T.flatten()
f = theano.function([], u) samples.append(stream_samples)
for j in range(n_samples):
s = f()
stream_samples.append(s)
stream_samples = np.array(stream_samples)
stream_samples = stream_samples.T.flatten()
samples.append(stream_samples)
samples = np.array(samples).flatten() samples = np.array(samples).flatten()
assert(np.allclose(samples, java_samples)) assert(np.allclose(samples, java_samples))
def test_get_substream_rstates(): def test_get_substream_rstates():
...@@ -214,153 +186,6 @@ def test_consistency_cpu_parallel(): ...@@ -214,153 +186,6 @@ def test_consistency_cpu_parallel():
assert(np.allclose(samples, java_samples)) assert(np.allclose(samples, java_samples))
def test_consistency_GPU_serial():
    """
    Verify that the random numbers generated by GPU_mrg_uniform, serially,
    are the same as the reference (Java) implementation by L'Ecuyer et al.
    """
    # NOTE(review): SkipTest and java_samples are not defined in this
    # block; they must be provided at module level — verify in the file.
    if not cuda_available:
        raise SkipTest('Optional package cuda not available')
    if config.mode == 'FAST_COMPILE':
        mode = 'FAST_RUN'
    else:
        mode = config.mode

    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7

    samples = []
    # Initial MRG31k3p state: the seed repeated over the 6 state words.
    curr_rstate = np.array([seed] * 6, dtype='int32')

    for i in range(n_streams):
        stream_rstate = curr_rstate.copy()
        for j in range(n_substreams):
            substream_rstate = np.array(stream_rstate.copy(), dtype='int32')
            # HACK - we transfer these int32 to the GPU memory as float32
            # (reinterpret_cast)
            tmp_float_buf = np.frombuffer(substream_rstate.data,
                                          dtype='float32')
            # Transfer to device
            rstate = float32_shared_constructor(tmp_float_buf)

            # Draw one sample at a time so each substream is advanced
            # serially, matching the reference sequence.
            new_rstate, sample = rng_mrg.GPU_mrg_uniform.new(rstate, ndim=None,
                                                             dtype='float32',
                                                             size=(1,))
            rstate.default_update = new_rstate

            # Not really necessary, just mimicking
            # rng_mrg.MRG_RandomStreams' behavior
            sample.rstate = rstate
            sample.update = (rstate, new_rstate)

            # We need the sample back in the main memory
            cpu_sample = tensor.as_tensor_variable(sample)
            f = theano.function([], cpu_sample, mode=mode)

            for k in range(n_samples):
                s = f()
                samples.append(s)

            # next substream
            stream_rstate = rng_mrg.ff_2p72(stream_rstate)

        # next stream
        curr_rstate = rng_mrg.ff_2p134(curr_rstate)

    samples = np.array(samples).flatten()
    assert(np.allclose(samples, java_samples))
def test_consistency_GPU_parallel():
    """
    Verify that the random numbers generated by GPU_mrg_uniform, in
    parallel, are the same as the reference (Java) implementation by
    L'Ecuyer et al.

    All substreams of a stream are packed into a single device-side state
    buffer and sampled together; the concatenated draws must match the
    module-level ``java_samples`` reference values.
    """
    if not cuda_available:
        raise SkipTest('Optional package cuda not available')
    # NOTE(review): presumably FAST_COMPILE would not apply the GPU
    # optimizations this test exercises, so force FAST_RUN -- confirm.
    mode = 'FAST_RUN' if config.mode == 'FAST_COMPILE' else config.mode

    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7  # 7 samples will be drawn in parallel

    samples = []
    # RNG state: six 32-bit integers, all initialized to the seed.
    curr_rstate = np.array([seed] * 6, dtype='int32')

    for stream_idx in range(n_streams):
        stream_samples = []
        # Build all substream states for this stream via repeated
        # applications of the 2**72 skip-ahead.
        substates = [curr_rstate.copy()]
        while len(substates) < n_substreams:
            substates.append(rng_mrg.ff_2p72(substates[-1]))
        flat_states = np.asarray(substates).flatten()

        # HACK - transfer these int32 to the GPU memory as float32
        # (reinterpret_cast)
        reinterpreted = np.frombuffer(flat_states.data, dtype='float32')
        # Transfer to device
        rstate = float32_shared_constructor(reinterpreted)

        new_rstate, sample = rng_mrg.GPU_mrg_uniform.new(rstate, ndim=None,
                                                         dtype='float32',
                                                         size=(n_substreams,))
        # Each call of the compiled function advances the device-side
        # state in place via the shared variable's default update.
        rstate.default_update = new_rstate
        # Not really necessary, just mimicking
        # rng_mrg.MRG_RandomStreams' behavior
        sample.rstate = rstate
        sample.update = (rstate, new_rstate)

        # We need the sample back in the main memory
        cpu_sample = tensor.as_tensor_variable(sample)
        f = theano.function([], cpu_sample, mode=mode)
        for _ in range(n_samples):
            stream_samples.append(f())
        # Transpose so draws are ordered per-substream, as the reference is.
        samples.append(np.array(stream_samples).T.flatten())

        # next stream
        curr_rstate = rng_mrg.ff_2p134(curr_rstate)

    samples = np.array(samples).flatten()
    assert np.allclose(samples, java_samples)
def test_GPU_nstreams_limit():
    """
    Verify that a ValueError is raised when n_streams
    is greater than 2**20 on GPU. This is the value of
    (NUM_VECTOR_OP_THREADS_PER_BLOCK * NUM_VECTOR_OP_BLOCKS).
    """
    if not cuda_available:
        raise SkipTest('Optional package cuda not available')

    rng = MRG_RandomStreams(seed=12345, use_cuda=True)

    def draw_uniform(size, nstreams):
        # Compile and run one GPU uniform draw with the requested
        # number of streams.
        if theano.config.mode == "FAST_COMPILE":
            run_mode = "FAST_RUN"
        else:
            run_mode = copy.copy(theano.compile.get_default_mode())
            run_mode.check_py_code = False
        out = rng.uniform(size=size, nstreams=nstreams, dtype='float32')
        fn = theano.function([], out, mode=run_mode)
        return fn()

    # Exactly at the limit: must succeed.
    draw_uniform((10,), 2 ** 20)
    # One past the limit: must raise.
    assert_raises(ValueError, draw_uniform, (10,), 2 ** 20 + 1)
def test_consistency_GPUA_serial(): def test_consistency_GPUA_serial():
# Verify that the random numbers generated by GPUA_mrg_uniform, serially, # Verify that the random numbers generated by GPUA_mrg_uniform, serially,
# are the same as the reference (Java) implementation by L'Ecuyer et al. # are the same as the reference (Java) implementation by L'Ecuyer et al.
...@@ -470,7 +295,7 @@ def test_GPUA_full_fill(): ...@@ -470,7 +295,7 @@ def test_GPUA_full_fill():
# This needs to be large to trigger the problem on GPU # This needs to be large to trigger the problem on GPU
size = (10, 1000) size = (10, 1000)
R = MRG_RandomStreams(234, use_cuda=False) R = MRG_RandomStreams(234)
uni = R.uniform(size, nstreams=60 * 256) uni = R.uniform(size, nstreams=60 * 256)
f_cpu = theano.function([], uni) f_cpu = theano.function([], uni)
...@@ -568,7 +393,7 @@ def test_uniform(): ...@@ -568,7 +393,7 @@ def test_uniform():
# print '' # print ''
# print 'ON CPU with size=(%s):' % str(size) # print 'ON CPU with size=(%s):' % str(size)
x = tensor.matrix() x = tensor.matrix()
R = MRG_RandomStreams(234, use_cuda=False) R = MRG_RandomStreams(234)
# Note: we specify `nstreams` to avoid a warning. # Note: we specify `nstreams` to avoid a warning.
# TODO Look for all occurrences of `guess_n_streams` and `30 * 256` # TODO Look for all occurrences of `guess_n_streams` and `30 * 256`
# for such situations: it would be better to instead filter the # for such situations: it would be better to instead filter the
...@@ -592,31 +417,6 @@ def test_uniform(): ...@@ -592,31 +417,6 @@ def test_uniform():
steps_ = steps steps_ = steps
basictest(f, steps_, const_size, prefix='mrg cpu', inputs=input) basictest(f, steps_, const_size, prefix='mrg cpu', inputs=input)
if mode != 'FAST_COMPILE' and cuda_available:
# print ''
# print 'ON GPU with size=(%s):' % str(size)
R = MRG_RandomStreams(234, use_cuda=True)
u = R.uniform(size=size, dtype='float32',
nstreams=rng_mrg.guess_n_streams(size, warn=False))
# well, it's really that this test w GPU doesn't make sense otw
assert u.dtype == 'float32'
f = theano.function(var_input, theano.Out(
theano.sandbox.cuda.basic_ops.gpu_from_host(u),
borrow=True), mode=mode_with_gpu)
assert any([isinstance(node.op,
theano.sandbox.rng_mrg.GPU_mrg_uniform)
for node in f.maker.fgraph.toposort()])
# theano.printing.debugprint(f)
gpu_out = np.asarray(f(*input))
# print 'GPU: random?[:10], random?[-10:]'
# print gpu_out[0, 0:10]
# print gpu_out[-1, -10:]
basictest(f, steps_, const_size, prefix='mrg gpu', inputs=input)
np.testing.assert_array_almost_equal(cpu_out, gpu_out,
decimal=6)
# print '' # print ''
# print 'ON CPU w Numpy with size=(%s):' % str(size) # print 'ON CPU w Numpy with size=(%s):' % str(size)
RR = theano.tensor.shared_randomstreams.RandomStreams(234) RR = theano.tensor.shared_randomstreams.RandomStreams(234)
...@@ -629,7 +429,7 @@ def test_uniform(): ...@@ -629,7 +429,7 @@ def test_uniform():
def test_broadcastable(): def test_broadcastable():
R = MRG_RandomStreams(234, use_cuda=False) R = MRG_RandomStreams(234)
x = tensor.matrix() x = tensor.matrix()
size1 = (10, 1) size1 = (10, 1)
size2 = (x.shape[0], 1) size2 = (x.shape[0], 1)
...@@ -695,7 +495,7 @@ def test_binomial(): ...@@ -695,7 +495,7 @@ def test_binomial():
def t_binomial(mean, size, const_size, var_input, input, steps, rtol): def t_binomial(mean, size, const_size, var_input, input, steps, rtol):
R = MRG_RandomStreams(234, use_cuda=False) R = MRG_RandomStreams(234)
u = R.binomial(size=size, p=mean) u = R.binomial(size=size, p=mean)
f = theano.function(var_input, u, mode=mode) f = theano.function(var_input, u, mode=mode)
out = f(*input) out = f(*input)
...@@ -709,22 +509,6 @@ def t_binomial(mean, size, const_size, var_input, input, steps, rtol): ...@@ -709,22 +509,6 @@ def t_binomial(mean, size, const_size, var_input, input, steps, rtol):
inputs=input, allow_01=True, inputs=input, allow_01=True,
target_avg=mean, mean_rtol=rtol) target_avg=mean, mean_rtol=rtol)
if mode != 'FAST_COMPILE' and cuda_available:
R = MRG_RandomStreams(234, use_cuda=True)
u = R.binomial(size=size, p=mean, dtype='float32')
# well, it's really that this test w GPU doesn't make sense otw
assert u.dtype == 'float32'
f = theano.function(var_input, theano.Out(
theano.sandbox.cuda.basic_ops.gpu_from_host(u),
borrow=True), mode=mode_with_gpu)
gpu_out = np.asarray(f(*input))
basictest(f, steps_, const_size, prefix='mrg gpu',
inputs=input, allow_01=True,
target_avg=mean, mean_rtol=rtol)
np.testing.assert_array_almost_equal(out, gpu_out,
decimal=6)
RR = theano.tensor.shared_randomstreams.RandomStreams(234) RR = theano.tensor.shared_randomstreams.RandomStreams(234)
uu = RR.binomial(size=size, p=mean) uu = RR.binomial(size=size, p=mean)
...@@ -778,7 +562,7 @@ def test_normal0(): ...@@ -778,7 +562,7 @@ def test_normal0():
# print '' # print ''
# print 'ON CPU:' # print 'ON CPU:'
R = MRG_RandomStreams(234, use_cuda=False) R = MRG_RandomStreams(234)
# Note: we specify `nstreams` to avoid a warning. # Note: we specify `nstreams` to avoid a warning.
n = R.normal(size=size, avg=avg, std=std, n = R.normal(size=size, avg=avg, std=std,
nstreams=rng_mrg.guess_n_streams(size, warn=False)) nstreams=rng_mrg.guess_n_streams(size, warn=False))
...@@ -798,31 +582,6 @@ def test_normal0(): ...@@ -798,31 +582,6 @@ def test_normal0():
sys.stdout.flush() sys.stdout.flush()
if mode != 'FAST_COMPILE' and cuda_available:
# print ''
# print 'ON GPU:'
R = MRG_RandomStreams(234, use_cuda=True)
n = R.normal(size=size, avg=avg, std=std, dtype='float32',
nstreams=rng_mrg.guess_n_streams(size, warn=False))
# well, it's really that this test w GPU doesn't make sense otw
assert n.dtype == 'float32'
f = theano.function(var_input, theano.Out(
theano.sandbox.cuda.basic_ops.gpu_from_host(n),
borrow=True), mode=mode_with_gpu)
# theano.printing.debugprint(f)
sys.stdout.flush()
gpu_out = np.asarray(f(*input))
# print 'random?[:10]\n', gpu_out[0, 0:10]
# print '----'
sys.stdout.flush()
basictest(f, steps_, const_size, target_avg=avg, target_std=std,
prefix='gpu mrg ', allow_01=True, inputs=input,
mean_rtol=rtol, std_tol=std_tol)
# Need to allow some rounding error as their is float
# computation that are done on the gpu vs cpu
assert np.allclose(out, gpu_out, rtol=5e-6, atol=5e-6)
# print '' # print ''
# print 'ON CPU w NUMPY:' # print 'ON CPU w NUMPY:'
RR = theano.tensor.shared_randomstreams.RandomStreams(234) RR = theano.tensor.shared_randomstreams.RandomStreams(234)
...@@ -877,7 +636,7 @@ def test_multinomial(): ...@@ -877,7 +636,7 @@ def test_multinomial():
pvals = np.asarray(np.random.uniform(size=sample_size)) pvals = np.asarray(np.random.uniform(size=sample_size))
pvals = np.apply_along_axis(lambda row: row / np.sum(row), 1, pvals) pvals = np.apply_along_axis(lambda row: row / np.sum(row), 1, pvals)
R = MRG_RandomStreams(234, use_cuda=False) R = MRG_RandomStreams(234)
# Note: we specify `nstreams` to avoid a warning. # Note: we specify `nstreams` to avoid a warning.
m = R.multinomial(pvals=pvals, dtype=config.floatX, nstreams=30 * 256) m = R.multinomial(pvals=pvals, dtype=config.floatX, nstreams=30 * 256)
f = theano.function([], m, mode=mode_) f = theano.function([], m, mode=mode_)
...@@ -886,29 +645,6 @@ def test_multinomial(): ...@@ -886,29 +645,6 @@ def test_multinomial():
basic_multinomialtest(f, steps, sample_size, pvals, n_samples=1, basic_multinomialtest(f, steps, sample_size, pvals, n_samples=1,
prefix='mrg ') prefix='mrg ')
sys.stdout.flush()
if mode != 'FAST_COMPILE' and cuda_available:
# print ''
# print 'ON GPU:'
R = MRG_RandomStreams(234, use_cuda=True)
pvals = np.asarray(pvals, dtype='float32')
# We give the number of streams to avoid a warning.
n = R.multinomial(pvals=pvals, dtype='float32', nstreams=30 * 256)
# well, it's really that this test w GPU doesn't make sense otw
assert n.dtype == 'float32'
f = theano.function(
[],
theano.sandbox.cuda.basic_ops.gpu_from_host(n),
mode=mode_.including('gpu'))
# theano.printing.debugprint(f)
gpu_out = f()
sys.stdout.flush()
basic_multinomialtest(f, steps, sample_size, pvals, n_samples=1,
prefix='gpu mrg ')
np.testing.assert_array_almost_equal(out, gpu_out, decimal=6)
def test_multinomial_n_samples(): def test_multinomial_n_samples():
mode_ = mode mode_ = mode
...@@ -924,7 +660,7 @@ def test_multinomial_n_samples(): ...@@ -924,7 +660,7 @@ def test_multinomial_n_samples():
pvals = np.asarray(np.random.uniform(size=sample_size)) pvals = np.asarray(np.random.uniform(size=sample_size))
pvals = np.apply_along_axis(lambda row: row / np.sum(row), 1, pvals) pvals = np.apply_along_axis(lambda row: row / np.sum(row), 1, pvals)
R = MRG_RandomStreams(234, use_cuda=False) R = MRG_RandomStreams(234)
for n_samples, steps in zip([5, 10, 100, 1000], [20, 10, 1, 1]): for n_samples, steps in zip([5, 10, 100, 1000], [20, 10, 1, 1]):
m = R.multinomial(pvals=pvals, n=n_samples, m = R.multinomial(pvals=pvals, n=n_samples,
...@@ -934,26 +670,11 @@ def test_multinomial_n_samples(): ...@@ -934,26 +670,11 @@ def test_multinomial_n_samples():
n_samples, prefix='mrg ') n_samples, prefix='mrg ')
sys.stdout.flush() sys.stdout.flush()
if mode != 'FAST_COMPILE' and cuda_available:
R = MRG_RandomStreams(234, use_cuda=True)
pvals = np.asarray(pvals, dtype='float32')
n = R.multinomial(pvals=pvals, n=n_samples,
dtype='float32', nstreams=30 * 256)
assert n.dtype == 'float32'
f = theano.function(
[],
theano.sandbox.cuda.basic_ops.gpu_from_host(n),
mode=mode_.including('gpu'))
sys.stdout.flush()
basic_multinomialtest(f, steps, sample_size, pvals,
n_samples, prefix='gpu mrg ')
class T_MRG(unittest.TestCase): class T_MRG(unittest.TestCase):
def test_bad_size(self): def test_bad_size(self):
R = MRG_RandomStreams(234, use_cuda=False) R = MRG_RandomStreams(234)
for size in [ for size in [
(0, 100), (0, 100),
...@@ -1055,54 +776,43 @@ def test_multMatVect(): ...@@ -1055,54 +776,43 @@ def test_multMatVect():
def test_seed_fn(): def test_seed_fn():
test_use_cuda = [False]
if cuda_available:
test_use_cuda.append(True)
idx = tensor.ivector() idx = tensor.ivector()
for use_cuda in test_use_cuda:
if config.mode == 'FAST_COMPILE' and use_cuda: for new_seed, same in [(234, True), (None, True), (23, False)]:
mode = 'FAST_RUN' random = MRG_RandomStreams(234)
else: fn1 = theano.function([], random.uniform((2, 2), dtype='float32'))
mode = config.mode fn2 = theano.function([], random.uniform((3, 3), nstreams=2,
dtype='float32'))
for new_seed, same in [(234, True), (None, True), (23, False)]: fn3 = theano.function([idx],
random = MRG_RandomStreams(234, use_cuda=use_cuda) random.uniform(idx, nstreams=3, ndim=1,
fn1 = theano.function([], random.uniform((2, 2), dtype='float32'), dtype='float32'))
mode=mode)
fn2 = theano.function([], random.uniform((3, 3), nstreams=2, fn1_val0 = fn1()
dtype='float32'), fn1_val1 = fn1()
mode=mode) assert not np.allclose(fn1_val0, fn1_val1)
fn3 = theano.function([idx], fn2_val0 = fn2()
random.uniform(idx, nstreams=3, ndim=1, fn2_val1 = fn2()
dtype='float32'), assert not np.allclose(fn2_val0, fn2_val1)
mode=mode) fn3_val0 = fn3([4])
fn3_val1 = fn3([4])
fn1_val0 = fn1() assert not np.allclose(fn3_val0, fn3_val1)
fn1_val1 = fn1() assert fn1_val0.size == 4
assert not np.allclose(fn1_val0, fn1_val1) assert fn2_val0.size == 9
fn2_val0 = fn2()
fn2_val1 = fn2() random.seed(new_seed)
assert not np.allclose(fn2_val0, fn2_val1)
fn3_val0 = fn3([4]) fn1_val2 = fn1()
fn3_val1 = fn3([4]) fn1_val3 = fn1()
assert not np.allclose(fn3_val0, fn3_val1) fn2_val2 = fn2()
assert fn1_val0.size == 4 fn2_val3 = fn2()
assert fn2_val0.size == 9 fn3_val2 = fn3([4])
fn3_val3 = fn3([4])
random.seed(new_seed) assert np.allclose(fn1_val0, fn1_val2) == same
assert np.allclose(fn1_val1, fn1_val3) == same
fn1_val2 = fn1() assert np.allclose(fn2_val0, fn2_val2) == same
fn1_val3 = fn1() assert np.allclose(fn2_val1, fn2_val3) == same
fn2_val2 = fn2() assert np.allclose(fn3_val0, fn3_val2) == same
fn2_val3 = fn2() assert np.allclose(fn3_val1, fn3_val3) == same
fn3_val2 = fn3([4])
fn3_val3 = fn3([4])
assert np.allclose(fn1_val0, fn1_val2) == same
assert np.allclose(fn1_val1, fn1_val3) == same
assert np.allclose(fn2_val0, fn2_val2) == same
assert np.allclose(fn2_val1, fn2_val3) == same
assert np.allclose(fn3_val0, fn3_val2) == same
assert np.allclose(fn3_val1, fn3_val3) == same
def rng_mrg_overflow(sizes, fct, mode, should_raise_error): def rng_mrg_overflow(sizes, fct, mode, should_raise_error):
...@@ -1132,28 +842,7 @@ def test_overflow_cpu(): ...@@ -1132,28 +842,7 @@ def test_overflow_cpu():
rng_mrg_overflow(sizes, fct, config.mode, should_raise_error=False) rng_mrg_overflow(sizes, fct, config.mode, should_raise_error=False)
def test_overflow_gpu_old_backend():
    """
    Check size-overflow handling of GPU uniform sampling (old CUDA backend).

    Sizes whose element count exceeds what the backend can address must
    raise; small sizes and int32-typed sizes must be accepted.
    """
    # run with THEANO_FLAGS=mode=FAST_RUN,init_gpu_device=gpu1,device=cpu
    if not cuda_available:
        raise SkipTest('Optional package cuda not available')

    rng = MRG_RandomStreams(seed=12345, use_cuda=True)
    fct = rng.uniform

    # Element counts that overflow: should raise an error.
    overflowing = [(2 ** 31,), (2 ** 32,),
                   (2 ** 15, 2 ** 16,), (2, 2 ** 15, 2 ** 15)]
    rng_mrg_overflow(overflowing, fct, mode_with_gpu, should_raise_error=True)

    # Small sizes: should not raise.
    small = [(2 ** 5,), (2 ** 5, 2 ** 5), (2 ** 5, 2 ** 5, 2 ** 5)]
    rng_mrg_overflow(small, fct, mode_with_gpu, should_raise_error=False)

    # np.int32 entries in the size tuple must also be supported.
    int32_sizes = [(np.int32(2 ** 10),),
                   (np.int32(2), np.int32(2 ** 10), np.int32(2 ** 10))]
    rng_mrg_overflow(int32_sizes, fct, mode_with_gpu, should_raise_error=False)
def test_overflow_gpu_new_backend(): def test_overflow_gpu_new_backend():
# run with THEANO_FLAGS=mode=FAST_RUN,init_gpu_device=cuda1,device=cpu
from theano.gpuarray.tests.test_basic_ops import \ from theano.gpuarray.tests.test_basic_ops import \
mode_with_gpu as mode mode_with_gpu as mode
from theano.gpuarray.type import gpuarray_shared_constructor from theano.gpuarray.type import gpuarray_shared_constructor
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论