提交 45c5780f authored 作者: Simon Lemieux's avatar Simon Lemieux

added multinomial for MRG

上级 0016c4d6
import theano
from theano import Op, Apply
import theano.tensor as T
from theano.tensor.opt import register_specialize
from theano.gof import local_optimizer
from theano.sandbox.cuda import cuda_available
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
class Multinomial(Op):
    """Draw one-hot multinomial samples from precomputed uniform samples (CPU).

    Inputs (see ``make_node``):
      pvals: 2-d tensor of probabilities laid out as (nb_outcomes, nb_multi),
             i.e. one multinomial distribution per *column* (callers such as
             ``MRG_RandomStreams.multinomial`` pass the transpose of the usual
             (batch, outcomes) matrix).  Each column is assumed to sum to 1.
      unis:  1-d tensor of nb_multi uniform [0, 1) samples, one per column.

    Output: a tensor of the same type/shape as ``pvals``; in each column the
    first outcome whose cumulative probability exceeds that column's uniform
    sample is set to 1 and every other entry is 0.
    """

    def __eq__(self, other):
        # The Op is stateless: any two instances are interchangeable, so
        # equality (and the matching hash below) is by type only.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def make_node(self, pvals, unis):
        pvals = T.as_tensor_variable(pvals)
        unis = T.as_tensor_variable(unis)
        # NOTE(review): dtypes are deliberately not enforced here; the C code
        # uses the per-input dtype_* macros, so any float dtype should work.
        return Apply(self, [pvals, unis], [pvals.type()])

    def grad(self, inp, grads):
        # Sampling is not differentiable with respect to either input.
        # (Plain parameters instead of py2-only tuple-unpacking parameters
        # -- removed by PEP 3113 -- so this module also parses on Python 3.)
        return [None, None]

    def c_code_cache_version(self):
        # An empty tuple disables the C-code compilation cache for this Op;
        # useful while the implementation is still in flux.
        return ()

    def c_code(self, node, name, inp, out, sub):
        # Unpack explicitly rather than in the signature (PEP 3113).
        (pvals, unis) = inp
        (z,) = out
        fail = sub['fail']
        return """
        if (%(pvals)s->nd != 2)
        {
            PyErr_Format(PyExc_TypeError, "pvals wrong rank");
            %(fail)s;
        }
        if (%(unis)s->nd != 1)
        {
            PyErr_Format(PyExc_TypeError, "unis wrong rank");
            %(fail)s;
        }

        if (%(unis)s->dimensions[0] != %(pvals)s->dimensions[1])
        {
            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]");
            %(fail)s;
        }

        if ((NULL == %(z)s)
            || ((%(z)s->dimensions)[0] != (%(pvals)s->dimensions)[0])
            || ((%(z)s->dimensions)[1] != (%(pvals)s->dimensions)[1])
        )
        {
            Py_XDECREF(%(z)s);
            npy_intp dims[2];
            dims[0] = (%(pvals)s->dimensions)[0];
            dims[1] = (%(pvals)s->dimensions)[1];
            %(z)s = (PyArrayObject*) PyArray_ZEROS(2,
                dims,
                type_num_%(pvals)s,
                0);
            if (!%(z)s)
            {
                PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
                %(fail)s;
            }
        }

        { // NESTED SCOPE

        const int nb_outcomes = %(pvals)s->dimensions[0];
        const int nb_multi = %(pvals)s->dimensions[1];

        //
        // For each multinomials, loop over each possible outcome
        //
        for (int n = 0; n < nb_multi; ++n)
        {
            dtype_%(pvals)s cummul = 0.;
            const dtype_%(unis)s* unis_n = (dtype_%(unis)s*)PyArray_GETPTR1(%(unis)s, n);

            for (int m = 0; m < nb_outcomes; ++m)
            {
                dtype_%(z)s* z_nm = (dtype_%(z)s*)PyArray_GETPTR2(%(z)s, m,n);
                const dtype_%(pvals)s* pvals_nm = (dtype_%(pvals)s*)PyArray_GETPTR2(%(pvals)s, m,n);
                cummul += *pvals_nm;
                if (*unis_n < cummul)
                {
                    *z_nm = 1.;
                    break;
                }
            }
        }
        } // END NESTED SCOPE
        """ % locals()


# Module-level singleton used by graph builders and the GPU optimizer below.
multinomial = Multinomial()
class GpuMultinomial(Multinomial):
    """GPU counterpart of ``Multinomial``, operating on CudaNdarray inputs.

    Same contract as the CPU Op: ``pvals`` is (nb_outcomes, nb_multi) float32,
    ``unis`` is a contiguous float32 vector of nb_multi uniform samples, and
    the output is a one-hot float32 matrix of the same shape as ``pvals``.
    One CUDA thread handles one multinomial (one column of ``pvals``).
    """

    def make_node(self, pvals, unis):
        # The kernel is written for float32 CudaNdarrays only.
        assert pvals.dtype == 'float32'
        assert unis.dtype == 'float32'
        if not isinstance(pvals.type, CudaNdarrayType):
            raise TypeError('pvals must be cudandarray', pvals)
        if not isinstance(unis.type, CudaNdarrayType):
            raise TypeError('unis must be cudandarray', unis)
        return Apply(self, [pvals, unis], [pvals.type()])

    def c_code_cache_version(self):
        # Unlike the CPU Op, caching is enabled here (version 1).
        return (1,)

    def c_support_code_apply(self, node, nodename):
        # One thread per multinomial: thread n walks the outcomes of column n,
        # accumulating probabilities until it passes the uniform sample, and
        # writes a full one-hot column into the (assumed c-contiguous) output.
        return """
        static __global__ void k_multi_warp_%(nodename)s(
            const int nb_multi,
            const int nb_outcomes,
            const int pvals_row_strides,
            const int pvals_col_strides,
            float * global_pvals,
            float * global_unis,
            float * global_outs
        )
        {
            // each thread takes care of one multinomial
            int n = 32*blockIdx.x + threadIdx.x;
            if (n < nb_multi)
            {
                float cummul = 0.;
                bool done = false;
                for (int m = 0; m < nb_outcomes; ++m)
                {
                    cummul += global_pvals[n * pvals_col_strides + m * pvals_row_strides];
                    float current_out = 0.;
                    if (!done && global_unis[n] < cummul)
                    {
                        current_out = 1.;
                        done = true;
                    }
                    global_outs[n + m * nb_multi] = current_out;
                }
            }
        }
        """ % locals()

    def c_code(self, node, name, inp, out, sub):
        # Unpack explicitly rather than in the signature (PEP 3113), so the
        # module also parses on Python 3.
        (pvals, unis) = inp
        (z,) = out
        fail = sub['fail']
        return """
        if (%(pvals)s->nd != 2)
        {
            PyErr_Format(PyExc_TypeError, "pvals wrong rank");
            %(fail)s;
        }
        if (%(unis)s->nd != 1)
        {
            PyErr_Format(PyExc_TypeError, "unis wrong rank");
            %(fail)s;
        }

        if (CudaNdarray_HOST_DIMS(%(unis)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])
        {
            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]");
            %(fail)s;
        }

        if (!CudaNdarray_is_c_contiguous(%(unis)s))
        {
            PyErr_Format(PyExc_NotImplementedError, "require unis to be contiguous");
            %(fail)s;
        }
        // Would be more efficient if pvals were also contiguous but practically I think it is not often the cas,
        // since we are working on pvals.T here

        if ((NULL == %(z)s)
            || (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[0])
            || (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(pvals)s)[1]))
        {
            Py_XDECREF(%(z)s);
            npy_intp dims[2];
            dims[0] = (CudaNdarray_HOST_DIMS(%(pvals)s)[0]);
            dims[1] = (CudaNdarray_HOST_DIMS(%(pvals)s)[1]);
            %(z)s = (CudaNdarray*)CudaNdarray_NewDims(2, dims);
            if (!%(z)s)
            {
                PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
                %(fail)s;
            }
        }

        { // NESTED SCOPE

        int nb_outcomes = CudaNdarray_HOST_DIMS(%(z)s)[0];
        int nb_multi = CudaNdarray_HOST_DIMS(%(z)s)[1];

        // One warp-sized block row of 32 threads per block; round the
        // block count up so every multinomial gets a thread.
        int nb_block;
        if (nb_multi %% 32 == 0)
            nb_block = nb_multi/32;
        else
            nb_block = (int)((float)nb_multi/32. + 1.);
        dim3 n_blocks(nb_block,1,1);
        dim3 n_threads(32,1,1);
        int n_shared = 0;

        k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
            CudaNdarray_HOST_DIMS(%(z)s)[1],
            CudaNdarray_HOST_DIMS(%(z)s)[0],
            CudaNdarray_HOST_STRIDES(%(pvals)s)[0],
            CudaNdarray_HOST_STRIDES(%(pvals)s)[1],
            CudaNdarray_DEV_DATA(%(pvals)s),
            CudaNdarray_DEV_DATA(%(unis)s),
            CudaNdarray_DEV_DATA(%(z)s)
        );
        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess != sts)
        {
            PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i; shared: %%i)\\n",
                "k_multi_warp_%(name)s",
                cudaGetErrorString(sts),
                n_blocks.x,
                n_blocks.y,
                n_threads.x,
                n_threads.y,
                n_threads.z,
                n_shared);
            %(fail)s;
        }

        } // END NESTED SCOPE
        """ % locals()


# Module-level singleton used by the specialization optimizer below.
gpu_multinomial = GpuMultinomial()
@local_optimizer()
def use_gpu_multinomial(node):
    """Graph optimization: replace a CPU ``multinomial`` node by the GPU Op.

    Inputs are transferred to the GPU with ``gpu_from_host`` and the result
    is brought back with ``host_from_gpu``; returns ``None`` implicitly for
    nodes it does not handle.
    """
    if node.op == multinomial:
        return [host_from_gpu(gpu_multinomial(*[gpu_from_host(i) for i in node.inputs]))]

# Only register the substitution when Theano is configured for the GPU.
if theano.config.device.startswith('gpu'):
    register_specialize(use_gpu_multinomial)
\ No newline at end of file
...@@ -14,6 +14,8 @@ from theano.tensor import zeros_like, sqrt, log, sin, cos, join ...@@ -14,6 +14,8 @@ from theano.tensor import zeros_like, sqrt, log, sin, cos, join
from theano.compile import optdb from theano.compile import optdb
from theano.gof import local_optimizer from theano.gof import local_optimizer
from multinomial import multinomial
from theano.sandbox.cuda import cuda_available, cuda_enabled from theano.sandbox.cuda import cuda_available, cuda_enabled
if cuda_available: if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType, float32_shared_constructor from theano.sandbox.cuda import CudaNdarrayType, float32_shared_constructor
...@@ -672,6 +674,27 @@ class MRG_RandomStreams(object): ...@@ -672,6 +674,27 @@ class MRG_RandomStreams(object):
return cast(self.uniform(size=size) < p, dtype) return cast(self.uniform(size=size) < p, dtype)
else: else:
raise NotImplementedError("MRG_RandomStreams.binomial with n > 1") raise NotImplementedError("MRG_RandomStreams.binomial with n > 1")
def multinomial(self, size=None, n=1, pvals=None, ndim=None, dtype='int64'):
    """
    Sample `n` (currently `n` needs to be 1) times from a multinomial
    distribution defined by probabilities `pvals`.

    Example : pvals = [[.98, .01, .01], [.01, .98, .01]] will probably
    result in [[1,0,0],[0,1,0]].

    :param pvals: 2-d array-like of probabilities, one distribution per
        row; defaults to [[.5, .5]] (a single fair coin flip).
    :param dtype: dtype of the returned one-hot samples.

    .. note::
        `size` and `ndim` are only there keep the same signature as other
        uniform, binomial, normal, etc.
        todo : adapt multinomial to take that into account

    :raises NotImplementedError: if n != 1 or pvals is not 2-d.
    """
    # Avoid a mutable default argument: build the default inside the call.
    if pvals is None:
        pvals = [[.5, .5]]
    pvals = as_tensor_variable(pvals)
    if n == 1 and pvals.ndim == 2:
        # One uniform sample per row of pvals; the Multinomial Op works on
        # the transposed (nb_outcomes, nb_multi) layout, hence the .T pair.
        unis = self.uniform(size=pvals.shape[0:1], ndim=1)
        return cast(multinomial(pvals.T, unis).T, dtype)
    else:
        raise NotImplementedError("MRG_RandomStreams.multinomial only implemented with n == 1 and pvals.ndim = 2")
def normal(self, size=None, avg=0.0, std=1.0, ndim=None, dtype=config.floatX): def normal(self, size=None, avg=0.0, std=1.0, ndim=None, dtype=config.floatX):
# We need an even number of ]0,1[ samples. Then we split them # We need an even number of ]0,1[ samples. Then we split them
......
...@@ -475,3 +475,59 @@ def test_normal0(): ...@@ -475,3 +475,59 @@ def test_normal0():
ff = theano.function([], nn) ff = theano.function([], nn)
basictest(ff, steps, sample_size, target_avg=-5.0, target_std=2.0, prefix='numpy ', allow_01=True) basictest(ff, steps, sample_size, target_avg=-5.0, target_std=2.0, prefix='numpy ', allow_01=True)
def basic_multinomialtest(f, steps, target_pvals, prefix="", mean_rtol=0.04):
    """Call the compiled sampler `f` `steps` times, average the one-hot
    samples, and print the empirical mean next to `target_pvals` along with
    timing/throughput figures.

    # NOTE(review): despite the `mean_rtol` parameter, no assertion is made;
    # the tolerance check is commented out below and only the mean absolute
    # deviation is printed -- confirm whether this is intentional.
    """
    dt = 0.0
    # Running sum of the one-hot samples; divided by `steps` at the end.
    avg_pvals = numpy.zeros(target_pvals.shape, dtype=config.floatX)
    for i in xrange(steps):
        t0 = time.time()
        ival = f()
        dt += time.time() - t0  # accumulate only the sampling time
        #ival = numpy.asarray(ival)
        avg_pvals += ival
    avg_pvals/= steps
    print 'random?[:10]\n', f()[:10]
    print prefix, 'mean', avg_pvals
    # Disabled tolerance check kept in the trailing comment:
    print numpy.mean(abs(avg_pvals - target_pvals))# < mean_rtol, 'bad mean? %s %s' % (str(avg_pvals), str(target_pvals))
    print prefix, 'time', dt
    print prefix, 'elements', steps*numpy.prod(target_pvals.shape)
    print prefix, 'samples/sec', steps*numpy.prod(target_pvals.shape) / dt
def test_multinomial():
    """Smoke-test MRG_RandomStreams.multinomial on CPU and (when enabled)
    on GPU, printing empirical means via basic_multinomialtest."""
    steps = 100
    # Smaller problem under slow modes so the test stays fast.
    if mode in ['DEBUG_MODE','FAST_COMPILE']:
        sample_size = (49,5)
    else:
        sample_size = (450,6)
    print ''
    print 'ON CPU:'

    # Random probability matrix with rows normalized to sum to 1.
    pvals = numpy.asarray(numpy.random.uniform(size=sample_size))
    pvals = numpy.apply_along_axis(lambda row : row/numpy.sum(row), 1, pvals)
    R = MRG_RandomStreams(234, use_cuda=False)
    # Note: non-symbolic pvals here; sampler always draws from the same matrix.
    m = R.multinomial(pvals=pvals, dtype=config.floatX)
    f = theano.function([], m, mode=mode)
    theano.printing.debugprint(f)
    basic_multinomialtest(f, steps, pvals, prefix='mrg ')
    sys.stdout.flush()

    if mode!='FAST_COMPILE' and cuda_enabled:
        print ''
        print 'ON GPU:'
        R = MRG_RandomStreams(234, use_cuda=True)
        n = R.multinomial(pvals=pvals, dtype='float32')
        # float32 is required for the GPU path to actually be exercised.
        assert n.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
        # Keep the result on the GPU (borrow=True) to measure device speed.
        f = theano.function([], theano.Out(
                theano.sandbox.cuda.basic_ops.gpu_from_host(n),
                borrow=True), mode=mode_with_gpu)
        theano.printing.debugprint(f)
        sys.stdout.flush()
        basic_multinomialtest(f, steps, pvals, prefix='gpu mrg ')
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论