提交 e9e07abd authored 作者: Amjad Almahairi's avatar Amjad Almahairi 提交者: Arnaud Bergeron

first trial - unfinished

first trial - midwork start local memory debugging debugging initial test fixed bugs modified test moved test file debugging first trial - finished
上级 d5944c96
......@@ -11,11 +11,13 @@ import theano
import theano.sandbox.multinomial
from theano import Apply, config
from theano.gof import Op
from theano.tensor import NotScalarConstantError, get_scalar_constant_value
from theano import gpuarray
from theano.tensor import NotScalarConstantError, get_scalar_constant_value
from .basic_ops import as_gpuarray_variable, infer_context_name
from .opt import register_opt, op_lifter, register_opt2
from .type import GpuArrayType
from theano.scalar import as_scalar
class GPUAMultinomialFromUniform(gpuarray.basic_ops.GpuKernelBase, Op):
......@@ -227,6 +229,239 @@ KERNEL void k_multi_warp_multinomial(
return (1,)
class GPUAMultinomialWOReplacementFromUniform(gpuarray.basic_ops.GpuKernelBase, Op):
"""
The output is transposed compared to MultinomialWOReplacementFromUniform.
We must insert a Transpose op after it.
The optimization that moves it to the gpu does it.
"""
__props__ = ("odtype",)
def __init__(self, odtype):
Op.__init__(self)
self.odtype = odtype
def get_params(self, node):
return node.outputs[0].type.context
def c_headers(self):
return ['<numpy_compat.h>', 'gpuarray_helper.h']
def c_header_dirs(self):
return [os.path.dirname(__file__)]
def make_node(self, pvals, unis, n):
assert pvals.dtype == 'float32'
assert unis.dtype == 'float32'
ctx_name = infer_context_name(pvals, unis)
pvals = as_gpuarray_variable(pvals, ctx_name)
unis = as_gpuarray_variable(unis, ctx_name)
if pvals.ndim != 2:
raise NotImplementedError('pvals ndim should be 2', pvals.ndim)
if unis.ndim != 1:
raise NotImplementedError('unis ndim should be 1', unis.ndim)
if self.odtype == 'auto':
odtype = 'int64'
else:
odtype = self.odtype
assert odtype == 'int64', odtype
br = (pvals.broadcastable[1], pvals.broadcastable[0])
out = GpuArrayType(broadcastable=br,
dtype=odtype,
context_name=ctx_name)()
return Apply(self, [pvals, unis, as_scalar(n)], [out])
def gpu_kernels(self, node, name):
code = """
KERNEL void k_multi_warp_multinomial_wor(
const ga_size nb_multi,
const ga_size nb_outcomes,
const ga_size n_samples,
GLOBAL_MEM float * global_pvals_copy,
const ga_ssize pvals_row_stride,
const ga_ssize pvals_col_stride,
GLOBAL_MEM float * global_unis,
const ga_ssize unis_stride,
GLOBAL_MEM ga_long * global_outs,
const ga_ssize outs_row_stride,
const ga_ssize outs_col_stride
)
{
// each thread takes care of one multinomial-wor n_samples-draw
int n = LDIM_0*GID_0 + LID_0;
if (n < nb_multi)
{
for (int c = 0; c < n_samples; ++c)
{
float cummul = 0.;
bool done = false;
const float unis_n = global_unis[(c * nb_multi + n)*unis_stride];
for (ga_size m = 0; m < nb_outcomes; ++m)
{
float pvals_nm = global_pvals_copy[m * pvals_col_stride + n * pvals_row_stride];
cummul += pvals_nm;
if (!done && unis_n < cummul)
{
//write out transposed for speed.
global_outs[n * outs_col_stride +
c * outs_row_stride] = m;
global_pvals_copy[m * pvals_col_stride + n * pvals_row_stride] = 0.0;
cummul -= pvals_nm;
done = true;
}
}
// renormalize the multinomial
for (ga_int k = 0; k < nb_outcomes; ++k)
{
global_pvals_copy[k * pvals_col_stride + n * pvals_row_stride] /= cummul;
}
}
}
}
"""
return [gpuarray.basic_ops.Kernel(
code=code, name="k_multi_warp_multinomial_wor",
params=[pygpu.gpuarray.SIZE,
pygpu.gpuarray.SIZE,
pygpu.gpuarray.SIZE,
pygpu.gpuarray.GpuArray,
pygpu.gpuarray.SSIZE,
pygpu.gpuarray.SSIZE,
pygpu.gpuarray.GpuArray,
pygpu.gpuarray.SSIZE,
pygpu.gpuarray.GpuArray,
pygpu.gpuarray.SSIZE,
pygpu.gpuarray.SSIZE
],
flags=gpuarray.basic_ops.Kernel.get_flags(node.outputs[0].dtype),
objvar='k_multi_warp_multinomial_wor_' + name)]
def c_code(self, node, name, inp, outputs, sub):
pvals, unis, n = inp
out, = outputs
fail = sub['fail']
ctx = sub['params']
sync = bool(config.gpuarray.sync)
kname = self.gpu_kernels(node, name)[0].objvar
s = """
PyGpuArrayObject * pvals = %(pvals)s;
PyGpuArrayObject * unis = %(unis)s;
const size_t n_samples = %(n)s;
PyGpuArrayObject * out = %(out)s;
// create a copy of pvals matrix
PyGpuArrayObject * pvals_copy = NULL;
size_t dims[2];
if (PyGpuArray_NDIM(pvals) != 2)
{
PyErr_Format(PyExc_TypeError, "pvals wrong rank");
%(fail)s
}
if (PyGpuArray_NDIM(unis) != 1)
{
PyErr_Format(PyExc_TypeError, "unis wrong rank");
%(fail)s
}
if ( n_samples > (PyGpuArray_DIMS(pvals)[1]) )
{
PyErr_Format(PyExc_ValueError, "Cannot sample without replacement n samples bigger than the size of the distribution.");
%(fail)s;
}
if (PyGpuArray_DIMS(unis)[0] != PyGpuArray_DIMS(pvals)[0] * n_samples)
{
PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0] * n");
%(fail)s
}
pvals_copy = pygpu_copy(pvals, GA_C_ORDER);
dims[0] = n_samples;
dims[1] = PyGpuArray_DIMS(pvals)[0];
if (theano_prep_output(&out, 2, dims, GA_LONG,
GA_C_ORDER, %(ctx)s) != 0){
%(fail)s
}
%(out)s = out;
{ // NESTED SCOPE
int nb_multi = PyGpuArray_DIMS(pvals)[0];
int nb_outcomes = PyGpuArray_DIMS(pvals)[1];
//TODO : change this for a beautiful constant
int max_nb_blocks = 2<<15 - 1;
size_t nb_blocks = max_nb_blocks + 1;
size_t nb_threads=16; // so it really starts at 32, because of the *2
do
{
nb_threads*=2;
if (nb_multi %% nb_threads == 0)
nb_blocks = nb_multi/nb_threads;
else
nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
} while (nb_blocks > max_nb_blocks);
// TODO : next line is a bit hardcoded...
if (nb_threads > 512)
{
PyErr_Format(
PyExc_ValueError,
"Multinomial is not implemented for so many rows in the matrix (%%i)",
nb_multi);
%(fail)s
}
assert(nb_blocks*nb_threads >= nb_multi);
void *args[11];
ssize_t strides[5] = {
PyGpuArray_STRIDES(pvals)[0]/sizeof(float),
PyGpuArray_STRIDES(pvals)[1]/sizeof(float),
PyGpuArray_STRIDES(unis)[0]/sizeof(float),
PyGpuArray_STRIDES(out)[0]/8,
PyGpuArray_STRIDES(out)[1]/8
};
int err;
args[0] = (void*)&PyGpuArray_DIMS(pvals)[0];
args[1] = (void*)&PyGpuArray_DIMS(pvals)[1];
args[2] = (void*)&n_samples;
args[3] = pvals_copy->ga.data; //PyGpuArray_DEV_DATA(pvals);
args[4] = (void*)&strides[0];
args[5] = (void*)&strides[1];
args[6] = unis->ga.data; //PyGpuArray_DEV_DATA(unis);
args[7] = (void*)&strides[2];
args[8] = out->ga.data; //PyGpuArray_DEV_DATA(out);
args[9] = (void*)&strides[3];
args[10] = (void*)&strides[4];
err = GpuKernel_call(&%(kname)s, 1, &nb_threads, &nb_blocks, 0, args);
if (err != GA_NO_ERROR) {
PyErr_Format(
PyExc_RuntimeError,
"gpuarray error: %%s: %%s.\\n",
"k_multi_warp_%(name)s",
GpuKernel_error(&%(kname)s, err));
%(fail)s;
}
if(%(sync)d)
GpuArray_sync(&(out->ga));
} // END NESTED SCOPE
""" % locals()
return s
def c_code_cache_version(self):
return (1,)
@register_opt('fast_compile')
@op_lifter([theano.sandbox.multinomial.MultinomialFromUniform])
@register_opt2([theano.sandbox.multinomial.MultinomialFromUniform], 'fast_compile')
......@@ -248,3 +483,20 @@ def local_gpua_multinomial(op, context_name, inputs, outputs):
gpu_op = GPUAMultinomialFromUniform(op.odtype)
return gpuarray.elemwise.GpuDimShuffle([False, False], [1, 0])(
gpu_op(p, u))
@register_opt()
@op_lifter([theano.sandbox.multinomial.MultinomialWOReplacementFromUniform])
def local_gpua_multinomial_wor(node, context_name):
# TODO : need description for function
p, u, n = node.inputs
# try:
# if get_scalar_constant_value(n_samples) != 1:
# return None
# except NotScalarConstantError:
# return None
m, = node.outputs
if ((p.dtype == u.dtype == 'float32') and (m.dtype == 'int64')):
gpu_op = GPUAMultinomialWOReplacementFromUniform(node.op.odtype)
return gpuarray.elemwise.GpuDimShuffle([False, False], [1, 0])(
gpu_op(p, u, n))
......@@ -2,23 +2,18 @@ from __future__ import absolute_import, print_function, division
import numpy
import unittest
import theano
from theano import config, function, tensor
from ..multinomial import GPUAMultinomialFromUniform
import theano.tests.unittest_tools as utt
from .config import mode_with_gpu, mode_without_gpu
def get_mode(gpu):
mode = mode_without_gpu
if gpu:
mode = mode_with_gpu
return mode
from theano.sandbox import multinomial
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
import theano.tests.unittest_tools as utt
def run_with_c(f, gpu=False):
mode = get_mode(gpu)
f(mode, gpu)
from .config import mode_with_gpu
from ..multinomial import (GPUAMultinomialFromUniform,
GPUAMultinomialWOReplacementFromUniform)
def test_multinomial_0():
......@@ -30,68 +25,59 @@ def test_multinomial_0():
m = theano.sandbox.multinomial.MultinomialFromUniform('auto')(p, u)
def body(mode, gpu):
# the m*2 allows the multinomial to reuse output
f = function([p, u], m * 2, allow_input_downcast=True, mode=mode)
# the m*2 allows the multinomial to reuse output
f = function([p, u], m * 2, allow_input_downcast=True, mode=mode_with_gpu)
if gpu:
assert any([type(node.op) is GPUAMultinomialFromUniform
for node in f.maker.fgraph.toposort()])
# test that both first and second samples can be drawn
utt.assert_allclose(f([[1, 0], [0, 1]], [.1, .1]),
[[2, 0], [0, 2]])
assert any([type(node.op) is GPUAMultinomialFromUniform
for node in f.maker.fgraph.toposort()])
# test that both second labels can be drawn
r = f([[.2, .8], [.3, .7]], [.31, .31])
utt.assert_allclose(r, [[0, 2], [0, 2]])
# test that both first and second samples can be drawn
utt.assert_allclose(f([[1, 0], [0, 1]], [.1, .1]),
[[2, 0], [0, 2]])
# test that both first labels can be drawn
r = f([[.2, .8], [.3, .7]], [.21, .21])
utt.assert_allclose(r, [[0, 2], [2, 0]])
# test that both second labels can be drawn
r = f([[.2, .8], [.3, .7]], [.31, .31])
utt.assert_allclose(r, [[0, 2], [0, 2]])
# change the size to make sure output gets reallocated ok
# and also make sure that the GPU version doesn't screw up the
# transposed-ness
r = f([[.2, .8]], [.25])
utt.assert_allclose(r, [[0, 2]])
# test that both first labels can be drawn
r = f([[.2, .8], [.3, .7]], [.21, .21])
utt.assert_allclose(r, [[0, 2], [2, 0]])
run_with_c(body)
run_with_c(body, True)
# change the size to make sure output gets reallocated ok
# and also make sure that the GPU version doesn't screw up the
# transposed-ness
r = f([[.2, .8]], [.25])
utt.assert_allclose(r, [[0, 2]])
# TODO: check a bigger example (make sure blocking on GPU is handled correctly)
def test_multinomial_large():
# DEBUG_MODE will test this on GPU
def body(mode, gpu):
p = tensor.fmatrix()
u = tensor.fvector()
m = theano.sandbox.multinomial.MultinomialFromUniform('auto')(p, u)
f = function([p, u], m * 2, allow_input_downcast=True, mode=mode)
if gpu:
assert any([type(node.op) is GPUAMultinomialFromUniform
for node in f.maker.fgraph.toposort()])
pval = numpy.arange(10000 * 4,
dtype='float32').reshape((10000, 4)) + 0.1
pval = pval / pval.sum(axis=1)[:, None]
uval = numpy.ones_like(pval[:, 0]) * 0.5
mval = f(pval, uval)
assert mval.shape == pval.shape
if config.cast_policy == 'custom':
assert mval.dtype == pval.dtype
elif config.cast_policy == 'numpy+floatX':
assert mval.dtype == config.floatX
elif config.cast_policy == 'numpy':
assert mval.dtype == 'float64'
else:
raise NotImplementedError(config.cast_policy)
utt.assert_allclose(mval.sum(axis=1), 2)
asdf = numpy.asarray([0, 0, 2, 0]) + 0 * pval
utt.assert_allclose(mval, asdf) # broadcast over all rows
run_with_c(body)
run_with_c(body, True)
p = tensor.fmatrix()
u = tensor.fvector()
m = theano.sandbox.multinomial.MultinomialFromUniform('auto')(p, u)
f = function([p, u], m * 2, allow_input_downcast=True, mode=mode_with_gpu)
assert any([type(node.op) is GPUAMultinomialFromUniform
for node in f.maker.fgraph.toposort()])
pval = numpy.arange(10000 * 4,
dtype='float32').reshape((10000, 4)) + 0.1
pval = pval / pval.sum(axis=1)[:, None]
uval = numpy.ones_like(pval[:, 0]) * 0.5
mval = f(pval, uval)
assert mval.shape == pval.shape
if config.cast_policy == 'custom':
assert mval.dtype == pval.dtype
elif config.cast_policy == 'numpy+floatX':
assert mval.dtype == config.floatX
elif config.cast_policy == 'numpy':
assert mval.dtype == 'float64'
else:
raise NotImplementedError(config.cast_policy)
utt.assert_allclose(mval.sum(axis=1), 2)
asdf = numpy.asarray([0, 0, 2, 0]) + 0 * pval
utt.assert_allclose(mval, asdf) # broadcast over all rows
def test_gpu_opt():
......@@ -104,7 +90,7 @@ def test_gpu_opt():
m = theano.sandbox.multinomial.MultinomialFromUniform('auto')(p, u)
assert m.dtype == 'float32', m.dtype
f = function([p, u], m, allow_input_downcast=True, mode=get_mode(True))
f = function([p, u], m, allow_input_downcast=True, mode=mode_with_gpu)
assert any([type(node.op) is GPUAMultinomialFromUniform
for node in f.maker.fgraph.toposort()])
pval = numpy.arange(10000 * 4, dtype='float32').reshape((10000, 4)) + 0.1
......@@ -117,10 +103,192 @@ def test_gpu_opt():
m = theano.sandbox.multinomial.MultinomialFromUniform('auto')(r, u)
assert m.dtype == 'float32', m.dtype
f = function([r, u], m, allow_input_downcast=True, mode=get_mode(True))
f = function([r, u], m, allow_input_downcast=True, mode=mode_with_gpu)
assert any([type(node.op) is GPUAMultinomialFromUniform
for node in f.maker.fgraph.toposort()])
pval = numpy.arange(1 * 4, dtype='float32').reshape((1, 4)) + 0.1
pval = pval / pval.sum(axis=1)[:, None]
uval = numpy.ones_like(pval[:, 0]) * 0.5
f(pval, uval)
class test_OP_wor(unittest.TestCase):
def test_select_distinct(self):
"""
Tests that MultinomialWOReplacementFromUniform always selects distinct elements
"""
p = tensor.fmatrix()
u = tensor.fvector()
n = tensor.iscalar()
m = multinomial.MultinomialWOReplacementFromUniform('auto')(p, u, n)
f = function([p, u, n], m, allow_input_downcast=True)
n_elements = 1000
all_indices = range(n_elements)
numpy.random.seed(12345)
for i in [5, 10, 50, 100, 500, n_elements]:
uni = numpy.random.rand(i).astype(config.floatX)
pvals = numpy.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
pvals /= pvals.sum(1)
res = f(pvals, uni, i)
res = numpy.squeeze(res)
assert len(res) == i, res
assert numpy.all(numpy.in1d(numpy.unique(res), all_indices)), res
def test_fail_select_alot(self):
"""
Tests that MultinomialWOReplacementFromUniform fails when asked to sample more
elements than the actual number of elements
"""
p = tensor.fmatrix()
u = tensor.fvector()
n = tensor.iscalar()
m = multinomial.MultinomialWOReplacementFromUniform('auto')(p, u, n)
f = function([p, u, n], m, allow_input_downcast=True)
n_elements = 100
n_selected = 200
numpy.random.seed(12345)
uni = numpy.random.rand(n_selected).astype(config.floatX)
pvals = numpy.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
pvals /= pvals.sum(1)
self.assertRaises(ValueError, f, pvals, uni, n_selected)
def test_select_proportional_to_weight(self):
"""
Tests that MultinomialWOReplacementFromUniform selects elements, on average,
proportional to the their probabilities
"""
p = tensor.fmatrix()
u = tensor.fvector()
n = tensor.iscalar()
m = multinomial.MultinomialWOReplacementFromUniform('auto')(p, u, n)
f = function([p, u, n], m, allow_input_downcast=True)
n_elements = 100
n_selected = 10
mean_rtol = 0.0005
numpy.random.seed(12345)
pvals = numpy.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
pvals /= pvals.sum(1)
avg_pvals = numpy.zeros((n_elements,), dtype=config.floatX)
for rep in range(10000):
uni = numpy.random.rand(n_selected).astype(config.floatX)
res = f(pvals, uni, n_selected)
res = numpy.squeeze(res)
avg_pvals[res] += 1
avg_pvals /= avg_pvals.sum()
avg_diff = numpy.mean(abs(avg_pvals - pvals))
assert avg_diff < mean_rtol, avg_diff
class test_function_wor(unittest.TestCase):
def test_select_distinct(self):
"""
Tests that multinomial_wo_replacement always selects distinct elements
"""
th_rng = RandomStreams(12345)
p = tensor.fmatrix()
n = tensor.iscalar()
m = th_rng.multinomial_wo_replacement(pvals=p, n=n)
f = function([p, n], m, allow_input_downcast=True)
n_elements = 1000
all_indices = range(n_elements)
numpy.random.seed(12345)
for i in [5, 10, 50, 100, 500, n_elements]:
pvals = numpy.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
pvals /= pvals.sum(1)
res = f(pvals, i)
res = numpy.squeeze(res)
assert len(res) == i
assert numpy.all(numpy.in1d(numpy.unique(res), all_indices)), res
def test_fail_select_alot(self):
"""
Tests that multinomial_wo_replacement fails when asked to sample more
elements than the actual number of elements
"""
th_rng = RandomStreams(12345)
p = tensor.fmatrix()
n = tensor.iscalar()
m = th_rng.multinomial_wo_replacement(pvals=p, n=n)
f = function([p, n], m, allow_input_downcast=True)
n_elements = 100
n_selected = 200
numpy.random.seed(12345)
pvals = numpy.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
pvals /= pvals.sum(1)
self.assertRaises(ValueError, f, pvals, n_selected)
def test_select_proportional_to_weight(self):
"""
Tests that multinomial_wo_replacement selects elements, on average,
proportional to the their probabilities
"""
th_rng = RandomStreams(12345)
p = tensor.fmatrix()
n = tensor.iscalar()
m = th_rng.multinomial_wo_replacement(pvals=p, n=n)
f = function([p, n], m, allow_input_downcast=True)
n_elements = 100
n_selected = 10
mean_rtol = 0.0005
numpy.random.seed(12345)
pvals = numpy.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
pvals /= pvals.sum(1)
avg_pvals = numpy.zeros((n_elements,), dtype=config.floatX)
for rep in range(10000):
res = f(pvals, n_selected)
res = numpy.squeeze(res)
avg_pvals[res] += 1
avg_pvals /= avg_pvals.sum()
avg_diff = numpy.mean(abs(avg_pvals - pvals))
assert avg_diff < mean_rtol
def test_gpu_opt_wor():
# We test the case where we put the op on the gpu when the output
# is moved to the gpu.
p = tensor.fmatrix()
u = tensor.fvector()
n = tensor.iscalar()
m = multinomial.MultinomialWOReplacementFromUniform('auto')(p, u, n)
assert m.dtype == 'int64', m.dtype
f = function([p, u, n], m, allow_input_downcast=True, mode=mode_with_gpu)
assert any([type(node.op) is GPUAMultinomialWOReplacementFromUniform
for node in f.maker.fgraph.toposort()])
n_samples = 3
pval = numpy.arange(10000 * 4, dtype='float32').reshape((10000, 4)) + 0.1
pval = pval / pval.sum(axis=1)[:, None]
uval = numpy.ones(pval.shape[0] * n_samples) * 0.5
f(pval, uval, n_samples)
# Test with a row, it was failing in the past.
r = tensor.frow()
m = multinomial.MultinomialWOReplacementFromUniform('auto')(r, u, n)
assert m.dtype == 'int64', m.dtype
f = function([r, u, n], m, allow_input_downcast=True, mode=mode_with_gpu)
assert any([type(node.op) is GPUAMultinomialWOReplacementFromUniform
for node in f.maker.fgraph.toposort()])
pval = numpy.arange(1 * 4, dtype='float32').reshape((1, 4)) + 0.1
pval = pval / pval.sum(axis=1)[:, None]
uval = numpy.ones_like(pval[:, 0]) * 0.5
f(pval, uval, 1)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论