Commit fa78b21a authored by abergeron, committed by GitHub

Merge pull request #5451 from aalmah/multi_dtype_multinomial

multiple data type support for GPUMultinomialFromUniform output
......@@ -18,10 +18,12 @@ from .opt import register_opt, op_lifter, register_opt2
from .type import GpuArrayType
from .elemwise import GpuDimShuffle
from theano.scalar import as_scalar
from .fp16_help import write_w, load_w, work_dtype
class GPUAMultinomialFromUniform(GpuKernelBase, Op):
__props__ = ("odtype",)
_f16_ok = True
def __init__(self, odtype):
Op.__init__(self)
......@@ -37,10 +39,9 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op):
return [os.path.dirname(__file__)]
def make_node(self, pvals, unis):
assert pvals.dtype == 'float32'
assert unis.dtype == 'float32'
assert unis.dtype == pvals.dtype
assert pvals.dtype in ['float32', 'float16', 'float64']
ctx_name = infer_context_name(pvals, unis)
pvals = as_gpuarray_variable(pvals, ctx_name)
unis = as_gpuarray_variable(unis, ctx_name)
......@@ -60,14 +61,19 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op):
return Apply(self, [pvals, unis], [out])
def gpu_kernels(self, node, name):
out_ctype = pygpu.gpuarray.dtype_to_ctype(node.outputs[0].dtype)
in_ctype = pygpu.gpuarray.dtype_to_ctype(node.inputs[0].dtype)
work_ctype = pygpu.gpuarray.dtype_to_ctype(work_dtype(node.inputs[0].dtype))
write_out_ctype = write_w(node.outputs[0].dtype)
load_in_ctype = load_w(node.inputs[0].dtype)
code = """
KERNEL void k_multi_warp_multinomial(
const ga_size nb_multi,
const ga_size nb_outcomes,
GLOBAL_MEM float * global_pvals,
GLOBAL_MEM %(in_ctype)s * global_pvals,
const ga_ssize pvals_row_stride,
const ga_ssize pvals_col_stride,
GLOBAL_MEM float * global_unis,
GLOBAL_MEM %(in_ctype)s * global_unis,
const ga_ssize unis_stride,
GLOBAL_MEM %(out_ctype)s * global_outs,
const ga_ssize outs_row_stride,
......@@ -78,16 +84,15 @@ KERNEL void k_multi_warp_multinomial(
int n = LDIM_0*GID_0 + LID_0;
if (n < nb_multi)
{
float cummul = 0.;
%(work_ctype)s cummul = 0.;
bool done = false;
const float unis_n = global_unis[n*unis_stride];
const %(work_ctype)s unis_n = %(load_in_ctype)s(global_unis[n*unis_stride]);
for (ga_size m = 0; m < nb_outcomes; ++m)
{
%(out_ctype)s current_out = 0;
%(work_ctype)s current_out = 0;
if (!done)
{
cummul += global_pvals[m * pvals_col_stride +
n * pvals_row_stride];
cummul += %(load_in_ctype)s(global_pvals[m * pvals_col_stride + n * pvals_row_stride]);
if (unis_n < cummul)
{
current_out = 1;
......@@ -96,11 +101,12 @@ KERNEL void k_multi_warp_multinomial(
}
//write out transposed for speed.
global_outs[n * outs_col_stride +
m * outs_row_stride] = current_out;
m * outs_row_stride] = %(write_out_ctype)s(current_out);
}
}
}
""" % dict(out_ctype=pygpu.gpuarray.dtype_to_ctype(node.outputs[0].dtype))
""" % dict(out_ctype=out_ctype, write_out_ctype=write_out_ctype,
work_ctype=work_ctype, in_ctype=in_ctype, load_in_ctype=load_in_ctype)
return [Kernel(
code=code, name="k_multi_warp_multinomial",
params=[pygpu.gpuarray.SIZE,
......@@ -124,6 +130,7 @@ KERNEL void k_multi_warp_multinomial(
sync = bool(config.gpuarray.sync)
kname = self.gpu_kernels(node, name)[0].objvar
out_typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
in_typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
s = """
PyGpuArrayObject * pvals = %(pvals)s;
PyGpuArrayObject * unis = %(unis)s;
......@@ -187,9 +194,9 @@ KERNEL void k_multi_warp_multinomial(
void *args[10];
ssize_t strides[5] = {
PyGpuArray_STRIDES(pvals)[0]/sizeof(float),
PyGpuArray_STRIDES(pvals)[1]/sizeof(float),
PyGpuArray_STRIDES(unis)[0]/sizeof(float),
PyGpuArray_STRIDES(pvals)[0]/gpuarray_get_elsize(%(in_typecode)s),
PyGpuArray_STRIDES(pvals)[1]/gpuarray_get_elsize(%(in_typecode)s),
PyGpuArray_STRIDES(unis)[0]/gpuarray_get_elsize(%(in_typecode)s),
PyGpuArray_STRIDES(out)[0]/gpuarray_get_elsize(%(out_typecode)s),
PyGpuArray_STRIDES(out)[1]/gpuarray_get_elsize(%(out_typecode)s)
};
......@@ -222,7 +229,7 @@ KERNEL void k_multi_warp_multinomial(
return s
def c_code_cache_version(self):
return (2,)
return (3,)
class GPUAMultinomialWOReplacementFromUniform(GpuKernelBase, Op):
......
......@@ -16,15 +16,14 @@ from ..multinomial import (GPUAMultinomialFromUniform,
GPUAMultinomialWOReplacementFromUniform)
def test_multinomial_0():
def test_multinomial_output_dtype():
# This tests the MultinomialFromUniform Op directly, not going through the
# multinomial() call in GPU random generation.
p = tensor.fmatrix()
u = tensor.fvector()
for dtype in ['int64', 'float32', 'auto']:
for dtype in ['int64', 'float32', 'float16', 'float64', 'int32', 'auto']:
m = theano.sandbox.multinomial.MultinomialFromUniform(dtype)(p, u)
# the m*2 allows the multinomial to reuse output
......@@ -52,6 +51,44 @@ def test_multinomial_0():
utt.assert_allclose(r, [[0, 2]])
def test_multinomial_input_dtype():
    """Check GPUAMultinomialFromUniform over all supported input/output dtypes.

    Builds MultinomialFromUniform for every combination of input dtype
    (float32/float16/float64) and output dtype (float32/float16/float64/int32),
    compiles with the GPU mode, verifies the op is lifted to
    GPUAMultinomialFromUniform, and checks sampled outputs on small
    hand-computed cases.
    """
    # This tests the MultinomialFromUniform Op directly, not going through
    # the multinomial() call in GPU random generation.
    for idtype in ['float32', 'float16', 'float64']:
        for odtype in ['float32', 'float16', 'float64', 'int32']:
            p = tensor.matrix('p', idtype)
            u = tensor.vector('u', idtype)
            m = theano.sandbox.multinomial.MultinomialFromUniform(odtype)(p, u)

            # The m*2 allows the multinomial to reuse output.
            f = function([p, u], m * 2, allow_input_downcast=True,
                         mode=mode_with_gpu)

            # The op must have been lifted to its GPU counterpart.
            assert any(type(node.op) is GPUAMultinomialFromUniform
                       for node in f.maker.fgraph.toposort())

            # Test that both first and second samples can be drawn.
            utt.assert_allclose(f([[1, 0], [0, 1]], [.1, .1]),
                                [[2, 0], [0, 2]])

            # Test that both second labels can be drawn.
            r = f([[.2, .8], [.3, .7]], [.31, .31])
            utt.assert_allclose(r, [[0, 2], [0, 2]])

            # Test that both first labels can be drawn.
            r = f([[.2, .8], [.3, .7]], [.21, .21])
            utt.assert_allclose(r, [[0, 2], [2, 0]])

            # Change the size to make sure output gets reallocated ok
            # and also make sure that the GPU version doesn't screw up
            # the transposed-ness.
            r = f([[.2, .8]], [.25])
            utt.assert_allclose(r, [[0, 2]])
# TODO: check a bigger example (make sure blocking on GPU is handled correctly)
def test_multinomial_large():
# DEBUG_MODE will test this on GPU
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to post a comment