Commit dab0b393 authored by Frédéric Bastien, committed by GitHub

Merge pull request #5038 from aalmah/gpu_multinomial_dtypes

Gpu multinomial more dtypes
@@ -52,11 +52,6 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op):
             odtype = pvals.dtype
         else:
             odtype = self.odtype
-        assert odtype == 'float32', odtype
-        if odtype != pvals.dtype:
-            raise NotImplementedError(
-                'GpuMultinomialFromUniform works only if '
-                'self.odtype == pvals.dtype', odtype, pvals.dtype)
         br = (pvals.broadcastable[1], pvals.broadcastable[0])
         out = GpuArrayType(broadcastable=br,
                            dtype=odtype,
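The removed assertion and NotImplementedError were the float32-only guard; the surviving logic just resolves the output dtype. A minimal sketch of that resolution, with resolve_odtype a hypothetical standalone helper rather than Theano code:

    # 'auto' inherits the dtype of pvals; an explicit odtype is used as-is.
    def resolve_odtype(odtype, pvals_dtype):
        return pvals_dtype if odtype == 'auto' else odtype

    assert resolve_odtype('auto', 'float32') == 'float32'
    assert resolve_odtype('int64', 'float32') == 'int64'  # rejected before this change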
@@ -74,7 +69,7 @@ KERNEL void k_multi_warp_multinomial(
         const ga_ssize pvals_col_stride,
         GLOBAL_MEM float * global_unis,
         const ga_ssize unis_stride,
-        GLOBAL_MEM float * global_outs,
+        GLOBAL_MEM %(out_ctype)s * global_outs,
         const ga_ssize outs_row_stride,
         const ga_ssize outs_col_stride
 )
@@ -88,14 +83,14 @@ KERNEL void k_multi_warp_multinomial(
         const float unis_n = global_unis[n*unis_stride];
         for (ga_size m = 0; m < nb_outcomes; ++m)
         {
-            float current_out = 0.;
+            %(out_ctype)s current_out = 0;
             if (!done)
             {
                 cummul += global_pvals[m * pvals_col_stride +
                                        n * pvals_row_stride];
                 if (unis_n < cummul)
                 {
-                    current_out = 1.;
+                    current_out = 1;
                     done = true;
                 }
             }
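For orientation, the loop above is inverse-CDF sampling: per row, probabilities are accumulated until the cumulative sum exceeds the uniform draw, and that outcome gets a 1 (now written in the output dtype rather than a hard-coded float). A NumPy sketch of the same per-row computation, with one_sample a hypothetical helper and assuming the draw is below the row's total probability mass:

    import numpy as np

    def one_sample(pvals_row, uni):
        # index of the first outcome whose cumulative probability exceeds
        # uni, matching the kernel's `unis_n < cummul` test
        out = np.zeros_like(pvals_row)
        out[np.searchsorted(np.cumsum(pvals_row), uni, side='right')] = 1
        return out

    print(one_sample(np.array([.2, .8]), 0.31))  # -> [0. 1.]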
@@ -105,7 +100,7 @@ KERNEL void k_multi_warp_multinomial(
             }
         }
 }
-"""
+""" % dict(out_ctype=pygpu.gpuarray.dtype_to_ctype(node.outputs[0].dtype))
         return [Kernel(
             code=code, name="k_multi_warp_multinomial",
             params=[pygpu.gpuarray.SIZE,
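The kernel source has become a Python format string, so the output pointer type can be substituted per node. A minimal sketch of that substitution, assuming pygpu is installed (the exact C type names returned by dtype_to_ctype are pygpu's choice):

    import pygpu

    template = "GLOBAL_MEM %(out_ctype)s * global_outs"
    for dtype in ('float32', 'int64'):
        # dtype_to_ctype maps a NumPy dtype name to the C type name used
        # inside the generated kernel source
        print(template % dict(out_ctype=pygpu.gpuarray.dtype_to_ctype(dtype)))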
@@ -128,6 +123,7 @@ KERNEL void k_multi_warp_multinomial(
         ctx = sub['params']
         sync = bool(config.gpuarray.sync)
         kname = self.gpu_kernels(node, name)[0].objvar
+        out_typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
         s = """
         PyGpuArrayObject * pvals = %(pvals)s;
         PyGpuArrayObject * unis = %(unis)s;
@@ -152,7 +148,7 @@ KERNEL void k_multi_warp_multinomial(
         dims[0] = PyGpuArray_DIMS(pvals)[1];
         dims[1] = PyGpuArray_DIMS(pvals)[0];
-        if (theano_prep_output(&out, 2, dims, unis->ga.typecode,
+        if (theano_prep_output(&out, 2, dims, %(out_typecode)s,
                                GA_C_ORDER, %(ctx)s) != 0){
             %(fail)s
         }
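dtype_to_typecode returns the integer type tag that libgpuarray's C API works with, so theano_prep_output now allocates the output in the op's dtype instead of inheriting the float32 typecode of unis. A short sketch, assuming pygpu is installed (the numeric values correspond to libgpuarray's GA_* constants):

    import pygpu

    for dt in ('float32', 'int64'):
        # one integer tag per element type, usable from C code as well
        print(dt, pygpu.gpuarray.dtype_to_typecode(dt))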
@@ -194,8 +190,8 @@ KERNEL void k_multi_warp_multinomial(
             PyGpuArray_STRIDES(pvals)[0]/sizeof(float),
             PyGpuArray_STRIDES(pvals)[1]/sizeof(float),
             PyGpuArray_STRIDES(unis)[0]/sizeof(float),
-            PyGpuArray_STRIDES(out)[0]/sizeof(float),
-            PyGpuArray_STRIDES(out)[1]/sizeof(float)
+            PyGpuArray_STRIDES(out)[0]/gpuarray_get_elsize(%(out_typecode)s),
+            PyGpuArray_STRIDES(out)[1]/gpuarray_get_elsize(%(out_typecode)s)
         };
         int err;
         args[0] = (void*)&PyGpuArray_DIMS(out)[1];
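The strides passed to the kernel are byte strides, but the kernel indexes in elements, so the divisor has to be the element size of the actual output dtype; sizeof(float) was only correct while outputs were float32. The same arithmetic, illustrated with NumPy:

    import numpy as np

    a = np.zeros((3, 4), dtype='int64')  # stand-in for an int64 output
    assert a.strides == (32, 8)          # byte strides (C-contiguous)
    elem_strides = tuple(s // a.itemsize for s in a.strides)
    assert elem_strides == (4, 1)        # what the kernel indexes with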
@@ -226,7 +222,7 @@ KERNEL void k_multi_warp_multinomial(
         return s

     def c_code_cache_version(self):
-        return (1,)
+        return (2,)


 class GPUAMultinomialWOReplacementFromUniform(GpuKernelBase, Op):
@@ -479,10 +475,9 @@ def local_gpua_multinomial(op, context_name, inputs, outputs):
     except NotScalarConstantError:
         return None
     m, = outputs
-    if (p.dtype == u.dtype == m.dtype == 'float32'):
-        gpu_op = GPUAMultinomialFromUniform(op.odtype)
-        return GpuDimShuffle([False, False], [1, 0])(
-            gpu_op(p, u))
+    gpu_op = GPUAMultinomialFromUniform(op.odtype)
+    return GpuDimShuffle([False, False], [1, 0])(
+        gpu_op(p, u))


 @register_opt('fast_compile')
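With the float32-only condition dropped, local_gpua_multinomial now lifts MultinomialFromUniform to the GPU for any odtype the op supports. A hedged usage sketch, assuming a configured Theano GPU context (it mirrors the pattern of the tests below):

    import theano
    import theano.tensor as tensor
    import theano.sandbox.multinomial

    p = tensor.fmatrix()
    u = tensor.fvector()
    m = theano.sandbox.multinomial.MultinomialFromUniform('int64')(p, u)
    # on a GPU target, graph optimization may now substitute
    # GPUAMultinomialFromUniform regardless of the requested odtype
    f = theano.function([p, u], m)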
@@ -23,31 +23,33 @@ def test_multinomial_0():
     p = tensor.fmatrix()
     u = tensor.fvector()
-    m = theano.sandbox.multinomial.MultinomialFromUniform('auto')(p, u)
-
-    # the m*2 allows the multinomial to reuse output
-    f = function([p, u], m * 2, allow_input_downcast=True, mode=mode_with_gpu)
-
-    assert any([type(node.op) is GPUAMultinomialFromUniform
-                for node in f.maker.fgraph.toposort()])
-
-    # test that both first and second samples can be drawn
-    utt.assert_allclose(f([[1, 0], [0, 1]], [.1, .1]),
-                        [[2, 0], [0, 2]])
-
-    # test that both second labels can be drawn
-    r = f([[.2, .8], [.3, .7]], [.31, .31])
-    utt.assert_allclose(r, [[0, 2], [0, 2]])
-
-    # test that both first labels can be drawn
-    r = f([[.2, .8], [.3, .7]], [.21, .21])
-    utt.assert_allclose(r, [[0, 2], [2, 0]])
-
-    # change the size to make sure output gets reallocated ok
-    # and also make sure that the GPU version doesn't screw up the
-    # transposed-ness
-    r = f([[.2, .8]], [.25])
-    utt.assert_allclose(r, [[0, 2]])
+    for dtype in ['int64', 'float32', 'auto']:
+        m = theano.sandbox.multinomial.MultinomialFromUniform(dtype)(p, u)
+
+        # the m*2 allows the multinomial to reuse output
+        f = function([p, u], m * 2, allow_input_downcast=True, mode=mode_with_gpu)
+
+        assert any([type(node.op) is GPUAMultinomialFromUniform
+                    for node in f.maker.fgraph.toposort()])
+
+        # test that both first and second samples can be drawn
+        utt.assert_allclose(f([[1, 0], [0, 1]], [.1, .1]),
+                            [[2, 0], [0, 2]])
+
+        # test that both second labels can be drawn
+        r = f([[.2, .8], [.3, .7]], [.31, .31])
+        utt.assert_allclose(r, [[0, 2], [0, 2]])
+
+        # test that both first labels can be drawn
+        r = f([[.2, .8], [.3, .7]], [.21, .21])
+        utt.assert_allclose(r, [[0, 2], [2, 0]])
+
+        # change the size to make sure output gets reallocated ok
+        # and also make sure that the GPU version doesn't screw up the
+        # transposed-ness
+        r = f([[.2, .8]], [.25])
+        utt.assert_allclose(r, [[0, 2]])

     # TODO: check a bigger example (make sure blocking on GPU is handled correctly)
@@ -80,6 +82,23 @@ def test_multinomial_large():
     utt.assert_allclose(mval, asdf)  # broadcast over all rows


+def test_gpu_opt_dtypes():
+    # Test if the returned samples are of the datatype specified
+    for dtype in ['uint32', 'float32', 'int64', 'float64']:
+        p = tensor.fmatrix()
+        u = tensor.fvector()
+        m = theano.sandbox.multinomial.MultinomialFromUniform(dtype)(p, u)
+        f = function([p, u], m, allow_input_downcast=True, mode=mode_with_gpu)
+        assert any([type(node.op) is GPUAMultinomialFromUniform
+                    for node in f.maker.fgraph.toposort()])
+
+        pval = numpy.arange(10000 * 4, dtype='float32').reshape((10000, 4)) + 0.1
+        pval = pval / pval.sum(axis=1)[:, None]
+        uval = numpy.ones_like(pval[:, 0]) * 0.5
+        samples = f(pval, uval)
+        assert samples.dtype == dtype, "%s != %s" % (samples.dtype, dtype)
+
+
 def test_gpu_opt():
     # Does have some overlap with test_multinomial_0