提交 dab0b393 作者: Frédéric Bastien 提交者: GitHub

Merge pull request #5038 from aalmah/gpu_multinomial_dtypes

Gpu multinomial more dtypes
...@@ -52,11 +52,6 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op): ...@@ -52,11 +52,6 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op):
odtype = pvals.dtype odtype = pvals.dtype
else: else:
odtype = self.odtype odtype = self.odtype
assert odtype == 'float32', odtype
if odtype != pvals.dtype:
raise NotImplementedError(
'GpuMultinomialFromUniform works only if '
'self.odtype == pvals.dtype', odtype, pvals.dtype)
br = (pvals.broadcastable[1], pvals.broadcastable[0]) br = (pvals.broadcastable[1], pvals.broadcastable[0])
out = GpuArrayType(broadcastable=br, out = GpuArrayType(broadcastable=br,
dtype=odtype, dtype=odtype,
...@@ -74,7 +69,7 @@ KERNEL void k_multi_warp_multinomial( ...@@ -74,7 +69,7 @@ KERNEL void k_multi_warp_multinomial(
const ga_ssize pvals_col_stride, const ga_ssize pvals_col_stride,
GLOBAL_MEM float * global_unis, GLOBAL_MEM float * global_unis,
const ga_ssize unis_stride, const ga_ssize unis_stride,
GLOBAL_MEM float * global_outs, GLOBAL_MEM %(out_ctype)s * global_outs,
const ga_ssize outs_row_stride, const ga_ssize outs_row_stride,
const ga_ssize outs_col_stride const ga_ssize outs_col_stride
) )
...@@ -88,14 +83,14 @@ KERNEL void k_multi_warp_multinomial( ...@@ -88,14 +83,14 @@ KERNEL void k_multi_warp_multinomial(
const float unis_n = global_unis[n*unis_stride]; const float unis_n = global_unis[n*unis_stride];
for (ga_size m = 0; m < nb_outcomes; ++m) for (ga_size m = 0; m < nb_outcomes; ++m)
{ {
float current_out = 0.; %(out_ctype)s current_out = 0;
if (!done) if (!done)
{ {
cummul += global_pvals[m * pvals_col_stride + cummul += global_pvals[m * pvals_col_stride +
n * pvals_row_stride]; n * pvals_row_stride];
if (unis_n < cummul) if (unis_n < cummul)
{ {
current_out = 1.; current_out = 1;
done = true; done = true;
} }
} }
...@@ -105,7 +100,7 @@ KERNEL void k_multi_warp_multinomial( ...@@ -105,7 +100,7 @@ KERNEL void k_multi_warp_multinomial(
} }
} }
} }
""" """ % dict(out_ctype=pygpu.gpuarray.dtype_to_ctype(node.outputs[0].dtype))
return [Kernel( return [Kernel(
code=code, name="k_multi_warp_multinomial", code=code, name="k_multi_warp_multinomial",
params=[pygpu.gpuarray.SIZE, params=[pygpu.gpuarray.SIZE,
...@@ -128,6 +123,7 @@ KERNEL void k_multi_warp_multinomial( ...@@ -128,6 +123,7 @@ KERNEL void k_multi_warp_multinomial(
ctx = sub['params'] ctx = sub['params']
sync = bool(config.gpuarray.sync) sync = bool(config.gpuarray.sync)
kname = self.gpu_kernels(node, name)[0].objvar kname = self.gpu_kernels(node, name)[0].objvar
out_typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
s = """ s = """
PyGpuArrayObject * pvals = %(pvals)s; PyGpuArrayObject * pvals = %(pvals)s;
PyGpuArrayObject * unis = %(unis)s; PyGpuArrayObject * unis = %(unis)s;
...@@ -152,7 +148,7 @@ KERNEL void k_multi_warp_multinomial( ...@@ -152,7 +148,7 @@ KERNEL void k_multi_warp_multinomial(
dims[0] = PyGpuArray_DIMS(pvals)[1]; dims[0] = PyGpuArray_DIMS(pvals)[1];
dims[1] = PyGpuArray_DIMS(pvals)[0]; dims[1] = PyGpuArray_DIMS(pvals)[0];
if (theano_prep_output(&out, 2, dims, unis->ga.typecode, if (theano_prep_output(&out, 2, dims, %(out_typecode)s,
GA_C_ORDER, %(ctx)s) != 0){ GA_C_ORDER, %(ctx)s) != 0){
%(fail)s %(fail)s
} }
...@@ -194,8 +190,8 @@ KERNEL void k_multi_warp_multinomial( ...@@ -194,8 +190,8 @@ KERNEL void k_multi_warp_multinomial(
PyGpuArray_STRIDES(pvals)[0]/sizeof(float), PyGpuArray_STRIDES(pvals)[0]/sizeof(float),
PyGpuArray_STRIDES(pvals)[1]/sizeof(float), PyGpuArray_STRIDES(pvals)[1]/sizeof(float),
PyGpuArray_STRIDES(unis)[0]/sizeof(float), PyGpuArray_STRIDES(unis)[0]/sizeof(float),
PyGpuArray_STRIDES(out)[0]/sizeof(float), PyGpuArray_STRIDES(out)[0]/gpuarray_get_elsize(%(out_typecode)s),
PyGpuArray_STRIDES(out)[1]/sizeof(float) PyGpuArray_STRIDES(out)[1]/gpuarray_get_elsize(%(out_typecode)s)
}; };
int err; int err;
args[0] = (void*)&PyGpuArray_DIMS(out)[1]; args[0] = (void*)&PyGpuArray_DIMS(out)[1];
...@@ -226,7 +222,7 @@ KERNEL void k_multi_warp_multinomial( ...@@ -226,7 +222,7 @@ KERNEL void k_multi_warp_multinomial(
return s return s
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (2,)
class GPUAMultinomialWOReplacementFromUniform(GpuKernelBase, Op): class GPUAMultinomialWOReplacementFromUniform(GpuKernelBase, Op):
...@@ -479,10 +475,9 @@ def local_gpua_multinomial(op, context_name, inputs, outputs): ...@@ -479,10 +475,9 @@ def local_gpua_multinomial(op, context_name, inputs, outputs):
except NotScalarConstantError: except NotScalarConstantError:
return None return None
m, = outputs m, = outputs
if (p.dtype == u.dtype == m.dtype == 'float32'): gpu_op = GPUAMultinomialFromUniform(op.odtype)
gpu_op = GPUAMultinomialFromUniform(op.odtype) return GpuDimShuffle([False, False], [1, 0])(
return GpuDimShuffle([False, False], [1, 0])( gpu_op(p, u))
gpu_op(p, u))
@register_opt('fast_compile') @register_opt('fast_compile')
......
...@@ -23,31 +23,33 @@ def test_multinomial_0(): ...@@ -23,31 +23,33 @@ def test_multinomial_0():
p = tensor.fmatrix() p = tensor.fmatrix()
u = tensor.fvector() u = tensor.fvector()
m = theano.sandbox.multinomial.MultinomialFromUniform('auto')(p, u) for dtype in ['int64', 'float32', 'auto']:
# the m*2 allows the multinomial to reuse output m = theano.sandbox.multinomial.MultinomialFromUniform(dtype)(p, u)
f = function([p, u], m * 2, allow_input_downcast=True, mode=mode_with_gpu)
assert any([type(node.op) is GPUAMultinomialFromUniform # the m*2 allows the multinomial to reuse output
for node in f.maker.fgraph.toposort()]) f = function([p, u], m * 2, allow_input_downcast=True, mode=mode_with_gpu)
assert any([type(node.op) is GPUAMultinomialFromUniform
for node in f.maker.fgraph.toposort()])
# test that both first and second samples can be drawn # test that both first and second samples can be drawn
utt.assert_allclose(f([[1, 0], [0, 1]], [.1, .1]), utt.assert_allclose(f([[1, 0], [0, 1]], [.1, .1]),
[[2, 0], [0, 2]]) [[2, 0], [0, 2]])
# test that both second labels can be drawn # test that both second labels can be drawn
r = f([[.2, .8], [.3, .7]], [.31, .31]) r = f([[.2, .8], [.3, .7]], [.31, .31])
utt.assert_allclose(r, [[0, 2], [0, 2]]) utt.assert_allclose(r, [[0, 2], [0, 2]])
# test that both first labels can be drawn # test that both first labels can be drawn
r = f([[.2, .8], [.3, .7]], [.21, .21]) r = f([[.2, .8], [.3, .7]], [.21, .21])
utt.assert_allclose(r, [[0, 2], [2, 0]]) utt.assert_allclose(r, [[0, 2], [2, 0]])
# change the size to make sure output gets reallocated ok # change the size to make sure output gets reallocated ok
# and also make sure that the GPU version doesn't screw up the # and also make sure that the GPU version doesn't screw up the
# transposed-ness # transposed-ness
r = f([[.2, .8]], [.25]) r = f([[.2, .8]], [.25])
utt.assert_allclose(r, [[0, 2]]) utt.assert_allclose(r, [[0, 2]])
# TODO: check a bigger example (make sure blocking on GPU is handled correctly) # TODO: check a bigger example (make sure blocking on GPU is handled correctly)
...@@ -80,6 +82,23 @@ def test_multinomial_large(): ...@@ -80,6 +82,23 @@ def test_multinomial_large():
utt.assert_allclose(mval, asdf) # broadcast over all rows utt.assert_allclose(mval, asdf) # broadcast over all rows
def test_gpu_opt_dtypes():
    # Check that the samples returned by the compiled function carry
    # whichever output dtype was requested when the multinomial op was built.
    requested_dtypes = ['uint32', 'float32', 'int64', 'float64']
    for dtype in requested_dtypes:
        # Fresh symbolic inputs and a freshly compiled function per dtype.
        p = tensor.fmatrix()
        u = tensor.fvector()
        sampler = theano.sandbox.multinomial.MultinomialFromUniform(dtype)
        m = sampler(p, u)
        f = function([p, u], m, allow_input_downcast=True, mode=mode_with_gpu)

        # The compiled graph must contain a GPUAMultinomialFromUniform node.
        op_types = [type(node.op) for node in f.maker.fgraph.toposort()]
        assert any(op_type is GPUAMultinomialFromUniform
                   for op_type in op_types)

        # 10000 rows of 4 strictly positive weights, normalised so that
        # every row sums to one.
        raw = numpy.arange(10000 * 4, dtype='float32').reshape((10000, 4)) + 0.1
        pval = raw / raw.sum(axis=1)[:, None]
        # One uniform draw per row, all fixed at 0.5.
        uval = numpy.ones_like(pval[:, 0]) * 0.5

        samples = f(pval, uval)
        assert samples.dtype == dtype, "%s != %s" % (samples.dtype, dtype)
def test_gpu_opt(): def test_gpu_opt():
# Does have some overlap with test_multinomial_0 # Does have some overlap with test_multinomial_0
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论