Commit c708e3a5 authored by Frederic Bastien

Advance MultinomialFromUniform for the new back-end

Parent: fa978938
#section support_code_apply
static __global__ void k_multi_warp_APPLYSPECIFIC(multinomial)(
static __global__ void k_multi_warp_multinomial(
const int nb_multi,
const int nb_outcomes,
float * global_pvals,
......@@ -66,6 +66,7 @@ int APPLY_SPECIFIC(multinomial)(PyGpuArrayObject *pvals,
if (theano_prep_output(out, 2, dims, unis->ga.typecode,
GA_C_ORDER, c) != 0)
return 1;
GpuArray_memset(&((*out)->ga), 0);
{ // NESTED SCOPE
int nb_multi = PyGpuArray_DIMS(pvals)[0];
......@@ -97,19 +98,23 @@ int APPLY_SPECIFIC(multinomial)(PyGpuArrayObject *pvals,
assert(nb_blocks*nb_threads >= nb_multi);
k_multi_warp_APPLYSPECIFIC(multinomial)<<<n_blocks, n_threads, n_shared>>>(
k_multi_warp_multinomial<<<n_blocks, n_threads, n_shared>>>(
PyGpuArray_DIMS(*out)[1],
PyGpuArray_DIMS(*out)[0],
PyGpuArray_DEV_DATA(%(pvals)s),
PyGpuArray_STRIDES(%(pvals)s)[0],
PyGpuArray_STRIDES(%(pvals)s)[1],
PyGpuArray_DEV_DATA(%(unis)s),
PyGpuArray_STRIDES(%(unis)s)[0],
PyGpuArray_DEV_DATA(*out),
(float*)PyGpuArray_DEV_DATA(pvals),
PyGpuArray_STRIDES(pvals)[0],
PyGpuArray_STRIDES(pvals)[1],
(float*)PyGpuArray_DEV_DATA(unis),
PyGpuArray_STRIDES(unis)[0],
(float*)PyGpuArray_DEV_DATA(*out),
PyGpuArray_STRIDES(*out)[0],
PyGpuArray_STRIDES(*out)[1]
);
CNDA_THREAD_SYNC;
//TODO
//if(false)//SYNC)
// GpuArray_sync((*out)->ga);
// SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
......
......@@ -2,19 +2,32 @@ import os
import pygpu
import theano
import theano.sandbox.multinomial
from theano import Apply
from theano.gof import COp
from theano.gof import COp, local_optimizer
from .basic_ops import as_gpuarray_variable, infer_context_name
from .type import gpu_context_type, GpuArrayType
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
from theano.sandbox import gpuarray
from theano.sandbox.gpuarray.opt import register_opt, op_lifter
from theano.tensor import NotScalarConstantError, get_scalar_constant_value
class GPUAMultinomialFromUniform(COp):
__props__ = ("odtype",)
params_type = gpu_context_type
def __init__(self, odtype):
COp.__init__(self, ['multinomial.c'], 'APPLY_SPECIFIC(multinomial)')
self.odtype = odtype
def get_params(self, node):
return node.outputs[0].type.context
def __init__(self):
COp.__init__(self, ['multinomial.c'], 'APPLY_SPECIFIC(multinomial)')
def c_compiler(self):
# TODO: get rid of this
return NVCC_compiler
def c_headers(self):
return ['<numpy_compat.h>', 'gpuarray_helper.h']
......@@ -30,10 +43,44 @@ class GPUAMultinomialFromUniform(COp):
pvals = as_gpuarray_variable(pvals, ctx_name)
unis = as_gpuarray_variable(unis, ctx_name)
if pvals.ndim != 2:
raise NotImplementedError('pvals ndim should be 2', pvals.ndim)
if unis.ndim != 1:
raise NotImplementedError('unis ndim should be 1', unis.ndim)
if self.odtype == 'auto':
odtype = pvals.dtype
else:
odtype = self.odtype
if odtype != pvals.dtype:
raise NotImplementedError(
'GpuMultinomialFromUniform works only if '
'self.odtype == pvals.dtype', odtype, pvals.dtype)
br = (pvals.broadcastable[1], pvals.broadcastable[0])
out = GpuArrayType(broadcastable=br, dtype="float32")()
out = GpuArrayType(broadcastable=br, dtype=odtype)()
return Apply(self, [pvals, unis], [out])
def c_code_cache_version(self):
return (8,)
@register_opt()
@op_lifter([theano.sandbox.multinomial.MultinomialFromUniform])
def local_gpua_multinomial(node, context_name):
    """Lift a CPU MultinomialFromUniform node to the GPU implementation.

    Only the single-sample case (``n_samples`` constant and equal to 1)
    with all-float32 inputs/output is handled; for any other node the
    optimizer declines by returning ``None``.
    """
    inputs = node.inputs
    if len(inputs) == 2:
        p, u = inputs
        n_samples = 1
    else:
        p, u, n_samples = inputs
    # Decline unless n_samples is provably the constant 1.
    try:
        if get_scalar_constant_value(n_samples) != 1:
            return None
    except NotScalarConstantError:
        return None
    m, = node.outputs
    if not (p.dtype == u.dtype == m.dtype == 'float32'):
        return None
    gpu_out = GPUAMultinomialFromUniform(node.op.odtype)(p, u)
    # The GPU op emits the transposed layout; shuffle axes back so the
    # replacement matches the CPU op's output orientation.
    return gpuarray.elemwise.GpuDimShuffle([False, False], [1, 0])(gpu_out)
from __future__ import absolute_import, print_function, division
import copy
import numpy
import theano
from theano import tensor
from theano.sandbox.gpuarray.multinomial import GPUAMultinomialFromUniform
from theano import config, function, tensor
from ..multinomial import GPUAMultinomialFromUniform
from .config import mode_with_gpu
from theano.compile.mode import get_default_mode, predefined_linkers
import theano.tests.unittest_tools as utt
from .. import pygpu_activated
def get_mode(gpu):
    """Return a copy of the default compilation mode, tuned for testing.

    When ``gpu`` is true, the gpuarray optimizations are enabled.  In
    every case the linker is adjusted so C thunks are compiled, ensuring
    the C implementations under test actually run.
    """
    # Copy so the process-wide default mode object is never mutated.
    configured = copy.copy(get_default_mode())
    if gpu:
        configured = configured.including(
            'gpuarray', 'gpu_local_optimizations', 'local_cut_gpu_host_gpu')
    # A pure-Python linker would skip the C code paths being exercised.
    if isinstance(configured.linker, theano.gof.PerformLinker):
        configured.linker = predefined_linkers['c|py']
    if hasattr(configured.linker, 'c_thunks'):
        configured.linker.c_thunks = True
    return configured
def run_with_c(f, gpu=False):
    """Build the compilation mode for ``gpu`` and invoke ``f(mode, gpu)``."""
    f(get_mode(gpu), gpu)
def test_multinomial0():
# This tests the MultinomialFromUniform Op directly, not going through the
......@@ -12,8 +38,51 @@ def test_multinomial0():
p = tensor.fmatrix()
u = tensor.fvector()
m = GPUAMultinomialFromUniform()(p, u)
m = GPUAMultinomialFromUniform('auto')(p, u)
f = theano.function([p, u], m, mode=mode_with_gpu)
theano.printing.debugprint(f)
ret = f(numpy.array([[0.1, 0.2, 0.3, 0.4],
[0.1, 0.2, 0.3, 0.4]], dtype='float32'),
numpy.array([0.05, 0.05], dtype='float32'))
print(numpy.asarray(ret))
def test_multinomial_0():
# This tests the MultinomialFromUniform Op directly, not going through the
# multinomial() call in GPU random generation.
p = tensor.fmatrix()
u = tensor.fvector()
m = theano.sandbox.multinomial.MultinomialFromUniform('auto')(p, u)
def body(mode, gpu):
# the m*2 allows the multinomial to reuse output
f = function([p, u], m * 2, allow_input_downcast=True, mode=mode)
if gpu:
assert any([type(node.op) is GPUAMultinomialFromUniform
for node in f.maker.fgraph.toposort()])
# test that both first and second samples can be drawn
utt.assert_allclose(f([[1, 0], [0, 1]], [.1, .1]),
[[2, 0], [0, 2]])
# test that both second labels can be drawn
r = f([[.2, .8], [.3, .7]], [.31, .31])
utt.assert_allclose(r, [[0, 2], [0, 2]])
# test that both first labels can be drawn
r = f([[.2, .8], [.3, .7]], [.21, .21])
utt.assert_allclose(r, [[0, 2], [2, 0]])
# change the size to make sure output gets reallocated ok
# and also make sure that the GPU version doesn't screw up the
# transposed-ness
r = f([[.2, .8]], [.25])
utt.assert_allclose(r, [[0, 2]])
assert f(numpy.array([[0.1, 0.2, 0.3, 0.4], [0.1, 0.2, 0.3, 0.4]]), numpy.array([0.05, 0.05]))
\ No newline at end of file
run_with_c(body)
if pygpu_activated:
run_with_c(body, True)
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment