提交 c708e3a5 authored 作者: Frederic Bastien's avatar Frederic Bastien

Advance MultinomialFromUniform for the new back-end

上级 fa978938
#section support_code_apply #section support_code_apply
static __global__ void k_multi_warp_APPLYSPECIFIC(multinomial)( static __global__ void k_multi_warp_multinomial(
const int nb_multi, const int nb_multi,
const int nb_outcomes, const int nb_outcomes,
float * global_pvals, float * global_pvals,
...@@ -66,6 +66,7 @@ int APPLY_SPECIFIC(multinomial)(PyGpuArrayObject *pvals, ...@@ -66,6 +66,7 @@ int APPLY_SPECIFIC(multinomial)(PyGpuArrayObject *pvals,
if (theano_prep_output(out, 2, dims, unis->ga.typecode, if (theano_prep_output(out, 2, dims, unis->ga.typecode,
GA_C_ORDER, c) != 0) GA_C_ORDER, c) != 0)
return 1; return 1;
GpuArray_memset(&((*out)->ga), 0);
{ // NESTED SCOPE { // NESTED SCOPE
int nb_multi = PyGpuArray_DIMS(pvals)[0]; int nb_multi = PyGpuArray_DIMS(pvals)[0];
...@@ -97,19 +98,23 @@ int APPLY_SPECIFIC(multinomial)(PyGpuArrayObject *pvals, ...@@ -97,19 +98,23 @@ int APPLY_SPECIFIC(multinomial)(PyGpuArrayObject *pvals,
assert(nb_blocks*nb_threads >= nb_multi); assert(nb_blocks*nb_threads >= nb_multi);
k_multi_warp_APPLYSPECIFIC(multinomial)<<<n_blocks, n_threads, n_shared>>>( k_multi_warp_multinomial<<<n_blocks, n_threads, n_shared>>>(
PyGpuArray_DIMS(*out)[1], PyGpuArray_DIMS(*out)[1],
PyGpuArray_DIMS(*out)[0], PyGpuArray_DIMS(*out)[0],
PyGpuArray_DEV_DATA(%(pvals)s), (float*)PyGpuArray_DEV_DATA(pvals),
PyGpuArray_STRIDES(%(pvals)s)[0], PyGpuArray_STRIDES(pvals)[0],
PyGpuArray_STRIDES(%(pvals)s)[1], PyGpuArray_STRIDES(pvals)[1],
PyGpuArray_DEV_DATA(%(unis)s), (float*)PyGpuArray_DEV_DATA(unis),
PyGpuArray_STRIDES(%(unis)s)[0], PyGpuArray_STRIDES(unis)[0],
PyGpuArray_DEV_DATA(*out), (float*)PyGpuArray_DEV_DATA(*out),
PyGpuArray_STRIDES(*out)[0], PyGpuArray_STRIDES(*out)[0],
PyGpuArray_STRIDES(*out)[1] PyGpuArray_STRIDES(*out)[1]
); );
CNDA_THREAD_SYNC;
//TODO
//if(false)//SYNC)
// GpuArray_sync((*out)->ga);
// SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts) if (cudaSuccess != sts)
{ {
......
...@@ -2,19 +2,32 @@ import os ...@@ -2,19 +2,32 @@ import os
import pygpu import pygpu
import theano
import theano.sandbox.multinomial
from theano import Apply from theano import Apply
from theano.gof import COp from theano.gof import COp, local_optimizer
from .basic_ops import as_gpuarray_variable, infer_context_name from .basic_ops import as_gpuarray_variable, infer_context_name
from .type import gpu_context_type, GpuArrayType from .type import gpu_context_type, GpuArrayType
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
from theano.sandbox import gpuarray
from theano.sandbox.gpuarray.opt import register_opt, op_lifter
from theano.tensor import NotScalarConstantError, get_scalar_constant_value
class GPUAMultinomialFromUniform(COp): class GPUAMultinomialFromUniform(COp):
__props__ = ("odtype",)
params_type = gpu_context_type params_type = gpu_context_type
def __init__(self, odtype):
    """odtype: requested output dtype, or 'auto' to use pvals' dtype.

    The value is validated in make_node, which rejects any odtype that
    differs from pvals.dtype.
    """
    # The C implementation lives in multinomial.c; the entry point is
    # name-mangled per-apply via APPLY_SPECIFIC.
    COp.__init__(self, ['multinomial.c'], 'APPLY_SPECIFIC(multinomial)')
    self.odtype = odtype
def get_params(self, node): def get_params(self, node):
return node.outputs[0].type.context return node.outputs[0].type.context
def __init__(self): def c_compiler(self):
COp.__init__(self, ['multinomial.c'], 'APPLY_SPECIFIC(multinomial)') # TODO: get rid of this
return NVCC_compiler
def c_headers(self): def c_headers(self):
return ['<numpy_compat.h>', 'gpuarray_helper.h'] return ['<numpy_compat.h>', 'gpuarray_helper.h']
...@@ -30,10 +43,44 @@ class GPUAMultinomialFromUniform(COp): ...@@ -30,10 +43,44 @@ class GPUAMultinomialFromUniform(COp):
pvals = as_gpuarray_variable(pvals, ctx_name) pvals = as_gpuarray_variable(pvals, ctx_name)
unis = as_gpuarray_variable(unis, ctx_name) unis = as_gpuarray_variable(unis, ctx_name)
if pvals.ndim != 2:
raise NotImplementedError('pvals ndim should be 2', pvals.ndim)
if unis.ndim != 1:
raise NotImplementedError('unis ndim should be 1', unis.ndim)
if self.odtype == 'auto':
odtype = pvals.dtype
else:
odtype = self.odtype
if odtype != pvals.dtype:
raise NotImplementedError(
'GpuMultinomialFromUniform works only if '
'self.odtype == pvals.dtype', odtype, pvals.dtype)
br = (pvals.broadcastable[1], pvals.broadcastable[0]) br = (pvals.broadcastable[1], pvals.broadcastable[0])
out = GpuArrayType(broadcastable=br, dtype="float32")() out = GpuArrayType(broadcastable=br, dtype=odtype)()
return Apply(self, [pvals, unis], [out]) return Apply(self, [pvals, unis], [out])
def c_code_cache_version(self): def c_code_cache_version(self):
return (8,) return (8,)
@register_opt()
@op_lifter([theano.sandbox.multinomial.MultinomialFromUniform])
def local_gpua_multinomial(node, context_name):
    """Lift a CPU MultinomialFromUniform apply to the GPU implementation.

    Only the single-sample case (``n_samples == 1``) with float32 pvals,
    unis and output is supported; in every other case return None so the
    optimizer leaves the node on the CPU.
    """
    # Older graphs carry 2 inputs; newer ones add an explicit n_samples.
    if len(node.inputs) == 2:
        p, u = node.inputs
        n_samples = 1
    else:
        p, u, n_samples = node.inputs
    try:
        # The GPU op only draws one sample per row of pvals.
        if get_scalar_constant_value(n_samples) != 1:
            return None
    except NotScalarConstantError:
        # Symbolic n_samples: cannot prove it equals 1, so do not lift.
        return None
    m, = node.outputs
    if p.dtype == u.dtype == m.dtype == 'float32':
        gpu_op = GPUAMultinomialFromUniform(node.op.odtype)
        # The GPU op's output is laid out transposed (its make_node swaps
        # pvals' broadcastable pattern); dimshuffle back so the replacement
        # matches the CPU op's output orientation.
        return gpuarray.elemwise.GpuDimShuffle([False, False], [1, 0])(
            gpu_op(p, u))
from __future__ import absolute_import, print_function, division
import copy
import numpy import numpy
import theano import theano
from theano import tensor from theano import config, function, tensor
from theano.sandbox.gpuarray.multinomial import GPUAMultinomialFromUniform from ..multinomial import GPUAMultinomialFromUniform
from .config import mode_with_gpu from .config import mode_with_gpu
from theano.compile.mode import get_default_mode, predefined_linkers
import theano.tests.unittest_tools as utt
from .. import pygpu_activated
def get_mode(gpu):
    """Build a compilation mode for the tests.

    Starts from a copy of the default mode (so the shared default is never
    mutated), optionally layers on the gpuarray optimizations, and forces a
    linker capable of running the C implementations under test.
    """
    mode = copy.copy(get_default_mode())
    if gpu:
        mode = mode.including('gpuarray', 'gpu_local_optimizations',
                              'local_cut_gpu_host_gpu')
    # A pure-Python linker would skip the C code paths entirely.
    if isinstance(mode.linker, theano.gof.PerformLinker):
        mode.linker = predefined_linkers['c|py']
    if hasattr(mode.linker, 'c_thunks'):
        mode.linker.c_thunks = True
    return mode
def run_with_c(f, gpu=False):
    """Invoke test body *f* with a C-capable mode (GPU-enabled if *gpu*)."""
    f(get_mode(gpu), gpu)
def test_multinomial0(): def test_multinomial0():
# This tests the MultinomialFromUniform Op directly, not going through the # This tests the MultinomialFromUniform Op directly, not going through the
...@@ -12,8 +38,51 @@ def test_multinomial0(): ...@@ -12,8 +38,51 @@ def test_multinomial0():
p = tensor.fmatrix() p = tensor.fmatrix()
u = tensor.fvector() u = tensor.fvector()
m = GPUAMultinomialFromUniform()(p, u) m = GPUAMultinomialFromUniform('auto')(p, u)
f = theano.function([p, u], m, mode=mode_with_gpu) f = theano.function([p, u], m, mode=mode_with_gpu)
theano.printing.debugprint(f)
ret = f(numpy.array([[0.1, 0.2, 0.3, 0.4],
[0.1, 0.2, 0.3, 0.4]], dtype='float32'),
numpy.array([0.05, 0.05], dtype='float32'))
print(numpy.asarray(ret))
def test_multinomial_0():
    """Test the MultinomialFromUniform Op directly (not via GPU random
    generation), on both the CPU path and, when pygpu is available, the
    lifted GPU path.

    NOTE(review): the tail of this function was corrupted in the scraped
    diff (old and new diff sides fused on one line); reconstructed here
    from the new side of the commit.
    """
    p = tensor.fmatrix()
    u = tensor.fvector()
    m = theano.sandbox.multinomial.MultinomialFromUniform('auto')(p, u)

    def body(mode, gpu):
        # the m*2 allows the multinomial to reuse output
        f = function([p, u], m * 2, allow_input_downcast=True, mode=mode)
        if gpu:
            # The lifting optimization must have placed the GPU op in the
            # compiled graph.
            assert any([type(node.op) is GPUAMultinomialFromUniform
                        for node in f.maker.fgraph.toposort()])
        # test that both first and second samples can be drawn
        utt.assert_allclose(f([[1, 0], [0, 1]], [.1, .1]),
                            [[2, 0], [0, 2]])
        # test that both second labels can be drawn
        r = f([[.2, .8], [.3, .7]], [.31, .31])
        utt.assert_allclose(r, [[0, 2], [0, 2]])
        # test that both first labels can be drawn
        r = f([[.2, .8], [.3, .7]], [.21, .21])
        utt.assert_allclose(r, [[0, 2], [2, 0]])
        # change the size to make sure output gets reallocated ok
        # and also make sure that the GPU version doesn't screw up the
        # transposed-ness
        r = f([[.2, .8]], [.25])
        utt.assert_allclose(r, [[0, 2]])

    run_with_c(body)
    if pygpu_activated:
        run_with_c(body, True)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论