提交 aa483425 authored 作者: Frederic Bastien's avatar Frederic Bastien

Make GPUAMultinomialFromUniform use GpuKernelBase.

上级 d88c1a84
# TODO test dtype != float32
import os import os
import pygpu import pygpu
import theano import theano
import theano.sandbox.multinomial import theano.sandbox.multinomial
from theano import Apply from theano import Apply, config
from theano.gof import COp, local_optimizer from theano.gof import Op, local_optimizer
from theano.tensor import NotScalarConstantError, get_scalar_constant_value
from theano.sandbox import gpuarray
from .basic_ops import as_gpuarray_variable, infer_context_name from .basic_ops import as_gpuarray_variable, infer_context_name
from .fp16_help import write_w
from .opt import register_opt, op_lifter
from .type import gpu_context_type, GpuArrayType from .type import gpu_context_type, GpuArrayType
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
from theano.sandbox import gpuarray
from theano.sandbox.gpuarray.opt import register_opt, op_lifter
from theano.tensor import NotScalarConstantError, get_scalar_constant_value
class GPUAMultinomialFromUniform(gpuarray.basic_ops.GpuKernelBase, Op):
    """Draw one multinomial sample per row of `pvals` on the GPU.

    Each row of `pvals` is a probability distribution over
    ``nb_outcomes`` outcomes; `unis` supplies one uniform [0, 1)
    sample per row.  The output is written transposed
    (``outcomes x rows``) for speed -- see the kernel in
    :meth:`gpu_kernels`.

    Parameters
    ----------
    odtype
        Requested dtype of the output (NOTE(review): only float32 is
        exercised here -- see the file-level TODO about other dtypes).
    """

    __props__ = ("odtype",)
    # get_params() hands the gpu context to the generated C code.
    params_type = gpu_context_type

    def __init__(self, odtype):
        Op.__init__(self)
        self.odtype = odtype

    def get_params(self, node):
        # The kernel must be launched on the context that owns the
        # output buffer.
        return node.outputs[0].type.context

    def c_headers(self):
        return ['<numpy_compat.h>', 'gpuarray_helper.h']
...@@ -60,9 +57,180 @@ class GPUAMultinomialFromUniform(COp): ...@@ -60,9 +57,180 @@ class GPUAMultinomialFromUniform(COp):
return Apply(self, [pvals, unis], [out]) return Apply(self, [pvals, unis], [out])
def gpu_kernels(self, node, name):
    """Return the GPU kernel used by this op.

    One thread handles one multinomial draw (one row of ``pvals``):
    it walks the row accumulating probabilities until the cumulative
    sum exceeds that row's uniform sample, writes 1 at that outcome
    and 0 elsewhere.  The output is written transposed for speed.

    Parameters
    ----------
    node : Apply
        The node this kernel is generated for; only the output dtype
        is consulted (for compile flags).
    name : str
        Unique suffix used to build the kernel's ``objvar``.
    """
    dtype = node.outputs[0].dtype
    # NOTE(review): the kernel body hard-codes float buffers; dtype is
    # only used for compile flags -- confirm when enabling other dtypes.
    code = """
KERNEL void k_multi_warp_multinomial(
        const ga_size nb_multi,
        const ga_size nb_outcomes,
        GLOBAL_MEM float * global_pvals,
        const ga_ssize pvals_row_stride,
        const ga_ssize pvals_col_stride,
        GLOBAL_MEM float * global_unis,
        const ga_ssize unis_stride,
        GLOBAL_MEM float * global_outs,
        const ga_ssize outs_row_stride,
        const ga_ssize outs_col_stride
)
{
    // each thread takes care of one multinomial draw
    int n = blockDim.x*blockIdx.x + threadIdx.x;
    if (n < nb_multi)
    {
        float cummul = 0.;
        bool done = false;
        const float unis_n = global_unis[n*unis_stride];
        for (ga_size m = 0; m < nb_outcomes; ++m)
        {
            float current_out = 0.;
            if (!done)
            {
                cummul += global_pvals[m * pvals_col_stride +
                                       n * pvals_row_stride];
                if (unis_n < cummul)
                {
                    current_out = 1.;
                    done = true;
                }
            }
            //write out transposed for speed.
            global_outs[n * outs_col_stride +
                        m * outs_row_stride] = current_out;
        }
    }
}
"""
    # Param types must match the args[] layout built in c_code():
    # 2 sizes, then (buffer, stride, stride), (buffer, stride),
    # (buffer, stride, stride).
    return [gpuarray.basic_ops.Kernel(
        code=code,
        name="k_multi_warp_multinomial",
        params=[pygpu.gpuarray.SIZE,
                pygpu.gpuarray.SIZE,
                pygpu.gpuarray.GpuArray,
                pygpu.gpuarray.SSIZE,
                pygpu.gpuarray.SSIZE,
                pygpu.gpuarray.GpuArray,
                pygpu.gpuarray.SSIZE,
                pygpu.gpuarray.GpuArray,
                pygpu.gpuarray.SSIZE,
                pygpu.gpuarray.SSIZE],
        flags=gpuarray.basic_ops.Kernel.get_flags(dtype),
        objvar='k_multi_warp_multinomial_' + name)]
def c_code(self, node, name, inp, outputs, sub):
    """Generate the host-side C code that launches the kernel.

    Validates ranks and shapes of ``pvals`` (2d) and ``unis`` (1d),
    allocates the (transposed) output, picks a launch configuration,
    and calls the kernel built by :meth:`gpu_kernels`.

    Note: inside the template, ``%%`` is the escape for a literal
    ``%`` after Python ``%``-formatting.
    """
    pvals, unis = inp
    out, = outputs
    fail = sub['fail']
    ctx = sub['params']
    sync = bool(config.gpuarray.sync)
    # objvar of the (single) kernel registered by gpu_kernels().
    kname = self.gpu_kernels(node, name)[0].objvar
    s = """
    PyGpuArrayObject * pvals = %(pvals)s;
    PyGpuArrayObject * unis = %(unis)s;
    PyGpuArrayObject * out = %(out)s;
    size_t dims[2];
    if (PyGpuArray_NDIM(pvals) != 2)
    {
        PyErr_Format(PyExc_TypeError, "pvals wrong rank");
        %(fail)s
    }
    if (PyGpuArray_NDIM(unis) != 1)
    {
        PyErr_Format(PyExc_TypeError, "unis wrong rank");
        %(fail)s
    }
    if (PyGpuArray_DIMS(unis)[0] != PyGpuArray_DIMS(pvals)[0])
    {
        PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0]");
        %(fail)s
    }
    // Output is transposed: (nb_outcomes, nb_multi).
    dims[0] = PyGpuArray_DIMS(pvals)[1];
    dims[1] = PyGpuArray_DIMS(pvals)[0];
    if (theano_prep_output(&out, 2, dims, unis->ga.typecode,
                           GA_C_ORDER, %(ctx)s) != 0){
        %(fail)s
    }
    %(out)s = out;
    GpuArray_memset(&(out->ga), 0);
    { // NESTED SCOPE
        int nb_multi = PyGpuArray_DIMS(pvals)[0];
        int nb_outcomes = PyGpuArray_DIMS(pvals)[1];
        //TODO : change this for a beautiful constant
        // NOTE(review): due to C precedence this is 2<<(15-1) == 32768,
        // not 65535 -- conservative but within the grid limit; kept as-is.
        int max_nb_blocks = 2<<15 - 1;
        size_t nb_blocks = max_nb_blocks + 1;
        size_t nb_threads=16; // so it really starts at 32, because of the *2
        do
        {
            nb_threads*=2;
            if (nb_multi %% nb_threads == 0)
                nb_blocks = nb_multi/nb_threads;
            else
                nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
        } while (nb_blocks > max_nb_blocks);
        //printf("\\nN=%%i b=%%i t=%%i t*b=%%i",
        //    nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);
        // TODO : next line is a bit hardcoded...
        if (nb_threads > 512)
        {
            PyErr_Format(
                PyExc_ValueError,
                "Multinomial is not implemented for so many rows in the matrix (%%i)",
                nb_multi);
            %(fail)s
        }
        assert(nb_blocks*nb_threads >= nb_multi);
        void *args[10];
        // Strides are converted from bytes to float elements for the kernel.
        ssize_t strides[5] = {
            PyGpuArray_STRIDES(pvals)[0]/sizeof(float),
            PyGpuArray_STRIDES(pvals)[1]/sizeof(float),
            PyGpuArray_STRIDES(unis)[0]/sizeof(float),
            PyGpuArray_STRIDES(out)[0]/sizeof(float),
            PyGpuArray_STRIDES(out)[1]/sizeof(float)
        };
        int err;
        args[0] = (void*)&PyGpuArray_DIMS(out)[1];
        args[1] = (void*)&PyGpuArray_DIMS(out)[0];
        args[2] = pvals->ga.data; //PyGpuArray_DEV_DATA(pvals);
        args[3] = (void*)&strides[0];
        args[4] = (void*)&strides[1];
        args[5] = unis->ga.data; //PyGpuArray_DEV_DATA(unis);
        args[6] = (void*)&strides[2];
        args[7] = out->ga.data; //PyGpuArray_DEV_DATA(out);
        args[8] = (void*)&strides[3];
        args[9] = (void*)&strides[4];
        err = GpuKernel_call(&%(kname)s, 1, &nb_threads, &nb_blocks, 0, args);
        if (err != GA_NO_ERROR) {
            PyErr_Format(
                PyExc_RuntimeError,
                "gpuarray error: %%s: %%s.\\n",
                "k_multi_warp_%(name)s",
                GpuKernel_error(&%(kname)s, err));
            %(fail)s;
        }
        if(%(sync)d)
            GpuArray_sync(&(out->ga));
    } // END NESTED SCOPE
    """ % locals()
    return s
def c_code_cache_version(self):
    """Version tag for the C code cache; bump when c_code changes."""
    version = (1,)
    return version
@register_opt() @register_opt()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论