提交 a4906222 authored 作者: Harm de Vries's avatar Harm de Vries 提交者: Frederic Bastien

created the new op, not working yet

上级 e3569d12
#section support_code_apply
// One multinomial draw per thread: thread n walks the outcome
// probabilities of row n, accumulating them until the running sum
// exceeds the uniform sample unis[n]; that outcome gets 1.0f, every
// other outcome of the row gets 0.0f.
//
// Launch layout: 1D grid of 1D blocks; the caller guarantees
// gridDim.x * blockDim.x >= nb_multi (extra threads exit on the guard).
// All strides are in ELEMENTS, not bytes (the kernel does pointer
// arithmetic on float*).
// The output is written TRANSPOSED, shape (nb_outcomes, nb_multi).
static __global__ void k_multi_warp_APPLYSPECIFIC(multinomial)(
const int nb_multi,          // number of rows == number of draws
const int nb_outcomes,       // number of outcomes per draw
float * global_pvals,        // probabilities, (nb_multi, nb_outcomes)
const int pvals_row_stride,
const int pvals_col_stride,
float * global_unis,         // uniform samples, length nb_multi
const int unis_stride,
float * global_outs,         // one-hot output, (nb_outcomes, nb_multi)
const int outs_row_stride,
const int outs_col_stride
)
{
// each thread takes care of one multinomial draw
int n = blockDim.x*blockIdx.x + threadIdx.x;
if (n < nb_multi)
{
float cummul = 0.0f;  // float literal: avoid silent promotion to double
bool done = false;
const float unis_n = global_unis[n*unis_stride];
for (int m = 0; m < nb_outcomes; ++m)
{
// Keep looping even after the draw is decided: the remaining
// entries of the output row must still be zeroed.
float current_out = 0.0f;
if (!done)
{
cummul += global_pvals[m * pvals_col_stride + n * pvals_row_stride];
if (unis_n < cummul)
{
current_out = 1.0f;
done = true;
}
}
// write out transposed for speed: adjacent threads (adjacent n)
// store to adjacent addresses when outs_col_stride == 1, so the
// stores coalesce.
global_outs[n * outs_col_stride + m * outs_row_stride] = current_out;
}
}
}
#section support_code_struct
// Host wrapper for the multinomial kernel.
//
// pvals: (nb_multi, nb_outcomes) float32 probabilities, one row per draw.
// unis:  (nb_multi,) float32 uniform samples in [0, 1).
// out:   output cell; (re)allocated to the TRANSPOSED shape
//        (nb_outcomes, nb_multi) when missing or mis-shaped.
// c:     GPU context the arrays live on.
//
// Returns 0 on success; on failure sets a Python exception and FAILs.
int APPLY_SPECIFIC(multinomial)(PyGpuArrayObject *pvals,
PyGpuArrayObject *unis,
PyGpuArrayObject **out,
PyGpuContextObject *c) {
if (PyGpuArray_NDIM(pvals) != 2)
{
PyErr_Format(PyExc_TypeError, "pvals wrong rank");
FAIL;
}
if (PyGpuArray_NDIM(unis) != 1)
{
PyErr_Format(PyExc_TypeError, "unis wrong rank");
FAIL;
}
if (PyGpuArray_HOST_DIMS(unis)[0] != PyGpuArray_HOST_DIMS(pvals)[0])
{
PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0]");
FAIL;
}
// N.B. the output is TRANSPOSED compared with pvals.
if (NULL == *out
    || PyGpuArray_HOST_DIMS(*out)[0] != PyGpuArray_HOST_DIMS(pvals)[1]
    || PyGpuArray_HOST_DIMS(*out)[1] != PyGpuArray_HOST_DIMS(pvals)[0])
{
Py_XDECREF(*out);
size_t dims[2];
dims[0] = PyGpuArray_HOST_DIMS(pvals)[1];
dims[1] = PyGpuArray_HOST_DIMS(pvals)[0];
// NOTE(review): pygpu_empty is the gpuarray-backend allocator; the old
// code mixed in a nonexistent PyGpuArray_NewDims -- confirm signature
// against the pygpu C API in use.
*out = pygpu_empty(2, dims, GA_FLOAT, GA_C_ORDER, c, Py_None);
if (!*out)
{
PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
FAIL;
}
}
{ // NESTED SCOPE
int nb_multi = PyGpuArray_HOST_DIMS(pvals)[0];
int nb_outcomes = PyGpuArray_HOST_DIMS(pvals)[1];
// Max x-dimension of the grid on pre-SM30 devices is 65535.
// (The old `2<<15 - 1` parsed as 2<<14 == 32768 due to precedence.)
int max_nb_blocks = (2 << 15) - 1;
int nb_blocks = max_nb_blocks + 1;
int nb_threads = 16; // doubled before first use, so it really starts at 32
// Grow the block size until the whole problem fits in one grid.
do
{
nb_threads *= 2;
if (nb_multi % nb_threads == 0)
nb_blocks = nb_multi / nb_threads;
else
nb_blocks = (int)((float)nb_multi / (float)nb_threads + 1.0f); // ceil-div
} while (nb_blocks > max_nb_blocks);
// TODO: 512 threads/block is hardcoded; query the device limit instead.
if (nb_threads > 512)
{
PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for so many rows in the matrix (%i)", nb_multi);
FAIL;
}
dim3 n_blocks(nb_blocks, 1, 1);
dim3 n_threads(nb_threads, 1, 1);
int n_shared = 0;
assert(nb_blocks * nb_threads >= nb_multi);
// The kernel does pointer arithmetic on float*, so strides are passed
// in ELEMENTS: PyGpuArray_STRIDES is in bytes, hence / sizeof(float).
// NOTE(review): assumes PyGpuArray_DEV_DATA yields the device pointer
// for this backend -- confirm against the gpuarray API headers.
k_multi_warp_APPLYSPECIFIC(multinomial)<<<n_blocks, n_threads, n_shared>>>(
    nb_multi,
    nb_outcomes,
    (float *)PyGpuArray_DEV_DATA(pvals),
    (int)(PyGpuArray_STRIDES(pvals)[0] / sizeof(float)),
    (int)(PyGpuArray_STRIDES(pvals)[1] / sizeof(float)),
    (float *)PyGpuArray_DEV_DATA(unis),
    (int)(PyGpuArray_STRIDES(unis)[0] / sizeof(float)),
    (float *)PyGpuArray_DEV_DATA(*out),
    (int)(PyGpuArray_STRIDES(*out)[0] / sizeof(float)),
    (int)(PyGpuArray_STRIDES(*out)[1] / sizeof(float))
);
// A kernel launch returns no status; sync then poll the sticky error so
// both bad launch configs and async execution faults are surfaced here.
cudaDeviceSynchronize();
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
// Single-% conversions: this is a COp .c file, not a Python %-format
// template, so the old %%/\\n escapes were literal garbage.
PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s. (grid: %i x %i; block: %i x %i x %i; shared: %i)\n",
    "k_multi_warp_multinomial",
    cudaGetErrorString(sts),
    n_blocks.x,
    n_blocks.y,
    n_threads.x,
    n_threads.y,
    n_threads.z,
    n_shared);
FAIL;
}
} // END NESTED SCOPE
// The old code fell off the end of a non-void function: undefined return.
return 0;
}
\ No newline at end of file
from theano import Apply
from theano.gof import COp
from .basic_ops import as_gpuarray_variable, infer_context_name
from .type import GpuArrayType
class GPUAMultinomialFromUniform(COp):
    """GPU op that converts uniform samples into one-hot multinomial draws.

    Takes a float32 matrix of probabilities ``pvals`` (one row per draw)
    and a float32 vector of uniform samples ``unis`` (one per row), and
    produces a float32 one-hot matrix whose shape is the TRANSPOSE of
    ``pvals`` (see the comment in multinomial.c).
    """

    def __init__(self):
        # Implementation lives in the external C file; the entry point is
        # name-mangled per apply via APPLY_SPECIFIC.
        COp.__init__(self, ['multinomial.c'], 'APPLY_SPECIFIC(multinomial)')

    def make_node(self, pvals, unis):
        # The C kernel only handles float32 inputs.
        assert pvals.dtype == 'float32'
        assert unis.dtype == 'float32'
        ctx_name = infer_context_name(pvals, unis)
        pvals = as_gpuarray_variable(pvals, ctx_name)
        unis = as_gpuarray_variable(unis, ctx_name)
        # Output is transposed w.r.t. pvals, so swap the broadcast pattern.
        out_bcast = pvals.broadcastable[1], pvals.broadcastable[0]
        out_type = GpuArrayType(broadcastable=out_bcast, dtype="float32")
        return Apply(self, [pvals, unis], [out_type()])

    def c_code_cache_version(self):
        return (8,)
import numpy
import theano
from theano import tensor
from theano.sandbox.gpuarray.multinomial import GPUAMultinomialFromUniform
from .config import mode_with_gpu
def test_multinomial0():
    # This tests the MultinomialFromUniform Op directly, not going through the
    # multinomial() call in GPU random generation.
    p = tensor.fmatrix()
    u = tensor.fvector()
    m = GPUAMultinomialFromUniform()(p, u)
    f = theano.function([p, u], m, mode=mode_with_gpu)
    # Inputs must be float32: the op asserts on dtype, and fmatrix/fvector
    # would reject the float64 arrays the old test passed.
    pvals = numpy.asarray([[0.1, 0.2, 0.3, 0.4],
                           [0.1, 0.2, 0.3, 0.4]], dtype='float32')
    unis = numpy.asarray([0.05, 0.05], dtype='float32')
    res = numpy.asarray(f(pvals, unis))
    # The old `assert f(...)` raised "truth value of an array is ambiguous";
    # check concrete properties instead.  Output is TRANSPOSED: shape is
    # (nb_outcomes, nb_multi).
    assert res.shape == (4, 2)
    # Each draw (column) is one-hot.
    assert numpy.all(res.sum(axis=0) == 1)
    # With u = 0.05 < pvals[:, 0] = 0.1, the first outcome wins both draws.
    assert numpy.all(res[0] == 1)
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论