Commit f1a3d267 authored by Frederic Bastien

Code cleanup following code review

Parent d6697f9f
#section support_code_apply
static __global__ void k_multi_warp_multinomial(
    const int nb_multi,
    const int nb_outcomes,
    float * global_pvals,
    const int pvals_row_stride,
    const int pvals_col_stride,
    float * global_unis,
    const int unis_stride,
    float * global_outs,
    const int outs_row_stride,
    const int outs_col_stride
)
{
    // Each thread takes care of one multinomial draw.
    int n = blockDim.x*blockIdx.x + threadIdx.x;
    if (n < nb_multi)
    {
        float cummul = 0.;
        bool done = false;
        const float unis_n = global_unis[n*unis_stride];
        for (int m = 0; m < nb_outcomes; ++m)
        {
            float current_out = 0.;
            if (!done)
            {
                cummul += global_pvals[m * pvals_col_stride + n * pvals_row_stride];
                if (unis_n < cummul)
                {
                    current_out = 1.;
                    done = true;
                }
            }
            // Write the output transposed for speed.
            global_outs[n * outs_col_stride + m * outs_row_stride] = current_out;
        }
    }
}
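For intuition, the kernel is a parallel inverse-CDF sampler: each thread scans one row's cumulative probabilities until the running sum passes that row's uniform sample, emitting a one-hot vector. Below is a minimal CPU sketch of the same per-draw logic; the helper name multinomial_draw_cpu and the contiguous row layout are assumptions for illustration, not part of this Op.

#include <stddef.h>

/* Hypothetical single-draw reference: walk the running sum of the outcome
 * probabilities and set 1 at the first index where it exceeds `uni`. */
static void multinomial_draw_cpu(const float *pvals, size_t nb_outcomes,
                                 float uni, float *out)
{
    float cummul = 0.f;
    int done = 0;
    for (size_t m = 0; m < nb_outcomes; ++m)
    {
        float current_out = 0.f;
        if (!done)
        {
            cummul += pvals[m];
            if (uni < cummul)
            {
                current_out = 1.f;
                done = 1;
            }
        }
        out[m] = current_out;
    }
}

The GPU version runs this loop once per thread (one thread per row of pvals) and stores the result transposed so that neighbouring threads write to neighbouring addresses.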
#section support_code_struct
int APPLY_SPECIFIC(multinomial)(PyGpuArrayObject *pvals,
                                PyGpuArrayObject *unis,
                                PyGpuArrayObject **out,
                                PyGpuContextObject *c) {
    size_t dims[2];
    if (PyGpuArray_NDIM(pvals) != 2)
    {
        PyErr_Format(PyExc_TypeError, "pvals wrong rank");
        return 1;
    }
    if (PyGpuArray_NDIM(unis) != 1)
    {
        PyErr_Format(PyExc_TypeError, "unis wrong rank");
        return 1;
    }
    if (PyGpuArray_DIMS(unis)[0] != PyGpuArray_DIMS(pvals)[0])
    {
        PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0]");
        return 1;
    }
    // The output is allocated transposed, (nb_outcomes, nb_multi): the kernel
    // writes it that way so consecutive threads store to consecutive addresses.
    dims[0] = PyGpuArray_DIMS(pvals)[1];
    dims[1] = PyGpuArray_DIMS(pvals)[0];
    if (theano_prep_output(out, 2, dims, unis->ga.typecode,
                           GA_C_ORDER, c) != 0)
        return 1;
    if (GpuArray_memset(&((*out)->ga), 0) != GA_NO_ERROR)
    {
        PyErr_SetString(PyExc_RuntimeError, "GpuArray_memset failed");
        return 1;
    }
    { // NESTED SCOPE
        int nb_multi = PyGpuArray_DIMS(pvals)[0];
        int nb_outcomes = PyGpuArray_DIMS(pvals)[1];
        //TODO : change this for a beautiful constant
        // Parenthesized: "2<<15 - 1" parses as 2<<14, not the intended 65535.
        int max_nb_blocks = (2 << 15) - 1;
        int nb_blocks = max_nb_blocks + 1;
        int nb_threads = 16; // so it really starts at 32, because of the *2
        do
        {
            nb_threads *= 2;
            // Ceiling division: enough blocks to cover all nb_multi draws.
            nb_blocks = (nb_multi + nb_threads - 1) / nb_threads;
        } while (nb_blocks > max_nb_blocks);
        //printf("\nN=%i b=%i t=%i t*b=%i", nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);
        // TODO : next line is a bit hardcoded...
        if (nb_threads > 512)
        {
            PyErr_Format(PyExc_ValueError,
                         "Multinomial is not implemented for so many rows in the matrix (%i)",
                         nb_multi);
            return 1;
        }
        dim3 n_blocks(nb_blocks, 1, 1);
        dim3 n_threads(nb_threads, 1, 1);
        int n_shared = 0;
        assert(nb_blocks*nb_threads >= nb_multi);
        k_multi_warp_multinomial<<<n_blocks, n_threads, n_shared>>>(
            PyGpuArray_DIMS(*out)[1],
            PyGpuArray_DIMS(*out)[0],
            (float*)PyGpuArray_DEV_DATA(pvals),
            PyGpuArray_STRIDES(pvals)[0]/sizeof(float),
            PyGpuArray_STRIDES(pvals)[1]/sizeof(float),
            (float*)PyGpuArray_DEV_DATA(unis),
            PyGpuArray_STRIDES(unis)[0]/sizeof(float),
            (float*)PyGpuArray_DEV_DATA(*out),
            PyGpuArray_STRIDES(*out)[0]/sizeof(float),
            PyGpuArray_STRIDES(*out)[1]/sizeof(float)
        );
        //TODO
        //if(false)//SYNC)
        //    GpuArray_sync((*out)->ga);
        //    SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess != sts)
        {
            PyErr_Format(PyExc_RuntimeError,
                         "Cuda error: %s: %s. (grid: %i x %i; block: %i x %i x %i; shared: %i)\n",
                         "k_multi_warp_multinomial",
                         cudaGetErrorString(sts),
                         n_blocks.x,
                         n_blocks.y,
                         n_threads.x,
                         n_threads.y,
                         n_threads.z,
                         n_shared);
            return 1;
        }
    } // END NESTED SCOPE
    return 0;
}
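As a standalone illustration of the launch pattern used above (ceiling-divide the work into blocks, launch, then poll cudaGetLastError), here is a minimal sketch built around a made-up toy kernel k_fill; it is an assumption-laden example, not part of the Op.

#include <cstdio>
#include <cuda_runtime.h>

// Toy kernel: same bounds guard as k_multi_warp_multinomial's `n < nb_multi`.
static __global__ void k_fill(float *out, int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
        out[i] = 1.f;
}

int main()
{
    const int n = 100000;
    float *d_out = NULL;
    if (cudaMalloc((void **)&d_out, n * sizeof(float)) != cudaSuccess)
        return 1;

    const int nb_threads = 512;  // the per-block cap the Op enforces
    const int nb_blocks = (n + nb_threads - 1) / nb_threads;  // ceiling division

    k_fill<<<nb_blocks, nb_threads>>>(d_out, n);

    // Kernel launches are asynchronous; cudaGetLastError reports launch
    // failures (e.g. a bad configuration), as the error path above does.
    cudaError_t sts = cudaGetLastError();
    if (sts != cudaSuccess)
        fprintf(stderr, "Cuda error: %s\n", cudaGetErrorString(sts));

    cudaFree(d_out);
    return sts == cudaSuccess ? 0 : 1;
}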
@@ -17,7 +17,6 @@ from .type import gpu_context_type, GpuArrayType
 class GPUAMultinomialFromUniform(gpuarray.basic_ops.GpuKernelBase, Op):
     __props__ = ("odtype",)
-    params_type = gpu_context_type

     def __init__(self, odtype):
         Op.__init__(self)
@@ -30,7 +29,7 @@ class GPUAMultinomialFromUniform(gpuarray.basic_ops.GpuKernelBase, Op):
         return ['<numpy_compat.h>', 'gpuarray_helper.h']

     def c_header_dirs(self):
-        return [os.path.dirname(__file__), pygpu.get_include()]
+        return [os.path.dirname(__file__)]

     def make_node(self, pvals, unis):
         assert pvals.dtype == 'float32'
@@ -100,15 +99,7 @@ KERNEL void k_multi_warp_multinomial(
             }
         }
     }
-//KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
-//  ga_size nb = n < m ? n : m;
-//  for (ga_size i = LID_0; i < nb; i += LDIM_0) {
-//    a[i*m + i] = %(write_a)s(1);
-//  }
-//}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(dtype),
-              name=name, write_a=write_w(dtype))
+"""
         return [gpuarray.basic_ops.Kernel(
             code=code, name="k_multi_warp_multinomial",
             params=[pygpu.gpuarray.SIZE,
...