提交 fa978938 作者: Harm de Vries 提交者: Frederic Bastien

freds changes

上级 a4906222
...@@ -44,42 +44,32 @@ int APPLY_SPECIFIC(multinomial)(PyGpuArrayObject *pvals, ...@@ -44,42 +44,32 @@ int APPLY_SPECIFIC(multinomial)(PyGpuArrayObject *pvals,
PyGpuArrayObject *unis, PyGpuArrayObject *unis,
PyGpuArrayObject **out, PyGpuArrayObject **out,
PyGpuContextObject *c) { PyGpuContextObject *c) {
size_t dims[2];
if (PyGpuArray_NDIM(pvals) != 2) if (PyGpuArray_NDIM(pvals) != 2)
{ {
PyErr_Format(PyExc_TypeError, "pvals wrong rank"); PyErr_Format(PyExc_TypeError, "pvals wrong rank");
FAIL; return 1;
} }
if (PyGpuArray_NDIM(unis) != 1) if (PyGpuArray_NDIM(unis) != 1)
{ {
PyErr_Format(PyExc_TypeError, "unis wrong rank"); PyErr_Format(PyExc_TypeError, "unis wrong rank");
FAIL; return 1;
} }
if (PyGpuArray_HOST_DIMS(unis)[0] != PyGpuArray_HOST_DIMS(pvals)[0]) if (PyGpuArray_DIMS(unis)[0] != PyGpuArray_DIMS(pvals)[0])
{ {
PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0]"); PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0]");
FAIL; return 1;
} }
//N.B. that the output is TRANSPOSED compared with pvals dims[0] = PyGpuArray_DIMS(pvals)[1];
if ((NULL == *out) dims[1] = PyGpuArray_DIMS(pvals)[0];
|| (PyGpuArray_HOST_DIMS(*out[0] != PyGpuArray_HOST_DIMS(pvals)[1] if (theano_prep_output(out, 2, dims, unis->ga.typecode,
|| (PyGpuAarray_HOST_DIMS(*out[1] != PyGpuArray_HOST_DIMS(pvals)[0]) GA_C_ORDER, c) != 0)
{ return 1;
Py_XDECREF(*out);
npy_intp dims[2];
dims[0] = (PyGpuArray_HOST_DIMS(pvals)[1];
dims[1] = (PyGpuArray_HOST_DIMS(pvals)[0]);
*out = (PyGpuarray*)PyGpuArray_NewDims(2, dims);
if (!*out)
{
PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
FAIL;
}
}
{ // NESTED SCOPE { // NESTED SCOPE
int nb_multi = PyGpuArray_HOST_DIMS(pvals)[0]; int nb_multi = PyGpuArray_DIMS(pvals)[0];
int nb_outcomes = PyGpuArray_HOST_DIMS(pvals)[1]; int nb_outcomes = PyGpuArray_DIMS(pvals)[1];
//TODO : change this for a beautiful constant //TODO : change this for a beautiful constant
int max_nb_blocks = 2<<15 - 1; int max_nb_blocks = 2<<15 - 1;
int nb_blocks = max_nb_blocks + 1; int nb_blocks = max_nb_blocks + 1;
...@@ -87,19 +77,19 @@ int APPLY_SPECIFIC(multinomial)(PyGpuArrayObject *pvals, ...@@ -87,19 +77,19 @@ int APPLY_SPECIFIC(multinomial)(PyGpuArrayObject *pvals,
do do
{ {
nb_threads*=2; nb_threads*=2;
if (nb_multi %% nb_threads == 0) if (nb_multi % nb_threads == 0)
nb_blocks = nb_multi/nb_threads; nb_blocks = nb_multi/nb_threads;
else else
nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.); nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
} while (nb_blocks > max_nb_blocks); } while (nb_blocks > max_nb_blocks);
//printf("\\nN=%%i b=%%i t=%%i t*b=%%i", nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads); //printf("\\nN=%i b=%i t=%i t*b=%i", nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);
// TODO : next line is a bit hardcoded... // TODO : next line is a bit hardcoded...
if (nb_threads > 512) if (nb_threads > 512)
{ {
PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for so many rows in the matrix (%%i)", nb_multi); PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for so many rows in the matrix (%i)", nb_multi);
FAIL; return 1;
} }
dim3 n_blocks(nb_blocks,1,1); dim3 n_blocks(nb_blocks,1,1);
dim3 n_threads(nb_threads,1,1); dim3 n_threads(nb_threads,1,1);
...@@ -108,22 +98,22 @@ int APPLY_SPECIFIC(multinomial)(PyGpuArrayObject *pvals, ...@@ -108,22 +98,22 @@ int APPLY_SPECIFIC(multinomial)(PyGpuArrayObject *pvals,
assert(nb_blocks*nb_threads >= nb_multi); assert(nb_blocks*nb_threads >= nb_multi);
k_multi_warp_APPLYSPECIFIC(multinomial)<<<n_blocks, n_threads, n_shared>>>( k_multi_warp_APPLYSPECIFIC(multinomial)<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_HOST_DIMS(%(z)s)[1], PyGpuArray_DIMS(*out)[1],
CudaNdarray_HOST_DIMS(%(z)s)[0], PyGpuArray_DIMS(*out)[0],
CudaNdarray_DEV_DATA(%(pvals)s), PyGpuArray_DEV_DATA(%(pvals)s),
CudaNdarray_HOST_STRIDES(%(pvals)s)[0], PyGpuArray_STRIDES(%(pvals)s)[0],
CudaNdarray_HOST_STRIDES(%(pvals)s)[1], PyGpuArray_STRIDES(%(pvals)s)[1],
CudaNdarray_DEV_DATA(%(unis)s), PyGpuArray_DEV_DATA(%(unis)s),
CudaNdarray_HOST_STRIDES(%(unis)s)[0], PyGpuArray_STRIDES(%(unis)s)[0],
CudaNdarray_DEV_DATA(%(z)s), PyGpuArray_DEV_DATA(*out),
CudaNdarray_HOST_STRIDES(%(z)s)[0], PyGpuArray_STRIDES(*out)[0],
CudaNdarray_HOST_STRIDES(%(z)s)[1] PyGpuArray_STRIDES(*out)[1]
); );
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts) if (cudaSuccess != sts)
{ {
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i; shared: %%i)\\n", PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s. (grid: %i x %i; block: %i x %i x %i; shared: %i)\\n",
"k_multi_warp_%(name)s", "k_multi_warp_%(name)s",
cudaGetErrorString(sts), cudaGetErrorString(sts),
n_blocks.x, n_blocks.x,
...@@ -132,8 +122,9 @@ int APPLY_SPECIFIC(multinomial)(PyGpuArrayObject *pvals, ...@@ -132,8 +122,9 @@ int APPLY_SPECIFIC(multinomial)(PyGpuArrayObject *pvals,
n_threads.y, n_threads.y,
n_threads.z, n_threads.z,
n_shared); n_shared);
FAIL; return 1;
} }
} // END NESTED SCOPE } // END NESTED SCOPE
} return 0;
\ No newline at end of file }
import os
import pygpu
from theano import Apply from theano import Apply
from theano.gof import COp from theano.gof import COp
from .basic_ops import as_gpuarray_variable, infer_context_name from .basic_ops import as_gpuarray_variable, infer_context_name
from .type import GpuArrayType from .type import gpu_context_type, GpuArrayType
class GPUAMultinomialFromUniform(COp): class GPUAMultinomialFromUniform(COp):
params_type = gpu_context_type
def get_params(self, node):
return node.outputs[0].type.context
def __init__(self): def __init__(self):
COp.__init__(self, ['multinomial.c'], 'APPLY_SPECIFIC(multinomial)') COp.__init__(self, ['multinomial.c'], 'APPLY_SPECIFIC(multinomial)')
def c_headers(self):
return ['<numpy_compat.h>', 'gpuarray_helper.h']
def c_header_dirs(self):
return [os.path.dirname(__file__), pygpu.get_include()]
def make_node(self, pvals, unis): def make_node(self, pvals, unis):
assert pvals.dtype == 'float32' assert pvals.dtype == 'float32'
assert unis.dtype == 'float32' assert unis.dtype == 'float32'
......
Markdown 格式
0%
您即将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论