提交 7461eaab 作者: Simon Lemieux

merge

......@@ -4,7 +4,7 @@ import theano.tensor as T
from theano.tensor.opt import register_specialize
from theano.gof import local_optimizer
from theano.sandbox.cuda import cuda_available
from theano.sandbox.cuda import cuda_available, cuda_enabled
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
......@@ -109,12 +109,11 @@ class GpuMultinomial(Multinomial):
raise TypeError('pvals must be cudandarray', pvals)
if not isinstance(unis.type, CudaNdarrayType):
raise TypeError('unis must be cudandarray', unis)
return Apply(self, [pvals, unis], [pvals.type()])
def c_code_cache_version(self):
#return ()
return (super(GpuMultinomial,self).c_code_cache_version(),1)
return ()
#return (super(GpuMultinomial,self).c_code_cache_version(),1)
def c_support_code_apply(self, node, nodename):
return """
......@@ -128,7 +127,7 @@ class GpuMultinomial(Multinomial):
float * global_outs
)
{
int n = 32*blockIdx.x + threadIdx.x;
int n = blockDim.x*blockIdx.x + threadIdx.x;
if (n < nb_multi)
{
......@@ -201,14 +200,31 @@ class GpuMultinomial(Multinomial):
int nb_outcomes = CudaNdarray_HOST_DIMS(%(z)s)[0];
int nb_multi = CudaNdarray_HOST_DIMS(%(z)s)[1];
int nb_block;
if (nb_multi %% 32 == 0)
nb_block = nb_multi/32;
else
nb_block = (int)((float)nb_multi/32. + 1.);
//TODO : change this for a beautiful constant
int max_nb_blocks = 2<<15 - 1;
int nb_blocks = max_nb_blocks + 1;
int nb_threads=16; // so it really starts at 32, because of the *2
do
{
nb_threads*=2;
if (nb_multi %% nb_threads == 0)
nb_blocks = nb_multi/nb_threads;
else
nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
} while (nb_blocks > max_nb_blocks);
//printf("\\nN=%%i b=%%i t=%%i t*b=%%i", nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);
// TODO : next line is a bit hardcoded...
if (nb_threads > 512)
{
PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for as many rows in the matrix (%%i)", nb_multi);
%(fail)s;
}
dim3 n_blocks(nb_block,1,1);
dim3 n_threads(32,1,1);
dim3 n_blocks(nb_blocks,1,1);
dim3 n_threads(nb_threads,1,1);
int n_shared = 0;
k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
......@@ -244,6 +260,6 @@ gpu_multinomial = GpuMultinomial()
def use_gpu_multinomial(node):
    """Swap a host `multinomial` node for its GPU counterpart.

    If `node.op` is the `multinomial` op, every input of the node is
    passed through `gpu_from_host`, `gpu_multinomial` is applied to the
    transferred inputs, and the result is brought back with
    `host_from_gpu`.

    :param node: the apply node being examined.
    :return: a one-element list holding the replacement output when the
        node's op is `multinomial`; otherwise None (no replacement).
    """
    # Keep the `==` comparison of the original so that the op's own
    # equality logic decides the match.
    if not (node.op == multinomial):
        return None

    inputs_on_gpu = [gpu_from_host(host_input) for host_input in node.inputs]
    gpu_result = gpu_multinomial(*inputs_on_gpu)
    return [host_from_gpu(gpu_result)]
if theano.config.device.startswith('gpu'):
if cuda_enabled:#theano.config.device.startswith('gpu'):
register_specialize(use_gpu_multinomial)
......@@ -685,7 +685,7 @@ class MRG_RandomStreams(object):
else:
raise NotImplementedError("MRG_RandomStreams.binomial with n > 1")
def multinomial(self, size=None, n=1, pvals=[[.5,.5]], ndim=None, dtype='int64'):
def multinomial(self, size=None, n=1, pvals=None, ndim=None, dtype='int64'):
"""
Sample `n` (currently `n` needs to be 1) times from a multinomial distribution defined by
probabilities pvals.
......@@ -696,13 +696,12 @@ class MRG_RandomStreams(object):
`size` and `ndim` are only there to keep the same signature as the other methods (uniform, binomial, normal, etc.).
todo : adapt multinomial to take that into account
"""
if pvals is None:
raise TypeError("You have to specify pvals")
pvals = as_tensor_variable(pvals)
if n == 1 and pvals.ndim == 2:
pvals = as_tensor_variable(pvals)
unis = self.uniform(size=pvals.shape[0:1], ndim=1)
return cast(multinomial(pvals.T, unis).T, dtype)
else:
raise NotImplementedError("MRG_RandomStreams.multinomial only implemented with n == 1 and pvals.ndim = 2")
......
......@@ -528,6 +528,7 @@ def test_multinomial():
print ''
print 'ON GPU:'
R = MRG_RandomStreams(234, use_cuda=True)
pvals = numpy.asarray(pvals, dtype='float32')
n = R.multinomial(pvals=pvals, dtype='float32')
assert n.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
f = theano.function([], theano.Out(
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论