提交 a388d94d authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Remove tentacles in sandbox

上级 9dcf3f4c
...@@ -10,12 +10,6 @@ from theano.tensor import NotScalarConstantError, get_scalar_constant_value ...@@ -10,12 +10,6 @@ from theano.tensor import NotScalarConstantError, get_scalar_constant_value
from theano.scalar import as_scalar from theano.scalar import as_scalar
import copy import copy
from theano.sandbox.cuda import cuda_available, GpuOp, register_opt
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
class MultinomialFromUniform(Op): class MultinomialFromUniform(Op):
# TODO : need description for parameter 'odtype' # TODO : need description for parameter 'odtype'
""" """
...@@ -403,232 +397,6 @@ class ChoiceFromUniform(MultinomialFromUniform): ...@@ -403,232 +397,6 @@ class ChoiceFromUniform(MultinomialFromUniform):
break break
class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp):
    """
    GPU implementation of MultinomialFromUniform.

    The output is transposed compared to MultinomialFromUniform.
    We must insert a Transpose op after it.

    The optimization that moves it to the gpu does it.
    """

    def make_node(self, pvals, unis):
        """Build the Apply node.

        Both inputs must be float32 CudaNdarrays.  The output type swaps
        the two broadcastable flags of ``pvals`` because the kernel
        writes its result transposed (see class docstring).
        """
        assert pvals.dtype == 'float32'
        assert unis.dtype == 'float32'
        if not isinstance(pvals.type, CudaNdarrayType):
            raise TypeError('pvals must be cudandarray', pvals)
        if not isinstance(unis.type, CudaNdarrayType):
            raise TypeError('unis must be cudandarray', unis)
        # 'auto' means: produce the same dtype as pvals.
        if self.odtype == 'auto':
            odtype = pvals.dtype
        else:
            odtype = self.odtype
        if odtype != pvals.dtype:
            raise NotImplementedError(
                'GpuMultinomialFromUniform works only if '
                'self.odtype == pvals.dtype', odtype, pvals.dtype)
        # Output shape is (pvals.shape[1], pvals.shape[0]) -- transposed.
        br = (pvals.broadcastable[1], pvals.broadcastable[0])
        out = CudaNdarrayType(broadcastable=br)()
        return Apply(self, [pvals, unis], [out])

    def perform(self, node, ins, outs):
        # The parent's perform() does not work with CudaNdarray inputs.
        # We don't need a Python implementation here: this op is only
        # inserted by a graph optimization, and DebugMode tests it
        # against the original CPU node instead.
        return Op.perform(self, node, ins, outs)

    def c_code_cache_version(self):
        # Bump this whenever the generated C/CUDA code below changes.
        return (9,)

    def c_support_code_apply(self, node, nodename):
        """Return the CUDA kernel source for this apply node.

        One GPU thread handles one multinomial draw (one row of pvals):
        it scans the outcomes, accumulating probabilities until the
        uniform sample is exceeded, then writes a one-hot row.  The
        result is written transposed (see the comment in the kernel).
        """
        return """
        static __global__ void k_multi_warp_%(nodename)s(
            const int nb_multi,
            const int nb_outcomes,
            float * global_pvals,
            const int pvals_row_stride,
            const int pvals_col_stride,
            float * global_unis,
            const int unis_stride,
            float * global_outs,
            const int outs_row_stride,
            const int outs_col_stride
        )
        {
            // each thread takes care of one multinomial draw
            int n = blockDim.x*blockIdx.x + threadIdx.x;
            if (n < nb_multi)
            {
                float cummul = 0.;
                bool done = false;
                const float unis_n = global_unis[n*unis_stride];
                for (int m = 0; m < nb_outcomes; ++m)
                {
                    float current_out = 0.;
                    if (!done)
                    {
                        cummul += global_pvals[m * pvals_col_stride + n * pvals_row_stride];
                        if (unis_n < cummul)
                        {
                            current_out = 1.;
                            done = true;
                        }
                    }
                    //write out transposed for speed.
                    global_outs[n * outs_col_stride + m * outs_row_stride] = current_out;
                }
            }
        }
        """ % locals()

    def c_code(self, node, name, ins, outs, sub):
        """Emit the C code that validates inputs, (re)allocates the
        transposed output, computes the launch configuration and runs
        the kernel from c_support_code_apply.

        NOTE(review): in the C below, ``2<<15 - 1`` parses as
        ``2 << 14 == 32768`` because of C operator precedence;
        presumably ``(2<<15) - 1`` was intended -- confirm before
        touching the generated code.
        """
        (pvals, unis) = ins
        (z,) = outs
        fail = sub['fail']
        return """
        if (CudaNdarray_NDIM(%(pvals)s) != 2)
        {
            PyErr_Format(PyExc_TypeError, "pvals wrong rank");
            %(fail)s;
        }
        if (CudaNdarray_NDIM(%(unis)s) != 1)
        {
            PyErr_Format(PyExc_TypeError, "unis wrong rank");
            %(fail)s;
        }
        if (CudaNdarray_HOST_DIMS(%(unis)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[0])
        {
            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0]");
            %(fail)s;
        }
        //N.B. that the output is TRANSPOSED compared with pvals
        if ((NULL == %(z)s)
            || (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])
            || (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(pvals)s)[0]))
        {
            Py_XDECREF(%(z)s);
            npy_intp dims[2];
            dims[0] = (CudaNdarray_HOST_DIMS(%(pvals)s)[1]);
            dims[1] = (CudaNdarray_HOST_DIMS(%(pvals)s)[0]);
            %(z)s = (CudaNdarray*)CudaNdarray_NewDims(2, dims);
            if (!%(z)s)
            {
                PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
                %(fail)s;
            }
        }
        { // NESTED SCOPE
        int nb_multi = CudaNdarray_HOST_DIMS(%(pvals)s)[0];
        int nb_outcomes = CudaNdarray_HOST_DIMS(%(pvals)s)[1];
        //TODO : change this for a beautiful constant
        int max_nb_blocks = 2<<15 - 1;
        int nb_blocks = max_nb_blocks + 1;
        int nb_threads=16; // so it really starts at 32, because of the *2
        do
        {
            nb_threads*=2;
            if (nb_multi %% nb_threads == 0)
                nb_blocks = nb_multi/nb_threads;
            else
                nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
        } while (nb_blocks > max_nb_blocks);
        //printf("\\nN=%%i b=%%i t=%%i t*b=%%i", nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);
        // TODO : next line is a bit hardcoded...
        if (nb_threads > 512)
        {
            PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for so many rows in the matrix (%%i)", nb_multi);
            %(fail)s;
        }
        dim3 n_blocks(nb_blocks,1,1);
        dim3 n_threads(nb_threads,1,1);
        int n_shared = 0;
        assert(nb_blocks*nb_threads >= nb_multi);
        k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
            CudaNdarray_HOST_DIMS(%(z)s)[1],
            CudaNdarray_HOST_DIMS(%(z)s)[0],
            CudaNdarray_DEV_DATA(%(pvals)s),
            CudaNdarray_HOST_STRIDES(%(pvals)s)[0],
            CudaNdarray_HOST_STRIDES(%(pvals)s)[1],
            CudaNdarray_DEV_DATA(%(unis)s),
            CudaNdarray_HOST_STRIDES(%(unis)s)[0],
            CudaNdarray_DEV_DATA(%(z)s),
            CudaNdarray_HOST_STRIDES(%(z)s)[0],
            CudaNdarray_HOST_STRIDES(%(z)s)[1]
        );
        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess != sts)
        {
            PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i; shared: %%i)\\n",
                "k_multi_warp_%(name)s",
                cudaGetErrorString(sts),
                n_blocks.x,
                n_blocks.y,
                n_threads.x,
                n_threads.y,
                n_threads.z,
                n_shared);
            %(fail)s;
        }
        } // END NESTED SCOPE
        """ % locals()
@register_opt()
@local_optimizer([MultinomialFromUniform])
def local_gpu_multinomial(node):
    """Graph optimization: move a MultinomialFromUniform to the GPU.

    Two patterns are handled:

    1. The multinomial node itself, when at least one of its inputs
       already lives on the GPU (comes from a HostFromGpu).
    2. A GpuFromHost whose input is a multinomial, i.e. the caller
       explicitly moves the result to the GPU.

    In both cases the GPU op produces a transposed result, so a ``.T``
    is inserted to restore the layout callers expect.

    Returns None (no replacement) when the node does not match or when
    n_samples != 1 -- the GPU op only supports drawing one sample.
    """
    if type(node.op) is MultinomialFromUniform:
        if len(node.inputs) == 2:
            p, u = node.inputs
            n_samples = 1
        else:
            p, u, n_samples = node.inputs
        try:
            # The GPU op only implements the n_samples == 1 case.
            if get_scalar_constant_value(n_samples) != 1:
                return None
        except NotScalarConstantError:
            return None
        m, = node.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32' and
            any([i.owner and isinstance(i.owner.op,
                                        theano.sandbox.cuda.HostFromGpu)
                 for i in node.inputs])):
            gpu_op = GpuMultinomialFromUniform(node.op.odtype)
            return [host_from_gpu(gpu_op(*[gpu_from_host(i)
                                           for i in [p, u]])).T]
    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost) and
            node.inputs[0].owner and
            type(node.inputs[0].owner.op) is MultinomialFromUniform):
        multi = node.inputs[0].owner
        # BUG FIX: inspect the multinomial apply's inputs, not the
        # GpuFromHost node's (which always has exactly one input, so the
        # old ``len(node.inputs) == 2`` test could never match and the
        # 3-way unpack of a 1-element list raised ValueError).
        if len(multi.inputs) == 2:
            p, u = multi.inputs
            n_samples = 1
        else:
            p, u, n_samples = multi.inputs
        try:
            if get_scalar_constant_value(n_samples) != 1:
                return None
        except NotScalarConstantError:
            return None
        m, = multi.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32'):
            gpu_op = GpuMultinomialFromUniform(multi.op.odtype)
            ret = gpu_op(*[gpu_from_host(i) for i in [p, u]]).T
            # The dimshuffle is on the cpu, but will be moved to the
            # gpu by an opt.
            return [gpu_from_host(ret)]
class MultinomialWOReplacementFromUniform(ChoiceFromUniform): class MultinomialWOReplacementFromUniform(ChoiceFromUniform):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
warnings.warn("MultinomialWOReplacementFromUniform is deprecated, " warnings.warn("MultinomialWOReplacementFromUniform is deprecated, "
......
差异被折叠。
...@@ -10,28 +10,11 @@ import theano ...@@ -10,28 +10,11 @@ import theano
from theano import config, function, tensor from theano import config, function, tensor
from theano.sandbox import multinomial from theano.sandbox import multinomial
from theano.compile.mode import get_default_mode from theano.compile.mode import get_default_mode
import theano.sandbox.cuda as cuda
import theano.tests.unittest_tools as utt import theano.tests.unittest_tools as utt
from theano.compat import PY3 from theano.compat import PY3
from theano.misc.pkl_utils import CompatUnpickler from theano.misc.pkl_utils import CompatUnpickler
def get_mode(gpu):
    """Return the compilation mode for these tests.

    FAST_COMPILE is upgraded to FAST_RUN so the C/GPU code paths are
    exercised; when ``gpu`` is true, the GPU-lifting optimizations are
    enabled on top of the base mode.
    """
    mode = get_default_mode()
    if theano.config.mode == 'FAST_COMPILE':
        mode = theano.compile.get_mode('FAST_RUN')
    if not gpu:
        return mode
    gpu_opts = ('gpu', 'gpu_local_optimizations',
                'local_cut_gpu_host_gpu', 'local_gpu_multinomial')
    return mode.including(*gpu_opts)
def run_with_c(f, gpu=False):
    """Invoke test body *f* with the mode selected by ``get_mode(gpu)``."""
    f(get_mode(gpu), gpu)
def test_n_samples_1(): def test_n_samples_1():
p = tensor.fmatrix() p = tensor.fmatrix()
u = tensor.fvector() u = tensor.fvector()
...@@ -117,69 +100,52 @@ def test_multinomial_0(): ...@@ -117,69 +100,52 @@ def test_multinomial_0():
m = multinomial.MultinomialFromUniform('auto')(p, u) m = multinomial.MultinomialFromUniform('auto')(p, u)
def body(mode, gpu): # the m*2 allows the multinomial to reuse output
# the m*2 allows the multinomial to reuse output f = function([p, u], m * 2, allow_input_downcast=True)
f = function([p, u], m * 2, allow_input_downcast=True, mode=mode)
if gpu: # test that both first and second samples can be drawn
assert any([type(node.op) is multinomial.GpuMultinomialFromUniform utt.assert_allclose(f([[1, 0], [0, 1]], [.1, .1]),
for node in f.maker.fgraph.toposort()]) [[2, 0], [0, 2]])
# test that both first and second samples can be drawn # test that both second labels can be drawn
utt.assert_allclose(f([[1, 0], [0, 1]], [.1, .1]), r = f([[.2, .8], [.3, .7]], [.31, .31])
[[2, 0], [0, 2]]) utt.assert_allclose(r, [[0, 2], [0, 2]])
# test that both second labels can be drawn # test that both first labels can be drawn
r = f([[.2, .8], [.3, .7]], [.31, .31]) r = f([[.2, .8], [.3, .7]], [.21, .21])
utt.assert_allclose(r, [[0, 2], [0, 2]]) utt.assert_allclose(r, [[0, 2], [2, 0]])
# test that both first labels can be drawn # change the size to make sure output gets reallocated ok
r = f([[.2, .8], [.3, .7]], [.21, .21]) # and also make sure that the GPU version doesn't screw up the
utt.assert_allclose(r, [[0, 2], [2, 0]]) # transposed-ness
r = f([[.2, .8]], [.25])
# change the size to make sure output gets reallocated ok utt.assert_allclose(r, [[0, 2]])
# and also make sure that the GPU version doesn't screw up the
# transposed-ness
r = f([[.2, .8]], [.25])
utt.assert_allclose(r, [[0, 2]])
run_with_c(body)
if cuda.cuda_available:
run_with_c(body, True)
# TODO: check a bigger example (make sure blocking on GPU is handled correctly) # TODO: check a bigger example (make sure blocking on GPU is handled correctly)
def test_multinomial_large(): def test_multinomial_large():
# DEBUG_MODE will test this on GPU p = tensor.fmatrix()
def body(mode, gpu): u = tensor.fvector()
p = tensor.fmatrix() m = multinomial.MultinomialFromUniform('auto')(p, u)
u = tensor.fvector() f = function([p, u], m * 2, allow_input_downcast=True, mode=mode)
m = multinomial.MultinomialFromUniform('auto')(p, u)
f = function([p, u], m * 2, allow_input_downcast=True, mode=mode) pval = np.arange(10000 * 4, dtype='float32').reshape((10000, 4)) + 0.1
if gpu: pval = pval / pval.sum(axis=1)[:, None]
assert any([type(node.op) is multinomial.GpuMultinomialFromUniform uval = np.ones_like(pval[:, 0]) * 0.5
for node in f.maker.fgraph.toposort()]) mval = f(pval, uval)
pval = np.arange(10000 * 4, dtype='float32').reshape((10000, 4)) + 0.1 assert mval.shape == pval.shape
pval = pval / pval.sum(axis=1)[:, None] if config.cast_policy == 'custom':
uval = np.ones_like(pval[:, 0]) * 0.5 assert mval.dtype == pval.dtype
mval = f(pval, uval) elif config.cast_policy == 'numpy+floatX':
assert mval.dtype == config.floatX
assert mval.shape == pval.shape elif config.cast_policy == 'numpy':
if config.cast_policy == 'custom': assert mval.dtype == 'float64'
assert mval.dtype == pval.dtype else:
elif config.cast_policy == 'numpy+floatX': raise NotImplementedError(config.cast_policy)
assert mval.dtype == config.floatX utt.assert_allclose(mval.sum(axis=1), 2)
elif config.cast_policy == 'numpy': asdf = np.asarray([0, 0, 2, 0]) + 0 * pval
assert mval.dtype == 'float64' utt.assert_allclose(mval, asdf) # broadcast over all rows
else:
raise NotImplementedError(config.cast_policy)
utt.assert_allclose(mval.sum(axis=1), 2)
asdf = np.asarray([0, 0, 2, 0]) + 0 * pval
utt.assert_allclose(mval, asdf) # broadcast over all rows
run_with_c(body)
if cuda.cuda_available:
run_with_c(body, True)
def test_multinomial_dtypes(): def test_multinomial_dtypes():
...@@ -197,40 +163,3 @@ def test_multinomial_dtypes(): ...@@ -197,40 +163,3 @@ def test_multinomial_dtypes():
u = tensor.fvector() u = tensor.fvector()
m = multinomial.MultinomialFromUniform('float64')(p, u) m = multinomial.MultinomialFromUniform('float64')(p, u)
assert m.dtype == 'float64', m.dtype assert m.dtype == 'float64', m.dtype
def test_gpu_opt():
    """Check that the multinomial is lifted to the GPU when its output
    is explicitly moved there with gpu_from_host."""
    if not cuda.cuda_available:
        # cuda_ndarray is an optional dependency; skip gracefully.
        from nose.plugins.skip import SkipTest
        raise SkipTest('Optional package cuda not available')

    uniform = tensor.fvector()

    def compile_and_run(probs_var, n_rows):
        # Build the graph with the output pushed to the GPU so the
        # optimizer must replace the op with its GPU version.
        draws = multinomial.MultinomialFromUniform('auto')(probs_var,
                                                           uniform)
        assert draws.dtype == 'float32', draws.dtype
        fn = function([probs_var, uniform], cuda.gpu_from_host(draws),
                      allow_input_downcast=True, mode=get_mode(True))
        assert any(type(apply_node.op) is
                   multinomial.GpuMultinomialFromUniform
                   for apply_node in fn.maker.fgraph.toposort())
        probs = np.arange(n_rows * 4,
                          dtype='float32').reshape((n_rows, 4)) + 0.1
        probs = probs / probs.sum(axis=1)[:, None]
        fn(probs, np.ones_like(probs[:, 0]) * 0.5)

    compile_and_run(tensor.fmatrix(), 10000)
    # A single-row (frow) input used to fail in the past; keep covering it.
    compile_and_run(tensor.frow(), 1)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论