提交 c84ef1d8 authored 作者: Olivier Breuleux's avatar Olivier Breuleux

merge

...@@ -381,6 +381,7 @@ class InvalidValueError(DebugModeError): ...@@ -381,6 +381,7 @@ class InvalidValueError(DebugModeError):
client_node = self.client_node client_node = self.client_node
hint = self.hint hint = self.hint
specific_hint = self.specific_hint specific_hint = self.specific_hint
context = debugprint(r, prefix=' ', depth=12, file=StringIO()).getvalue()
return """InvalidValueError return """InvalidValueError
type(variable) = %(type_r)s type(variable) = %(type_r)s
variable = %(r)s variable = %(r)s
...@@ -393,7 +394,8 @@ class InvalidValueError(DebugModeError): ...@@ -393,7 +394,8 @@ class InvalidValueError(DebugModeError):
isfinite = %(v_isfinite)s isfinite = %(v_isfinite)s
client_node = %(client_node)s client_node = %(client_node)s
hint = %(hint)s hint = %(hint)s
specific_hint = %(specific_hint)s specific_hint = %(specific_hint)s
context = ...\n%(context)s
""" % locals() """ % locals()
######################## ########################
......
...@@ -156,7 +156,11 @@ def use(device, force=False, default_to_move_computation_to_gpu = True, ...@@ -156,7 +156,11 @@ def use(device, force=False, default_to_move_computation_to_gpu = True,
raise EnvironmentError("You forced use of device %s, but CUDA initialization failed " raise EnvironmentError("You forced use of device %s, but CUDA initialization failed "
"with error:\n%s" % (device, cuda_initialization_error_message)) "with error:\n%s" % (device, cuda_initialization_error_message))
if not cuda_available: if not cuda_available:
warning('CUDA is installed, but device %s is not available' % device) if cuda_initialization_error_message:
error_addendum = " (error: %s)" % cuda_initialization_error_message
else:
error_addendum = ""
warning('CUDA is installed, but device %s is not available%s' % (device, error_addendum))
return return
if device == 'gpu': if device == 'gpu':
......
...@@ -2427,6 +2427,7 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo ...@@ -2427,6 +2427,7 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo
if (CudaNdarray_is_c_contiguous(self) && CudaNdarray_is_c_contiguous(other) && size == size_source) if (CudaNdarray_is_c_contiguous(self) && CudaNdarray_is_c_contiguous(other) && size == size_source)
{ {
cublasScopy(size, CudaNdarray_DEV_DATA(other), 1, CudaNdarray_DEV_DATA(self), 1); cublasScopy(size, CudaNdarray_DEV_DATA(other), 1, CudaNdarray_DEV_DATA(self), 1);
CNDA_THREAD_SYNC;
if (CUBLAS_STATUS_SUCCESS != cublasGetError()) if (CUBLAS_STATUS_SUCCESS != cublasGetError())
{ {
PyErr_SetString(PyExc_RuntimeError, "Error copying memory"); PyErr_SetString(PyExc_RuntimeError, "Error copying memory");
...@@ -2442,14 +2443,6 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo ...@@ -2442,14 +2443,6 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo
{ {
// THIS CASE SHOULD NEVER HAPPEN BECAUSE SCALARS ARE ALWAYS C CONTIGUOUS // THIS CASE SHOULD NEVER HAPPEN BECAUSE SCALARS ARE ALWAYS C CONTIGUOUS
assert(0); assert(0);
assert (size==1);
cublasScopy(1, CudaNdarray_DEV_DATA(other), 1, CudaNdarray_DEV_DATA(self), 1);
CNDA_THREAD_SYNC;
if (CUBLAS_STATUS_SUCCESS != cublasGetError())
{
PyErr_SetString(PyExc_RuntimeError, "Error copying memory");
return -1;
}
}; break; }; break;
case 1: // vector case 1: // vector
{ {
......
...@@ -132,9 +132,9 @@ class CudaNdarraySharedVariable(SharedVariable, _operators): ...@@ -132,9 +132,9 @@ class CudaNdarraySharedVariable(SharedVariable, _operators):
return other._as_CudaNdarrayVariable() return other._as_CudaNdarrayVariable()
if not isinstance(other.type, tensor.TensorType): if not isinstance(other.type, tensor.TensorType):
raise TypeError('Incompatible type', other.type) raise TypeError('Incompatible type', (self, (self.type, other.type)))
if (other.type.dtype != self.dtype): if (other.type.dtype != self.dtype):
raise TypeError('Incompatible dtype', (self.dtype, other.type.dtype)) raise TypeError('Incompatible dtype', (self, (self.dtype, other.type.dtype)))
if (other.type.broadcastable != self.broadcastable): if (other.type.broadcastable != self.broadcastable):
raise TypeError('Incompatible broadcastable', (self, (self.broadcastable, raise TypeError('Incompatible broadcastable', (self, (self.broadcastable,
other.type.broadcastable))) other.type.broadcastable)))
......
...@@ -10,30 +10,44 @@ if cuda_available: ...@@ -10,30 +10,44 @@ if cuda_available:
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
class Multinomial(Op): class Multinomial(Op):
def __init__(self, odtype):
self.odtype=odtype
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) return type(self) == type(other) and self.odtype==other.odtype
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash((type(self), self.odtype))
def __str__(self): def __str__(self):
return self.__class__.__name__ return '%s{%s}'%(self.__class__.__name__, self.odtype)
def __setstate__(self, dct):
self.__dict__.update(dct)
try:
self.odtype
except:
self.odtype='auto'
def make_node(self, pvals, unis): def make_node(self, pvals, unis):
pvals = T.as_tensor_variable(pvals) pvals = T.as_tensor_variable(pvals)
unis = T.as_tensor_variable(unis) unis = T.as_tensor_variable(unis)
#assert pvals.dtype == 'float32' if pvals.ndim != 2:
#assert unis.dtype == 'float32' raise NotImplementedError('pvals ndim', pvals.ndim)
return Apply(self, [pvals, unis], [pvals.type()]) if unis.ndim != 1:
raise NotImplementedError('unis ndim', unis.ndim)
def grad(self, inp, grads): if self.odtype=='auto':
pvals, unis = inp odtype = pvals.dtype
gz, = grads else:
odtype = self.odtype
return Apply(self, [pvals, unis], [T.matrix(dtype=odtype)])
def grad(self, ins, outs):
pvals, unis = ins
(gz,) = outs
return [None, None] return [None, None]
def c_code_cache_version(self): def c_code_cache_version(self):
return (3,) return (5,)
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, ins, outs, sub):
pvals, unis = inp (pvals, unis) = ins
z, = out (z,) = outs
fail = sub['fail'] fail = sub['fail']
return """ return """
...@@ -48,9 +62,9 @@ class Multinomial(Op): ...@@ -48,9 +62,9 @@ class Multinomial(Op):
%(fail)s; %(fail)s;
} }
if (%(unis)s->dimensions[0] != %(pvals)s->dimensions[1]) if (%(unis)s->dimensions[0] != %(pvals)s->dimensions[0])
{ {
PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]"); PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0]");
%(fail)s; %(fail)s;
} }
...@@ -60,15 +74,10 @@ class Multinomial(Op): ...@@ -60,15 +74,10 @@ class Multinomial(Op):
) )
{ {
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
npy_intp dims[2];
dims[0] = (%(pvals)s->dimensions)[0];
dims[1] = (%(pvals)s->dimensions)[1];
%(z)s = (PyArrayObject*) PyArray_ZEROS(2, %(z)s = (PyArrayObject*) PyArray_ZEROS(2,
dims, %(pvals)s->dimensions,
type_num_%(pvals)s, type_num_%(z)s,
0); 0);
if (!%(z)s) if (!%(z)s)
{ {
PyErr_SetString(PyExc_MemoryError, "failed to alloc z output"); PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
...@@ -78,32 +87,37 @@ class Multinomial(Op): ...@@ -78,32 +87,37 @@ class Multinomial(Op):
{ // NESTED SCOPE { // NESTED SCOPE
const int nb_outcomes = %(pvals)s->dimensions[0]; const int nb_multi = %(pvals)s->dimensions[0];
const int nb_multi = %(pvals)s->dimensions[1]; const int nb_outcomes = %(pvals)s->dimensions[1];
// //
// For each multinomials, loop over each possible outcome // For each multinomials, loop over each possible outcome
// //
for (int n = 0; n < nb_multi; ++n) for (int n = 0; n < nb_multi; ++n)
{ {
int waiting = 1;
dtype_%(pvals)s cummul = 0.; dtype_%(pvals)s cummul = 0.;
const dtype_%(unis)s* unis_n = (dtype_%(unis)s*)PyArray_GETPTR1(%(unis)s, n); const dtype_%(unis)s* unis_n = (dtype_%(unis)s*)PyArray_GETPTR1(%(unis)s, n);
for (int m = 0; m < nb_outcomes; ++m) for (int m = 0; m < nb_outcomes; ++m)
{ {
dtype_%(z)s* z_nm = (dtype_%(z)s*)PyArray_GETPTR2(%(z)s, m,n); dtype_%(z)s* z_nm = (dtype_%(z)s*)PyArray_GETPTR2(%(z)s, n,m);
const dtype_%(pvals)s* pvals_nm = (dtype_%(pvals)s*)PyArray_GETPTR2(%(pvals)s, m,n); const dtype_%(pvals)s* pvals_nm = (dtype_%(pvals)s*)PyArray_GETPTR2(%(pvals)s, n,m);
cummul += *pvals_nm; cummul += *pvals_nm;
if (*unis_n < cummul) if (waiting && (cummul > *unis_n))
{ {
*z_nm = 1.; *z_nm = 1.;
break; waiting = 0;
}
else
{
// if we re-used old z pointer, we have to clear it out.
*z_nm = 0.;
} }
} }
} }
} // END NESTED SCOPE } // END NESTED SCOPE
""" % locals() """ % locals()
multinomial = Multinomial() #multinomial = Multinomial()
class GpuMultinomial(Multinomial): class GpuMultinomial(Multinomial):
...@@ -115,11 +129,16 @@ class GpuMultinomial(Multinomial): ...@@ -115,11 +129,16 @@ class GpuMultinomial(Multinomial):
raise TypeError('pvals must be cudandarray', pvals) raise TypeError('pvals must be cudandarray', pvals)
if not isinstance(unis.type, CudaNdarrayType): if not isinstance(unis.type, CudaNdarrayType):
raise TypeError('unis must be cudandarray', unis) raise TypeError('unis must be cudandarray', unis)
if self.odtype=='auto':
odtype = pvals.dtype
else:
odtype = self.odtype
if odtype != pvals.dtype:
raise NotImplementedError()
return Apply(self, [pvals, unis], [pvals.type()]) return Apply(self, [pvals, unis], [pvals.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
#return () return (6,)
return (super(GpuMultinomial,self).c_code_cache_version(),2)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
return """ return """
...@@ -128,28 +147,32 @@ class GpuMultinomial(Multinomial): ...@@ -128,28 +147,32 @@ class GpuMultinomial(Multinomial):
const int nb_outcomes, const int nb_outcomes,
const int pvals_row_strides, const int pvals_row_strides,
const int pvals_col_strides, const int pvals_col_strides,
const int unis_stride,
float * global_pvals, float * global_pvals,
float * global_unis, float * global_unis,
float * global_outs float * global_outs
) )
{ {
// each thread takes care of one multinomial draw
int n = blockDim.x*blockIdx.x + threadIdx.x; int n = blockDim.x*blockIdx.x + threadIdx.x;
if (n < nb_multi) if (n < nb_multi)
{ {
float cummul = 0.; float cummul = 0.;
bool done = false; bool done = false;
const float unis_n = global_unis[n*unis_stride];
for (int m = 0; m < nb_outcomes; ++m) for (int m = 0; m < nb_outcomes; ++m)
{ {
cummul += global_pvals[n * pvals_col_strides + m * pvals_row_strides];
float current_out = 0.; float current_out = 0.;
if (!done)
if (!done && global_unis[n] < cummul)
{ {
current_out = 1.; cummul += global_pvals[m * pvals_col_strides + n * pvals_row_strides];
done = true; if (unis_n < cummul)
{
current_out = 1.;
done = true;
}
} }
//write out transposed for speed.
global_outs[n + m * nb_multi] = current_out; global_outs[n + m * nb_multi] = current_out;
} }
} }
...@@ -158,12 +181,12 @@ class GpuMultinomial(Multinomial): ...@@ -158,12 +181,12 @@ class GpuMultinomial(Multinomial):
""" % locals() """ % locals()
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, ins, outs, sub):
pvals, unis = inp (pvals, unis) = ins
z, = out (z,) = outs
fail = sub['fail'] fail = sub['fail']
return """ return """
if (%(pvals)s->nd != 2) if (%(pvals)s->nd != 2)
{ {
PyErr_Format(PyExc_TypeError, "pvals wrong rank"); PyErr_Format(PyExc_TypeError, "pvals wrong rank");
...@@ -174,28 +197,21 @@ class GpuMultinomial(Multinomial): ...@@ -174,28 +197,21 @@ class GpuMultinomial(Multinomial):
PyErr_Format(PyExc_TypeError, "unis wrong rank"); PyErr_Format(PyExc_TypeError, "unis wrong rank");
%(fail)s; %(fail)s;
} }
if (CudaNdarray_HOST_DIMS(%(unis)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[0])
if (CudaNdarray_HOST_DIMS(%(unis)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])
{
PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]");
%(fail)s;
}
if (!CudaNdarray_is_c_contiguous(%(unis)s))
{ {
PyErr_Format(PyExc_NotImplementedError, "require unis to be contiguous"); PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0]");
%(fail)s; %(fail)s;
} }
// Would be more efficient if pvals were also contiguous but practically I think it is not often the cas,
// since we are working on pvals.T here
//N.B. that the output is TRANSPOSED compared with pvals
if ((NULL == %(z)s) if ((NULL == %(z)s)
|| (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[0]) || (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])
|| (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])) || (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(pvals)s)[0]))
{ {
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
npy_intp dims[2]; npy_intp dims[2];
dims[0] = (CudaNdarray_HOST_DIMS(%(pvals)s)[0]); dims[0] = (CudaNdarray_HOST_DIMS(%(pvals)s)[1]);
dims[1] = (CudaNdarray_HOST_DIMS(%(pvals)s)[1]); dims[1] = (CudaNdarray_HOST_DIMS(%(pvals)s)[0]);
%(z)s = (CudaNdarray*)CudaNdarray_NewDims(2, dims); %(z)s = (CudaNdarray*)CudaNdarray_NewDims(2, dims);
if (!%(z)s) if (!%(z)s)
{ {
...@@ -205,9 +221,8 @@ class GpuMultinomial(Multinomial): ...@@ -205,9 +221,8 @@ class GpuMultinomial(Multinomial):
} }
{ // NESTED SCOPE { // NESTED SCOPE
int nb_outcomes = CudaNdarray_HOST_DIMS(%(z)s)[0]; int nb_multi = CudaNdarray_HOST_DIMS(%(pvals)s)[0];
int nb_multi = CudaNdarray_HOST_DIMS(%(z)s)[1]; int nb_outcomes = CudaNdarray_HOST_DIMS(%(pvals)s)[1];
//TODO : change this for a beautiful constant //TODO : change this for a beautiful constant
int max_nb_blocks = 2<<15 - 1; int max_nb_blocks = 2<<15 - 1;
int nb_blocks = max_nb_blocks + 1; int nb_blocks = max_nb_blocks + 1;
...@@ -226,20 +241,21 @@ class GpuMultinomial(Multinomial): ...@@ -226,20 +241,21 @@ class GpuMultinomial(Multinomial):
// TODO : next line is a bit hardcoded... // TODO : next line is a bit hardcoded...
if (nb_threads > 512) if (nb_threads > 512)
{ {
PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for as many rows in the matrix (%%i)", nb_multi); PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for so many rows in the matrix (%%i)", nb_multi);
%(fail)s; %(fail)s;
} }
dim3 n_blocks(nb_blocks,1,1); dim3 n_blocks(nb_blocks,1,1);
dim3 n_threads(nb_threads,1,1); dim3 n_threads(nb_threads,1,1);
int n_shared = 0; int n_shared = 0;
assert(nb_blocks*nb_threads >= nb_multi);
k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>( k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_HOST_DIMS(%(z)s)[1], CudaNdarray_HOST_DIMS(%(z)s)[1],
CudaNdarray_HOST_DIMS(%(z)s)[0], CudaNdarray_HOST_DIMS(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(pvals)s)[0], CudaNdarray_HOST_STRIDES(%(pvals)s)[0],
CudaNdarray_HOST_STRIDES(%(pvals)s)[1], CudaNdarray_HOST_STRIDES(%(pvals)s)[1],
CudaNdarray_HOST_STRIDES(%(unis)s)[0],
CudaNdarray_DEV_DATA(%(pvals)s), CudaNdarray_DEV_DATA(%(pvals)s),
CudaNdarray_DEV_DATA(%(unis)s), CudaNdarray_DEV_DATA(%(unis)s),
CudaNdarray_DEV_DATA(%(z)s) CudaNdarray_DEV_DATA(%(z)s)
...@@ -262,12 +278,271 @@ class GpuMultinomial(Multinomial): ...@@ -262,12 +278,271 @@ class GpuMultinomial(Multinomial):
} // END NESTED SCOPE } // END NESTED SCOPE
""" % locals() """ % locals()
gpu_multinomial = GpuMultinomial()
@local_optimizer() @local_optimizer()
def use_gpu_multinomial(node): def use_gpu_multinomial(node):
if node.op == multinomial: if node.op == multinomial:
return [host_from_gpu(gpu_multinomial(*[gpu_from_host(i) for i in node.inputs]))] p, u = node.inputs
m, = node.outputs
if p.dtype == u.dtype == m.dtype == 'float32':
gpu_op = GpuMultinomial(op.odtype)
return [host_from_gpu(gpu_op(*[gpu_from_host(i) for i in node.inputs]))]
if cuda_enabled:#theano.config.device.startswith('gpu'): if cuda_enabled:#theano.config.device.startswith('gpu'):
register_specialize(use_gpu_multinomial) register_specialize(use_gpu_multinomial)
if 0: # I hate you hg rebase, I hate you so very, very much.
class Multinomial(Op):
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
def make_node(self, pvals, unis):
pvals = T.as_tensor_variable(pvals)
unis = T.as_tensor_variable(unis)
#assert pvals.dtype == 'float32'
#assert unis.dtype == 'float32'
return Apply(self, [pvals, unis], [pvals.type()])
def grad(self, inp, grads):
pvals, unis = inp
gz, = grads
return [None, None]
def c_code_cache_version(self):
return (3,)
def c_code(self, node, name, inp, out, sub):
pvals, unis = inp
z, = out
fail = sub['fail']
return """
if (%(pvals)s->nd != 2)
{
PyErr_Format(PyExc_TypeError, "pvals wrong rank");
%(fail)s;
}
if (%(unis)s->nd != 1)
{
PyErr_Format(PyExc_TypeError, "unis wrong rank");
%(fail)s;
}
if (%(unis)s->dimensions[0] != %(pvals)s->dimensions[1])
{
PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]");
%(fail)s;
}
if ((NULL == %(z)s)
|| ((%(z)s->dimensions)[0] != (%(pvals)s->dimensions)[0])
|| ((%(z)s->dimensions)[1] != (%(pvals)s->dimensions)[1])
)
{
Py_XDECREF(%(z)s);
npy_intp dims[2];
dims[0] = (%(pvals)s->dimensions)[0];
dims[1] = (%(pvals)s->dimensions)[1];
%(z)s = (PyArrayObject*) PyArray_ZEROS(2,
dims,
type_num_%(pvals)s,
0);
if (!%(z)s)
{
PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
%(fail)s;
}
}
{ // NESTED SCOPE
const int nb_outcomes = %(pvals)s->dimensions[0];
const int nb_multi = %(pvals)s->dimensions[1];
//
// For each multinomials, loop over each possible outcome
//
for (int n = 0; n < nb_multi; ++n)
{
dtype_%(pvals)s cummul = 0.;
const dtype_%(unis)s* unis_n = (dtype_%(unis)s*)PyArray_GETPTR1(%(unis)s, n);
for (int m = 0; m < nb_outcomes; ++m)
{
dtype_%(z)s* z_nm = (dtype_%(z)s*)PyArray_GETPTR2(%(z)s, m,n);
const dtype_%(pvals)s* pvals_nm = (dtype_%(pvals)s*)PyArray_GETPTR2(%(pvals)s, m,n);
cummul += *pvals_nm;
if (*unis_n < cummul)
{
*z_nm = 1.;
break;
}
}
}
} // END NESTED SCOPE
""" % locals()
multinomial = Multinomial()
class GpuMultinomial(Multinomial):
def make_node(self, pvals, unis):
assert pvals.dtype == 'float32'
assert unis.dtype == 'float32'
if not isinstance(pvals.type, CudaNdarrayType):
raise TypeError('pvals must be cudandarray', pvals)
if not isinstance(unis.type, CudaNdarrayType):
raise TypeError('unis must be cudandarray', unis)
return Apply(self, [pvals, unis], [pvals.type()])
def c_code_cache_version(self):
#return ()
return (super(GpuMultinomial,self).c_code_cache_version(),2)
def c_support_code_apply(self, node, nodename):
return """
static __global__ void k_multi_warp_%(nodename)s(
const int nb_multi,
const int nb_outcomes,
const int pvals_row_strides,
const int pvals_col_strides,
float * global_pvals,
float * global_unis,
float * global_outs
)
{
int n = blockDim.x*blockIdx.x + threadIdx.x;
if (n < nb_multi)
{
float cummul = 0.;
bool done = false;
for (int m = 0; m < nb_outcomes; ++m)
{
cummul += global_pvals[n * pvals_col_strides + m * pvals_row_strides];
float current_out = 0.;
if (!done && global_unis[n] < cummul)
{
current_out = 1.;
done = true;
}
global_outs[n + m * nb_multi] = current_out;
}
}
}
""" % locals()
def c_code(self, node, name, inp, out, sub):
pvals, unis = inp
z, = out
fail = sub['fail']
return """
if (%(pvals)s->nd != 2)
{
PyErr_Format(PyExc_TypeError, "pvals wrong rank");
%(fail)s;
}
if (%(unis)s->nd != 1)
{
PyErr_Format(PyExc_TypeError, "unis wrong rank");
%(fail)s;
}
if (CudaNdarray_HOST_DIMS(%(unis)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])
{
PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]");
%(fail)s;
}
if (!CudaNdarray_is_c_contiguous(%(unis)s))
{
PyErr_Format(PyExc_NotImplementedError, "require unis to be contiguous");
%(fail)s;
}
// Would be more efficient if pvals were also contiguous but practically I think it is not often the cas,
// since we are working on pvals.T here
if ((NULL == %(z)s)
|| (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[0])
|| (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(pvals)s)[1]))
{
Py_XDECREF(%(z)s);
npy_intp dims[2];
dims[0] = (CudaNdarray_HOST_DIMS(%(pvals)s)[0]);
dims[1] = (CudaNdarray_HOST_DIMS(%(pvals)s)[1]);
%(z)s = (CudaNdarray*)CudaNdarray_NewDims(2, dims);
if (!%(z)s)
{
PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
%(fail)s;
}
}
{ // NESTED SCOPE
int nb_outcomes = CudaNdarray_HOST_DIMS(%(z)s)[0];
int nb_multi = CudaNdarray_HOST_DIMS(%(z)s)[1];
//TODO : change this for a beautiful constant
int max_nb_blocks = 2<<15 - 1;
int nb_blocks = max_nb_blocks + 1;
int nb_threads=16; // so it really starts at 32, because of the *2
do
{
nb_threads*=2;
if (nb_multi %% nb_threads == 0)
nb_blocks = nb_multi/nb_threads;
else
nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
} while (nb_blocks > max_nb_blocks);
//printf("\\nN=%%i b=%%i t=%%i t*b=%%i", nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);
// TODO : next line is a bit hardcoded...
if (nb_threads > 512)
{
PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for as many rows in the matrix (%%i)", nb_multi);
%(fail)s;
}
dim3 n_blocks(nb_blocks,1,1);
dim3 n_threads(nb_threads,1,1);
int n_shared = 0;
k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_HOST_DIMS(%(z)s)[1],
CudaNdarray_HOST_DIMS(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(pvals)s)[0],
CudaNdarray_HOST_STRIDES(%(pvals)s)[1],
CudaNdarray_DEV_DATA(%(pvals)s),
CudaNdarray_DEV_DATA(%(unis)s),
CudaNdarray_DEV_DATA(%(z)s)
);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i; shared: %%i)\\n",
"k_multi_warp_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z,
n_shared);
%(fail)s;
}
} // END NESTED SCOPE
""" % locals()
gpu_multinomial = GpuMultinomial()
...@@ -9,13 +9,14 @@ import sys ...@@ -9,13 +9,14 @@ import sys
import numpy import numpy
from theano import Op, Apply, shared, config, Variable from theano import Op, Apply, shared, config, Variable
from theano.tensor import raw_random, TensorType, as_tensor_variable, get_vector_length, cast, opt from theano.tensor import (raw_random, TensorType, as_tensor_variable,
get_vector_length, cast, opt)
from theano.tensor import zeros_like, sqrt, log, sin, cos, join, prod from theano.tensor import zeros_like, sqrt, log, sin, cos, join, prod
from theano.compile import optdb from theano.compile import optdb
from theano.gof import local_optimizer from theano.gof import local_optimizer
from theano.gof.python25 import all from theano.gof.python25 import all
from multinomial import multinomial import multinomial
from theano.sandbox.cuda import cuda_available, cuda_enabled from theano.sandbox.cuda import cuda_available, cuda_enabled
if cuda_available: if cuda_available:
...@@ -83,10 +84,12 @@ def mrg_next_value(rstate, new_rstate): ...@@ -83,10 +84,12 @@ def mrg_next_value(rstate, new_rstate):
x11, x12, x13, x21, x22, x23 = rstate x11, x12, x13, x21, x22, x23 = rstate
assert type(x11) == numpy.int32 assert type(x11) == numpy.int32
i0, i7, i9, i15, i16, i22, i24 = [numpy.int32(i) for i in (0,7, 9, 15, 16, 22, 24)] i0, i7, i9, i15, i16, i22, i24 = [numpy.int32(i)
for i in (0,7, 9, 15, 16, 22, 24)]
#first component #first component
y1 = ((x12 & MASK12) << i22) + (x12 >> i9) + ((x13 & MASK13) << i7) + (x13 >> i24); y1 = (((x12 & MASK12) << i22) + (x12 >> i9)
+ ((x13 & MASK13) << i7) + (x13 >> i24))
assert type(y1) == numpy.int32 assert type(y1) == numpy.int32
if (y1 < 0 or y1 >= M1): #must also check overflow if (y1 < 0 or y1 >= M1): #must also check overflow
...@@ -741,10 +744,15 @@ class MRG_RandomStreams(object): ...@@ -741,10 +744,15 @@ class MRG_RandomStreams(object):
raise TypeError("You have to specify pvals") raise TypeError("You have to specify pvals")
pvals = as_tensor_variable(pvals) pvals = as_tensor_variable(pvals)
if n == 1 and pvals.ndim == 2: if n == 1 and pvals.ndim == 2:
unis = self.uniform(size=pvals.shape[0:1], ndim=1) ndim, size, bcast = raw_random._infer_ndim_bcast(
return cast(multinomial(pvals.T, unis).T, dtype) ndim, size, n, pvals[:,0])
bcast = bcast+(pvals.type.broadcastable[-1],)
unis = self.uniform(size=size, ndim=1)
op = multinomial.Multinomial(dtype)
return op(pvals, unis)
else: else:
raise NotImplementedError("MRG_RandomStreams.multinomial only implemented with n == 1 and pvals.ndim = 2") raise NotImplementedError(("MRG_RandomStreams.multinomial only"
" implemented with n == 1 and pvals.ndim = 2"))
def normal(self, size=None, avg=0.0, std=1.0, ndim=None, dtype=config.floatX): def normal(self, size=None, avg=0.0, std=1.0, ndim=None, dtype=config.floatX):
""" """
......
import numpy
from theano import tensor, shared, function
import multinomial
def test_multimomial_0():
# This tests the multinomial Op directly, not going through the
# multinomial() call in GPU random generation.
p = tensor.matrix()
u = tensor.vector()
m = multinomial.Multinomial('auto')(p,u)
#the m*2 allows the multinomial to reuse output
f = function([p,u], m*2, allow_input_downcast=True)
# test that both first and second samples can be drawn
assert numpy.allclose(f([[1,0], [0,1]], [.1, .1]),
[[2,0], [0,2]])
# test that both second labels can be drawn
r = f([[.2,.8], [.3,.7]], [.31, .31])
assert numpy.allclose(r, [[0,2], [0,2]]), r
# test that both first labels can be drawn
r = f([[.2,.8], [.3,.7]], [.21, .21])
assert numpy.allclose(r, [[0,2], [2,0]]), r
#change the size to make sure output gets reallocated ok
# and also make sure that the GPU version doesn't screw up the
# transposed-ness
r = f([[.2,.8] ], [.25])
assert numpy.allclose(r, [[0,2]]), r
#TODO: check a bigger example (make sure blocking on GPU is handled correctly)
def test_multinomial_large():
# DEBUG_MODE will test this on GPU
p = tensor.fmatrix()
u = tensor.fvector()
m = multinomial.Multinomial('auto')(p,u)
f = function([p,u], m*2, allow_input_downcast=True)
pval = numpy.arange(10000 * 4, dtype='float32').reshape((10000, 4))+0.1
pval = pval / pval.sum(axis=1)[:,None]
uval = numpy.ones_like(pval[:,0]) * 0.5
mval = f(pval,uval)
assert mval.shape == pval.shape
assert mval.dtype == pval.dtype
assert numpy.allclose(mval.sum(axis=1), 2)
asdf = numpy.asarray([0, 0, 2, 0])+0*pval
assert numpy.allclose(mval, asdf) #broadcast over all rows
def test_multinomial_dtypes():
p = tensor.dmatrix()
u = tensor.dvector()
m = multinomial.Multinomial('auto')(p,u)
assert m.dtype == 'float64', m.dtype
p = tensor.fmatrix()
u = tensor.fvector()
m = multinomial.Multinomial('auto')(p,u)
assert m.dtype == 'float32', m.dtype
p = tensor.fmatrix()
u = tensor.fvector()
m = multinomial.Multinomial('float64')(p,u)
assert m.dtype == 'float64', m.dtype
...@@ -236,6 +236,10 @@ class RandomFunction(gof.Op): ...@@ -236,6 +236,10 @@ class RandomFunction(gof.Op):
def _infer_ndim(ndim, shape, *args): def _infer_ndim(ndim, shape, *args):
ndim, ivec, bcast = _infer_ndim_bcast(ndim, shape, *args)
return ndim, ivec
def _infer_ndim_bcast(ndim, shape, *args):
""" """
Infer the number of dimensions from the shape or the other arguments. Infer the number of dimensions from the shape or the other arguments.
...@@ -255,7 +259,11 @@ def _infer_ndim(ndim, shape, *args): ...@@ -255,7 +259,11 @@ def _infer_ndim(ndim, shape, *args):
else: else:
args_ndim = 0 args_ndim = 0
if isinstance(shape, (tuple, list)): # there is a convention that -1 means the corresponding shape of a
# potentially-broadcasted symbolic arg
if (isinstance(shape, (tuple, list))
and numpy.all(numpy.asarray(shape)>=0)):
bcast = [(s==1) for s in shape]
v_shape = tensor.TensorConstant(type=tensor.lvector, data=theano._asarray(shape, dtype='int64')) v_shape = tensor.TensorConstant(type=tensor.lvector, data=theano._asarray(shape, dtype='int64'))
shape_ndim = len(shape) shape_ndim = len(shape)
if ndim is None: if ndim is None:
...@@ -265,6 +273,53 @@ def _infer_ndim(ndim, shape, *args): ...@@ -265,6 +273,53 @@ def _infer_ndim(ndim, shape, *args):
raise ValueError('ndim should be equal to len(shape), but\n', raise ValueError('ndim should be equal to len(shape), but\n',
'ndim = %s, len(shape) = %s, shape = %s' 'ndim = %s, len(shape) = %s, shape = %s'
% (ndim, shape_ndim, shape)) % (ndim, shape_ndim, shape))
elif isinstance(shape, (tuple, list)):
# there is a convention that -1 means the corresponding shape of a
# potentially-broadcasted symbolic arg
#
# This case combines together symbolic and non-symbolic shape
# information
if ndim is None:
ndim=args_ndim
else:
ndim = max(args_ndim, ndim)
ndim = max(args_ndim, len(shape))
shape = [-1]*(ndim - len(shape))+list(shape)
bcast = []
pre_v_shape = []
for i,s in enumerate(shape):
if hasattr(s, 'type'): # s is symbolic
bcast.append(False) # todo - introspect further
pre_v_shape.append(s)
else:
if s >= 0:
pre_v_shape.append(tensor.as_tensor_variable(s))
bcast.append((s==1))
elif s == -1:
n_a_i = 0
for a in args:
# ndim: _ _ _ _ _ _
# ashp: s0 s1 s2 s3
# i
if i >= ndim - a.ndim:
n_a_i += 1
a_i = i + a.ndim -ndim
if not a.broadcastable[a_i]:
pre_v_shape.append(a.shape[a_i])
bcast.append(False)
break
else:
if n_a_i == 0:
raise ValueError(('Auto-shape of -1 must overlap'
'with the shape of one of the broadcastable'
'inputs'))
else:
pre_v_shape.append(tensor.as_tensor_variable(1))
bcast.append(True)
else:
ValueError('negative shape', s)
# post-condition: shape may still contain both symbolic and non-symbolic things
v_shape = tensor.stack(*pre_v_shape)
elif shape is None: elif shape is None:
# The number of drawn samples will be determined automatically, # The number of drawn samples will be determined automatically,
...@@ -272,20 +327,23 @@ def _infer_ndim(ndim, shape, *args): ...@@ -272,20 +327,23 @@ def _infer_ndim(ndim, shape, *args):
v_shape = tensor.constant([], dtype='int64') v_shape = tensor.constant([], dtype='int64')
if ndim is None: if ndim is None:
ndim = args_ndim ndim = args_ndim
bcast = [False]*ndim #TODO: retrieve broadcasting patterns of arguments
else: else:
v_shape = tensor.as_tensor_variable(shape) v_shape = tensor.as_tensor_variable(shape)
if ndim is None: if ndim is None:
ndim = tensor.get_vector_length(v_shape) ndim = tensor.get_vector_length(v_shape)
bcast = [False]*ndim
if not (v_shape.dtype.startswith('int') or v_shape.dtype.startswith('uint')): if not (v_shape.dtype.startswith('int') or v_shape.dtype.startswith('uint')):
raise TypeError('shape must be an integer vector or list') raise TypeError('shape must be an integer vector or list', v_shape.dtype)
if args_ndim > ndim: if args_ndim > ndim:
raise ValueError('ndim should be at least as big as required by args value', raise ValueError('ndim should be at least as big as required by args value',
(ndim, args_ndim), args) (ndim, args_ndim), args)
return ndim, v_shape assert ndim == len(bcast)
return ndim, tensor.cast(v_shape, 'int32'), tuple(bcast)
def _generate_broadcasting_indices(out_shape, *shapes): def _generate_broadcasting_indices(out_shape, *shapes):
''' '''
...@@ -549,29 +607,89 @@ def multinomial_helper(random_state, n, pvals, size): ...@@ -549,29 +607,89 @@ def multinomial_helper(random_state, n, pvals, size):
out = numpy.ndarray(out_size) out = numpy.ndarray(out_size)
broadcast_ind = _generate_broadcasting_indices(size, n.shape, pvals.shape[:-1]) broadcast_ind = _generate_broadcasting_indices(size, n.shape, pvals.shape[:-1])
# Iterate over these indices, drawing from one multinomial at a time from numpy # Iterate over these indices, drawing from one multinomial at a time from numpy
assert pvals.min() >= 0
for mi, ni, pi in zip(*broadcast_ind): for mi, ni, pi in zip(*broadcast_ind):
out[mi] = random_state.multinomial(n=n[ni], pvals=pvals[pi]) pvi = pvals[pi]
# This might someday be fixed upstream
# Currently numpy raises an exception in this method if the sum
# of probabilities meets or exceeds 1.0.
# In perfect arithmetic this would be correct, but in float32 or
# float64 it is too strict.
pisum = numpy.sum(pvi)
if 1.0 < pisum < 1.0+1e-5:#correct if we went a little over
# because mtrand.pyx has a ValueError that will trigger if
# sum(pvals[:-1]) > 1.0
pvi = pvi * (1.0 - 5e-5)
#pvi = pvi * .9
pisum = numpy.sum(pvi)
elif pvi[-1]<5e-5: #will this even work?
pvi = pvi * (1.0 - 5e-5)
pisum = numpy.sum(pvi)
assert pisum<=1.0, pisum
out[mi] = random_state.multinomial(n=n[ni],
pvals=pvi.astype('float64'))
return out return out
def multinomial(random_state, size=None, n=1, pvals=[0.5, 0.5],
                ndim=None, dtype='int64'):
    """
    Sample from one or more multinomial distributions defined by
    one-dimensional slices in pvals.

    :param pvals: a tensor of shape "nmulti+(L,)" describing each multinomial
        distribution.  This tensor must have the property that
        numpy.allclose(pvals.sum(axis=-1), 1) is true.

    :param size: a vector of shape information for the output; this can also
        specify the "nmulti" part of pvals' shape.  A -1 in the k'th position
        from the right means to borrow the k'th position from the
        right in nmulti.  (See examples below.)
        Default ``None`` means size=nmulti.

    :param n: the number of experiments to simulate for each multinomial.
        This can be a scalar, or tensor, it will be broadcasted to have
        shape "nmulti".

    :param dtype: the dtype of the return value (which will represent counts)

    :returns: tensor of len(size)+1 dimensions, and shape[-1]==L, with the
        specified ``dtype``, with the experiment counts.  See examples to
        understand the shape of the return value, which is derived from both
        size and pvals.shape.  In return value rval,
        "numpy.allclose(rval.sum(axis=-1), n)" will be true.

    For example, to simulate n experiments from each multinomial in a batch
    of size B:

        size=None, pvals.shape=(B,L) --> rval.shape=[B,L]

        rval[i,j] is the count of possibility j in the i'th distribution
        (row) in pvals.

    Using size:

        size=(1,-1), pvals.shape=(A,B,L)
        --> rval.shape=[1,B,L], and requires that A==1.

        rval[k,i,j] is the count of possibility j in the distribution
        specified by pvals[k,i].

    Using size for broadcasting of pvals:

        size=(10,1,-1), pvals.shape=(A,B,L)
        --> rval.shape=[10,1,B,L], and requires that A==1.

        rval[l,k,i,j] is the count of possibility j in the distribution
        specified by pvals[k,i], in the l'th of 10 draws.
    """
    n = tensor.as_tensor_variable(n)
    pvals = tensor.as_tensor_variable(pvals)
    # until ellipsis is implemented (argh): pvals.T[0].T is a variable
    # with pvals' leading "nmulti" shape (the last dimension dropped),
    # used only for shape/broadcast inference below.
    tmp = pvals.T[0].T
    ndim, size, bcast = _infer_ndim_bcast(ndim, size, n, tmp)
    # The output carries one extra trailing dimension of length L; its
    # broadcastable flag is inherited from pvals' last dimension.
    bcast = bcast + (pvals.type.broadcastable[-1],)
    op = RandomFunction(multinomial_helper,
                        tensor.TensorType(dtype=dtype, broadcastable=bcast),
                        ndim_added=1)
    return op(random_state, size, n, pvals)
......
...@@ -732,6 +732,52 @@ class T_random_function(unittest.TestCase): ...@@ -732,6 +732,52 @@ class T_random_function(unittest.TestCase):
assert numpy.all(val2 == numpy_val2) assert numpy.all(val2 == numpy_val2)
self.assertRaises(ValueError, g, rng2, n_val[:-1], pvals_val[:-1]) self.assertRaises(ValueError, g, rng2, n_val[:-1], pvals_val[:-1])
def test_multinomial_tensor3_a(self):
    # Test the example given in the multinomial documentation regarding
    # tensor3 pvals: size=(1, -1) with pvals.shape == (1, 3, 2) must
    # produce draws of shape (1, 3, 2) whose innermost slices sum to n.
    rng_R = random_state_type()
    n = 9
    pvals = tensor.dtensor3()
    post_r, out = multinomial(rng_R, n=n, pvals=pvals, size=(1, -1))
    assert out.ndim == 3
    assert out.broadcastable == (True, False, False)

    f = compile.function([rng_R, pvals], [post_r, out],
                         accept_inplace=True)
    rng = numpy.random.RandomState(utt.fetch_seed())
    pvals_val = numpy.asarray([[[.1, .9], [.2, .8], [.3, .7]]])
    assert pvals_val.shape == (1, 3, 2)

    new_rng, draw = f(rng, pvals_val)
    assert draw.shape == (1, 3, 2)
    # Each multinomial draw distributes exactly n counts over the last axis.
    assert numpy.allclose(draw.sum(axis=2), 9)
def test_multinomial_tensor3_b(self):
    # Test the example given in the multinomial documentation regarding
    # tensor3 pvals with broadcasting: size=(10, 1, -1) with
    # pvals.shape == (1, 3, 2) must produce draws of shape (10, 1, 3, 2)
    # whose innermost slices sum to n.
    rng_R = random_state_type()
    n = 9
    pvals = tensor.dtensor3()
    post_r, out = multinomial(rng_R, n=n, pvals=pvals, size=(10, 1, -1))
    assert out.ndim == 4
    assert out.broadcastable == (False, True, False, False)

    f = compile.function([rng_R, pvals], [post_r, out],
                         accept_inplace=True)
    rng = numpy.random.RandomState(utt.fetch_seed())
    pvals_val = numpy.asarray([[[.1, .9], [.2, .8], [.3, .7]]])
    assert pvals_val.shape == (1, 3, 2)

    out_rng, draw = f(rng, pvals_val)
    assert draw.shape == (10, 1, 3, 2)
    # Each multinomial draw distributes exactly n counts over the last axis.
    assert numpy.allclose(draw.sum(axis=3), 9)
def test_dtype(self): def test_dtype(self):
rng_R = random_state_type() rng_R = random_state_type()
low = tensor.lscalar() low = tensor.lscalar()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论