Commit bb724b38 — authored by James Bergstra

Commit message: "I HATE U hg rebase Y U GO DELETE EVERYTHING"

Parent commit: c3c73138
...@@ -10,30 +10,44 @@ if cuda_available: ...@@ -10,30 +10,44 @@ if cuda_available:
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
class Multinomial(Op): class Multinomial(Op):
def __init__(self, odtype):
self.odtype=odtype
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) return type(self) == type(other) and self.odtype==other.odtype
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash((type(self), self.odtype))
def __str__(self): def __str__(self):
return self.__class__.__name__ return '%s{%s}'%(self.__class__.__name__, self.odtype)
def __setstate__(self, dct):
self.__dict__.update(dct)
try:
self.odtype
except:
self.odtype='auto'
def make_node(self, pvals, unis): def make_node(self, pvals, unis):
pvals = T.as_tensor_variable(pvals) pvals = T.as_tensor_variable(pvals)
unis = T.as_tensor_variable(unis) unis = T.as_tensor_variable(unis)
#assert pvals.dtype == 'float32' if pvals.ndim != 2:
#assert unis.dtype == 'float32' raise NotImplementedError('pvals ndim', pvals.ndim)
return Apply(self, [pvals, unis], [pvals.type()]) if unis.ndim != 1:
raise NotImplementedError('unis ndim', unis.ndim)
def grad(self, inp, grads): if self.odtype=='auto':
pvals, unis = inp odtype = pvals.dtype
gz, = grads else:
odtype = self.odtype
return Apply(self, [pvals, unis], [T.matrix(dtype=odtype)])
def grad(self, ins, outs):
pvals, unis = ins
(gz,) = outs
return [None, None] return [None, None]
def c_code_cache_version(self): def c_code_cache_version(self):
return (3,) return (5,)
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, ins, outs, sub):
pvals, unis = inp (pvals, unis) = ins
z, = out (z,) = outs
fail = sub['fail'] fail = sub['fail']
return """ return """
...@@ -48,9 +62,9 @@ class Multinomial(Op): ...@@ -48,9 +62,9 @@ class Multinomial(Op):
%(fail)s; %(fail)s;
} }
if (%(unis)s->dimensions[0] != %(pvals)s->dimensions[1]) if (%(unis)s->dimensions[0] != %(pvals)s->dimensions[0])
{ {
PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]"); PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0]");
%(fail)s; %(fail)s;
} }
...@@ -60,15 +74,10 @@ class Multinomial(Op): ...@@ -60,15 +74,10 @@ class Multinomial(Op):
) )
{ {
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
npy_intp dims[2];
dims[0] = (%(pvals)s->dimensions)[0];
dims[1] = (%(pvals)s->dimensions)[1];
%(z)s = (PyArrayObject*) PyArray_ZEROS(2, %(z)s = (PyArrayObject*) PyArray_ZEROS(2,
dims, %(pvals)s->dimensions,
type_num_%(pvals)s, type_num_%(z)s,
0); 0);
if (!%(z)s) if (!%(z)s)
{ {
PyErr_SetString(PyExc_MemoryError, "failed to alloc z output"); PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
...@@ -78,32 +87,37 @@ class Multinomial(Op): ...@@ -78,32 +87,37 @@ class Multinomial(Op):
{ // NESTED SCOPE { // NESTED SCOPE
const int nb_outcomes = %(pvals)s->dimensions[0]; const int nb_multi = %(pvals)s->dimensions[0];
const int nb_multi = %(pvals)s->dimensions[1]; const int nb_outcomes = %(pvals)s->dimensions[1];
// //
// For each multinomials, loop over each possible outcome // For each multinomials, loop over each possible outcome
// //
for (int n = 0; n < nb_multi; ++n) for (int n = 0; n < nb_multi; ++n)
{ {
int waiting = 1;
dtype_%(pvals)s cummul = 0.; dtype_%(pvals)s cummul = 0.;
const dtype_%(unis)s* unis_n = (dtype_%(unis)s*)PyArray_GETPTR1(%(unis)s, n); const dtype_%(unis)s* unis_n = (dtype_%(unis)s*)PyArray_GETPTR1(%(unis)s, n);
for (int m = 0; m < nb_outcomes; ++m) for (int m = 0; m < nb_outcomes; ++m)
{ {
dtype_%(z)s* z_nm = (dtype_%(z)s*)PyArray_GETPTR2(%(z)s, m,n); dtype_%(z)s* z_nm = (dtype_%(z)s*)PyArray_GETPTR2(%(z)s, n,m);
const dtype_%(pvals)s* pvals_nm = (dtype_%(pvals)s*)PyArray_GETPTR2(%(pvals)s, m,n); const dtype_%(pvals)s* pvals_nm = (dtype_%(pvals)s*)PyArray_GETPTR2(%(pvals)s, n,m);
cummul += *pvals_nm; cummul += *pvals_nm;
if (*unis_n < cummul) if (waiting && (cummul > *unis_n))
{ {
*z_nm = 1.; *z_nm = 1.;
break; waiting = 0;
}
else
{
// if we re-used old z pointer, we have to clear it out.
*z_nm = 0.;
} }
} }
} }
} // END NESTED SCOPE } // END NESTED SCOPE
""" % locals() """ % locals()
multinomial = Multinomial() #multinomial = Multinomial()
class GpuMultinomial(Multinomial): class GpuMultinomial(Multinomial):
...@@ -115,11 +129,16 @@ class GpuMultinomial(Multinomial): ...@@ -115,11 +129,16 @@ class GpuMultinomial(Multinomial):
raise TypeError('pvals must be cudandarray', pvals) raise TypeError('pvals must be cudandarray', pvals)
if not isinstance(unis.type, CudaNdarrayType): if not isinstance(unis.type, CudaNdarrayType):
raise TypeError('unis must be cudandarray', unis) raise TypeError('unis must be cudandarray', unis)
if self.odtype=='auto':
odtype = pvals.dtype
else:
odtype = self.odtype
if odtype != pvals.dtype:
raise NotImplementedError()
return Apply(self, [pvals, unis], [pvals.type()]) return Apply(self, [pvals, unis], [pvals.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
#return () return (6,)
return (super(GpuMultinomial,self).c_code_cache_version(),2)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
return """ return """
...@@ -128,28 +147,32 @@ class GpuMultinomial(Multinomial): ...@@ -128,28 +147,32 @@ class GpuMultinomial(Multinomial):
const int nb_outcomes, const int nb_outcomes,
const int pvals_row_strides, const int pvals_row_strides,
const int pvals_col_strides, const int pvals_col_strides,
const int unis_stride,
float * global_pvals, float * global_pvals,
float * global_unis, float * global_unis,
float * global_outs float * global_outs
) )
{ {
// each thread takes care of one multinomial draw
int n = blockDim.x*blockIdx.x + threadIdx.x; int n = blockDim.x*blockIdx.x + threadIdx.x;
if (n < nb_multi) if (n < nb_multi)
{ {
float cummul = 0.; float cummul = 0.;
bool done = false; bool done = false;
const float unis_n = global_unis[n*unis_stride];
for (int m = 0; m < nb_outcomes; ++m) for (int m = 0; m < nb_outcomes; ++m)
{ {
cummul += global_pvals[n * pvals_col_strides + m * pvals_row_strides];
float current_out = 0.; float current_out = 0.;
if (!done)
if (!done && global_unis[n] < cummul)
{ {
current_out = 1.; cummul += global_pvals[m * pvals_col_strides + n * pvals_row_strides];
done = true; if (unis_n < cummul)
{
current_out = 1.;
done = true;
}
} }
//write out transposed for speed.
global_outs[n + m * nb_multi] = current_out; global_outs[n + m * nb_multi] = current_out;
} }
} }
...@@ -158,12 +181,12 @@ class GpuMultinomial(Multinomial): ...@@ -158,12 +181,12 @@ class GpuMultinomial(Multinomial):
""" % locals() """ % locals()
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, ins, outs, sub):
pvals, unis = inp (pvals, unis) = ins
z, = out (z,) = outs
fail = sub['fail'] fail = sub['fail']
return """ return """
if (%(pvals)s->nd != 2) if (%(pvals)s->nd != 2)
{ {
PyErr_Format(PyExc_TypeError, "pvals wrong rank"); PyErr_Format(PyExc_TypeError, "pvals wrong rank");
...@@ -174,28 +197,21 @@ class GpuMultinomial(Multinomial): ...@@ -174,28 +197,21 @@ class GpuMultinomial(Multinomial):
PyErr_Format(PyExc_TypeError, "unis wrong rank"); PyErr_Format(PyExc_TypeError, "unis wrong rank");
%(fail)s; %(fail)s;
} }
if (CudaNdarray_HOST_DIMS(%(unis)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[0])
if (CudaNdarray_HOST_DIMS(%(unis)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])
{
PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]");
%(fail)s;
}
if (!CudaNdarray_is_c_contiguous(%(unis)s))
{ {
PyErr_Format(PyExc_NotImplementedError, "require unis to be contiguous"); PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0]");
%(fail)s; %(fail)s;
} }
// Would be more efficient if pvals were also contiguous but practically I think it is not often the cas,
// since we are working on pvals.T here
//N.B. that the output is TRANSPOSED compared with pvals
if ((NULL == %(z)s) if ((NULL == %(z)s)
|| (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[0]) || (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])
|| (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])) || (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(pvals)s)[0]))
{ {
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
npy_intp dims[2]; npy_intp dims[2];
dims[0] = (CudaNdarray_HOST_DIMS(%(pvals)s)[0]); dims[0] = (CudaNdarray_HOST_DIMS(%(pvals)s)[1]);
dims[1] = (CudaNdarray_HOST_DIMS(%(pvals)s)[1]); dims[1] = (CudaNdarray_HOST_DIMS(%(pvals)s)[0]);
%(z)s = (CudaNdarray*)CudaNdarray_NewDims(2, dims); %(z)s = (CudaNdarray*)CudaNdarray_NewDims(2, dims);
if (!%(z)s) if (!%(z)s)
{ {
...@@ -205,9 +221,8 @@ class GpuMultinomial(Multinomial): ...@@ -205,9 +221,8 @@ class GpuMultinomial(Multinomial):
} }
{ // NESTED SCOPE { // NESTED SCOPE
int nb_outcomes = CudaNdarray_HOST_DIMS(%(z)s)[0]; int nb_multi = CudaNdarray_HOST_DIMS(%(pvals)s)[0];
int nb_multi = CudaNdarray_HOST_DIMS(%(z)s)[1]; int nb_outcomes = CudaNdarray_HOST_DIMS(%(pvals)s)[1];
//TODO : change this for a beautiful constant //TODO : change this for a beautiful constant
int max_nb_blocks = 2<<15 - 1; int max_nb_blocks = 2<<15 - 1;
int nb_blocks = max_nb_blocks + 1; int nb_blocks = max_nb_blocks + 1;
...@@ -226,20 +241,21 @@ class GpuMultinomial(Multinomial): ...@@ -226,20 +241,21 @@ class GpuMultinomial(Multinomial):
// TODO : next line is a bit hardcoded... // TODO : next line is a bit hardcoded...
if (nb_threads > 512) if (nb_threads > 512)
{ {
PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for as many rows in the matrix (%%i)", nb_multi); PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for so many rows in the matrix (%%i)", nb_multi);
%(fail)s; %(fail)s;
} }
dim3 n_blocks(nb_blocks,1,1); dim3 n_blocks(nb_blocks,1,1);
dim3 n_threads(nb_threads,1,1); dim3 n_threads(nb_threads,1,1);
int n_shared = 0; int n_shared = 0;
assert(nb_blocks*nb_threads >= nb_multi);
k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>( k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_HOST_DIMS(%(z)s)[1], CudaNdarray_HOST_DIMS(%(z)s)[1],
CudaNdarray_HOST_DIMS(%(z)s)[0], CudaNdarray_HOST_DIMS(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(pvals)s)[0], CudaNdarray_HOST_STRIDES(%(pvals)s)[0],
CudaNdarray_HOST_STRIDES(%(pvals)s)[1], CudaNdarray_HOST_STRIDES(%(pvals)s)[1],
CudaNdarray_HOST_STRIDES(%(unis)s)[0],
CudaNdarray_DEV_DATA(%(pvals)s), CudaNdarray_DEV_DATA(%(pvals)s),
CudaNdarray_DEV_DATA(%(unis)s), CudaNdarray_DEV_DATA(%(unis)s),
CudaNdarray_DEV_DATA(%(z)s) CudaNdarray_DEV_DATA(%(z)s)
...@@ -262,12 +278,271 @@ class GpuMultinomial(Multinomial): ...@@ -262,12 +278,271 @@ class GpuMultinomial(Multinomial):
} // END NESTED SCOPE } // END NESTED SCOPE
""" % locals() """ % locals()
gpu_multinomial = GpuMultinomial()
@local_optimizer() @local_optimizer()
def use_gpu_multinomial(node): def use_gpu_multinomial(node):
if node.op == multinomial: if node.op == multinomial:
return [host_from_gpu(gpu_multinomial(*[gpu_from_host(i) for i in node.inputs]))] p, u = node.inputs
m, = node.outputs
if p.dtype == u.dtype == m.dtype == 'float32':
gpu_op = GpuMultinomial(op.odtype)
return [host_from_gpu(gpu_op(*[gpu_from_host(i) for i in node.inputs]))]
if cuda_enabled:#theano.config.device.startswith('gpu'): if cuda_enabled:#theano.config.device.startswith('gpu'):
register_specialize(use_gpu_multinomial) register_specialize(use_gpu_multinomial)
# NOTE(review): everything below is dead code -- guarded by `if 0:` -- an older
# copy of the Multinomial/GpuMultinomial Ops kept around after an `hg rebase`
# mishap (see the author's inline comment). It never executes.
# NOTE(review): leading indentation appears to have been stripped when this
# text was extracted; these definitions were presumably nested under `if 0:`
# and under their `class`/`def` statements -- TODO confirm against the
# original file.
if 0: # I hate you hg rebase, I hate you so very, very much.
class Multinomial(Op):
# CPU Op: given a probability matrix `pvals` and a vector of uniform samples
# `unis`, generate C code that writes one one-hot multinomial draw per
# distribution into an output matrix z of the same shape as pvals.
# In this old version the per-multinomial axis is pvals' *second* dimension:
# the generated C uses nb_multi = pvals->dimensions[1] and indexes with
# PyArray_GETPTR2(..., m, n) (outcome index first), i.e. one multinomial per
# column.
# The Op carries no parameters, so instances compare equal and hash by type
# alone -- this lets Theano merge identical apply nodes.
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
def make_node(self, pvals, unis):
# Wrap raw inputs as Theano tensor variables; the single output shares
# pvals' type (same dtype and broadcastable pattern).
pvals = T.as_tensor_variable(pvals)
unis = T.as_tensor_variable(unis)
#assert pvals.dtype == 'float32'
#assert unis.dtype == 'float32'
return Apply(self, [pvals, unis], [pvals.type()])
def grad(self, inp, grads):
# Sampling is not differentiable with respect to either input.
pvals, unis = inp
gz, = grads
return [None, None]
def c_code_cache_version(self):
# Bump this whenever the generated C below changes so cached compiled
# modules are invalidated.
return (3,)
def c_code(self, node, name, inp, out, sub):
# Emit C that: validates ranks (pvals 2-d, unis 1-d) and the shape match
# unis.shape[0] == pvals.shape[1]; (re)allocates z as a zeroed array of
# pvals' shape and dtype; then for each multinomial n accumulates pvals
# into `cummul` and sets z[m,n] = 1 at the first outcome m where
# unis[n] < cummul (inverse-CDF sampling), breaking out of the inner loop.
# No comments are added inside the string below: it is a runtime template
# filled in via `% locals()`.
pvals, unis = inp
z, = out
fail = sub['fail']
return """
if (%(pvals)s->nd != 2)
{
PyErr_Format(PyExc_TypeError, "pvals wrong rank");
%(fail)s;
}
if (%(unis)s->nd != 1)
{
PyErr_Format(PyExc_TypeError, "unis wrong rank");
%(fail)s;
}
if (%(unis)s->dimensions[0] != %(pvals)s->dimensions[1])
{
PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]");
%(fail)s;
}
if ((NULL == %(z)s)
|| ((%(z)s->dimensions)[0] != (%(pvals)s->dimensions)[0])
|| ((%(z)s->dimensions)[1] != (%(pvals)s->dimensions)[1])
)
{
Py_XDECREF(%(z)s);
npy_intp dims[2];
dims[0] = (%(pvals)s->dimensions)[0];
dims[1] = (%(pvals)s->dimensions)[1];
%(z)s = (PyArrayObject*) PyArray_ZEROS(2,
dims,
type_num_%(pvals)s,
0);
if (!%(z)s)
{
PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
%(fail)s;
}
}
{ // NESTED SCOPE
const int nb_outcomes = %(pvals)s->dimensions[0];
const int nb_multi = %(pvals)s->dimensions[1];
//
// For each multinomials, loop over each possible outcome
//
for (int n = 0; n < nb_multi; ++n)
{
dtype_%(pvals)s cummul = 0.;
const dtype_%(unis)s* unis_n = (dtype_%(unis)s*)PyArray_GETPTR1(%(unis)s, n);
for (int m = 0; m < nb_outcomes; ++m)
{
dtype_%(z)s* z_nm = (dtype_%(z)s*)PyArray_GETPTR2(%(z)s, m,n);
const dtype_%(pvals)s* pvals_nm = (dtype_%(pvals)s*)PyArray_GETPTR2(%(pvals)s, m,n);
cummul += *pvals_nm;
if (*unis_n < cummul)
{
*z_nm = 1.;
break;
}
}
}
} // END NESTED SCOPE
""" % locals()
# Module-level singleton instance (dead code: it is inside the `if 0:` guard,
# and the live part of the file appears to construct Multinomial(odtype)
# instead -- see the diff above).
multinomial = Multinomial()
class GpuMultinomial(Multinomial):
# GPU version of Multinomial (dead code: inside the `if 0:` guard above).
# Inherits __eq__/__hash__/__str__/grad from Multinomial, but requires
# float32 CudaNdarray inputs and generates a CUDA kernel plus host-side
# launch glue instead of plain CPU C.
def make_node(self, pvals, unis):
# Both inputs must already live on the GPU as float32; the single output
# shares pvals' type.
assert pvals.dtype == 'float32'
assert unis.dtype == 'float32'
if not isinstance(pvals.type, CudaNdarrayType):
raise TypeError('pvals must be cudandarray', pvals)
if not isinstance(unis.type, CudaNdarrayType):
raise TypeError('unis must be cudandarray', unis)
return Apply(self, [pvals, unis], [pvals.type()])
def c_code_cache_version(self):
# Extend the parent's version tuple so a change to either class's
# generated C invalidates the cached compiled module.
#return ()
return (super(GpuMultinomial,self).c_code_cache_version(),2)
def c_support_code_apply(self, node, nodename):
# CUDA kernel: one thread per multinomial n (n = blockDim.x*blockIdx.x +
# threadIdx.x, guarded by n < nb_multi). Each thread scans the outcomes,
# accumulating pvals until the uniform sample is exceeded, and writes a
# one-hot result with layout global_outs[n + m * nb_multi].
# NOTE(review): global_unis is indexed as global_unis[n] with no stride --
# this relies on the contiguity check for `unis` in c_code below.
# No comments are added inside the string: it is a runtime template.
return """
static __global__ void k_multi_warp_%(nodename)s(
const int nb_multi,
const int nb_outcomes,
const int pvals_row_strides,
const int pvals_col_strides,
float * global_pvals,
float * global_unis,
float * global_outs
)
{
int n = blockDim.x*blockIdx.x + threadIdx.x;
if (n < nb_multi)
{
float cummul = 0.;
bool done = false;
for (int m = 0; m < nb_outcomes; ++m)
{
cummul += global_pvals[n * pvals_col_strides + m * pvals_row_strides];
float current_out = 0.;
if (!done && global_unis[n] < cummul)
{
current_out = 1.;
done = true;
}
global_outs[n + m * nb_multi] = current_out;
}
}
}
""" % locals()
def c_code(self, node, name, inp, out, sub):
# Host-side glue: validate ranks and the shape match unis.shape[0] ==
# pvals.shape[1], require unis to be C-contiguous, (re)allocate z to
# pvals' shape, compute a launch configuration (grow nb_threads from 32
# upward until the block count fits), launch the kernel, then check the
# CUDA error status.
# NOTE(review): `2<<15 - 1` parses as 2 << (15 - 1) == 32768 because `-`
# binds tighter than `<<`; presumably (2<<15)-1 == 65535 (the classic
# gridDim.x limit) was intended. Flagged, not fixed: this branch is dead.
# NOTE(review): the kernel call passes HOST_DIMS(z)[1] as nb_multi and
# HOST_DIMS(z)[0] as nb_outcomes -- consistent with the CPU version's
# one-multinomial-per-column layout; verify against the live code above,
# which transposed this convention.
pvals, unis = inp
z, = out
fail = sub['fail']
return """
if (%(pvals)s->nd != 2)
{
PyErr_Format(PyExc_TypeError, "pvals wrong rank");
%(fail)s;
}
if (%(unis)s->nd != 1)
{
PyErr_Format(PyExc_TypeError, "unis wrong rank");
%(fail)s;
}
if (CudaNdarray_HOST_DIMS(%(unis)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])
{
PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]");
%(fail)s;
}
if (!CudaNdarray_is_c_contiguous(%(unis)s))
{
PyErr_Format(PyExc_NotImplementedError, "require unis to be contiguous");
%(fail)s;
}
// Would be more efficient if pvals were also contiguous but practically I think it is not often the cas,
// since we are working on pvals.T here
if ((NULL == %(z)s)
|| (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[0])
|| (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(pvals)s)[1]))
{
Py_XDECREF(%(z)s);
npy_intp dims[2];
dims[0] = (CudaNdarray_HOST_DIMS(%(pvals)s)[0]);
dims[1] = (CudaNdarray_HOST_DIMS(%(pvals)s)[1]);
%(z)s = (CudaNdarray*)CudaNdarray_NewDims(2, dims);
if (!%(z)s)
{
PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
%(fail)s;
}
}
{ // NESTED SCOPE
int nb_outcomes = CudaNdarray_HOST_DIMS(%(z)s)[0];
int nb_multi = CudaNdarray_HOST_DIMS(%(z)s)[1];
//TODO : change this for a beautiful constant
int max_nb_blocks = 2<<15 - 1;
int nb_blocks = max_nb_blocks + 1;
int nb_threads=16; // so it really starts at 32, because of the *2
do
{
nb_threads*=2;
if (nb_multi %% nb_threads == 0)
nb_blocks = nb_multi/nb_threads;
else
nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
} while (nb_blocks > max_nb_blocks);
//printf("\\nN=%%i b=%%i t=%%i t*b=%%i", nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);
// TODO : next line is a bit hardcoded...
if (nb_threads > 512)
{
PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for as many rows in the matrix (%%i)", nb_multi);
%(fail)s;
}
dim3 n_blocks(nb_blocks,1,1);
dim3 n_threads(nb_threads,1,1);
int n_shared = 0;
k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_HOST_DIMS(%(z)s)[1],
CudaNdarray_HOST_DIMS(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(pvals)s)[0],
CudaNdarray_HOST_STRIDES(%(pvals)s)[1],
CudaNdarray_DEV_DATA(%(pvals)s),
CudaNdarray_DEV_DATA(%(unis)s),
CudaNdarray_DEV_DATA(%(z)s)
);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i; shared: %%i)\\n",
"k_multi_warp_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z,
n_shared);
%(fail)s;
}
} // END NESTED SCOPE
""" % locals()
# Module-level singleton instance (dead code: inside the `if 0:` guard).
gpu_multinomial = GpuMultinomial()
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment