Commit b22677b0 authored by Pascal Lamblin

Remove previous, commented-out code.

Parent 6150143e
@@ -289,260 +289,3 @@ def use_gpu_multinomial(node):
        return [host_from_gpu(gpu_op(*[gpu_from_host(i) for i in node.inputs]))]
if cuda_enabled:#theano.config.device.startswith('gpu'):
    register_specialize(use_gpu_multinomial)
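The three surviving lines above are the specialize-time graph rewrite that swaps the CPU op for its GPU counterpart. As a minimal sketch of that substitution pattern (stand-in names only; the real `gpu_from_host`/`host_from_gpu` live in `theano.sandbox.cuda`, and this is not a drop-in replacement):

    # Sketch only: `Multinomial`, `gpu_multinomial`, `gpu_from_host`, and
    # `host_from_gpu` stand in for the Theano symbols used above.
    def use_gpu_multinomial_sketch(node):
        if isinstance(node.op, Multinomial):
            # Move each input to the GPU, apply the GPU op there, then move
            # the result back so the rest of the graph sees the same interface.
            gpu_inputs = [gpu_from_host(i) for i in node.inputs]
            return [host_from_gpu(gpu_multinomial(*gpu_inputs))]
        return False  # leave other nodes untouched

Everything below is the dead `if 0:` block this commit deletes.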
if 0: # I hate you hg rebase, I hate you so very, very much.

    class Multinomial(Op):
        def __eq__(self, other):
            return type(self) == type(other)

        def __hash__(self):
            return hash(type(self))

        def __str__(self):
            return self.__class__.__name__

        def make_node(self, pvals, unis):
            pvals = T.as_tensor_variable(pvals)
            unis = T.as_tensor_variable(unis)
            #assert pvals.dtype == 'float32'
            #assert unis.dtype == 'float32'
            return Apply(self, [pvals, unis], [pvals.type()])

        def grad(self, inp, grads):
            pvals, unis = inp
            gz, = grads
            return [None, None]

        def c_code_cache_version(self):
            return (3,)

        def c_code(self, node, name, inp, out, sub):
            pvals, unis = inp
            z, = out
            fail = sub['fail']
            return """
        if (%(pvals)s->nd != 2)
        {
            PyErr_Format(PyExc_TypeError, "pvals wrong rank");
            %(fail)s;
        }
        if (%(unis)s->nd != 1)
        {
            PyErr_Format(PyExc_TypeError, "unis wrong rank");
            %(fail)s;
        }
        if (%(unis)s->dimensions[0] != %(pvals)s->dimensions[1])
        {
            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]");
            %(fail)s;
        }

        if ((NULL == %(z)s)
            || ((%(z)s->dimensions)[0] != (%(pvals)s->dimensions)[0])
            || ((%(z)s->dimensions)[1] != (%(pvals)s->dimensions)[1]))
        {
            Py_XDECREF(%(z)s);
            npy_intp dims[2];
            dims[0] = (%(pvals)s->dimensions)[0];
            dims[1] = (%(pvals)s->dimensions)[1];
            %(z)s = (PyArrayObject*) PyArray_ZEROS(2, dims, type_num_%(pvals)s, 0);
            if (!%(z)s)
            {
                PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
                %(fail)s;
            }
        }

        { // NESTED SCOPE
        const int nb_outcomes = %(pvals)s->dimensions[0];
        const int nb_multi = %(pvals)s->dimensions[1];

        //
        // For each multinomial, loop over each possible outcome
        //
        for (int n = 0; n < nb_multi; ++n)
        {
            dtype_%(pvals)s cummul = 0.;
            const dtype_%(unis)s* unis_n = (dtype_%(unis)s*)PyArray_GETPTR1(%(unis)s, n);
            for (int m = 0; m < nb_outcomes; ++m)
            {
                dtype_%(z)s* z_nm = (dtype_%(z)s*)PyArray_GETPTR2(%(z)s, m, n);
                const dtype_%(pvals)s* pvals_nm = (dtype_%(pvals)s*)PyArray_GETPTR2(%(pvals)s, m, n);
                cummul += *pvals_nm;
                if (*unis_n < cummul)
                {
                    *z_nm = 1.;
                    break;
                }
            }
        }
        } // END NESTED SCOPE
        """ % locals()

    multinomial = Multinomial()
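What the deleted C implementation computes, as a minimal NumPy sketch (the helper and its name are illustrative, not part of Theano): each *column* of `pvals` is a probability vector, `unis` holds one uniform draw per column, and the output one-hot-encodes the first outcome whose running total passes the draw.

    import numpy as np

    def multinomial_reference(pvals, unis):
        # pvals: (nb_outcomes, nb_multi), columns assumed to sum to 1.
        # unis:  (nb_multi,) uniform draws in [0, 1).
        nb_outcomes, nb_multi = pvals.shape
        z = np.zeros_like(pvals)
        for n in range(nb_multi):           # one multinomial per column
            cummul = 0.0
            for m in range(nb_outcomes):    # walk down the column...
                cummul += pvals[m, n]
                if unis[n] < cummul:        # ...until the draw is passed
                    z[m, n] = 1.0           # one-hot the selected outcome
                    break
        return z

    # Example: two draws over three outcomes.
    pvals = np.array([[0.1, 0.6],
                      [0.2, 0.3],
                      [0.7, 0.1]])
    print(multinomial_reference(pvals, np.array([0.25, 0.5])))
    # column 0: 0.25 >= 0.1 but < 0.3 -> outcome 1; column 1: 0.5 < 0.6 -> outcome 0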
    class GpuMultinomial(Multinomial):
        def make_node(self, pvals, unis):
            assert pvals.dtype == 'float32'
            assert unis.dtype == 'float32'
            if not isinstance(pvals.type, CudaNdarrayType):
                raise TypeError('pvals must be cudandarray', pvals)
            if not isinstance(unis.type, CudaNdarrayType):
                raise TypeError('unis must be cudandarray', unis)
            return Apply(self, [pvals, unis], [pvals.type()])

        def c_code_cache_version(self):
            #return ()
            return (super(GpuMultinomial, self).c_code_cache_version(), 2)

        def c_support_code_apply(self, node, nodename):
            return """
        static __global__ void k_multi_warp_%(nodename)s(
            const int nb_multi,
            const int nb_outcomes,
            const int pvals_row_strides,
            const int pvals_col_strides,
            float * global_pvals,
            float * global_unis,
            float * global_outs
        )
        {
            int n = blockDim.x*blockIdx.x + threadIdx.x;
            if (n < nb_multi)
            {
                float cummul = 0.;
                bool done = false;
                for (int m = 0; m < nb_outcomes; ++m)
                {
                    cummul += global_pvals[n * pvals_col_strides + m * pvals_row_strides];
                    float current_out = 0.;
                    if (!done && global_unis[n] < cummul)
                    {
                        current_out = 1.;
                        done = true;
                    }
                    global_outs[n + m * nb_multi] = current_out;
                }
            }
        }
        """ % locals()

        def c_code(self, node, name, inp, out, sub):
            pvals, unis = inp
            z, = out
            fail = sub['fail']
            return """
        if (%(pvals)s->nd != 2)
        {
            PyErr_Format(PyExc_TypeError, "pvals wrong rank");
            %(fail)s;
        }
        if (%(unis)s->nd != 1)
        {
            PyErr_Format(PyExc_TypeError, "unis wrong rank");
            %(fail)s;
        }
        if (CudaNdarray_HOST_DIMS(%(unis)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])
        {
            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]");
            %(fail)s;
        }
        if (!CudaNdarray_is_c_contiguous(%(unis)s))
        {
            PyErr_Format(PyExc_NotImplementedError, "require unis to be contiguous");
            %(fail)s;
        }

        // It would be more efficient if pvals were also contiguous, but in
        // practice that is rarely the case, since we are working on pvals.T here.
        if ((NULL == %(z)s)
            || (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[0])
            || (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(pvals)s)[1]))
        {
            Py_XDECREF(%(z)s);
            npy_intp dims[2];
            dims[0] = (CudaNdarray_HOST_DIMS(%(pvals)s)[0]);
            dims[1] = (CudaNdarray_HOST_DIMS(%(pvals)s)[1]);
            %(z)s = (CudaNdarray*)CudaNdarray_NewDims(2, dims);
            if (!%(z)s)
            {
                PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
                %(fail)s;
            }
        }

        { // NESTED SCOPE
        int nb_outcomes = CudaNdarray_HOST_DIMS(%(z)s)[0];
        int nb_multi = CudaNdarray_HOST_DIMS(%(z)s)[1];

        //TODO: change this for a beautiful constant
        int max_nb_blocks = 2<<15 - 1;
        int nb_blocks = max_nb_blocks + 1;
        int nb_threads = 16; // so it really starts at 32, because of the *2
        do
        {
            nb_threads *= 2;
            if (nb_multi %% nb_threads == 0)
                nb_blocks = nb_multi/nb_threads;
            else
                nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
        } while (nb_blocks > max_nb_blocks);
        //printf("\\nN=%%i b=%%i t=%%i t*b=%%i", nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);

        // TODO: the next line is a bit hardcoded...
        if (nb_threads > 512)
        {
            PyErr_Format(PyExc_ValueError, "Multinomial is not implemented for as many rows in the matrix (%%i)", nb_multi);
            %(fail)s;
        }
        dim3 n_blocks(nb_blocks, 1, 1);
        dim3 n_threads(nb_threads, 1, 1);
        int n_shared = 0;

        k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
            CudaNdarray_HOST_DIMS(%(z)s)[1],
            CudaNdarray_HOST_DIMS(%(z)s)[0],
            CudaNdarray_HOST_STRIDES(%(pvals)s)[0],
            CudaNdarray_HOST_STRIDES(%(pvals)s)[1],
            CudaNdarray_DEV_DATA(%(pvals)s),
            CudaNdarray_DEV_DATA(%(unis)s),
            CudaNdarray_DEV_DATA(%(z)s)
        );
        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess != sts)
        {
            PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i; shared: %%i)\\n",
                "k_multi_warp_%(name)s",
                cudaGetErrorString(sts),
                n_blocks.x,
                n_blocks.y,
                n_threads.x,
                n_threads.y,
                n_threads.z,
                n_shared);
            %(fail)s;
        }
        } // END NESTED SCOPE
        """ % locals()

    gpu_multinomial = GpuMultinomial()
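Two details of the deleted GPU path, rendered as a hedged Python sketch (both helpers are illustrative, not Theano API). First, each CUDA thread handles one full draw and writes every outcome slot of its column, so the output needs no zero-initialization. Second, the launch sizing doubles the block size until the grid fits; note that in C, `2<<15 - 1` parses as `2 << 14` == 32768 (binary `-` binds tighter than `<<`), half the 65535 grid-dimension limit the expression was presumably aiming for.

    def kernel_thread_sketch(n, pvals, unis, out):
        # What thread `n` of k_multi_warp does, in index terms: scan the n-th
        # column and write *every* outcome slot, one-hot at the chosen one.
        cummul, done = 0.0, False
        for m in range(pvals.shape[0]):
            cummul += pvals[m, n]
            hit = (not done) and (unis[n] < cummul)
            out[m, n] = 1.0 if hit else 0.0
            done = done or hit

    def launch_config(nb_multi, max_nb_blocks=2 << 15 - 1):  # == 32768, as in the C code
        # Mirror of the deleted do/while sizing loop.
        nb_threads = 16                   # doubled before first use, so starts at 32
        nb_blocks = max_nb_blocks + 1
        while nb_blocks > max_nb_blocks:
            nb_threads *= 2
            if nb_multi % nb_threads == 0:
                nb_blocks = nb_multi // nb_threads
            else:
                nb_blocks = nb_multi // nb_threads + 1   # round up
        if nb_threads > 512:              # per-block limit hardcoded in the C code
            raise ValueError("Multinomial not implemented for %d rows" % nb_multi)
        return nb_blocks, nb_threads

    print(launch_config(2000000))  # -> (31250, 64): 62500 blocks of 32 would overflow the grid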