Commit ed9c389c authored by Arnaud Bergeron

Make the GpuCrossentropySoftmaxArgmax1HotWithBias c_code more readable.

Parent: dfe973bc
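The main readability win is in output allocation. Previously each of the three outputs (nll, sm, am) was prepared by a hand-rolled block: a NULL/shape check, Py_XDECREF, pygpu_empty, and a per-output error message. The new code replaces each block with a single theano_prep_output call from gpuarray_helper.h, which is why c_headers gains that header and the new c_header_dirs points the compiler at the module's own directory via os.path.dirname(__file__). A minimal sketch of the before/after pattern, with the %(...)s template substitutions written as plain C names (nll, y_idx, x, ctx, typecode_x are stand-ins, so this fragment is illustrative rather than compilable on its own):

    /* Before: one verbose (re)allocation block per output. */
    if (NULL == nll ||
        PyGpuArray_DIMS(nll)[0] != PyGpuArray_DIMS(y_idx)[0])
    {
        Py_XDECREF(nll);
        nll = pygpu_empty(1, PyGpuArray_DIMS(y_idx),
                          typecode_x, GA_C_ORDER, ctx, Py_None);
        if (!nll) {
            /* fail */
        }
    }

    /* After: the helper performs the same check/decref/allocate
       dance and returns non-zero on failure. */
    if (theano_prep_output(&nll, 1, PyGpuArray_DIMS(y_idx),
                           x->ga.typecode, GA_C_ORDER, ctx))
    {
        /* fail */
    }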
 from __future__ import absolute_import, print_function, division
+import os
 import numpy
 from theano import Op, Apply, config
@@ -45,7 +46,10 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
         return node.inputs[0].type.context

     def c_headers(self):
-        return ['<numpy_compat.h>', '<gpuarray/types.h>']
+        return ['<numpy_compat.h>', '<gpuarray/types.h>', 'gpuarray_helper.h']
+
+    def c_header_dirs(self):
+        return [os.path.dirname(__file__)]

     def gpu_kernels(self, node, nodename):
         dtype_x = node.inputs[0].dtype
@@ -191,9 +195,6 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
     def c_code(self, node, nodename, inp, out, sub):
         if node.inputs[0].type.context.kind != b'cuda':
             raise NotImplementedError('cuda only')
-        typecode_x = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
-        typecode_b = pygpu.gpuarray.dtype_to_typecode(node.inputs[1].dtype)
-        typecode_y_idx = pygpu.gpuarray.dtype_to_typecode(node.inputs[2].dtype)
         itemsize_x = numpy.dtype(node.inputs[0].dtype).itemsize
         worksize_x = numpy.dtype(work_dtype(node.inputs[0].dtype)).itemsize
         itemsize_b = numpy.dtype(node.inputs[1].dtype).itemsize
@@ -203,13 +204,6 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
         itemsize_am = numpy.dtype(node.outputs[2].dtype).itemsize
         x, b, y_idx = inp
         nll, sm, am = out
-        dtype_x = node.inputs[0].dtype
-        dtype_b = node.inputs[1].dtype
-        dtype_y_idx = node.inputs[2].dtype
-        dtype_nll = node.outputs[0].dtype
-        dtype_sm = node.outputs[1].dtype
-        dtype_am = node.outputs[2].dtype
-        classname = self.__class__.__name__
         fail = sub['fail']
         ctx = sub['params']
         k_var = "k_xent_sm_1hot_bias_%(nodename)s" % locals()
@@ -229,21 +223,6 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
         """ % locals()
         sio = StringIO()
         print("""
-        if (PyGpuArray_NDIM(%(y_idx)s) != 1)
-        {
-            PyErr_SetString(PyExc_ValueError, "y_idx not 1d tensor");
-            %(fail)s;
-        }
-        if (PyGpuArray_NDIM(%(x)s) != 2)
-        {
-            PyErr_SetString(PyExc_ValueError, "x not 2d tensor");
-            %(fail)s;
-        }
-        if (PyGpuArray_NDIM(%(b)s) != 1)
-        {
-            PyErr_SetString(PyExc_ValueError, "b not 1d tensor");
-            %(fail)s;
-        }
         if (PyGpuArray_DIMS(%(x)s)[0] !=
             PyGpuArray_DIMS(%(y_idx)s)[0])
         {
@@ -257,82 +236,32 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
                 "dimension mismatch in x,b arguments");
             %(fail)s;
         }
-        if ((NULL == %(nll)s) //initial condition
-            || (PyGpuArray_DIMS(%(nll)s)[0] !=
-                PyGpuArray_DIMS(%(y_idx)s)[0]))
-        {
-            Py_XDECREF(%(nll)s);
-            %(nll)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
-                                  %(typecode_x)s, GA_C_ORDER, %(ctx)s,
-                                  Py_None);
-            if (!%(nll)s) {
-                %(fail)s
-            }
-        }
-        if ((NULL == %(sm)s)
-            || (PyGpuArray_DIMS(%(sm)s)[0] !=
-                PyGpuArray_DIMS(%(x)s)[0])
-            || (PyGpuArray_DIMS(%(sm)s)[1] !=
-                PyGpuArray_DIMS(%(x)s)[1]))
-        {
-            Py_XDECREF(%(sm)s);
-            %(sm)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
-                                 %(typecode_b)s, GA_C_ORDER,
-                                 %(ctx)s, Py_None);
-            if(!%(sm)s)
-            {
-                PyErr_SetString(PyExc_MemoryError,
-                                "failed to alloc sm output");
-                // no need to decref cnda_nll, the cleanup code should do it up
-                %(fail)s;
-            }
-        }
-        if ((NULL == %(am)s)
-            || (PyGpuArray_DIMS(%(am)s)[0] !=
-                PyGpuArray_DIMS(%(y_idx)s)[0]))
-        {
-            Py_XDECREF(%(am)s);
-            %(am)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
-                                 %(typecode_y_idx)s, GA_C_ORDER,
-                                 %(ctx)s, Py_None);
-            if(!%(am)s)
-            {
-                PyErr_SetString(PyExc_MemoryError,
-                                "failed to alloc am output");
-                // no need to decref nll and sm,
-                // the cleanup code should do it up
-                %(fail)s;
-            }
-        }
+        if (theano_prep_output(&%(nll)s, 1, PyGpuArray_DIMS(%(y_idx)s), %(x)s->ga.typecode, GA_C_ORDER, %(ctx)s)) %(fail)s
+        if (theano_prep_output(&%(sm)s, 2, PyGpuArray_DIMS(%(x)s), %(x)s->ga.typecode, GA_C_ORDER, %(ctx)s)) %(fail)s
+        if (theano_prep_output(&%(am)s, 1, PyGpuArray_DIMS(%(y_idx)s), %(y_idx)s->ga.typecode, GA_C_ORDER, %(ctx)s)) %(fail)s
         {
             size_t n_blocks = std::min(PyGpuArray_DIM(%(x)s, 0), (size_t)4096);
             size_t n_threads = std::min(PyGpuArray_DIM(%(x)s, 1), (size_t)256);
             size_t n_shared = n_threads * %(worksize_x)s;
-            ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
-            ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s;
-            ssize_t stride_B0 = PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s;
-            ssize_t stride_YIDX0 = PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s;
-            ssize_t stride_NLL0 = PyGpuArray_STRIDES(%(nll)s)[0] / %(itemsize_nll)s;
-            ssize_t stride_SM0 = PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s;
-            ssize_t stride_SM1 = PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s;
-            ssize_t stride_AM0 = PyGpuArray_STRIDES(%(am)s)[0] / %(itemsize_am)s;
             //TODO: launch more threads per row and do parallel sum and max reductions
-            void *kernel_params[] = {
-                (void *)&PyGpuArray_DIMS(%(x)s)[0],
-                (void *)&PyGpuArray_DIMS(%(x)s)[1],
-                (void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
-                (void *)&stride_X0, (void *)&stride_X1,
-                (void *)%(b)s->ga.data, (void *)&%(b)s->ga.offset,
-                (void *)&stride_B0,
-                (void *)%(y_idx)s->ga.data, (void *)&%(y_idx)s->ga.offset,
-                (void *)&stride_YIDX0,
-                (void *)%(nll)s->ga.data, (void *)&%(nll)s->ga.offset,
-                (void *)&stride_NLL0,
-                (void *)%(sm)s->ga.data, (void *)&%(sm)s->ga.offset,
-                (void *)&stride_SM0, (void *)&stride_SM1,
-                (void *)%(am)s->ga.data, (void *)&%(am)s->ga.offset,
-                (void *)&stride_AM0};
-            int err = GpuKernel_call(&%(k_var)s, 1, &n_threads, &n_blocks, n_shared, kernel_params);
+            int err = k_xent_sm_1hot_bias_call(
+                1, &n_blocks, &n_threads, n_shared,
+                PyGpuArray_DIMS(%(x)s)[0],
+                PyGpuArray_DIMS(%(x)s)[1],
+                %(x)s->ga.data, %(x)s->ga.offset,
+                PyGpuArray_STRIDE(%(x)s, 0) / %(itemsize_x)s,
+                PyGpuArray_STRIDE(%(x)s, 1) / %(itemsize_x)s,
+                %(b)s->ga.data, %(b)s->ga.offset,
+                PyGpuArray_STRIDE(%(b)s, 0) / %(itemsize_b)s,
+                %(y_idx)s->ga.data, %(y_idx)s->ga.offset,
+                PyGpuArray_STRIDE(%(y_idx)s, 0) / %(itemsize_y_idx)s,
+                %(nll)s->ga.data, %(nll)s->ga.offset,
+                PyGpuArray_STRIDE(%(nll)s, 0) / %(itemsize_nll)s,
+                %(sm)s->ga.data, %(sm)s->ga.offset,
+                PyGpuArray_STRIDE(%(sm)s, 0) / %(itemsize_sm)s,
+                PyGpuArray_STRIDE(%(sm)s, 1) / %(itemsize_sm)s,
+                %(am)s->ga.data, %(am)s->ga.offset,
+                PyGpuArray_STRIDE(%(am)s, 0) / %(itemsize_am)s);
             %(err_check)s
             %(sync)s
         }
@@ -340,7 +269,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
         return sio.getvalue()

     def c_code_cache_version(self):
-        return (10,)
+        return (12,)

 gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
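The kernel launch is simplified the same way. The old code packed every dimension, data pointer, offset, and stride into an untyped void *kernel_params[] array for GpuKernel_call, where a misordered or mistyped entry compiles silently; the new code calls k_xent_sm_1hot_bias_call, a wrapper with one typed parameter per kernel argument (presumably emitted by the GpuKernelBase machinery from the gpu_kernels specification), which also lets the intermediate stride_* locals disappear. A reduced sketch of the contrast for a hypothetical one-input kernel (my_kernel_call, n_rows, x_data, x_offset, and stride_x0 are stand-ins, not names from the diff):

    /* Before: untyped parameter array; the compiler cannot check
       argument order or types against the kernel signature. */
    void *kernel_params[] = {
        (void *)&n_rows,
        (void *)x_data, (void *)&x_offset,
        (void *)&stride_x0};
    int err = GpuKernel_call(&k, 1, &n_blocks, &n_threads, n_shared,
                             kernel_params);

    /* After: a typed wrapper, so each argument is checked at the
       call site and passed directly. */
    int err = my_kernel_call(1, &n_blocks, &n_threads, n_shared,
                             n_rows, x_data, x_offset, stride_x0);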