提交 9f365bf3 authored 作者: Frederic's avatar Frederic

Small optimization for GpuAlloc and memset_0

上级 4c03457c
...@@ -516,30 +516,42 @@ class GpuAlloc(HideC, Alloc): ...@@ -516,30 +516,42 @@ class GpuAlloc(HideC, Alloc):
for (i = 0; i < %(ndim)s; i++) for (i = 0; i < %(ndim)s; i++)
need_new_out |= %(zz)s->ga.dimensions[i] != %(name)s_shape[i]; need_new_out |= %(zz)s->ga.dimensions[i] != %(name)s_shape[i];
if (need_new_out) { if (need_new_out && (%(memset_0)s)) {
//pygpu_zeros can be faster than empty followed by memset.
Py_XDECREF(%(zz)s); Py_XDECREF(%(zz)s);
%(zz)s = pygpu_empty(%(ndim)s, %(name)s_shape, %(zz)s = pygpu_zeros(%(ndim)s, %(name)s_shape,
%(vv)s->ga.typecode, GA_C_ORDER, %(vv)s->ga.typecode, GA_C_ORDER,
pygpu_default_context(), Py_None); pygpu_default_context(), Py_None);
if (!%(zz)s) { if (!%(zz)s) {
%(fail)s %(fail)s
} }
} } else {
if (%(memset_0)s && GpuArray_ISONESEGMENT(&%(zz)s->ga)) if (need_new_out) {
{ Py_XDECREF(%(zz)s);
int err = GpuArray_memset(&%(zz)s->ga, 0); %(zz)s = pygpu_empty(%(ndim)s, %(name)s_shape,
if (err != GA_NO_ERROR) %(vv)s->ga.typecode, GA_C_ORDER,
pygpu_default_context(), Py_None);
if (!%(zz)s) {
%(fail)s
}
}
if (%(memset_0)s && GpuArray_ISONESEGMENT(&%(zz)s->ga))
{ {
PyErr_Format(PyExc_MemoryError, int err = GpuArray_memset(&%(zz)s->ga, 0);
"GpuAlloc: Error memsetting %%d" if (err != GA_NO_ERROR)
" element of device memory to 0.", {
PyGpuArray_SIZE(%(zz)s)); PyErr_Format(PyExc_MemoryError,
%(fail)s; "GpuAlloc: Error memsetting %%d"
" element of device memory to 0.",
PyGpuArray_SIZE(%(zz)s));
%(fail)s;
}
}
else if (GpuArray_setarray(&%(zz)s->ga, &%(vv)s->ga) !=
GA_NO_ERROR) {
PyErr_SetString(PyExc_ValueError, "setarray failed");
%(fail)s
} }
}
else if (GpuArray_setarray(&%(zz)s->ga, &%(vv)s->ga) != GA_NO_ERROR) {
PyErr_SetString(PyExc_ValueError, "setarray failed");
%(fail)s
} }
""" % dict(name=name, ndim=ndim, zz=zz, vv=vv, """ % dict(name=name, ndim=ndim, zz=zz, vv=vv,
fail=sub['fail'], memset_0=memset_0) fail=sub['fail'], memset_0=memset_0)
...@@ -550,7 +562,7 @@ class GpuAlloc(HideC, Alloc): ...@@ -550,7 +562,7 @@ class GpuAlloc(HideC, Alloc):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (2,)
gpu_alloc = GpuAlloc() gpu_alloc = GpuAlloc()
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论