提交 137ce6a3 authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Use provided output memory if not contiguous

It may be slower than allocating new memory and calling memset, but if that non-contiguous memory was provided, it is probable that the output would otherwise end up being copied into it anyway, which would be even slower.
上级 10aa713a
......@@ -2721,7 +2721,7 @@ class GpuAlloc(GpuOp):
str += "if(%(out)s==NULL\n" % locals()
for idx, sh in enumerate(shps):
str += "||CudaNdarray_HOST_DIMS(%(out)s)[%(idx)s]!=dims[%(idx)s]" % locals()
str += """||(%(memset_0)s && !CudaNdarray_is_c_contiguous(%(out)s))){
str += """){
Py_XDECREF(%(out)s);
%(out)s = (CudaNdarray*)CudaNdarray_New();
if (!%(out)s)
......@@ -2737,7 +2737,7 @@ class GpuAlloc(GpuOp):
%(fail)s;
}
}
if (%(memset_0)s)
if (%(memset_0)s && CudaNdarray_is_c_contiguous(%(out)s))
{
if (cudaSuccess != cudaMemset(%(out)s->devdata, 0,
CudaNdarray_SIZE(%(out)s) * 4))
......@@ -2769,7 +2769,7 @@ class GpuAlloc(GpuOp):
return [None for i in inputs]
def c_code_cache_version(self):
    """Return the version tag used to cache the generated C code.

    Bumped from ``(6,)`` to ``(7,)`` because the generated ``c_code``
    changed: it now reuses a provided non-contiguous output buffer
    (only calling ``cudaMemset`` when the output is C-contiguous)
    instead of always reallocating, so stale compiled code must be
    invalidated.

    :returns: a tuple of ints identifying the C code version.
    """
    # NOTE(review): the diff extraction left a stale, unreachable
    # `return (6,)` before this line; only the bumped version is kept.
    return (7,)
def do_constant_folding(self, node):
for client in node.outputs[0].clients:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论