提交 c807a893 authored 作者: James Bergstra's avatar James Bergstra

Limit the loop fusion so that the arguments can fit in the parameter list of a GPU kernel

上级 4daeb07f
...@@ -1321,6 +1321,21 @@ def local_elemwise_fusion(node): ...@@ -1321,6 +1321,21 @@ def local_elemwise_fusion(node):
assert len(n.outputs)==1 assert len(n.outputs)==1
assert node.outputs[0].dtype==n.outputs[0].dtype assert node.outputs[0].dtype==n.outputs[0].dtype
# There is a hard limit of 256 bytes for the formal argument list to a GPU kernel function.
# Here, we estimate how many bytes the new Op will need, and abort if it needs too much.
if True:
argument_limit = 200 # 256 didn't work, but a lower number did... so something funny
# is going on
int_size = 4
ptr_size = 4
argument_size = 4 #for numels
argument_size += int_size * inputs[0].type.ndim # for the shape
argument_size += sum((ptr_size + int_size * i.type.ndim) for i in n.inputs)
argument_size += sum((ptr_size + int_size * i.type.ndim) for i in n.outputs)
if argument_size >= argument_limit:
_logger.warning('loop fusion failed because Op would exceed kernel argument limit.')
return False
# print "local_elemwise_fusion: FUSED",nb_elemwise+1,"elemwise!" # print "local_elemwise_fusion: FUSED",nb_elemwise+1,"elemwise!"
return n.outputs return n.outputs
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论