提交 b2d0f53d authored 作者: Pascal Lamblin's avatar Pascal Lamblin

New version of the C-code generator for Elemwise, enabling loop reordering.

- Code reviewed by Olivier B., thanks! - Lots of comments added - Unused code removed - Looping in Elemwise is now done so the contiguous dimension of the output is looped over in the inner-most loop. It does not change anything in the non-inplace case (the output was C-contiguous), but can lead to really better performances (at least 7x) in the inplace case.
上级 cfd4bef8
...@@ -665,10 +665,18 @@ class Elemwise(Op): ...@@ -665,10 +665,18 @@ class Elemwise(Op):
defines = "" defines = ""
undefs = "" undefs = ""
dmap = dict([(node.outputs[o], [node.inputs[i]]) for o, i in self.inplace_pattern.items()])
# The destroy map is a map of output indices to input indices
# that overwrite them. We just convert them to the actual
# Variables.
dmap = dict([(node.outputs[o], [node.inputs[i]])
for o, i in self.inplace_pattern.iteritems()])
# dtypes of the inputs
idtypes = [input.type.dtype_specs()[1] for input in inputs] idtypes = [input.type.dtype_specs()[1] for input in inputs]
# These are the outputs that we will need to allocate
# (output, name, name of the c type), transposed
real = zip(*[(r, s, r.type.dtype_specs()[1]) real = zip(*[(r, s, r.type.dtype_specs()[1])
for r, s in zip(node.outputs, onames) if r not in dmap]) for r, s in zip(node.outputs, onames) if r not in dmap])
if real: if real:
...@@ -676,6 +684,9 @@ class Elemwise(Op): ...@@ -676,6 +684,9 @@ class Elemwise(Op):
else: else:
real_outputs, real_onames, real_odtypes = [], [], [] real_outputs, real_onames, real_odtypes = [], [], []
# Outputs that are aliased with an input (inplace)
# (output, name), transposed (c type name not needed since we don't
# need to allocate.
aliased = zip(*[(r, s) aliased = zip(*[(r, s)
for (r, s) in zip(node.outputs, onames) if r in dmap]) for (r, s) in zip(node.outputs, onames) if r in dmap])
if aliased: if aliased:
...@@ -683,25 +694,49 @@ class Elemwise(Op): ...@@ -683,25 +694,49 @@ class Elemwise(Op):
else: else:
aliased_outputs, aliased_onames = [], [] aliased_outputs, aliased_onames = [], []
orders = [[x and 'x' or i for i, x in enumerate(input.type.broadcastable)] for input in inputs] # for each input:
# same as range(ndim), but with 'x' at all broadcastable positions
orders = [[x and 'x' or i
for i, x in enumerate(input.type.broadcastable)]
for input in inputs]
# number of nested loops we will need (all inputs have same
# dimensionality)
nnested = len(orders[0]) nnested = len(orders[0])
sub = dict(sub) sub = dict(sub)
for i, (input, iname) in enumerate(zip(inputs, inames)): for i, (input, iname) in enumerate(zip(inputs, inames)):
# the c generators will substitute the input names for
# references to loop variables lv0, lv1, ...
sub['lv%i' % i] = iname sub['lv%i' % i] = iname
decl = cgen.make_declare(orders, idtypes, sub) decl = cgen.make_declare(orders, idtypes, sub)
checks = cgen.make_checks(orders, idtypes, sub) checks = cgen.make_checks(orders, idtypes, sub)
alloc = "" alloc = ""
# We loop over the "real" outputs, i.e., those that are not
# inplace (must be allocated) and we declare/allocate/check
# them
for output, oname, odtype in zip(real_outputs, real_onames, real_odtypes): for output, oname, odtype in zip(real_outputs, real_onames, real_odtypes):
i += 1 i += 1 # before this loop, i = number of inputs
sub['lv%i' % i] = oname sub['lv%i' % i] = oname
sub['olv'] = oname sub['olv'] = oname
alloc += cgen.make_declare([range(nnested)], [odtype], dict(sub, lv0 = oname)) alloc += cgen.make_declare([range(nnested)], [odtype],
dict(sub, lv0 = oname))
alloc += cgen.make_alloc(orders, odtype, sub) alloc += cgen.make_alloc(orders, odtype, sub)
alloc += cgen.make_checks([range(nnested)], [odtype], dict(sub, lv0 = oname)) alloc += cgen.make_checks([range(nnested)], [odtype],
dict(sub, lv0 = oname))
olv_index = i # index of the last output
# We loop over the "aliased" outputs, i.e., those that are
# inplace (overwrite the contents of one of the inputs) and
# make the output pointers point to theur corresponding input
# pointers.
for output, oname in zip(aliased_outputs, aliased_onames): for output, oname in zip(aliased_outputs, aliased_onames):
iname = inames[inputs.index(dmap[output][0])] olv_index = inputs.index(dmap[output][0])
iname = inames[olv_index]
# We make the output point to the corresponding input and
# decrease the reference of whatever the output contained
# prior to this
alloc += """ alloc += """
if (%(oname)s) { if (%(oname)s) {
Py_XDECREF(%(oname)s); Py_XDECREF(%(oname)s);
...@@ -709,17 +744,30 @@ class Elemwise(Op): ...@@ -709,17 +744,30 @@ class Elemwise(Op):
%(oname)s = %(iname)s; %(oname)s = %(iname)s;
Py_XINCREF(%(oname)s); Py_XINCREF(%(oname)s);
""" % locals() """ % locals()
# We alias the scalar variables
defines += "#define %(oname)s_i %(iname)s_i" % locals() defines += "#define %(oname)s_i %(iname)s_i" % locals()
undefs += "#undef %(oname)s_i" % locals() undefs += "#undef %(oname)s_i" % locals()
task_code = self.scalar_op.c_code(Apply(self.scalar_op, # Note: here, olv_index is either the index of the last output
# which is allocated, OR, if there are any aliased outputs,
# the index of the last of these aliased outputs.
# We declare the scalar variables used in the inner loop to do
# the element-wise computation. Aliased scalar variables need
# not be declared, as they are #defined in defines
task_decl = "".join(["%(dtype)s& %(name)s_i = *%(name)s_iter;\n" % locals()
for name, dtype in zip(inames + list(real_onames),
idtypes + list(real_odtypes))])
# We generate the C code of the inner loop using the scalar op
task_code = self.scalar_op.c_code(
Apply(self.scalar_op,
[Scalar(dtype = input.type.dtype)() for input in node.inputs], [Scalar(dtype = input.type.dtype)() for input in node.inputs],
[Scalar(dtype = output.type.dtype)() for output in node.outputs]), [Scalar(dtype = output.type.dtype)() for output in node.outputs]),
name + '_scalar_', name + '_scalar_',
["%s_i" % s for s in _inames], ["%s_i" % s for s in _inames],
["%s_i" % s for s in onames], ["%s_i" % s for s in onames],
sub) sub)
task_decl = "".join(["%(dtype)s& %(name)s_i = *%(name)s_iter;\n" % locals() for name, dtype in zip(inames + list(real_onames), idtypes + list(real_odtypes))])
code = """ code = """
{ {
%(defines)s %(defines)s
...@@ -728,22 +776,28 @@ class Elemwise(Op): ...@@ -728,22 +776,28 @@ class Elemwise(Op):
%(undefs)s %(undefs)s
} }
""" % locals() """ % locals()
if nnested:
all_code = [("", "")] * (nnested - 1) + [("", code)] + [""] loop = cgen.make_reordered_loop(
else: init_loop_orders = orders + [range(nnested)] * len(real_onames),
all_code = [code] olv_index = olv_index,
loop = cgen.make_loop(orders + [range(nnested)] * len(real_onames), idtypes + list(real_odtypes), all_code, sub) dtypes = idtypes + list(real_odtypes),
inner_task = code,
sub = sub)
return decl, checks, alloc, loop return decl, checks, alloc, loop
def c_code(self, node, name, inames, onames, sub): def c_code(self, node, name, inames, onames, sub):
code = "\n".join(self._c_all(node, name, inames, onames, sub)) code = "\n".join(self._c_all(node, name, inames, onames, sub))
return code return code
def c_headers(self):
return ['<vector>', '<algorithm>']
def c_support_code(self): def c_support_code(self):
return self.scalar_op.c_support_code() support_code = self.scalar_op.c_support_code()
return support_code
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
version = [4] # the version corresponding to the c code in this Op version = [5] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op, scalar_node = Apply(self.scalar_op,
...@@ -989,8 +1043,12 @@ for(int i=0;i<%(iname)s->nd;i++){ ...@@ -989,8 +1043,12 @@ for(int i=0;i<%(iname)s->nd;i++){
code = "\n".join(self._c_all(node, name, inames, onames, sub)) code = "\n".join(self._c_all(node, name, inames, onames, sub))
return code return code
def c_headers(self):
# Sometimes, Elemwise's c_code is returned, so we need its headers
return ['<vector>', '<algorithm>']
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
version = [3] # the version corresponding to the c code in this Op version = [4] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op, scalar_node = Apply(self.scalar_op,
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论