提交 b2d0f53d authored 作者: Pascal Lamblin's avatar Pascal Lamblin

New version of the C-code generator for Elemwise, enabling loop reordering.

- Code reviewed by Olivier B., thanks! - Lots of comments added - Unused code removed - Looping in Elemwise is now done so the contiguous dimension of the output is looped over in the inner-most loop. It does not change anything in the non-inplace case (the output was C-contiguous), but can lead to really better performances (at least 7x) in the inplace case.
上级 cfd4bef8
...@@ -665,10 +665,18 @@ class Elemwise(Op): ...@@ -665,10 +665,18 @@ class Elemwise(Op):
defines = "" defines = ""
undefs = "" undefs = ""
dmap = dict([(node.outputs[o], [node.inputs[i]]) for o, i in self.inplace_pattern.items()])
# The destroy map is a map of output indices to input indices
# that overwrite them. We just convert them to the actual
# Variables.
dmap = dict([(node.outputs[o], [node.inputs[i]])
for o, i in self.inplace_pattern.iteritems()])
# dtypes of the inputs
idtypes = [input.type.dtype_specs()[1] for input in inputs] idtypes = [input.type.dtype_specs()[1] for input in inputs]
# These are the outputs that we will need to allocate
# (output, name, name of the c type), transposed
real = zip(*[(r, s, r.type.dtype_specs()[1]) real = zip(*[(r, s, r.type.dtype_specs()[1])
for r, s in zip(node.outputs, onames) if r not in dmap]) for r, s in zip(node.outputs, onames) if r not in dmap])
if real: if real:
...@@ -676,6 +684,9 @@ class Elemwise(Op): ...@@ -676,6 +684,9 @@ class Elemwise(Op):
else: else:
real_outputs, real_onames, real_odtypes = [], [], [] real_outputs, real_onames, real_odtypes = [], [], []
# Outputs that are aliased with an input (inplace)
# (output, name), transposed (c type name not needed since we don't
# need to allocate.)
aliased = zip(*[(r, s) aliased = zip(*[(r, s)
for (r, s) in zip(node.outputs, onames) if r in dmap]) for (r, s) in zip(node.outputs, onames) if r in dmap])
if aliased: if aliased:
...@@ -683,25 +694,49 @@ class Elemwise(Op): ...@@ -683,25 +694,49 @@ class Elemwise(Op):
else: else:
aliased_outputs, aliased_onames = [], [] aliased_outputs, aliased_onames = [], []
orders = [[x and 'x' or i for i, x in enumerate(input.type.broadcastable)] for input in inputs] # for each input:
# same as range(ndim), but with 'x' at all broadcastable positions
orders = [[x and 'x' or i
for i, x in enumerate(input.type.broadcastable)]
for input in inputs]
# number of nested loops we will need (all inputs have same
# dimensionality)
nnested = len(orders[0]) nnested = len(orders[0])
sub = dict(sub) sub = dict(sub)
for i, (input, iname) in enumerate(zip(inputs, inames)): for i, (input, iname) in enumerate(zip(inputs, inames)):
# the c generators will substitute the input names for
# references to loop variables lv0, lv1, ...
sub['lv%i' % i] = iname sub['lv%i' % i] = iname
decl = cgen.make_declare(orders, idtypes, sub) decl = cgen.make_declare(orders, idtypes, sub)
checks = cgen.make_checks(orders, idtypes, sub) checks = cgen.make_checks(orders, idtypes, sub)
alloc = "" alloc = ""
# We loop over the "real" outputs, i.e., those that are not
# inplace (must be allocated) and we declare/allocate/check
# them
for output, oname, odtype in zip(real_outputs, real_onames, real_odtypes): for output, oname, odtype in zip(real_outputs, real_onames, real_odtypes):
i += 1 i += 1 # before this loop, i = number of inputs
sub['lv%i' % i] = oname sub['lv%i' % i] = oname
sub['olv'] = oname sub['olv'] = oname
alloc += cgen.make_declare([range(nnested)], [odtype], dict(sub, lv0 = oname)) alloc += cgen.make_declare([range(nnested)], [odtype],
dict(sub, lv0 = oname))
alloc += cgen.make_alloc(orders, odtype, sub) alloc += cgen.make_alloc(orders, odtype, sub)
alloc += cgen.make_checks([range(nnested)], [odtype], dict(sub, lv0 = oname)) alloc += cgen.make_checks([range(nnested)], [odtype],
dict(sub, lv0 = oname))
olv_index = i # index of the last output
# We loop over the "aliased" outputs, i.e., those that are
# inplace (overwrite the contents of one of the inputs) and
# make the output pointers point to their corresponding input
# pointers.
for output, oname in zip(aliased_outputs, aliased_onames): for output, oname in zip(aliased_outputs, aliased_onames):
iname = inames[inputs.index(dmap[output][0])] olv_index = inputs.index(dmap[output][0])
iname = inames[olv_index]
# We make the output point to the corresponding input and
# decrease the reference of whatever the output contained
# prior to this
alloc += """ alloc += """
if (%(oname)s) { if (%(oname)s) {
Py_XDECREF(%(oname)s); Py_XDECREF(%(oname)s);
...@@ -709,17 +744,30 @@ class Elemwise(Op): ...@@ -709,17 +744,30 @@ class Elemwise(Op):
%(oname)s = %(iname)s; %(oname)s = %(iname)s;
Py_XINCREF(%(oname)s); Py_XINCREF(%(oname)s);
""" % locals() """ % locals()
# We alias the scalar variables
defines += "#define %(oname)s_i %(iname)s_i" % locals() defines += "#define %(oname)s_i %(iname)s_i" % locals()
undefs += "#undef %(oname)s_i" % locals() undefs += "#undef %(oname)s_i" % locals()
task_code = self.scalar_op.c_code(Apply(self.scalar_op, # Note: here, olv_index is either the index of the last output
# which is allocated, OR, if there are any aliased outputs,
# the index of the last of these aliased outputs.
# We declare the scalar variables used in the inner loop to do
# the element-wise computation. Aliased scalar variables need
# not be declared, as they are #defined in defines
task_decl = "".join(["%(dtype)s& %(name)s_i = *%(name)s_iter;\n" % locals()
for name, dtype in zip(inames + list(real_onames),
idtypes + list(real_odtypes))])
# We generate the C code of the inner loop using the scalar op
task_code = self.scalar_op.c_code(
Apply(self.scalar_op,
[Scalar(dtype = input.type.dtype)() for input in node.inputs], [Scalar(dtype = input.type.dtype)() for input in node.inputs],
[Scalar(dtype = output.type.dtype)() for output in node.outputs]), [Scalar(dtype = output.type.dtype)() for output in node.outputs]),
name + '_scalar_', name + '_scalar_',
["%s_i" % s for s in _inames], ["%s_i" % s for s in _inames],
["%s_i" % s for s in onames], ["%s_i" % s for s in onames],
sub) sub)
task_decl = "".join(["%(dtype)s& %(name)s_i = *%(name)s_iter;\n" % locals() for name, dtype in zip(inames + list(real_onames), idtypes + list(real_odtypes))])
code = """ code = """
{ {
%(defines)s %(defines)s
...@@ -728,22 +776,28 @@ class Elemwise(Op): ...@@ -728,22 +776,28 @@ class Elemwise(Op):
%(undefs)s %(undefs)s
} }
""" % locals() """ % locals()
if nnested:
all_code = [("", "")] * (nnested - 1) + [("", code)] + [""] loop = cgen.make_reordered_loop(
else: init_loop_orders = orders + [range(nnested)] * len(real_onames),
all_code = [code] olv_index = olv_index,
loop = cgen.make_loop(orders + [range(nnested)] * len(real_onames), idtypes + list(real_odtypes), all_code, sub) dtypes = idtypes + list(real_odtypes),
inner_task = code,
sub = sub)
return decl, checks, alloc, loop return decl, checks, alloc, loop
def c_code(self, node, name, inames, onames, sub): def c_code(self, node, name, inames, onames, sub):
code = "\n".join(self._c_all(node, name, inames, onames, sub)) code = "\n".join(self._c_all(node, name, inames, onames, sub))
return code return code
def c_headers(self):
return ['<vector>', '<algorithm>']
def c_support_code(self): def c_support_code(self):
return self.scalar_op.c_support_code() support_code = self.scalar_op.c_support_code()
return support_code
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
version = [4] # the version corresponding to the c code in this Op version = [5] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op, scalar_node = Apply(self.scalar_op,
...@@ -989,8 +1043,12 @@ for(int i=0;i<%(iname)s->nd;i++){ ...@@ -989,8 +1043,12 @@ for(int i=0;i<%(iname)s->nd;i++){
code = "\n".join(self._c_all(node, name, inames, onames, sub)) code = "\n".join(self._c_all(node, name, inames, onames, sub))
return code return code
def c_headers(self):
# Sometimes, Elemwise's c_code is returned, so we need its headers
return ['<vector>', '<algorithm>']
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
version = [3] # the version corresponding to the c code in this Op version = [4] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op, scalar_node = Apply(self.scalar_op,
......
def make_declare(loop_orders, dtypes, sub): def make_declare(loop_orders, dtypes, sub):
"""
Produce code to declare all necessary variables.
"""
decl = "" decl = ""
for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)): for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)):
var = sub['lv%i' % i] var = sub['lv%i' % i] # input name corresponding to ith loop variable
# we declare an iteration variable
# and an integer for the number of dimensions
decl += """ decl += """
%(dtype)s* %(var)s_iter; %(dtype)s* %(var)s_iter;
int %(var)s_nd; int %(var)s_nd;
""" % locals() """ % locals()
for j, value in enumerate(loop_order): for j, value in enumerate(loop_order):
if value != 'x': if value != 'x':
# If the dimension is not broadcasted, we declare
# the number of elements in that dimension,
# the stride in that dimension,
# and the jump from an iteration to the next
decl += """ decl += """
int %(var)s_n%(value)i; int %(var)s_n%(value)i;
int %(var)s_stride%(value)i; int %(var)s_stride%(value)i;
int %(var)s_jump%(value)i_%(j)i; int %(var)s_jump%(value)i_%(j)i;
""" % locals() """ % locals()
else: else:
# if the dimension is broadcasted, we only need
# the jump (arbitrary length and stride = 0)
decl += """ decl += """
int %(var)s_jump%(value)s_%(j)i; int %(var)s_jump%(value)s_%(j)i;
""" % locals() """ % locals()
...@@ -27,8 +39,12 @@ def make_checks(loop_orders, dtypes, sub): ...@@ -27,8 +39,12 @@ def make_checks(loop_orders, dtypes, sub):
init = "" init = ""
for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)): for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)):
var = "%%(lv%i)s" % i var = "%%(lv%i)s" % i
# List of dimensions of var that are not broadcasted
nonx = [x for x in loop_order if x != 'x'] nonx = [x for x in loop_order if x != 'x']
if nonx: if nonx:
# If there are dimensions that are not broadcasted
# this is a check that the number of dimensions of the
# tensor is as expected.
min_nd = max(nonx) + 1 min_nd = max(nonx) + 1
init += """ init += """
if (%(var)s->nd < %(min_nd)s) { if (%(var)s->nd < %(min_nd)s) {
...@@ -36,10 +52,20 @@ def make_checks(loop_orders, dtypes, sub): ...@@ -36,10 +52,20 @@ def make_checks(loop_orders, dtypes, sub):
%%(fail)s %%(fail)s
} }
""" % locals() """ % locals()
adjust = []
for j, index in reversed([aaa for aaa in enumerate(loop_order)]): # In loop j, adjust represents the difference of values of the
# data pointer between the beginning and the end of the
# execution of loop j+1 (the loop inside the current one). It
# is equal to the stride in loop j+1 times the length of loop
# j+1, or 0 for the inner-most loop.
adjust = "0"
# We go from the inner loop to the outer loop
for j, index in reversed(list(enumerate(loop_order))):
if index != 'x': if index != 'x':
jump = " - ".join(["%(var)s_stride%(index)s" % locals()] + adjust) # Initialize the variables associated to the jth loop
# jump = stride - adjust
jump = "(%s) - (%s)" % ("%(var)s_stride%(index)s" % locals(), adjust)
init += """ init += """
%(var)s_n%(index)s = %(var)s->dimensions[%(index)s]; %(var)s_n%(index)s = %(var)s->dimensions[%(index)s];
%(var)s_stride%(index)s = %(var)s->strides[%(index)s] / sizeof(%(dtype)s); %(var)s_stride%(index)s = %(var)s->strides[%(index)s] / sizeof(%(dtype)s);
...@@ -47,36 +73,20 @@ def make_checks(loop_orders, dtypes, sub): ...@@ -47,36 +73,20 @@ def make_checks(loop_orders, dtypes, sub):
//printf("%(var)s_jump%(index)s_%(j)s is:"); //printf("%(var)s_jump%(index)s_%(j)s is:");
//std::cout << %(var)s_jump%(index)s_%(j)s << std::endl; //std::cout << %(var)s_jump%(index)s_%(j)s << std::endl;
""" % locals() """ % locals()
adjust = ["%(var)s_n%(index)s*%(var)s_stride%(index)s" % locals()] adjust = "%(var)s_n%(index)s*%(var)s_stride%(index)s" % locals()
else: else:
jump = " - ".join(["0"] + adjust) jump = "-(%s)" % adjust
init += """ init += """
%(var)s_jump%(index)s_%(j)s = %(jump)s; %(var)s_jump%(index)s_%(j)s = %(jump)s;
//printf("%(var)s_jump%(index)s_%(j)s is:"); //printf("%(var)s_jump%(index)s_%(j)s is:");
//std::cout << %(var)s_jump%(index)s_%(j)s << std::endl; //std::cout << %(var)s_jump%(index)s_%(j)s << std::endl;
""" % locals() """ % locals()
adjust = [] adjust = "0"
check = "" check = ""
if 0:
# original dimension-checking loop builds a single if condition, and if it is true, it # This loop builds multiple if conditions to verify that the
# raises a generic error message # dimensions of the inputs match, and the first one that is true
for matches in zip(*loop_orders): # raises an informative error message
to_compare = [(j, x) for j, x in enumerate(matches) if x != "x"]
if len(to_compare) < 2:
continue
j, x = to_compare[0]
first = "%%(lv%(j)s)s_n%(x)s" % locals()
cond = " || ".join(["%(first)s != %%(lv%(j)s)s_n%(x)s" % locals() for j, x in to_compare[1:]])
if cond:
check += """
if (%(cond)s) {
PyErr_SetString(PyExc_ValueError, "Input dimensions do not match (Try re-running with py linker for more information).");
%%(fail)s
}
""" % locals()
else:
# revised dimension-checking loop build multiple if conditions, and the first one that
# is true raises a more informative error message
for matches in zip(*loop_orders): for matches in zip(*loop_orders):
to_compare = [(j, x) for j, x in enumerate(matches) if x != "x"] to_compare = [(j, x) for j, x in enumerate(matches) if x != "x"]
...@@ -99,12 +109,22 @@ def make_checks(loop_orders, dtypes, sub): ...@@ -99,12 +109,22 @@ def make_checks(loop_orders, dtypes, sub):
%%(fail)s %%(fail)s
} }
""" % locals() """ % locals()
return init % sub + check % sub return init % sub + check % sub
def make_alloc(loop_orders, dtype, sub): def make_alloc(loop_orders, dtype, sub):
"""
Generate C code to allocate outputs.
"""
nd = len(loop_orders[0]) nd = len(loop_orders[0])
init_dims = "" init_dims = ""
# For each dimension, the tensors are either all broadcasted, in
# which case the output will also be broadcastable (dimension =
# 1), or one or more are not broadcasted, in which case the number
# of elements of the output in that dimension will be equal to the
# number of elements of any of them.
for i, candidates in enumerate(zip(*loop_orders)): for i, candidates in enumerate(zip(*loop_orders)):
for j, candidate in enumerate(candidates): for j, candidate in enumerate(candidates):
if candidate != 'x': if candidate != 'x':
...@@ -114,6 +134,12 @@ def make_alloc(loop_orders, dtype, sub): ...@@ -114,6 +134,12 @@ def make_alloc(loop_orders, dtype, sub):
else: else:
init_dims += "dims[%(i)s] = 1;\n" % locals() init_dims += "dims[%(i)s] = 1;\n" % locals()
#raise Exception("For each looping dimension, at least one input must have a non-broadcastable dimension.") #raise Exception("For each looping dimension, at least one input must have a non-broadcastable dimension.")
# TODO: it would be interesting to allocate the output in such a
# way that its contiguous dimensions match one of the input's
# contiguous dimensions, or the dimension with the smallest
# stride. Right now, it is allocated to be C_CONTIGUOUS.
return """ return """
{ {
npy_intp dims[%(nd)s]; npy_intp dims[%(nd)s];
...@@ -155,9 +181,12 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub): ...@@ -155,9 +181,12 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub):
an alias of the entry of that rank. an alias of the entry of that rank.
@type loop_tasks: list of M+1 pieces of code. @type loop_tasks: list of M+1 pieces of code.
@param loop_tasks: The ith loop_task is code @param loop_tasks: The ith loop_task is a pair of strings, the first
to be executed just before going to the next element of the string is code to be executed before the ith loop starts, the second
ith dimension. The last is code to be executed at the very end. one contains code to be executed just before going to the next element
of the ith dimension.
The last element of loop_tasks is a single string, containing code
to be executed at the very end.
@type sub: a dictionary. @type sub: a dictionary.
@param sub: Maps 'lv#' to a suitable variable name. @param sub: Maps 'lv#' to a suitable variable name.
...@@ -203,6 +232,177 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub): ...@@ -203,6 +232,177 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub):
return "{%s}" % s return "{%s}" % s
def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
    """A bit like make_loop, but where only the inner-most loop executes code.

    All the loops will be reordered so that the loops over the output tensor
    are executed with memory access as contiguous as possible.
    For instance, if the output tensor is c_contiguous, the inner-most loop
    will be on its rows; if it's f_contiguous, it will be on its columns.

    :param init_loop_orders: one entry per loop variable; each entry is a
        list of dimension indices (or 'x' at broadcasted positions), in the
        initial (unsorted) loop order.
    :param olv_index: index, among the loop variables, of the output tensor
        whose strides determine the new loop ordering.
    :param dtypes: list of C dtype names, one per loop variable.
    :param inner_task: C code to be executed in the inner-most loop.
    :param sub: dictionary mapping 'lv#' to a suitable C variable name.
    :returns: a string of C code implementing the reordered nested loops.
    """
    # Number of variables
    nvars = len(init_loop_orders)
    # Number of loops (dimensionality of the variables)
    nnested = len(init_loop_orders[0])

    # This is the var from which we'll get the loop order
    ovar = sub['lv%i' % olv_index]

    # The loops are ordered by (decreasing) absolute values of ovar's strides.
    # The first element of each pair is the absolute value of the stride.
    # The second element corresponds to the index in the initial loop order.
    order_loops = """
    std::vector< std::pair<int, int> > %(ovar)s_loops(%(nnested)i);
    std::vector< std::pair<int, int> >::iterator %(ovar)s_loops_it = %(ovar)s_loops.begin();
    """ % locals()

    # Fill the loop vector with the appropriate <stride, index> pairs
    for i, index in enumerate(init_loop_orders[olv_index]):
        if index != 'x':
            order_loops += """
    %(ovar)s_loops_it->first = abs(%(ovar)s->strides[%(index)i]);
    """ % locals()
        else:
            # Stride is 0 when dimension is broadcastable
            order_loops += """
    %(ovar)s_loops_it->first = 0;
    """ % locals()
        order_loops += """
    %(ovar)s_loops_it->second = %(i)i;
    ++%(ovar)s_loops_it;
    """ % locals()

    # We sort in decreasing order so that the outermost loop (loop 0)
    # has the largest stride, and the innermost loop (nnested - 1) has
    # the smallest stride.
    order_loops += """
    // rbegin and rend are reversed iterators, so this sorts in decreasing order
    std::sort(%(ovar)s_loops.rbegin(), %(ovar)s_loops.rend());
    """ % locals()

    # Get the (sorted) total number of iterations of each loop.
    # First, get totals in the initial order.
    # For each dimension, the tensors are either all broadcasted, in
    # which case there is only one iteration of the loop, or one or
    # more are not broadcasted, in which case the number of elements
    # of any of them will be equal to the number of iterations we have
    # to do.
    totals = []
    for i, candidates in enumerate(zip(*init_loop_orders)):
        for j, candidate in enumerate(candidates):
            if candidate != 'x':
                var = sub['lv%i' % j]
                total = "%(var)s_n%(candidate)s" % locals()
                break
        else:
            # All variables are broadcasted in this dimension
            total = '1'
        totals.append(total)

    declare_totals = """
    int init_totals[%(nnested)s] = {%(totals)s};
    """ % dict(nnested=nnested,
               totals=', '.join(totals))

    # Sort totals to match the new order that was computed by sorting
    # the loop vector. One integer variable per loop is declared.
    declare_totals += """
    %(ovar)s_loops_it = %(ovar)s_loops.begin();
    """ % locals()
    for i in range(nnested):
        declare_totals += """
    int TOTAL_%(i)i = init_totals[%(ovar)s_loops_it->second];
    ++%(ovar)s_loops_it;
    """ % locals()

    # Get sorted strides and jumps.
    # First, get strides in the initial order.
    def get_loop_strides(loop_order, i):
        """Return a list containing a C expression representing the
        stride for each dimension of the ith variable, in the
        specified loop_order."""
        var = sub["lv%i" % i]
        r = []
        for index in loop_order:
            # Note: the stride variable is not declared for broadcasted
            # variables
            if index != 'x':
                r.append("%(var)s_stride%(index)s" % locals())
            else:
                r.append('0')
        return r

    # We declare the initial strides as a 2D array, nvars x nnested
    declare_strides_jumps = """
    int init_strides[%(nvars)i][%(nnested)i] = {
    %(strides)s
    };""" % dict(
        nvars=nvars,
        nnested=nnested,
        strides=', \n'.join(
            ', '.join(get_loop_strides(lo, i))
            for i, lo in enumerate(init_loop_orders)
            if len(lo) > 0))

    # Declare (sorted) stride and jumps for each variable;
    # we iterate from innermost loop to outermost loop
    declare_strides_jumps += """
    std::vector< std::pair<int, int> >::reverse_iterator %(ovar)s_loops_rit;
    """ % locals()
    for i in range(nvars):
        var = sub["lv%i" % i]
        declare_strides_jumps += """
    %(ovar)s_loops_rit = %(ovar)s_loops.rbegin();""" % locals()
        # adjust is the total displacement performed by the loop inside
        # the current one; the jump resets it, then applies this loop's
        # stride (jump = stride - adjust)
        adjust = "0"
        for j in reversed(range(nnested)):
            jump = "(%s) - (%s)" % ("%(var)s_stride_l%(j)i" % locals(),
                                    adjust)
            declare_strides_jumps += """
    int %(var)s_stride_l%(j)i = init_strides[%(i)i][%(ovar)s_loops_rit->second];
    int %(var)s_jump_l%(j)i = %(jump)s;
    ++%(ovar)s_loops_rit;
    """ % locals()
            adjust = "TOTAL_%(j)i * %(var)s_stride_l%(j)i" % locals()

    # Make each iteration pointer start at the beginning of its data
    declare_iter = ""
    for i, dtype in enumerate(dtypes):
        var = sub["lv%i" % i]
        declare_iter += "%(var)s_iter = (%(dtype)s*)(%(var)s->data);\n" % locals()

    # Build the nested loops from the inside out, advancing every
    # iteration pointer by its jump at the end of each loop body
    loop = inner_task
    for i in reversed(range(nnested)):
        iterv = 'ITER_%i' % i
        total = 'TOTAL_%i' % i
        update = ''
        for j in range(nvars):
            var = sub["lv%i" % j]
            update += "%(var)s_iter += %(var)s_jump_l%(i)i;\n" % locals()
        loop = """
    for (int %(iterv)s = %(total)s; %(iterv)s; %(iterv)s--)
    { // begin loop %(i)i
    %(loop)s
    %(update)s
    } // end loop %(i)i
    """ % locals()

    return '\n'.join([
        '{',
        order_loops,
        declare_totals,
        declare_strides_jumps,
        declare_iter,
        loop,
        '}\n',
    ])
# print make_declare(((0, 1, 2, 3), ('x', 1, 0, 3), ('x', 'x', 'x', 0)), # print make_declare(((0, 1, 2, 3), ('x', 1, 0, 3), ('x', 'x', 'x', 0)),
# ('double', 'int', 'float'), # ('double', 'int', 'float'),
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论