提交 375b82ef authored 作者: AlOa's avatar AlOa

Add openmp to elemwise non contiguous case

上级 0a92ffef
...@@ -1029,14 +1029,6 @@ class Elemwise(OpenMPOp): ...@@ -1029,14 +1029,6 @@ class Elemwise(OpenMPOp):
# which is allocated, OR, if there are any aliased outputs, # which is allocated, OR, if there are any aliased outputs,
# the index of the last of these aliased outputs. # the index of the last of these aliased outputs.
# We declare the scalar variables used in the inner loop to do
# the element-wise computation. Aliased scalar variables need
# not be declared, as they are #defined in defines
task_decl = "".join([
"%s& %s_i = *%s_iter;\n" % (dtype, name, name)
for name, dtype in izip(inames + list(real_onames),
idtypes + list(real_odtypes))])
# We generate the C code of the inner loop using the scalar op # We generate the C code of the inner loop using the scalar op
task_code = self.scalar_op.c_code( task_code = self.scalar_op.c_code(
Apply(self.scalar_op, Apply(self.scalar_op,
...@@ -1051,7 +1043,6 @@ class Elemwise(OpenMPOp): ...@@ -1051,7 +1043,6 @@ class Elemwise(OpenMPOp):
code = """ code = """
{ {
%(defines)s %(defines)s
%(task_decl)s
%(task_code)s %(task_code)s
%(undefs)s %(undefs)s
} }
...@@ -1069,14 +1060,14 @@ class Elemwise(OpenMPOp): ...@@ -1069,14 +1060,14 @@ class Elemwise(OpenMPOp):
loop_orders=orders + [range(nnested)] * len(real_onames), loop_orders=orders + [range(nnested)] * len(real_onames),
dtypes=(idtypes + list(real_odtypes)), dtypes=(idtypes + list(real_odtypes)),
loop_tasks=all_code, loop_tasks=all_code,
sub=sub) sub=sub, reduce=False, openmp=self.openmp)
else: else:
loop = cgen.make_reordered_loop( loop = cgen.make_reordered_loop(
init_loop_orders=orders + [range(nnested)] * len(real_onames), init_loop_orders=orders + [range(nnested)] * len(real_onames),
olv_index=olv_index, olv_index=olv_index,
dtypes=(idtypes + list(real_odtypes)), dtypes=(idtypes + list(real_odtypes)),
inner_task=code, inner_task=code,
sub=sub) sub=sub, openmp=self.openmp)
# If all inputs and outputs are contiguous # If all inputs and outputs are contiguous
# and the scalar op define optimized code for that case # and the scalar op define optimized code for that case
...@@ -1562,7 +1553,7 @@ for(int i=0;i<PyArray_NDIM(%(iname)s);i++){ ...@@ -1562,7 +1553,7 @@ for(int i=0;i<PyArray_NDIM(%(iname)s);i++){
all_code = [task0_decl + code1] all_code = [task0_decl + code1]
loop = cgen.make_loop( loop = cgen.make_loop(
[order, range(nnested) + ['x'] * len(axis)], [order, range(nnested) + ['x'] * len(axis)],
[idtype, adtype], all_code, sub) [idtype, adtype], all_code, sub, reduce=True)
end = "" end = ""
if adtype != odtype: if adtype != odtype:
......
import theano
def make_declare(loop_orders, dtypes, sub): def make_declare(loop_orders, dtypes, sub):
...@@ -171,7 +172,7 @@ def make_alloc(loop_orders, dtype, sub, fortran='0'): ...@@ -171,7 +172,7 @@ def make_alloc(loop_orders, dtype, sub, fortran='0'):
""" % dict(locals(), **sub) """ % dict(locals(), **sub)
def make_loop(loop_orders, dtypes, loop_tasks, sub): def make_loop(loop_orders, dtypes, loop_tasks, sub, reduce=False, openmp=None):
""" """
Make a nested loop over several arrays and associate specific code Make a nested loop over several arrays and associate specific code
to each level of nesting. to each level of nesting.
...@@ -195,9 +196,37 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub): ...@@ -195,9 +196,37 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub):
@type sub: a dictionary. @type sub: a dictionary.
@param sub: Maps 'lv#' to a suitable variable name. @param sub: Maps 'lv#' to a suitable variable name.
The 'lvi' variable corresponds to the ith element of loop_orders. The 'lvi' variable corresponds to the ith element of loop_orders.
@type reduce: boolean
@param reduce: True if this function is called from CAReduce,
False if it is called from Elemwise, because in Elemwise the
generated code must be rearranged in order to use OpenMP.
""" """
def loop_over_elemwise(preloop, code, indices, i):
    """Wrap ``code`` in the C ``for`` loop of nesting level ``i``.

    This is the variant selected when ``reduce`` is false.

    For every entry of ``indices`` a C reference ``<var>_i`` is declared
    at the top of the loop body; it is computed from ``<var>_iter``, the
    loop counter ``ITER_<i>`` and the ``<var>_jump<index>_<i>`` value.
    The loop bound is ``<var>_n<index>`` of the last variable whose index
    is not ``'x'``, or ``"1"`` when every index is ``'x'``.

    When ``openmp`` is true, the loop is preceded by an
    ``#pragma omp parallel for`` directive guarded by
    ``theano.config.openmp_minsize``.

    ``sub``, ``dtypes``, ``openmp`` and ``theano`` come from the
    enclosing scope.

    @param preloop: C code emitted before the loop.
    @param code: C code emitted inside the loop, after the declarations.
    @param indices: one index (or 'x') per variable for this level.
    @param i: nesting level, used to name the loop counter.
    @return: the generated C code as a string.
    """
    iterv = 'ITER_%i' % i
    declarations = []
    bound = "1"
    for position, index in enumerate(indices):
        name = sub['lv%i' % position]
        fields = dict(dtype=dtypes[position], var=name, iterv=iterv,
                      index=index, i=i)
        declarations.append(
            "%(dtype)s &%(var)s_i = * ( %(var)s_iter + %(iterv)s * %(var)s_jump%(index)s_%(i)s );\n" % fields)
        if index != 'x':
            # The last non-broadcast variable decides the trip count.
            bound = "%(var)s_n%(index)s" % fields

    header = ""
    if openmp:
        header = """#pragma omp parallel for if( %(suitable_n)s >=%(openmp_minsize)s)\n""" % dict(
            suitable_n=bound,
            openmp_minsize=theano.config.openmp_minsize)
    header += """for (int %(iterv)s = 0; %(iterv)s<%(suitable_n)s; %(iterv)s++)""" % dict(
        iterv=iterv, suitable_n=bound)

    pieces = dict(preloop=preloop, forloop=header,
                  update="".join(declarations), code=code)
    return """
%(preloop)s
%(forloop)s {
%(update)s
%(code)s
}
""" % pieces
def loop_over(preloop, code, indices, i): def loop_over_reduce(preloop, code, indices, i):
iterv = 'ITER_%i' % i iterv = 'ITER_%i' % i
update = "" update = ""
suitable_n = "1" suitable_n = "1"
...@@ -229,6 +258,11 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub): ...@@ -229,6 +258,11 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub):
s = preloops.get(0, "") s = preloops.get(0, "")
else: else:
s = "" s = ""
if reduce:
loop_over = loop_over_reduce
else:
loop_over = loop_over_elemwise
for i, (pre_task, task), indices in reversed(zip(xrange(len(loop_tasks) - 1), loop_tasks, zip(*loop_orders))): for i, (pre_task, task), indices in reversed(zip(xrange(len(loop_tasks) - 1), loop_tasks, zip(*loop_orders))):
s = loop_over(preloops.get(i, "") + pre_task, s + task, indices, i) s = loop_over(preloops.get(i, "") + pre_task, s + task, indices, i)
...@@ -236,7 +270,7 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub): ...@@ -236,7 +270,7 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub):
return "{%s}" % s return "{%s}" % s
def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub): def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub, openmp=None):
'''A bit like make_loop, but when only the inner-most loop executes code. '''A bit like make_loop, but when only the inner-most loop executes code.
All the loops will be reordered so that the loops over the output tensor All the loops will be reordered so that the loops over the output tensor
...@@ -325,7 +359,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub): ...@@ -325,7 +359,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
++%(ovar)s_loops_it; ++%(ovar)s_loops_it;
""" % locals() """ % locals()
## Get sorted strides and jumps ## Get sorted strides
# Get strides in the initial order # Get strides in the initial order
def get_loop_strides(loop_order, i): def get_loop_strides(loop_order, i):
""" """
...@@ -344,7 +378,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub): ...@@ -344,7 +378,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
return r return r
# We declare the initial strides as a 2D array, nvars x nnested # We declare the initial strides as a 2D array, nvars x nnested
declare_strides_jumps = """ declare_strides = """
int init_strides[%(nvars)i][%(nnested)i] = { int init_strides[%(nvars)i][%(nnested)i] = {
%(strides)s %(strides)s
};""" % dict( };""" % dict(
...@@ -355,46 +389,57 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub): ...@@ -355,46 +389,57 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
for i, lo in enumerate(init_loop_orders) for i, lo in enumerate(init_loop_orders)
if len(lo)>0)) if len(lo)>0))
# Declare (sorted) stride and jumps for each variable # Declare (sorted) strides for each variable
# we iterate from innermost loop to outermost loop # we iterate from innermost loop to outermost loop
declare_strides_jumps += """ declare_strides += """
std::vector< std::pair<int, int> >::reverse_iterator %(ovar)s_loops_rit; std::vector< std::pair<int, int> >::reverse_iterator %(ovar)s_loops_rit;
""" % locals() """ % locals()
for i in xrange(nvars): for i in xrange(nvars):
var = sub["lv%i" % i] var = sub["lv%i" % i]
declare_strides_jumps += """ declare_strides += """
%(ovar)s_loops_rit = %(ovar)s_loops.rbegin();""" % locals() %(ovar)s_loops_rit = %(ovar)s_loops.rbegin();""" % locals()
adjust = "0"
for j in reversed(range(nnested)): for j in reversed(range(nnested)):
jump = "(%s) - (%s)" % ("%(var)s_stride_l%(j)i" % locals(), adjust) declare_strides += """
declare_strides_jumps +="""
int %(var)s_stride_l%(j)i = init_strides[%(i)i][%(ovar)s_loops_rit->second]; int %(var)s_stride_l%(j)i = init_strides[%(i)i][%(ovar)s_loops_rit->second];
int %(var)s_jump_l%(j)i = %(jump)s;
++%(ovar)s_loops_rit; ++%(ovar)s_loops_rit;
""" % locals() """ % locals()
adjust = "TOTAL_%(j)i * %(var)s_stride_l%(j)i" % locals()
declare_iter = "" declare_iter = ""
for i, dtype in enumerate(dtypes): for i, dtype in enumerate(dtypes):
var = sub["lv%i" % i] var = sub["lv%i" % i]
declare_iter += "%(var)s_iter = (%(dtype)s*)(PyArray_DATA(%(var)s));\n" % locals() declare_iter += "%(var)s_iter = (%(dtype)s*)(PyArray_DATA(%(var)s));\n" % locals()
# Build the C declarations placed at the top of the innermost loop body:
# for each variable, a reference `<var>_i` to the current element, computed
# as `<var>_iter` plus the sum over all nesting levels of
# `<var>_stride_l<level> * ITER_<level>`.
pointer_update = ''
for j in xrange(nvars):
    var = sub["lv%i" % j]
    # Use this variable's own dtype. Previously `dtype` was whatever the
    # preceding `enumerate(dtypes)` loop left behind, so every reference
    # was declared with the last dtype in `dtypes`.
    # NOTE(review): assumes len(dtypes) == nvars -- confirm against caller.
    dtype = dtypes[j]
    pointer_update += "%(dtype)s &%(var)s_i = * ( %(var)s_iter" % locals()
    # NOTE(review): `tot_jump` is not read in the visible code; kept in case
    # code outside this view depends on it.
    tot_jump = ''
    for i in reversed(range(nnested)):
        iterv = 'ITER_%i' % i
        pointer_update += "+%(var)s_stride_l%(i)i*%(iterv)s" % locals()
    pointer_update += ");\n"
loop = inner_task loop = inner_task
for i in reversed(range(nnested)): for i in reversed(range(nnested)):
iterv = 'ITER_%i' % i iterv = 'ITER_%i' % i
total = 'TOTAL_%i' % i total = 'TOTAL_%i' % i
update = '' update = ''
for j in xrange(nvars): forloop = ''
var = sub["lv%i" % j] # The pointers are defined only in the most inner loop
update += "%(var)s_iter += %(var)s_jump_l%(i)i;\n" % locals() if i == nnested-1:
update = pointer_update
if i == 0:
if openmp:
openmp_minsize = theano.config.openmp_minsize
forloop += """#pragma omp parallel for if( %(total)s >=%(openmp_minsize)s)\n""" % locals()
forloop += "for(int %(iterv)s = 0; %(iterv)s<%(total)s; %(iterv)s++)" % locals()
loop = """ loop = """
for (int %(iterv)s = %(total)s; %(iterv)s; %(iterv)s--) %(forloop)s
{ // begin loop %(i)i { // begin loop %(i)i
%(loop)s
%(update)s %(update)s
%(loop)s
} // end loop %(i)i } // end loop %(i)i
""" % locals() """ % locals()
...@@ -402,7 +447,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub): ...@@ -402,7 +447,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
'{', '{',
order_loops, order_loops,
declare_totals, declare_totals,
declare_strides_jumps, declare_strides,
declare_iter, declare_iter,
loop, loop,
'}\n', '}\n',
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论