提交 375b82ef authored 作者: AlOa's avatar AlOa

Add OpenMP to the elemwise non-contiguous case

上级 0a92ffef
......@@ -1029,14 +1029,6 @@ class Elemwise(OpenMPOp):
# which is allocated, OR, if there are any aliased outputs,
# the index of the last of these aliased outputs.
# We declare the scalar variables used in the inner loop to do
# the element-wise computation. Aliased scalar variables need
# not be declared, as they are #defined in defines
task_decl = "".join([
"%s& %s_i = *%s_iter;\n" % (dtype, name, name)
for name, dtype in izip(inames + list(real_onames),
idtypes + list(real_odtypes))])
# We generate the C code of the inner loop using the scalar op
task_code = self.scalar_op.c_code(
Apply(self.scalar_op,
......@@ -1051,7 +1043,6 @@ class Elemwise(OpenMPOp):
code = """
{
%(defines)s
%(task_decl)s
%(task_code)s
%(undefs)s
}
......@@ -1069,14 +1060,14 @@ class Elemwise(OpenMPOp):
loop_orders=orders + [range(nnested)] * len(real_onames),
dtypes=(idtypes + list(real_odtypes)),
loop_tasks=all_code,
sub=sub)
sub=sub, reduce=False, openmp=self.openmp)
else:
loop = cgen.make_reordered_loop(
init_loop_orders=orders + [range(nnested)] * len(real_onames),
olv_index=olv_index,
dtypes=(idtypes + list(real_odtypes)),
inner_task=code,
sub=sub)
sub=sub, openmp=self.openmp)
# If all inputs and outputs are contiguous
# and the scalar op define optimized code for that case
......@@ -1562,7 +1553,7 @@ for(int i=0;i<PyArray_NDIM(%(iname)s);i++){
all_code = [task0_decl + code1]
loop = cgen.make_loop(
[order, range(nnested) + ['x'] * len(axis)],
[idtype, adtype], all_code, sub)
[idtype, adtype], all_code, sub, reduce=True)
end = ""
if adtype != odtype:
......
import theano
def make_declare(loop_orders, dtypes, sub):
......@@ -171,7 +172,7 @@ def make_alloc(loop_orders, dtype, sub, fortran='0'):
""" % dict(locals(), **sub)
def make_loop(loop_orders, dtypes, loop_tasks, sub):
def make_loop(loop_orders, dtypes, loop_tasks, sub, reduce=False, openmp=None):
"""
Make a nested loop over several arrays and associate specific code
to each level of nesting.
......@@ -195,9 +196,37 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub):
@type sub: a dictionary.
@param sub: Maps 'lv#' to a suitable variable name.
The 'lvi' variable corresponds to the ith element of loop_orders.
@type reduce: boolean
@param reduce: True if this function is called from CAReduce,
False if it is called from Elemwise — in the Elemwise case the
code must be rearranged to make OpenMP parallelization possible.
"""
def loop_over_elemwise(preloop, code, indices, i):
    """Emit the C code for one nesting level of an elemwise loop.

    Unlike the reduce variant, each iteration computes its scalar
    references (``<var>_i``) directly from the loop counter instead of
    advancing iterator pointers, so iterations are independent and the
    loop can carry an OpenMP ``parallel for`` pragma when enabled.
    """
    iterv = 'ITER_%i' % i
    # Build the per-variable scalar aliases; any non-broadcasted
    # ('x') dimension supplies the trip count for this level.
    suitable_n = "1"
    alias_lines = []
    for j, index in enumerate(indices):
        var = sub['lv%i' % j]
        dtype = dtypes[j]
        alias_lines.append(
            "%(dtype)s &%(var)s_i = * ( %(var)s_iter + %(iterv)s * %(var)s_jump%(index)s_%(i)s );\n"
            % locals())
        if index != 'x':
            suitable_n = "%(var)s_n%(index)s" % locals()
    update = "".join(alias_lines)
    # Prefix the loop with an OpenMP pragma, guarded by a size
    # threshold so tiny loops are not parallelized.
    if openmp:
        openmp_minsize = theano.config.openmp_minsize
        pragma = """#pragma omp parallel for if( %(suitable_n)s >=%(openmp_minsize)s)\n""" % locals()
    else:
        pragma = ""
    forloop = pragma + (
        """for (int %(iterv)s = 0; %(iterv)s<%(suitable_n)s; %(iterv)s++)"""
        % locals())
    return """
%(preloop)s
%(forloop)s {
%(update)s
%(code)s
}
""" % locals()
def loop_over(preloop, code, indices, i):
def loop_over_reduce(preloop, code, indices, i):
iterv = 'ITER_%i' % i
update = ""
suitable_n = "1"
......@@ -229,6 +258,11 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub):
s = preloops.get(0, "")
else:
s = ""
if reduce:
loop_over = loop_over_reduce
else:
loop_over = loop_over_elemwise
for i, (pre_task, task), indices in reversed(zip(xrange(len(loop_tasks) - 1), loop_tasks, zip(*loop_orders))):
s = loop_over(preloops.get(i, "") + pre_task, s + task, indices, i)
......@@ -236,7 +270,7 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub):
return "{%s}" % s
def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub, openmp=None):
'''A bit like make_loop, but when only the inner-most loop executes code.
All the loops will be reordered so that the loops over the output tensor
......@@ -325,7 +359,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
++%(ovar)s_loops_it;
""" % locals()
## Get sorted strides and jumps
## Get sorted strides
# Get strides in the initial order
def get_loop_strides(loop_order, i):
"""
......@@ -344,7 +378,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
return r
# We declare the initial strides as a 2D array, nvars x nnested
declare_strides_jumps = """
declare_strides = """
int init_strides[%(nvars)i][%(nnested)i] = {
%(strides)s
};""" % dict(
......@@ -355,46 +389,57 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
for i, lo in enumerate(init_loop_orders)
if len(lo)>0))
# Declare (sorted) stride and jumps for each variable
# Declare (sorted) strides for each variable
# we iterate from innermost loop to outermost loop
declare_strides_jumps += """
declare_strides += """
std::vector< std::pair<int, int> >::reverse_iterator %(ovar)s_loops_rit;
""" % locals()
for i in xrange(nvars):
var = sub["lv%i" % i]
declare_strides_jumps += """
declare_strides += """
%(ovar)s_loops_rit = %(ovar)s_loops.rbegin();""" % locals()
adjust = "0"
for j in reversed(range(nnested)):
jump = "(%s) - (%s)" % ("%(var)s_stride_l%(j)i" % locals(), adjust)
declare_strides_jumps +="""
declare_strides += """
int %(var)s_stride_l%(j)i = init_strides[%(i)i][%(ovar)s_loops_rit->second];
int %(var)s_jump_l%(j)i = %(jump)s;
++%(ovar)s_loops_rit;
""" % locals()
adjust = "TOTAL_%(j)i * %(var)s_stride_l%(j)i" % locals()
declare_iter = ""
for i, dtype in enumerate(dtypes):
var = sub["lv%i" % i]
declare_iter += "%(var)s_iter = (%(dtype)s*)(PyArray_DATA(%(var)s));\n" % locals()
pointer_update = ''
for j in xrange(nvars):
var = sub["lv%i" % j]
pointer_update += "%(dtype)s &%(var)s_i = * ( %(var)s_iter"%locals()
tot_jump = ''
for i in reversed(range(nnested)):
iterv = 'ITER_%i' % i
pointer_update += "+%(var)s_stride_l%(i)i*%(iterv)s" % locals()
pointer_update += ");\n"
loop = inner_task
for i in reversed(range(nnested)):
iterv = 'ITER_%i' % i
total = 'TOTAL_%i' % i
update = ''
for j in xrange(nvars):
var = sub["lv%i" % j]
update += "%(var)s_iter += %(var)s_jump_l%(i)i;\n" % locals()
forloop = ''
# The pointers are defined only in the most inner loop
if i == nnested-1:
update = pointer_update
if i == 0:
if openmp:
openmp_minsize = theano.config.openmp_minsize
forloop += """#pragma omp parallel for if( %(total)s >=%(openmp_minsize)s)\n""" % locals()
forloop += "for(int %(iterv)s = 0; %(iterv)s<%(total)s; %(iterv)s++)" % locals()
loop = """
for (int %(iterv)s = %(total)s; %(iterv)s; %(iterv)s--)
%(forloop)s
{ // begin loop %(i)i
%(loop)s
%(update)s
%(loop)s
} // end loop %(i)i
""" % locals()
......@@ -402,7 +447,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
'{',
order_loops,
declare_totals,
declare_strides_jumps,
declare_strides,
declare_iter,
loop,
'}\n',
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论