提交 2f4e666c authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #1690 from AlOa/elemwise_openmp

Elemwise openmp
...@@ -211,6 +211,13 @@ import theano and print the config variable, as in: ...@@ -211,6 +211,13 @@ import theano and print the config variable, as in:
The best is to define it via Theano configuration The best is to define it via Theano configuration
file or with the environment variable THEANO_FLAGS. file or with the environment variable THEANO_FLAGS.
.. attribute:: openmp_elemwise_minsize
Positive int value, default: 200000.
This specifies the minimum size of vectors for which elemwise ops
use OpenMP, if OpenMP is enabled.
.. attribute:: cast_policy .. attribute:: cast_policy
String value: either 'numpy+floatX' or 'custom' String value: either 'numpy+floatX' or 'custom'
......
...@@ -45,3 +45,4 @@ you out. ...@@ -45,3 +45,4 @@ you out.
extending_theano extending_theano
faq faq
python-memory-management python-memory-management
multi_cores
=============================
Multi cores support in Theano
=============================
Parallel element wise op with openmp
====================================
Because element-wise ops work on every tensor entry independently, they can be
easily parallelized using OpenMP.
To use OpenMP you must set the openmp flag in the Theano configuration.
You can use the flag openmp_elemwise_minsize to set the minimum tensor size
for which the operation is parallelized, because for short tensors using
OpenMP can slow down the operation.
If it is not specified, the default value (200000) is used.
For simple (fast) operations you can obtain a speedup for very long tensors,
while for more complex operations you can obtain a good speedup also for
shorter tensors.
There is a script (elemwise_openmp_speedup.py in theano/misc/) which you can
use to choose that value for your machine.
The script runs two elemwise operations (a fast and a slow one) on a vector of
size openmp_elemwise_minsize, with and without OpenMP, and shows the time
difference between the two cases.
...@@ -475,3 +475,11 @@ AddConfigVar('openmp', ...@@ -475,3 +475,11 @@ AddConfigVar('openmp',
BoolParam(default_openmp), BoolParam(default_openmp),
in_c_key=False, in_c_key=False,
) )
AddConfigVar('openmp_elemwise_minsize',
"If OpenMP is enable, this is the minimum size of vector "
"for which the openmp parallel for is enable."
"Used in element wise ops",
IntParam(200000),
in_c_key=False,
)
import os
import subprocess
import sys
from optparse import OptionParser
import theano
parser = OptionParser(usage='%prog <options>\n Compute time for'
' fast and slow elemwise operations')
parser.add_option('-N', '--N', action='store', dest='N',
default=theano.config.openmp_elemwise_minsize, type="int",
help="Number of vector element")
def runScript(N):
script = 'elemwise_time_test.py'
dir = os.path.dirname(os.path.abspath(__file__))
proc = subprocess.Popen(['python', script, '--script', '-N', str(N)],
stdout=subprocess.PIPE, stderr=subprocess.PIPE,
cwd=dir)
(out, err) = proc.communicate()
if err:
print err
sys.exit()
return map(float, out.split(" "))
if __name__ == '__main__':
options, arguments = parser.parse_args(sys.argv)
if hasattr(options, "help"):
print options.help
sys.exit(0)
orig_flags = os.environ.get('THEANO_FLAGS', '')
os.environ['THEANO_FLAGS'] = orig_flags + ',openmp=false'
(cheapTime, costlyTime) = runScript(N=options.N)
os.environ['THEANO_FLAGS'] = orig_flags + ',openmp=true'
(cheapTimeOpenmp, costlyTimeOpenmp) = runScript(N=options.N)
if cheapTime > cheapTimeOpenmp:
cheapSpeed = cheapTime / cheapTimeOpenmp
cheapSpeedstring = "speedup"
else:
cheapSpeed = cheapTimeOpenmp / cheapTime
cheapSpeedstring = "slowdown"
if costlyTime > costlyTimeOpenmp:
costlySpeed = costlyTime / costlyTimeOpenmp
costlySpeedstring = "speedup"
else:
costlySpeed = costlyTimeOpenmp / costlyTime
costlySpeedstring = "slowdown"
print "Fast op time without openmp %fs with openmp %fs %s %2.2f" % (cheapTime, cheapTimeOpenmp, cheapSpeedstring, cheapSpeed)
print "Slow op time without openmp %fs with openmp %fs %s %2.2f" % (costlyTime, costlyTimeOpenmp, costlySpeedstring, costlySpeed)
from optparse import OptionParser
import sys
import time
import numpy as np
import theano
import theano.tensor as T
parser = OptionParser(usage='%prog <options>\n Compute time for'
' fast and slow elemwise operations')
parser.add_option('-N', '--N', action='store', dest='N',
default=theano.config.openmp_elemwise_minsize, type="int",
help="Number of vector element")
parser.add_option('--script', action='store_true', dest='script',
default=False,
help="Run program as script and print results on stdoutput")
def evalTime(f, v, script=False, loops=1000):
min = 1e10
for i in xrange(0, loops):
t0 = time.time()
f(v)
dt = time.time() - t0
min = dt if dt < min else min
if not script:
print ' run time in %d loops was %2.9f sec' % (loops, min)
return min
def ElemwiseOpTime(N, script=False, loops=1000):
x = T.vector('x')
np.random.seed(1235)
v = np.random.random(N).astype(theano.config.floatX)
f = theano.function([x], 2*x + x*x)
f1 = theano.function([x], T.tanh(x))
if not script:
if theano.config.openmp:
print "With openmp:"
print "Fast op ",
ceapTime = evalTime(f, v, script=script, loops=loops)
if not script:
print "Slow op ",
costlyTime = evalTime(f1, v, script=script, loops=loops)
return (ceapTime, costlyTime)
if __name__ == '__main__':
options, arguments = parser.parse_args(sys.argv)
if hasattr(options, "help"):
print options.help
sys.exit(0)
(cheapTime, costlyTime) = ElemwiseOpTime(N=options.N,
script=options.script)
if options.script:
sys.stdout.write("%2.9f %2.9f\n" % (cheapTime, costlyTime))
sys.stdout.flush()
...@@ -6,7 +6,7 @@ import numpy ...@@ -6,7 +6,7 @@ import numpy
import theano import theano
from theano import gof from theano import gof
from theano.gof import Apply, Op from theano.gof import Apply, Op, OpenMPOp
from theano import scalar from theano import scalar
from theano.scalar import Scalar, get_scalar_type from theano.scalar import Scalar, get_scalar_type
from theano.printing import pprint from theano.printing import pprint
...@@ -419,7 +419,7 @@ pprint.assign(lambda pstate, r: r.owner and isinstance(r.owner.op, DimShuffle), ...@@ -419,7 +419,7 @@ pprint.assign(lambda pstate, r: r.owner and isinstance(r.owner.op, DimShuffle),
### Elemwise ### ### Elemwise ###
################ ################
class Elemwise(Op): class Elemwise(OpenMPOp):
""" """
Generalizes a scalar op to tensors. Generalizes a scalar op to tensors.
...@@ -449,7 +449,7 @@ class Elemwise(Op): ...@@ -449,7 +449,7 @@ class Elemwise(Op):
""" """
def __init__(self, scalar_op, inplace_pattern=None, name=None, def __init__(self, scalar_op, inplace_pattern=None, name=None,
nfunc_spec=None): nfunc_spec=None, openmp=None):
""" """
Usage: Elemwise(scalar_op, inplace_pattern = {}) Usage: Elemwise(scalar_op, inplace_pattern = {})
...@@ -487,6 +487,7 @@ class Elemwise(Op): ...@@ -487,6 +487,7 @@ class Elemwise(Op):
#precompute the hash of this node #precompute the hash of this node
self._rehash() self._rehash()
super(Elemwise,self).__init__(openmp=openmp)
def __getstate__(self): def __getstate__(self):
d = copy(self.__dict__) d = copy(self.__dict__)
...@@ -1028,14 +1029,6 @@ class Elemwise(Op): ...@@ -1028,14 +1029,6 @@ class Elemwise(Op):
# which is allocated, OR, if there are any aliased outputs, # which is allocated, OR, if there are any aliased outputs,
# the index of the last of these aliased outputs. # the index of the last of these aliased outputs.
# We declare the scalar variables used in the inner loop to do
# the element-wise computation. Aliased scalar variables need
# not be declared, as they are #defined in defines
task_decl = "".join([
"%s& %s_i = *%s_iter;\n" % (dtype, name, name)
for name, dtype in izip(inames + list(real_onames),
idtypes + list(real_odtypes))])
# We generate the C code of the inner loop using the scalar op # We generate the C code of the inner loop using the scalar op
task_code = self.scalar_op.c_code( task_code = self.scalar_op.c_code(
Apply(self.scalar_op, Apply(self.scalar_op,
...@@ -1050,11 +1043,13 @@ class Elemwise(Op): ...@@ -1050,11 +1043,13 @@ class Elemwise(Op):
code = """ code = """
{ {
%(defines)s %(defines)s
%(task_decl)s
%(task_code)s %(task_code)s
%(undefs)s %(undefs)s
} }
""" % locals() """ % locals()
loop_orders = orders + [range(nnested)] * len(real_onames)
dtypes = (idtypes + list(real_odtypes))
if all([o.ndim <= 1 for o in node.outputs] or if all([o.ndim <= 1 for o in node.outputs] or
# Use simpler code when output ndim == 0 or 1 # Use simpler code when output ndim == 0 or 1
# or for broadcated scalar. # or for broadcated scalar.
...@@ -1063,19 +1058,47 @@ class Elemwise(Op): ...@@ -1063,19 +1058,47 @@ class Elemwise(Op):
all_code = [("", "")] * (nnested - 1) + [("", code)] + [""] all_code = [("", "")] * (nnested - 1) + [("", code)] + [""]
else: else:
all_code = [code] all_code = [code]
if len(all_code) == 1:
#No loops
task_decl = "".join([
"%s& %s_i = *%s_iter;\n" % (dtype, name, name)
for name, dtype in izip(inames + list(real_onames),
idtypes + list(real_odtypes))])
preloops = {}
for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)):
for j, index in enumerate(loop_order):
if index != 'x':
preloops.setdefault(j, "")
preloops[j] += ("%%(lv%(i)s)s_iter = (%(dtype)s*)(PyArray_DATA(%%(lv%(i)s)s));\n" % locals()) % sub
break
else: # all broadcastable
preloops.setdefault(0, "")
preloops[0] += ("%%(lv%(i)s)s_iter = (%(dtype)s*)(PyArray_DATA(%%(lv%(i)s)s));\n" % locals()) % sub
init_array = preloops.get(0, " ")
loop = """
{
%(defines)s
%(init_array)s
%(task_decl)s
%(task_code)s
%(undefs)s
}
""" % locals()
else:
loop = cgen.make_loop( loop = cgen.make_loop(
loop_orders=orders + [range(nnested)] * len(real_onames), loop_orders=loop_orders,
dtypes=(idtypes + list(real_odtypes)), dtypes=dtypes,
loop_tasks=all_code, loop_tasks=all_code,
sub=sub) sub=sub, openmp=self.openmp)
else: else:
loop = cgen.make_reordered_loop( loop = cgen.make_reordered_loop(
init_loop_orders=orders + [range(nnested)] * len(real_onames), init_loop_orders=loop_orders,
olv_index=olv_index, olv_index=olv_index,
dtypes=(idtypes + list(real_odtypes)), dtypes=dtypes,
inner_task=code, inner_task=code,
sub=sub) sub=sub, openmp=self.openmp)
# If all inputs and outputs are contiguous # If all inputs and outputs are contiguous
# and the scalar op define optimized code for that case # and the scalar op define optimized code for that case
...@@ -1117,7 +1140,8 @@ class Elemwise(Op): ...@@ -1117,7 +1140,8 @@ class Elemwise(Op):
contig += """ contig += """
dtype_%(x)s& %(x)s_i = ((dtype_%(x)s*) PyArray_DATA(%(x)s))[0]; dtype_%(x)s& %(x)s_i = ((dtype_%(x)s*) PyArray_DATA(%(x)s))[0];
""" % locals() """ % locals()
if self.openmp:
contig += """#pragma omp parallel for if(n>=%d)""" % (config.openmp_elemwise_minsize)
contig += """ contig += """
for(int i=0; i<n; i++){ for(int i=0; i<n; i++){
%(index)s %(index)s
...@@ -1166,6 +1190,7 @@ class Elemwise(Op): ...@@ -1166,6 +1190,7 @@ class Elemwise(Op):
version.append(self.scalar_op.c_code_cache_version_apply(scalar_node)) version.append(self.scalar_op.c_code_cache_version_apply(scalar_node))
for i in node.inputs + node.outputs: for i in node.inputs + node.outputs:
version.append(get_scalar_type(dtype=i.type.dtype).c_code_cache_version()) version.append(get_scalar_type(dtype=i.type.dtype).c_code_cache_version())
version.append(('openmp', self.openmp))
if all(version): if all(version):
return tuple(version) return tuple(version)
else: else:
...@@ -1557,7 +1582,7 @@ for(int i=0;i<PyArray_NDIM(%(iname)s);i++){ ...@@ -1557,7 +1582,7 @@ for(int i=0;i<PyArray_NDIM(%(iname)s);i++){
+ [("", code1), ""]) + [("", code1), ""])
else: else:
all_code = [task0_decl + code1] all_code = [task0_decl + code1]
loop = cgen.make_loop( loop = cgen.make_loop_careduce(
[order, range(nnested) + ['x'] * len(axis)], [order, range(nnested) + ['x'] * len(axis)],
[idtype, adtype], all_code, sub) [idtype, adtype], all_code, sub)
......
import theano
def make_declare(loop_orders, dtypes, sub): def make_declare(loop_orders, dtypes, sub):
...@@ -170,8 +171,7 @@ def make_alloc(loop_orders, dtype, sub, fortran='0'): ...@@ -170,8 +171,7 @@ def make_alloc(loop_orders, dtype, sub, fortran='0'):
} }
""" % dict(locals(), **sub) """ % dict(locals(), **sub)
def make_loop(loop_orders, dtypes, loop_tasks, sub, openmp=None):
def make_loop(loop_orders, dtypes, loop_tasks, sub):
""" """
Make a nested loop over several arrays and associate specific code Make a nested loop over several arrays and associate specific code
to each level of nesting. to each level of nesting.
...@@ -195,22 +195,29 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub): ...@@ -195,22 +195,29 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub):
@type sub: a dictionary. @type sub: a dictionary.
@param sub: Maps 'lv#' to a suitable variable name. @param sub: Maps 'lv#' to a suitable variable name.
The 'lvi' variable corresponds to the ith element of loop_orders. The 'lvi' variable corresponds to the ith element of loop_orders.
"""
"""
def loop_over(preloop, code, indices, i): def loop_over(preloop, code, indices, i):
iterv = 'ITER_%i' % i iterv = 'ITER_%i' % i
update = "" update = ""
suitable_n = "1" suitable_n = "1"
for j, index in enumerate(indices): for j, index in enumerate(indices):
var = sub['lv%i' % j] var = sub['lv%i' % j]
update += "%(var)s_iter += %(var)s_jump%(index)s_%(i)s;\n" % locals() dtype = dtypes[j]
update += "%(dtype)s &%(var)s_i = * ( %(var)s_iter + %(iterv)s * %(var)s_jump%(index)s_%(i)s );\n" % locals()
if index != 'x': if index != 'x':
suitable_n = "%(var)s_n%(index)s" % locals() suitable_n = "%(var)s_n%(index)s" % locals()
return """ if openmp:
openmp_elemwise_minsize = theano.config.openmp_elemwise_minsize
forloop = """#pragma omp parallel for if( %(suitable_n)s >=%(openmp_elemwise_minsize)s)\n""" % locals()
else:
forloop = ""
forloop += """for (int %(iterv)s = 0; %(iterv)s<%(suitable_n)s; %(iterv)s++)""" % locals()
return"""
%(preloop)s %(preloop)s
for (int %(iterv)s = %(suitable_n)s; %(iterv)s; %(iterv)s--) { %(forloop)s {
%(code)s
%(update)s %(update)s
%(code)s
} }
""" % locals() """ % locals()
...@@ -225,10 +232,8 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub): ...@@ -225,10 +232,8 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub):
preloops.setdefault(0, "") preloops.setdefault(0, "")
preloops[0] += ("%%(lv%(i)s)s_iter = (%(dtype)s*)(PyArray_DATA(%%(lv%(i)s)s));\n" % locals()) % sub preloops[0] += ("%%(lv%(i)s)s_iter = (%(dtype)s*)(PyArray_DATA(%%(lv%(i)s)s));\n" % locals()) % sub
if len(loop_tasks) == 1:
s = preloops.get(0, "")
else:
s = "" s = ""
for i, (pre_task, task), indices in reversed(zip(xrange(len(loop_tasks) - 1), loop_tasks, zip(*loop_orders))): for i, (pre_task, task), indices in reversed(zip(xrange(len(loop_tasks) - 1), loop_tasks, zip(*loop_orders))):
s = loop_over(preloops.get(i, "") + pre_task, s + task, indices, i) s = loop_over(preloops.get(i, "") + pre_task, s + task, indices, i)
...@@ -236,7 +241,7 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub): ...@@ -236,7 +241,7 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub):
return "{%s}" % s return "{%s}" % s
def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub): def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub, openmp=None):
'''A bit like make_loop, but when only the inner-most loop executes code. '''A bit like make_loop, but when only the inner-most loop executes code.
All the loops will be reordered so that the loops over the output tensor All the loops will be reordered so that the loops over the output tensor
...@@ -325,7 +330,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub): ...@@ -325,7 +330,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
++%(ovar)s_loops_it; ++%(ovar)s_loops_it;
""" % locals() """ % locals()
## Get sorted strides and jumps ## Get sorted strides
# Get strides in the initial order # Get strides in the initial order
def get_loop_strides(loop_order, i): def get_loop_strides(loop_order, i):
""" """
...@@ -344,7 +349,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub): ...@@ -344,7 +349,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
return r return r
# We declare the initial strides as a 2D array, nvars x nnested # We declare the initial strides as a 2D array, nvars x nnested
declare_strides_jumps = """ declare_strides = """
int init_strides[%(nvars)i][%(nnested)i] = { int init_strides[%(nvars)i][%(nnested)i] = {
%(strides)s %(strides)s
};""" % dict( };""" % dict(
...@@ -355,46 +360,57 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub): ...@@ -355,46 +360,57 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
for i, lo in enumerate(init_loop_orders) for i, lo in enumerate(init_loop_orders)
if len(lo)>0)) if len(lo)>0))
# Declare (sorted) stride and jumps for each variable # Declare (sorted) stride and for each variable
# we iterate from innermost loop to outermost loop # we iterate from innermost loop to outermost loop
declare_strides_jumps += """ declare_strides += """
std::vector< std::pair<int, int> >::reverse_iterator %(ovar)s_loops_rit; std::vector< std::pair<int, int> >::reverse_iterator %(ovar)s_loops_rit;
""" % locals() """ % locals()
for i in xrange(nvars): for i in xrange(nvars):
var = sub["lv%i" % i] var = sub["lv%i" % i]
declare_strides_jumps += """ declare_strides += """
%(ovar)s_loops_rit = %(ovar)s_loops.rbegin();""" % locals() %(ovar)s_loops_rit = %(ovar)s_loops.rbegin();""" % locals()
adjust = "0"
for j in reversed(range(nnested)): for j in reversed(range(nnested)):
jump = "(%s) - (%s)" % ("%(var)s_stride_l%(j)i" % locals(), adjust) declare_strides += """
declare_strides_jumps +="""
int %(var)s_stride_l%(j)i = init_strides[%(i)i][%(ovar)s_loops_rit->second]; int %(var)s_stride_l%(j)i = init_strides[%(i)i][%(ovar)s_loops_rit->second];
int %(var)s_jump_l%(j)i = %(jump)s;
++%(ovar)s_loops_rit; ++%(ovar)s_loops_rit;
""" % locals() """ % locals()
adjust = "TOTAL_%(j)i * %(var)s_stride_l%(j)i" % locals()
declare_iter = "" declare_iter = ""
for i, dtype in enumerate(dtypes): for i, dtype in enumerate(dtypes):
var = sub["lv%i" % i] var = sub["lv%i" % i]
declare_iter += "%(var)s_iter = (%(dtype)s*)(PyArray_DATA(%(var)s));\n" % locals() declare_iter += "%(var)s_iter = (%(dtype)s*)(PyArray_DATA(%(var)s));\n" % locals()
pointer_update = ''
for j , dtype in enumerate(dtypes):
var = sub["lv%i" % j]
pointer_update += "%(dtype)s &%(var)s_i = * ( %(var)s_iter"%locals()
tot_jump = ''
for i in reversed(range(nnested)):
iterv = 'ITER_%i' % i
pointer_update += "+%(var)s_stride_l%(i)i*%(iterv)s" % locals()
pointer_update += ");\n"
loop = inner_task loop = inner_task
for i in reversed(range(nnested)): for i in reversed(range(nnested)):
iterv = 'ITER_%i' % i iterv = 'ITER_%i' % i
total = 'TOTAL_%i' % i total = 'TOTAL_%i' % i
update = '' update = ''
for j in xrange(nvars): forloop = ''
var = sub["lv%i" % j] # The pointers are defined only in the most inner loop
update += "%(var)s_iter += %(var)s_jump_l%(i)i;\n" % locals() if i == nnested-1:
update = pointer_update
if i == 0:
if openmp:
openmp_elemwise_minsize= theano.config.openmp_elemwise_minsize
forloop += """#pragma omp parallel for if( %(total)s >=%(openmp_elemwise_minsize)s)\n""" % locals()
forloop += "for(int %(iterv)s = 0; %(iterv)s<%(total)s; %(iterv)s++)" % locals()
loop = """ loop = """
for (int %(iterv)s = %(total)s; %(iterv)s; %(iterv)s--) %(forloop)s
{ // begin loop %(i)i { // begin loop %(i)i
%(loop)s
%(update)s %(update)s
%(loop)s
} // end loop %(i)i } // end loop %(i)i
""" % locals() """ % locals()
...@@ -402,7 +418,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub): ...@@ -402,7 +418,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
'{', '{',
order_loops, order_loops,
declare_totals, declare_totals,
declare_strides_jumps, declare_strides,
declare_iter, declare_iter,
loop, loop,
'}\n', '}\n',
...@@ -435,21 +451,77 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub): ...@@ -435,21 +451,77 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
### DimShuffle ### ### DimShuffle ###
################## ##################
################# #################
### Broadcast ### ### Broadcast ###
################# #################
################ ################
### CAReduce ### ### CAReduce ###
################ ################
def make_loop_careduce(loop_orders, dtypes, loop_tasks, sub):
"""
Make a nested loop over several arrays and associate specific code
to each level of nesting.
@type loop_orders: list of N tuples of length M.
@param loop_orders: Each value of each
tuple can be either the index of a dimension to loop over or
the letter 'x' which means there is no looping to be done
over that variable at that point (in other words we broadcast
over that dimension). If an entry is an integer, it will become
an alias of the entry of that rank.
@type loop_tasks: list of M+1 pieces of code.
@param loop_tasks: The ith loop_task is a pair of strings, the first
string is code to be executed before the ith loop starts, the second
one contains code to be executed just before going to the next element
of the ith dimension.
The last element if loop_tasks is a single string, containing code
to be executed at the very end.
@type sub: a dictionary.
@param sub: Maps 'lv#' to a suitable variable name.
The 'lvi' variable corresponds to the ith element of loop_orders.
"""
def loop_over(preloop, code, indices, i):
iterv = 'ITER_%i' % i
update = ""
suitable_n = "1"
for j, index in enumerate(indices):
var = sub['lv%i' % j]
update += "%(var)s_iter += %(var)s_jump%(index)s_%(i)s;\n" % locals()
if index != 'x':
suitable_n = "%(var)s_n%(index)s" % locals()
return """
%(preloop)s
for (int %(iterv)s = %(suitable_n)s; %(iterv)s; %(iterv)s--) {
%(code)s
%(update)s
}
""" % locals()
preloops = {}
for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)):
for j, index in enumerate(loop_order):
if index != 'x':
preloops.setdefault(j, "")
preloops[j] += ("%%(lv%(i)s)s_iter = (%(dtype)s*)(PyArray_DATA(%%(lv%(i)s)s));\n" % locals()) % sub
break
else: # all broadcastable
preloops.setdefault(0, "")
preloops[0] += ("%%(lv%(i)s)s_iter = (%(dtype)s*)(PyArray_DATA(%%(lv%(i)s)s));\n" % locals()) % sub
if len(loop_tasks) == 1:
s = preloops.get(0, "")
else:
s = ""
for i, (pre_task, task), indices in reversed(zip(xrange(len(loop_tasks) - 1), loop_tasks, zip(*loop_orders))):
s = loop_over(preloops.get(i, "") + pre_task, s + task, indices, i)
s += loop_tasks[-1]
return "{%s}" % s
...@@ -16,7 +16,7 @@ from theano.compile.mode import get_default_mode ...@@ -16,7 +16,7 @@ from theano.compile.mode import get_default_mode
from theano.tensor.elemwise import (CAReduce, Elemwise, DimShuffle, from theano.tensor.elemwise import (CAReduce, Elemwise, DimShuffle,
Prod, ProdWithoutZeros) Prod, ProdWithoutZeros)
from theano.tests import unittest_tools from theano.tests import unittest_tools
import math
def FunctionGraph(i, o): def FunctionGraph(i, o):
e = gof.FunctionGraph(i, o) e = gof.FunctionGraph(i, o)
...@@ -145,6 +145,9 @@ class test_Broadcast(unittest.TestCase): ...@@ -145,6 +145,9 @@ class test_Broadcast(unittest.TestCase):
ctype = TensorType ctype = TensorType
cop = Elemwise cop = Elemwise
openmp_minsize = 2*config.openmp_elemwise_minsize
openmp_minsize_sqrt = math.ceil(math.sqrt(openmp_minsize))
def rand_val(self, shp): def rand_val(self, shp):
return numpy.asarray(numpy.random.rand(*shp)) return numpy.asarray(numpy.random.rand(*shp))
...@@ -160,6 +163,8 @@ class test_Broadcast(unittest.TestCase): ...@@ -160,6 +163,8 @@ class test_Broadcast(unittest.TestCase):
((3, 5), (3, 1)), ((3, 5), (3, 1)),
((1, 5), (5, 1)), ((1, 5), (5, 1)),
((1, 1), (1, 1)), ((1, 1), (1, 1)),
((self.openmp_minsize,), (self.openmp_minsize,)),
((self.openmp_minsize_sqrt, self.openmp_minsize_sqrt), (self.openmp_minsize_sqrt, self.openmp_minsize_sqrt)),
((2, 3, 4, 5), (2, 3, 4, 5)), ((2, 3, 4, 5), (2, 3, 4, 5)),
((2, 3, 4, 5), (1, 3, 1, 5)), ((2, 3, 4, 5), (1, 3, 1, 5)),
((2, 3, 4, 5), (1, 1, 1, 1)), ((2, 3, 4, 5), (1, 1, 1, 1)),
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论