提交 2f4e666c authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #1690 from AlOa/elemwise_openmp

Elemwise openmp
...@@ -211,6 +211,13 @@ import theano and print the config variable, as in: ...@@ -211,6 +211,13 @@ import theano and print the config variable, as in:
The best is to define it via Theano configuration The best is to define it via Theano configuration
file or with the environment variable THEANO_FLAGS. file or with the environment variable THEANO_FLAGS.
.. attribute:: openmp_elemwise_minsize
Positive int value, default: 200000.
This specifies the minimum size of vectors for which elemwise ops
use OpenMP, if OpenMP is enabled.
.. attribute:: cast_policy .. attribute:: cast_policy
String value: either 'numpy+floatX' or 'custom' String value: either 'numpy+floatX' or 'custom'
......
...@@ -45,3 +45,4 @@ you out. ...@@ -45,3 +45,4 @@ you out.
extending_theano extending_theano
faq faq
python-memory-management python-memory-management
multi_cores
=============================
Multi cores support in Theano
=============================
Parallel element wise op with openmp
====================================
Because element-wise ops work on every tensor entry independently, they can be
easily parallelized using OpenMP.
To use OpenMP you must set the openmp flag in the Theano configuration.
You can use the flag openmp_elemwise_minsize to set the minimum tensor size
for which the operation is parallelized, because for short tensors using
OpenMP can slow down the operation.
If it is not specified, the default value (200000) is used.
For simple (fast) operations you can obtain a speedup for very long tensors,
while for more complex operations you can obtain a good speedup also for
shorter tensors.
There is a script (elemwise_openmp_speedup.py in theano/misc/) which you can
use to choose that value for your machine.
The script runs two elemwise operations (a fast and a slow one) on a vector of
size openmp_elemwise_minsize, with and without OpenMP, and shows the time
difference between the two cases.
...@@ -475,3 +475,11 @@ AddConfigVar('openmp', ...@@ -475,3 +475,11 @@ AddConfigVar('openmp',
BoolParam(default_openmp), BoolParam(default_openmp),
in_c_key=False, in_c_key=False,
) )
AddConfigVar('openmp_elemwise_minsize',
"If OpenMP is enable, this is the minimum size of vector "
"for which the openmp parallel for is enable."
"Used in element wise ops",
IntParam(200000),
in_c_key=False,
)
import os
import subprocess
import sys
from optparse import OptionParser
import theano
parser = OptionParser(usage='%prog <options>\n Compute time for'
' fast and slow elemwise operations')
parser.add_option('-N', '--N', action='store', dest='N',
default=theano.config.openmp_elemwise_minsize, type="int",
help="Number of vector element")
def runScript(N):
script = 'elemwise_time_test.py'
dir = os.path.dirname(os.path.abspath(__file__))
proc = subprocess.Popen(['python', script, '--script', '-N', str(N)],
stdout=subprocess.PIPE, stderr=subprocess.PIPE,
cwd=dir)
(out, err) = proc.communicate()
if err:
print err
sys.exit()
return map(float, out.split(" "))
if __name__ == '__main__':
options, arguments = parser.parse_args(sys.argv)
if hasattr(options, "help"):
print options.help
sys.exit(0)
orig_flags = os.environ.get('THEANO_FLAGS', '')
os.environ['THEANO_FLAGS'] = orig_flags + ',openmp=false'
(cheapTime, costlyTime) = runScript(N=options.N)
os.environ['THEANO_FLAGS'] = orig_flags + ',openmp=true'
(cheapTimeOpenmp, costlyTimeOpenmp) = runScript(N=options.N)
if cheapTime > cheapTimeOpenmp:
cheapSpeed = cheapTime / cheapTimeOpenmp
cheapSpeedstring = "speedup"
else:
cheapSpeed = cheapTimeOpenmp / cheapTime
cheapSpeedstring = "slowdown"
if costlyTime > costlyTimeOpenmp:
costlySpeed = costlyTime / costlyTimeOpenmp
costlySpeedstring = "speedup"
else:
costlySpeed = costlyTimeOpenmp / costlyTime
costlySpeedstring = "slowdown"
print "Fast op time without openmp %fs with openmp %fs %s %2.2f" % (cheapTime, cheapTimeOpenmp, cheapSpeedstring, cheapSpeed)
print "Slow op time without openmp %fs with openmp %fs %s %2.2f" % (costlyTime, costlyTimeOpenmp, costlySpeedstring, costlySpeed)
from optparse import OptionParser
import sys
import time
import numpy as np
import theano
import theano.tensor as T
parser = OptionParser(usage='%prog <options>\n Compute time for'
' fast and slow elemwise operations')
parser.add_option('-N', '--N', action='store', dest='N',
default=theano.config.openmp_elemwise_minsize, type="int",
help="Number of vector element")
parser.add_option('--script', action='store_true', dest='script',
default=False,
help="Run program as script and print results on stdoutput")
def evalTime(f, v, script=False, loops=1000):
min = 1e10
for i in xrange(0, loops):
t0 = time.time()
f(v)
dt = time.time() - t0
min = dt if dt < min else min
if not script:
print ' run time in %d loops was %2.9f sec' % (loops, min)
return min
def ElemwiseOpTime(N, script=False, loops=1000):
x = T.vector('x')
np.random.seed(1235)
v = np.random.random(N).astype(theano.config.floatX)
f = theano.function([x], 2*x + x*x)
f1 = theano.function([x], T.tanh(x))
if not script:
if theano.config.openmp:
print "With openmp:"
print "Fast op ",
ceapTime = evalTime(f, v, script=script, loops=loops)
if not script:
print "Slow op ",
costlyTime = evalTime(f1, v, script=script, loops=loops)
return (ceapTime, costlyTime)
if __name__ == '__main__':
options, arguments = parser.parse_args(sys.argv)
if hasattr(options, "help"):
print options.help
sys.exit(0)
(cheapTime, costlyTime) = ElemwiseOpTime(N=options.N,
script=options.script)
if options.script:
sys.stdout.write("%2.9f %2.9f\n" % (cheapTime, costlyTime))
sys.stdout.flush()
...@@ -6,7 +6,7 @@ import numpy ...@@ -6,7 +6,7 @@ import numpy
import theano import theano
from theano import gof from theano import gof
from theano.gof import Apply, Op from theano.gof import Apply, Op, OpenMPOp
from theano import scalar from theano import scalar
from theano.scalar import Scalar, get_scalar_type from theano.scalar import Scalar, get_scalar_type
from theano.printing import pprint from theano.printing import pprint
...@@ -419,7 +419,7 @@ pprint.assign(lambda pstate, r: r.owner and isinstance(r.owner.op, DimShuffle), ...@@ -419,7 +419,7 @@ pprint.assign(lambda pstate, r: r.owner and isinstance(r.owner.op, DimShuffle),
### Elemwise ### ### Elemwise ###
################ ################
class Elemwise(Op): class Elemwise(OpenMPOp):
""" """
Generalizes a scalar op to tensors. Generalizes a scalar op to tensors.
...@@ -449,7 +449,7 @@ class Elemwise(Op): ...@@ -449,7 +449,7 @@ class Elemwise(Op):
""" """
def __init__(self, scalar_op, inplace_pattern=None, name=None, def __init__(self, scalar_op, inplace_pattern=None, name=None,
nfunc_spec=None): nfunc_spec=None, openmp=None):
""" """
Usage: Elemwise(scalar_op, inplace_pattern = {}) Usage: Elemwise(scalar_op, inplace_pattern = {})
...@@ -487,6 +487,7 @@ class Elemwise(Op): ...@@ -487,6 +487,7 @@ class Elemwise(Op):
#precompute the hash of this node #precompute the hash of this node
self._rehash() self._rehash()
super(Elemwise,self).__init__(openmp=openmp)
def __getstate__(self): def __getstate__(self):
d = copy(self.__dict__) d = copy(self.__dict__)
...@@ -1028,14 +1029,6 @@ class Elemwise(Op): ...@@ -1028,14 +1029,6 @@ class Elemwise(Op):
# which is allocated, OR, if there are any aliased outputs, # which is allocated, OR, if there are any aliased outputs,
# the index of the last of these aliased outputs. # the index of the last of these aliased outputs.
# We declare the scalar variables used in the inner loop to do
# the element-wise computation. Aliased scalar variables need
# not be declared, as they are #defined in defines
task_decl = "".join([
"%s& %s_i = *%s_iter;\n" % (dtype, name, name)
for name, dtype in izip(inames + list(real_onames),
idtypes + list(real_odtypes))])
# We generate the C code of the inner loop using the scalar op # We generate the C code of the inner loop using the scalar op
task_code = self.scalar_op.c_code( task_code = self.scalar_op.c_code(
Apply(self.scalar_op, Apply(self.scalar_op,
...@@ -1050,11 +1043,13 @@ class Elemwise(Op): ...@@ -1050,11 +1043,13 @@ class Elemwise(Op):
code = """ code = """
{ {
%(defines)s %(defines)s
%(task_decl)s
%(task_code)s %(task_code)s
%(undefs)s %(undefs)s
} }
""" % locals() """ % locals()
loop_orders = orders + [range(nnested)] * len(real_onames)
dtypes = (idtypes + list(real_odtypes))
if all([o.ndim <= 1 for o in node.outputs] or if all([o.ndim <= 1 for o in node.outputs] or
# Use simpler code when output ndim == 0 or 1 # Use simpler code when output ndim == 0 or 1
# or for broadcated scalar. # or for broadcated scalar.
...@@ -1063,19 +1058,47 @@ class Elemwise(Op): ...@@ -1063,19 +1058,47 @@ class Elemwise(Op):
all_code = [("", "")] * (nnested - 1) + [("", code)] + [""] all_code = [("", "")] * (nnested - 1) + [("", code)] + [""]
else: else:
all_code = [code] all_code = [code]
if len(all_code) == 1:
#No loops
task_decl = "".join([
"%s& %s_i = *%s_iter;\n" % (dtype, name, name)
for name, dtype in izip(inames + list(real_onames),
idtypes + list(real_odtypes))])
preloops = {}
for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)):
for j, index in enumerate(loop_order):
if index != 'x':
preloops.setdefault(j, "")
preloops[j] += ("%%(lv%(i)s)s_iter = (%(dtype)s*)(PyArray_DATA(%%(lv%(i)s)s));\n" % locals()) % sub
break
else: # all broadcastable
preloops.setdefault(0, "")
preloops[0] += ("%%(lv%(i)s)s_iter = (%(dtype)s*)(PyArray_DATA(%%(lv%(i)s)s));\n" % locals()) % sub
init_array = preloops.get(0, " ")
loop = """
{
%(defines)s
%(init_array)s
%(task_decl)s
%(task_code)s
%(undefs)s
}
""" % locals()
else:
loop = cgen.make_loop( loop = cgen.make_loop(
loop_orders=orders + [range(nnested)] * len(real_onames), loop_orders=loop_orders,
dtypes=(idtypes + list(real_odtypes)), dtypes=dtypes,
loop_tasks=all_code, loop_tasks=all_code,
sub=sub) sub=sub, openmp=self.openmp)
else: else:
loop = cgen.make_reordered_loop( loop = cgen.make_reordered_loop(
init_loop_orders=orders + [range(nnested)] * len(real_onames), init_loop_orders=loop_orders,
olv_index=olv_index, olv_index=olv_index,
dtypes=(idtypes + list(real_odtypes)), dtypes=dtypes,
inner_task=code, inner_task=code,
sub=sub) sub=sub, openmp=self.openmp)
# If all inputs and outputs are contiguous # If all inputs and outputs are contiguous
# and the scalar op define optimized code for that case # and the scalar op define optimized code for that case
...@@ -1117,7 +1140,8 @@ class Elemwise(Op): ...@@ -1117,7 +1140,8 @@ class Elemwise(Op):
contig += """ contig += """
dtype_%(x)s& %(x)s_i = ((dtype_%(x)s*) PyArray_DATA(%(x)s))[0]; dtype_%(x)s& %(x)s_i = ((dtype_%(x)s*) PyArray_DATA(%(x)s))[0];
""" % locals() """ % locals()
if self.openmp:
contig += """#pragma omp parallel for if(n>=%d)""" % (config.openmp_elemwise_minsize)
contig += """ contig += """
for(int i=0; i<n; i++){ for(int i=0; i<n; i++){
%(index)s %(index)s
...@@ -1166,6 +1190,7 @@ class Elemwise(Op): ...@@ -1166,6 +1190,7 @@ class Elemwise(Op):
version.append(self.scalar_op.c_code_cache_version_apply(scalar_node)) version.append(self.scalar_op.c_code_cache_version_apply(scalar_node))
for i in node.inputs + node.outputs: for i in node.inputs + node.outputs:
version.append(get_scalar_type(dtype=i.type.dtype).c_code_cache_version()) version.append(get_scalar_type(dtype=i.type.dtype).c_code_cache_version())
version.append(('openmp', self.openmp))
if all(version): if all(version):
return tuple(version) return tuple(version)
else: else:
...@@ -1557,7 +1582,7 @@ for(int i=0;i<PyArray_NDIM(%(iname)s);i++){ ...@@ -1557,7 +1582,7 @@ for(int i=0;i<PyArray_NDIM(%(iname)s);i++){
+ [("", code1), ""]) + [("", code1), ""])
else: else:
all_code = [task0_decl + code1] all_code = [task0_decl + code1]
loop = cgen.make_loop( loop = cgen.make_loop_careduce(
[order, range(nnested) + ['x'] * len(axis)], [order, range(nnested) + ['x'] * len(axis)],
[idtype, adtype], all_code, sub) [idtype, adtype], all_code, sub)
......
import theano
def make_declare(loop_orders, dtypes, sub): def make_declare(loop_orders, dtypes, sub):
...@@ -170,8 +171,7 @@ def make_alloc(loop_orders, dtype, sub, fortran='0'): ...@@ -170,8 +171,7 @@ def make_alloc(loop_orders, dtype, sub, fortran='0'):
} }
""" % dict(locals(), **sub) """ % dict(locals(), **sub)
def make_loop(loop_orders, dtypes, loop_tasks, sub, openmp=None):
def make_loop(loop_orders, dtypes, loop_tasks, sub):
""" """
Make a nested loop over several arrays and associate specific code Make a nested loop over several arrays and associate specific code
to each level of nesting. to each level of nesting.
...@@ -195,22 +195,29 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub): ...@@ -195,22 +195,29 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub):
@type sub: a dictionary. @type sub: a dictionary.
@param sub: Maps 'lv#' to a suitable variable name. @param sub: Maps 'lv#' to a suitable variable name.
The 'lvi' variable corresponds to the ith element of loop_orders. The 'lvi' variable corresponds to the ith element of loop_orders.
"""
"""
def loop_over(preloop, code, indices, i): def loop_over(preloop, code, indices, i):
iterv = 'ITER_%i' % i iterv = 'ITER_%i' % i
update = "" update = ""
suitable_n = "1" suitable_n = "1"
for j, index in enumerate(indices): for j, index in enumerate(indices):
var = sub['lv%i' % j] var = sub['lv%i' % j]
update += "%(var)s_iter += %(var)s_jump%(index)s_%(i)s;\n" % locals() dtype = dtypes[j]
update += "%(dtype)s &%(var)s_i = * ( %(var)s_iter + %(iterv)s * %(var)s_jump%(index)s_%(i)s );\n" % locals()
if index != 'x': if index != 'x':
suitable_n = "%(var)s_n%(index)s" % locals() suitable_n = "%(var)s_n%(index)s" % locals()
return """ if openmp:
openmp_elemwise_minsize = theano.config.openmp_elemwise_minsize
forloop = """#pragma omp parallel for if( %(suitable_n)s >=%(openmp_elemwise_minsize)s)\n""" % locals()
else:
forloop = ""
forloop += """for (int %(iterv)s = 0; %(iterv)s<%(suitable_n)s; %(iterv)s++)""" % locals()
return"""
%(preloop)s %(preloop)s
for (int %(iterv)s = %(suitable_n)s; %(iterv)s; %(iterv)s--) { %(forloop)s {
%(code)s
%(update)s %(update)s
%(code)s
} }
""" % locals() """ % locals()
...@@ -225,10 +232,8 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub): ...@@ -225,10 +232,8 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub):
preloops.setdefault(0, "") preloops.setdefault(0, "")
preloops[0] += ("%%(lv%(i)s)s_iter = (%(dtype)s*)(PyArray_DATA(%%(lv%(i)s)s));\n" % locals()) % sub preloops[0] += ("%%(lv%(i)s)s_iter = (%(dtype)s*)(PyArray_DATA(%%(lv%(i)s)s));\n" % locals()) % sub
if len(loop_tasks) == 1:
s = preloops.get(0, "")
else:
s = "" s = ""
for i, (pre_task, task), indices in reversed(zip(xrange(len(loop_tasks) - 1), loop_tasks, zip(*loop_orders))): for i, (pre_task, task), indices in reversed(zip(xrange(len(loop_tasks) - 1), loop_tasks, zip(*loop_orders))):
s = loop_over(preloops.get(i, "") + pre_task, s + task, indices, i) s = loop_over(preloops.get(i, "") + pre_task, s + task, indices, i)
...@@ -236,7 +241,7 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub): ...@@ -236,7 +241,7 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub):
return "{%s}" % s return "{%s}" % s
def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub): def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub, openmp=None):
'''A bit like make_loop, but when only the inner-most loop executes code. '''A bit like make_loop, but when only the inner-most loop executes code.
All the loops will be reordered so that the loops over the output tensor All the loops will be reordered so that the loops over the output tensor
...@@ -325,7 +330,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub): ...@@ -325,7 +330,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
++%(ovar)s_loops_it; ++%(ovar)s_loops_it;
""" % locals() """ % locals()
## Get sorted strides and jumps ## Get sorted strides
# Get strides in the initial order # Get strides in the initial order
def get_loop_strides(loop_order, i): def get_loop_strides(loop_order, i):
""" """
...@@ -344,7 +349,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub): ...@@ -344,7 +349,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
return r return r
# We declare the initial strides as a 2D array, nvars x nnested # We declare the initial strides as a 2D array, nvars x nnested
declare_strides_jumps = """ declare_strides = """
int init_strides[%(nvars)i][%(nnested)i] = { int init_strides[%(nvars)i][%(nnested)i] = {
%(strides)s %(strides)s
};""" % dict( };""" % dict(
...@@ -355,46 +360,57 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub): ...@@ -355,46 +360,57 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
for i, lo in enumerate(init_loop_orders) for i, lo in enumerate(init_loop_orders)
if len(lo)>0)) if len(lo)>0))
# Declare (sorted) stride and jumps for each variable # Declare (sorted) stride and for each variable
# we iterate from innermost loop to outermost loop # we iterate from innermost loop to outermost loop
declare_strides_jumps += """ declare_strides += """
std::vector< std::pair<int, int> >::reverse_iterator %(ovar)s_loops_rit; std::vector< std::pair<int, int> >::reverse_iterator %(ovar)s_loops_rit;
""" % locals() """ % locals()
for i in xrange(nvars): for i in xrange(nvars):
var = sub["lv%i" % i] var = sub["lv%i" % i]
declare_strides_jumps += """ declare_strides += """
%(ovar)s_loops_rit = %(ovar)s_loops.rbegin();""" % locals() %(ovar)s_loops_rit = %(ovar)s_loops.rbegin();""" % locals()
adjust = "0"
for j in reversed(range(nnested)): for j in reversed(range(nnested)):
jump = "(%s) - (%s)" % ("%(var)s_stride_l%(j)i" % locals(), adjust) declare_strides += """
declare_strides_jumps +="""
int %(var)s_stride_l%(j)i = init_strides[%(i)i][%(ovar)s_loops_rit->second]; int %(var)s_stride_l%(j)i = init_strides[%(i)i][%(ovar)s_loops_rit->second];
int %(var)s_jump_l%(j)i = %(jump)s;
++%(ovar)s_loops_rit; ++%(ovar)s_loops_rit;
""" % locals() """ % locals()
adjust = "TOTAL_%(j)i * %(var)s_stride_l%(j)i" % locals()
declare_iter = "" declare_iter = ""
for i, dtype in enumerate(dtypes): for i, dtype in enumerate(dtypes):
var = sub["lv%i" % i] var = sub["lv%i" % i]
declare_iter += "%(var)s_iter = (%(dtype)s*)(PyArray_DATA(%(var)s));\n" % locals() declare_iter += "%(var)s_iter = (%(dtype)s*)(PyArray_DATA(%(var)s));\n" % locals()
pointer_update = ''
for j , dtype in enumerate(dtypes):
var = sub["lv%i" % j]
pointer_update += "%(dtype)s &%(var)s_i = * ( %(var)s_iter"%locals()
tot_jump = ''
for i in reversed(range(nnested)):
iterv = 'ITER_%i' % i
pointer_update += "+%(var)s_stride_l%(i)i*%(iterv)s" % locals()
pointer_update += ");\n"
loop = inner_task loop = inner_task
for i in reversed(range(nnested)): for i in reversed(range(nnested)):
iterv = 'ITER_%i' % i iterv = 'ITER_%i' % i
total = 'TOTAL_%i' % i total = 'TOTAL_%i' % i
update = '' update = ''
for j in xrange(nvars): forloop = ''
var = sub["lv%i" % j] # The pointers are defined only in the most inner loop
update += "%(var)s_iter += %(var)s_jump_l%(i)i;\n" % locals() if i == nnested-1:
update = pointer_update
if i == 0:
if openmp:
openmp_elemwise_minsize= theano.config.openmp_elemwise_minsize
forloop += """#pragma omp parallel for if( %(total)s >=%(openmp_elemwise_minsize)s)\n""" % locals()
forloop += "for(int %(iterv)s = 0; %(iterv)s<%(total)s; %(iterv)s++)" % locals()
loop = """ loop = """
for (int %(iterv)s = %(total)s; %(iterv)s; %(iterv)s--) %(forloop)s
{ // begin loop %(i)i { // begin loop %(i)i
%(loop)s
%(update)s %(update)s
%(loop)s
} // end loop %(i)i } // end loop %(i)i
""" % locals() """ % locals()
...@@ -402,7 +418,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub): ...@@ -402,7 +418,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
'{', '{',
order_loops, order_loops,
declare_totals, declare_totals,
declare_strides_jumps, declare_strides,
declare_iter, declare_iter,
loop, loop,
'}\n', '}\n',
...@@ -435,21 +451,77 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub): ...@@ -435,21 +451,77 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
### DimShuffle ### ### DimShuffle ###
################## ##################
################# #################
### Broadcast ### ### Broadcast ###
################# #################
################ ################
### CAReduce ### ### CAReduce ###
################ ################
def make_loop_careduce(loop_orders, dtypes, loop_tasks, sub):
"""
Make a nested loop over several arrays and associate specific code
to each level of nesting.
@type loop_orders: list of N tuples of length M.
@param loop_orders: Each value of each
tuple can be either the index of a dimension to loop over or
the letter 'x' which means there is no looping to be done
over that variable at that point (in other words we broadcast
over that dimension). If an entry is an integer, it will become
an alias of the entry of that rank.
@type loop_tasks: list of M+1 pieces of code.
@param loop_tasks: The ith loop_task is a pair of strings, the first
string is code to be executed before the ith loop starts, the second
one contains code to be executed just before going to the next element
of the ith dimension.
The last element if loop_tasks is a single string, containing code
to be executed at the very end.
@type sub: a dictionary.
@param sub: Maps 'lv#' to a suitable variable name.
The 'lvi' variable corresponds to the ith element of loop_orders.
"""
def loop_over(preloop, code, indices, i):
iterv = 'ITER_%i' % i
update = ""
suitable_n = "1"
for j, index in enumerate(indices):
var = sub['lv%i' % j]
update += "%(var)s_iter += %(var)s_jump%(index)s_%(i)s;\n" % locals()
if index != 'x':
suitable_n = "%(var)s_n%(index)s" % locals()
return """
%(preloop)s
for (int %(iterv)s = %(suitable_n)s; %(iterv)s; %(iterv)s--) {
%(code)s
%(update)s
}
""" % locals()
preloops = {}
for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)):
for j, index in enumerate(loop_order):
if index != 'x':
preloops.setdefault(j, "")
preloops[j] += ("%%(lv%(i)s)s_iter = (%(dtype)s*)(PyArray_DATA(%%(lv%(i)s)s));\n" % locals()) % sub
break
else: # all broadcastable
preloops.setdefault(0, "")
preloops[0] += ("%%(lv%(i)s)s_iter = (%(dtype)s*)(PyArray_DATA(%%(lv%(i)s)s));\n" % locals()) % sub
if len(loop_tasks) == 1:
s = preloops.get(0, "")
else:
s = ""
for i, (pre_task, task), indices in reversed(zip(xrange(len(loop_tasks) - 1), loop_tasks, zip(*loop_orders))):
s = loop_over(preloops.get(i, "") + pre_task, s + task, indices, i)
s += loop_tasks[-1]
return "{%s}" % s
...@@ -16,7 +16,7 @@ from theano.compile.mode import get_default_mode ...@@ -16,7 +16,7 @@ from theano.compile.mode import get_default_mode
from theano.tensor.elemwise import (CAReduce, Elemwise, DimShuffle, from theano.tensor.elemwise import (CAReduce, Elemwise, DimShuffle,
Prod, ProdWithoutZeros) Prod, ProdWithoutZeros)
from theano.tests import unittest_tools from theano.tests import unittest_tools
import math
def FunctionGraph(i, o): def FunctionGraph(i, o):
e = gof.FunctionGraph(i, o) e = gof.FunctionGraph(i, o)
...@@ -145,6 +145,9 @@ class test_Broadcast(unittest.TestCase): ...@@ -145,6 +145,9 @@ class test_Broadcast(unittest.TestCase):
ctype = TensorType ctype = TensorType
cop = Elemwise cop = Elemwise
openmp_minsize = 2*config.openmp_elemwise_minsize
openmp_minsize_sqrt = math.ceil(math.sqrt(openmp_minsize))
def rand_val(self, shp): def rand_val(self, shp):
return numpy.asarray(numpy.random.rand(*shp)) return numpy.asarray(numpy.random.rand(*shp))
...@@ -160,6 +163,8 @@ class test_Broadcast(unittest.TestCase): ...@@ -160,6 +163,8 @@ class test_Broadcast(unittest.TestCase):
((3, 5), (3, 1)), ((3, 5), (3, 1)),
((1, 5), (5, 1)), ((1, 5), (5, 1)),
((1, 1), (1, 1)), ((1, 1), (1, 1)),
((self.openmp_minsize,), (self.openmp_minsize,)),
((self.openmp_minsize_sqrt, self.openmp_minsize_sqrt), (self.openmp_minsize_sqrt, self.openmp_minsize_sqrt)),
((2, 3, 4, 5), (2, 3, 4, 5)), ((2, 3, 4, 5), (2, 3, 4, 5)),
((2, 3, 4, 5), (1, 3, 1, 5)), ((2, 3, 4, 5), (1, 3, 1, 5)),
((2, 3, 4, 5), (1, 1, 1, 1)), ((2, 3, 4, 5), (1, 1, 1, 1)),
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论