提交 b2d0f53d authored 作者: Pascal Lamblin's avatar Pascal Lamblin

New version of the C-code generator for Elemwise, enabling loop reordering.

- Code reviewed by Olivier B., thanks! - Lots of comments added - Unused code removed - Looping in Elemwise is now done so the contiguous dimension of the output is looped over in the inner-most loop. It does not change anything in the non-inplace case (the output was C-contiguous), but can lead to really better performances (at least 7x) in the inplace case.
上级 cfd4bef8
...@@ -665,10 +665,18 @@ class Elemwise(Op): ...@@ -665,10 +665,18 @@ class Elemwise(Op):
defines = "" defines = ""
undefs = "" undefs = ""
dmap = dict([(node.outputs[o], [node.inputs[i]]) for o, i in self.inplace_pattern.items()])
# The destroy map is a map of output indices to input indices
# that overwrite them. We just convert them to the actual
# Variables.
dmap = dict([(node.outputs[o], [node.inputs[i]])
for o, i in self.inplace_pattern.iteritems()])
# dtypes of the inputs
idtypes = [input.type.dtype_specs()[1] for input in inputs] idtypes = [input.type.dtype_specs()[1] for input in inputs]
# These are the outputs that we will need to allocate
# (output, name, name of the c type), transposed
real = zip(*[(r, s, r.type.dtype_specs()[1]) real = zip(*[(r, s, r.type.dtype_specs()[1])
for r, s in zip(node.outputs, onames) if r not in dmap]) for r, s in zip(node.outputs, onames) if r not in dmap])
if real: if real:
...@@ -676,6 +684,9 @@ class Elemwise(Op): ...@@ -676,6 +684,9 @@ class Elemwise(Op):
else: else:
real_outputs, real_onames, real_odtypes = [], [], [] real_outputs, real_onames, real_odtypes = [], [], []
# Outputs that are aliased with an input (inplace)
# (output, name), transposed (c type name not needed since we don't
# need to allocate.)
aliased = zip(*[(r, s) aliased = zip(*[(r, s)
for (r, s) in zip(node.outputs, onames) if r in dmap]) for (r, s) in zip(node.outputs, onames) if r in dmap])
if aliased: if aliased:
...@@ -683,25 +694,49 @@ class Elemwise(Op): ...@@ -683,25 +694,49 @@ class Elemwise(Op):
else: else:
aliased_outputs, aliased_onames = [], [] aliased_outputs, aliased_onames = [], []
orders = [[x and 'x' or i for i, x in enumerate(input.type.broadcastable)] for input in inputs] # for each input:
# same as range(ndim), but with 'x' at all broadcastable positions
orders = [[x and 'x' or i
for i, x in enumerate(input.type.broadcastable)]
for input in inputs]
# number of nested loops we will need (all inputs have same
# dimensionality)
nnested = len(orders[0]) nnested = len(orders[0])
sub = dict(sub) sub = dict(sub)
for i, (input, iname) in enumerate(zip(inputs, inames)): for i, (input, iname) in enumerate(zip(inputs, inames)):
# the c generators will substitute the input names for
# references to loop variables lv0, lv1, ...
sub['lv%i' % i] = iname sub['lv%i' % i] = iname
decl = cgen.make_declare(orders, idtypes, sub) decl = cgen.make_declare(orders, idtypes, sub)
checks = cgen.make_checks(orders, idtypes, sub) checks = cgen.make_checks(orders, idtypes, sub)
alloc = "" alloc = ""
# We loop over the "real" outputs, i.e., those that are not
# inplace (must be allocated) and we declare/allocate/check
# them
for output, oname, odtype in zip(real_outputs, real_onames, real_odtypes): for output, oname, odtype in zip(real_outputs, real_onames, real_odtypes):
i += 1 i += 1 # before this loop, i = number of inputs
sub['lv%i' % i] = oname sub['lv%i' % i] = oname
sub['olv'] = oname sub['olv'] = oname
alloc += cgen.make_declare([range(nnested)], [odtype], dict(sub, lv0 = oname)) alloc += cgen.make_declare([range(nnested)], [odtype],
dict(sub, lv0 = oname))
alloc += cgen.make_alloc(orders, odtype, sub) alloc += cgen.make_alloc(orders, odtype, sub)
alloc += cgen.make_checks([range(nnested)], [odtype], dict(sub, lv0 = oname)) alloc += cgen.make_checks([range(nnested)], [odtype],
dict(sub, lv0 = oname))
olv_index = i # index of the last output
# We loop over the "aliased" outputs, i.e., those that are
# inplace (overwrite the contents of one of the inputs) and
# make the output pointers point to their corresponding input
# pointers.
for output, oname in zip(aliased_outputs, aliased_onames): for output, oname in zip(aliased_outputs, aliased_onames):
iname = inames[inputs.index(dmap[output][0])] olv_index = inputs.index(dmap[output][0])
iname = inames[olv_index]
# We make the output point to the corresponding input and
# decrease the reference of whatever the output contained
# prior to this
alloc += """ alloc += """
if (%(oname)s) { if (%(oname)s) {
Py_XDECREF(%(oname)s); Py_XDECREF(%(oname)s);
...@@ -709,17 +744,30 @@ class Elemwise(Op): ...@@ -709,17 +744,30 @@ class Elemwise(Op):
%(oname)s = %(iname)s; %(oname)s = %(iname)s;
Py_XINCREF(%(oname)s); Py_XINCREF(%(oname)s);
""" % locals() """ % locals()
# We alias the scalar variables
defines += "#define %(oname)s_i %(iname)s_i" % locals() defines += "#define %(oname)s_i %(iname)s_i" % locals()
undefs += "#undef %(oname)s_i" % locals() undefs += "#undef %(oname)s_i" % locals()
task_code = self.scalar_op.c_code(Apply(self.scalar_op, # Note: here, olv_index is either the index of the last output
# which is allocated, OR, if there are any aliased outputs,
# the index of the last of these aliased outputs.
# We declare the scalar variables used in the inner loop to do
# the element-wise computation. Aliased scalar variables need
# not be declared, as they are #defined in defines
task_decl = "".join(["%(dtype)s& %(name)s_i = *%(name)s_iter;\n" % locals()
for name, dtype in zip(inames + list(real_onames),
idtypes + list(real_odtypes))])
# We generate the C code of the inner loop using the scalar op
task_code = self.scalar_op.c_code(
Apply(self.scalar_op,
[Scalar(dtype = input.type.dtype)() for input in node.inputs], [Scalar(dtype = input.type.dtype)() for input in node.inputs],
[Scalar(dtype = output.type.dtype)() for output in node.outputs]), [Scalar(dtype = output.type.dtype)() for output in node.outputs]),
name + '_scalar_', name + '_scalar_',
["%s_i" % s for s in _inames], ["%s_i" % s for s in _inames],
["%s_i" % s for s in onames], ["%s_i" % s for s in onames],
sub) sub)
task_decl = "".join(["%(dtype)s& %(name)s_i = *%(name)s_iter;\n" % locals() for name, dtype in zip(inames + list(real_onames), idtypes + list(real_odtypes))])
code = """ code = """
{ {
%(defines)s %(defines)s
...@@ -728,22 +776,28 @@ class Elemwise(Op): ...@@ -728,22 +776,28 @@ class Elemwise(Op):
%(undefs)s %(undefs)s
} }
""" % locals() """ % locals()
if nnested:
all_code = [("", "")] * (nnested - 1) + [("", code)] + [""] loop = cgen.make_reordered_loop(
else: init_loop_orders = orders + [range(nnested)] * len(real_onames),
all_code = [code] olv_index = olv_index,
loop = cgen.make_loop(orders + [range(nnested)] * len(real_onames), idtypes + list(real_odtypes), all_code, sub) dtypes = idtypes + list(real_odtypes),
inner_task = code,
sub = sub)
return decl, checks, alloc, loop return decl, checks, alloc, loop
def c_code(self, node, name, inames, onames, sub): def c_code(self, node, name, inames, onames, sub):
code = "\n".join(self._c_all(node, name, inames, onames, sub)) code = "\n".join(self._c_all(node, name, inames, onames, sub))
return code return code
def c_headers(self):
return ['<vector>', '<algorithm>']
def c_support_code(self): def c_support_code(self):
return self.scalar_op.c_support_code() support_code = self.scalar_op.c_support_code()
return support_code
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
version = [4] # the version corresponding to the c code in this Op version = [5] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op, scalar_node = Apply(self.scalar_op,
...@@ -989,8 +1043,12 @@ for(int i=0;i<%(iname)s->nd;i++){ ...@@ -989,8 +1043,12 @@ for(int i=0;i<%(iname)s->nd;i++){
code = "\n".join(self._c_all(node, name, inames, onames, sub)) code = "\n".join(self._c_all(node, name, inames, onames, sub))
return code return code
def c_headers(self):
# Sometimes, Elemwise's c_code is returned, so we need its headers
return ['<vector>', '<algorithm>']
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
version = [3] # the version corresponding to the c code in this Op version = [4] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op, scalar_node = Apply(self.scalar_op,
......
def make_declare(loop_orders, dtypes, sub): def make_declare(loop_orders, dtypes, sub):
"""
Produce code to declare all necessary variables.
"""
decl = "" decl = ""
for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)): for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)):
var = sub['lv%i' % i] var = sub['lv%i' % i] # input name corresponding to ith loop variable
# we declare an iteration variable
# and an integer for the number of dimensions
decl += """ decl += """
%(dtype)s* %(var)s_iter; %(dtype)s* %(var)s_iter;
int %(var)s_nd; int %(var)s_nd;
""" % locals() """ % locals()
for j, value in enumerate(loop_order): for j, value in enumerate(loop_order):
if value != 'x': if value != 'x':
# If the dimension is not broadcasted, we declare
# the number of elements in that dimension,
# the stride in that dimension,
# and the jump from an iteration to the next
decl += """ decl += """
int %(var)s_n%(value)i; int %(var)s_n%(value)i;
int %(var)s_stride%(value)i; int %(var)s_stride%(value)i;
int %(var)s_jump%(value)i_%(j)i; int %(var)s_jump%(value)i_%(j)i;
""" % locals() """ % locals()
else: else:
# if the dimension is broadcasted, we only need
# the jump (arbitrary length and stride = 0)
decl += """ decl += """
int %(var)s_jump%(value)s_%(j)i; int %(var)s_jump%(value)s_%(j)i;
""" % locals() """ % locals()
...@@ -27,8 +39,12 @@ def make_checks(loop_orders, dtypes, sub): ...@@ -27,8 +39,12 @@ def make_checks(loop_orders, dtypes, sub):
init = "" init = ""
for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)): for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)):
var = "%%(lv%i)s" % i var = "%%(lv%i)s" % i
# List of dimensions of var that are not broadcasted
nonx = [x for x in loop_order if x != 'x'] nonx = [x for x in loop_order if x != 'x']
if nonx: if nonx:
# If there are dimensions that are not broadcasted
# this is a check that the number of dimensions of the
# tensor is as expected.
min_nd = max(nonx) + 1 min_nd = max(nonx) + 1
init += """ init += """
if (%(var)s->nd < %(min_nd)s) { if (%(var)s->nd < %(min_nd)s) {
...@@ -36,10 +52,20 @@ def make_checks(loop_orders, dtypes, sub): ...@@ -36,10 +52,20 @@ def make_checks(loop_orders, dtypes, sub):
%%(fail)s %%(fail)s
} }
""" % locals() """ % locals()
adjust = []
for j, index in reversed([aaa for aaa in enumerate(loop_order)]): # In loop j, adjust represents the difference of values of the
# data pointer between the beginning and the end of the
# execution of loop j+1 (the loop inside the current one). It
# is equal to the stride in loop j+1 times the length of loop
# j+1, or 0 for the inner-most loop.
adjust = "0"
# We go from the inner loop to the outer loop
for j, index in reversed(list(enumerate(loop_order))):
if index != 'x': if index != 'x':
jump = " - ".join(["%(var)s_stride%(index)s" % locals()] + adjust) # Initialize the variables associated to the jth loop
# jump = stride - adjust
jump = "(%s) - (%s)" % ("%(var)s_stride%(index)s" % locals(), adjust)
init += """ init += """
%(var)s_n%(index)s = %(var)s->dimensions[%(index)s]; %(var)s_n%(index)s = %(var)s->dimensions[%(index)s];
%(var)s_stride%(index)s = %(var)s->strides[%(index)s] / sizeof(%(dtype)s); %(var)s_stride%(index)s = %(var)s->strides[%(index)s] / sizeof(%(dtype)s);
...@@ -47,36 +73,20 @@ def make_checks(loop_orders, dtypes, sub): ...@@ -47,36 +73,20 @@ def make_checks(loop_orders, dtypes, sub):
//printf("%(var)s_jump%(index)s_%(j)s is:"); //printf("%(var)s_jump%(index)s_%(j)s is:");
//std::cout << %(var)s_jump%(index)s_%(j)s << std::endl; //std::cout << %(var)s_jump%(index)s_%(j)s << std::endl;
""" % locals() """ % locals()
adjust = ["%(var)s_n%(index)s*%(var)s_stride%(index)s" % locals()] adjust = "%(var)s_n%(index)s*%(var)s_stride%(index)s" % locals()
else: else:
jump = " - ".join(["0"] + adjust) jump = "-(%s)" % adjust
init += """ init += """
%(var)s_jump%(index)s_%(j)s = %(jump)s; %(var)s_jump%(index)s_%(j)s = %(jump)s;
//printf("%(var)s_jump%(index)s_%(j)s is:"); //printf("%(var)s_jump%(index)s_%(j)s is:");
//std::cout << %(var)s_jump%(index)s_%(j)s << std::endl; //std::cout << %(var)s_jump%(index)s_%(j)s << std::endl;
""" % locals() """ % locals()
adjust = [] adjust = "0"
check = "" check = ""
if 0:
# original dimension-checking loop builds a single if condition, and if it is true, it # This loop builds multiple if conditions to verify that the
# raises a generic error message # dimensions of the inputs match, and the first one that is true
for matches in zip(*loop_orders): # raises an informative error message
to_compare = [(j, x) for j, x in enumerate(matches) if x != "x"]
if len(to_compare) < 2:
continue
j, x = to_compare[0]
first = "%%(lv%(j)s)s_n%(x)s" % locals()
cond = " || ".join(["%(first)s != %%(lv%(j)s)s_n%(x)s" % locals() for j, x in to_compare[1:]])
if cond:
check += """
if (%(cond)s) {
PyErr_SetString(PyExc_ValueError, "Input dimensions do not match (Try re-running with py linker for more information).");
%%(fail)s
}
""" % locals()
else:
# revised dimension-checking loop build multiple if conditions, and the first one that
# is true raises a more informative error message
for matches in zip(*loop_orders): for matches in zip(*loop_orders):
to_compare = [(j, x) for j, x in enumerate(matches) if x != "x"] to_compare = [(j, x) for j, x in enumerate(matches) if x != "x"]
...@@ -99,12 +109,22 @@ def make_checks(loop_orders, dtypes, sub): ...@@ -99,12 +109,22 @@ def make_checks(loop_orders, dtypes, sub):
%%(fail)s %%(fail)s
} }
""" % locals() """ % locals()
return init % sub + check % sub return init % sub + check % sub
def make_alloc(loop_orders, dtype, sub): def make_alloc(loop_orders, dtype, sub):
"""
Generate C code to allocate outputs.
"""
nd = len(loop_orders[0]) nd = len(loop_orders[0])
init_dims = "" init_dims = ""
# For each dimension, the tensors are either all broadcasted, in
# which case the output will also be broadcastable (dimension =
# 1), or one or more are not broadcasted, in which case the number
# of elements of the output in that dimension will be equal to the
# number of elements of any of them.
for i, candidates in enumerate(zip(*loop_orders)): for i, candidates in enumerate(zip(*loop_orders)):
for j, candidate in enumerate(candidates): for j, candidate in enumerate(candidates):
if candidate != 'x': if candidate != 'x':
...@@ -114,6 +134,12 @@ def make_alloc(loop_orders, dtype, sub): ...@@ -114,6 +134,12 @@ def make_alloc(loop_orders, dtype, sub):
else: else:
init_dims += "dims[%(i)s] = 1;\n" % locals() init_dims += "dims[%(i)s] = 1;\n" % locals()
#raise Exception("For each looping dimension, at least one input must have a non-broadcastable dimension.") #raise Exception("For each looping dimension, at least one input must have a non-broadcastable dimension.")
# TODO: it would be interesting to allocate the output in such a
# way that its contiguous dimensions match one of the input's
# contiguous dimensions, or the dimension with the smallest
# stride. Right now, it is allocated to be C_CONTIGUOUS.
return """ return """
{ {
npy_intp dims[%(nd)s]; npy_intp dims[%(nd)s];
...@@ -155,9 +181,12 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub): ...@@ -155,9 +181,12 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub):
an alias of the entry of that rank. an alias of the entry of that rank.
@type loop_tasks: list of M+1 pieces of code. @type loop_tasks: list of M+1 pieces of code.
@param loop_tasks: The ith loop_task is code @param loop_tasks: The ith loop_task is a pair of strings, the first
to be executed just before going to the next element of the string is code to be executed before the ith loop starts, the second
ith dimension. The last is code to be executed at the very end. one contains code to be executed just before going to the next element
of the ith dimension.
The last element of loop_tasks is a single string, containing code
to be executed at the very end.
@type sub: a dictionary. @type sub: a dictionary.
@param sub: Maps 'lv#' to a suitable variable name. @param sub: Maps 'lv#' to a suitable variable name.
...@@ -203,6 +232,177 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub): ...@@ -203,6 +232,177 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub):
return "{%s}" % s return "{%s}" % s
def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
    """A bit like make_loop, but where only the inner-most loop executes code.

    All the loops will be reordered so that the loops over the output tensor
    are executed with memory access as contiguous as possible.
    For instance, if the output tensor is c_contiguous, the inner-most loop
    will be on its rows; if it's f_contiguous, it will be on its columns.

    :param init_loop_orders: one entry per loop variable; each entry is a
        list of dimension indices (or 'x' at broadcasted positions), in the
        initial (unsorted) loop order.
    :param olv_index: index, among the loop variables, of the output tensor
        whose strides determine the new loop ordering.
    :param dtypes: list of C dtype names, one per loop variable.
    :param inner_task: C code to be executed in the inner-most loop.
    :param sub: dictionary mapping 'lv#' to a suitable C variable name.
    :returns: a string of C code implementing the reordered nested loops.
    """
    # Number of variables
    nvars = len(init_loop_orders)
    # Number of loops (dimensionality of the variables)
    nnested = len(init_loop_orders[0])

    # This is the var from which we'll get the loop order
    ovar = sub['lv%i' % olv_index]

    # The loops are ordered by (decreasing) absolute values of ovar's strides.
    # The first element of each pair is the absolute value of the stride.
    # The second element corresponds to the index in the initial loop order.
    order_loops = """
    std::vector< std::pair<int, int> > %(ovar)s_loops(%(nnested)i);
    std::vector< std::pair<int, int> >::iterator %(ovar)s_loops_it = %(ovar)s_loops.begin();
    """ % locals()

    # Fill the loop vector with the appropriate <stride, index> pairs
    for i, index in enumerate(init_loop_orders[olv_index]):
        if index != 'x':
            order_loops += """
    %(ovar)s_loops_it->first = abs(%(ovar)s->strides[%(index)i]);
    """ % locals()
        else:
            # Stride is 0 when dimension is broadcastable
            order_loops += """
    %(ovar)s_loops_it->first = 0;
    """ % locals()
        order_loops += """
    %(ovar)s_loops_it->second = %(i)i;
    ++%(ovar)s_loops_it;
    """ % locals()

    # We sort in decreasing order so that the outermost loop (loop 0)
    # has the largest stride, and the innermost loop (nnested - 1) has
    # the smallest stride.
    order_loops += """
    // rbegin and rend are reversed iterators, so this sorts in decreasing order
    std::sort(%(ovar)s_loops.rbegin(), %(ovar)s_loops.rend());
    """ % locals()

    # Get the (sorted) total number of iterations of each loop.
    # First, get totals in the initial order.
    # For each dimension, the tensors are either all broadcasted, in
    # which case there is only one iteration of the loop, or one or
    # more are not broadcasted, in which case the number of elements
    # of any of them will be equal to the number of iterations we have
    # to do.
    totals = []
    for i, candidates in enumerate(zip(*init_loop_orders)):
        for j, candidate in enumerate(candidates):
            if candidate != 'x':
                var = sub['lv%i' % j]
                total = "%(var)s_n%(candidate)s" % locals()
                break
        else:
            # All variables are broadcasted in this dimension
            total = '1'
        totals.append(total)

    declare_totals = """
    int init_totals[%(nnested)s] = {%(totals)s};
    """ % dict(nnested=nnested,
               totals=', '.join(totals))

    # Sort totals to match the new order that was computed by sorting
    # the loop vector. One integer variable per loop is declared.
    declare_totals += """
    %(ovar)s_loops_it = %(ovar)s_loops.begin();
    """ % locals()
    for i in range(nnested):
        declare_totals += """
    int TOTAL_%(i)i = init_totals[%(ovar)s_loops_it->second];
    ++%(ovar)s_loops_it;
    """ % locals()

    # Get sorted strides and jumps.
    # First, get strides in the initial order.
    def get_loop_strides(loop_order, i):
        """Return a list containing a C expression representing the
        stride for each dimension of the ith variable, in the
        specified loop_order."""
        var = sub["lv%i" % i]
        r = []
        for index in loop_order:
            # Note: the stride variable is not declared for broadcasted
            # variables
            if index != 'x':
                r.append("%(var)s_stride%(index)s" % locals())
            else:
                r.append('0')
        return r

    # We declare the initial strides as a 2D array, nvars x nnested
    declare_strides_jumps = """
    int init_strides[%(nvars)i][%(nnested)i] = {
    %(strides)s
    };""" % dict(
        nvars=nvars,
        nnested=nnested,
        strides=', \n'.join(
            ', '.join(get_loop_strides(lo, i))
            for i, lo in enumerate(init_loop_orders)
            if len(lo) > 0))

    # Declare (sorted) stride and jumps for each variable;
    # we iterate from innermost loop to outermost loop
    declare_strides_jumps += """
    std::vector< std::pair<int, int> >::reverse_iterator %(ovar)s_loops_rit;
    """ % locals()
    for i in range(nvars):
        var = sub["lv%i" % i]
        declare_strides_jumps += """
    %(ovar)s_loops_rit = %(ovar)s_loops.rbegin();""" % locals()
        # adjust is the total displacement performed by the loop inside
        # the current one; the jump resets it, then applies this loop's
        # stride (jump = stride - adjust)
        adjust = "0"
        for j in reversed(range(nnested)):
            jump = "(%s) - (%s)" % ("%(var)s_stride_l%(j)i" % locals(),
                                    adjust)
            declare_strides_jumps += """
    int %(var)s_stride_l%(j)i = init_strides[%(i)i][%(ovar)s_loops_rit->second];
    int %(var)s_jump_l%(j)i = %(jump)s;
    ++%(ovar)s_loops_rit;
    """ % locals()
            adjust = "TOTAL_%(j)i * %(var)s_stride_l%(j)i" % locals()

    # Make each iteration pointer start at the beginning of its data
    declare_iter = ""
    for i, dtype in enumerate(dtypes):
        var = sub["lv%i" % i]
        declare_iter += "%(var)s_iter = (%(dtype)s*)(%(var)s->data);\n" % locals()

    # Build the nested loops from the inside out, advancing every
    # iteration pointer by its jump at the end of each loop body
    loop = inner_task
    for i in reversed(range(nnested)):
        iterv = 'ITER_%i' % i
        total = 'TOTAL_%i' % i
        update = ''
        for j in range(nvars):
            var = sub["lv%i" % j]
            update += "%(var)s_iter += %(var)s_jump_l%(i)i;\n" % locals()
        loop = """
    for (int %(iterv)s = %(total)s; %(iterv)s; %(iterv)s--)
    { // begin loop %(i)i
    %(loop)s
    %(update)s
    } // end loop %(i)i
    """ % locals()

    return '\n'.join([
        '{',
        order_loops,
        declare_totals,
        declare_strides_jumps,
        declare_iter,
        loop,
        '}\n',
    ])
# print make_declare(((0, 1, 2, 3), ('x', 1, 0, 3), ('x', 'x', 'x', 0)), # print make_declare(((0, 1, 2, 3), ('x', 1, 0, 3), ('x', 'x', 'x', 0)),
# ('double', 'int', 'float'), # ('double', 'int', 'float'),
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论