Commit c7b416e1 authored by Ricardo, committed by Ricardo Vieira

Allow broadcasting in Elemwise c_code

This removes an inconsistency between NumPy and Aesara broadcasting rules, where a variable dimension with unknown shape was always assumed to be non-broadcastable (i.e., different from 1).
Parent b60cf724
...@@ -2307,6 +2307,8 @@ class Pow(BinaryScalarOp): ...@@ -2307,6 +2307,8 @@ class Pow(BinaryScalarOp):
if ( if (
node.inputs[0].type == node.outputs[0].type node.inputs[0].type == node.outputs[0].type
and node.inputs[1].type == node.outputs[0].type and node.inputs[1].type == node.outputs[0].type
and None not in node.inputs[0].type.shape
and None not in node.inputs[1].type.shape
and and
# amdlibm 3.0 do not have a float64 version of this SIMD function # amdlibm 3.0 do not have a float64 version of this SIMD function
node.inputs[0].dtype == "float32" node.inputs[0].dtype == "float32"
......
...@@ -913,7 +913,7 @@ second dimension ...@@ -913,7 +913,7 @@ second dimension
checks = cgen.make_checks(orders, idtypes, sub) checks = cgen.make_checks(orders, idtypes, sub)
# Check if all inputs (except broadcasted scalar) are fortran. # Check if all inputs (except broadcasted scalar) are fortran.
# In that case, create an fortran output ndarray. # In that case, create a fortran output ndarray.
z = list(zip(inames, inputs)) z = list(zip(inames, inputs))
alloc_fortran = " && ".join( alloc_fortran = " && ".join(
[ [
...@@ -1071,7 +1071,7 @@ second dimension ...@@ -1071,7 +1071,7 @@ second dimension
# If all inputs and outputs are contiguous # If all inputs and outputs are contiguous
# and the scalar op define optimized code for that case # and the scalar op define optimized code for that case
# use it! The scalar_op need to check the broadcast flag himself. # use it! The scalar_op needs to check the type-level shapes itself.
if ( if (
all(o.ndim >= 1 for o in node.outputs) all(o.ndim >= 1 for o in node.outputs)
and and
...@@ -1088,11 +1088,19 @@ second dimension ...@@ -1088,11 +1088,19 @@ second dimension
# compiler to vectorize the code as their won't be as # compiler to vectorize the code as their won't be as
# many ptr and the stride will be hard coded. # many ptr and the stride will be hard coded.
if all( if all(
[ # io.type.shape == node.outputs[1].type.shape
io.broadcastable == node.outputs[0].broadcastable # Elemwise does not specify non-broadcastable static/type-levelshape
or all(io.broadcastable) # information for its outputs yet
for io in node.inputs + node.outputs node.outputs[0].type.is_super(io.type)
] for io in node.inputs + node.outputs
) and (
len(node.inputs) <= 1
# If either one of the inputs has a `None` shape, we cannot
# assume they will have the same size
or all(
len(set(inp_shape)) == 1 and None not in inp_shape
for inp_shape in zip(*(inp.type.shape for inp in node.inputs))
)
): ):
z = onames[0] z = onames[0]
contig = f""" contig = f"""
...@@ -1188,7 +1196,7 @@ second dimension ...@@ -1188,7 +1196,7 @@ second dimension
return support_code return support_code
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
version = [13] # the version corresponding to the c code in this Op version = [14] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
scalar_node = Apply( scalar_node = Apply(
......
...@@ -66,10 +66,12 @@ def make_checks(loop_orders, dtypes, sub): ...@@ -66,10 +66,12 @@ def make_checks(loop_orders, dtypes, sub):
if index != "x": if index != "x":
# Initialize the variables associated to the jth loop # Initialize the variables associated to the jth loop
# jump = stride - adjust # jump = stride - adjust
# If the variable has size 1 in that dim, we set the stride to zero to
# emulate broadcasting
jump = f"({var}_stride{index}) - ({adjust})" jump = f"({var}_stride{index}) - ({adjust})"
init += f""" init += f"""
{var}_n{index} = PyArray_DIMS({var})[{index}]; {var}_n{index} = PyArray_DIMS({var})[{index}];
{var}_stride{index} = PyArray_STRIDES({var})[{index}] / sizeof({dtype}); {var}_stride{index} = ({var}_n{index} == 1)? 0 : PyArray_STRIDES({var})[{index}] / sizeof({dtype});
{var}_jump{index}_{j} = {jump}; {var}_jump{index}_{j} = {jump};
""" """
adjust = f"{var}_n{index}*{var}_stride{index}" adjust = f"{var}_n{index}*{var}_stride{index}"
...@@ -90,26 +92,85 @@ def make_checks(loop_orders, dtypes, sub): ...@@ -90,26 +92,85 @@ def make_checks(loop_orders, dtypes, sub):
# elements of to_compare are pairs ( input_variable_idx, input_variable_dim_idx ) # elements of to_compare are pairs ( input_variable_idx, input_variable_dim_idx )
if len(to_compare) < 2: if len(to_compare) < 2:
continue continue
j0, x0 = to_compare[0]
for (j, x) in to_compare[1:]: # Find first dimension size that is != 1
check += f""" jl, xl = to_compare[-1]
if (%(lv{j0})s_n{x0} != %(lv{j})s_n{x}) non1size_dim_check = f"""
npy_intp non1size_dim{xl};
non1size_dim{xl} = """
for (j, x) in to_compare[:-1]:
non1size_dim_check += f"(%(lv{j})s_n{x} != 1) ? %(lv{j})s_n{x} : "
non1size_dim_check += f"%(lv{jl})s_n{xl};"
check += non1size_dim_check
# Check the nonsize1 dims match
# TODO: This is a bit inefficient because we are comparing one dimension against itself
check += f"""
if (non1size_dim{xl} != 1)
{{ {{
PyErr_Format(PyExc_ValueError, "Input dimension mismatch. (input[%%i].shape[%%i] = %%lld, input[%%i].shape[%%i] = %%lld)", """
{j0}, for (j, x) in to_compare:
{x0}, check += f"""
(long long int) %(lv{j0})s_n{x0}, if ((%(lv{j})s_n{x} != non1size_dim{x}) && (%(lv{j})s_n{x} != 1))
{j}, {{
{x}, PyErr_Format(PyExc_ValueError, "Input dimension mismatch. One other input has shape[%%i] = %%lld, but input[%%i].shape[%%i] = %%lld.",
(long long int) %(lv{j})s_n{x} {x},
); (long long int) non1size_dim{x},
%(fail)s {j},
}} {x},
(long long int) %(lv{j})s_n{x}
);
%(fail)s
}}
""" """
check += """
}
"""
return init % sub + check % sub return init % sub + check % sub
def compute_broadcast_dimensions(array_name: str, loop_orders, sub) -> str:
    """Generate C code that computes the broadcasted dimensions of multiple arrays.

    This arises from `Elemwise` operations: for each output dimension the size
    is 1 when every input is broadcastable there; otherwise it is the size of
    any input whose run-time size is not 1.

    The returned code populates the C array `array_name` (one assignment per
    dimension) but does not declare or initialize it.

    Parameters
    ----------
    array_name
        Name of the C array to populate (e.g. ``"dims"``).
    loop_orders
        One loop order per input; each entry is a sequence whose elements are
        either ``"x"`` (a broadcasted/new dimension) or an input dimension
        index contributing a size.
    sub
        Mapping with ``"lv<idx>"`` keys giving the C variable name of each
        input, as used elsewhere in this module's generated code.

    Returns
    -------
    str
        C code consisting of one ``array_name[i] = ...;`` statement per
        output dimension.

    TODO: We can decide to either specialize C code even further given the input types
    or make it general, regardless of whether static broadcastable information is given
    """
    dims_c_code = ""
    for i, candidates in enumerate(zip(*loop_orders)):
        # TODO: Are candidates always either "x" or "i"? If that's the case we can
        # simplify some logic here (e.g., we don't need to track the `idx`).
        informative = [(idx, c) for idx, c in enumerate(candidates) if c != "x"]

        if not informative:
            # All inputs are known to be broadcastable in this dimension
            dims_c_code += f"{array_name}[{i}] = 1;\n"
        elif len(informative) == 1:
            # There is only one informative source of size
            # NOTE: `idx` is already an `int` from `enumerate`, so no cast is needed
            idx, candidate = informative[0]
            var = sub[f"lv{idx}"]
            dims_c_code += f"{array_name}[{i}] = {var}_n{candidate};\n"
        else:
            # Any input with a run-time size != 1 defines the right size; emit a
            # chain of C ternaries that falls back to the last candidate's size.
            dims_c_code += f"{array_name}[{i}] = "
            for idx, candidate in informative[:-1]:
                var = sub[f"lv{idx}"]
                dims_c_code += f"({var}_n{candidate} != 1)? {var}_n{candidate}: "
            idx, candidate = informative[-1]
            var = sub[f"lv{idx}"]
            dims_c_code += f"{var}_n{candidate};\n"
    return dims_c_code
def make_alloc(loop_orders, dtype, sub, fortran="0"): def make_alloc(loop_orders, dtype, sub, fortran="0"):
"""Generate C code to allocate outputs. """Generate C code to allocate outputs.
...@@ -125,20 +186,7 @@ def make_alloc(loop_orders, dtype, sub, fortran="0"): ...@@ -125,20 +186,7 @@ def make_alloc(loop_orders, dtype, sub, fortran="0"):
if type.startswith("AESARA_COMPLEX"): if type.startswith("AESARA_COMPLEX"):
type = type.replace("AESARA_COMPLEX", "NPY_COMPLEX") type = type.replace("AESARA_COMPLEX", "NPY_COMPLEX")
nd = len(loop_orders[0]) nd = len(loop_orders[0])
init_dims = "" init_dims = compute_broadcast_dimensions("dims", loop_orders, sub)
# For each dimension, the tensors are either all broadcasted, in
# which case the output will also be broadcastable (dimension =
# 1), or one or more are not broadcasted, in which case the number
# of elements of the output in that dimension will be equal to the
# number of elements of any of them.
for i, candidates in enumerate(zip(*loop_orders)):
for j, candidate in enumerate(candidates):
if candidate != "x":
var = sub[f"lv{int(j)}"]
init_dims += f"dims[{i}] = {var}_n{candidate};\n"
break
else:
init_dims += f"dims[{i}] = 1;\n"
# TODO: it would be interesting to allocate the output in such a # TODO: it would be interesting to allocate the output in such a
# way that its contiguous dimensions match one of the input's # way that its contiguous dimensions match one of the input's
...@@ -310,26 +358,8 @@ def make_reordered_loop( ...@@ -310,26 +358,8 @@ def make_reordered_loop(
""" """
# Get the (sorted) total number of iterations of each loop # Get the (sorted) total number of iterations of each loop
# Get totals in the initial order declare_totals = f"int init_totals[{nnested}];\n"
# For each dimension, the tensors are either all broadcasted, in declare_totals += compute_broadcast_dimensions("init_totals", init_loop_orders, sub)
# which case there is only one iteration of the loop, or one or
# more are not broadcasted, in which case the number of elements
# of any of them will be equal to the number of iterations we have
# to do.
totals = []
for i, candidates in enumerate(zip(*init_loop_orders)):
for j, candidate in enumerate(candidates):
if candidate != "x":
var = sub[f"lv{int(j)}"]
total = f"{var}_n{candidate}"
break
else:
total = "1"
totals.append(total)
declare_totals = f"""
int init_totals[{nnested}] = {{{", ".join(totals)}}};
"""
# Sort totals to match the new order that was computed by sorting # Sort totals to match the new order that was computed by sorting
# the loop vector. One integer variable per loop is declared. # the loop vector. One integer variable per loop is declared.
......
...@@ -206,77 +206,117 @@ class TestBroadcast: ...@@ -206,77 +206,117 @@ class TestBroadcast:
return np.asarray(np.random.random(shp), dtype=aesara.config.floatX) return np.asarray(np.random.random(shp), dtype=aesara.config.floatX)
def with_linker(self, linker, op, type, rand_val): def with_linker(self, linker, op, type, rand_val):
for xsh, ysh in [ for shape_info in ("complete", "only_broadcastable", "none"):
((3, 5), (3, 5)), for xsh, ysh in [
((3, 5), (1, 5)), ((3, 5), (3, 5)),
((3, 5), (3, 1)), ((3, 5), (1, 5)),
((1, 5), (5, 1)), ((3, 5), (3, 1)),
((1, 1), (1, 1)), ((1, 5), (5, 1)),
((self.openmp_minsize,), (self.openmp_minsize,)), ((1, 1), (1, 1)),
( ((self.openmp_minsize,), (self.openmp_minsize,)),
(self.openmp_minsize_sqrt, self.openmp_minsize_sqrt), (
(self.openmp_minsize_sqrt, self.openmp_minsize_sqrt), (self.openmp_minsize_sqrt, self.openmp_minsize_sqrt),
), (self.openmp_minsize_sqrt, self.openmp_minsize_sqrt),
((2, 3, 4, 5), (2, 3, 4, 5)), ),
((2, 3, 4, 5), (1, 3, 1, 5)), ((2, 3, 4, 5), (2, 3, 4, 5)),
((2, 3, 4, 5), (1, 1, 1, 1)), ((2, 3, 4, 5), (1, 3, 1, 5)),
((), ()), ((2, 3, 4, 5), (1, 1, 1, 1)),
]: ((), ()),
x = type(aesara.config.floatX, [(entry == 1) for entry in xsh])("x") ]:
y = type(aesara.config.floatX, [(entry == 1) for entry in ysh])("y") if shape_info == "complete":
e = op(aes.add)(x, y) x_type = type(aesara.config.floatX, shape=xsh)
f = make_function(copy(linker).accept(FunctionGraph([x, y], [e]))) y_type = type(aesara.config.floatX, shape=ysh)
xv = rand_val(xsh) elif shape_info == "only_broadcastable":
yv = rand_val(ysh) # This condition is here for backwards compatibility, when the only
zv = xv + yv # type shape provided by Aesara was broadcastable/non-broadcastable
x_type = type(
unittest_tools.assert_allclose(f(xv, yv), zv) aesara.config.floatX,
broadcastable=[(entry == 1) for entry in xsh],
)
y_type = type(
aesara.config.floatX,
broadcastable=[(entry == 1) for entry in ysh],
)
else:
x_type = type(aesara.config.floatX, shape=[None for _ in xsh])
y_type = type(aesara.config.floatX, shape=[None for _ in ysh])
# test Elemwise.infer_shape x = x_type("x")
# the Shape op don't implement c_code! y = y_type("y")
if isinstance(linker, PerformLinker):
x = type(aesara.config.floatX, [(entry == 1) for entry in xsh])("x")
y = type(aesara.config.floatX, [(entry == 1) for entry in ysh])("y")
e = op(aes.add)(x, y) e = op(aes.add)(x, y)
f = make_function(copy(linker).accept(FunctionGraph([x, y], [e.shape]))) f = make_function(copy(linker).accept(FunctionGraph([x, y], [e])))
assert tuple(f(xv, yv)) == tuple(zv.shape) xv = rand_val(xsh)
yv = rand_val(ysh)
zv = xv + yv
def with_linker_inplace(self, linker, op, type, rand_val): unittest_tools.assert_allclose(f(xv, yv), zv)
for xsh, ysh in [
((5, 5), (5, 5)),
((5, 5), (1, 5)),
((5, 5), (5, 1)),
((1, 1), (1, 1)),
((2, 3, 4, 5), (2, 3, 4, 5)),
((2, 3, 4, 5), (1, 3, 1, 5)),
((2, 3, 4, 5), (1, 1, 1, 1)),
((), ()),
]:
x = type(aesara.config.floatX, [(entry == 1) for entry in xsh])("x")
y = type(aesara.config.floatX, [(entry == 1) for entry in ysh])("y")
e = op(aes.Add(aes.transfer_type(0)), {0: 0})(x, y)
f = make_function(copy(linker).accept(FunctionGraph([x, y], [e])))
xv = rand_val(xsh)
yv = rand_val(ysh)
zv = xv + yv
f(xv, yv) # test Elemwise.infer_shape
# the Shape op don't implement c_code!
if isinstance(linker, PerformLinker):
x = x_type("x")
y = y_type("y")
e = op(aes.add)(x, y)
f = make_function(
copy(linker).accept(FunctionGraph([x, y], [e.shape]))
)
assert tuple(f(xv, yv)) == tuple(zv.shape)
assert (xv == zv).all() def with_linker_inplace(self, linker, op, type, rand_val):
# test Elemwise.infer_shape for shape_info in ("complete", "only_broadcastable", "none"):
# the Shape op don't implement c_code! for xsh, ysh in [
if isinstance(linker, PerformLinker): ((5, 5), (5, 5)),
x = type(aesara.config.floatX, [(entry == 1) for entry in xsh])("x") ((5, 5), (1, 5)),
y = type(aesara.config.floatX, [(entry == 1) for entry in ysh])("y") ((5, 5), (5, 1)),
((1, 1), (1, 1)),
((2, 3, 4, 5), (2, 3, 4, 5)),
((2, 3, 4, 5), (1, 3, 1, 5)),
((2, 3, 4, 5), (1, 1, 1, 1)),
((), ()),
]:
if shape_info == "complete":
x_type = type(aesara.config.floatX, shape=xsh)
y_type = type(aesara.config.floatX, shape=ysh)
elif shape_info == "only_broadcastable":
# This condition is here for backwards compatibility, when the only
# type shape provided by Aesara was broadcastable/non-broadcastable
x_type = type(
aesara.config.floatX,
broadcastable=[(entry == 1) for entry in xsh],
)
y_type = type(
aesara.config.floatX,
broadcastable=[(entry == 1) for entry in ysh],
)
else:
x_type = type(aesara.config.floatX, shape=[None for _ in xsh])
y_type = type(aesara.config.floatX, shape=[None for _ in ysh])
x = x_type("x")
y = y_type("y")
e = op(aes.Add(aes.transfer_type(0)), {0: 0})(x, y) e = op(aes.Add(aes.transfer_type(0)), {0: 0})(x, y)
f = make_function(copy(linker).accept(FunctionGraph([x, y], [e.shape]))) f = make_function(copy(linker).accept(FunctionGraph([x, y], [e])))
xv = rand_val(xsh) xv = rand_val(xsh)
yv = rand_val(ysh) yv = rand_val(ysh)
zv = xv + yv zv = xv + yv
f(xv, yv) f(xv, yv)
assert xv.shape == zv.shape assert (xv == zv).all()
# test Elemwise.infer_shape
# the Shape op don't implement c_code!
if isinstance(linker, PerformLinker):
x = x_type("x")
y = y_type("y")
e = op(aes.Add(aes.transfer_type(0)), {0: 0})(x, y)
f = make_function(
copy(linker).accept(FunctionGraph([x, y], [e.shape]))
)
xv = rand_val(xsh)
yv = rand_val(ysh)
zv = xv + yv
assert xv.shape == zv.shape
assert tuple(f(xv, yv)) == zv.shape
def test_perform(self): def test_perform(self):
self.with_linker(PerformLinker(), self.op, self.type, self.rand_val) self.with_linker(PerformLinker(), self.op, self.type, self.rand_val)
...@@ -746,10 +786,6 @@ class TestElemwise(unittest_tools.InferShapeTester): ...@@ -746,10 +786,6 @@ class TestElemwise(unittest_tools.InferShapeTester):
def test_input_dimensions_match_python(self): def test_input_dimensions_match_python(self):
self.check_input_dimensions_match(Mode(linker="py")) self.check_input_dimensions_match(Mode(linker="py"))
@pytest.mark.xfail(
reason="Elemwise C implementation does not broadcast parameters",
exception=ValueError,
)
@pytest.mark.skipif( @pytest.mark.skipif(
not aesara.config.cxx, reason="G++ not available, so we need to skip this test." not aesara.config.cxx, reason="G++ not available, so we need to skip this test."
) )
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to comment