Commit c7b416e1 authored by Ricardo, committed by Ricardo Vieira

Allow broadcasting in Elemwise c_code

This removes an inconsistency between NumPy and Aesara broadcasting rules, where a variable dimension with unknown shape was always assumed to be non-broadcastable (i.e., different from 1).
Parent b60cf724
...@@ -2307,6 +2307,8 @@ class Pow(BinaryScalarOp): ...@@ -2307,6 +2307,8 @@ class Pow(BinaryScalarOp):
if ( if (
node.inputs[0].type == node.outputs[0].type node.inputs[0].type == node.outputs[0].type
and node.inputs[1].type == node.outputs[0].type and node.inputs[1].type == node.outputs[0].type
and None not in node.inputs[0].type.shape
and None not in node.inputs[1].type.shape
and and
# amdlibm 3.0 do not have a float64 version of this SIMD function # amdlibm 3.0 do not have a float64 version of this SIMD function
node.inputs[0].dtype == "float32" node.inputs[0].dtype == "float32"
......
...@@ -913,7 +913,7 @@ second dimension ...@@ -913,7 +913,7 @@ second dimension
checks = cgen.make_checks(orders, idtypes, sub) checks = cgen.make_checks(orders, idtypes, sub)
# Check if all inputs (except broadcasted scalar) are fortran. # Check if all inputs (except broadcasted scalar) are fortran.
# In that case, create an fortran output ndarray. # In that case, create a fortran output ndarray.
z = list(zip(inames, inputs)) z = list(zip(inames, inputs))
alloc_fortran = " && ".join( alloc_fortran = " && ".join(
[ [
...@@ -1071,7 +1071,7 @@ second dimension ...@@ -1071,7 +1071,7 @@ second dimension
# If all inputs and outputs are contiguous # If all inputs and outputs are contiguous
# and the scalar op define optimized code for that case # and the scalar op define optimized code for that case
# use it! The scalar_op need to check the broadcast flag himself. # use it! The scalar_op needs to check the type-level shapes itself.
if ( if (
all(o.ndim >= 1 for o in node.outputs) all(o.ndim >= 1 for o in node.outputs)
and and
...@@ -1088,11 +1088,19 @@ second dimension ...@@ -1088,11 +1088,19 @@ second dimension
# compiler to vectorize the code as their won't be as # compiler to vectorize the code as their won't be as
# many ptr and the stride will be hard coded. # many ptr and the stride will be hard coded.
if all( if all(
[ # io.type.shape == node.outputs[1].type.shape
io.broadcastable == node.outputs[0].broadcastable # Elemwise does not specify non-broadcastable static/type-levelshape
or all(io.broadcastable) # information for its outputs yet
for io in node.inputs + node.outputs node.outputs[0].type.is_super(io.type)
] for io in node.inputs + node.outputs
) and (
len(node.inputs) <= 1
# If either one of the inputs has a `None` shape, we cannot
# assume they will have the same size
or all(
len(set(inp_shape)) == 1 and None not in inp_shape
for inp_shape in zip(*(inp.type.shape for inp in node.inputs))
)
): ):
z = onames[0] z = onames[0]
contig = f""" contig = f"""
...@@ -1188,7 +1196,7 @@ second dimension ...@@ -1188,7 +1196,7 @@ second dimension
return support_code return support_code
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
version = [13] # the version corresponding to the c code in this Op version = [14] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
scalar_node = Apply( scalar_node = Apply(
......
...@@ -66,10 +66,12 @@ def make_checks(loop_orders, dtypes, sub): ...@@ -66,10 +66,12 @@ def make_checks(loop_orders, dtypes, sub):
if index != "x": if index != "x":
# Initialize the variables associated to the jth loop # Initialize the variables associated to the jth loop
# jump = stride - adjust # jump = stride - adjust
# If the variable has size 1 in that dim, we set the stride to zero to
# emulate broadcasting
jump = f"({var}_stride{index}) - ({adjust})" jump = f"({var}_stride{index}) - ({adjust})"
init += f""" init += f"""
{var}_n{index} = PyArray_DIMS({var})[{index}]; {var}_n{index} = PyArray_DIMS({var})[{index}];
{var}_stride{index} = PyArray_STRIDES({var})[{index}] / sizeof({dtype}); {var}_stride{index} = ({var}_n{index} == 1)? 0 : PyArray_STRIDES({var})[{index}] / sizeof({dtype});
{var}_jump{index}_{j} = {jump}; {var}_jump{index}_{j} = {jump};
""" """
adjust = f"{var}_n{index}*{var}_stride{index}" adjust = f"{var}_n{index}*{var}_stride{index}"
...@@ -90,26 +92,85 @@ def make_checks(loop_orders, dtypes, sub): ...@@ -90,26 +92,85 @@ def make_checks(loop_orders, dtypes, sub):
# elements of to_compare are pairs ( input_variable_idx, input_variable_dim_idx ) # elements of to_compare are pairs ( input_variable_idx, input_variable_dim_idx )
if len(to_compare) < 2: if len(to_compare) < 2:
continue continue
j0, x0 = to_compare[0]
for (j, x) in to_compare[1:]: # Find first dimension size that is != 1
check += f""" jl, xl = to_compare[-1]
if (%(lv{j0})s_n{x0} != %(lv{j})s_n{x}) non1size_dim_check = f"""
npy_intp non1size_dim{xl};
non1size_dim{xl} = """
for (j, x) in to_compare[:-1]:
non1size_dim_check += f"(%(lv{j})s_n{x} != 1) ? %(lv{j})s_n{x} : "
non1size_dim_check += f"%(lv{jl})s_n{xl};"
check += non1size_dim_check
# Check the nonsize1 dims match
# TODO: This is a bit inefficient because we are comparing one dimension against itself
check += f"""
if (non1size_dim{xl} != 1)
{{ {{
PyErr_Format(PyExc_ValueError, "Input dimension mismatch. (input[%%i].shape[%%i] = %%lld, input[%%i].shape[%%i] = %%lld)", """
{j0}, for (j, x) in to_compare:
{x0}, check += f"""
(long long int) %(lv{j0})s_n{x0}, if ((%(lv{j})s_n{x} != non1size_dim{x}) && (%(lv{j})s_n{x} != 1))
{j}, {{
{x}, PyErr_Format(PyExc_ValueError, "Input dimension mismatch. One other input has shape[%%i] = %%lld, but input[%%i].shape[%%i] = %%lld.",
(long long int) %(lv{j})s_n{x} {x},
); (long long int) non1size_dim{x},
%(fail)s {j},
}} {x},
(long long int) %(lv{j})s_n{x}
);
%(fail)s
}}
""" """
check += """
}
"""
return init % sub + check % sub return init % sub + check % sub
def compute_broadcast_dimensions(array_name: str, loop_orders, sub) -> str:
    """Generate C code that computes the broadcasted dimensions of multiple arrays.

    This arises from `Elemwise` operations: for each output dimension the size
    is 1 when every input is broadcastable there; otherwise it is the size of
    any input whose run-time size is not 1.

    The returned code populates the C array `array_name` (one assignment per
    dimension) but does not declare or initialize it.

    Parameters
    ----------
    array_name
        Name of the C array to populate (e.g. ``"dims"``).
    loop_orders
        One loop order per input; each entry is a sequence whose elements are
        either ``"x"`` (a broadcasted/new dimension) or an input dimension
        index contributing a size.
    sub
        Mapping with ``"lv<idx>"`` keys giving the C variable name of each
        input, as used elsewhere in this module's generated code.

    Returns
    -------
    str
        C code consisting of one ``array_name[i] = ...;`` statement per
        output dimension.

    TODO: We can decide to either specialize C code even further given the input types
    or make it general, regardless of whether static broadcastable information is given
    """
    dims_c_code = ""
    for i, candidates in enumerate(zip(*loop_orders)):
        # TODO: Are candidates always either "x" or "i"? If that's the case we can
        # simplify some logic here (e.g., we don't need to track the `idx`).
        informative = [(idx, c) for idx, c in enumerate(candidates) if c != "x"]

        if not informative:
            # All inputs are known to be broadcastable in this dimension
            dims_c_code += f"{array_name}[{i}] = 1;\n"
        elif len(informative) == 1:
            # There is only one informative source of size
            # NOTE: `idx` is already an `int` from `enumerate`, so no cast is needed
            idx, candidate = informative[0]
            var = sub[f"lv{idx}"]
            dims_c_code += f"{array_name}[{i}] = {var}_n{candidate};\n"
        else:
            # Any input with a run-time size != 1 defines the right size; emit a
            # chain of C ternaries that falls back to the last candidate's size.
            dims_c_code += f"{array_name}[{i}] = "
            for idx, candidate in informative[:-1]:
                var = sub[f"lv{idx}"]
                dims_c_code += f"({var}_n{candidate} != 1)? {var}_n{candidate}: "
            idx, candidate = informative[-1]
            var = sub[f"lv{idx}"]
            dims_c_code += f"{var}_n{candidate};\n"
    return dims_c_code
def make_alloc(loop_orders, dtype, sub, fortran="0"): def make_alloc(loop_orders, dtype, sub, fortran="0"):
"""Generate C code to allocate outputs. """Generate C code to allocate outputs.
...@@ -125,20 +186,7 @@ def make_alloc(loop_orders, dtype, sub, fortran="0"): ...@@ -125,20 +186,7 @@ def make_alloc(loop_orders, dtype, sub, fortran="0"):
if type.startswith("AESARA_COMPLEX"): if type.startswith("AESARA_COMPLEX"):
type = type.replace("AESARA_COMPLEX", "NPY_COMPLEX") type = type.replace("AESARA_COMPLEX", "NPY_COMPLEX")
nd = len(loop_orders[0]) nd = len(loop_orders[0])
init_dims = "" init_dims = compute_broadcast_dimensions("dims", loop_orders, sub)
# For each dimension, the tensors are either all broadcasted, in
# which case the output will also be broadcastable (dimension =
# 1), or one or more are not broadcasted, in which case the number
# of elements of the output in that dimension will be equal to the
# number of elements of any of them.
for i, candidates in enumerate(zip(*loop_orders)):
for j, candidate in enumerate(candidates):
if candidate != "x":
var = sub[f"lv{int(j)}"]
init_dims += f"dims[{i}] = {var}_n{candidate};\n"
break
else:
init_dims += f"dims[{i}] = 1;\n"
# TODO: it would be interesting to allocate the output in such a # TODO: it would be interesting to allocate the output in such a
# way that its contiguous dimensions match one of the input's # way that its contiguous dimensions match one of the input's
...@@ -310,26 +358,8 @@ def make_reordered_loop( ...@@ -310,26 +358,8 @@ def make_reordered_loop(
""" """
# Get the (sorted) total number of iterations of each loop # Get the (sorted) total number of iterations of each loop
# Get totals in the initial order declare_totals = f"int init_totals[{nnested}];\n"
# For each dimension, the tensors are either all broadcasted, in declare_totals += compute_broadcast_dimensions("init_totals", init_loop_orders, sub)
# which case there is only one iteration of the loop, or one or
# more are not broadcasted, in which case the number of elements
# of any of them will be equal to the number of iterations we have
# to do.
totals = []
for i, candidates in enumerate(zip(*init_loop_orders)):
for j, candidate in enumerate(candidates):
if candidate != "x":
var = sub[f"lv{int(j)}"]
total = f"{var}_n{candidate}"
break
else:
total = "1"
totals.append(total)
declare_totals = f"""
int init_totals[{nnested}] = {{{", ".join(totals)}}};
"""
# Sort totals to match the new order that was computed by sorting # Sort totals to match the new order that was computed by sorting
# the loop vector. One integer variable per loop is declared. # the loop vector. One integer variable per loop is declared.
......
...@@ -206,77 +206,117 @@ class TestBroadcast: ...@@ -206,77 +206,117 @@ class TestBroadcast:
return np.asarray(np.random.random(shp), dtype=aesara.config.floatX) return np.asarray(np.random.random(shp), dtype=aesara.config.floatX)
def with_linker(self, linker, op, type, rand_val): def with_linker(self, linker, op, type, rand_val):
for xsh, ysh in [ for shape_info in ("complete", "only_broadcastable", "none"):
((3, 5), (3, 5)), for xsh, ysh in [
((3, 5), (1, 5)), ((3, 5), (3, 5)),
((3, 5), (3, 1)), ((3, 5), (1, 5)),
((1, 5), (5, 1)), ((3, 5), (3, 1)),
((1, 1), (1, 1)), ((1, 5), (5, 1)),
((self.openmp_minsize,), (self.openmp_minsize,)), ((1, 1), (1, 1)),
( ((self.openmp_minsize,), (self.openmp_minsize,)),
(self.openmp_minsize_sqrt, self.openmp_minsize_sqrt), (
(self.openmp_minsize_sqrt, self.openmp_minsize_sqrt), (self.openmp_minsize_sqrt, self.openmp_minsize_sqrt),
), (self.openmp_minsize_sqrt, self.openmp_minsize_sqrt),
((2, 3, 4, 5), (2, 3, 4, 5)), ),
((2, 3, 4, 5), (1, 3, 1, 5)), ((2, 3, 4, 5), (2, 3, 4, 5)),
((2, 3, 4, 5), (1, 1, 1, 1)), ((2, 3, 4, 5), (1, 3, 1, 5)),
((), ()), ((2, 3, 4, 5), (1, 1, 1, 1)),
]: ((), ()),
x = type(aesara.config.floatX, [(entry == 1) for entry in xsh])("x") ]:
y = type(aesara.config.floatX, [(entry == 1) for entry in ysh])("y") if shape_info == "complete":
e = op(aes.add)(x, y) x_type = type(aesara.config.floatX, shape=xsh)
f = make_function(copy(linker).accept(FunctionGraph([x, y], [e]))) y_type = type(aesara.config.floatX, shape=ysh)
xv = rand_val(xsh) elif shape_info == "only_broadcastable":
yv = rand_val(ysh) # This condition is here for backwards compatibility, when the only
zv = xv + yv # type shape provided by Aesara was broadcastable/non-broadcastable
x_type = type(
unittest_tools.assert_allclose(f(xv, yv), zv) aesara.config.floatX,
broadcastable=[(entry == 1) for entry in xsh],
)
y_type = type(
aesara.config.floatX,
broadcastable=[(entry == 1) for entry in ysh],
)
else:
x_type = type(aesara.config.floatX, shape=[None for _ in xsh])
y_type = type(aesara.config.floatX, shape=[None for _ in ysh])
# test Elemwise.infer_shape x = x_type("x")
# the Shape op don't implement c_code! y = y_type("y")
if isinstance(linker, PerformLinker):
x = type(aesara.config.floatX, [(entry == 1) for entry in xsh])("x")
y = type(aesara.config.floatX, [(entry == 1) for entry in ysh])("y")
e = op(aes.add)(x, y) e = op(aes.add)(x, y)
f = make_function(copy(linker).accept(FunctionGraph([x, y], [e.shape]))) f = make_function(copy(linker).accept(FunctionGraph([x, y], [e])))
assert tuple(f(xv, yv)) == tuple(zv.shape) xv = rand_val(xsh)
yv = rand_val(ysh)
zv = xv + yv
def with_linker_inplace(self, linker, op, type, rand_val): unittest_tools.assert_allclose(f(xv, yv), zv)
for xsh, ysh in [
((5, 5), (5, 5)),
((5, 5), (1, 5)),
((5, 5), (5, 1)),
((1, 1), (1, 1)),
((2, 3, 4, 5), (2, 3, 4, 5)),
((2, 3, 4, 5), (1, 3, 1, 5)),
((2, 3, 4, 5), (1, 1, 1, 1)),
((), ()),
]:
x = type(aesara.config.floatX, [(entry == 1) for entry in xsh])("x")
y = type(aesara.config.floatX, [(entry == 1) for entry in ysh])("y")
e = op(aes.Add(aes.transfer_type(0)), {0: 0})(x, y)
f = make_function(copy(linker).accept(FunctionGraph([x, y], [e])))
xv = rand_val(xsh)
yv = rand_val(ysh)
zv = xv + yv
f(xv, yv) # test Elemwise.infer_shape
# the Shape op don't implement c_code!
if isinstance(linker, PerformLinker):
x = x_type("x")
y = y_type("y")
e = op(aes.add)(x, y)
f = make_function(
copy(linker).accept(FunctionGraph([x, y], [e.shape]))
)
assert tuple(f(xv, yv)) == tuple(zv.shape)
assert (xv == zv).all() def with_linker_inplace(self, linker, op, type, rand_val):
# test Elemwise.infer_shape for shape_info in ("complete", "only_broadcastable", "none"):
# the Shape op don't implement c_code! for xsh, ysh in [
if isinstance(linker, PerformLinker): ((5, 5), (5, 5)),
x = type(aesara.config.floatX, [(entry == 1) for entry in xsh])("x") ((5, 5), (1, 5)),
y = type(aesara.config.floatX, [(entry == 1) for entry in ysh])("y") ((5, 5), (5, 1)),
((1, 1), (1, 1)),
((2, 3, 4, 5), (2, 3, 4, 5)),
((2, 3, 4, 5), (1, 3, 1, 5)),
((2, 3, 4, 5), (1, 1, 1, 1)),
((), ()),
]:
if shape_info == "complete":
x_type = type(aesara.config.floatX, shape=xsh)
y_type = type(aesara.config.floatX, shape=ysh)
elif shape_info == "only_broadcastable":
# This condition is here for backwards compatibility, when the only
# type shape provided by Aesara was broadcastable/non-broadcastable
x_type = type(
aesara.config.floatX,
broadcastable=[(entry == 1) for entry in xsh],
)
y_type = type(
aesara.config.floatX,
broadcastable=[(entry == 1) for entry in ysh],
)
else:
x_type = type(aesara.config.floatX, shape=[None for _ in xsh])
y_type = type(aesara.config.floatX, shape=[None for _ in ysh])
x = x_type("x")
y = y_type("y")
e = op(aes.Add(aes.transfer_type(0)), {0: 0})(x, y) e = op(aes.Add(aes.transfer_type(0)), {0: 0})(x, y)
f = make_function(copy(linker).accept(FunctionGraph([x, y], [e.shape]))) f = make_function(copy(linker).accept(FunctionGraph([x, y], [e])))
xv = rand_val(xsh) xv = rand_val(xsh)
yv = rand_val(ysh) yv = rand_val(ysh)
zv = xv + yv zv = xv + yv
f(xv, yv) f(xv, yv)
assert xv.shape == zv.shape assert (xv == zv).all()
# test Elemwise.infer_shape
# the Shape op don't implement c_code!
if isinstance(linker, PerformLinker):
x = x_type("x")
y = y_type("y")
e = op(aes.Add(aes.transfer_type(0)), {0: 0})(x, y)
f = make_function(
copy(linker).accept(FunctionGraph([x, y], [e.shape]))
)
xv = rand_val(xsh)
yv = rand_val(ysh)
zv = xv + yv
assert xv.shape == zv.shape
assert tuple(f(xv, yv)) == zv.shape
def test_perform(self): def test_perform(self):
self.with_linker(PerformLinker(), self.op, self.type, self.rand_val) self.with_linker(PerformLinker(), self.op, self.type, self.rand_val)
...@@ -746,10 +786,6 @@ class TestElemwise(unittest_tools.InferShapeTester): ...@@ -746,10 +786,6 @@ class TestElemwise(unittest_tools.InferShapeTester):
def test_input_dimensions_match_python(self): def test_input_dimensions_match_python(self):
self.check_input_dimensions_match(Mode(linker="py")) self.check_input_dimensions_match(Mode(linker="py"))
@pytest.mark.xfail(
reason="Elemwise C implementation does not broadcast parameters",
exception=ValueError,
)
@pytest.mark.skipif( @pytest.mark.skipif(
not aesara.config.cxx, reason="G++ not available, so we need to skip this test." not aesara.config.cxx, reason="G++ not available, so we need to skip this test."
) )
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to comment