提交 e752fc3d authored 作者: Ricardo Vieira's avatar Ricardo Vieira 提交者: Ricardo Vieira

CAReduce loop reordering C-impl

上级 00a8a883
from copy import copy from copy import copy
from textwrap import dedent
import numpy as np import numpy as np
from numpy.core.numeric import normalize_axis_tuple from numpy.core.numeric import normalize_axis_tuple
...@@ -1448,15 +1449,16 @@ class CAReduce(COp): ...@@ -1448,15 +1449,16 @@ class CAReduce(COp):
return ((),) return ((),)
return ([ishape[i] for i in range(node.inputs[0].type.ndim) if i not in axis],) return ([ishape[i] for i in range(node.inputs[0].type.ndim) if i not in axis],)
def _c_all(self, node, name, inames, onames, sub): def _c_all(self, node, name, input_names, output_names, sub):
input = node.inputs[0] [inp] = node.inputs
output = node.outputs[0] [out] = node.outputs
ndim = inp.type.ndim
iname = inames[0] [inp_name] = input_names
oname = onames[0] [out_name] = output_names
idtype = input.type.dtype_specs()[1] inp_dtype = inp.type.dtype_specs()[1]
odtype = output.type.dtype_specs()[1] out_dtype = out.type.dtype_specs()[1]
acc_dtype = getattr(self, "acc_dtype", None) acc_dtype = getattr(self, "acc_dtype", None)
...@@ -1464,100 +1466,97 @@ class CAReduce(COp): ...@@ -1464,100 +1466,97 @@ class CAReduce(COp):
if acc_dtype == "float16": if acc_dtype == "float16":
raise MethodNotDefined("no c_code for float16") raise MethodNotDefined("no c_code for float16")
acc_type = TensorType(shape=node.outputs[0].type.shape, dtype=acc_dtype) acc_type = TensorType(shape=node.outputs[0].type.shape, dtype=acc_dtype)
adtype = acc_type.dtype_specs()[1] acc_dtype = acc_type.dtype_specs()[1]
else: else:
adtype = odtype acc_dtype = out_dtype
axis = self.axis axis = self.axis
if axis is None: if axis is None:
axis = list(range(input.type.ndim)) axis = list(range(inp.type.ndim))
if len(axis) == 0: if len(axis) == 0:
# This is just an Elemwise cast operation
# The acc_dtype is never a downcast compared to the input dtype # The acc_dtype is never a downcast compared to the input dtype
# So we just need a cast to the output dtype. # So we just need a cast to the output dtype.
var = pytensor.tensor.basic.cast(input, node.outputs[0].dtype) var = pytensor.tensor.basic.cast(inp, node.outputs[0].dtype)
if var is input: if var is inp:
var = Elemwise(scalar_identity)(input) var = Elemwise(scalar_identity)(inp)
assert var.dtype == node.outputs[0].dtype assert var.dtype == node.outputs[0].dtype
return var.owner.op._c_all(var.owner, name, inames, onames, sub) return var.owner.op._c_all(var.owner, name, input_names, output_names, sub)
order1 = [i for i in range(input.type.ndim) if i not in axis]
order = order1 + list(axis)
nnested = len(order1) inp_dims = list(range(ndim))
non_reduced_dims = [i for i in inp_dims if i not in axis]
counter = iter(range(ndim))
acc_dims = ["x" if i in axis else next(counter) for i in range(ndim)]
sub = dict(sub) sub = sub.copy()
for i, (input, iname) in enumerate(zip(node.inputs, inames)): sub["lv0"] = inp_name
sub[f"lv{i}"] = iname sub["lv1"] = out_name
sub["olv"] = out_name
decl = "" if acc_dtype != out_dtype:
if adtype != odtype:
# Create an accumulator variable different from the output # Create an accumulator variable different from the output
aname = "acc" acc_name = "acc"
decl = acc_type.c_declare(aname, sub) setup = acc_type.c_declare(acc_name, sub) + acc_type.c_init(acc_name, sub)
decl += acc_type.c_init(aname, sub)
else: else:
# the output is the accumulator variable # the output is the accumulator variable
aname = oname acc_name = out_name
setup = ""
decl += cgen.make_declare([order], [idtype], sub)
checks = cgen.make_checks([order], [idtype], sub) # Define strides of input array
setup += cgen.make_declare(
alloc = "" [inp_dims], [inp_dtype], sub, compute_stride_jump=False
i += 1 ) + cgen.make_checks([inp_dims], [inp_dtype], sub, compute_stride_jump=False)
sub[f"lv{i}"] = oname
sub["olv"] = oname # Define strides of output array and allocate it
out_sub = sub | {"lv0": out_name}
# Allocate output buffer alloc = (
alloc += cgen.make_declare( cgen.make_declare(
[list(range(nnested)) + ["x"] * len(axis)], [odtype], dict(sub, lv0=oname) [acc_dims], [out_dtype], out_sub, compute_stride_jump=False
) )
alloc += cgen.make_alloc([order1], odtype, sub) + cgen.make_alloc([non_reduced_dims], out_dtype, sub)
alloc += cgen.make_checks( + cgen.make_checks(
[list(range(nnested)) + ["x"] * len(axis)], [odtype], dict(sub, lv0=oname) [acc_dims], [out_dtype], out_sub, compute_stride_jump=False
)
) )
if adtype != odtype: if acc_dtype != out_dtype:
# Allocate accumulation buffer # Define strides of accumulation buffer and allocate it
sub[f"lv{i}"] = aname sub["lv1"] = acc_name
sub["olv"] = aname sub["olv"] = acc_name
alloc += cgen.make_declare( acc_sub = sub | {"lv0": acc_name}
[list(range(nnested)) + ["x"] * len(axis)], alloc += (
[adtype], cgen.make_declare(
dict(sub, lv0=aname), [acc_dims], [acc_dtype], acc_sub, compute_stride_jump=False
) )
alloc += cgen.make_alloc([order1], adtype, sub) + cgen.make_alloc([non_reduced_dims], acc_dtype, sub)
alloc += cgen.make_checks( + cgen.make_checks(
[list(range(nnested)) + ["x"] * len(axis)], [acc_dims], [acc_dtype], acc_sub, compute_stride_jump=False
[adtype], )
dict(sub, lv0=aname),
) )
identity = self.scalar_op.identity identity = self.scalar_op.identity
if np.isposinf(identity): if np.isposinf(identity):
if input.type.dtype in ("float32", "float64"): if inp.type.dtype in ("float32", "float64"):
identity = "__builtin_inf()" identity = "__builtin_inf()"
elif input.type.dtype.startswith("uint") or input.type.dtype == "bool": elif inp.type.dtype.startswith("uint") or inp.type.dtype == "bool":
identity = "1" identity = "1"
else: else:
identity = "NPY_MAX_" + str(input.type.dtype).upper() identity = "NPY_MAX_" + str(inp.type.dtype).upper()
elif np.isneginf(identity): elif np.isneginf(identity):
if input.type.dtype in ("float32", "float64"): if inp.type.dtype in ("float32", "float64"):
identity = "-__builtin_inf()" identity = "-__builtin_inf()"
elif input.type.dtype.startswith("uint") or input.type.dtype == "bool": elif inp.type.dtype.startswith("uint") or inp.type.dtype == "bool":
identity = "0" identity = "0"
else: else:
identity = "NPY_MIN_" + str(input.type.dtype).upper() identity = "NPY_MIN_" + str(inp.type.dtype).upper()
elif identity is None: elif identity is None:
raise TypeError(f"The {self.scalar_op} does not define an identity.") raise TypeError(f"The {self.scalar_op} does not define an identity.")
task0_decl = f"{adtype}& {aname}_i = *{aname}_iter;\n{aname}_i = {identity};" initial_value = f"{acc_name}_i = {identity};"
task1_decl = f"{idtype}& {inames[0]}_i = *{inames[0]}_iter;\n"
task1_code = self.scalar_op.c_code( inner_task = self.scalar_op.c_code(
Apply( Apply(
self.scalar_op, self.scalar_op,
[ [
...@@ -1570,44 +1569,45 @@ class CAReduce(COp): ...@@ -1570,44 +1569,45 @@ class CAReduce(COp):
], ],
), ),
None, None,
[f"{aname}_i", f"{inames[0]}_i"], [f"{acc_name}_i", f"{inp_name}_i"],
[f"{aname}_i"], [f"{acc_name}_i"],
sub, sub,
) )
code1 = f"""
{{
{task1_decl}
{task1_code}
}}
"""
if node.inputs[0].type.ndim: if out.type.ndim == 0:
if len(axis) == 1: # Simple case where everything is reduced, no need for loop ordering
all_code = [("", "")] * nnested + [(task0_decl, code1), ""] loop = cgen.make_complete_loop_careduce(
else: inp_var=inp_name,
all_code = ( acc_var=acc_name,
[("", "")] * nnested inp_dtype=inp_dtype,
+ [(task0_decl, "")] acc_dtype=acc_dtype,
+ [("", "")] * (len(axis) - 2) initial_value=initial_value,
+ [("", code1), ""] inner_task=inner_task,
) fail_code=sub["fail"],
)
else: else:
all_code = [task0_decl + code1] loop = cgen.make_reordered_loop_careduce(
loop = cgen.make_loop_careduce( inp_var=inp_name,
[order, list(range(nnested)) + ["x"] * len(axis)], acc_var=acc_name,
[idtype, adtype], inp_dtype=inp_dtype,
all_code, acc_dtype=acc_dtype,
sub, inp_ndim=ndim,
) reduction_axes=axis,
initial_value=initial_value,
inner_task=inner_task,
)
end = "" if acc_dtype != out_dtype:
if adtype != odtype: cast = dedent(
end = f""" f"""
PyArray_CopyInto({oname}, {aname}); PyArray_CopyInto({out_name}, {acc_name});
""" {acc_type.c_cleanup(acc_name, sub)}
end += acc_type.c_cleanup(aname, sub) """
)
else:
cast = ""
return decl, checks, alloc, loop, end return setup, alloc, loop, cast
def c_code(self, node, name, inames, onames, sub): def c_code(self, node, name, inames, onames, sub):
code = "\n".join(self._c_all(node, name, inames, onames, sub)) code = "\n".join(self._c_all(node, name, inames, onames, sub))
...@@ -1619,7 +1619,7 @@ class CAReduce(COp): ...@@ -1619,7 +1619,7 @@ class CAReduce(COp):
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
# the version corresponding to the c code in this Op # the version corresponding to the c code in this Op
version = [9] version = [10]
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
scalar_node = Apply( scalar_node = Apply(
......
from collections.abc import Sequence
from textwrap import dedent, indent from textwrap import dedent, indent
from pytensor.configdefaults import config from pytensor.configdefaults import config
def make_declare(loop_orders, dtypes, sub): def make_declare(loop_orders, dtypes, sub, compute_stride_jump=True):
""" """
Produce code to declare all necessary variables. Produce code to declare all necessary variables.
...@@ -20,13 +21,11 @@ def make_declare(loop_orders, dtypes, sub): ...@@ -20,13 +21,11 @@ def make_declare(loop_orders, dtypes, sub):
# the number of elements in that dimension, # the number of elements in that dimension,
# the stride in that dimension, # the stride in that dimension,
# and the jump from an iteration to the next # and the jump from an iteration to the next
decl += f""" decl += f"npy_intp {var}_n{value};\nssize_t {var}_stride{value};\n"
npy_intp {var}_n{value}; if compute_stride_jump:
ssize_t {var}_stride{value}; decl += f"int {var}_jump{value}_{j};\n"
int {var}_jump{value}_{j};
"""
else: elif compute_stride_jump:
# if the dimension is broadcasted, we only need # if the dimension is broadcasted, we only need
# the jump (arbitrary length and stride = 0) # the jump (arbitrary length and stride = 0)
decl += f"int {var}_jump{value}_{j};\n" decl += f"int {var}_jump{value}_{j};\n"
...@@ -34,7 +33,7 @@ def make_declare(loop_orders, dtypes, sub): ...@@ -34,7 +33,7 @@ def make_declare(loop_orders, dtypes, sub):
return decl return decl
def make_checks(loop_orders, dtypes, sub): def make_checks(loop_orders, dtypes, sub, compute_stride_jump=True):
init = "" init = ""
for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)): for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)):
var = sub[f"lv{i}"] var = sub[f"lv{i}"]
...@@ -67,13 +66,13 @@ def make_checks(loop_orders, dtypes, sub): ...@@ -67,13 +66,13 @@ def make_checks(loop_orders, dtypes, sub):
# Initialize the variables associated to the jth loop # Initialize the variables associated to the jth loop
# jump = stride - adjust # jump = stride - adjust
jump = f"({var}_stride{index}) - ({adjust})" jump = f"({var}_stride{index}) - ({adjust})"
init += f""" init += f"{var}_n{index} = PyArray_DIMS({var})[{index}];\n"
{var}_n{index} = PyArray_DIMS({var})[{index}]; init += f"{var}_stride{index} = PyArray_STRIDES({var})[{index}] / sizeof({dtype});\n"
{var}_stride{index} = PyArray_STRIDES({var})[{index}] / sizeof({dtype}); if compute_stride_jump:
{var}_jump{index}_{j} = {jump}; init += f"{var}_jump{index}_{j} = {jump};\n"
"""
adjust = f"{var}_n{index}*{var}_stride{index}" adjust = f"{var}_n{index}*{var}_stride{index}"
else:
elif compute_stride_jump:
jump = f"-({adjust})" jump = f"-({adjust})"
init += f"{var}_jump{index}_{j} = {jump};\n" init += f"{var}_jump{index}_{j} = {jump};\n"
adjust = "0" adjust = "0"
...@@ -460,72 +459,298 @@ def make_reordered_loop( ...@@ -460,72 +459,298 @@ def make_reordered_loop(
################ ################
def make_loop_careduce(loop_orders, dtypes, loop_tasks, sub): def make_complete_loop_careduce(
inp_var: str,
acc_var: str,
inp_dtype: str,
acc_dtype: str,
initial_value: str,
inner_task: str,
fail_code,
) -> str:
"""Generate C code for a complete reduction loop.
The generated code for a float64 input variable `inp` and accumulation variable `acc` looks like:
.. code-block:: C
{
NpyIter* iter;
NpyIter_IterNextFunc *iternext;
char** data_ptr;
npy_intp* stride_ptr,* innersize_ptr;
// Special case for empty inputs
if (PyArray_SIZE(inp) == 0) {
npy_float64 &acc_i = *(npy_float64*)(PyArray_DATA(acc));
acc_i = 0;
}else{
iter = NpyIter_New(inp,
NPY_ITER_READONLY| NPY_ITER_EXTERNAL_LOOP| NPY_ITER_REFS_OK,
NPY_KEEPORDER,
NPY_NO_CASTING,
NULL);
iternext = NpyIter_GetIterNext(iter, NULL);
if (iternext == NULL) {
NpyIter_Deallocate(iter);
{ fail }
}
data_ptr = NpyIter_GetDataPtrArray(iter);
stride_ptr = NpyIter_GetInnerStrideArray(iter);
innersize_ptr = NpyIter_GetInnerLoopSizePtr(iter);
npy_float64 acc_i;
acc_i = 0;
do {
char* data = *data_ptr;
npy_intp stride = *stride_ptr;
npy_intp count = *innersize_ptr;
while(count--) {
npy_float64 inp_i = *((npy_float64*)data);
acc_i = acc_i + inp_i;
data += stride;
}
} while(iternext(iter));
NpyIter_Deallocate(iter);
*(npy_float64*)(PyArray_DATA(acc)) = acc_i;
}
}
""" """
Make a nested loop over several arrays and associate specific code return dedent(
to each level of nesting. f"""
{{
NpyIter* iter;
NpyIter_IterNextFunc *iternext;
char** data_ptr;
npy_intp* stride_ptr,* innersize_ptr;
// Special case for empty inputs
if (PyArray_SIZE({inp_var}) == 0) {{
{acc_dtype} &{acc_var}_i = *({acc_dtype}*)(PyArray_DATA({acc_var}));
{initial_value}
}}else{{
iter = NpyIter_New({inp_var},
NPY_ITER_READONLY| NPY_ITER_EXTERNAL_LOOP| NPY_ITER_REFS_OK,
NPY_KEEPORDER,
NPY_NO_CASTING,
NULL);
iternext = NpyIter_GetIterNext(iter, NULL);
if (iternext == NULL) {{
NpyIter_Deallocate(iter);
{fail_code}
}}
Parameters data_ptr = NpyIter_GetDataPtrArray(iter);
---------- stride_ptr = NpyIter_GetInnerStrideArray(iter);
loop_orders : list of N tuples of length M innersize_ptr = NpyIter_GetInnerLoopSizePtr(iter);
Each value of each tuple can be either the index of a dimension to
loop over or the letter 'x' which means there is no looping to be done
over that variable at that point (in other words we broadcast
over that dimension). If an entry is an integer, it will become
an alias of the entry of that rank.
loop_tasks : list of M+1 pieces of code
The ith loop_task is a pair of strings, the first
string is code to be executed before the ith loop starts, the second
one contains code to be executed just before going to the next element
of the ith dimension.
The last element if loop_tasks is a single string, containing code
to be executed at the very end.
sub: dictionary
Maps 'lv#' to a suitable variable name.
The 'lvi' variable corresponds to the ith element of loop_orders.
""" {acc_dtype} {acc_var}_i;
{initial_value}
def loop_over(preloop, code, indices, i): do {{
iterv = f"ITER_{int(i)}" char* data = *data_ptr;
update = "" npy_intp stride = *stride_ptr;
suitable_n = "1" npy_intp count = *innersize_ptr;
for j, index in enumerate(indices):
var = sub[f"lv{int(j)}"] while(count--) {{
update += f"{var}_iter += {var}_jump{index}_{i};\n" {inp_dtype} {inp_var}_i = *(({inp_dtype}*)data);
if index != "x": {inner_task}
suitable_n = f"{var}_n{index}" data += stride;
return f""" }}
{preloop} }} while(iternext(iter));
for (int {iterv} = {suitable_n}; {iterv}; {iterv}--) {{
{code} NpyIter_Deallocate(iter);
{update} *({acc_dtype}*)(PyArray_DATA({acc_var})) = {acc_var}_i;
}}
}} }}
""" """
)
preloops = {}
for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)):
for j, index in enumerate(loop_order):
if index != "x":
preloops.setdefault(j, "")
preloops[j] += (
f"%(lv{i})s_iter = ({dtype}*)(PyArray_DATA(%(lv{i})s));\n"
) % sub
break
else: # all broadcastable
preloops.setdefault(0, "")
preloops[0] += (
f"%(lv{i})s_iter = ({dtype}*)(PyArray_DATA(%(lv{i})s));\n"
) % sub
if len(loop_tasks) == 1: def make_reordered_loop_careduce(
s = preloops.get(0, "") inp_var: str,
else: acc_var: str,
s = "" inp_dtype: str,
for i, (pre_task, task), indices in reversed( acc_dtype: str,
list(zip(range(len(loop_tasks) - 1), loop_tasks, list(zip(*loop_orders)))) inp_ndim: int,
): reduction_axes: Sequence[int],
s = loop_over(preloops.get(i, "") + pre_task, s + task, indices, i) initial_value: str,
inner_task: str,
) -> str:
"""Generate C code for a partial reduction loop, reordering for optimal memory access of the input variable.
The generated code for a sum along the last axis of a 2D float64 input variable `inp`
in an accumulation variable `acc` looks like:
.. code-block:: C
{
// Special case for empty inputs
if (PyArray_SIZE(inp) == 0) {
acc_iter = (npy_float64*)(PyArray_DATA(acc));
int n = PyArray_SIZE(acc);
for(int i = 0; i < n; i++)
{
npy_float64 &acc_i = acc_iter[i];
acc_i = 0;
}
} else {
std::vector< std::pair<int, int> > loops(2);
std::vector< std::pair<int, int> >::iterator loops_it = loops.begin();
loops_it->first = abs(PyArray_STRIDES(inp)[0]);
loops_it->second = 0;
++loops_it;
loops_it->first = abs(PyArray_STRIDES(inp)[1]);
loops_it->second = 1;
++loops_it;
std::sort(loops.rbegin(), loops.rend());
int dim_lengths[2] = {inp_n0, inp_n1};
int inp_strides[2] = {inp_stride0, inp_stride1};
int acc_strides[2] = {acc_stride0, 0};
bool reduction_axes[2] = {0, 1};
loops_it = loops.begin();
int dim_length_0 = dim_lengths[loops_it->second];
int is_reduction_axis_0 = reduction_axes[loops_it->second];
int inp_stride_0 = inp_strides[loops_it->second];
int acc_stride_0 = acc_strides[loops_it->second];
++loops_it;
int dim_length_1 = dim_lengths[loops_it->second];
int is_reduction_axis_1 = reduction_axes[loops_it->second];
int inp_stride_1 = inp_strides[loops_it->second];
int acc_stride_1 = acc_strides[loops_it->second];
++loops_it;
inp_iter = (npy_float64*)(PyArray_DATA(inp));
acc_iter = (npy_float64*)(PyArray_DATA(acc));
for(int iter_0 = 0; iter_0<dim_length_0; iter_0++){
for(int iter_1 = 0; iter_1<dim_length_1; iter_1++){
npy_float64 &inp_i = *(inp_iter + inp_stride_1*iter_1 + inp_stride_0*iter_0);
npy_float64 &acc_i = *(acc_iter + acc_stride_1*iter_1 + acc_stride_0*iter_0);
if((!is_reduction_axis_0 || iter_0 == 0) && (!is_reduction_axis_1 || iter_1 == 0))
{
acc_i = 0;
}
{acc_i = acc_i + inp_i;}
}
}
}
s += loop_tasks[-1] """
return f"{{{s}}}"
empty_case = dedent(
f"""
// Special case for empty inputs
if (PyArray_SIZE({inp_var}) == 0) {{
{acc_var}_iter = ({acc_dtype}*)(PyArray_DATA({acc_var}));
int n = PyArray_SIZE({acc_var});
for(int i = 0; i < n; i++)
{{
{acc_dtype} &{acc_var}_i = {acc_var}_iter[i];
{initial_value}
}}
}} else {{
"""
)
# The loops are ordered by (decreasing) absolute values of inp_var's strides.
# The first element of each pair is the absolute value of the stride
# The second element corresponds to the index in the initial loop order
order_loops = dedent(
f"""
std::vector< std::pair<int, int> > loops({inp_ndim});
std::vector< std::pair<int, int> >::iterator loops_it = loops.begin();
"""
)
# Fill the loop vector with the appropriate <stride, index> pairs
for i in range(inp_ndim):
order_loops += dedent(
f"""
loops_it->first = abs(PyArray_STRIDES({inp_var})[{i}]);
loops_it->second = {i};
++loops_it;"""
)
# We sort in decreasing order so that the outermost loop (loop 0)
# has the largest stride, and the innermost loop has the smallest stride.
order_loops += "\nstd::sort(loops.rbegin(), loops.rend());\n"
# Sort shape and strides to match the new order that was computed by sorting the loop vector.
counter = iter(range(inp_ndim))
unsorted_vars = dedent(
f"""
int dim_lengths[{inp_ndim}] = {{{','.join(f'{inp_var}_n{i}' for i in range(inp_ndim))}}};
int inp_strides[{inp_ndim}] = {{{','.join(f'{inp_var}_stride{i}' for i in range(inp_ndim))}}};
int acc_strides[{inp_ndim}] = {{{','.join("0" if i in reduction_axes else f'{acc_var}_stride{next(counter)}'for i in range(inp_ndim))}}};
bool reduction_axes[{inp_ndim}] = {{{', '.join("1" if i in reduction_axes else "0" for i in range(inp_ndim))}}};\n
"""
)
sorted_vars = "loops_it = loops.begin();"
for i in range(inp_ndim):
sorted_vars += dedent(
f"""
int dim_length_{i} = dim_lengths[loops_it->second];
int is_reduction_axis_{i} = reduction_axes[loops_it->second];
int {inp_var}_stride_{i} = inp_strides[loops_it->second];
int {acc_var}_stride_{i} = acc_strides[loops_it->second];
++loops_it;
"""
)
declare_iter = dedent(
f"""
{inp_var}_iter = ({inp_dtype}*)(PyArray_DATA({inp_var}));
{acc_var}_iter = ({acc_dtype}*)(PyArray_DATA({acc_var}));
"""
)
pointer_update = ""
for var, dtype in ((inp_var, inp_dtype), (acc_var, acc_dtype)):
pointer_update += f"{dtype} &{var}_i = *({var}_iter"
for i in reversed(tuple(range(inp_ndim))):
iter_var = f"iter_{i}"
pointer_update += f" + {var}_stride_{i}*{iter_var}"
pointer_update += ");\n"
# Set initial value in first iteration of each output
# This happens on the first iteration of every reduction axis
initial_iteration = " && ".join(
f"(!is_reduction_axis_{i} || iter_{i} == 0)" for i in range(inp_ndim)
)
set_initial_value = dedent(
f"""
if({initial_iteration})
{{
{initial_value}
}}
"""
)
# We do the pointer update, set the initial value and run the inner task in the innermost loop
loop = "\n\n".join((pointer_update, set_initial_value, f"{{{inner_task}}}"))
# Create outer loops recursively
for i in reversed(range(inp_ndim)):
iter_var = f"iter_{i}"
dim_length = f"dim_length_{i}"
loop = dedent(
f"""
for(int {iter_var} = 0; {iter_var}<{dim_length}; {iter_var}++){{
{loop}
}}
"""
)
non_empty_case = "\n".join(
(order_loops, unsorted_vars, sorted_vars, declare_iter, loop)
)
code = "\n".join((empty_case, non_empty_case, "}"))
return f"{{\n{code}\n}}\n"
import builtins import builtins
import warnings import warnings
from collections.abc import Sequence from collections.abc import Sequence
from textwrap import dedent
from typing import TYPE_CHECKING, Optional from typing import TYPE_CHECKING, Optional
import numpy as np import numpy as np
...@@ -361,12 +362,14 @@ class FixedOpCAReduce(CAReduce): ...@@ -361,12 +362,14 @@ class FixedOpCAReduce(CAReduce):
class NonZeroDimsCAReduce(FixedOpCAReduce): class NonZeroDimsCAReduce(FixedOpCAReduce):
def _c_all(self, node, name, inames, onames, sub): def _c_all(self, node, name, input_names, output_names, sub):
decl, checks, alloc, loop, end = super()._c_all(node, name, inames, onames, sub) setup, alloc, loop, cast = super()._c_all(
node, name, input_names, output_names, sub
)
# We add an additional check for zero-sized dimensions (This seems like # We add an additional check for zero-sized dimensions (This seems like
# something that could be enabled in `elemwise_cgen.make_checks`.) # something that could be enabled in `elemwise_cgen.make_checks`.)
iname = inames[0] [iname] = input_names
axis = self.axis axis = self.axis
if axis is None: if axis is None:
...@@ -378,17 +381,19 @@ class NonZeroDimsCAReduce(FixedOpCAReduce): ...@@ -378,17 +381,19 @@ class NonZeroDimsCAReduce(FixedOpCAReduce):
pattern_ = str(pattern)[1:-1] pattern_ = str(pattern)[1:-1]
decl += f"""int tosum[]={{{pattern_}}};""" setup = f"int tosum[]={{{pattern_}}};" + setup
alloc += f""" alloc += dedent(
for(int i=0;i<PyArray_NDIM({iname});i++){{ f"""
if(PyArray_DIMS({iname})[i]==0 && tosum[i]){{ for(int i=0;i<PyArray_NDIM({iname});i++){{
PyErr_Format(PyExc_ValueError, if(PyArray_DIMS({iname})[i]==0 && tosum[i]){{
"Input of CAReduce{{{node.op.scalar_op}}} has zero-size on axis %%d",i); PyErr_Format(PyExc_ValueError,
{sub["fail"]}; "Input of CAReduce{{{node.op.scalar_op}}} has zero-size on axis %%d",i);
}} {sub["fail"]};
}} }}
""" }}
return decl, checks, alloc, loop, end """
)
return setup, alloc, loop, cast
class Max(NonZeroDimsCAReduce): class Max(NonZeroDimsCAReduce):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论