提交 e752fc3d authored 作者: Ricardo Vieira's avatar Ricardo Vieira 提交者: Ricardo Vieira

CAReduce loop reordering C-impl

上级 00a8a883
from copy import copy
from textwrap import dedent
import numpy as np
from numpy.core.numeric import normalize_axis_tuple
......@@ -1448,15 +1449,16 @@ class CAReduce(COp):
return ((),)
return ([ishape[i] for i in range(node.inputs[0].type.ndim) if i not in axis],)
def _c_all(self, node, name, inames, onames, sub):
input = node.inputs[0]
output = node.outputs[0]
def _c_all(self, node, name, input_names, output_names, sub):
[inp] = node.inputs
[out] = node.outputs
ndim = inp.type.ndim
iname = inames[0]
oname = onames[0]
[inp_name] = input_names
[out_name] = output_names
idtype = input.type.dtype_specs()[1]
odtype = output.type.dtype_specs()[1]
inp_dtype = inp.type.dtype_specs()[1]
out_dtype = out.type.dtype_specs()[1]
acc_dtype = getattr(self, "acc_dtype", None)
......@@ -1464,100 +1466,97 @@ class CAReduce(COp):
if acc_dtype == "float16":
raise MethodNotDefined("no c_code for float16")
acc_type = TensorType(shape=node.outputs[0].type.shape, dtype=acc_dtype)
adtype = acc_type.dtype_specs()[1]
acc_dtype = acc_type.dtype_specs()[1]
else:
adtype = odtype
acc_dtype = out_dtype
axis = self.axis
if axis is None:
axis = list(range(input.type.ndim))
axis = list(range(inp.type.ndim))
if len(axis) == 0:
# This is just an Elemwise cast operation
# The acc_dtype is never a downcast compared to the input dtype
# So we just need a cast to the output dtype.
var = pytensor.tensor.basic.cast(input, node.outputs[0].dtype)
if var is input:
var = Elemwise(scalar_identity)(input)
var = pytensor.tensor.basic.cast(inp, node.outputs[0].dtype)
if var is inp:
var = Elemwise(scalar_identity)(inp)
assert var.dtype == node.outputs[0].dtype
return var.owner.op._c_all(var.owner, name, inames, onames, sub)
order1 = [i for i in range(input.type.ndim) if i not in axis]
order = order1 + list(axis)
return var.owner.op._c_all(var.owner, name, input_names, output_names, sub)
nnested = len(order1)
inp_dims = list(range(ndim))
non_reduced_dims = [i for i in inp_dims if i not in axis]
counter = iter(range(ndim))
acc_dims = ["x" if i in axis else next(counter) for i in range(ndim)]
sub = dict(sub)
for i, (input, iname) in enumerate(zip(node.inputs, inames)):
sub[f"lv{i}"] = iname
sub = sub.copy()
sub["lv0"] = inp_name
sub["lv1"] = out_name
sub["olv"] = out_name
decl = ""
if adtype != odtype:
if acc_dtype != out_dtype:
# Create an accumulator variable different from the output
aname = "acc"
decl = acc_type.c_declare(aname, sub)
decl += acc_type.c_init(aname, sub)
acc_name = "acc"
setup = acc_type.c_declare(acc_name, sub) + acc_type.c_init(acc_name, sub)
else:
# the output is the accumulator variable
aname = oname
decl += cgen.make_declare([order], [idtype], sub)
checks = cgen.make_checks([order], [idtype], sub)
alloc = ""
i += 1
sub[f"lv{i}"] = oname
sub["olv"] = oname
# Allocate output buffer
alloc += cgen.make_declare(
[list(range(nnested)) + ["x"] * len(axis)], [odtype], dict(sub, lv0=oname)
)
alloc += cgen.make_alloc([order1], odtype, sub)
alloc += cgen.make_checks(
[list(range(nnested)) + ["x"] * len(axis)], [odtype], dict(sub, lv0=oname)
acc_name = out_name
setup = ""
# Define strides of input array
setup += cgen.make_declare(
[inp_dims], [inp_dtype], sub, compute_stride_jump=False
) + cgen.make_checks([inp_dims], [inp_dtype], sub, compute_stride_jump=False)
# Define strides of output array and allocate it
out_sub = sub | {"lv0": out_name}
alloc = (
cgen.make_declare(
[acc_dims], [out_dtype], out_sub, compute_stride_jump=False
)
+ cgen.make_alloc([non_reduced_dims], out_dtype, sub)
+ cgen.make_checks(
[acc_dims], [out_dtype], out_sub, compute_stride_jump=False
)
)
if adtype != odtype:
# Allocate accumulation buffer
sub[f"lv{i}"] = aname
sub["olv"] = aname
if acc_dtype != out_dtype:
# Define strides of accumulation buffer and allocate it
sub["lv1"] = acc_name
sub["olv"] = acc_name
alloc += cgen.make_declare(
[list(range(nnested)) + ["x"] * len(axis)],
[adtype],
dict(sub, lv0=aname),
)
alloc += cgen.make_alloc([order1], adtype, sub)
alloc += cgen.make_checks(
[list(range(nnested)) + ["x"] * len(axis)],
[adtype],
dict(sub, lv0=aname),
acc_sub = sub | {"lv0": acc_name}
alloc += (
cgen.make_declare(
[acc_dims], [acc_dtype], acc_sub, compute_stride_jump=False
)
+ cgen.make_alloc([non_reduced_dims], acc_dtype, sub)
+ cgen.make_checks(
[acc_dims], [acc_dtype], acc_sub, compute_stride_jump=False
)
)
identity = self.scalar_op.identity
if np.isposinf(identity):
if input.type.dtype in ("float32", "float64"):
if inp.type.dtype in ("float32", "float64"):
identity = "__builtin_inf()"
elif input.type.dtype.startswith("uint") or input.type.dtype == "bool":
elif inp.type.dtype.startswith("uint") or inp.type.dtype == "bool":
identity = "1"
else:
identity = "NPY_MAX_" + str(input.type.dtype).upper()
identity = "NPY_MAX_" + str(inp.type.dtype).upper()
elif np.isneginf(identity):
if input.type.dtype in ("float32", "float64"):
if inp.type.dtype in ("float32", "float64"):
identity = "-__builtin_inf()"
elif input.type.dtype.startswith("uint") or input.type.dtype == "bool":
elif inp.type.dtype.startswith("uint") or inp.type.dtype == "bool":
identity = "0"
else:
identity = "NPY_MIN_" + str(input.type.dtype).upper()
identity = "NPY_MIN_" + str(inp.type.dtype).upper()
elif identity is None:
raise TypeError(f"The {self.scalar_op} does not define an identity.")
task0_decl = f"{adtype}& {aname}_i = *{aname}_iter;\n{aname}_i = {identity};"
task1_decl = f"{idtype}& {inames[0]}_i = *{inames[0]}_iter;\n"
initial_value = f"{acc_name}_i = {identity};"
task1_code = self.scalar_op.c_code(
inner_task = self.scalar_op.c_code(
Apply(
self.scalar_op,
[
......@@ -1570,44 +1569,45 @@ class CAReduce(COp):
],
),
None,
[f"{aname}_i", f"{inames[0]}_i"],
[f"{aname}_i"],
[f"{acc_name}_i", f"{inp_name}_i"],
[f"{acc_name}_i"],
sub,
)
code1 = f"""
{{
{task1_decl}
{task1_code}
}}
"""
if node.inputs[0].type.ndim:
if len(axis) == 1:
all_code = [("", "")] * nnested + [(task0_decl, code1), ""]
else:
all_code = (
[("", "")] * nnested
+ [(task0_decl, "")]
+ [("", "")] * (len(axis) - 2)
+ [("", code1), ""]
)
if out.type.ndim == 0:
# Simple case where everything is reduced, no need for loop ordering
loop = cgen.make_complete_loop_careduce(
inp_var=inp_name,
acc_var=acc_name,
inp_dtype=inp_dtype,
acc_dtype=acc_dtype,
initial_value=initial_value,
inner_task=inner_task,
fail_code=sub["fail"],
)
else:
all_code = [task0_decl + code1]
loop = cgen.make_loop_careduce(
[order, list(range(nnested)) + ["x"] * len(axis)],
[idtype, adtype],
all_code,
sub,
)
loop = cgen.make_reordered_loop_careduce(
inp_var=inp_name,
acc_var=acc_name,
inp_dtype=inp_dtype,
acc_dtype=acc_dtype,
inp_ndim=ndim,
reduction_axes=axis,
initial_value=initial_value,
inner_task=inner_task,
)
end = ""
if adtype != odtype:
end = f"""
PyArray_CopyInto({oname}, {aname});
"""
end += acc_type.c_cleanup(aname, sub)
if acc_dtype != out_dtype:
cast = dedent(
f"""
PyArray_CopyInto({out_name}, {acc_name});
{acc_type.c_cleanup(acc_name, sub)}
"""
)
else:
cast = ""
return decl, checks, alloc, loop, end
return setup, alloc, loop, cast
def c_code(self, node, name, inames, onames, sub):
code = "\n".join(self._c_all(node, name, inames, onames, sub))
......@@ -1619,7 +1619,7 @@ class CAReduce(COp):
def c_code_cache_version_apply(self, node):
# the version corresponding to the c code in this Op
version = [9]
version = [10]
# now we insert versions for the ops on which we depend...
scalar_node = Apply(
......
from collections.abc import Sequence
from textwrap import dedent, indent
from pytensor.configdefaults import config
def make_declare(loop_orders, dtypes, sub):
def make_declare(loop_orders, dtypes, sub, compute_stride_jump=True):
"""
Produce code to declare all necessary variables.
......@@ -20,13 +21,11 @@ def make_declare(loop_orders, dtypes, sub):
# the number of elements in that dimension,
# the stride in that dimension,
# and the jump from an iteration to the next
decl += f"""
npy_intp {var}_n{value};
ssize_t {var}_stride{value};
int {var}_jump{value}_{j};
"""
decl += f"npy_intp {var}_n{value};\nssize_t {var}_stride{value};\n"
if compute_stride_jump:
decl += f"int {var}_jump{value}_{j};\n"
else:
elif compute_stride_jump:
# if the dimension is broadcasted, we only need
# the jump (arbitrary length and stride = 0)
decl += f"int {var}_jump{value}_{j};\n"
......@@ -34,7 +33,7 @@ def make_declare(loop_orders, dtypes, sub):
return decl
def make_checks(loop_orders, dtypes, sub):
def make_checks(loop_orders, dtypes, sub, compute_stride_jump=True):
init = ""
for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)):
var = sub[f"lv{i}"]
......@@ -67,13 +66,13 @@ def make_checks(loop_orders, dtypes, sub):
# Initialize the variables associated to the jth loop
# jump = stride - adjust
jump = f"({var}_stride{index}) - ({adjust})"
init += f"""
{var}_n{index} = PyArray_DIMS({var})[{index}];
{var}_stride{index} = PyArray_STRIDES({var})[{index}] / sizeof({dtype});
{var}_jump{index}_{j} = {jump};
"""
init += f"{var}_n{index} = PyArray_DIMS({var})[{index}];\n"
init += f"{var}_stride{index} = PyArray_STRIDES({var})[{index}] / sizeof({dtype});\n"
if compute_stride_jump:
init += f"{var}_jump{index}_{j} = {jump};\n"
adjust = f"{var}_n{index}*{var}_stride{index}"
else:
elif compute_stride_jump:
jump = f"-({adjust})"
init += f"{var}_jump{index}_{j} = {jump};\n"
adjust = "0"
......@@ -460,72 +459,298 @@ def make_reordered_loop(
################
def make_loop_careduce(loop_orders, dtypes, loop_tasks, sub):
def make_complete_loop_careduce(
    inp_var: str,
    acc_var: str,
    inp_dtype: str,
    acc_dtype: str,
    initial_value: str,
    inner_task: str,
    fail_code,
) -> str:
    """Generate C code for a complete reduction loop.

    Every element of `inp_var` is folded into the single element of
    `acc_var`, walking the input with a NumPy ``NpyIter`` in external-loop
    mode.

    Parameters
    ----------
    inp_var
        Name of the C variable holding the input array.
    acc_var
        Name of the C variable holding the (single element) accumulator array.
    inp_dtype
        C type of the input elements, e.g. ``npy_float64``.
    acc_dtype
        C type of the accumulator element.
    initial_value
        C statement that assigns the identity of the reduction to
        ``{acc_var}_i``.
    inner_task
        C statement(s) that fold ``{inp_var}_i`` into ``{acc_var}_i``.
    fail_code
        C code executed when the iterator cannot be set up.

    Returns
    -------
    str
        A brace-delimited block of C code.

    Notes
    -----
    The generated code for a float64 input variable `inp` and accumulation
    variable `acc` looks like:

    .. code-block:: C

        {
            NpyIter* iter;
            NpyIter_IterNextFunc *iternext;
            char** data_ptr;
            npy_intp* stride_ptr,* innersize_ptr;

            // Special case for empty inputs
            if (PyArray_SIZE(inp) == 0) {
                npy_float64 &acc_i = *(npy_float64*)(PyArray_DATA(acc));
                acc_i = 0;
            }else{
                iter = NpyIter_New(inp,
                                   NPY_ITER_READONLY| NPY_ITER_EXTERNAL_LOOP| NPY_ITER_REFS_OK,
                                   NPY_KEEPORDER,
                                   NPY_NO_CASTING,
                                   NULL);
                if (iter == NULL) {
                    { fail }
                }

                iternext = NpyIter_GetIterNext(iter, NULL);
                if (iternext == NULL) {
                    NpyIter_Deallocate(iter);
                    { fail }
                }

                data_ptr = NpyIter_GetDataPtrArray(iter);
                stride_ptr = NpyIter_GetInnerStrideArray(iter);
                innersize_ptr = NpyIter_GetInnerLoopSizePtr(iter);

                npy_float64 acc_i;
                acc_i = 0;
                do {
                    char* data = *data_ptr;
                    npy_intp stride = *stride_ptr;
                    npy_intp count = *innersize_ptr;

                    while(count--) {
                        npy_float64 inp_i = *((npy_float64*)data);
                        acc_i = acc_i + inp_i;
                        data += stride;
                    }
                } while(iternext(iter));
                NpyIter_Deallocate(iter);

                *(npy_float64*)(PyArray_DATA(acc)) = acc_i;
            }
        }

    """
    # NOTE(review): the emitted C changed (NULL check on the iterator), so the
    # `c_code_cache_version` of the Ops using this helper presumably needs a
    # bump for cached modules to be rebuilt -- confirm.
    return dedent(
        f"""
        {{
            NpyIter* iter;
            NpyIter_IterNextFunc *iternext;
            char** data_ptr;
            npy_intp* stride_ptr,* innersize_ptr;

            // Special case for empty inputs
            if (PyArray_SIZE({inp_var}) == 0) {{
                {acc_dtype} &{acc_var}_i = *({acc_dtype}*)(PyArray_DATA({acc_var}));
                {initial_value}
            }}else{{
                iter = NpyIter_New({inp_var},
                                   NPY_ITER_READONLY| NPY_ITER_EXTERNAL_LOOP| NPY_ITER_REFS_OK,
                                   NPY_KEEPORDER,
                                   NPY_NO_CASTING,
                                   NULL);
                // NpyIter_New returns NULL on failure; bail out before using it
                if (iter == NULL) {{
                    {fail_code}
                }}

                iternext = NpyIter_GetIterNext(iter, NULL);
                if (iternext == NULL) {{
                    NpyIter_Deallocate(iter);
                    {fail_code}
                }}

                data_ptr = NpyIter_GetDataPtrArray(iter);
                stride_ptr = NpyIter_GetInnerStrideArray(iter);
                innersize_ptr = NpyIter_GetInnerLoopSizePtr(iter);

                {acc_dtype} {acc_var}_i;
                {initial_value}
                do {{
                    char* data = *data_ptr;
                    npy_intp stride = *stride_ptr;
                    npy_intp count = *innersize_ptr;

                    while(count--) {{
                        {inp_dtype} {inp_var}_i = *(({inp_dtype}*)data);
                        {inner_task}
                        data += stride;
                    }}
                }} while(iternext(iter));
                NpyIter_Deallocate(iter);

                *({acc_dtype}*)(PyArray_DATA({acc_var})) = {acc_var}_i;
            }}
        }}
        """
    )
preloops = {}
for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)):
for j, index in enumerate(loop_order):
if index != "x":
preloops.setdefault(j, "")
preloops[j] += (
f"%(lv{i})s_iter = ({dtype}*)(PyArray_DATA(%(lv{i})s));\n"
) % sub
break
else: # all broadcastable
preloops.setdefault(0, "")
preloops[0] += (
f"%(lv{i})s_iter = ({dtype}*)(PyArray_DATA(%(lv{i})s));\n"
) % sub
if len(loop_tasks) == 1:
s = preloops.get(0, "")
else:
s = ""
for i, (pre_task, task), indices in reversed(
list(zip(range(len(loop_tasks) - 1), loop_tasks, list(zip(*loop_orders))))
):
s = loop_over(preloops.get(i, "") + pre_task, s + task, indices, i)
def make_reordered_loop_careduce(
    inp_var: str,
    acc_var: str,
    inp_dtype: str,
    acc_dtype: str,
    inp_ndim: int,
    reduction_axes: Sequence[int],
    initial_value: str,
    inner_task: str,
) -> str:
    """Generate C code for a partial reduction loop, reordering for optimal memory access of the input variable.

    Parameters
    ----------
    inp_var
        Name of the C variable holding the input array.
    acc_var
        Name of the C variable holding the accumulator array.
    inp_dtype
        C type of the input elements, e.g. ``npy_float64``.
    acc_dtype
        C type of the accumulator elements.
    inp_ndim
        Number of dimensions of the input.
    reduction_axes
        Axes of the input that are reduced. Negative values count from the
        last axis.
    initial_value
        C statement that assigns the identity of the reduction to
        ``{acc_var}_i``.
    inner_task
        C statement(s) that fold ``{inp_var}_i`` into ``{acc_var}_i``.

    Returns
    -------
    str
        A brace-delimited block of C code. It refers to the
        ``{var}_iter``, ``{var}_n{i}`` and ``{var}_stride{i}`` C variables,
        which must already be declared and initialized by the caller.

    Notes
    -----
    The generated code for a sum along the last axis of a 2D float64 input
    variable `inp` in an accumulation variable `acc` looks like:

    .. code-block:: C

        {
            // Special case for empty inputs
            if (PyArray_SIZE(inp) == 0) {
                acc_iter = (npy_float64*)(PyArray_DATA(acc));
                npy_intp n = PyArray_SIZE(acc);
                for(npy_intp i = 0; i < n; i++)
                {
                    npy_float64 &acc_i = acc_iter[i];
                    acc_i = 0;
                }
            } else {
                std::vector< std::pair<int, int> > loops(2);
                std::vector< std::pair<int, int> >::iterator loops_it = loops.begin();

                loops_it->first = abs(PyArray_STRIDES(inp)[0]);
                loops_it->second = 0;
                ++loops_it;
                loops_it->first = abs(PyArray_STRIDES(inp)[1]);
                loops_it->second = 1;
                ++loops_it;
                std::sort(loops.rbegin(), loops.rend());

                npy_intp dim_lengths[2] = {inp_n0, inp_n1};
                npy_intp inp_strides[2] = {inp_stride0, inp_stride1};
                npy_intp acc_strides[2] = {acc_stride0, 0};
                bool reduction_axes[2] = {0, 1};

                loops_it = loops.begin();
                npy_intp dim_length_0 = dim_lengths[loops_it->second];
                int is_reduction_axis_0 = reduction_axes[loops_it->second];
                npy_intp inp_stride_0 = inp_strides[loops_it->second];
                npy_intp acc_stride_0 = acc_strides[loops_it->second];
                ++loops_it;
                npy_intp dim_length_1 = dim_lengths[loops_it->second];
                int is_reduction_axis_1 = reduction_axes[loops_it->second];
                npy_intp inp_stride_1 = inp_strides[loops_it->second];
                npy_intp acc_stride_1 = acc_strides[loops_it->second];
                ++loops_it;

                inp_iter = (npy_float64*)(PyArray_DATA(inp));
                acc_iter = (npy_float64*)(PyArray_DATA(acc));

                for(npy_intp iter_0 = 0; iter_0<dim_length_0; iter_0++){
                    for(npy_intp iter_1 = 0; iter_1<dim_length_1; iter_1++){
                        npy_float64 &inp_i = *(inp_iter + inp_stride_1*iter_1 + inp_stride_0*iter_0);
                        npy_float64 &acc_i = *(acc_iter + acc_stride_1*iter_1 + acc_stride_0*iter_0);

                        if((!is_reduction_axis_0 || iter_0 == 0) && (!is_reduction_axis_1 || iter_1 == 0))
                        {
                            acc_i = 0;
                        }
                        {acc_i = acc_i + inp_i;}
                    }
                }
            }
        }

    """
    # NOTE(review): the emitted C changed (npy_intp instead of int), so the
    # `c_code_cache_version` of the Ops using this helper presumably needs a
    # bump for cached modules to be rebuilt -- confirm.

    # Accept negative axes; a negative axis would otherwise be silently treated
    # as "not reduced" and produce references to non-existent strides.
    reduced_axes = frozenset(
        axis + inp_ndim if axis < 0 else axis for axis in reduction_axes
    )

    empty_case = dedent(
        f"""
        // Special case for empty inputs
        if (PyArray_SIZE({inp_var}) == 0) {{
            {acc_var}_iter = ({acc_dtype}*)(PyArray_DATA({acc_var}));
            npy_intp n = PyArray_SIZE({acc_var});
            for(npy_intp i = 0; i < n; i++)
            {{
                {acc_dtype} &{acc_var}_i = {acc_var}_iter[i];
                {initial_value}
            }}
        }} else {{
        """
    )

    # The loops are ordered by (decreasing) absolute values of inp_var's strides.
    # The first element of each pair is the absolute value of the stride.
    # The second element corresponds to the index in the initial loop order.
    # The stride is only used as a sort key here, never for pointer arithmetic.
    order_loops = dedent(
        f"""
        std::vector< std::pair<int, int> > loops({inp_ndim});
        std::vector< std::pair<int, int> >::iterator loops_it = loops.begin();
        """
    )

    # Fill the loop vector with the appropriate <stride, index> pairs
    for i in range(inp_ndim):
        order_loops += dedent(
            f"""
            loops_it->first = abs(PyArray_STRIDES({inp_var})[{i}]);
            loops_it->second = {i};
            ++loops_it;"""
        )

    # We sort in decreasing order so that the outermost loop (loop 0)
    # has the largest stride, and the innermost loop has the smallest stride.
    order_loops += "\nstd::sort(loops.rbegin(), loops.rend());\n"

    # Collect shape and strides in the original axis order. Reduced axes get a
    # zero accumulator stride; the remaining axes consume the accumulator's
    # own dimensions in order.
    dim_lengths = ", ".join(f"{inp_var}_n{i}" for i in range(inp_ndim))
    inp_strides = ", ".join(f"{inp_var}_stride{i}" for i in range(inp_ndim))
    acc_stride_items = []
    acc_dim = 0
    for i in range(inp_ndim):
        if i in reduced_axes:
            acc_stride_items.append("0")
        else:
            acc_stride_items.append(f"{acc_var}_stride{acc_dim}")
            acc_dim += 1
    acc_strides = ", ".join(acc_stride_items)
    reduction_flags = ", ".join(
        "1" if i in reduced_axes else "0" for i in range(inp_ndim)
    )

    # npy_intp (not int) so that large shapes/strides are neither truncated nor
    # narrowed inside the brace initializers.
    unsorted_vars = dedent(
        f"""
        npy_intp dim_lengths[{inp_ndim}] = {{{dim_lengths}}};
        npy_intp inp_strides[{inp_ndim}] = {{{inp_strides}}};
        npy_intp acc_strides[{inp_ndim}] = {{{acc_strides}}};
        bool reduction_axes[{inp_ndim}] = {{{reduction_flags}}};
        """
    )

    # Sort shape and strides to match the new order that was computed by
    # sorting the loop vector.
    sorted_vars = "loops_it = loops.begin();"
    for i in range(inp_ndim):
        sorted_vars += dedent(
            f"""
            npy_intp dim_length_{i} = dim_lengths[loops_it->second];
            int is_reduction_axis_{i} = reduction_axes[loops_it->second];
            npy_intp {inp_var}_stride_{i} = inp_strides[loops_it->second];
            npy_intp {acc_var}_stride_{i} = acc_strides[loops_it->second];
            ++loops_it;
            """
        )

    declare_iter = dedent(
        f"""
        {inp_var}_iter = ({inp_dtype}*)(PyArray_DATA({inp_var}));
        {acc_var}_iter = ({acc_dtype}*)(PyArray_DATA({acc_var}));
        """
    )

    pointer_update = ""
    for var, dtype in ((inp_var, inp_dtype), (acc_var, acc_dtype)):
        offsets = "".join(
            f" + {var}_stride_{i}*iter_{i}" for i in reversed(range(inp_ndim))
        )
        pointer_update += f"{dtype} &{var}_i = *({var}_iter{offsets});\n"

    # Set initial value in first iteration of each output
    # This happens on the first iteration of every reduction axis
    initial_iteration = " && ".join(
        f"(!is_reduction_axis_{i} || iter_{i} == 0)" for i in range(inp_ndim)
    )
    set_initial_value = dedent(
        f"""
        if({initial_iteration})
        {{
            {initial_value}
        }}
        """
    )

    # The pointer update, initial value and inner task all go in the innermost loop
    loop = "\n\n".join((pointer_update, set_initial_value, f"{{{inner_task}}}"))

    # Wrap the body in the loops, from the innermost to the outermost
    for i in reversed(range(inp_ndim)):
        iter_var = f"iter_{i}"
        dim_length = f"dim_length_{i}"
        loop = dedent(
            f"""
            for(npy_intp {iter_var} = 0; {iter_var}<{dim_length}; {iter_var}++){{
                {loop}
            }}
            """
        )

    non_empty_case = "\n".join(
        (order_loops, unsorted_vars, sorted_vars, declare_iter, loop)
    )
    code = "\n".join((empty_case, non_empty_case, "}"))
    return f"{{\n{code}\n}}\n"
import builtins
import warnings
from collections.abc import Sequence
from textwrap import dedent
from typing import TYPE_CHECKING, Optional
import numpy as np
......@@ -361,12 +362,14 @@ class FixedOpCAReduce(CAReduce):
class NonZeroDimsCAReduce(FixedOpCAReduce):
def _c_all(self, node, name, inames, onames, sub):
decl, checks, alloc, loop, end = super()._c_all(node, name, inames, onames, sub)
def _c_all(self, node, name, input_names, output_names, sub):
setup, alloc, loop, cast = super()._c_all(
node, name, input_names, output_names, sub
)
# We add an additional check for zero-sized dimensions (This seems like
# something that could be enabled in `elemwise_cgen.make_checks`.)
iname = inames[0]
[iname] = input_names
axis = self.axis
if axis is None:
......@@ -378,17 +381,19 @@ class NonZeroDimsCAReduce(FixedOpCAReduce):
pattern_ = str(pattern)[1:-1]
decl += f"""int tosum[]={{{pattern_}}};"""
alloc += f"""
for(int i=0;i<PyArray_NDIM({iname});i++){{
if(PyArray_DIMS({iname})[i]==0 && tosum[i]){{
PyErr_Format(PyExc_ValueError,
"Input of CAReduce{{{node.op.scalar_op}}} has zero-size on axis %%d",i);
{sub["fail"]};
}}
setup = f"int tosum[]={{{pattern_}}};" + setup
alloc += dedent(
f"""
for(int i=0;i<PyArray_NDIM({iname});i++){{
if(PyArray_DIMS({iname})[i]==0 && tosum[i]){{
PyErr_Format(PyExc_ValueError,
"Input of CAReduce{{{node.op.scalar_op}}} has zero-size on axis %%d",i);
{sub["fail"]};
}}
"""
return decl, checks, alloc, loop, end
}}
"""
)
return setup, alloc, loop, cast
class Max(NonZeroDimsCAReduce):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论