CAReduce loop reordering C-impl

e752fc3d · Ricardo Vieira · Ricardo Vieira · 00a8a883 · e752fc3d · e752fc3d
--- a/pytensor/tensor/elemwise.py
+++ b/pytensor/tensor/elemwise.py
 from copy import copy
+from textwrap import dedent

 import numpy as np
 from numpy.core.numeric import normalize_axis_tuple
@@ -1448,15 +1449,16 @@ class CAReduce(COp):
            return ((),)
        return ([ishape[i] for i in range(node.inputs[0].type.ndim) if i not in axis],)

-    def _c_all(self, node, name, inames, onames, sub):
-        input = node.inputs[0]
-        output = node.outputs[0]
+    def _c_all(self, node, name, input_names, output_names, sub):
+        [inp] = node.inputs
+        [out] = node.outputs
+        ndim = inp.type.ndim

-        iname = inames[0]
-        oname = onames[0]
+        [inp_name] = input_names
+        [out_name] = output_names

-        idtype = input.type.dtype_specs()[1]
-        odtype = output.type.dtype_specs()[1]
+        inp_dtype = inp.type.dtype_specs()[1]
+        out_dtype = out.type.dtype_specs()[1]

        acc_dtype = getattr(self, "acc_dtype", None)

@@ -1464,100 +1466,97 @@ class CAReduce(COp):
            if acc_dtype == "float16":
                raise MethodNotDefined("no c_code for float16")
            acc_type = TensorType(shape=node.outputs[0].type.shape, dtype=acc_dtype)
-            adtype = acc_type.dtype_specs()[1]
+            acc_dtype = acc_type.dtype_specs()[1]
        else:
-            adtype = odtype
+            acc_dtype = out_dtype

        axis = self.axis
        if axis is None:
-            axis = list(range(input.type.ndim))
+            axis = list(range(inp.type.ndim))

        if len(axis) == 0:
+            # This is just an Elemwise cast operation
            # The acc_dtype is never a downcast compared to the input dtype
            # So we just need a cast to the output dtype.
-            var = pytensor.tensor.basic.cast(input, node.outputs[0].dtype)
-            if var is input:
-                var = Elemwise(scalar_identity)(input)
+            var = pytensor.tensor.basic.cast(inp, node.outputs[0].dtype)
+            if var is inp:
+                var = Elemwise(scalar_identity)(inp)
            assert var.dtype == node.outputs[0].dtype
-            return var.owner.op._c_all(var.owner, name, inames, onames, sub)
-
-        order1 = [i for i in range(input.type.ndim) if i not in axis]
-        order = order1 + list(axis)
+            return var.owner.op._c_all(var.owner, name, input_names, output_names, sub)

-        nnested = len(order1)
+        inp_dims = list(range(ndim))
+        non_reduced_dims = [i for i in inp_dims if i not in axis]
+        counter = iter(range(ndim))
+        acc_dims = ["x" if i in axis else next(counter) for i in range(ndim)]

-        sub = dict(sub)
-        for i, (input, iname) in enumerate(zip(node.inputs, inames)):
-            sub[f"lv{i}"] = iname
+        sub = sub.copy()
+        sub["lv0"] = inp_name
+        sub["lv1"] = out_name
+        sub["olv"] = out_name

-        decl = ""
-        if adtype != odtype:
+        if acc_dtype != out_dtype:
            # Create an accumulator variable different from the output
-            aname = "acc"
-            decl = acc_type.c_declare(aname, sub)
-            decl += acc_type.c_init(aname, sub)
+            acc_name = "acc"
+            setup = acc_type.c_declare(acc_name, sub) + acc_type.c_init(acc_name, sub)
        else:
            # the output is the accumulator variable
-            aname = oname
-
-        decl += cgen.make_declare([order], [idtype], sub)
-        checks = cgen.make_checks([order], [idtype], sub)
-
-        alloc = ""
-        i += 1
-        sub[f"lv{i}"] = oname
-        sub["olv"] = oname
-
-        # Allocate output buffer
-        alloc += cgen.make_declare(
-            [list(range(nnested)) + ["x"] * len(axis)], [odtype], dict(sub, lv0=oname)
-        )
-        alloc += cgen.make_alloc([order1], odtype, sub)
-        alloc += cgen.make_checks(
-            [list(range(nnested)) + ["x"] * len(axis)], [odtype], dict(sub, lv0=oname)
+            acc_name = out_name
+            setup = ""
+
+        # Define strides of input array
+        setup += cgen.make_declare(
+            [inp_dims], [inp_dtype], sub, compute_stride_jump=False
+        ) + cgen.make_checks([inp_dims], [inp_dtype], sub, compute_stride_jump=False)
+
+        # Define strides of output array and allocate it
+        out_sub = sub | {"lv0": out_name}
+        alloc = (
+            cgen.make_declare(
+                [acc_dims], [out_dtype], out_sub, compute_stride_jump=False
+            )
+            + cgen.make_alloc([non_reduced_dims], out_dtype, sub)
+            + cgen.make_checks(
+                [acc_dims], [out_dtype], out_sub, compute_stride_jump=False
+            )
        )

-        if adtype != odtype:
-            # Allocate accumulation buffer
-            sub[f"lv{i}"] = aname
-            sub["olv"] = aname
+        if acc_dtype != out_dtype:
+            # Define strides of accumulation buffer and allocate it
+            sub["lv1"] = acc_name
+            sub["olv"] = acc_name

-            alloc += cgen.make_declare(
-                [list(range(nnested)) + ["x"] * len(axis)],
-                [adtype],
-                dict(sub, lv0=aname),
-            )
-            alloc += cgen.make_alloc([order1], adtype, sub)
-            alloc += cgen.make_checks(
-                [list(range(nnested)) + ["x"] * len(axis)],
-                [adtype],
-                dict(sub, lv0=aname),
+            acc_sub = sub | {"lv0": acc_name}
+            alloc += (
+                cgen.make_declare(
+                    [acc_dims], [acc_dtype], acc_sub, compute_stride_jump=False
+                )
+                + cgen.make_alloc([non_reduced_dims], acc_dtype, sub)
+                + cgen.make_checks(
+                    [acc_dims], [acc_dtype], acc_sub, compute_stride_jump=False
+                )
            )

        identity = self.scalar_op.identity
-
        if np.isposinf(identity):
-            if input.type.dtype in ("float32", "float64"):
+            if inp.type.dtype in ("float32", "float64"):
                identity = "__builtin_inf()"
-            elif input.type.dtype.startswith("uint") or input.type.dtype == "bool":
+            elif inp.type.dtype.startswith("uint") or inp.type.dtype == "bool":
                identity = "1"
            else:
-                identity = "NPY_MAX_" + str(input.type.dtype).upper()
+                identity = "NPY_MAX_" + str(inp.type.dtype).upper()
        elif np.isneginf(identity):
-            if input.type.dtype in ("float32", "float64"):
+            if inp.type.dtype in ("float32", "float64"):
                identity = "-__builtin_inf()"
-            elif input.type.dtype.startswith("uint") or input.type.dtype == "bool":
+            elif inp.type.dtype.startswith("uint") or inp.type.dtype == "bool":
                identity = "0"
            else:
-                identity = "NPY_MIN_" + str(input.type.dtype).upper()
+                identity = "NPY_MIN_" + str(inp.type.dtype).upper()
        elif identity is None:
            raise TypeError(f"The {self.scalar_op} does not define an identity.")

-        task0_decl = f"{adtype}& {aname}_i = *{aname}_iter;\n{aname}_i = {identity};"
-
-        task1_decl = f"{idtype}& {inames[0]}_i = *{inames[0]}_iter;\n"
+        initial_value = f"{acc_name}_i = {identity};"

-        task1_code = self.scalar_op.c_code(
+        inner_task = self.scalar_op.c_code(
            Apply(
                self.scalar_op,
                [
@@ -1570,44 +1569,45 @@ class CAReduce(COp):
                ],
            ),
            None,
-            [f"{aname}_i", f"{inames[0]}_i"],
-            [f"{aname}_i"],
+            [f"{acc_name}_i", f"{inp_name}_i"],
+            [f"{acc_name}_i"],
            sub,
        )
-        code1 = f"""
-        {{
-            {task1_decl}
-            {task1_code}
-        }}
-        """

-        if node.inputs[0].type.ndim:
-            if len(axis) == 1:
-                all_code = [("", "")] * nnested + [(task0_decl, code1), ""]
-            else:
-                all_code = (
-                    [("", "")] * nnested
-                    + [(task0_decl, "")]
-                    + [("", "")] * (len(axis) - 2)
-                    + [("", code1), ""]
-                )
+        if out.type.ndim == 0:
+            # Simple case where everything is reduced, no need for loop ordering
+            loop = cgen.make_complete_loop_careduce(
+                inp_var=inp_name,
+                acc_var=acc_name,
+                inp_dtype=inp_dtype,
+                acc_dtype=acc_dtype,
+                initial_value=initial_value,
+                inner_task=inner_task,
+                fail_code=sub["fail"],
+            )
        else:
-            all_code = [task0_decl + code1]
-        loop = cgen.make_loop_careduce(
-            [order, list(range(nnested)) + ["x"] * len(axis)],
-            [idtype, adtype],
-            all_code,
-            sub,
-        )
+            loop = cgen.make_reordered_loop_careduce(
+                inp_var=inp_name,
+                acc_var=acc_name,
+                inp_dtype=inp_dtype,
+                acc_dtype=acc_dtype,
+                inp_ndim=ndim,
+                reduction_axes=axis,
+                initial_value=initial_value,
+                inner_task=inner_task,
+            )

-        end = ""
-        if adtype != odtype:
-            end = f"""
-            PyArray_CopyInto({oname}, {aname});
-            """
-            end += acc_type.c_cleanup(aname, sub)
+        if acc_dtype != out_dtype:
+            cast = dedent(
+                f"""
+                PyArray_CopyInto({out_name}, {acc_name});
+                {acc_type.c_cleanup(acc_name, sub)}
+                """
+            )
+        else:
+            cast = ""

-        return decl, checks, alloc, loop, end
+        return setup, alloc, loop, cast

    def c_code(self, node, name, inames, onames, sub):
        code = "\n".join(self._c_all(node, name, inames, onames, sub))
@@ -1619,7 +1619,7 @@ class CAReduce(COp):

    def c_code_cache_version_apply(self, node):
        # the version corresponding to the c code in this Op
-        version = [9]
+        version = [10]

        # now we insert versions for the ops on which we depend...
        scalar_node = Apply(

--- a/pytensor/tensor/elemwise_cgen.py
+++ b/pytensor/tensor/elemwise_cgen.py
+from collections.abc import Sequence
 from textwrap import dedent, indent

 from pytensor.configdefaults import config


-def make_declare(loop_orders, dtypes, sub):
+def make_declare(loop_orders, dtypes, sub, compute_stride_jump=True):
    """
    Produce code to declare all necessary variables.

@@ -20,13 +21,11 @@ def make_declare(loop_orders, dtypes, sub):
                # the number of elements in that dimension,
                # the stride in that dimension,
                # and the jump from an iteration to the next
-                decl += f"""
-                npy_intp {var}_n{value};
-                ssize_t {var}_stride{value};
-                int {var}_jump{value}_{j};
-                """
+                decl += f"npy_intp {var}_n{value};\nssize_t {var}_stride{value};\n"
+                if compute_stride_jump:
+                    decl += f"int {var}_jump{value}_{j};\n"

-            else:
+            elif compute_stride_jump:
                # if the dimension is broadcasted, we only need
                # the jump (arbitrary length and stride = 0)
                decl += f"int {var}_jump{value}_{j};\n"
@@ -34,7 +33,7 @@ def make_declare(loop_orders, dtypes, sub):
    return decl


-def make_checks(loop_orders, dtypes, sub):
+def make_checks(loop_orders, dtypes, sub, compute_stride_jump=True):
    init = ""
    for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)):
        var = sub[f"lv{i}"]
@@ -67,13 +66,13 @@ def make_checks(loop_orders, dtypes, sub):
                # Initialize the variables associated to the jth loop
                # jump = stride - adjust
                jump = f"({var}_stride{index}) - ({adjust})"
-                init += f"""
-                {var}_n{index} = PyArray_DIMS({var})[{index}];
-                {var}_stride{index} = PyArray_STRIDES({var})[{index}] / sizeof({dtype});
-                {var}_jump{index}_{j} = {jump};
-                """
+                init += f"{var}_n{index} = PyArray_DIMS({var})[{index}];\n"
+                init += f"{var}_stride{index} = PyArray_STRIDES({var})[{index}] / sizeof({dtype});\n"
+                if compute_stride_jump:
+                    init += f"{var}_jump{index}_{j} = {jump};\n"
                adjust = f"{var}_n{index}*{var}_stride{index}"
-            else:
+
+            elif compute_stride_jump:
                jump = f"-({adjust})"
                init += f"{var}_jump{index}_{j} = {jump};\n"
                adjust = "0"
@@ -460,72 +459,298 @@ def make_reordered_loop(
 ################


-def make_loop_careduce(loop_orders, dtypes, loop_tasks, sub):
+def make_complete_loop_careduce(
+    inp_var: str,
+    acc_var: str,
+    inp_dtype: str,
+    acc_dtype: str,
+    initial_value: str,
+    inner_task: str,
+    fail_code,
+) -> str:
+    """Generate C code for a complete reduction loop.
+
+    The generated code for a float64 input variable `inp` and accumulation variable `acc` looks like:
+
+    .. code-block:: C
+        {
+            NpyIter* iter;
+            NpyIter_IterNextFunc *iternext;
+            char** data_ptr;
+            npy_intp* stride_ptr,* innersize_ptr;
+
+            // Special case for empty inputs
+            if (PyArray_SIZE(inp) == 0) {
+                npy_float64 &acc_i = *(npy_float64*)(PyArray_DATA(acc));
+                acc_i = 0;
+            }else{
+                iter = NpyIter_New(inp,
+                                   NPY_ITER_READONLY| NPY_ITER_EXTERNAL_LOOP| NPY_ITER_REFS_OK,
+                                   NPY_KEEPORDER,
+                                   NPY_NO_CASTING,
+                                   NULL);
+                iternext = NpyIter_GetIterNext(iter, NULL);
+                if (iternext == NULL) {
+                    NpyIter_Deallocate(iter);
+                    { fail }
+                }
+                data_ptr = NpyIter_GetDataPtrArray(iter);
+                stride_ptr = NpyIter_GetInnerStrideArray(iter);
+                innersize_ptr = NpyIter_GetInnerLoopSizePtr(iter);
+
+                npy_float64 acc_i;
+                acc_i = 0;
+                do {
+                    char* data = *data_ptr;
+                    npy_intp stride = *stride_ptr;
+                    npy_intp count = *innersize_ptr;
+
+                    while(count--) {
+                        npy_float64 inp_i = *((npy_float64*)data);
+                        acc_i = acc_i + inp_i;
+                        data += stride;
+                    }
+
+                } while(iternext(iter));
+                NpyIter_Deallocate(iter);
+
+                *(npy_float64*)(PyArray_DATA(acc)) = acc_i;
+            }
+        }
    """
-    Make a nested loop over several arrays and associate specific code
-    to each level of nesting.
+    return dedent(
+        f"""
+        {{
+            NpyIter* iter;
+            NpyIter_IterNextFunc *iternext;
+            char** data_ptr;
+            npy_intp* stride_ptr,* innersize_ptr;
+
+            // Special case for empty inputs
+            if (PyArray_SIZE({inp_var}) == 0) {{
+                {acc_dtype} &{acc_var}_i = *({acc_dtype}*)(PyArray_DATA({acc_var}));
+                {initial_value}
+            }}else{{
+                iter = NpyIter_New({inp_var},
+                                   NPY_ITER_READONLY| NPY_ITER_EXTERNAL_LOOP| NPY_ITER_REFS_OK,
+                                   NPY_KEEPORDER,
+                                   NPY_NO_CASTING,
+                                   NULL);
+
+                iternext = NpyIter_GetIterNext(iter, NULL);
+                if (iternext == NULL) {{
+                    NpyIter_Deallocate(iter);
+                    {fail_code}
+                }}

-    Parameters
-    ----------
-    loop_orders : list of N tuples of length M
-        Each value of each tuple can be either the index of a dimension to
-        loop over or the letter 'x' which means there is no looping to be done
-        over that variable at that point (in other words we broadcast
-        over that dimension). If an entry is an integer, it will become
-        an alias of the entry of that rank.
-    loop_tasks : list of M+1 pieces of code
-        The ith loop_task is a pair of strings, the first
-        string is code to be executed before the ith loop starts, the second
-        one contains code to be executed just before going to the next element
-        of the ith dimension.
-        The last element if loop_tasks is a single string, containing code
-        to be executed at the very end.
-    sub: dictionary
-        Maps 'lv#' to a suitable variable name.
-        The 'lvi' variable corresponds to the ith element of loop_orders.
+                data_ptr = NpyIter_GetDataPtrArray(iter);
+                stride_ptr = NpyIter_GetInnerStrideArray(iter);
+                innersize_ptr = NpyIter_GetInnerLoopSizePtr(iter);

-    """
+                {acc_dtype} {acc_var}_i;
+                {initial_value}

-    def loop_over(preloop, code, indices, i):
-        iterv = f"ITER_{int(i)}"
-        update = ""
-        suitable_n = "1"
-        for j, index in enumerate(indices):
-            var = sub[f"lv{int(j)}"]
-            update += f"{var}_iter += {var}_jump{index}_{i};\n"
-            if index != "x":
-                suitable_n = f"{var}_n{index}"
-        return f"""
-        {preloop}
-        for (int {iterv} = {suitable_n}; {iterv}; {iterv}--) {{
-            {code}
-            {update}
+                do {{
+                    char* data = *data_ptr;
+                    npy_intp stride = *stride_ptr;
+                    npy_intp count = *innersize_ptr;
+
+                    while(count--) {{
+                        {inp_dtype} {inp_var}_i = *(({inp_dtype}*)data);
+                        {inner_task}
+                        data += stride;
+                    }}
+                }} while(iternext(iter));
+
+                NpyIter_Deallocate(iter);
+                *({acc_dtype}*)(PyArray_DATA({acc_var})) = {acc_var}_i;
+            }}
        }}
        """
+    )

-    preloops = {}
-    for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)):
-        for j, index in enumerate(loop_order):
-            if index != "x":
-                preloops.setdefault(j, "")
-                preloops[j] += (
-                    f"%(lv{i})s_iter = ({dtype}*)(PyArray_DATA(%(lv{i})s));\n"
-                ) % sub
-                break
-        else:  # all broadcastable
-            preloops.setdefault(0, "")
-            preloops[0] += (
-                f"%(lv{i})s_iter = ({dtype}*)(PyArray_DATA(%(lv{i})s));\n"
-            ) % sub

-    if len(loop_tasks) == 1:
-        s = preloops.get(0, "")
-    else:
-        s = ""
-        for i, (pre_task, task), indices in reversed(
-            list(zip(range(len(loop_tasks) - 1), loop_tasks, list(zip(*loop_orders))))
-        ):
-            s = loop_over(preloops.get(i, "") + pre_task, s + task, indices, i)
+def make_reordered_loop_careduce(
+    inp_var: str,
+    acc_var: str,
+    inp_dtype: str,
+    acc_dtype: str,
+    inp_ndim: int,
+    reduction_axes: Sequence[int],
+    initial_value: str,
+    inner_task: str,
+) -> str:
+    """Generate C code for a partial reduction loop, reordering for optimal memory access of the input variable.
+
+    The generated code for a sum along the last axis of a 2D float64 input variable `inp`
+    in an accumulation variable `acc` looks like:
+
+    .. code-block:: C
+        {
+            // Special case for empty inputs
+            if (PyArray_SIZE(inp) == 0) {
+                acc_iter = (npy_float64*)(PyArray_DATA(acc));
+                int n =  PyArray_SIZE(acc);
+                for(int i = 0; i < n; i++)
+                {
+                    npy_float64 &acc_i = acc_iter[i];
+                    acc_i = 0;
+                }
+            } else {
+            std::vector< std::pair<int, int> > loops(2);
+            std::vector< std::pair<int, int> >::iterator loops_it = loops.begin();
+
+            loops_it->first = abs(PyArray_STRIDES(inp)[0]);
+            loops_it->second = 0;
+            ++loops_it;
+            loops_it->first = abs(PyArray_STRIDES(inp)[1]);
+            loops_it->second = 1;
+            ++loops_it;
+            std::sort(loops.rbegin(), loops.rend());
+
+            int dim_lengths[2] = {inp_n0, inp_n1};
+            int inp_strides[2] = {inp_stride0, inp_stride1};
+            int acc_strides[2] = {acc_stride0, 0};
+            bool reduction_axes[2] = {0, 1};
+
+            loops_it = loops.begin();
+            int dim_length_0 = dim_lengths[loops_it->second];
+            int is_reduction_axis_0 = reduction_axes[loops_it->second];
+            int inp_stride_0 = inp_strides[loops_it->second];
+            int acc_stride_0 = acc_strides[loops_it->second];
+            ++loops_it;
+            int dim_length_1 = dim_lengths[loops_it->second];
+            int is_reduction_axis_1 = reduction_axes[loops_it->second];
+            int inp_stride_1 = inp_strides[loops_it->second];
+            int acc_stride_1 = acc_strides[loops_it->second];
+            ++loops_it;
+
+            inp_iter = (npy_float64*)(PyArray_DATA(inp));
+            acc_iter = (npy_float64*)(PyArray_DATA(acc));
+
+            for(int iter_0 = 0; iter_0<dim_length_0; iter_0++){
+                for(int iter_1 = 0; iter_1<dim_length_1; iter_1++){
+                    npy_float64 &inp_i = *(inp_iter + inp_stride_1*iter_1 + inp_stride_0*iter_0);
+                    npy_float64 &acc_i = *(acc_iter + acc_stride_1*iter_1 + acc_stride_0*iter_0);
+
+                    if((!is_reduction_axis_0 || iter_0 == 0) && (!is_reduction_axis_1 || iter_1 == 0))
+                    {
+                        acc_i = 0;
+                    }
+                    {acc_i = acc_i + inp_i;}
+                }
+            }
+        }

-    s += loop_tasks[-1]
-    return f"{{{s}}}"
+    """
+
+    empty_case = dedent(
+        f"""
+        // Special case for empty inputs
+        if (PyArray_SIZE({inp_var}) == 0) {{
+            {acc_var}_iter = ({acc_dtype}*)(PyArray_DATA({acc_var}));
+            int n =  PyArray_SIZE({acc_var});
+            for(int i = 0; i < n; i++)
+            {{
+                {acc_dtype} &{acc_var}_i = {acc_var}_iter[i];
+                {initial_value}
+            }}
+        }} else {{
+        """
+    )
+
+    # The loops are ordered by (decreasing) absolute values of inp_var's strides.
+    # The first element of each pair is the absolute value of the stride
+    # The second element corresponds to the index in the initial loop order
+    order_loops = dedent(
+        f"""
+        std::vector< std::pair<int, int> > loops({inp_ndim});
+        std::vector< std::pair<int, int> >::iterator loops_it = loops.begin();
+        """
+    )
+
+    # Fill the loop vector with the appropriate <stride, index> pairs
+    for i in range(inp_ndim):
+        order_loops += dedent(
+            f"""
+            loops_it->first = abs(PyArray_STRIDES({inp_var})[{i}]);
+            loops_it->second = {i};
+            ++loops_it;"""
+        )
+
+    # We sort in decreasing order so that the outermost loop (loop 0)
+    # has the largest stride, and the innermost loop has the smallest stride.
+    order_loops += "\nstd::sort(loops.rbegin(), loops.rend());\n"
+
+    # Sort shape and strides to match the new order that was computed by sorting the loop vector.
+    counter = iter(range(inp_ndim))
+    unsorted_vars = dedent(
+        f"""
+        int dim_lengths[{inp_ndim}] = {{{','.join(f'{inp_var}_n{i}' for i in range(inp_ndim))}}};
+        int inp_strides[{inp_ndim}] = {{{','.join(f'{inp_var}_stride{i}' for i in range(inp_ndim))}}};
+        int acc_strides[{inp_ndim}] = {{{','.join("0" if i in reduction_axes else f'{acc_var}_stride{next(counter)}'for i in range(inp_ndim))}}};
+        bool reduction_axes[{inp_ndim}] = {{{', '.join("1" if i in reduction_axes else "0" for i in range(inp_ndim))}}};\n
+        """
+    )
+
+    sorted_vars = "loops_it = loops.begin();"
+    for i in range(inp_ndim):
+        sorted_vars += dedent(
+            f"""
+            int dim_length_{i} = dim_lengths[loops_it->second];
+            int is_reduction_axis_{i} = reduction_axes[loops_it->second];
+            int {inp_var}_stride_{i} = inp_strides[loops_it->second];
+            int {acc_var}_stride_{i} = acc_strides[loops_it->second];
+            ++loops_it;
+            """
+        )
+
+    declare_iter = dedent(
+        f"""
+        {inp_var}_iter = ({inp_dtype}*)(PyArray_DATA({inp_var}));
+        {acc_var}_iter = ({acc_dtype}*)(PyArray_DATA({acc_var}));
+        """
+    )
+
+    pointer_update = ""
+    for var, dtype in ((inp_var, inp_dtype), (acc_var, acc_dtype)):
+        pointer_update += f"{dtype} &{var}_i = *({var}_iter"
+        for i in reversed(tuple(range(inp_ndim))):
+            iter_var = f"iter_{i}"
+            pointer_update += f" + {var}_stride_{i}*{iter_var}"
+        pointer_update += ");\n"
+
+    # Set initial value in first iteration of each output
+    # This happens on the first iteration of every reduction axis
+    initial_iteration = " && ".join(
+        f"(!is_reduction_axis_{i} || iter_{i} == 0)" for i in range(inp_ndim)
+    )
+    set_initial_value = dedent(
+        f"""
+        if({initial_iteration})
+        {{
+            {initial_value}
+        }}
+        """
+    )
+
+    # We do the pointer update, set the initial value and run the inner task in the innermost loop
+    loop = "\n\n".join((pointer_update, set_initial_value, f"{{{inner_task}}}"))
+
+    # Create outer loops recursively
+    for i in reversed(range(inp_ndim)):
+        iter_var = f"iter_{i}"
+        dim_length = f"dim_length_{i}"
+        loop = dedent(
+            f"""
+            for(int {iter_var} = 0; {iter_var}<{dim_length}; {iter_var}++){{
+                {loop}
+            }}
+            """
+        )
+
+    non_empty_case = "\n".join(
+        (order_loops, unsorted_vars, sorted_vars, declare_iter, loop)
+    )
+    code = "\n".join((empty_case, non_empty_case, "}"))
+    return f"{{\n{code}\n}}\n"
--- a/pytensor/tensor/math.py
+++ b/pytensor/tensor/math.py
 import builtins
 import warnings
 from collections.abc import Sequence
+from textwrap import dedent
 from typing import TYPE_CHECKING, Optional

 import numpy as np
@@ -361,12 +362,14 @@ class FixedOpCAReduce(CAReduce):


 class NonZeroDimsCAReduce(FixedOpCAReduce):
-    def _c_all(self, node, name, inames, onames, sub):
-        decl, checks, alloc, loop, end = super()._c_all(node, name, inames, onames, sub)
+    def _c_all(self, node, name, input_names, output_names, sub):
+        setup, alloc, loop, cast = super()._c_all(
+            node, name, input_names, output_names, sub
+        )

        # We add an additional check for zero-sized dimensions (This seems like
        # something that could enabled in `elemwise_cgen.make_checks`.)
-        iname = inames[0]
+        [iname] = input_names

        axis = self.axis
        if axis is None:
@@ -378,17 +381,19 @@ class NonZeroDimsCAReduce(FixedOpCAReduce):

        pattern_ = str(pattern)[1:-1]

-        decl += f"""int tosum[]={{{pattern_}}};"""
-        alloc += f"""
-                for(int i=0;i<PyArray_NDIM({iname});i++){{
-                    if(PyArray_DIMS({iname})[i]==0 && tosum[i]){{
-                        PyErr_Format(PyExc_ValueError,
-                            "Input of CAReduce{{{node.op.scalar_op}}} has zero-size on axis %%d",i);
-                        {sub["fail"]};
-                    }}
+        setup = f"int tosum[]={{{pattern_}}};" + setup
+        alloc += dedent(
+            f"""
+            for(int i=0;i<PyArray_NDIM({iname});i++){{
+                if(PyArray_DIMS({iname})[i]==0 && tosum[i]){{
+                    PyErr_Format(PyExc_ValueError,
+                        "Input of CAReduce{{{node.op.scalar_op}}} has zero-size on axis %%d",i);
+                    {sub["fail"]};
                }}
-                """
-        return decl, checks, alloc, loop, end
+            }}
+            """
+        )
+        return setup, alloc, loop, cast


 class Max(NonZeroDimsCAReduce):