提交 10c36d2a authored 作者: Ricardo Vieira's avatar Ricardo Vieira 提交者: Ricardo Vieira

Apply useless blockwise rewrite when there are only dummy batch dims

Also extend the eager rewrite to more Ops. The Blockwise MatrixInverse grad test became more sensitive in float32, because desired stabilization rewrites (mainly `inv_as_solve`) that target Dot of Blockwise{MatrixInverse} are now triggered by the default blockwise grad but not by the non-default non-blockwise grad.
上级 fe5865ef
...@@ -163,8 +163,8 @@ class Blockwise(Op): ...@@ -163,8 +163,8 @@ class Blockwise(Op):
return Apply(self, batched_inputs, batched_outputs) return Apply(self, batched_inputs, batched_outputs)
def batch_ndim(self, node: Apply) -> int:
    """Return the number of batch dimensions of ``node``'s outputs.

    Computed as the output ndim minus the length of the first core
    output signature.
    """
    core_out_ndim = len(self.outputs_sig[0])
    return cast(int, node.outputs[0].type.ndim - core_out_ndim)
def infer_shape( def infer_shape(
self, fgraph, node, input_shapes self, fgraph, node, input_shapes
...@@ -172,7 +172,7 @@ class Blockwise(Op): ...@@ -172,7 +172,7 @@ class Blockwise(Op):
from pytensor.tensor import broadcast_shape from pytensor.tensor import broadcast_shape
from pytensor.tensor.shape import Shape_i from pytensor.tensor.shape import Shape_i
batch_ndims = self._batch_ndim_from_outputs(node.outputs) batch_ndims = self.batch_ndim(node)
core_dims: dict[str, Any] = {} core_dims: dict[str, Any] = {}
batch_shapes = [] batch_shapes = []
for input_shape, sig in zip(input_shapes, self.inputs_sig): for input_shape, sig in zip(input_shapes, self.inputs_sig):
...@@ -278,7 +278,7 @@ class Blockwise(Op): ...@@ -278,7 +278,7 @@ class Blockwise(Op):
return new_rval return new_rval
# Sum out the broadcasted dimensions # Sum out the broadcasted dimensions
batch_ndims = self._batch_ndim_from_outputs(outs) batch_ndims = self.batch_ndim(outs[0].owner)
batch_shape = outs[0].type.shape[:batch_ndims] batch_shape = outs[0].type.shape[:batch_ndims]
for i, (inp, sig) in enumerate(zip(inputs, self.inputs_sig)): for i, (inp, sig) in enumerate(zip(inputs, self.inputs_sig)):
if isinstance(rval[i].type, (NullType, DisconnectedType)): if isinstance(rval[i].type, (NullType, DisconnectedType)):
...@@ -320,7 +320,7 @@ class Blockwise(Op): ...@@ -320,7 +320,7 @@ class Blockwise(Op):
return self._gufunc return self._gufunc
def _check_runtime_broadcast(self, node, inputs): def _check_runtime_broadcast(self, node, inputs):
batch_ndim = self._batch_ndim_from_outputs(node.outputs) batch_ndim = self.batch_ndim(node)
for dims_and_bcast in zip( for dims_and_bcast in zip(
*[ *[
......
...@@ -2,9 +2,15 @@ from pytensor.compile.mode import optdb ...@@ -2,9 +2,15 @@ from pytensor.compile.mode import optdb
from pytensor.graph import node_rewriter from pytensor.graph import node_rewriter
from pytensor.graph.replace import vectorize_node from pytensor.graph.replace import vectorize_node
from pytensor.graph.rewriting.basic import copy_stack_trace, out2in from pytensor.graph.rewriting.basic import copy_stack_trace, out2in
from pytensor.tensor.basic import Alloc, ARange, shape_padleft
from pytensor.tensor.blockwise import Blockwise from pytensor.tensor.blockwise import Blockwise
from pytensor.tensor.math import _matrix_matrix_matmul from pytensor.tensor.math import Dot
from pytensor.tensor.rewriting.basic import register_canonicalize from pytensor.tensor.rewriting.basic import (
register_canonicalize,
register_specialize,
register_stabilize,
)
from pytensor.tensor.subtensor import AdvancedIncSubtensor, AdvancedSubtensor, Subtensor
@node_rewriter([Blockwise]) @node_rewriter([Blockwise])
def local_useless_unbatched_blockwise(fgraph, node):
    """Replace a Blockwise by its core Op when all batch dims are broadcastable.

    If every batch dimension of every input is a dummy (broadcastable)
    dimension, the core Op can be applied directly: squeeze the dummy
    batch dims, apply the core Op, and pad the outputs back to the
    original number of dimensions.
    """
    op = node.op
    inputs = node.inputs
    batch_ndims = node.op.batch_ndim(node)

    # Only applicable when every batch dim of every input is broadcastable.
    if all(all(inp.type.broadcastable[:batch_ndims]) for inp in inputs):
        if batch_ndims:
            # Squeeze out the dummy batch dims before applying the core Op
            dummy_axes = tuple(range(batch_ndims))
            inputs = [inp.squeeze(dummy_axes) for inp in inputs]

        new_outs = op.core_op.make_node(*inputs).outputs

        if batch_ndims:
            # Pad the dummy batch dims back onto the core outputs
            new_outs = [shape_padleft(out, batch_ndims) for out in new_outs]

        return copy_stack_trace(node.outputs, new_outs)
# We register this rewrite late, so that other rewrites need only target Blockwise Ops # We register this rewrite late, so that other rewrites need only target Blockwise Ops
...@@ -46,6 +61,22 @@ optdb.register( ...@@ -46,6 +61,22 @@ optdb.register(
# Avoid redundant cases early on for Ops whose default form is not Blockwised
@register_canonicalize
@register_stabilize
@register_specialize
@node_rewriter(tracks=[Blockwise])
def local_eager_useless_unbatched_blockwise(fgraph, node):
    """Eagerly remove useless Blockwise for a whitelist of core Ops.

    Delegates to ``local_useless_unbatched_blockwise`` for core Ops that
    benefit from being unblocked early in the rewrite pipeline.
    """
    eager_core_ops = (
        # Many Dot-related rewrites (e.g., all of BlasOpt) happen before specialize
        Dot,
        # These Ops can't always be trivially vectorized at runtime,
        # since their inputs may imply non-rectangular shapes.
        Alloc,
        ARange,
        Subtensor,
        AdvancedSubtensor,
        AdvancedIncSubtensor,
    )
    if isinstance(node.op.core_op, eager_core_ops):
        return local_useless_unbatched_blockwise.fn(fgraph, node)
...@@ -293,7 +293,7 @@ class BlockwiseOpTester: ...@@ -293,7 +293,7 @@ class BlockwiseOpTester:
pt_out, pt_out,
np_out, np_out,
rtol=1e-7 if config.floatX == "float64" else 1e-5, rtol=1e-7 if config.floatX == "float64" else 1e-5,
atol=1e-6 if config.floatX == "float64" else 1e-5, atol=1e-6 if config.floatX == "float64" else 1e-4,
) )
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论