提交 7fb4e70a authored 作者: Ricardo Vieira's avatar Ricardo Vieira 提交者: Thomas Wiecki
上级 a6975da3
差异被折叠。
......@@ -22,6 +22,7 @@ from pytensor.scalar.basic import transfer_type, upcast
from pytensor.tensor import elemwise_cgen as cgen
from pytensor.tensor import get_vector_length
from pytensor.tensor.basic import _get_vector_length, as_tensor_variable
from pytensor.tensor.blockwise import _vectorize_node, vectorize_not_needed
from pytensor.tensor.type import (
TensorType,
continuous_dtypes,
......@@ -29,6 +30,7 @@ from pytensor.tensor.type import (
float_dtypes,
lvector,
)
from pytensor.tensor.utils import broadcast_static_dim_lengths, import_func_from_string
from pytensor.tensor.variable import TensorVariable
from pytensor.utils import uniq
......@@ -232,7 +234,7 @@ class DimShuffle(ExternalCOp):
return f"Transpose{{axes={self.shuffle}}}"
return f"DimShuffle{{order=[{','.join(map(str, self.new_order))}]}}"
def perform(self, node, inp, out, params):
def perform(self, node, inp, out, params=None):
(res,) = inp
(storage,) = out
......@@ -429,28 +431,12 @@ class Elemwise(OpenMPOp):
# of all inputs in parallel... the all() gives us each output
# broadcastable bit in turn.
def get_most_specialized_shape(shapes):
shapes = set(shapes)
# All shapes are the same
if len(shapes) == 1:
return tuple(shapes)[0]
# Only valid indeterminate case
if shapes == {None, 1}:
return None
shapes.discard(1)
shapes.discard(None)
if len(shapes) > 1:
raise ValueError
return tuple(shapes)[0]
# it is multiplied by nout because Elemwise supports multiple outputs
# (nout of them)
try:
out_shapes = [
[
get_most_specialized_shape(shape)
broadcast_static_dim_lengths(shape)
for shape in zip(*[inp.type.shape for inp in inputs])
]
] * shadow.nout
......@@ -665,22 +651,7 @@ class Elemwise(OpenMPOp):
impl = "c"
if getattr(self, "nfunc_spec", None) and impl != "c":
self.nfunc = getattr(np, self.nfunc_spec[0], None)
if self.nfunc is None:
# Not inside NumPy. So probably another package like scipy.
symb = self.nfunc_spec[0].split(".")
for idx in range(1, len(self.nfunc_spec[0])):
try:
module = __import__(".".join(symb[:idx]))
except ImportError:
break
for sub in symb[1:]:
try:
module = getattr(module, sub)
except AttributeError:
module = None
break
self.nfunc = module
self.nfunc = import_func_from_string(self.nfunc_spec[0])
if (
(len(node.inputs) + len(node.outputs)) <= 32
......@@ -1768,3 +1739,37 @@ def _get_vector_length_Elemwise(op, var):
return get_vector_length(var.owner.inputs[0])
raise ValueError(f"Length of {var} cannot be determined")
_vectorize_node.register(Elemwise, vectorize_not_needed)
@_vectorize_node.register(DimShuffle)
def vectorize_dimshuffle(op: DimShuffle, node: Apply, x: TensorVariable) -> Apply:
    """Vectorize a ``DimShuffle`` over leading batch dimensions of ``x``."""
    n_batch = x.type.ndim - node.inputs[0].type.ndim
    if n_batch == 0:
        # No extra leading dims: the original op applies unchanged.
        return node.op.make_node(x)

    # Prepend the batch dims' broadcast pattern to the core input pattern.
    batched_bcast = x.type.broadcastable[:n_batch] + op.input_broadcastable

    # Keep the batch axes first, then shift every core axis index by n_batch.
    # e.g., ds(matrix, order=(1, "x", 0)) -> ds(tensor4, order=(0, 1, 3, "x", 2))
    # e.g., ds(row, order=(1, "x")) -> ds(tensor4, order=(0, 1, 3, "x"))
    shifted_core_order = [o if o == "x" else o + n_batch for o in op.new_order]
    batched_order = list(range(n_batch)) + shifted_core_order

    return DimShuffle(batched_bcast, batched_order).make_node(x)
@_vectorize_node.register(CAReduce)
def vectorize_careduce(op: CAReduce, node: Apply, x: TensorVariable) -> Apply:
    """Vectorize a ``CAReduce`` over leading batch dimensions of ``x``."""
    n_batch = x.type.ndim - node.inputs[0].type.ndim
    if n_batch == 0:
        # Input has no extra batch dims: reuse the node's op directly.
        return node.op.make_node(x)

    # Reduction axes refer to the core input; shift them past the batch dims.
    # e.g., sum(matrix, axis=None) -> sum(tensor4, axis=(2, 3))
    # e.g., sum(matrix, axis=0) -> sum(tensor4, axis=(2,))
    if op.axis is None:
        core_axes = list(range(node.inputs[0].type.ndim))
    else:
        core_axes = list(op.axis)
    shifted_axes = [axis + n_batch for axis in core_axes]

    return op.clone(axis=shifted_axes).make_node(x)
......@@ -5,19 +5,25 @@ import numpy as np
import pytensor
from pytensor.configdefaults import config
from pytensor.graph.basic import Apply, Variable
from pytensor.graph.basic import Apply, Variable, equal_computations
from pytensor.graph.op import Op
from pytensor.misc.safe_asarray import _asarray
from pytensor.scalar import ScalarVariable
from pytensor.tensor.basic import (
as_tensor_variable,
concatenate,
constant,
get_underlying_scalar_constant_value,
get_vector_length,
infer_static_shape,
)
from pytensor.tensor.blockwise import _vectorize_node
from pytensor.tensor.random.type import RandomGeneratorType, RandomStateType, RandomType
from pytensor.tensor.random.utils import normalize_size_param, params_broadcast_shapes
from pytensor.tensor.random.utils import (
broadcast_params,
normalize_size_param,
params_broadcast_shapes,
)
from pytensor.tensor.shape import shape_tuple
from pytensor.tensor.type import TensorType, all_dtypes
from pytensor.tensor.type_other import NoneConst
......@@ -383,3 +389,22 @@ class DefaultGeneratorMakerOp(AbstractRNGConstructor):
default_rng = DefaultGeneratorMakerOp()
@_vectorize_node.register(RandomVariable)
def vectorize_random_variable(
    op: RandomVariable, node: Apply, rng, size, dtype, *dist_params
) -> Apply:
    """Vectorize a RandomVariable node for batched distribution parameters.

    If size was provided originally and a new size hasn't been provided,
    we extend it to accommodate the new input batch dimensions.
    Otherwise, we assume the new size already has the right values.
    """
    # Original (pre-vectorization) size input of the node.
    old_size = node.inputs[1]
    len_old_size = get_vector_length(old_size)

    # Only extend `size` when the caller passed the *same* size expression as
    # the original node, i.e. no new size was explicitly provided.
    if len_old_size and equal_computations([old_size], [size]):
        # Broadcast the new params to find the batched shape they imply.
        bcasted_param = broadcast_params(dist_params, op.ndims_params)[0]
        # Number of leading batch dims added beyond what old_size covered.
        new_param_ndim = (bcasted_param.type.ndim - op.ndims_params[0]) - len_old_size
        if new_param_ndim >= 0:
            # Prepend the new leading batch dims to the old size.
            new_size_dims = bcasted_param.shape[:new_param_ndim]
            size = concatenate([new_size_dims, size])

    return op.make_node(rng, size, dtype, *dist_params)
......@@ -2,6 +2,7 @@ import pytensor.tensor.rewriting.basic
import pytensor.tensor.rewriting.blas
import pytensor.tensor.rewriting.blas_c
import pytensor.tensor.rewriting.blas_scipy
import pytensor.tensor.rewriting.blockwise
import pytensor.tensor.rewriting.elemwise
import pytensor.tensor.rewriting.extra_ops
......
from pytensor.compile.mode import optdb
from pytensor.graph import node_rewriter
from pytensor.graph.rewriting.basic import copy_stack_trace, out2in
from pytensor.tensor.blockwise import Blockwise, vectorize_node
@node_rewriter([Blockwise])
def local_useless_blockwise(fgraph, node):
    """
    If there is a dispatch implementation that does not require Blockwise, use that instead.
    This means a user created a Blockwise manually when there was no need.

    Note: This rewrite is not registered by default anywhere
    """
    op = node.op
    inputs = node.inputs
    # Re-run vectorization dispatch on a dummy core node: if a specialized
    # implementation exists, it returns something other than a Blockwise.
    dummy_core_node = op._create_dummy_core_node(node.inputs)
    vect_node = vectorize_node(dummy_core_node, *inputs)
    if not isinstance(vect_node.op, Blockwise):
        # Replace the node, preserving debugging stack traces.
        return copy_stack_trace(node.outputs, vect_node.outputs)
@node_rewriter([Blockwise])
def local_useless_unbatched_blockwise(fgraph, node):
    """Remove Blockwise that don't have any batched dims."""
    blockwise_op = node.op
    node_inputs = node.inputs
    batch_ndims = [
        inp.type.ndim - len(sig)
        for inp, sig in zip(node_inputs, blockwise_op.inputs_sig)
    ]
    if max(batch_ndims) == 0:
        # No input is batched: the core op can be applied directly.
        core_outputs = blockwise_op.core_op.make_node(*node_inputs).outputs
        return copy_stack_trace(node.outputs, core_outputs)
# We register this rewrite late, so that other rewrites need only target Blockwise Ops
# NOTE(review): position=49 presumably places it after the main specialization
# passes in optdb — confirm against the optdb phase numbering.
optdb.register(
    "local_useless_unbatched_blockwise",
    out2in(local_useless_unbatched_blockwise, ignore_newtrees=True),
    "fast_run",
    "fast_compile",
    "blockwise",
    position=49,
)
from typing import Sequence, Union
import numpy as np
import pytensor
......@@ -107,3 +109,54 @@ def as_list(x):
return list(x)
except TypeError:
return [x]
def import_func_from_string(func_string: str):  # -> Optional[Callable]:
    """Resolve a function from its (possibly dotted) name.

    Parameters
    ----------
    func_string
        Either the name of a NumPy function (e.g. ``"exp"``) or a fully
        qualified dotted path into another package
        (e.g. ``"scipy.special.gammaln"``).

    Returns
    -------
    The resolved callable, or ``None`` if it could not be found.
    """
    # Fast path: a plain NumPy attribute name.
    func = getattr(np, func_string, None)
    if func is not None:
        return func

    # Not a NumPy function, so treat it as a dotted path into another
    # package (e.g. scipy). Import progressively deeper module prefixes,
    # stopping at the first prefix that fails to import.
    module = None
    items = func_string.split(".")
    for idx in range(1, len(items)):
        try:
            # __import__ returns the *top-level* package of the dotted name.
            module = __import__(".".join(items[:idx]))
        except ImportError:
            break

    # Walk the remaining attribute path down from the top-level package.
    if module:
        for sub in items[1:]:
            try:
                module = getattr(module, sub)
            except AttributeError:
                module = None
                break
    return module
def broadcast_static_dim_lengths(
    dim_lengths: Sequence[Union[int, None]]
) -> Union[int, None]:
    """Apply static broadcast given static dim length of inputs (obtained from var.type.shape).

    Parameters
    ----------
    dim_lengths
        The static lengths of one dimension across several inputs; ``None``
        means the length is statically unknown.

    Returns
    -------
    The broadcast static dim length, or ``None`` when it is indeterminate.

    Raises
    ------
    ValueError
        When static dim lengths are incompatible
    """
    dim_lengths_set = set(dim_lengths)
    # All dim_lengths are the same
    if len(dim_lengths_set) == 1:
        return next(iter(dim_lengths_set))
    # Only valid indeterminate case: an unknown length broadcast with 1
    if dim_lengths_set == {None, 1}:
        return None
    # 1 broadcasts with anything; None is assumed to match a known length.
    dim_lengths_set.discard(1)
    dim_lengths_set.discard(None)
    if len(dim_lengths_set) > 1:
        # Previously raised a bare ValueError; include the offending lengths.
        raise ValueError(
            f"Could not broadcast incompatible static dim lengths: {tuple(dim_lengths)}"
        )
    return next(iter(dim_lengths_set))
......@@ -5,7 +5,9 @@ import pytensor.tensor as at
from pytensor import config, function
from pytensor.gradient import NullTypeGradError, grad
from pytensor.raise_op import Assert
from pytensor.tensor.blockwise import vectorize_node
from pytensor.tensor.math import eq
from pytensor.tensor.random import normal
from pytensor.tensor.random.op import RandomState, RandomVariable, default_rng
from pytensor.tensor.shape import specify_shape
from pytensor.tensor.type import all_dtypes, iscalar, tensor
......@@ -202,3 +204,37 @@ def test_RandomVariable_incompatible_size():
ValueError, match="Size length is incompatible with batched dimensions"
):
rv_op(np.zeros((2, 4, 3)), 1, size=(4,))
def test_vectorize_node():
    """Check RandomVariable vectorization with and without an explicit size."""
    vec = tensor(shape=(None,))
    vec.tag.test_value = [0, 0, 0]
    mat = tensor(shape=(None, None))
    mat.tag.test_value = [[0, 0, 0], [1, 1, 1]]

    # Test without size
    node = normal(vec).owner
    new_inputs = node.inputs.copy()
    # Inputs are (rng, size, dtype, *dist_params): index 3 is the first param.
    new_inputs[3] = mat
    vect_node = vectorize_node(node, *new_inputs)
    assert vect_node.op is normal
    assert vect_node.inputs[3] is mat

    # Test with size, new size provided
    node = normal(vec, size=(3,)).owner
    new_inputs = node.inputs.copy()
    new_inputs[1] = (2, 3)  # index 1 is the size input
    new_inputs[3] = mat
    vect_node = vectorize_node(node, *new_inputs)
    assert vect_node.op is normal
    assert tuple(vect_node.inputs[1].eval()) == (2, 3)
    assert vect_node.inputs[3] is mat

    # Test with size, new size not provided
    node = normal(vec, size=(3,)).owner
    new_inputs = node.inputs.copy()
    new_inputs[3] = mat
    vect_node = vectorize_node(node, *new_inputs)
    assert vect_node.op is normal
    assert vect_node.inputs[3] is mat
    # Old size (3,) is extended with the new batch dim (2,) -> (2, 3)
    assert tuple(vect_node.inputs[1].eval({mat: mat.tag.test_value})) == (2, 3)
from pytensor import function
from pytensor.graph import FunctionGraph
from pytensor.scalar import log as scalar_log
from pytensor.tensor import matrix, tensor3
from pytensor.tensor.blockwise import Blockwise
from pytensor.tensor.elemwise import Elemwise
from pytensor.tensor.nlinalg import MatrixPinv
from pytensor.tensor.rewriting.blockwise import local_useless_blockwise
def test_useless_blockwise_of_elemwise():
    """A Blockwise wrapping an Elemwise is redundant and should be unwrapped."""
    x = matrix("x")
    out = Blockwise(Elemwise(scalar_log), signature="()->()")(x)
    assert isinstance(out.owner.op, Blockwise)
    assert isinstance(out.owner.op.core_op, Elemwise)

    fg = FunctionGraph([x], [out], clone=False)
    # The rewrite should return the plain Elemwise output.
    [new_out] = local_useless_blockwise.transform(fg, out.owner)
    assert isinstance(new_out.owner.op, Elemwise)
def test_useless_unbatched_blockwise():
    """Blockwise without batch dims is replaced by its core op; batched is kept."""
    x = matrix("x")
    blockwise_op = Blockwise(MatrixPinv(hermitian=False), signature="(m,n)->(n,m)")
    out = blockwise_op(x)

    assert isinstance(out.owner.op, Blockwise)
    assert isinstance(out.owner.op.core_op, MatrixPinv)

    # A matrix input has no batch dims, so compilation removes the Blockwise.
    fn = function([x], out, mode="FAST_COMPILE")
    assert isinstance(fn.maker.fgraph.outputs[0].owner.op, MatrixPinv)

    # Test that it's not removed when there are batched dims
    x = tensor3("x")
    out = blockwise_op(x)
    fn = function([x], out, mode="FAST_COMPILE")
    assert isinstance(fn.maker.fgraph.outputs[0].owner.op, Blockwise)
    assert isinstance(fn.maker.fgraph.outputs[0].owner.op.core_op, MatrixPinv)
from itertools import product
from typing import Optional, Tuple, Union
import numpy as np
import pytest
import pytensor
from pytensor import config
from pytensor.gradient import grad
from pytensor.graph import Apply, Op
from pytensor.tensor import tensor
from pytensor.tensor.blockwise import Blockwise, _parse_gufunc_signature, vectorize_node
from pytensor.tensor.nlinalg import MatrixInverse
from pytensor.tensor.slinalg import Cholesky, Solve
def test_vectorize_blockwise():
    """Vectorizing an op with no Blockwise-free dispatch falls back to Blockwise."""
    mat = tensor(shape=(None, None))
    tns = tensor(shape=(None, None, None))

    # Something that falls back to Blockwise
    node = MatrixInverse()(mat).owner
    vect_node = vectorize_node(node, tns)
    assert isinstance(vect_node.op, Blockwise) and isinstance(
        vect_node.op.core_op, MatrixInverse
    )
    assert vect_node.inputs[0] is tns

    # Useless blockwise: vectorizing an already-Blockwise node reuses the same op
    tns4 = tensor(shape=(5, None, None, None))
    new_vect_node = vectorize_node(vect_node, tns4)
    assert new_vect_node.op is vect_node.op
    assert isinstance(new_vect_node.op, Blockwise) and isinstance(
        new_vect_node.op.core_op, MatrixInverse
    )
    assert new_vect_node.inputs[0] is tns4
class TestOp(Op):
    """Minimal Op whose outputs mirror its input types; used only in graph tests."""

    def make_node(self, *inputs):
        # One output per input, with the same type as the input.
        return Apply(self, inputs, [i.type() for i in inputs])

    def perform(self, *args, **kwargs):
        # The tests only build/inspect graphs; evaluation must never reach this op.
        raise NotImplementedError("Test Op should not be present in final graph")


test_op = TestOp()
def test_vectorize_node_default_signature():
    """Check the auto-generated gufunc signature and explicit-signature output shapes."""
    vec = tensor(shape=(None,))
    mat = tensor(shape=(5, None))
    node = test_op.make_node(vec, mat)

    vect_node = vectorize_node(node, mat, mat)
    assert isinstance(vect_node.op, Blockwise) and isinstance(
        vect_node.op.core_op, TestOp
    )
    # Default signature names every core dim of every input/output.
    assert vect_node.op.signature == ("(i00),(i10,i11)->(o00),(o10,o11)")

    with pytest.raises(
        ValueError, match="Signature not provided nor found in core_op TestOp"
    ):
        Blockwise(test_op)

    vect_node = Blockwise(test_op, signature="(m),(n)->(m),(n)").make_node(vec, mat)
    # Both outputs gain the broadcast batch dim (5,) from `mat`.
    # (The original duplicated the outputs[0] assertion; the second check
    # should exercise outputs[1].)
    assert vect_node.outputs[0].type.shape == (
        5,
        None,
    )
    assert vect_node.outputs[1].type.shape == (
        5,
        None,
    )
def test_blockwise_shape():
    """Shape inference of Blockwise outputs, with and without unknown core dims."""
    # Single output
    inp = tensor(shape=(5, None, None))
    inp_test = np.zeros((5, 4, 3), dtype=config.floatX)

    # Shape can be inferred from inputs
    op = Blockwise(test_op, signature="(m, n) -> (n, m)")
    out = op(inp)
    assert out.type.shape == (5, None, None)

    shape_fn = pytensor.function([inp], out.shape)
    # The shape graph should not need to evaluate the core op at all.
    assert not any(
        isinstance(getattr(n.op, "core_op", n.op), TestOp)
        for n in shape_fn.maker.fgraph.apply_nodes
    )
    assert tuple(shape_fn(inp_test)) == (5, 3, 4)

    # Shape can only be partially inferred from inputs
    op = Blockwise(test_op, signature="(m, n) -> (m, k)")
    out = op(inp)
    assert out.type.shape == (5, None, None)

    shape_fn = pytensor.function([inp], out.shape)
    # The unknown core dim "k" forces the op to remain in the shape graph.
    assert any(
        isinstance(getattr(n.op, "core_op", n.op), TestOp)
        for n in shape_fn.maker.fgraph.apply_nodes
    )

    # Dropping the unknown last dim makes the shape fully inferable again.
    shape_fn = pytensor.function([inp], out.shape[:-1])
    assert not any(
        isinstance(getattr(n.op, "core_op", n.op), TestOp)
        for n in shape_fn.maker.fgraph.apply_nodes
    )
    assert tuple(shape_fn(inp_test)) == (5, 4)

    # Multiple outputs
    inp1 = tensor(shape=(7, 1, None, None))
    inp2 = tensor(shape=(1, 5, None, None))
    inp1_test = np.zeros((7, 1, 4, 3), dtype=config.floatX)
    inp2_test = np.zeros((1, 5, 4, 3), dtype=config.floatX)

    op = Blockwise(test_op, signature="(m, n), (m, n) -> (n, m), (m, k)")
    outs = op(inp1, inp2)
    # Batch dims (7, 1) and (1, 5) broadcast to (7, 5).
    assert outs[0].type.shape == (7, 5, None, None)
    assert outs[1].type.shape == (7, 5, None, None)

    shape_fn = pytensor.function([inp1, inp2], [out.shape for out in outs])
    # Second output's "k" dim keeps the op in the shape graph.
    assert any(
        isinstance(getattr(n.op, "core_op", n.op), TestOp)
        for n in shape_fn.maker.fgraph.apply_nodes
    )

    shape_fn = pytensor.function([inp1, inp2], outs[0].shape)
    assert not any(
        isinstance(getattr(n.op, "core_op", n.op), TestOp)
        for n in shape_fn.maker.fgraph.apply_nodes
    )
    assert tuple(shape_fn(inp1_test, inp2_test)) == (7, 5, 3, 4)

    shape_fn = pytensor.function([inp1, inp2], [outs[0].shape, outs[1].shape[:-1]])
    assert not any(
        isinstance(getattr(n.op, "core_op", n.op), TestOp)
        for n in shape_fn.maker.fgraph.apply_nodes
    )
    assert tuple(shape_fn(inp1_test, inp2_test)[0]) == (7, 5, 3, 4)
    assert tuple(shape_fn(inp1_test, inp2_test)[1]) == (7, 5, 4)
class BlockwiseOpTester:
    """Base class to test Blockwise works for specific Ops"""

    # Subclasses must set the core Op instance and its gufunc-style signature.
    core_op = None
    signature = None
    # Indices of inputs that may receive batch dims; defaults to all inputs.
    batcheable_axes = None

    @classmethod
    def setup_class(cls):
        # Deterministic per-Op seed derived from the core Op's repr.
        seed = sum(map(ord, str(cls.core_op)))
        cls.rng = np.random.default_rng(seed)
        cls.params_sig, cls.outputs_sig = _parse_gufunc_signature(cls.signature)
        if cls.batcheable_axes is None:
            cls.batcheable_axes = list(range(len(cls.params_sig)))
        # All combinations of these batch shapes across the batcheable inputs,
        # covering no-batch, broadcasting, and full-batch cases.
        batch_shapes = [(), (1,), (5,), (1, 1), (1, 5), (3, 1), (3, 5)]
        cls.test_batch_shapes = list(
            product(batch_shapes, repeat=len(cls.batcheable_axes))
        )
        cls.block_op = Blockwise(core_op=cls.core_op, signature=cls.signature)

    @staticmethod
    def parse_shape(shape: Tuple[Union[str, int], ...]) -> Tuple[int, ...]:
        """
        Convert (5, "m", "n") -> (5, 7, 11)
        """
        mapping = {"m": 7, "n": 11, "k": 19}
        return tuple(mapping.get(p, p) for p in shape)

    def create_testvals(self, shape):
        # Random float values with symbolic dims resolved to concrete sizes.
        return self.rng.normal(size=self.parse_shape(shape)).astype(config.floatX)

    def create_batched_inputs(self, batch_idx: Optional[int] = None):
        """Yield (symbolic inputs, test values) per batch-shape combination.

        When `batch_idx` is given, combinations where any *other* input is
        batched are skipped.
        """
        for batch_shapes in self.test_batch_shapes:
            vec_inputs = []
            vec_inputs_testvals = []
            for idx, (batch_shape, param_sig) in enumerate(
                zip(batch_shapes, self.params_sig)
            ):
                if batch_idx is not None and idx != batch_idx:
                    # Skip combinations in which other inputs are batched
                    if batch_shape != ():
                        break
                vec_inputs.append(tensor(shape=batch_shape + (None,) * len(param_sig)))
                vec_inputs_testvals.append(
                    self.create_testvals(shape=batch_shape + param_sig)
                )
            else:  # no-break: every input passed the filter, yield the case
                yield vec_inputs, vec_inputs_testvals

    def test_perform(self):
        # Compare Blockwise output against np.vectorize of the compiled core op.
        base_inputs = [
            tensor(shape=(None,) * len(param_sig)) for param_sig in self.params_sig
        ]
        core_func = pytensor.function(base_inputs, self.core_op(*base_inputs))
        np_func = np.vectorize(core_func, signature=self.signature)
        for vec_inputs, vec_inputs_testvals in self.create_batched_inputs():
            pt_func = pytensor.function(vec_inputs, self.block_op(*vec_inputs))
            if len(self.outputs_sig) != 1:
                raise NotImplementedError("Did not implement test for multi-output Ops")
            np.testing.assert_allclose(
                pt_func(*vec_inputs_testvals),
                np_func(*vec_inputs_testvals),
            )

    def test_grad(self):
        base_inputs = [
            tensor(shape=(None,) * len(param_sig)) for param_sig in self.params_sig
        ]
        out = self.core_op(*base_inputs).sum()
        # Create separate numpy vectorized functions for each input
        np_funcs = []
        for i, inp in enumerate(base_inputs):
            core_grad_func = pytensor.function(base_inputs, grad(out, wrt=inp))
            # Gradient signature: all params in, the i-th param's shape out.
            params_sig = self.signature.split("->")[0]
            param_sig = f"({','.join(self.params_sig[i])})"
            grad_sig = f"{params_sig}->{param_sig}"
            np_func = np.vectorize(core_grad_func, signature=grad_sig)
            np_funcs.append(np_func)

        # We test gradient wrt to one batched input at a time
        for test_input_idx in range(len(base_inputs)):
            for vec_inputs, vec_inputs_testvals in self.create_batched_inputs(
                batch_idx=test_input_idx
            ):
                out = self.block_op(*vec_inputs).sum()
                pt_func = pytensor.function(
                    vec_inputs, grad(out, wrt=vec_inputs[test_input_idx])
                )
                pt_out = pt_func(*vec_inputs_testvals)
                np_out = np_funcs[test_input_idx](*vec_inputs_testvals)
                np.testing.assert_allclose(pt_out, np_out, atol=1e-6)
class MatrixOpBlockwiseTester(BlockwiseOpTester):
    """Tester whose inputs must be positive-definite matrices (e.g. for Cholesky)."""

    def create_testvals(self, shape):
        # Return a posdef matrix: X @ X.T is symmetric positive (semi)definite.
        X = super().create_testvals(shape)
        return np.einsum("...ij,...kj->...ik", X, X)
class TestCholesky(MatrixOpBlockwiseTester):
    """Blockwise tests for the Cholesky decomposition (posdef inputs)."""

    core_op = Cholesky(lower=True)
    signature = "(m, m) -> (m, m)"
class TestMatrixInverse(MatrixOpBlockwiseTester):
    """Blockwise tests for matrix inversion (posdef inputs)."""

    core_op = MatrixInverse()
    signature = "(m, m) -> (m, m)"
class TestSolve(BlockwiseOpTester):
    """Blockwise tests for a triangular linear solve with a vector RHS."""

    core_op = Solve(lower=True)
    signature = "(m, m),(m) -> (m)"
......@@ -17,10 +17,13 @@ from pytensor.link.basic import PerformLinker
from pytensor.link.c.basic import CLinker, OpWiseCLinker
from pytensor.tensor import as_tensor_variable
from pytensor.tensor.basic import second
from pytensor.tensor.blockwise import vectorize_node
from pytensor.tensor.elemwise import CAReduce, DimShuffle, Elemwise
from pytensor.tensor.math import all as at_all
from pytensor.tensor.math import any as at_any
from pytensor.tensor.math import Any, Sum
from pytensor.tensor.math import all as pt_all
from pytensor.tensor.math import any as pt_any
from pytensor.tensor.math import exp
from pytensor.tensor.math import sum as pt_sum
from pytensor.tensor.type import (
TensorType,
bmatrix,
......@@ -470,12 +473,12 @@ class TestCAReduce(unittest_tools.InferShapeTester):
axis2.append(a)
assert len(axis2) == len(tosum)
tosum = tuple(axis2)
if tensor_op == at_all:
if tensor_op == pt_all:
for axis in sorted(tosum, reverse=True):
zv = np.all(zv, axis)
if len(tosum) == 0:
zv = zv != 0
elif tensor_op == at_any:
elif tensor_op == pt_any:
for axis in sorted(tosum, reverse=True):
zv = np.any(zv, axis)
if len(tosum) == 0:
......@@ -553,8 +556,8 @@ class TestCAReduce(unittest_tools.InferShapeTester):
self.with_mode(Mode(linker="py"), aes.mul, dtype=dtype)
self.with_mode(Mode(linker="py"), aes.scalar_maximum, dtype=dtype)
self.with_mode(Mode(linker="py"), aes.scalar_minimum, dtype=dtype)
self.with_mode(Mode(linker="py"), aes.and_, dtype=dtype, tensor_op=at_all)
self.with_mode(Mode(linker="py"), aes.or_, dtype=dtype, tensor_op=at_any)
self.with_mode(Mode(linker="py"), aes.and_, dtype=dtype, tensor_op=pt_all)
self.with_mode(Mode(linker="py"), aes.or_, dtype=dtype, tensor_op=pt_any)
for dtype in ["int8", "uint8"]:
self.with_mode(Mode(linker="py"), aes.or_, dtype=dtype)
self.with_mode(Mode(linker="py"), aes.and_, dtype=dtype)
......@@ -575,14 +578,14 @@ class TestCAReduce(unittest_tools.InferShapeTester):
aes.or_,
dtype=dtype,
test_nan=True,
tensor_op=at_any,
tensor_op=pt_any,
)
self.with_mode(
Mode(linker="py"),
aes.and_,
dtype=dtype,
test_nan=True,
tensor_op=at_all,
tensor_op=pt_all,
)
@pytest.mark.skipif(
......@@ -606,8 +609,8 @@ class TestCAReduce(unittest_tools.InferShapeTester):
for dtype in ["bool", "floatX", "int8", "uint8"]:
self.with_mode(Mode(linker="c"), aes.scalar_minimum, dtype=dtype)
self.with_mode(Mode(linker="c"), aes.scalar_maximum, dtype=dtype)
self.with_mode(Mode(linker="c"), aes.and_, dtype=dtype, tensor_op=at_all)
self.with_mode(Mode(linker="c"), aes.or_, dtype=dtype, tensor_op=at_any)
self.with_mode(Mode(linker="c"), aes.and_, dtype=dtype, tensor_op=pt_all)
self.with_mode(Mode(linker="c"), aes.or_, dtype=dtype, tensor_op=pt_any)
for dtype in ["bool", "int8", "uint8"]:
self.with_mode(Mode(linker="c"), aes.or_, dtype=dtype)
self.with_mode(Mode(linker="c"), aes.and_, dtype=dtype)
......@@ -915,3 +918,50 @@ def test_not_implemented_elemwise_grad():
# Verify that trying to use the not implemented gradient fails.
with pytest.raises(pytensor.gradient.NullTypeGradError):
pytensor.gradient.grad(test_op(x, 2), x)
class TestVectorize:
    """Vectorization dispatch tests for Elemwise, DimShuffle, and CAReduce."""

    def test_elemwise(self):
        vec = tensor(shape=(None,))
        mat = tensor(shape=(None, None))

        # Elemwise already handles extra batch dims, so the op is reused as-is.
        node = exp(vec).owner
        vect_node = vectorize_node(node, mat)
        assert vect_node.op == exp
        assert vect_node.inputs[0] is mat

    def test_dimshuffle(self):
        vec = tensor(shape=(None,))
        mat = tensor(shape=(None, None))

        node = exp(vec).owner
        vect_node = vectorize_node(node, mat)
        assert vect_node.op == exp
        assert vect_node.inputs[0] is mat

        col_mat = tensor(shape=(None, 1))
        tcol_mat = tensor(shape=(None, None, 1))
        node = col_mat.dimshuffle(0).owner  # drop column
        vect_node = vectorize_node(node, tcol_mat)
        assert isinstance(vect_node.op, DimShuffle)
        # Batched DimShuffle keeps the leading batch axis and still drops the column.
        assert vect_node.op.new_order == (0, 1)
        assert vect_node.inputs[0] is tcol_mat
        assert vect_node.outputs[0].type.shape == (None, None)

    def test_CAReduce(self):
        mat = tensor(shape=(None, None))
        tns = tensor(shape=(None, None, None))

        # Reduction axes are shifted past the new leading batch dimension.
        node = pt_sum(mat).owner
        vect_node = vectorize_node(node, tns)
        assert isinstance(vect_node.op, Sum)
        assert vect_node.op.axis == (1, 2)
        assert vect_node.inputs[0] is tns

        bool_mat = tensor(dtype="bool", shape=(None, None))
        bool_tns = tensor(dtype="bool", shape=(None, None, None))
        node = pt_any(bool_mat, axis=-2).owner
        vect_node = vectorize_node(node, bool_tns)
        assert isinstance(vect_node.op, Any)
        assert vect_node.op.axis == (1,)
        assert vect_node.inputs[0] is bool_tns
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论