提交 28521389 authored 作者: Ricardo Vieira's avatar Ricardo Vieira 提交者: ricardoV94

Use asv for benchmarking

上级 0bd33bfc
name: Benchmarks

on:
  push:
    branches:
      - main
      - v3
  pull_request:
    branches:
      - main
      - v3

# One benchmark run per PR head (or per pushed commit); newer runs cancel older ones.
concurrency:
  group: benchmarks-${{ github.event_name == 'pull_request' && github.head_ref || github.sha }}
  cancel-in-progress: true

jobs:
  benchmarks:
    name: "Run benchmarks"
    if: github.event_name == 'push'
    runs-on: ubuntu-latest
    permissions:
      contents: write
      pages: write
      id-token: write
    steps:
      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 # zizmor: ignore[artipacked]
        with:
          fetch-depth: 0
        # persist-credentials is true (default) because this job pushes to asv-results branch
      - name: Set up Python
        uses: mamba-org/setup-micromamba@add3a49764cedee8ee24e82dfde87f5bc2914462 # v2.0.7
        with:
          environment-name: pytensor-bench
          micromamba-version: "1.5.10-0"
          init-shell: bash
          post-cleanup: "all"
          cache-environment: true
          create-args: >-
            -c conda-forge
            python=3.11
            mkl
            numpy
            scipy
            pip
            mkl-service
            cython
            numba>=0.57
            jax
            jaxlib
            asv
      - name: Install dependencies
        shell: micromamba-shell {0}
        run: |
          pip install -e ./
          python -c 'import pytensor; print(pytensor.config.__str__(print_doc=False))'
          python -c 'import pytensor; assert pytensor.config.blas__ldflags != "", "Blas flags are empty"'
      - name: Fetch previous results from asv-results branch
        shell: bash
        run: |
          git fetch origin asv-results:asv-results 2>/dev/null || true
          if git rev-parse --verify asv-results 2>/dev/null; then
            git worktree add /tmp/asv-results asv-results
            if [ -d /tmp/asv-results/results ]; then
              mkdir -p .asv/results
              cp -r /tmp/asv-results/results/* .asv/results/
            fi
            git worktree remove /tmp/asv-results --force
          fi
      - name: Configure ASV machine
        shell: micromamba-shell {0}
        run: asv machine --yes --machine github-actions
      - name: Run benchmarks
        shell: micromamba-shell {0}
        run: |
          export PYTENSOR_FLAGS=warn__ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise,gcc__cxxflags=-pipe
          asv run --python=same --set-commit-hash=$(git rev-parse HEAD) --show-stderr
      - name: Push results to asv-results branch
        shell: bash
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          # Create or update the asv-results branch
          if git rev-parse --verify asv-results 2>/dev/null; then
            git worktree add /tmp/asv-results asv-results
          else
            git worktree add --orphan -b asv-results /tmp/asv-results
            cd /tmp/asv-results
            git rm -rf . 2>/dev/null || true
            cd -
          fi
          mkdir -p /tmp/asv-results/results
          cp -r .asv/results/* /tmp/asv-results/results/
          cd /tmp/asv-results
          git add results/
          git commit -m "Update benchmark results for ${{ github.sha }}" || true
          git push origin asv-results
          cd -
          git worktree remove /tmp/asv-results --force
      - name: Generate HTML
        shell: micromamba-shell {0}
        run: asv publish
      - name: Upload Pages artifact
        uses: actions/upload-pages-artifact@56afc609e74202658d3ffba0e8f6dda462b719fa # v3.0.1
        with:
          path: .asv/html

  deploy-pages:
    name: "Deploy benchmark dashboard"
    if: github.event_name == 'push'
    needs: benchmarks
    runs-on: ubuntu-latest
    permissions:
      pages: write
      id-token: write
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    steps:
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@d6db90164ac5ed86f2b6aed7e0febac5b3c0c03e # v4.0.5

  benchmarks-pr:
    name: "Benchmark comparison"
    if: github.event_name == 'pull_request'
    runs-on: ubuntu-latest
    permissions:
      pull-requests: write
    steps:
      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          fetch-depth: 0
          persist-credentials: false
      - name: Set up Python
        uses: mamba-org/setup-micromamba@add3a49764cedee8ee24e82dfde87f5bc2914462 # v2.0.7
        with:
          environment-name: pytensor-bench
          micromamba-version: "1.5.10-0"
          init-shell: bash
          post-cleanup: "all"
          cache-environment: true
          create-args: >-
            -c conda-forge
            python=3.11
            mkl
            numpy
            scipy
            pip
            mkl-service
            cython
            numba>=0.57
            jax
            jaxlib
            asv
      - name: Install dependencies
        shell: micromamba-shell {0}
        run: |
          pip install -e ./
      - name: Configure ASV machine
        shell: micromamba-shell {0}
        run: asv machine --yes --machine github-actions
      - name: Run benchmark comparison
        id: bench
        shell: micromamba-shell {0}
        env:
          BASE_REF: ${{ github.base_ref }}
        run: |
          export PYTENSOR_FLAGS=warn__ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise,gcc__cxxflags=-pipe
          HEAD_SHA=$(git rev-parse HEAD)
          BASE_SHA=$(git merge-base "origin/$BASE_REF" HEAD)
          # Benchmark the PR head (already installed)
          asv run --python=same --set-commit-hash="$HEAD_SHA" --show-stderr
          # Checkout base, reinstall, and benchmark
          git checkout "$BASE_SHA"
          pip install -e ./
          asv run --python=same --set-commit-hash="$BASE_SHA" --show-stderr
          # Return to PR head so asv.conf.json is available for compare
          git checkout "$HEAD_SHA"
          # Compare results (only regressions)
          asv compare "$BASE_SHA" "$HEAD_SHA" --factor 1.2 --split --only-changed | tee bench_output.txt
          # Check if there are regressions (lines after "Benchmarks that have got worse:")
          if sed -n '/Benchmarks that have got worse:/,/^$/p' bench_output.txt | grep -qE '\S'; then
            echo "has_regressions=true" >> "$GITHUB_OUTPUT"
          fi
      - name: Post benchmark regressions as PR comment
        if: always() && steps.bench.outcome != 'cancelled' && steps.bench.outputs.has_regressions == 'true'
        uses: actions/github-script@f28e40c7f34bde8b3046d885e986cb6290c5673b # v7.1.0
        with:
          script: |
            const fs = require('fs');
            const marker = '<!-- asv-benchmark-results -->';
            let output = '';
            try {
              output = fs.readFileSync('bench_output.txt', 'utf8');
            } catch (e) {
              output = 'Benchmark comparison failed to produce results.';
            }
            const body = `${marker}\n## Benchmark regressions (main vs PR)\n\n\`\`\`\n${output}\n\`\`\`\n\nBenchmarks that regressed by more than 20% are shown.`;
            // Find existing comment to update
            const { data: comments } = await github.rest.issues.listComments({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
            });
            const existing = comments.find(c => c.body.includes(marker));
            if (existing) {
              await github.rest.issues.updateComment({
                comment_id: existing.id,
                owner: context.repo.owner,
                repo: context.repo.repo,
                body: body,
              });
            } else {
              await github.rest.issues.createComment({
                issue_number: context.issue.number,
                owner: context.repo.owner,
                repo: context.repo.repo,
                body: body,
              });
            }
...@@ -40,7 +40,7 @@ jobs: ...@@ -40,7 +40,7 @@ jobs:
- name: "Install dependencies" - name: "Install dependencies"
shell: micromamba-shell {0} shell: micromamba-shell {0}
run: | run: |
micromamba install --yes -q -c conda-forge python=3.13 mkl "numpy>=2.0" scipy pip mkl-service graphviz cython pytest coverage pytest-cov pytest-benchmark pytest-mock pytest-sphinx micromamba install --yes -q -c conda-forge python=3.13 mkl "numpy>=2.0" scipy pip mkl-service graphviz cython pytest coverage pytest-cov pytest-mock pytest-sphinx
micromamba install --yes -q -c conda-forge "numba>=0.57" micromamba install --yes -q -c conda-forge "numba>=0.57"
micromamba install --yes -q -c conda-forge jax jaxlib numpyro equinox micromamba install --yes -q -c conda-forge jax jaxlib numpyro equinox
micromamba install --yes -q -c conda-forge mypy types-setuptools scipy-stubs pandas pre-commit micromamba install --yes -q -c conda-forge mypy types-setuptools scipy-stubs pandas pre-commit
......
...@@ -163,9 +163,9 @@ jobs: ...@@ -163,9 +163,9 @@ jobs:
run: | run: |
if [[ $OS == "macos-15" ]]; then if [[ $OS == "macos-15" ]]; then
micromamba install --yes -q "python~=${PYTHON_VERSION}" numpy "scipy<1.17.0" "numba>=0.63" pip graphviz cython pytest coverage pytest-cov pytest-benchmark pytest-mock pytest-sphinx libblas=*=*accelerate; micromamba install --yes -q "python~=${PYTHON_VERSION}" numpy "scipy<1.17.0" "numba>=0.63" pip graphviz cython pytest coverage pytest-cov pytest-mock pytest-sphinx libblas=*=*accelerate;
else else
micromamba install --yes -q "python~=${PYTHON_VERSION}" numpy "scipy<1.17.0" "numba>=0.63" pip graphviz cython pytest coverage pytest-cov pytest-benchmark pytest-mock pytest-sphinx mkl mkl-service; micromamba install --yes -q "python~=${PYTHON_VERSION}" numpy "scipy<1.17.0" "numba>=0.63" pip graphviz cython pytest coverage pytest-cov pytest-mock pytest-sphinx mkl mkl-service;
fi fi
if [[ $INSTALL_JAX == "1" ]]; then micromamba install --yes -q -c conda-forge "python~=${PYTHON_VERSION}" && pip install "jax>=0.8,<0.9.1" jaxlib numpyro equinox tfp-nightly; fi if [[ $INSTALL_JAX == "1" ]]; then micromamba install --yes -q -c conda-forge "python~=${PYTHON_VERSION}" && pip install "jax>=0.8,<0.9.1" jaxlib numpyro equinox tfp-nightly; fi
if [[ $INSTALL_TORCH == "1" ]]; then micromamba install --yes -q -c conda-forge "python~=${PYTHON_VERSION}" pytorch pytorch-cuda=12.1 "mkl<=2024.0" -c pytorch -c nvidia; fi if [[ $INSTALL_TORCH == "1" ]]; then micromamba install --yes -q -c conda-forge "python~=${PYTHON_VERSION}" pytorch pytorch-cuda=12.1 "mkl<=2024.0" -c pytorch -c nvidia; fi
...@@ -194,7 +194,7 @@ jobs: ...@@ -194,7 +194,7 @@ jobs:
if [[ $DEFAULT_MODE == "FAST_COMPILE" ]]; then export PYTENSOR_FLAGS=$PYTENSOR_FLAGS,mode=FAST_COMPILE; fi if [[ $DEFAULT_MODE == "FAST_COMPILE" ]]; then export PYTENSOR_FLAGS=$PYTENSOR_FLAGS,mode=FAST_COMPILE; fi
if [[ $DEFAULT_MODE == "CVM" ]]; then export PYTENSOR_FLAGS=$PYTENSOR_FLAGS,linker=cvm; fi if [[ $DEFAULT_MODE == "CVM" ]]; then export PYTENSOR_FLAGS=$PYTENSOR_FLAGS,linker=cvm; fi
export PYTENSOR_FLAGS=$PYTENSOR_FLAGS,warn__ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise,gcc__cxxflags=-pipe export PYTENSOR_FLAGS=$PYTENSOR_FLAGS,warn__ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise,gcc__cxxflags=-pipe
python -m pytest -r A --verbose --runslow --durations=50 --cov=pytensor/ --cov-report=xml:coverage/coverage-${MATRIX_ID}.xml --no-cov-on-fail $PART --benchmark-skip python -m pytest -r A --verbose --runslow --durations=50 --cov=pytensor/ --cov-report=xml:coverage/coverage-${MATRIX_ID}.xml --no-cov-on-fail $PART
env: env:
MATRIX_ID: ${{ steps.matrix-id.outputs.id }} MATRIX_ID: ${{ steps.matrix-id.outputs.id }}
MKL_THREADING_LAYER: GNU MKL_THREADING_LAYER: GNU
...@@ -209,60 +209,6 @@ jobs: ...@@ -209,60 +209,6 @@ jobs:
name: coverage-${{ steps.matrix-id.outputs.id }} name: coverage-${{ steps.matrix-id.outputs.id }}
path: coverage/coverage-${{ steps.matrix-id.outputs.id }}.xml path: coverage/coverage-${{ steps.matrix-id.outputs.id }}.xml
benchmarks:
name: "Benchmarks"
needs:
- changes
- style
runs-on: ubuntu-latest
if: ${{ needs.changes.outputs.changes == 'true' && needs.style.result == 'success' }}
strategy:
fail-fast: false
steps:
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
fetch-depth: 0
persist-credentials: false
- name: Set up Python 3.11
uses: mamba-org/setup-micromamba@add3a49764cedee8ee24e82dfde87f5bc2914462 # v2.0.7
with:
environment-name: pytensor-test
micromamba-version: "1.5.10-0" # until https://github.com/mamba-org/setup-micromamba/issues/225 is resolved
init-shell: bash
post-cleanup: "all"
- name: Install dependencies
shell: micromamba-shell {0}
run: |
micromamba install --yes -q -c conda-forge "python~=${PYTHON_VERSION}" mkl numpy scipy pip mkl-service cython pytest "numba>=0.57" jax jaxlib pytest-benchmark
pip install -e ./
micromamba list && pip freeze
python -c 'import pytensor; print(pytensor.config.__str__(print_doc=False))'
python -c 'import pytensor; assert pytensor.config.blas__ldflags != "", "Blas flags are empty"'
env:
PYTHON_VERSION: 3.11
- name: Download previous benchmark data
uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
with:
path: ./cache
key: ${{ runner.os }}-benchmark
- name: Run benchmarks
shell: micromamba-shell {0}
run: |
export PYTENSOR_FLAGS=mode=FAST_COMPILE,warn__ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise,gcc__cxxflags=-pipe
python -m pytest --runslow --benchmark-only --benchmark-json output.json
- name: Store benchmark result
uses: benchmark-action/github-action-benchmark@4bdcce38c94cec68da58d012ac24b7b1155efe8b # v1.20.7
with:
name: Python Benchmark with pytest-benchmark
tool: "pytest"
output-file-path: output.json
external-data-json-path: ./cache/benchmark-data.json
alert-threshold: "200%"
github-token: ${{ secrets.GITHUB_TOKEN }}
comment-on-alert: false
fail-on-alert: false
auto-push: false
all-checks: all-checks:
if: ${{ always() }} if: ${{ always() }}
runs-on: ubuntu-latest runs-on: ubuntu-latest
......
...@@ -49,9 +49,11 @@ core ...@@ -49,9 +49,11 @@ core
.mypy_cache/ .mypy_cache/
/htmlcov/ /htmlcov/
.venv/
pytensor-venv/ pytensor-venv/
/notebooks/Sandbox* /notebooks/Sandbox*
.vscode/ .vscode/
testing-report.html testing-report.html
coverage.xml coverage.xml
.coverage.* .coverage.*
.asv/
{
    "version": 1,
    "project": "pytensor",
    "project_url": "https://github.com/pymc-devs/pytensor",
    "repo": ".",
    "branches": ["HEAD"],
    "dvcs": "git",
    "environment_type": "existing",
    "benchmark_dir": "benchmarks",
    "env_dir": ".asv/env",
    "results_dir": ".asv/results",
    "html_dir": ".asv/html"
}
import numpy as np
import pytensor
from pytensor import grad
from pytensor.tensor.math import log
from pytensor.tensor.nlinalg import diagonal
from pytensor.tensor.signal.conv import convolve1d
from pytensor.tensor.slinalg import cholesky, solve_triangular
from pytensor.tensor.type import dmatrix, tensor
class BatchedMVNormalLogpAndDlogp:
    """Benchmark batched multivariate normal log-probability and its gradient."""

    params = [
        [(), (1000,), (4, 1000)],
        [(), (1000,), (4, 1000)],
    ]
    param_names = ["mu_batch_shape", "cov_batch_shape"]

    def setup(self, mu_batch_shape, cov_batch_shape):
        rng = np.random.default_rng(sum(map(ord, "batched_mvnormal")))
        # The value batches along whichever of mu/cov has the larger batch rank.
        value_batch_shape = (
            cov_batch_shape
            if len(cov_batch_shape) > len(mu_batch_shape)
            else mu_batch_shape
        )
        value = tensor("value", shape=(*value_batch_shape, 10))
        mu = tensor("mu", shape=(*mu_batch_shape, 10))
        cov = tensor("cov", shape=(*cov_batch_shape, 10, 10))
        self.test_values = [
            rng.normal(size=value.type.shape),
            rng.normal(size=mu.type.shape),
            np.eye(cov.type.shape[-1]) * np.abs(rng.normal(size=cov.type.shape)),
        ]
        # Log-density built from the Cholesky factor of the covariance.
        chol_cov = cholesky(cov, lower=True, on_error="raise")
        delta_trans = solve_triangular(chol_cov, value - mu, b_ndim=1)
        quaddist = (delta_trans**2).sum(axis=-1)
        diag = diagonal(chol_cov, axis1=-2, axis2=-1)
        logdet = log(diag).sum(axis=-1)
        k = value.shape[-1]
        norm = -0.5 * k * (np.log(2 * np.pi))
        logp = norm - 0.5 * quaddist - logdet
        dlogp = grad(logp.sum(), wrt=[value, mu, cov])
        self.fn = pytensor.function([value, mu, cov], [logp, *dlogp])

    def time_batched_mvnormal_logp_and_dlogp(self, mu_batch_shape, cov_batch_shape):
        self.fn(*self.test_values)
class SmallBlockwisePerformance:
    """Benchmark small blockwise convolution."""

    def setup(self):
        signal = dmatrix(shape=(7, 128))
        kernel = dmatrix(shape=(7, 20))
        self.fn = pytensor.function(
            [signal, kernel],
            convolve1d(signal, kernel, mode="valid"),
            trust_input=True,
        )
        rng = np.random.default_rng(495)
        self.a_test = rng.normal(size=signal.type.shape)
        self.b_test = rng.normal(size=kernel.type.shape)

    def time_small_blockwise(self):
        self.fn(self.a_test, self.b_test)
import numpy as np
from pytensor import config, function
from pytensor.compile.io import In
from pytensor.tensor.random.basic import normal
from pytensor.tensor.random.type import random_generator_type
from .common import create_radon_model
class MinimalRandomFunctionCall:
    """Benchmark calling a minimal random function."""

    params = [True, False]
    param_names = ["trust_input"]

    def setup(self, trust_input):
        rng_var = random_generator_type()
        draws = normal(rng=rng_var, size=(100,))
        # The RNG input is mutable so the function can update it in place.
        self.f = function([In(rng_var, mutable=True)], draws)
        self.f.trust_input = trust_input
        self.rng_val = np.random.default_rng()

    def time_call(self, trust_input):
        self.f(self.rng_val)
class RadonModelCompileRepeatedly:
    """Benchmark repeated compilation and single call of the radon model."""

    params = ["C", "CVM"]
    param_names = ["mode"]
    number = 1
    repeat = 5

    def setup(self, mode):
        self.joined_inputs, [self.model_logp, self.model_dlogp] = create_radon_model()
        gen = np.random.default_rng(1)
        self.x = gen.normal(size=self.joined_inputs.type.shape).astype(config.floatX)

    def time_compile_and_call(self, mode):
        # Compilation dominates here; a single call is included for realism.
        fn = function(
            [self.joined_inputs],
            [self.model_logp, self.model_dlogp],
            mode=mode,
            trust_input=True,
        )
        fn(self.x)
class RadonModelCompileVariants:
    """Benchmark compiling 8 variants of the radon model."""

    params = ["C", "CVM"]
    param_names = ["mode"]
    number = 1
    repeat = 5

    def setup(self, mode):
        # Build the base model and compile once to populate caches
        self.joined_inputs, [self.model_logp, self.model_dlogp] = create_radon_model()
        gen = np.random.default_rng(1)
        self.x = gen.normal(size=self.joined_inputs.type.shape).astype(config.floatX)
        warmup_fn = function(
            [self.joined_inputs],
            [self.model_logp, self.model_dlogp],
            mode=mode,
            trust_input=True,
        )
        warmup_fn(self.x)
        # Build the 8 variants (2 centerings x 2 intercept dists x 2 sigma dists)
        self.radon_model_variants = [
            create_radon_model(
                intercept_dist=intercept_dist,
                sigma_dist=sigma_dist,
                centered=centered,
            )
            for centered in (True, False)
            for intercept_dist in ("normal", "lognormal")
            for sigma_dist in ("halfnormal", "lognormal")
        ]

    def time_compile_variants(self, mode):
        for joined_inputs, [model_logp, model_dlogp] in self.radon_model_variants:
            fn = function(
                [joined_inputs],
                [model_logp, model_dlogp],
                mode=mode,
                trust_input=True,
            )
            fn(self.x)
class RadonModelCall:
    """Benchmark calling a pre-compiled radon model function."""

    params = ["C", "CVM", "CVM_NOGC"]
    param_names = ["mode"]

    def setup(self, mode):
        joined_inputs, [model_logp, model_dlogp] = create_radon_model()
        # CVM_NOGC is plain CVM with the VM's garbage collection disabled.
        real_mode = "CVM" if mode == "CVM_NOGC" else mode
        self.fn = function(
            [joined_inputs],
            [model_logp, model_dlogp],
            mode=real_mode,
            trust_input=True,
        )
        if mode == "CVM_NOGC":
            self.fn.vm.allow_gc = False
        gen = np.random.default_rng(1)
        self.x = gen.normal(size=joined_inputs.type.shape).astype(config.floatX)
        self.fn(self.x)  # warmup

    def time_call(self, mode):
        self.fn(self.x)
import itertools
import numpy as np
import pytensor
import pytensor.tensor as pt
from pytensor import In, Out, config
from pytensor.gradient import grad
from pytensor.graph.fg import FunctionGraph
from pytensor.tensor.math import add, log
from pytensor.tensor.math import sum as pt_sum
from pytensor.tensor.rewriting.elemwise import FusionOptimizer
from pytensor.tensor.type import dscalar, dvector, tensor3
class DimShuffle:
    """Benchmark DimShuffle operations with various transpositions and expansions."""

    params = [True, False]
    param_names = ["c_contiguous"]

    def setup(self, c_contiguous):
        x = tensor3("x")
        # Non-contiguous case uses a transposed (hence strided) larger array.
        self.x_val = (
            np.random.random((2, 3, 4)).astype(config.floatX)
            if c_contiguous
            else np.random.random((200, 300, 400)).transpose(1, 2, 0)
        )
        ys = [x.transpose(t) for t in itertools.permutations((0, 1, 2))]
        ys += [x[None], x[:, None], x[:, :, None], x[:, :, :, None]]
        self.fn = pytensor.function(
            [In(x, borrow=True)],
            [Out(y, borrow=True) for y in ys],
            mode="FAST_RUN",
        )
        self.fn.trust_input = True
        self.fn(self.x_val)  # warmup / JIT compile

    def time_dimshuffle(self, c_contiguous):
        self.fn(self.x_val)
class CAReduce:
    """Benchmark CAReduce (sum) over various axes and memory layouts."""

    params = [
        [0, 1, 2, (0, 1), (0, 2), (1, 2), None],
        [True, False],
    ]
    param_names = ["axis", "c_contiguous"]

    def setup(self, axis, c_contiguous):
        side = 256
        x_test = np.random.uniform(size=(side, side, side))
        transpose_axis = (0, 1, 2) if c_contiguous else (2, 0, 1)
        x = pytensor.shared(x_test, name="x", shape=x_test.shape)
        self.fn = pytensor.function(
            [], x.transpose(transpose_axis).sum(axis=axis), mode="FAST_RUN"
        )

    def time_careduce(self, axis, c_contiguous):
        self.fn()
class ElemwiseEval:
    """Benchmark evaluation of a fused elemwise logp + gradient computation."""

    def setup(self):
        gen = np.random.default_rng(123)
        n = 100_000
        x = pytensor.shared(gen.normal(size=n), name="x")
        mu = pytensor.shared(gen.normal(size=n), name="mu")
        # Gaussian kernel (up to a constant) and its gradient wrt x.
        logp = -((x - mu) ** 2) / 2
        grad_logp = grad(logp.sum(), x)
        self.func = pytensor.function([], [logp, grad_logp], mode="FAST_RUN")

    def time_eval(self):
        self.func()
class FusionRewrite:
    """Benchmark the FusionOptimizer rewrite pass on different graph shapes."""

    params = [
        ["deep_small_kernels", "large_fuseable_graph"],
        [20, 25],
    ]
    param_names = ["graph_fn", "n"]
    number = 5
    repeat = 7

    @staticmethod
    def large_fuseable_graph(n):
        # Wide graph: n independent fuseable log-density terms summed together.
        factors = []
        sd = dscalar()
        means = dvector()
        cst_05 = pt.constant(0.5)
        cst_m05 = pt.constant(-0.5)
        cst_2 = pt.constant(2)
        cst_m2 = pt.constant(-2)
        ones = pt.constant(np.ones(10))
        for i in range(n):
            factor = cst_m05 * sd**cst_m2 * (ones - means[i]) ** cst_2 + cst_05 * log(
                cst_05 * (sd**cst_m2) / np.pi
            )
            factors.append(pt_sum(factor))
        logp = add(*factors)
        inputs = [sd, means]
        dlogp = [pytensor.grad(logp, inp) for inp in inputs]
        return inputs, dlogp

    @staticmethod
    def deep_small_kernels(n):
        # Deep graph: n chained small elemwise kernels.
        x = pt.matrix("x")
        out = x
        for _ in range(n):
            out = pt.sin(out.T) + pt.cos(out)
        return [x], [out]

    def setup(self, graph_fn, n):
        # asv crosses the parameter lists; only one n is valid per graph builder.
        valid = {
            "deep_small_kernels": 20,
            "large_fuseable_graph": 25,
        }
        if valid.get(graph_fn) != n:
            raise NotImplementedError("Skip non-matching parameter combination")
        inps, outs = getattr(self, graph_fn)(n)
        self.fg = FunctionGraph(inps, outs)
        self.opt = FusionOptimizer()

    def time_rewrite(self, graph_fn, n):
        # Rewrite a fresh clone each repetition so the pass always has work to do.
        self.opt.apply(self.fg.clone())
import numpy as np
import pytensor
from pytensor import function
from pytensor.gradient import jacobian
from pytensor.tensor.math import outer, sqrt
from pytensor.tensor.type import vector
class Jacobian:
    """Benchmark full Jacobian computation."""

    params = [True, False]
    param_names = ["vectorize"]

    def setup(self, vectorize):
        x = vector("x", shape=(3,))
        jac = jacobian(outer(x, x), x, vectorize=vectorize)
        self.fn = function([x], jac, trust_input=True)
        self.x_val = np.array([0, 1, 2], dtype=x.type.dtype)
        self.fn(self.x_val)  # warmup

    def time_jacobian(self, vectorize):
        self.fn(self.x_val)
class PartialJacobian:
    """Benchmark partial Jacobian computation on a large graph."""

    params = [True, False]
    param_names = ["vectorize"]

    def setup(self, vectorize):
        N = 1000
        gen = np.random.default_rng(2025)
        self.x_test = gen.random((N,))
        f_mat = gen.random((N, N))
        x = vector("x", dtype="float64")

        def f(v):
            return sqrt(f_mat @ v / N)

        # Only a 5x5 corner of the full Jacobian is requested.
        partial_jacobian = jacobian(f(x), x, vectorize=vectorize)[:5, :5]
        self.fn = pytensor.function([x], partial_jacobian, trust_input=True)
        self.fn(self.x_test)  # warmup

    def time_partial_jacobian(self, vectorize):
        self.fn(self.x_test)
from pytensor.graph.basic import Apply, Variable
from pytensor.graph.op import Op
from pytensor.graph.traversal import (
apply_ancestors,
toposort,
toposort_with_orderings,
variable_ancestors,
)
from pytensor.graph.type import Type
class MyType(Type):
    """Minimal Type carrying a single payload, used to build benchmark graphs."""

    def __init__(self, thingy):
        self.thingy = thingy

    def filter(self, *args, **kwargs):
        # Values are never materialized in these benchmarks.
        raise NotImplementedError

    def __eq__(self, other):
        return type(self) is type(other) and self.thingy == other.thingy

    def __hash__(self):
        return hash(self.thingy)
def MyVariable(thingy):
    """Create a fresh ownerless Variable of ``MyType(thingy)``."""
    return Variable(MyType(thingy), owner=None, name=f"v{thingy}")
class _MyOp(Op):
    """Dummy Op whose output type sums its inputs' payloads; never executed."""

    __props__ = ()

    def make_node(self, *inputs):
        out_type = MyType(sum(i.type.thingy for i in inputs))
        return Apply(self, list(inputs), [Variable(out_type, owner=None)])

    def perform(self, *args, **kwargs):
        raise NotImplementedError()


# Shared instance used to build the benchmark graphs below.
_my_op = _MyOp()
class Traversal:
    """Benchmark graph traversal operations on a deep graph."""

    params = [
        "variable_ancestors",
        "variable_ancestors_with_blockers",
        "apply_ancestors",
        "apply_ancestors_with_blockers",
        "toposort",
        "toposort_with_blockers",
        "toposort_with_orderings",
        "toposort_with_orderings_and_blockers",
    ]
    param_names = ["func_name"]

    def setup(self, func_name):
        # Build a 50-level-deep graph where each node consumes the previous twice.
        out = MyVariable(1)
        for _ in range(50):
            out = _my_op(out, out)
        self.out = out
        blocker = out.clone()
        funcs = {
            "variable_ancestors": lambda: all(variable_ancestors([self.out])),
            "variable_ancestors_with_blockers": lambda: all(
                variable_ancestors([self.out], blockers=[blocker])
            ),
            "apply_ancestors": lambda: all(apply_ancestors([self.out])),
            "apply_ancestors_with_blockers": lambda: all(
                apply_ancestors([self.out], blockers=[blocker])
            ),
            "toposort": lambda: all(toposort([self.out])),
            "toposort_with_blockers": lambda: all(
                toposort([self.out], blockers=[blocker])
            ),
            "toposort_with_orderings": lambda: all(
                toposort_with_orderings([self.out], orderings={self.out.owner: []})
            ),
            "toposort_with_orderings_and_blockers": lambda: all(
                toposort_with_orderings(
                    [self.out],
                    blockers=[blocker],
                    orderings={self.out.owner: []},
                )
            ),
        }
        self.func = funcs[func_name]

    def time_traversal(self, func_name):
        self.func()
import numpy as np
import pytensor.tensor as pt
from pytensor import function, grad
from pytensor.scan.basic import scan
class JaxLogsumexp:
    """Benchmark JAX logsumexp-like computation."""

    params = [[(10, 10), (1000, 1000)], [0, 1]]
    param_names = ["size", "axis"]

    def setup(self, size, axis):
        try:
            import jax  # noqa: F401
        except ImportError:
            raise NotImplementedError("JAX not available")
        X = pt.matrix("X")
        # Stable logsumexp: subtract the max (zeroed when infinite) before exp.
        X_max = pt.max(X, axis=axis, keepdims=True)
        X_max = pt.switch(pt.isinf(X_max), 0, X_max)
        X_lse = pt.log(pt.sum(pt.exp(X - X_max), axis=axis, keepdims=True)) + X_max
        gen = np.random.default_rng(23920)
        self.X_val = gen.normal(size=size)
        self.fn = function([X], X_lse, mode="JAX")
        self.fn(self.X_val)  # JIT warmup

    def time_logsumexp(self, size, axis):
        self.fn(self.X_val)
class JaxScan:
    """Benchmark JAX scan with forward and backward passes."""

    params = [["forward", "backward", "both"]]
    param_names = ["mode"]

    def setup(self, mode):
        try:
            import jax  # noqa: F401
        except ImportError:
            raise NotImplementedError("JAX not available")
        x0 = pt.vector("x0", shape=(10,), dtype="float64")
        W = pt.matrix("W", shape=(10, 10), dtype="float64")

        def step(x_prev, W):
            return pt.tanh(pt.dot(x_prev, W))

        result = scan(
            fn=step,
            outputs_info=[x0],
            non_sequences=[W],
            n_steps=50,
            return_updates=False,
        )
        loss = result[-1].sum()
        dloss = grad(loss, wrt=[x0, W])
        # Select compiled outputs by mode; anything else gets the combined graph.
        outputs_by_mode = {"forward": result, "backward": dloss}
        self.fn = function(
            [x0, W], outputs_by_mode.get(mode, [loss, *dloss]), mode="JAX"
        )
        gen = np.random.default_rng(42)
        self.x0_val = gen.normal(size=(10,))
        self.W_val = gen.normal(size=(10, 10)) * 0.1
        self.fn(self.x0_val, self.W_val)  # JIT warmup

    def time_scan(self, mode):
        self.fn(self.x0_val, self.W_val)
import numpy as np
import pytensor
import pytensor.tensor as pt
from pytensor import config, function
from pytensor.compile.io import In
from pytensor.tensor.slinalg import cholesky
from pytensor.tensor.type import matrix
def _check_blas_c():
    """Raise ``NotImplementedError`` (asv's skip signal) if C BLAS is unavailable."""
    try:
        from pytensor.tensor.blas_c import CGemv  # noqa: F401
    except ImportError:
        raise NotImplementedError("C BLAS not available")
class GemvVectorDot:
    """Benchmark CGemv used as a vector dot product."""

    def setup(self):
        _check_blas_c()
        from pytensor.tensor.blas_c import CGemv

        n = 400_000
        lhs = pt.vector("A", shape=(n,))
        rhs = pt.vector("x", shape=(n,))
        # Dot product expressed as an inplace gemv on a length-1 output buffer.
        out = CGemv(inplace=True)(pt.empty((1,)), 1.0, lhs[None], rhs, 0.0)
        self.fn = pytensor.function(
            [lhs, rhs], out, accept_inplace=True, trust_input=True
        )
        gen = np.random.default_rng(430)
        self.test_a = gen.normal(size=n)
        self.test_b = gen.normal(size=n)

    def time_gemv_vector_dot(self):
        self.fn(self.test_a, self.test_b)
class GemvNegativeStrides:
    """Benchmark CGemv with negative strides and Fortran layout."""

    params = [[True, False], [True, False], [True, False]]
    param_names = ["neg_stride0", "neg_stride1", "F_layout"]

    def setup(self, neg_stride0, neg_stride1, F_layout):
        _check_blas_c()
        from pytensor.tensor.blas_c import CGemv

        A = pt.matrix("A", shape=(512, 512))
        x = pt.vector("x", shape=(512,))
        y = pt.vector("y", shape=(512,))
        out = CGemv(inplace=False)(y, 1.0, A, x, 1.0)
        self.fn = pytensor.function([A, x, y], out, trust_input=True)
        gen = np.random.default_rng(430)
        test_A = gen.normal(size=(512, 512))
        self.test_x = gen.normal(size=(512,))
        self.test_y = gen.normal(size=(512,))
        # Apply the requested layout first, then flip strides via slicing.
        if F_layout:
            test_A = np.asfortranarray(test_A)
        if neg_stride0:
            test_A = test_A[::-1]
        if neg_stride1:
            test_A = test_A[:, ::-1]
        self.test_A = test_A

    def time_gemv_negative_strides(self, neg_stride0, neg_stride1, F_layout):
        self.fn(self.test_A, self.test_x, self.test_y)
class Ger:
    """Benchmark general rank-1 update (ger)."""

    params = [[2**7, 2**9, 2**13], [True, False]]
    param_names = ["n", "inplace"]

    def setup(self, n, inplace):
        alpha = pt.dscalar("alpha")
        x = pt.dvector("x")
        y = pt.dvector("y")
        A = pt.dmatrix("A")
        # alpha * x yT + A — the classic ger update.
        out = alpha * pt.outer(x, y) + A
        self.fn = pytensor.function(
            [alpha, x, y, In(A, mutable=inplace)], out, trust_input=True
        )
        gen = np.random.default_rng([2274, n])
        self.alpha_test = gen.normal(size=())
        self.x_test = gen.normal(size=(n,))
        self.y_test = gen.normal(size=(n,))
        self.A_test = gen.normal(size=(n, n))

    def time_ger(self, n, inplace):
        self.fn(self.alpha_test, self.x_test, self.y_test, self.A_test)
class Cholesky:
    """Benchmark Cholesky decomposition."""

    def setup(self):
        gen = np.random.default_rng(1234)
        r = gen.standard_normal((10, 10)).astype(config.floatX)
        # r @ r.T is symmetric positive (semi-)definite by construction.
        self.pd = np.dot(r, r.T)
        x = matrix()
        self.fn = function([x], cholesky(x))

    def time_cholesky(self):
        self.fn(self.pd)
import numpy as np
import pytensor
import pytensor.tensor as pt
from pytensor.gradient import grad
from pytensor.tensor.math import gammaincc
from pytensor.tensor.type import vector
class GammainccdkGrad:
    """Benchmark gradient of gammaincc with respect to k."""

    def setup(self):
        k = vector("k")
        x = vector("x")
        out = gammaincc(k, x)
        self.grad_fn = pytensor.function(
            [k, x], grad(out.sum(), wrt=[k]), mode="FAST_RUN", trust_input=True
        )
        self.vals = [
            np.full((1000,), 3.2, dtype=k.dtype),
            np.full((1000,), 0.01, dtype=x.dtype),
        ]
        self.grad_fn(*self.vals)  # warmup

    def time_gammaincc_grad(self):
        self.grad_fn(*self.vals)
class Hyp2F1Grad:
    """Benchmark gradient of hyp2f1 with few and many iterations."""

    params = [["few_iters", "many_iters"], ["a", "all"]]
    param_names = ["case", "wrt"]

    # Evaluation points as (a1, a2, b1, z).
    _cases = {
        "few_iters": (10.0, -2.0, 7.0, 0.7),
        "many_iters": (3.5, 1.1, 2.0, 0.3),
    }

    def setup(self, case, wrt):
        a1, a2, b1, z = pt.scalars("a1", "a2", "b1", "z")
        hyp2f1_out = pt.hyp2f1(a1, a2, b1, z)
        wrt_vars = a1 if wrt == "a" else [a1, a2, b1, z]
        hyp2f1_grad = pt.grad(hyp2f1_out, wrt=wrt_vars)
        self.f_grad = pytensor.function([a1, a2, b1, z], hyp2f1_grad, trust_input=True)
        self.test_vals = [np.array(v) for v in self._cases[case]]
        self.f_grad(*self.test_vals)  # warmup

    def time_hyp2f1_grad(self, case, wrt):
        self.f_grad(*self.test_vals)
差异被折叠。
import numpy as np
import pytensor
import pytensor.tensor as pt
from pytensor.compile.mode import get_default_mode
from pytensor.tensor.type import tensor
class BlockDiagDot:
    """Benchmark block_diag @ vector with and without the rewrite optimization."""

    params = [[10, 100, 1000], [True, False]]
    param_names = ["size", "rewrite"]

    def setup(self, size, rewrite):
        gen = np.random.default_rng(sum(ord(c) for c in f"{size}_{rewrite}"))
        # Split `size` into three randomly sized diagonal blocks.
        a_size = int(gen.uniform(1, int(0.8 * size)))
        b_size = int(gen.uniform(1, int(0.8 * (size - a_size))))
        c_size = size - a_size - b_size
        a = tensor("a", shape=(a_size, a_size))
        b = tensor("b", shape=(b_size, b_size))
        c = tensor("c", shape=(c_size, c_size))
        d = tensor("d", shape=(size,))
        out = pt.linalg.block_diag(a, b, c) @ d
        mode = get_default_mode()
        if not rewrite:
            mode = mode.excluding("local_block_diag_dot_to_dot_block_diag")
        self.fn = pytensor.function([a, b, c, d], out, mode=mode)
        self.a_val = gen.normal(size=a.type.shape).astype(a.type.dtype)
        self.b_val = gen.normal(size=b.type.shape).astype(b.type.dtype)
        self.c_val = gen.normal(size=c.type.shape).astype(c.type.dtype)
        self.d_val = gen.normal(size=d.type.shape).astype(d.type.dtype)
        self.fn(self.a_val, self.b_val, self.c_val, self.d_val)  # warmup

    def time_block_diag_dot(self, size, rewrite):
        self.fn(self.a_val, self.b_val, self.c_val, self.d_val)
import numpy as np
import pytensor.tensor as pt
from pytensor import config, function, grad, shared
from pytensor.compile.mode import Mode
from pytensor.gradient import hessian
from pytensor.scan.basic import scan
from pytensor.tensor.math import dot
from pytensor.tensor.type import (
dmatrix,
dscalar,
dvector,
fvector,
iscalar,
matrix,
vector,
)
class CythonPerformance:
    """Time a simple scan executed through the CVM linker (cython scan_perform)."""

    def setup(self):
        n_rows = 200
        coefs = -1 / np.arange(1, 11).astype(config.floatX)
        data = np.arange(n_rows * 10).astype(config.floatX).reshape(n_rows, 10)
        s_r = pt.as_tensor_variable(data, dtype=config.floatX)
        s_y = scan(
            fn=lambda ri, rii, M: ri + M * rii,
            sequences=[s_r[1:]],
            non_sequences=[pt.as_tensor_variable(coefs, dtype=config.floatX)],
            outputs_info=s_r[0],
            mode=Mode(linker="cvm", optimizer="fast_run"),
            return_updates=False,
        )
        self.f_cvm = function([], s_y, mode="FAST_RUN")
        self.f_cvm.trust_input = True

    def time_cython_scan(self):
        self.f_cvm()
class Reordering:
    """Time an RNN scan whose step returns outputs in a shuffled order."""

    def setup(self):
        rng = np.random.default_rng(1234)
        floatX = config.floatX
        w_in2_val = rng.uniform(-0.5, 0.5, size=(2,)).astype(floatX)
        w_val = rng.uniform(-0.5, 0.5, size=(2, 2)).astype(floatX)
        w_out_val = rng.uniform(-0.5, 0.5, size=(2,)).astype(floatX)
        self.vW_in1 = rng.uniform(-0.5, 0.5, size=(2, 2)).astype(floatX)
        self.v_u1 = rng.uniform(-0.5, 0.5, size=(3, 2)).astype(floatX)
        self.v_u2 = rng.uniform(-0.5, 0.5, size=(3,)).astype(floatX)
        self.v_x0 = rng.uniform(-0.5, 0.5, size=(2,)).astype(floatX)
        self.v_y0 = rng.uniform(size=(3,)).astype(floatX)

        W_in2 = shared(w_in2_val, name="win2")
        W = shared(w_val, name="w")
        W_out = shared(w_out_val, name="wout")
        W_in1 = matrix("win")
        u1, u2 = matrix("u1"), vector("u2")
        x0, y0 = vector("x0"), vector("y0")

        def step(u1_t, u2_t, x_tm1, y_tm1, y_tm3, W_in1):
            # Two non-recurrent outputs precede the recurrent states,
            # which is what the "reordering" in the class name refers to.
            return [
                y_tm3 + 1,
                y_tm3 + 2,
                dot(u1_t, W_in1) + u2_t * W_in2 + dot(x_tm1, W),
                y_tm1 + dot(x_tm1, W_out),
            ]

        outputs = scan(
            step,
            [u1, u2],
            [None, None, x0, dict(initial=y0, taps=[-1, -3])],
            W_in1,
            n_steps=None,
            truncate_gradient=-1,
            go_backwards=False,
            return_updates=False,
        )
        self.f = function([u1, u2, x0, y0, W_in1], outputs, allow_input_downcast=True)

    def time_reordering(self):
        self.f(self.v_u1, self.v_u2, self.v_x0, self.v_y0, self.vW_in1)
class ScanAsTensorOnGradients:
    """Time the compilation of a gradient taken through a scan."""

    # Compilation is expensive: one call per sample, five samples.
    number = 1
    repeat = 5

    def setup(self):
        to_scan = dvector("to_scan")
        seq = dmatrix("seq")
        f1 = dscalar("f1")

        def step(prev, seq, f1):
            return prev + f1 * seq

        scanned = scan(
            fn=step,
            sequences=[seq],
            outputs_info=[to_scan],
            non_sequences=[f1],
            return_updates=False,
        )
        # The result of this compile is discarded — presumably it warms the
        # compilation cache before the timed compile below; confirm if changed.
        function(
            inputs=[to_scan, seq, f1],
            outputs=scanned,
            allow_input_downcast=True,
        )
        self.t_grad = grad(scanned.sum(), wrt=[to_scan, f1], consider_constant=[seq])
        self.inputs = [to_scan, seq, f1]

    def time_compile_grad(self):
        function(
            inputs=self.inputs,
            outputs=self.t_grad,
            allow_input_downcast=True,
        )
class HessianBugGradGradTwoScans:
    """Time a hessian computed through two nested scans."""

    def setup(self):
        W = fvector(name="W")
        n_steps = iscalar(name="Nb_steps")

        def float32_zero():
            # Initial accumulator value shared by both scan loops.
            return pt.as_tensor_variable(np.asarray(0, dtype=np.float32))

        def loss_outer(sum_outer, W):
            def loss_inner(sum_inner, W):
                return sum_inner + (W**2).sum()

            inner_result = scan(
                fn=loss_inner,
                outputs_info=float32_zero(),
                non_sequences=[W],
                n_steps=1,
                return_updates=False,
            )
            return sum_outer + inner_result[-1]

        outer_result = scan(
            fn=loss_outer,
            outputs_info=float32_zero(),
            non_sequences=[W],
            n_steps=n_steps,
            return_list=True,
            return_updates=False,
        )
        cost = outer_result[0][-1]
        self.f = function([W, n_steps], hessian(cost, W))

    def time_hessian_two_scans(self):
        self.f(np.ones((8,), dtype="float32"), 1)
class MultipleOutsTaps:
    """Time an RNN scan mixing sequence taps, state taps, and plain outputs."""

    def setup(self):
        seq_len = 5
        rng = np.random.default_rng(1234)
        floatX = config.floatX
        w_in2_val = rng.uniform(-2.0, 2.0, size=(2,)).astype(floatX)
        w_val = rng.uniform(-2.0, 2.0, size=(2, 2)).astype(floatX)
        w_out_val = rng.uniform(-2.0, 2.0, size=(2,)).astype(floatX)
        self.vW_in1 = rng.uniform(-2.0, 2.0, size=(2, 2)).astype(floatX)
        self.v_u1 = rng.uniform(-2.0, 2.0, size=(seq_len, 2)).astype(floatX)
        # u2 needs two extra rows for its [-1, 0, 1] taps.
        self.v_u2 = rng.uniform(-2.0, 2.0, size=(seq_len + 2, 2)).astype(floatX)
        self.v_x0 = rng.uniform(-2.0, 2.0, size=(2,)).astype(floatX)
        self.v_y0 = rng.uniform(size=(3,)).astype(floatX)

        W_in2 = shared(w_in2_val, name="win2")
        W = shared(w_val, name="w")
        W_out = shared(w_out_val, name="wout")
        W_in1 = matrix("win")
        u1, u2 = matrix("u1"), matrix("u2")
        x0, y0 = vector("x0"), vector("y0")

        def step(u1_t, u2_tm1, u2_t, u2_tp1, x_tm1, y_tm1, y_tm3, W_in1):
            return [
                dot(u1_t, W_in1) + (u2_t + u2_tm1 * u2_tp1) * W_in2 + dot(x_tm1, W),
                (y_tm1 + y_tm3) * dot(x_tm1, W_out),
                dot(u1_t, W_in1),
            ]

        outputs = scan(
            step,
            [u1, dict(input=u2, taps=[-1, 0, 1])],
            [x0, dict(initial=y0, taps=[-1, -3]), None],
            W_in1,
            n_steps=None,
            truncate_gradient=-1,
            go_backwards=False,
            return_updates=False,
        )
        self.f = function([u1, u2, x0, y0, W_in1], outputs, allow_input_downcast=True)

    def time_multiple_outs_taps(self):
        self.f(self.v_u1, self.v_u2, self.v_x0, self.v_y0, self.vW_in1)
class PregreedyOptimizer:
    """Time a chained-dot scan that exercises the pregreedy optimizer path."""

    def setup(self):
        W = pt.zeros((5, 4))
        bv = pt.zeros((5,))
        bh = pt.zeros((4,))
        v = matrix("v")

        # First scan only broadcasts the bias vectors over the sequence.
        bv_t, bh_t = scan(
            lambda _: [bv, bh],
            sequences=v,
            outputs_info=[None, None],
            return_updates=False,
        )
        # Second scan chains two matrix products per step.
        chain = scan(
            lambda x: dot(dot(x, W) + bh_t, W.T) + bv_t,
            outputs_info=v,
            n_steps=2,
            return_updates=False,
        )
        self.chain_fn = function([v], chain)
        self.v_data = np.zeros((3, 5), dtype=config.floatX)

    def time_pregreedy_optimizer(self):
        self.chain_fn(self.v_data)
class SavememOpt:
    """Time a scan that triggers the save_mem optimization."""

    def setup(self):
        y0 = shared(np.ones((2, 10)))
        _y1, y2 = scan(
            lambda y: [y, y],
            outputs_info=[dict(initial=y0, taps=[-2]), None],
            n_steps=5,
            return_updates=False,
        )
        self.fn = function([], y2.sum())

    def time_savemem_opt(self):
        self.fn()
import numpy as np
import pytensor
from pytensor import config
from pytensor.compile.io import In, Out
from pytensor.tensor.type import tensor3
class Reshape:
    """Time reshaping a tensor3 into three different output shapes in one call."""

    def setup(self):
        x = tensor3("x")
        # Use a seeded Generator for reproducible inputs, consistent with the
        # other benchmarks in this suite (the original used unseeded
        # np.random.random, making input data differ between runs).
        rng = np.random.default_rng(2304)
        self.x_val = rng.random((2, 3, 4)).astype(config.floatX)
        y1 = x.reshape((6, 4))
        y2 = x.reshape((2, 12))
        y3 = x.reshape((-1,))
        # borrow=True on inputs and outputs avoids copies, so only the
        # Reshape ops themselves are timed.
        self.reshape_fn = pytensor.function(
            [In(x, borrow=True)],
            [Out(y1, borrow=True), Out(y2, borrow=True), Out(y3, borrow=True)],
        )
        self.reshape_fn.trust_input = True
        # Warmup
        self.reshape_fn(self.x_val)

    def time_reshape(self):
        self.reshape_fn(self.x_val)
import numpy as np
import pytensor
from pytensor.gradient import grad
from pytensor.tensor.signal.conv import convolve1d
from pytensor.tensor.type import tensor
class Convolve1dGrad:
    """Time the gradient of convolve1d w.r.t. its smaller input."""

    params = ["full", "valid"]
    param_names = ["convolve_mode"]

    def setup(self, convolve_mode):
        larger = tensor("larger", shape=(8, None))
        smaller = tensor("smaller", shape=(8, None))
        cost = convolve1d(larger, smaller, mode=convolve_mode).sum()
        self.fn = pytensor.function(
            [larger, smaller], grad(cost, wrt=smaller), trust_input=True
        )
        # Seed entropy includes the mode so each parametrization gets its own data.
        rng = np.random.default_rng([119, convolve_mode == "full"])
        self.test_larger = rng.normal(size=(8, 1024)).astype(larger.type.dtype)
        self.test_smaller = rng.normal(size=(8, 16)).astype(smaller.type.dtype)
        # Warmup
        self.fn(self.test_larger, self.test_smaller)

    def time_convolve1d_grad(self, convolve_mode):
        self.fn(self.test_larger, self.test_smaller)
import numpy as np
import pytensor
import pytensor.tensor.basic as ptb
from pytensor import Out
from pytensor.tensor.basic import join
from pytensor.tensor.subtensor import inc_subtensor, set_subtensor
from pytensor.tensor.type import matrices, vector, vectors
class AdvancedSubtensor1:
    """Time integer-array indexing (AdvancedSubtensor1) on a vector."""

    params = [[True, False], [True, False]]
    param_names = ["static_shape", "gc"]

    def setup(self, static_shape, gc):
        x = vector("x", shape=(85 if static_shape else None,))
        # Each of the 85 positions is selected 11 times.
        idxs_values = np.arange(85).repeat(11)
        out = x[idxs_values]
        self.fn = pytensor.function(
            [x],
            Out(out, borrow=True),
            on_unused_input="ignore",
            trust_input=True,
        )
        self.fn.vm.allow_gc = gc
        self.x_values = np.random.normal(size=(85,)).astype(x.type.dtype)
        # Warmup
        self.fn(self.x_values)

    def time_advanced_subtensor1(self, static_shape, gc):
        self.fn(self.x_values)
class AdvancedIncSubtensor1:
    """Time inc_subtensor/set_subtensor with repeated integer indices."""

    params = [["inc_subtensor", "set_subtensor"], [True, False], [True, False]]
    param_names = ["func", "static_shape", "gc"]

    def setup(self, func, static_shape, gc):
        subtensor_func = {
            "inc_subtensor": inc_subtensor,
            "set_subtensor": set_subtensor,
        }[func]
        x = vector("x", shape=(85 if static_shape else None,))
        buffer = ptb.zeros_like(x)
        y_values = np.random.normal(size=(85 * 11,)).astype(x.type.dtype)
        idxs_values = np.arange(85).repeat(11)
        # Same update applied through forward and reversed index orders.
        out1 = subtensor_func(buffer[idxs_values], y_values)
        out2 = subtensor_func(buffer[idxs_values[::-1]], y_values)
        self.fn = pytensor.function(
            [x],
            [Out(out1, borrow=True), Out(out2, borrow=True)],
            on_unused_input="ignore",
            trust_input=True,
        )
        self.fn.vm.allow_gc = gc
        self.x_values = np.zeros((85,), dtype=x.type.dtype)
        # Warmup
        self.fn(self.x_values)

    def time_advanced_incsubtensor1(self, func, static_shape, gc):
        self.fn(self.x_values)
class JoinPerformance:
    """Time join (concatenation) across ndim, axis, memory layout, and gc."""

    params = [[1, 2], [0, 1], ["C", "F", "Mixed"], [True, False]]
    param_names = ["ndim", "axis", "memory_layout", "gc"]

    def setup(self, ndim, axis, memory_layout, gc):
        # The 1-D case only makes sense for axis 0 with the default layout.
        if ndim == 1 and not (memory_layout == "C" and axis == 0):
            raise NotImplementedError("Skip invalid combination")

        n = 64
        inputs = vectors("abcdef") if ndim == 1 else matrices("abcdef")
        out = join(axis, *inputs)
        self.fn = pytensor.function(inputs, Out(out, borrow=True), trust_input=True)
        self.fn.vm.allow_gc = gc

        test_values = [np.zeros((n, n)[:ndim], dtype=inputs[0].dtype) for _ in inputs]
        if memory_layout == "F":
            test_values = [np.asfortranarray(t) for t in test_values]
        elif memory_layout == "Mixed":
            # Alternate C-ordered and Fortran-ordered inputs.
            test_values = [
                np.asfortranarray(t) if i % 2 else t
                for i, t in enumerate(test_values)
            ]
        self.test_values = test_values
        # Warmup
        self.fn(*self.test_values)

    def time_join(self, ndim, axis, memory_layout, gc):
        self.fn(*self.test_values)
"""Common utilities for ASV benchmarks."""
import sys
from pathlib import Path
# ASV doesn't add the repo root to sys.path, so `tests` isn't importable.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "tests"))
from fixtures import create_radon_model
__all__ = ["create_radon_model"]
Benchmarking
============
PyTensor uses `airspeed velocity (ASV) <https://asv.readthedocs.io/>`_ for
performance benchmarking. Benchmarks are stored in the ``benchmarks/`` directory
and track performance across commits over time.
A `dashboard <https://pymc-devs.github.io/pytensor/>`_ is automatically updated
on each push to ``main``.
Quick start
-----------
Install ASV::
pip install asv virtualenv
Or with the benchmark extra::
pip install -e ".[benchmark]"
Running benchmarks
------------------
Run all benchmarks against your current working tree::
asv run --python=same --quick
The ``--python=same`` flag uses your current Python environment instead of
creating a new virtual environment. The ``--quick`` flag runs each benchmark
only once for a fast (but noisier) result.
For more accurate results, drop ``--quick``::
asv run --python=same
Run a specific benchmark module or class::
asv run --python=same --bench bench_compile
asv run --python=same --bench "bench_elemwise.CAReduce"
Run benchmarks matching a pattern::
asv run --python=same --bench ".*Numba.*"
Comparing branches
------------------
Compare the current branch against ``main``::
asv continuous --python=same main HEAD
This runs benchmarks on both commits and reports any regressions or
improvements. Use ``--factor`` to set the threshold for flagging changes::
asv continuous --python=same --factor 1.1 main HEAD
This flags benchmarks that changed by more than 10%.
Viewing results
---------------
Generate the HTML dashboard and open it in a browser::
asv publish
asv preview
This starts a local web server (typically at ``http://127.0.0.1:8080``) where
you can explore benchmark results interactively.
Profiling
---------
Profile a specific benchmark to identify bottlenecks::
asv profile bench_compile.RadonModelCall.time_call --python=same
This runs the benchmark under cProfile and displays the results.
Writing benchmarks
------------------
Benchmarks live in ``benchmarks/`` as Python files prefixed with ``bench_``.
Each file contains classes with:
- A ``setup()`` method for initialization (compilation, data generation).
This is **not** timed.
- Methods prefixed with ``time_`` that contain **only** the code to benchmark.
- ``params`` and ``param_names`` class attributes for parametrization.
Example::
import numpy as np
import pytensor
import pytensor.tensor as pt
class MyBenchmark:
params = [[10, 100, 1000]]
param_names = ["size"]
def setup(self, size):
x = pt.vector("x", shape=(size,))
self.fn = pytensor.function([x], pt.exp(x), trust_input=True)
self.x_val = np.random.normal(size=size)
self.fn(self.x_val) # warmup / JIT compile
def time_exp(self, size):
self.fn(self.x_val)
For benchmarks that require optional backends (Numba, JAX), raise
``NotImplementedError`` in ``setup()`` if the backend is not available::
def setup(self, ...):
try:
import numba # noqa: F401
except ImportError:
raise NotImplementedError("Numba not available")
See the `ASV documentation <https://asv.readthedocs.io/en/stable/writing_benchmarks.html>`_
for more details on writing benchmarks.
CI integration
--------------
Benchmarks run automatically in GitHub Actions:
- **On push to main**: Full benchmark suite runs and results are published to
the dashboard on GitHub Pages. Historical results are stored on the
``asv-results`` branch.
- **On pull requests**: ``asv continuous`` compares benchmarks between ``main``
and the PR head. Results are posted as a PR comment, flagging any benchmarks
that regressed by more than 10%.
...@@ -28,7 +28,6 @@ dependencies: ...@@ -28,7 +28,6 @@ dependencies:
- pytest - pytest
- pytest-cov - pytest-cov
- pytest-xdist - pytest-xdist
- pytest-benchmark
- pytest-mock - pytest-mock
- pytest-sphinx - pytest-sphinx
# For building docs # For building docs
......
...@@ -30,7 +30,6 @@ dependencies: ...@@ -30,7 +30,6 @@ dependencies:
- pytest - pytest
- pytest-cov - pytest-cov
- pytest-xdist - pytest-xdist
- pytest-benchmark
- pytest-mock - pytest-mock
- pytest-sphinx - pytest-sphinx
# For building docs # For building docs
......
...@@ -75,13 +75,13 @@ tests = [ ...@@ -75,13 +75,13 @@ tests = [
"pre-commit", "pre-commit",
"pytest-cov>=2.6.1", "pytest-cov>=2.6.1",
"coverage>=5.1", "coverage>=5.1",
"pytest-benchmark",
"pytest-mock", "pytest-mock",
"pytest-sphinx", "pytest-sphinx",
] ]
rtd = ["sphinx>=5.1.0,<6", "pygments", "pydot"] rtd = ["sphinx>=5.1.0,<6", "pygments", "pydot"]
jax = ["jax", "jaxlib"] jax = ["jax", "jaxlib"]
numba = ["numba>=0.57", "llvmlite"] numba = ["numba>=0.57", "llvmlite"]
benchmark = ["asv", "virtualenv"]
[tool.setuptools.packages.find] [tool.setuptools.packages.find]
include = ["pytensor*"] include = ["pytensor*"]
......
...@@ -20,8 +20,6 @@ from pytensor.link.vm import VMLinker ...@@ -20,8 +20,6 @@ from pytensor.link.vm import VMLinker
from pytensor.printing import debugprint from pytensor.printing import debugprint
from pytensor.tensor.math import dot, tanh from pytensor.tensor.math import dot, tanh
from pytensor.tensor.math import sum as pt_sum from pytensor.tensor.math import sum as pt_sum
from pytensor.tensor.random import normal
from pytensor.tensor.random.type import random_generator_type
from pytensor.tensor.type import ( from pytensor.tensor.type import (
dmatrix, dmatrix,
dscalar, dscalar,
...@@ -34,7 +32,6 @@ from pytensor.tensor.type import ( ...@@ -34,7 +32,6 @@ from pytensor.tensor.type import (
scalars, scalars,
vector, vector,
) )
from tests.fixtures import * # noqa: F403
pytestmark = pytest.mark.filterwarnings( pytestmark = pytest.mark.filterwarnings(
...@@ -1355,79 +1352,3 @@ def test_empty_givens_updates(): ...@@ -1355,79 +1352,3 @@ def test_empty_givens_updates():
y = x * 2 y = x * 2
function([In(x)], y, givens={}) function([In(x)], y, givens={})
function([In(x)], y, updates={}) function([In(x)], y, updates={})
@pytest.mark.parametrize("trust_input", [True, False])
def test_minimal_random_function_call_benchmark(trust_input, benchmark):
rng = random_generator_type()
x = normal(rng=rng, size=(100,))
f = function([In(rng, mutable=True)], x)
f.trust_input = trust_input
rng_val = np.random.default_rng()
benchmark(f, rng_val)
@pytest.mark.parametrize("mode", ["C", "CVM"])
def test_radon_model_compile_repeatedly_benchmark(mode, radon_model, benchmark):
joined_inputs, [model_logp, model_dlogp] = radon_model
rng = np.random.default_rng(1)
x = rng.normal(size=joined_inputs.type.shape).astype(config.floatX)
def compile_and_call_once():
fn = function(
[joined_inputs], [model_logp, model_dlogp], mode=mode, trust_input=True
)
fn(x)
benchmark.pedantic(compile_and_call_once, rounds=5, iterations=1)
@pytest.mark.parametrize("mode", ["C", "CVM"])
def test_radon_model_compile_variants_benchmark(
mode, radon_model, radon_model_variants, benchmark
):
"""Test compilation speed when a slightly variant of a function is compiled each time.
This test more realistically simulates a use case where a model is recompiled
multiple times with small changes, such as in an interactive environment.
NOTE: For this test to be meaningful on subsequent runs, the cache must be cleared
"""
joined_inputs, [model_logp, model_dlogp] = radon_model
rng = np.random.default_rng(1)
x = rng.normal(size=joined_inputs.type.shape).astype(config.floatX)
# Compile base function once to populate the cache
fn = function(
[joined_inputs], [model_logp, model_dlogp], mode=mode, trust_input=True
)
fn(x)
def compile_and_call_once():
for joined_inputs, [model_logp, model_dlogp] in radon_model_variants:
fn = function(
[joined_inputs], [model_logp, model_dlogp], mode=mode, trust_input=True
)
fn(x)
benchmark.pedantic(compile_and_call_once, rounds=1, iterations=1)
@pytest.mark.parametrize("mode", ["C", "CVM", "CVM_NOGC"])
def test_radon_model_call_benchmark(mode, radon_model, benchmark):
joined_inputs, [model_logp, model_dlogp] = radon_model
real_mode = "CVM" if mode == "CVM_NOGC" else mode
fn = function(
[joined_inputs], [model_logp, model_dlogp], mode=real_mode, trust_input=True
)
if mode == "CVM_NOGC":
fn.vm.allow_gc = False
rng = np.random.default_rng(1)
x = rng.normal(size=joined_inputs.type.shape).astype(config.floatX)
fn(x) # warmup
benchmark(fn, x)
import numpy as np import numpy as np
import pytest
import pytensor.tensor as pt import pytensor.tensor as pt
from pytensor.graph.replace import graph_replace from pytensor.graph.replace import graph_replace
...@@ -134,23 +133,3 @@ def create_radon_model( ...@@ -134,23 +133,3 @@ def create_radon_model(
model_logp, model_dlogp = graph_replace([model_logp, model_dlogp], replacement) model_logp, model_dlogp = graph_replace([model_logp, model_dlogp], replacement)
return joined_inputs, [model_logp, model_dlogp] return joined_inputs, [model_logp, model_dlogp]
@pytest.fixture(scope="session")
def radon_model():
return create_radon_model()
@pytest.fixture(scope="session")
def radon_model_variants():
# Convert to list comp
return [
create_radon_model(
intercept_dist=intercept_dist,
sigma_dist=sigma_dist,
centered=centered,
)
for centered in (True, False)
for intercept_dist in ("normal", "lognormal")
for sigma_dist in ("halfnormal", "lognormal")
]
...@@ -4,17 +4,13 @@ from pytensor import Variable, shared ...@@ -4,17 +4,13 @@ from pytensor import Variable, shared
from pytensor import tensor as pt from pytensor import tensor as pt
from pytensor.graph import Apply, ancestors, graph_inputs from pytensor.graph import Apply, ancestors, graph_inputs
from pytensor.graph.traversal import ( from pytensor.graph.traversal import (
apply_ancestors,
apply_depends_on, apply_depends_on,
explicit_graph_inputs, explicit_graph_inputs,
general_toposort, general_toposort,
get_var_by_name, get_var_by_name,
io_toposort, io_toposort,
orphans_between, orphans_between,
toposort,
toposort_with_orderings,
truncated_graph_inputs, truncated_graph_inputs,
variable_ancestors,
variable_depends_on, variable_depends_on,
vars_between, vars_between,
walk, walk,
...@@ -406,37 +402,3 @@ def test_get_var_by_name(): ...@@ -406,37 +402,3 @@ def test_get_var_by_name():
exp_res = igo.fgraph.outputs[0] exp_res = igo.fgraph.outputs[0]
assert res == exp_res assert res == exp_res
@pytest.mark.parametrize(
"func",
[
lambda x: all(variable_ancestors([x])),
lambda x: all(variable_ancestors([x], blockers=[x.clone()])),
lambda x: all(apply_ancestors([x])),
lambda x: all(apply_ancestors([x], blockers=[x.clone()])),
lambda x: all(toposort([x])),
lambda x: all(toposort([x], blockers=[x.clone()])),
lambda x: all(toposort_with_orderings([x], orderings={x: []})),
lambda x: all(
toposort_with_orderings([x], blockers=[x.clone()], orderings={x: []})
),
],
ids=[
"variable_ancestors",
"variable_ancestors_with_blockers",
"apply_ancestors",
"apply_ancestors_with_blockers)",
"toposort",
"toposort_with_blockers",
"toposort_with_orderings",
"toposort_with_orderings_and_blockers",
],
)
def test_traversal_benchmark(func, benchmark):
r1 = MyVariable(1)
out = r1
for i in range(50):
out = MyOp(out, out)
benchmark(func, out)
import numpy as np import numpy as np
import pytest import pytest
import scipy.special
import pytensor
import pytensor.tensor as pt import pytensor.tensor as pt
from pytensor.compile import get_mode from pytensor.compile import get_mode
from pytensor.configdefaults import config from pytensor.configdefaults import config
...@@ -98,28 +96,6 @@ def test_softmax_grad(axis): ...@@ -98,28 +96,6 @@ def test_softmax_grad(axis):
compare_jax_and_py([dy, sm], [out], [dy_test_value, sm_test_value]) compare_jax_and_py([dy, sm], [out], [dy_test_value, sm_test_value])
@pytest.mark.parametrize("size", [(10, 10), (1000, 1000)])
@pytest.mark.parametrize("axis", [0, 1])
def test_logsumexp_benchmark(size, axis, benchmark):
X = pt.matrix("X")
X_max = pt.max(X, axis=axis, keepdims=True)
X_max = pt.switch(pt.isinf(X_max), 0, X_max)
X_lse = pt.log(pt.sum(pt.exp(X - X_max), axis=axis, keepdims=True)) + X_max
rng = np.random.default_rng(23920)
X_val = rng.normal(size=size)
X_lse_fn = pytensor.function([X], X_lse, mode="JAX")
# JIT compile first
_ = X_lse_fn(X_val)
res = benchmark(X_lse_fn, X_val)
exp_res = scipy.special.logsumexp(X_val, axis=axis, keepdims=True)
np.testing.assert_array_almost_equal(res, exp_res)
def test_multiple_input_multiply(): def test_multiple_input_multiply():
x, y, z = vectors("xyz") x, y, z = vectors("xyz")
out = pt.mul(x, y, z) out = pt.mul(x, y, z)
......
...@@ -4,7 +4,7 @@ import numpy as np ...@@ -4,7 +4,7 @@ import numpy as np
import pytest import pytest
import pytensor.tensor as pt import pytensor.tensor as pt
from pytensor import function, ifelse, shared from pytensor import function, shared
from pytensor.compile import get_mode from pytensor.compile import get_mode
from pytensor.configdefaults import config from pytensor.configdefaults import config
from pytensor.graph import Apply, Op from pytensor.graph import Apply, Op
...@@ -12,8 +12,7 @@ from pytensor.scan import until ...@@ -12,8 +12,7 @@ from pytensor.scan import until
from pytensor.scan.basic import scan from pytensor.scan.basic import scan
from pytensor.scan.op import Scan, ScanInfo from pytensor.scan.op import Scan, ScanInfo
from pytensor.tensor import as_tensor, empty, random from pytensor.tensor import as_tensor, empty, random
from pytensor.tensor.math import gammaln, log from pytensor.tensor.type import dmatrix, dvector, matrix, scalar
from pytensor.tensor.type import dmatrix, dvector, matrix, scalar, vector
from tests.link.jax.test_basic import compare_jax_and_py from tests.link.jax.test_basic import compare_jax_and_py
from tests.scan.test_basic import ScanCompatibilityTests from tests.scan.test_basic import ScanCompatibilityTests
...@@ -393,242 +392,6 @@ def test_dynamic_sequence_length(): ...@@ -393,242 +392,6 @@ def test_dynamic_sequence_length():
np.testing.assert_allclose(f2(np.zeros((0, 3))), np.empty((0, 3))) np.testing.assert_allclose(f2(np.zeros((0, 3))), np.empty((0, 3)))
def SEIR_model_logp():
"""Setup a Scan implementation of a SEIR model.
SEIR model definition:
S[t+1] = S[t] - B[t]
E[t+1] = E[t] +B[t] - C[t]
I[t+1] = I[t+1] + C[t] - D[t]
B[t] ~ Binom(S[t], beta)
C[t] ~ Binom(E[t], gamma)
D[t] ~ Binom(I[t], delta)
"""
def binomln(n, k):
return gammaln(n + 1) - gammaln(k + 1) - gammaln(n - k + 1)
def binom_log_prob(n, p, value):
return binomln(n, value) + value * log(p) + (n - value) * log(1 - p)
# sequences
C_t = vector("C_t", dtype="int32", shape=(1200,))
D_t = vector("D_t", dtype="int32", shape=(1200,))
# outputs_info (initial conditions)
st0 = scalar("s_t0")
et0 = scalar("e_t0")
it0 = scalar("i_t0")
# non_sequences
beta = scalar("beta")
gamma = scalar("gamma")
delta = scalar("delta")
def seir_one_step(ct0, dt0, st0, et0, it0, beta, gamma, delta):
# bt0 = trng.binomial(n=st0, p=beta)
bt0 = st0 * beta
bt0 = bt0.astype(st0.dtype)
logp_c1 = binom_log_prob(et0, gamma, ct0)
logp_d1 = binom_log_prob(it0, delta, dt0)
st1 = st0 - bt0
et1 = et0 + bt0 - ct0
it1 = it0 + ct0 - dt0
return st1, et1, it1, logp_c1, logp_d1
(st, et, it, logp_c_all, logp_d_all) = scan(
fn=seir_one_step,
sequences=[C_t, D_t],
outputs_info=[st0, et0, it0, None, None],
non_sequences=[beta, gamma, delta],
return_updates=False,
)
st.name = "S_t"
et.name = "E_t"
it.name = "I_t"
logp_c_all.name = "C_t_logp"
logp_d_all.name = "D_t_logp"
st0_val, et0_val, it0_val = np.array(100.0), np.array(50.0), np.array(25.0)
beta_val, gamma_val, delta_val = (
np.array(0.277792),
np.array(0.135330),
np.array(0.108753),
)
C_t_val = np.array([3, 5, 8, 13, 21, 26, 10, 3] * 150, dtype=np.int32)
D_t_val = np.array([1, 2, 3, 7, 9, 11, 5, 1] * 150, dtype=np.int32)
assert C_t_val.shape == D_t_val.shape == C_t.type.shape == D_t.type.shape
test_input_vals = [
C_t_val,
D_t_val,
st0_val,
et0_val,
it0_val,
beta_val,
gamma_val,
delta_val,
]
loss_graph = logp_c_all.sum() + logp_d_all.sum()
return dict(
graph_inputs=[C_t, D_t, st0, et0, it0, beta, gamma, delta],
differentiable_vars=[st0, et0, it0, beta, gamma, delta],
test_input_vals=test_input_vals,
loss_graph=loss_graph,
)
def cyclical_reduction():
"""Setup a Scan implementation of the cyclical reduction algorithm.
This solves the matrix equation A @ X @ X + B @ X + C = 0 for X
Adapted from https://github.com/jessegrabowski/gEconpy/blob/da495b22ac383cb6cb5dec15f305506aebef7302/gEconpy/solvers/cycle_reduction.py#L187
"""
def stabilize(x, jitter=1e-16):
return x + jitter * pt.eye(x.shape[0])
def step(A0, A1, A2, A1_hat, norm, step_num, tol):
def cycle_step(A0, A1, A2, A1_hat, _norm, step_num):
tmp = pt.dot(
pt.vertical_stack(A0, A2),
pt.linalg.solve(
stabilize(A1),
pt.horizontal_stack(A0, A2),
assume_a="gen",
check_finite=False,
),
)
n = A0.shape[0]
idx_0 = pt.arange(n)
idx_1 = idx_0 + n
A1 = A1 - tmp[idx_0, :][:, idx_1] - tmp[idx_1, :][:, idx_0]
A0 = -tmp[idx_0, :][:, idx_0]
A2 = -tmp[idx_1, :][:, idx_1]
A1_hat = A1_hat - tmp[idx_1, :][:, idx_0]
A0_L1_norm = pt.linalg.norm(A0, ord=1)
return A0, A1, A2, A1_hat, A0_L1_norm, step_num + 1
return ifelse(
norm < tol,
(A0, A1, A2, A1_hat, norm, step_num),
cycle_step(A0, A1, A2, A1_hat, norm, step_num),
)
A = pt.matrix("A", shape=(20, 20))
B = pt.matrix("B", shape=(20, 20))
C = pt.matrix("C", shape=(20, 20))
norm = np.array(1e9, dtype="float64")
step_num = pt.zeros((), dtype="int32")
max_iter = 100
tol = 1e-7
(*_, A1_hat, norm, _n_steps) = scan(
step,
outputs_info=[A, B, C, B, norm, step_num],
non_sequences=[tol],
n_steps=max_iter,
return_updates=False,
)
A1_hat = A1_hat[-1]
T = -pt.linalg.solve(stabilize(A1_hat), A, assume_a="gen", check_finite=False)
rng = np.random.default_rng(sum(map(ord, "cycle_reduction")))
n = A.type.shape[0]
A_test = rng.standard_normal(size=(n, n))
C_test = rng.standard_normal(size=(n, n))
# B must be invertible, so we make it symmetric positive-definite
B_rand = rng.standard_normal(size=(n, n))
B_test = B_rand @ B_rand.T + np.eye(n) * 1e-3
return dict(
graph_inputs=[A, B, C],
differentiable_vars=[A, B, C],
test_input_vals=[A_test, B_test, C_test],
loss_graph=pt.sum(T),
)
@pytest.mark.parametrize("gradient_backend", ["PYTENSOR", "JAX"])
@pytest.mark.parametrize("mode", ("0forward", "1backward", "2both"))
@pytest.mark.parametrize("model", [cyclical_reduction, SEIR_model_logp])
def test_scan_benchmark(model, mode, gradient_backend, benchmark):
model_dict = model()
graph_inputs = model_dict["graph_inputs"]
differentiable_vars = model_dict["differentiable_vars"]
loss_graph = model_dict["loss_graph"]
test_input_vals = model_dict["test_input_vals"]
if gradient_backend == "PYTENSOR":
backward_loss = pt.grad(
loss_graph,
wrt=differentiable_vars,
)
match mode:
# TODO: Restore original test separately
case "0forward":
graph_outputs = [loss_graph]
case "1backward":
graph_outputs = backward_loss
case "2both":
graph_outputs = [loss_graph, *backward_loss]
case _:
raise ValueError(f"Unknown mode: {mode}")
jax_fn, _ = compare_jax_and_py(
graph_inputs,
graph_outputs,
test_input_vals,
jax_mode="JAX",
)
jax_fn.trust_input = True
else: # gradient_backend == "JAX"
import jax
loss_fn_tuple = function(graph_inputs, loss_graph, mode="JAX").vm.jit_fn
def loss_fn(*args):
return loss_fn_tuple(*args)[0]
match mode:
case "0forward":
jax_fn = jax.jit(loss_fn_tuple)
case "1backward":
jax_fn = jax.jit(
jax.grad(loss_fn, argnums=tuple(range(len(graph_inputs))[2:]))
)
case "2both":
value_and_grad_fn = jax.value_and_grad(
loss_fn, argnums=tuple(range(len(graph_inputs))[2:])
)
@jax.jit
def jax_fn(*args):
loss, grads = value_and_grad_fn(*args)
return loss, *grads
case _:
raise ValueError(f"Unknown mode: {mode}")
def block_until_ready(*inputs, jax_fn=jax_fn):
return [o.block_until_ready() for o in jax_fn(*inputs)]
block_until_ready(*test_input_vals) # Warmup
benchmark.pedantic(block_until_ready, test_input_vals, rounds=200, iterations=1)
def test_higher_order_derivatives(): def test_higher_order_derivatives():
ScanCompatibilityTests.check_higher_order_derivative(mode="JAX") ScanCompatibilityTests.check_higher_order_derivative(mode="JAX")
......
import numpy as np import numpy as np
import pytest import pytest
import scipy
from pytensor import config, function from pytensor import config
from pytensor.tensor.basic import switch from pytensor.tensor.basic import switch
from pytensor.tensor.math import ( from pytensor.tensor.math import (
add, add,
...@@ -14,7 +13,6 @@ from pytensor.tensor.math import ( ...@@ -14,7 +13,6 @@ from pytensor.tensor.math import (
ge, ge,
gt, gt,
int_div, int_div,
isinf,
isnan, isnan,
le, le,
log, log,
...@@ -107,28 +105,6 @@ def test_logsoftmax(axis): ...@@ -107,28 +105,6 @@ def test_logsoftmax(axis):
compare_mlx_and_py([x], [out], [x_test_value]) compare_mlx_and_py([x], [out], [x_test_value])
@pytest.mark.parametrize("size", [(10, 10), (1000, 1000)])
@pytest.mark.parametrize("axis", [0, 1])
def test_logsumexp_benchmark(size, axis, benchmark):
X = matrix("X")
X_max = pt_max(X, axis=axis, keepdims=True)
X_max = switch(isinf(X_max), 0, X_max)
X_lse = log(pt_sum(exp(X - X_max), axis=axis, keepdims=True)) + X_max
rng = np.random.default_rng(23920)
X_val = rng.normal(size=size)
X_lse_fn = function([X], X_lse, mode="MLX")
# JIT compile first
_ = X_lse_fn(X_val)
res = benchmark(X_lse_fn, X_val)
exp_res = scipy.special.logsumexp(X_val, axis=axis, keepdims=True)
np.testing.assert_array_almost_equal(res, exp_res)
def test_multiple_input_multiply(): def test_multiple_input_multiply():
x, y, z = vectors("xyz") x, y, z = vectors("xyz")
out = mul(x, y, z) out = mul(x, y, z)
......
from functools import partial
import numpy as np import numpy as np
import pytest import pytest
from pytensor import function from pytensor.tensor import dmatrix
from pytensor.tensor import dmatrix, tensor
from pytensor.tensor.signal import convolve1d from pytensor.tensor.signal import convolve1d
from tests.link.numba.test_basic import compare_numba_and_py from tests.link.numba.test_basic import compare_numba_and_py
from tests.tensor.signal.test_conv import convolve1d_grad_benchmarker
pytestmark = pytest.mark.filterwarnings( pytestmark = pytest.mark.filterwarnings(
...@@ -43,31 +39,3 @@ def test_convolve1d(mode, bcast_order): ...@@ -43,31 +39,3 @@ def test_convolve1d(mode, bcast_order):
np.swapaxes(numba_fn(test_y, test_x), 0, 1), np.swapaxes(numba_fn(test_y, test_x), 0, 1),
res, res,
) )
@pytest.mark.parametrize("mode", ("full", "valid"), ids=lambda x: f"mode={x}")
@pytest.mark.parametrize("batch", (False, True), ids=lambda x: f"batch={x}")
def test_convolve1d_benchmark_numba(batch, mode, benchmark):
    """Benchmark 1D convolution (optionally batched) on the NUMBA backend.

    The result is validated against a batch-vectorized ``np.convolve``
    before timing.
    """
    x = tensor(shape=(7, 183) if batch else (183,))
    y = tensor(shape=(7, 6) if batch else (6,))
    out = convolve1d(x, y, mode=mode)
    fn = function([x, y], out, mode="NUMBA", trust_input=True)

    rng = np.random.default_rng()
    x_test = rng.normal(size=(x.type.shape)).astype(x.type.dtype)
    y_test = rng.normal(size=(y.type.shape)).astype(y.type.dtype)

    # Reference implementation: np.convolve broadcast over the leading batch dim.
    np_convolve1d = np.vectorize(
        partial(np.convolve, mode=mode), signature="(x),(y)->(z)"
    )

    np.testing.assert_allclose(
        fn(x_test, y_test),
        np_convolve1d(x_test, y_test),
    )
    benchmark(fn, x_test, y_test)
@pytest.mark.parametrize("convolve_mode", ["full", "valid"])
def test_convolve1d_grad_benchmark_numba(convolve_mode, benchmark):
    # Delegates to the shared gradient-benchmark helper, using the NUMBA backend.
    convolve1d_grad_benchmarker(convolve_mode, "NUMBA", benchmark)
...@@ -522,23 +522,6 @@ class TestNumbaWarnings: ...@@ -522,23 +522,6 @@ class TestNumbaWarnings:
np.testing.assert_allclose(fn(A_test, b_test), np.dot(A_test, b_test[:, None])) np.testing.assert_allclose(fn(A_test, b_test), np.dot(A_test, b_test[:, None]))
@pytest.mark.parametrize("mode", ("default", "trust_input", "direct"))
def test_function_overhead(mode, benchmark):
    """Benchmark per-call overhead of a trivial NUMBA-compiled function.

    "default" goes through the full Function call machinery, "trust_input"
    skips input validation, and "direct" calls the underlying jitted
    function with no Python wrapper at all.
    """
    x = pt.vector("x")
    out = pt.exp(x)

    fn = function([x], out, mode="NUMBA")
    if mode == "trust_input":
        fn.trust_input = True
    elif mode == "direct":
        fn = fn.vm.jit_fn

    test_x = np.zeros(1000)
    # exp(0) == 1 elementwise, so the sum equals the vector length.
    assert np.sum(fn(test_x)) == 1000

    benchmark(fn, test_x)
class ComplexType: class ComplexType:
def __init__(self, a, b): def __init__(self, a, b):
self.a = a self.a = a
......
...@@ -9,7 +9,7 @@ from pytensor.tensor.basic import Alloc, ARange, constant ...@@ -9,7 +9,7 @@ from pytensor.tensor.basic import Alloc, ARange, constant
from pytensor.tensor.blockwise import Blockwise, BlockwiseWithCoreShape from pytensor.tensor.blockwise import Blockwise, BlockwiseWithCoreShape
from pytensor.tensor.elemwise import DimShuffle, Elemwise from pytensor.tensor.elemwise import DimShuffle, Elemwise
from pytensor.tensor.nlinalg import SVD, Det from pytensor.tensor.nlinalg import SVD, Det
from pytensor.tensor.slinalg import Cholesky, cholesky from pytensor.tensor.slinalg import Cholesky
from tests.link.numba.test_basic import compare_numba_and_py, numba_mode from tests.link.numba.test_basic import compare_numba_and_py, numba_mode
...@@ -52,17 +52,6 @@ def test_non_square_blockwise(): ...@@ -52,17 +52,6 @@ def test_non_square_blockwise():
fn([3, 4, 5]) fn([3, 4, 5])
def test_blockwise_benchmark(benchmark):
    """Benchmark a batched (Blockwise) Cholesky factorization on NUMBA."""
    x = tensor(shape=(5, 3, 3))
    out = cholesky(x)
    # Sanity check: the batched cholesky must be dispatched through Blockwise.
    assert isinstance(out.owner.op, Blockwise)

    fn = function([x], out, mode="NUMBA")
    # Batch of scaled identity matrices: trivially positive-definite inputs.
    x_test = np.eye(3) * np.arange(1, 6)[:, None, None]
    fn(x_test)  # JIT compile
    benchmark(fn, x_test)
def test_repeated_args(): def test_repeated_args():
x = tensor3("x") x = tensor3("x")
x_test = np.full((1, 1, 1), 2.0, dtype=x.type.dtype) x_test = np.full((1, 1, 1), 2.0, dtype=x.type.dtype)
......
...@@ -2,7 +2,6 @@ import contextlib ...@@ -2,7 +2,6 @@ import contextlib
import numpy as np import numpy as np
import pytest import pytest
import scipy.special
import pytensor import pytensor
import pytensor.tensor as pt import pytensor.tensor as pt
...@@ -13,7 +12,7 @@ from pytensor.compile.ops import deep_copy_op ...@@ -13,7 +12,7 @@ from pytensor.compile.ops import deep_copy_op
from pytensor.gradient import grad from pytensor.gradient import grad
from pytensor.scalar import Composite, float64 from pytensor.scalar import Composite, float64
from pytensor.scalar import add as scalar_add from pytensor.scalar import add as scalar_add
from pytensor.tensor import blas, matrix, tensor, tensor3 from pytensor.tensor import blas, matrix, tensor3
from pytensor.tensor.elemwise import CAReduce, DimShuffle, Elemwise from pytensor.tensor.elemwise import CAReduce, DimShuffle, Elemwise
from pytensor.tensor.math import All, Any, Max, Min, Prod, ProdWithoutZeros, Sum from pytensor.tensor.math import All, Any, Max, Min, Prod, ProdWithoutZeros, Sum
from pytensor.tensor.special import LogSoftmax, Softmax, SoftmaxGrad from pytensor.tensor.special import LogSoftmax, Softmax, SoftmaxGrad
...@@ -22,11 +21,7 @@ from tests.link.numba.test_basic import ( ...@@ -22,11 +21,7 @@ from tests.link.numba.test_basic import (
numba_mode, numba_mode,
scalar_my_multi_out, scalar_my_multi_out,
) )
from tests.tensor.test_elemwise import ( from tests.tensor.test_elemwise import check_elemwise_runtime_broadcast
careduce_benchmark_tester,
check_elemwise_runtime_broadcast,
dimshuffle_benchmark,
)
rng = np.random.default_rng(42849) rng = np.random.default_rng(42849)
...@@ -686,78 +681,6 @@ def test_gammainc_wrt_k_grad(): ...@@ -686,78 +681,6 @@ def test_gammainc_wrt_k_grad():
) )
class TestsBenchmark:
    """Benchmarks for elemwise and reduction graphs on the NUMBA backend."""

    def test_elemwise_speed(self, benchmark):
        # Benchmark a broadcasted elemwise graph, bypassing the Function
        # wrapper by calling the jitted VM function directly.
        x = pt.dmatrix("y")
        y = pt.dvector("z")
        out = np.exp(2 * x * y + y)

        rng = np.random.default_rng(42)
        x_val = rng.normal(size=(200, 500))
        y_val = rng.normal(size=500)

        func = function([x, y], out, mode="NUMBA")
        func = func.vm.jit_fn
        (out,) = func(x_val, y_val)
        np.testing.assert_allclose(np.exp(2 * x_val * y_val + y_val), out)

        benchmark(func, x_val, y_val)

    def test_fused_elemwise_benchmark(self, benchmark):
        # Gaussian logp plus its gradient: exercises elemwise fusion on a
        # shared-variable-only graph (no per-call input handling).
        rng = np.random.default_rng(123)
        size = 100_000
        x = pytensor.shared(rng.normal(size=size), name="x")
        mu = pytensor.shared(rng.normal(size=size), name="mu")

        logp = -((x - mu) ** 2) / 2
        grad_logp = grad(logp.sum(), x)

        func = pytensor.function([], [logp, grad_logp], mode="NUMBA")
        # JIT compile first
        func()
        benchmark(func)

    @pytest.mark.parametrize("size", [(10, 10), (1000, 1000), (10000, 10000)])
    @pytest.mark.parametrize("axis", [0, 1])
    def test_logsumexp_benchmark(self, size, axis, benchmark):
        # Numerically-stable logsumexp, validated against scipy before timing.
        X = pt.matrix("X")
        X_max = pt.max(X, axis=axis, keepdims=True)
        # Replace an infinite max with 0 so X - X_max cannot produce nan.
        X_max = pt.switch(pt.isinf(X_max), 0, X_max)
        X_lse = pt.log(pt.sum(pt.exp(X - X_max), axis=axis, keepdims=True)) + X_max

        rng = np.random.default_rng(23920)
        X_val = rng.normal(size=size)

        X_lse_fn = pytensor.function([X], X_lse, mode="NUMBA")

        # JIT compile first
        res = X_lse_fn(X_val)
        exp_res = scipy.special.logsumexp(X_val, axis=axis, keepdims=True)
        np.testing.assert_array_almost_equal(res, exp_res)
        benchmark(X_lse_fn, X_val)

    @pytest.mark.parametrize(
        "axis",
        (0, 1, 2, (0, 1), (0, 2), (1, 2), None),
        ids=lambda x: f"axis={x}",
    )
    @pytest.mark.parametrize(
        "c_contiguous",
        (True, False),
        ids=lambda x: f"c_contiguous={x}",
    )
    def test_numba_careduce_benchmark(self, axis, c_contiguous, benchmark):
        # Delegates to the shared CAReduce benchmark helper.
        return careduce_benchmark_tester(
            axis, c_contiguous, mode="NUMBA", benchmark=benchmark
        )

    @pytest.mark.parametrize("c_contiguous", (True, False))
    def test_dimshuffle(self, c_contiguous, benchmark):
        # Delegates to the shared DimShuffle benchmark helper.
        dimshuffle_benchmark("NUMBA", c_contiguous, benchmark)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"x, y", "x, y",
[ [
...@@ -855,18 +778,3 @@ def test_BatchedDot(x, y, exc): ...@@ -855,18 +778,3 @@ def test_BatchedDot(x, y, exc):
g, g,
[x_test_value, y_test_value], [x_test_value, y_test_value],
) )
@pytest.mark.parametrize("dtype", ("float64", "float32", "mixed"))
def test_mat_vec_dot_performance(dtype, benchmark):
    """Benchmark matrix-vector dot on NUMBA for same- and mixed-dtype inputs."""
    # "mixed" pairs a float64 matrix with a float32 vector.
    A = tensor("A", shape=(512, 512), dtype="float64" if dtype == "mixed" else dtype)
    x = tensor("x", shape=(512,), dtype="float32" if dtype == "mixed" else dtype)
    out = ptm.dot(A, x)
    fn = function([A, x], out, mode="NUMBA", trust_input=True)

    rng = np.random.default_rng(948)
    A_test = rng.standard_normal(size=A.type.shape, dtype=A.type.dtype)
    x_test = rng.standard_normal(size=x.type.shape, dtype=x.type.dtype)
    # Loose tolerance: float32 inputs accumulate more rounding error.
    np.testing.assert_allclose(fn(A_test, x_test), np.dot(A_test, x_test), atol=1e-4)
    benchmark(fn, A_test, x_test)
...@@ -13,7 +13,6 @@ from pytensor.compile.mode import Mode ...@@ -13,7 +13,6 @@ from pytensor.compile.mode import Mode
from pytensor.graph.rewriting.db import RewriteDatabaseQuery from pytensor.graph.rewriting.db import RewriteDatabaseQuery
from pytensor.link.numba.linker import NumbaLinker from pytensor.link.numba.linker import NumbaLinker
from pytensor.tensor.math import Max from pytensor.tensor.math import Max
from tests.fixtures import * # noqa: F403
opts = RewriteDatabaseQuery(include=[None], exclude=["cxx_only", "BlasOpt"]) opts = RewriteDatabaseQuery(include=[None], exclude=["cxx_only", "BlasOpt"])
...@@ -76,72 +75,3 @@ def test_careduce_performance(careduce_fn, numpy_fn, axis, inputs, input_vals): ...@@ -76,72 +75,3 @@ def test_careduce_performance(careduce_fn, numpy_fn, axis, inputs, input_vals):
# FIXME: Why are we asserting >=? Numba could be doing worse than numpy! # FIXME: Why are we asserting >=? Numba could be doing worse than numpy!
assert mean_numba_time / mean_numpy_time >= 0.75 assert mean_numba_time / mean_numpy_time >= 0.75
@pytest.mark.parametrize("cache", (False, True))
def test_radon_model_compile_repeatedly_numba_benchmark(cache, radon_model, benchmark):
    """Benchmark repeated NUMBA compilation of the same radon model.

    Parametrized over the numba disk cache being enabled or disabled; each
    round recompiles the model from scratch and calls it once.
    """
    joined_inputs, [model_logp, model_dlogp] = radon_model
    rng = np.random.default_rng(1)
    x = rng.normal(size=joined_inputs.type.shape).astype(config.floatX)

    def compile_and_call_once():
        # Compilation dominates this timing; the single call confirms it works.
        with config.change_flags(numba__cache=cache):
            fn = function(
                [joined_inputs],
                [model_logp, model_dlogp],
                mode="NUMBA",
                trust_input=True,
            )
            fn(x)

    benchmark.pedantic(compile_and_call_once, rounds=5, iterations=1)
@pytest.mark.parametrize("cache", (False, True))
def test_radon_model_compile_variants_numba_benchmark(
    cache, radon_model, radon_model_variants, benchmark
):
    """Test compilation speed when a slightly variant of a function is compiled each time.

    This test more realistically simulates a use case where a model is recompiled
    multiple times with small changes, such as in an interactive environment.

    NOTE: For this test to be meaningful on subsequent runs, the cache must be cleared
    """
    joined_inputs, [model_logp, model_dlogp] = radon_model
    rng = np.random.default_rng(1)
    x = rng.normal(size=joined_inputs.type.shape).astype(config.floatX)

    # Compile base function once to populate the cache
    fn = function(
        [joined_inputs], [model_logp, model_dlogp], mode="NUMBA", trust_input=True
    )
    fn(x)

    def compile_and_call_once():
        # Each variant differs slightly from the cached base model, so the
        # cache can only help with shared sub-computations.
        with config.change_flags(numba__cache=cache):
            for joined_inputs, [model_logp, model_dlogp] in radon_model_variants:
                fn = function(
                    [joined_inputs],
                    [model_logp, model_dlogp],
                    mode="NUMBA",
                    trust_input=True,
                )
                fn(x)

    benchmark.pedantic(compile_and_call_once, rounds=1, iterations=1)
@pytest.mark.parametrize("cache", (False, True))
def test_radon_model_call_numba_benchmark(cache, radon_model, benchmark):
    """Benchmark the per-call cost of the compiled radon model (compiled once)."""
    joined_inputs, [model_logp, model_dlogp] = radon_model
    with config.change_flags(numba__cache=cache):
        fn = function(
            [joined_inputs], [model_logp, model_dlogp], mode="NUMBA", trust_input=True
        )

    rng = np.random.default_rng(1)
    x = rng.normal(size=joined_inputs.type.shape).astype(config.floatX)
    fn(x)  # warmup
    benchmark.pedantic(fn, (x,), rounds=10_000, iterations=10)
...@@ -160,7 +160,7 @@ def test_xit_xot_types( ...@@ -160,7 +160,7 @@ def test_xit_xot_types(
assert np.allclose(res_val, output_vals) assert np.allclose(res_val, output_vals)
def test_scan_multiple_output(benchmark): def test_scan_multiple_output():
"""Test a scan implementation of a SEIR model. """Test a scan implementation of a SEIR model.
SEIR model definition: SEIR model definition:
...@@ -243,14 +243,12 @@ def test_scan_multiple_output(benchmark): ...@@ -243,14 +243,12 @@ def test_scan_multiple_output(benchmark):
gamma_val, gamma_val,
delta_val, delta_val,
] ]
scan_fn, _ = compare_numba_and_py( compare_numba_and_py(
[pt_C, pt_D, st0, et0, it0, logp_c, logp_d, beta, gamma, delta], [pt_C, pt_D, st0, et0, it0, logp_c, logp_d, beta, gamma, delta],
out, out,
test_input_vals, test_input_vals,
) )
benchmark(scan_fn, *test_input_vals)
def test_scan_tap_output(): def test_scan_tap_output():
a_pt = pt.scalar("a") a_pt = pt.scalar("a")
...@@ -415,8 +413,8 @@ def test_inner_graph_optimized(): ...@@ -415,8 +413,8 @@ def test_inner_graph_optimized():
) )
def test_vector_taps_benchmark(benchmark): def test_vector_taps():
"""Test vector taps performance. """Test vector taps.
Vector taps get indexed into numeric types, that must be wrapped back into Vector taps get indexed into numeric types, that must be wrapped back into
scalar arrays. The numba Scan implementation has an optimization to reuse scalar arrays. The numba Scan implementation has an optimization to reuse
...@@ -464,8 +462,6 @@ def test_vector_taps_benchmark(benchmark): ...@@ -464,8 +462,6 @@ def test_vector_taps_benchmark(benchmark):
for numba_r, ref_r in zip(numba_res, ref_res, strict=True): for numba_r, ref_r in zip(numba_res, ref_res, strict=True):
np.testing.assert_array_almost_equal(numba_r, ref_r) np.testing.assert_array_almost_equal(numba_r, ref_r)
benchmark(numba_fn, *test.values())
@pytest.mark.parametrize("n_steps_constant", (True, False)) @pytest.mark.parametrize("n_steps_constant", (True, False))
def test_inplace_taps(n_steps_constant): def test_inplace_taps(n_steps_constant):
...@@ -542,10 +538,10 @@ def test_inplace_taps(n_steps_constant): ...@@ -542,10 +538,10 @@ def test_inplace_taps(n_steps_constant):
) )
@pytest.mark.parametrize("n_steps, op_size", [(10, 2), (512, 2), (512, 256)]) @pytest.mark.parametrize("n_steps, op_size", [(10, 2), (512, 2), (512, 256)])
class TestScanSITSOTBuffer: class TestScanSITSOTBuffer:
def buffer_tester(self, n_steps, op_size, buffer_size, benchmark=None): def buffer_tester(self, n_steps, op_size, buffer_size):
x0 = pt.vector(shape=(op_size,), dtype="float64") x0 = pt.vector(shape=(op_size,), dtype="float64")
xs = pytensor.scan( xs = pytensor.scan(
fn=lambda xtm1: (xtm1 + 1), fn=lambda xtm1: xtm1 + 1,
outputs_info=[x0], outputs_info=[x0],
n_steps=n_steps - 1, # 1- makes it easier to align/misalign n_steps=n_steps - 1, # 1- makes it easier to align/misalign
return_updates=False, return_updates=False,
...@@ -582,21 +578,14 @@ class TestScanSITSOTBuffer: ...@@ -582,21 +578,14 @@ class TestScanSITSOTBuffer:
buffer = scan_node.inputs[1] buffer = scan_node.inputs[1]
assert buffer.type.shape[0] == expected_buffer_size assert buffer.type.shape[0] == expected_buffer_size
if benchmark is not None:
numba_fn.trust_input = True
benchmark(numba_fn, x_test)
def test_sit_sot_buffer(self, n_steps, op_size, buffer_size): def test_sit_sot_buffer(self, n_steps, op_size, buffer_size):
self.buffer_tester(n_steps, op_size, buffer_size, benchmark=None) self.buffer_tester(n_steps, op_size, buffer_size)
def test_sit_sot_buffer_benchmark(self, n_steps, op_size, buffer_size, benchmark):
self.buffer_tester(n_steps, op_size, buffer_size, benchmark=benchmark)
@pytest.mark.parametrize("constant_n_steps", [False, True]) @pytest.mark.parametrize("constant_n_steps", [False, True])
@pytest.mark.parametrize("n_steps_val", [1, 1000]) @pytest.mark.parametrize("n_steps_val", [1, 1000])
class TestScanMITSOTBuffer: class TestScanMITSOTBuffer:
def buffer_tester(self, constant_n_steps, n_steps_val, benchmark=None): def buffer_tester(self, constant_n_steps, n_steps_val):
"""Make sure we can handle storage changes caused by the `scan_save_mem` rewrite.""" """Make sure we can handle storage changes caused by the `scan_save_mem` rewrite."""
def f_pow2(x_tm2, x_tm1): def f_pow2(x_tm2, x_tm1):
...@@ -644,15 +633,9 @@ class TestScanMITSOTBuffer: ...@@ -644,15 +633,9 @@ class TestScanMITSOTBuffer:
on_unused_input="ignore", on_unused_input="ignore",
) )
assert tuple(mitsot_buffer_shape) == (2,) assert tuple(mitsot_buffer_shape) == (2,)
if benchmark is not None:
numba_fn.trust_input = True
benchmark(numba_fn, *test_vals)
def test_mit_sot_buffer(self, constant_n_steps, n_steps_val): def test_mit_sot_buffer(self, constant_n_steps, n_steps_val):
self.buffer_tester(constant_n_steps, n_steps_val, benchmark=None) self.buffer_tester(constant_n_steps, n_steps_val)
def test_mit_sot_buffer_benchmark(self, constant_n_steps, n_steps_val, benchmark):
self.buffer_tester(constant_n_steps, n_steps_val, benchmark=benchmark)
def test_higher_order_derivatives(): def test_higher_order_derivatives():
......
...@@ -52,8 +52,6 @@ from pytensor.tensor.subtensor import Subtensor ...@@ -52,8 +52,6 @@ from pytensor.tensor.subtensor import Subtensor
from pytensor.tensor.type import ( from pytensor.tensor.type import (
TensorType, TensorType,
dcol, dcol,
dmatrix,
dscalar,
dvector, dvector,
fmatrix, fmatrix,
fscalar, fscalar,
...@@ -2357,7 +2355,7 @@ def test_cvm_exception_handling(mode): ...@@ -2357,7 +2355,7 @@ def test_cvm_exception_handling(mode):
@pytest.mark.skipif( @pytest.mark.skipif(
not config.cxx, reason="G++ not available, so we need to skip this test." not config.cxx, reason="G++ not available, so we need to skip this test."
) )
def test_cython_performance(benchmark): def test_cython_performance():
# This implicitly confirms that the Cython version is being used # This implicitly confirms that the Cython version is being used
from pytensor.scan import scan_perform_ext # noqa: F401 from pytensor.scan import scan_perform_ext # noqa: F401
...@@ -2391,7 +2389,7 @@ def test_cython_performance(benchmark): ...@@ -2391,7 +2389,7 @@ def test_cython_performance(benchmark):
# Make sure we're actually computing a `Scan` # Make sure we're actually computing a `Scan`
assert any(isinstance(node.op, Scan) for node in f_cvm.maker.fgraph.apply_nodes) assert any(isinstance(node.op, Scan) for node in f_cvm.maker.fgraph.apply_nodes)
cvm_res = benchmark(f_cvm) cvm_res = f_cvm()
# Make sure the results are the same between the two implementations # Make sure the results are the same between the two implementations
assert np.allclose(cvm_res, py_res) assert np.allclose(cvm_res, py_res)
...@@ -2741,7 +2739,7 @@ class TestExamples: ...@@ -2741,7 +2739,7 @@ class TestExamples:
n_result = numpy_implementation(v_vsample) n_result = numpy_implementation(v_vsample)
utt.assert_allclose(t_result, n_result) utt.assert_allclose(t_result, n_result)
def test_reordering(self, benchmark): def test_reordering(self):
"""Test re-ordering of inputs. """Test re-ordering of inputs.
some rnn with multiple outputs and multiple inputs; other some rnn with multiple outputs and multiple inputs; other
...@@ -2800,38 +2798,13 @@ class TestExamples: ...@@ -2800,38 +2798,13 @@ class TestExamples:
v_x[i] = np.dot(v_u1[i], vW_in1) + v_u2[i] * vW_in2 + np.dot(v_x[i - 1], vW) v_x[i] = np.dot(v_u1[i], vW_in1) + v_u2[i] * vW_in2 + np.dot(v_x[i - 1], vW)
v_y[i] = np.dot(v_x[i - 1], vWout) + v_y[i - 1] v_y[i] = np.dot(v_x[i - 1], vWout) + v_y[i - 1]
(_pytensor_dump1, _pytensor_dump2, pytensor_x, pytensor_y) = benchmark( (_pytensor_dump1, _pytensor_dump2, pytensor_x, pytensor_y) = f4(
f4, v_u1, v_u2, v_x0, v_y0, vW_in1 v_u1, v_u2, v_x0, v_y0, vW_in1
) )
utt.assert_allclose(pytensor_x, v_x) utt.assert_allclose(pytensor_x, v_x)
utt.assert_allclose(pytensor_y, v_y) utt.assert_allclose(pytensor_y, v_y)
    def test_scan_as_tensor_on_gradients(self, benchmark):
        """Benchmark compiling the gradient of a simple scan graph."""
        to_scan = dvector("to_scan")
        seq = dmatrix("seq")
        f1 = dscalar("f1")

        def scanStep(prev, seq, f1):
            # prev: carried state; seq: current sequence row; f1: scale factor.
            return prev + f1 * seq

        scanned = scan(
            fn=scanStep,
            sequences=[seq],
            outputs_info=[to_scan],
            non_sequences=[f1],
            return_updates=False,
        )
        # Compile the forward function once (result unused; exercises compilation).
        function(inputs=[to_scan, seq, f1], outputs=scanned, allow_input_downcast=True)

        t_grad = grad(scanned.sum(), wrt=[to_scan, f1], consider_constant=[seq])
        # The benchmark measures the compile time of the gradient function itself.
        benchmark(
            function,
            inputs=[to_scan, seq, f1],
            outputs=t_grad,
            allow_input_downcast=True,
        )
def caching_nsteps_by_scan_op(self): def caching_nsteps_by_scan_op(self):
W = matrix("weights") W = matrix("weights")
initial = vector("initial") initial = vector("initial")
...@@ -3128,7 +3101,7 @@ class TestExamples: ...@@ -3128,7 +3101,7 @@ class TestExamples:
utt.assert_allclose(outputs, expected_outputs) utt.assert_allclose(outputs, expected_outputs)
@pytest.mark.slow @pytest.mark.slow
def test_hessian_bug_grad_grad_two_scans(self, benchmark): def test_hessian_bug_grad_grad_two_scans(self):
# Bug reported by Bitton Tenessi # Bug reported by Bitton Tenessi
# NOTE : The test to reproduce the bug reported by Bitton Tenessi # NOTE : The test to reproduce the bug reported by Bitton Tenessi
# was modified from its original version to be faster to run. # was modified from its original version to be faster to run.
...@@ -3163,7 +3136,7 @@ class TestExamples: ...@@ -3163,7 +3136,7 @@ class TestExamples:
H = hessian(cost, W) H = hessian(cost, W)
# print(".", file=sys.stderr) # print(".", file=sys.stderr)
f = function([W, n_steps], H) f = function([W, n_steps], H)
benchmark(f, np.ones((8,), dtype="float32"), 1) f(np.ones((8,), dtype="float32"), 1)
def test_grad_connectivity_matrix(self): def test_grad_connectivity_matrix(self):
def inner_fn(x_tm1, y_tm1, z_tm1): def inner_fn(x_tm1, y_tm1, z_tm1):
...@@ -3747,7 +3720,7 @@ class TestExamples: ...@@ -3747,7 +3720,7 @@ class TestExamples:
utt.assert_allclose(pytensor_x, v_x) utt.assert_allclose(pytensor_x, v_x)
utt.assert_allclose(pytensor_y, v_y) utt.assert_allclose(pytensor_y, v_y)
def test_multiple_outs_taps(self, benchmark): def test_multiple_outs_taps(self):
l = 5 l = 5
rng = np.random.default_rng(utt.fetch_seed()) rng = np.random.default_rng(utt.fetch_seed())
...@@ -3841,8 +3814,6 @@ class TestExamples: ...@@ -3841,8 +3814,6 @@ class TestExamples:
np.testing.assert_almost_equal(res[1], ny1) np.testing.assert_almost_equal(res[1], ny1)
np.testing.assert_almost_equal(res[2], ny2) np.testing.assert_almost_equal(res[2], ny2)
benchmark(f, v_u1, v_u2, v_x0, v_y0, vW_in1)
def _grad_mout_helper(self, n_iters, mode): def _grad_mout_helper(self, n_iters, mode):
rng = np.random.default_rng(utt.fetch_seed()) rng = np.random.default_rng(utt.fetch_seed())
n_hid = 3 n_hid = 3
......
...@@ -674,28 +674,6 @@ class TestPushOutAddScan: ...@@ -674,28 +674,6 @@ class TestPushOutAddScan:
vB = rng.uniform(size=(5, 5)).astype(config.floatX) vB = rng.uniform(size=(5, 5)).astype(config.floatX)
utt.assert_allclose(f(vA, vB), np.dot(vA.T, vB)) utt.assert_allclose(f(vA, vB), np.dot(vA.T, vB))
    def test_pregreedy_optimizer(self, benchmark):
        """Benchmark an RBM-chain-like graph built from two nested scans."""
        # Constant weights and biases fed through the scans as closed-over values.
        W = pt.zeros((5, 4))
        bv = pt.zeros((5,))
        bh = pt.zeros((4,))
        v = matrix("v")
        # First scan broadcasts the biases over the rows of v.
        (bv_t, bh_t) = scan(
            lambda _: [bv, bh],
            sequences=v,
            outputs_info=[None, None],
            return_updates=False,
        )
        # Second scan iterates a two-step chain using the broadcast biases.
        chain = scan(
            lambda x: dot(dot(x, W) + bh_t, W.T) + bv_t,
            outputs_info=v,
            n_steps=2,
            return_updates=False,
        )
        # TODO FIXME: Make this a real test and assert something.
        chain_fn = function([v], chain)

        benchmark(chain_fn, np.zeros((3, 5), dtype=config.floatX))
def test_machine_translation(self): def test_machine_translation(self):
""" """
This test case comes from https://github.com/rizar/scan-grad-speed and This test case comes from https://github.com/rizar/scan-grad-speed and
...@@ -1560,18 +1538,6 @@ class TestSaveMem: ...@@ -1560,18 +1538,6 @@ class TestSaveMem:
] ]
assert len(scan_nodes) == 1 assert len(scan_nodes) == 1
    def test_savemem_opt(self, benchmark):
        """Benchmark a scan graph eligible for the save_mem buffer rewrite."""
        # taps=[-2] forces a multi-step initial buffer for the first output.
        y0 = shared(np.ones((2, 10)))
        [_y1, y2] = scan(
            lambda y: [y, y],
            outputs_info=[dict(initial=y0, taps=[-2]), None],
            n_steps=5,
            return_updates=False,
        )
        # TODO FIXME: Make this a real test and assert something.
        fn = function([], y2.sum(), mode=self.mode)
        benchmark(fn)
def test_savemem_opt_0_step(self): def test_savemem_opt_0_step(self):
""" """
Test a case where the savemem optimization has the opportunity to Test a case where the savemem optimization has the opportunity to
......
...@@ -11,7 +11,6 @@ from pytensor.compile.function import function ...@@ -11,7 +11,6 @@ from pytensor.compile.function import function
from pytensor.compile.function.types import add_supervisor_to_fgraph from pytensor.compile.function.types import add_supervisor_to_fgraph
from pytensor.compile.mode import Mode, get_default_mode from pytensor.compile.mode import Mode, get_default_mode
from pytensor.configdefaults import config from pytensor.configdefaults import config
from pytensor.gradient import grad
from pytensor.graph.basic import Constant, equal_computations from pytensor.graph.basic import Constant, equal_computations
from pytensor.graph.fg import FunctionGraph from pytensor.graph.fg import FunctionGraph
from pytensor.graph.rewriting.basic import check_stack_trace, out2in from pytensor.graph.rewriting.basic import check_stack_trace, out2in
...@@ -1351,40 +1350,6 @@ class TestFusion: ...@@ -1351,40 +1350,6 @@ class TestFusion:
assert len(nodes) == 1 assert len(nodes) == 1
assert isinstance(nodes[0].op.scalar_op, Composite) assert isinstance(nodes[0].op.scalar_op, Composite)
    def test_eval_benchmark(self, benchmark):
        """Benchmark runtime of a fused Gaussian logp + gradient on FAST_RUN."""
        rng = np.random.default_rng(123)
        size = 100_000
        x = pytensor.shared(rng.normal(size=size), name="x")
        mu = pytensor.shared(rng.normal(size=size), name="mu")

        logp = -((x - mu) ** 2) / 2
        grad_logp = grad(logp.sum(), x)

        # Shared-variable-only graph: each benchmark call is pure computation.
        func = pytensor.function([], [logp, grad_logp], mode="FAST_RUN")
        benchmark(func)
    @pytest.mark.skipif(not config.cxx, reason="No cxx compiler")
    @pytest.mark.parametrize(
        "graph_fn, n, expected_n_repl",
        [
            ("deep_small_kernels", 20, (20, 60)),
            ("large_fuseable_graph", 25, (128, 876)),
        ],
    )
    def test_rewrite_benchmark(self, graph_fn, n, expected_n_repl, benchmark):
        """Benchmark the FusionOptimizer rewrite pass itself (not execution)."""
        # graph_fn names a sibling method that builds the test graph.
        inps, outs = getattr(self, graph_fn)(n)
        fg = FunctionGraph(inps, outs)
        opt = FusionOptimizer()

        def rewrite_func():
            # Clone so each round rewrites a fresh, unrewritten graph.
            fg_clone = fg.clone()
            _, nb_fused, nb_replacement, *_ = opt.apply(fg_clone)
            # fg_clone.dprint()
            return nb_fused, nb_replacement

        # Confirm the rewrite does the expected amount of work before timing it.
        assert rewrite_func() == expected_n_repl
        benchmark.pedantic(rewrite_func, rounds=7, iterations=5)
def test_no_warning_from_old_client(self): def test_no_warning_from_old_client(self):
# There used to be a warning issued when creating fuseable mapping # There used to be a warning issued when creating fuseable mapping
# for nodes that are no longer in the FunctionGraph # for nodes that are no longer in the FunctionGraph
......
...@@ -5006,37 +5006,3 @@ class TestBlockDiagDotToDotBlockDiag: ...@@ -5006,37 +5006,3 @@ class TestBlockDiagDotToDotBlockDiag:
original, include=("canonicalize", "stabilize", "specialize") original, include=("canonicalize", "stabilize", "specialize")
) )
assert_equal_computations([rewritten], [original]) assert_equal_computations([rewritten], [original])
    @pytest.mark.parametrize("rewrite", [True, False], ids=["rewrite", "no_rewrite"])
    @pytest.mark.parametrize("size", [10, 100, 1000], ids=["small", "medium", "large"])
    def test_benchmark(self, benchmark, size, rewrite):
        """Benchmark ``block_diag(a, b, c) @ d`` with and without the
        local_block_diag_dot_to_dot_block_diag rewrite."""
        rng = np.random.default_rng()
        # Split `size` into three random block sizes that sum to `size`.
        a_size = int(rng.uniform(1, int(0.8 * size)))
        b_size = int(rng.uniform(1, int(0.8 * (size - a_size))))
        c_size = size - a_size - b_size
        a = tensor("a", shape=(a_size, a_size))
        b = tensor("b", shape=(b_size, b_size))
        c = tensor("c", shape=(c_size, c_size))
        d = tensor("d", shape=(size,))

        x = pt.linalg.block_diag(a, b, c)
        out = x @ d

        mode = get_default_mode()
        if not rewrite:
            mode = mode.excluding("local_block_diag_dot_to_dot_block_diag")
        fn = pytensor.function([a, b, c, d], out, mode=mode)

        a_val = rng.normal(size=a.type.shape).astype(a.type.dtype)
        b_val = rng.normal(size=b.type.shape).astype(b.type.dtype)
        c_val = rng.normal(size=c.type.shape).astype(c.type.dtype)
        d_val = rng.normal(size=d.type.shape).astype(d.type.dtype)
        benchmark(
            fn,
            a_val,
            b_val,
            c_val,
            d_val,
        )
...@@ -8,7 +8,7 @@ from scipy.signal import convolve2d as scipy_convolve2d ...@@ -8,7 +8,7 @@ from scipy.signal import convolve2d as scipy_convolve2d
from pytensor import config, function, grad from pytensor import config, function, grad
from pytensor.graph.rewriting import rewrite_graph from pytensor.graph.rewriting import rewrite_graph
from pytensor.graph.traversal import ancestors, io_toposort from pytensor.graph.traversal import ancestors, io_toposort
from pytensor.tensor import matrix, tensor, vector from pytensor.tensor import matrix, vector
from pytensor.tensor.basic import expand_dims from pytensor.tensor.basic import expand_dims
from pytensor.tensor.blockwise import Blockwise from pytensor.tensor.blockwise import Blockwise
from pytensor.tensor.signal.conv import Convolve1d, convolve1d, convolve2d from pytensor.tensor.signal.conv import Convolve1d, convolve1d, convolve2d
...@@ -122,27 +122,6 @@ def test_convolve1d_valid_grad(static_shape): ...@@ -122,27 +122,6 @@ def test_convolve1d_valid_grad(static_shape):
assert full_mode.eval({larger: smaller_test, smaller: larger_test}) == True # noqa: E712 assert full_mode.eval({larger: smaller_test, smaller: larger_test}) == True # noqa: E712
def convolve1d_grad_benchmarker(convolve_mode, mode, benchmark):
    """Shared helper: benchmark the gradient of convolve1d wrt the smaller input.

    Parameters
    ----------
    convolve_mode : {"full", "valid"}
        Convolution mode of the forward graph.
    mode : str
        PyTensor compilation mode (e.g. "NUMBA" or "FAST_RUN").
    benchmark
        pytest-benchmark fixture.
    """
    # Use None core shape so PyTensor doesn't know which mode to use until runtime.
    larger = tensor("larger", shape=(8, None))
    smaller = tensor("smaller", shape=(8, None))
    grad_wrt_smaller = grad(
        convolve1d(larger, smaller, mode=convolve_mode).sum(), wrt=smaller
    )
    fn = function([larger, smaller], grad_wrt_smaller, trust_input=True, mode=mode)

    # Seed varies with the convolution mode. The original compared the backend
    # `mode` (never "full") against "full", which was always False, so both
    # convolve modes silently shared one seed.
    rng = np.random.default_rng([119, convolve_mode == "full"])
    test_larger = rng.normal(size=(8, 1024)).astype(larger.type.dtype)
    test_smaller = rng.normal(size=(8, 16)).astype(smaller.type.dtype)
    benchmark(fn, test_larger, test_smaller)
@pytest.mark.parametrize("convolve_mode", ["full", "valid"])
def test_convolve1d_grad_benchmark_c(convolve_mode, benchmark):
    # Same gradient benchmark as the numba variant, on the default C backend.
    convolve1d_grad_benchmarker(convolve_mode, "FAST_RUN", benchmark)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"kernel_shape", [(3, 3), (5, 3), (5, 8)], ids=lambda x: f"kernel_shape={x}" "kernel_shape", [(3, 3), (5, 3), (5, 8)], ids=lambda x: f"kernel_shape={x}"
) )
......
...@@ -118,7 +118,6 @@ from pytensor.tensor.type import ( ...@@ -118,7 +118,6 @@ from pytensor.tensor.type import (
ivector, ivector,
lscalar, lscalar,
lvector, lvector,
matrices,
matrix, matrix,
row, row,
scalar, scalar,
...@@ -2106,32 +2105,6 @@ class TestJoinAndSplit: ...@@ -2106,32 +2105,6 @@ class TestJoinAndSplit:
assert np.allclose(r, expected) assert np.allclose(r, expected)
assert r.base is x_test assert r.base is x_test
@pytest.mark.parametrize("gc", (True, False), ids=lambda x: f"gc={x}")
@pytest.mark.parametrize("memory_layout", ["C-contiguous", "F-contiguous", "Mixed"])
@pytest.mark.parametrize("axis", (0, 1), ids=lambda x: f"axis={x}")
@pytest.mark.parametrize("ndim", (1, 2), ids=["vector", "matrix"])
@config.change_flags(cmodule__warn_no_version=False)
def test_join_performance(self, ndim, axis, memory_layout, gc, benchmark):
if ndim == 1 and not (memory_layout == "C-contiguous" and axis == 0):
pytest.skip("Redundant parametrization")
n = 64
inputs = vectors("abcdef") if ndim == 1 else matrices("abcdef")
out = join(axis, *inputs)
fn = pytensor.function(inputs, Out(out, borrow=True), trust_input=True)
fn.vm.allow_gc = gc
test_values = [np.zeros((n, n)[:ndim], dtype=inputs[0].dtype) for _ in inputs]
if memory_layout == "C-contiguous":
pass
elif memory_layout == "F-contiguous":
test_values = [t.T for t in test_values]
elif memory_layout == "Mixed":
test_values = [t if i % 2 else t.T for i, t in enumerate(test_values)]
else:
raise ValueError
assert fn(*test_values).shape == (n * 6, n)[:ndim] if axis == 0 else (n, n * 6)
benchmark(fn, *test_values)
def test_join_negative_axis_rewrite(self): def test_join_negative_axis_rewrite(self):
"""Test that constant negative axis is rewritten to positive axis in make_node.""" """Test that constant negative axis is rewritten to positive axis in make_node."""
v = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]], dtype=self.floatX) v = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]], dtype=self.floatX)
......
...@@ -443,94 +443,3 @@ class TestSdotNoFlags(TestCGemvNoFlags): ...@@ -443,94 +443,3 @@ class TestSdotNoFlags(TestCGemvNoFlags):
class TestBlasStridesC(TestBlasStrides): class TestBlasStridesC(TestBlasStrides):
mode = mode_blas_opt mode = mode_blas_opt
def test_gemv_vector_dot_perf(benchmark):
    """Benchmark a vector-vector dot product expressed as an inplace CGemv."""
    size = 400_000
    lhs = pt.vector("A", shape=(size,))
    rhs = pt.vector("x", shape=(size,))
    # A (1 x N) @ (N,) gemv is equivalent to a dot product of two vectors.
    gemv_out = CGemv(inplace=True)(pt.empty((1,)), 1.0, lhs[None], rhs, 0.0)
    compiled = pytensor.function(
        [lhs, rhs], gemv_out, accept_inplace=True, trust_input=True
    )

    rng = np.random.default_rng(430)
    lhs_val = rng.normal(size=size)
    rhs_val = rng.normal(size=size)
    # Sanity-check against NumPy before timing.
    np.testing.assert_allclose(compiled(lhs_val, rhs_val), np.dot(lhs_val, rhs_val))
    benchmark(compiled, lhs_val, rhs_val)
@pytest.mark.parametrize(
    "neg_stride1", (True, False), ids=["neg_stride1", "pos_stride1"]
)
@pytest.mark.parametrize(
    "neg_stride0", (True, False), ids=["neg_stride0", "pos_stride0"]
)
@pytest.mark.parametrize("F_layout", (True, False), ids=["F_layout", "C_layout"])
def test_gemv_negative_strides_perf(neg_stride0, neg_stride1, F_layout, benchmark):
    """Benchmark CGemv when the matrix operand has F-order and/or negative strides."""
    A = pt.matrix("A", shape=(512, 512))
    x = pt.vector("x", shape=(A.type.shape[-1],))
    y = pt.vector("y", shape=(A.type.shape[0],))
    fn = pytensor.function(
        [A, x, y], CGemv(inplace=False)(y, 1.0, A, x, 1.0), trust_input=True
    )

    rng = np.random.default_rng(430)
    A_val = rng.normal(size=A.type.shape)
    x_val = rng.normal(size=x.type.shape)
    y_val = rng.normal(size=y.type.shape)

    # Build the requested stride pattern via views (no copies).
    if F_layout:
        A_val = A_val.T
    if neg_stride0:
        A_val = A_val[::-1]
    if neg_stride1:
        A_val = A_val[:, ::-1]
    assert (A_val.strides[0] < 0) == neg_stride0
    assert (A_val.strides[1] < 0) == neg_stride1

    # Check result is correct by using a copy of A with positive strides
    res = fn(A_val, x_val, y_val)
    np.testing.assert_allclose(res, fn(A_val.copy(), x_val, y_val))

    benchmark(fn, A_val, x_val, y_val)
@pytest.mark.parametrize("inplace", (True, False), ids=["inplace", "no_inplace"])
@pytest.mark.parametrize("n", [2**7, 2**9, 2**13])
def test_ger_benchmark(n, inplace, benchmark):
    """Benchmark the rank-1 update ``A + alpha * outer(x, y)`` (GER)."""
    alpha = pt.dscalar("alpha")
    x = pt.dvector("x")
    y = pt.dvector("y")
    A = pt.dmatrix("A")

    fn = pytensor.function(
        [alpha, x, y, pytensor.In(A, mutable=inplace)],
        alpha * pt.outer(x, y) + A,
        trust_input=True,
    )

    # Seed depends on n so each size gets distinct (but reproducible) data.
    rng = np.random.default_rng([2274, n])
    test_inputs = [
        rng.normal(size=()),
        rng.normal(size=(n,)),
        rng.normal(size=(n,)),
        rng.normal(size=(n, n)),
    ]
    benchmark(fn, *test_inputs)
...@@ -15,9 +15,6 @@ from pytensor.graph.replace import _vectorize_node, vectorize_graph ...@@ -15,9 +15,6 @@ from pytensor.graph.replace import _vectorize_node, vectorize_graph
from pytensor.link.numba import NumbaLinker from pytensor.link.numba import NumbaLinker
from pytensor.raise_op import assert_op from pytensor.raise_op import assert_op
from pytensor.tensor import ( from pytensor.tensor import (
diagonal,
dmatrix,
log,
matrices, matrices,
matrix, matrix,
ones_like, ones_like,
...@@ -34,7 +31,6 @@ from pytensor.tensor.nlinalg import MatrixInverse, eig ...@@ -34,7 +31,6 @@ from pytensor.tensor.nlinalg import MatrixInverse, eig
from pytensor.tensor.random import normal from pytensor.tensor.random import normal
from pytensor.tensor.random.op import default_rng from pytensor.tensor.random.op import default_rng
from pytensor.tensor.rewriting.blas import specialize_matmul_to_batched_dot from pytensor.tensor.rewriting.blas import specialize_matmul_to_batched_dot
from pytensor.tensor.signal import convolve1d
from pytensor.tensor.slinalg import ( from pytensor.tensor.slinalg import (
Cholesky, Cholesky,
Solve, Solve,
...@@ -530,66 +526,6 @@ class TestSolveMatrix(BlockwiseOpTester): ...@@ -530,66 +526,6 @@ class TestSolveMatrix(BlockwiseOpTester):
signature = "(m, m),(m, n) -> (m, n)" signature = "(m, m),(m, n) -> (m, n)"
@pytest.mark.parametrize(
    "mu_batch_shape", [(), (1000,), (4, 1000)], ids=lambda arg: f"mu:{arg}"
)
@pytest.mark.parametrize(
    "cov_batch_shape", [(), (1000,), (4, 1000)], ids=lambda arg: f"cov:{arg}"
)
def test_batched_mvnormal_logp_and_dlogp(mu_batch_shape, cov_batch_shape, benchmark):
    """Benchmark the logp and gradients of a batched multivariate normal."""
    rng = np.random.default_rng(sum(map(ord, "batched_mvnormal")))
    # The value broadcasts against both mu and cov: use the larger batch shape.
    value_batch_shape = (
        cov_batch_shape
        if len(cov_batch_shape) > len(mu_batch_shape)
        else mu_batch_shape
    )

    value = tensor("value", shape=(*value_batch_shape, 10))
    mu = tensor("mu", shape=(*mu_batch_shape, 10))
    cov = tensor("cov", shape=(*cov_batch_shape, 10, 10))
    test_values = [
        rng.normal(size=value.type.shape),
        rng.normal(size=mu.type.shape),
        # Diagonal covariances, guaranteed positive-definite.
        np.eye(cov.type.shape[-1]) * np.abs(rng.normal(size=cov.type.shape)),
    ]

    # logp of MvNormal via the Cholesky factor of cov.
    chol = cholesky(cov, lower=True, on_error="raise")
    whitened = solve_triangular(chol, value - mu, b_ndim=1)
    quaddist = (whitened**2).sum(axis=-1)
    chol_diag = diagonal(chol, axis1=-2, axis2=-1)
    logdet = log(chol_diag).sum(axis=-1)
    k = value.shape[-1]
    norm_const = -0.5 * k * (np.log(2 * np.pi))
    logp = norm_const - 0.5 * quaddist - logdet
    dlogp = grad(logp.sum(), wrt=[value, mu, cov])

    fn = pytensor.function([value, mu, cov], [logp, *dlogp])
    benchmark(fn, *test_values)
def test_small_blockwise_performance(benchmark):
    """Benchmark a batched 1d convolution that compiles to a small Blockwise op."""
    a = dmatrix(shape=(7, 128))
    b = dmatrix(shape=(7, 20))
    fn = pytensor.function([a, b], convolve1d(a, b, mode="valid"), trust_input=True)
    # The rewrite pipeline must have kept the op as a Blockwise.
    assert isinstance(
        fn.maker.fgraph.outputs[0].owner.op, Blockwise | BlockwiseWithCoreShape
    )

    rng = np.random.default_rng(495)
    a_val = rng.normal(size=a.type.shape)
    b_val = rng.normal(size=b.type.shape)
    expected = [
        np.convolve(a_val[i], b_val[i], mode="valid") for i in range(a_val.shape[0])
    ]
    np.testing.assert_allclose(fn(a_val, b_val), expected)
    benchmark(fn, a_val, b_val)
def test_cop_with_params(): def test_cop_with_params():
matrix_assert = Blockwise(core_op=assert_op, signature="(x1,x2),()->(x1,x2)") matrix_assert = Blockwise(core_op=assert_op, signature="(x1,x2),()->(x1,x2)")
......
import itertools
import math import math
import re import re
import tracemalloc import tracemalloc
...@@ -11,7 +10,7 @@ import pytensor ...@@ -11,7 +10,7 @@ import pytensor
import pytensor.scalar as ps import pytensor.scalar as ps
import pytensor.tensor as pt import pytensor.tensor as pt
import tests.unittest_tools as utt import tests.unittest_tools as utt
from pytensor import In, Out, config, grad from pytensor import In, config, grad
from pytensor.compile.function import function from pytensor.compile.function import function
from pytensor.compile.mode import Mode, get_default_mode from pytensor.compile.mode import Mode, get_default_mode
from pytensor.graph.basic import Apply, Variable from pytensor.graph.basic import Apply, Variable
...@@ -41,7 +40,6 @@ from pytensor.tensor.type import ( ...@@ -41,7 +40,6 @@ from pytensor.tensor.type import (
matrix, matrix,
scalar, scalar,
tensor, tensor,
tensor3,
vector, vector,
vectors, vectors,
) )
...@@ -80,30 +78,6 @@ def reduce_bitwise_and(x, axis=-1, dtype="int8"): ...@@ -80,30 +78,6 @@ def reduce_bitwise_and(x, axis=-1, dtype="int8"):
return np.apply_along_axis(custom_reduce, axis, x) return np.apply_along_axis(custom_reduce, axis, x)
def dimshuffle_benchmark(mode, c_contiguous, benchmark):
    """Benchmark every 3d transpose plus dim insertions on a single tensor3 input."""
    x = tensor3("x")
    if c_contiguous:
        x_val = np.random.random((2, 3, 4)).astype(config.floatX)
    else:
        # Non-contiguous input: transpose a much larger array.
        x_val = np.random.random((200, 300, 400)).transpose(1, 2, 0)
    outputs = [x.transpose(perm) for perm in itertools.permutations((0, 1, 2))]
    outputs += [x[None], x[:, None], x[:, :, None], x[:, :, :, None]]
    # Borrow to avoid deepcopy overhead
    fn = pytensor.function(
        [In(x, borrow=True)],
        [Out(out, borrow=True) for out in outputs],
        mode=mode,
    )
    fn.trust_input = True
    fn(x_val)  # JIT compile for JIT backends
    benchmark(fn, x_val)
class TestDimShuffle(unittest_tools.InferShapeTester): class TestDimShuffle(unittest_tools.InferShapeTester):
op = DimShuffle op = DimShuffle
type = TensorType type = TensorType
...@@ -261,10 +235,6 @@ class TestDimShuffle(unittest_tools.InferShapeTester): ...@@ -261,10 +235,6 @@ class TestDimShuffle(unittest_tools.InferShapeTester):
with pytest.raises(TypeError, match="input_ndim must be an integer"): with pytest.raises(TypeError, match="input_ndim must be an integer"):
DimShuffle(input_ndim=(True, False), new_order=(1, 0)) DimShuffle(input_ndim=(True, False), new_order=(1, 0))
@pytest.mark.parametrize("c_contiguous", [True, False])
def test_benchmark(self, c_contiguous, benchmark):
    """Run the shared dimshuffle benchmark in the C (FAST_RUN) backend."""
    dimshuffle_benchmark(
        mode="FAST_RUN", c_contiguous=c_contiguous, benchmark=benchmark
    )
class TestBroadcast: class TestBroadcast:
# this is to allow other types to reuse this class to test their ops # this is to allow other types to reuse this class to test their ops
...@@ -1077,38 +1047,6 @@ class TestVectorize: ...@@ -1077,38 +1047,6 @@ class TestVectorize:
assert vect_out.owner.inputs[0] is bool_tns assert vect_out.owner.inputs[0] is bool_tns
def careduce_benchmark_tester(axis, c_contiguous, mode, benchmark):
    """Compile a sum over ``axis`` of a (possibly transposed) 256^3 shared array and benchmark it."""
    side = 256
    data = np.random.uniform(size=(side, side, side))
    # A non-trivial permutation makes the reduction walk non-contiguous memory.
    perm = (0, 1, 2) if c_contiguous else (2, 0, 1)
    x = pytensor.shared(data, name="x", shape=data.shape)
    fn = pytensor.function([], x.transpose(perm).sum(axis=axis), mode=mode)
    # Verify correctness against NumPy before timing.
    np.testing.assert_allclose(fn(), data.transpose(perm).sum(axis=axis))
    benchmark(fn)
@pytest.mark.parametrize(
    "axis",
    (0, 1, 2, (0, 1), (0, 2), (1, 2), None),
    ids=lambda x: f"axis={x}",
)
@pytest.mark.parametrize(
    "c_contiguous",
    (True, False),
    ids=lambda x: f"c_contiguous={x}",
)
def test_c_careduce_benchmark(axis, c_contiguous, benchmark):
    """Run the CAReduce benchmark in the C (FAST_RUN) backend."""
    return careduce_benchmark_tester(
        axis=axis, c_contiguous=c_contiguous, mode="FAST_RUN", benchmark=benchmark
    )
def test_gradient_mixed_discrete_output_scalar_op(): def test_gradient_mixed_discrete_output_scalar_op():
class MixedDtypeScalarOp(ScalarOp): class MixedDtypeScalarOp(ScalarOp):
def make_node(self, *inputs): def make_node(self, *inputs):
......
...@@ -12,7 +12,7 @@ from pytensor.compile.mode import get_default_mode ...@@ -12,7 +12,7 @@ from pytensor.compile.mode import get_default_mode
from pytensor.configdefaults import config from pytensor.configdefaults import config
from pytensor.gradient import NullTypeGradError, verify_grad from pytensor.gradient import NullTypeGradError, verify_grad
from pytensor.scalar import ScalarLoop from pytensor.scalar import ScalarLoop
from pytensor.tensor import gammaincc, kn, kv, kve, vector from pytensor.tensor import kn, kv, kve, vector
from pytensor.tensor.elemwise import Elemwise from pytensor.tensor.elemwise import Elemwise
from tests import unittest_tools as utt from tests import unittest_tools as utt
from tests.tensor.utils import ( from tests.tensor.utils import (
...@@ -337,25 +337,6 @@ def test_gammainc_ddk_tabulated_values(): ...@@ -337,25 +337,6 @@ def test_gammainc_ddk_tabulated_values():
) )
def test_gammaincc_ddk_performance(benchmark):
    """Benchmark the gradient of gammaincc wrt k on its slower branch."""
    rng = np.random.default_rng(1)
    k = vector("k")
    x = vector("x")
    grad_fn = function(
        [k, x],
        grad(gammaincc(k, x).sum(), wrt=[k]),
        mode="FAST_RUN",
        trust_input=True,
    )
    # Values that hit the second branch of the gradient
    vals = [
        np.full((1000,), 3.2, dtype=k.dtype),
        np.full((1000,), 0.01, dtype=x.dtype),
    ]
    verify_grad(gammaincc, vals, rng=rng)
    benchmark(grad_fn, *vals)
TestGammaUBroadcast = makeBroadcastTester( TestGammaUBroadcast = makeBroadcastTester(
op=pt.gammau, op=pt.gammau,
expected=expected_gammau, expected=expected_gammau,
...@@ -888,30 +869,6 @@ class TestHyp2F1Grad: ...@@ -888,30 +869,6 @@ class TestHyp2F1Grad:
rtol=rtol, rtol=rtol,
) )
@pytest.mark.parametrize("case", (few_iters_case, many_iters_case))
@pytest.mark.parametrize("wrt", ("a", "all"))
def test_benchmark(self, case, wrt, benchmark):
    """Benchmark the hyp2f1 gradient and check it against tabulated values."""
    a1, a2, b1, z = pt.scalars("a1", "a2", "b1", "z")
    hyp2f1_out = pt.hyp2f1(a1, a2, b1, z)
    wrt_vars = a1 if wrt == "a" else [a1, a2, b1, z]
    f_grad = function([a1, a2, b1, z], pt.grad(hyp2f1_out, wrt=wrt_vars), trust_input=True)

    raw_inputs = case[:4]
    expected_dds = list(case[4:])
    # Cast each test value to its variable's dtype before calling.
    inputs = [
        np.array(val, dtype=var.dtype)
        for val, var in zip(raw_inputs, (a1, a2, b1, z))
    ]
    result = benchmark(f_grad, *inputs)

    rtol = 1e-9 if config.floatX == "float64" else 2e-3
    expected = expected_dds[0] if wrt == "a" else np.array(expected_dds)
    np.testing.assert_allclose(result, expected, rtol=rtol)
@pytest.mark.parametrize("wrt", ([0], [1], [2], [0, 1], [1, 2], [0, 2], [0, 1, 2])) @pytest.mark.parametrize("wrt", ([0], [1], [2], [0, 1], [1, 2], [0, 2], [0, 1, 2]))
def test_unused_grad_loop_opt(self, wrt): def test_unused_grad_loop_opt(self, wrt):
"""Test that we don't compute unnecessary outputs in the grad scalar loop""" """Test that we don't compute unnecessary outputs in the grad scalar loop"""
......
...@@ -4,7 +4,7 @@ import numpy as np ...@@ -4,7 +4,7 @@ import numpy as np
import pytest import pytest
import pytensor import pytensor
from pytensor import In, Mode, Out, function, grad from pytensor import Mode, function, grad
from pytensor.compile.ops import DeepCopyOp from pytensor.compile.ops import DeepCopyOp
from pytensor.configdefaults import config from pytensor.configdefaults import config
from pytensor.graph.basic import Variable, equal_computations from pytensor.graph.basic import Variable, equal_computations
...@@ -382,20 +382,6 @@ class TestReshape(utt.InferShapeTester, utt.OptimizationTestMixin): ...@@ -382,20 +382,6 @@ class TestReshape(utt.InferShapeTester, utt.OptimizationTestMixin):
np.arange(8).reshape(test_shape), np.arange(8).reshape(test_shape),
) )
def test_benchmark(self, benchmark):
    """Benchmark compiled reshapes of one tensor3 to three target shapes."""
    x = tensor3("x")
    x_val = np.random.random((2, 3, 4)).astype(config.floatX)
    target_shapes = [(6, 4), (2, 12), (-1,)]
    outs = [x.reshape(shp) for shp in target_shapes]
    # Borrow to avoid deepcopy overhead
    reshape_fn = pytensor.function(
        [In(x, borrow=True)],
        [Out(out, borrow=True) for out in outs],
    )
    reshape_fn.trust_input = True
    benchmark(reshape_fn, x_val)
def test_shape_i_hash(): def test_shape_i_hash():
assert isinstance(Shape_i(np.int64(1)).__hash__(), int) assert isinstance(Shape_i(np.int64(1)).__hash__(), int)
......
...@@ -81,16 +81,6 @@ def test_cholesky(): ...@@ -81,16 +81,6 @@ def test_cholesky():
check_upper_triangular(pd, ch_f) check_upper_triangular(pd, ch_f)
def test_cholesky_performance(benchmark):
    """Benchmark the Cholesky factorization of a small 10x10 SPD matrix."""
    rng = np.random.default_rng(utt.fetch_seed())
    r = rng.standard_normal((10, 10)).astype(config.floatX)
    # r @ r.T is symmetric positive (semi-)definite by construction.
    spd = np.dot(r, r.T)
    x = matrix()
    ch_f = function([x], cholesky(x))
    benchmark(ch_f, spd)
def test_cholesky_empty(): def test_cholesky_empty():
empty = np.empty([0, 0], dtype=config.floatX) empty = np.empty([0, 0], dtype=config.floatX)
x = matrix() x = matrix()
......
...@@ -3171,57 +3171,6 @@ def test_flip(size: tuple[int]): ...@@ -3171,57 +3171,6 @@ def test_flip(size: tuple[int]):
np.testing.assert_allclose(expected, f(x), atol=ATOL, rtol=RTOL) np.testing.assert_allclose(expected, f(x), atol=ATOL, rtol=RTOL)
class TestBenchmarks:
    """Benchmarks for AdvancedSubtensor1 / AdvancedIncSubtensor1 with repeated integer indices."""

    @pytest.mark.parametrize(
        "static_shape", (False, True), ids=lambda x: f"static_shape={x}"
    )
    @pytest.mark.parametrize("gc", (False, True), ids=lambda x: f"gc={x}")
    def test_advanced_subtensor1(self, static_shape, gc, benchmark):
        x = vector("x", shape=(85 if static_shape else None,))
        x_test = np.random.normal(size=(85,))
        idxs_test = np.arange(85).repeat(11)
        # With static shape and constant indices we know all idxs are valid
        # And can use faster mode in numpy.take
        indexed = x[idxs_test]
        fn = pytensor.function(
            [x],
            pytensor.Out(indexed, borrow=True),
            on_unused_input="ignore",
            trust_input=True,
        )
        fn.vm.allow_gc = gc
        benchmark(fn, x_test, idxs_test)

    @pytest.mark.parametrize(
        "static_shape", (False, True), ids=lambda x: f"static_shape={x}"
    )
    @pytest.mark.parametrize("gc", (False, True), ids=lambda x: f"gc={x}")
    @pytest.mark.parametrize("func", (inc_subtensor, set_subtensor))
    def test_advanced_incsubtensor1(self, func, static_shape, gc, benchmark):
        x = vector("x", shape=(85 if static_shape else None,))
        x_test = np.zeros((85,))
        buffer = ptb.zeros_like(x)
        y_test = np.random.normal(size=(85 * 11,))
        idxs_test = np.arange(85).repeat(11)
        # With static shape and constant indices we know all idxs are valid
        # Reuse same buffer of zeros, to check we rather allocate twice than copy inside IncSubtensor
        out1 = func(buffer[idxs_test], y_test)
        out2 = func(buffer[idxs_test[::-1]], y_test)
        fn = pytensor.function(
            [x],
            [pytensor.Out(out1, borrow=True), pytensor.Out(out2, borrow=True)],
            on_unused_input="ignore",
            trust_input=True,
        )
        fn.vm.allow_gc = gc
        benchmark(fn, x_test)
def test_subtensor_hash_and_eq(): def test_subtensor_hash_and_eq():
s1 = Subtensor(idx_list=[slice(None, None, None), 0]) s1 = Subtensor(idx_list=[slice(None, None, None), 0])
s2 = Subtensor(idx_list=[slice(None, None, None), 0]) s2 = Subtensor(idx_list=[slice(None, None, None), 0])
......
...@@ -35,7 +35,7 @@ from pytensor.graph.op import Op ...@@ -35,7 +35,7 @@ from pytensor.graph.op import Op
from pytensor.graph.traversal import graph_inputs from pytensor.graph.traversal import graph_inputs
from pytensor.scalar import float64 from pytensor.scalar import float64
from pytensor.scan.op import Scan from pytensor.scan.op import Scan
from pytensor.tensor.math import add, dot, exp, outer, sigmoid, sqr, sqrt, tanh from pytensor.tensor.math import add, dot, exp, outer, sigmoid, sqr, tanh
from pytensor.tensor.math import sum as pt_sum from pytensor.tensor.math import sum as pt_sum
from pytensor.tensor.random import RandomStream from pytensor.tensor.random import RandomStream
from pytensor.tensor.type import ( from pytensor.tensor.type import (
...@@ -1136,33 +1136,6 @@ class TestJacobian: ...@@ -1136,33 +1136,6 @@ class TestJacobian:
val = np.ones((4, 4), dtype=config.floatX) val = np.ones((4, 4), dtype=config.floatX)
np.testing.assert_allclose(func_v(val, val), np.zeros((3, 2, 4, 4))) np.testing.assert_allclose(func_v(val, val), np.zeros((3, 2, 4, 4)))
def test_benchmark(self, vectorize, benchmark):
    """Benchmark the jacobian of ``outer(x, x)`` for a length-3 vector."""
    x = vector("x", shape=(3,))
    jac = jacobian(outer(x, x), x, vectorize=vectorize)
    fn = function([x], jac, trust_input=True)
    x_test = np.array([0, 1, 2], dtype=x.type.dtype)
    benchmark(fn, x_test)
def test_benchmark_partial_jacobian(self, vectorize, benchmark):
    """Benchmark extracting a small corner of a large jacobian.

    Example from https://github.com/jax-ml/jax/discussions/5904#discussioncomment-422956
    """
    N = 1000
    rng = np.random.default_rng(2025)
    x_test = rng.random((N,))
    f_mat = rng.random((N, N))

    x = vector("x", dtype="float64")
    out = sqrt(f_mat @ x / N)
    # Only a 5x5 corner of the full jacobian is requested.
    partial_jacobian = jacobian(out, x, vectorize=vectorize)[:5, :5]
    fn = pytensor.function([x], partial_jacobian, trust_input=True)
    benchmark(fn, x_test)
def test_hessian(): def test_hessian():
x = vector() x = vector()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论