test: adds search-replace evaluation suite (#3205)

See `src/__tests__/evals/README.md` for usage.

Other notes:

- The test fixtures are 300+ lines each. Even so, I still think some of them are a little too easy. I might swap some of them out for more challenging ones, or edit them so that they're not so straightforward.
- This currently still only tests `search_replace`, so I don't yet have a way to compare correctness/token usage/time taken of `search_replace` vs `edit_file` vs `write_file`.
- Otherwise, though, I think I'm fairly thorough about collecting data. One thing I'm missing is the cost (it would probably be a rough estimate at best), but I'm at least able to store the number of input/output tokens for each tool call.

---

<a href="https://app.devin.ai/review/dyad-sh/dyad/pull/3205" target="_blank"> <picture> <source media="(prefers-color-scheme: dark)" srcset="https://static.devin.ai/assets/gh-open-in-devin-review-dark.svg?v=1"> <img src="https://static.devin.ai/assets/gh-open-in-devin-review-light.svg?v=1" alt="Open with Devin"> </picture> </a>

test: adds search-replace evaluation suite (#3205)
5f823117 · Ryan Groch · GitHub · 9dbc0630 · 5f823117 · 5f823117
--- a/.gitignore
+++ b/.gitignore
@@ -113,3 +113,6 @@ __pycache__/

 # Storybook
 storybook-static/
+
+# Eval framework — run results
+eval-results/
--- a/package.json
+++ b/package.json
@@ -35,6 +35,7 @@
    "fmt:check": "npx oxfmt --check",
    "fmt": "npx oxfmt",
    "presubmit": "npm run fmt:check && npm run lint",
+    "eval": "cross-env NODE_OPTIONS=--no-deprecation vitest run --config vitest.eval.config.ts",
    "test": "cross-env NODE_OPTIONS=--no-deprecation VITE_CJS_IGNORE_WARNING=true vitest run",
    "test:watch": "cross-env NODE_OPTIONS=--no-deprecation VITE_CJS_IGNORE_WARNING=true vitest",
    "test:ui": "cross-env NODE_OPTIONS=--no-deprecation VITE_CJS_IGNORE_WARNING=true vitest --ui",

--- a/src/__tests__/evals/README.md
+++ b/src/__tests__/evals/README.md
+# Evals
+
+LLM eval suite for tool-use quality. Six suites run the same 16 cases against
+the same three models (Claude Sonnet 4.6, GPT 5.4, Gemini 3 Flash), but with
+different tool sets and system prompts:
+
+| Suite name                | Tools available                             | System prompt                                 |
+| ------------------------- | ------------------------------------------- | --------------------------------------------- |
+| `search_replace`          | `search_replace` only                       | Minimal custom "precise code editor" prompt   |
+| `search_replace_few`      | `search_replace` only                       | Variant prompt encouraging fewer tool calls   |
+| `edit_file`               | `edit_file` only                            | Minimal custom `edit_file` prompt             |
+| `basic_agent`             | `search_replace`, `write_file`              | Production `LOCAL_AGENT_BASIC_SYSTEM_PROMPT`  |
+| `pro_agent`               | `search_replace`, `edit_file`, `write_file` | Production `LOCAL_AGENT_SYSTEM_PROMPT` (Pro)  |
+| `pro_agent_experimental`  | `search_replace`, `edit_file`, `write_file` | Editable copy of the Pro prompt for tweaking  |
+
+Each case gives the model a real source file plus an editing instruction,
+runs the model with the suite's tools wired up, applies the produced edits,
+and then asks an LLM judge (GPT 5.4) whether the result satisfies the
+instruction.
+
+## Prerequisites
+
+All models are routed through the Dyad Engine gateway, so you only need one
+credential: a Dyad Pro API key, exposed as `DYAD_PRO_API_KEY`. The
+`edit_file` tool additionally calls the engine's `/tools/turbo-file-edit`
+endpoint to apply sketched edits — that uses the same key.
+
+The suite is skipped entirely when `DYAD_PRO_API_KEY` is unset — no tests will
+fail, they just won't run. This keeps regular `vitest run` safe for contributors
+without a key.
+
+Export the key for the session, then pass the two required filter vars (see
+[Running the suite](#running-the-suite)) on the command line:
+
+```bash
+export DYAD_PRO_API_KEY="..."
+EVAL_SUITE=all EVAL_MODEL=all npm run eval
+```
+
+Or set everything inline for a single command:
+
+```bash
+DYAD_PRO_API_KEY="..." EVAL_SUITE=all EVAL_MODEL=all npm run eval
+```
+
+Optional: override the gateway URL with `DYAD_ENGINE_URL` (defaults to
+`https://engine.dyad.sh/v1`).
+
+## Running the suite
+
+**Both `EVAL_SUITE` and `EVAL_MODEL` are required.** A full run of every
+suite against every model is expensive, so the suite will not run unless
+the caller opts in explicitly. If either variable is unset, the eval prints
+a warning describing how to configure it and registers a single skipped
+placeholder — it does not fail CI, but it also does not run any cases.
+
+Use the special value `all` to mean "run everything":
+
+```bash
+# Run every suite against every model against every case.
+EVAL_SUITE=all EVAL_MODEL=all DYAD_PRO_API_KEY="..." npm run eval
+```
+
+**Heads up — this is expensive.** A full `all`/`all` run issues one
+generation and one judge call per (suite × model × case) triple — 6 suites
+× 3 models × 16 cases = 288 triples. The `edit_file`, `pro_agent`, and
+`pro_agent_experimental` suites also make additional engine calls for each
+sketched edit the model produces through `edit_file`. Expect several hundred
+LLM requests, some of which run reasoning models on 300+ line fixtures. Use
+sparingly; prefer narrow filters during development.
+
+### Running a single suite
+
+Set `EVAL_SUITE` to the exact `name` (case-insensitive) of the suite — the
+same name that appears as a folder under `eval-results/`. A comma-separated
+list runs multiple suites:
+
+```bash
+# Just the original search_replace-only suite
+EVAL_SUITE=search_replace EVAL_MODEL=all DYAD_PRO_API_KEY="..." npm run eval
+
+# The basic_agent suite (Basic agent prompt, search_replace + write_file)
+EVAL_SUITE=basic_agent EVAL_MODEL=all DYAD_PRO_API_KEY="..." npm run eval
+
+# The pro_agent suite (Pro agent prompt, search_replace + edit_file + write_file)
+EVAL_SUITE=pro_agent EVAL_MODEL=all DYAD_PRO_API_KEY="..." npm run eval
+```
+
+Note: `EVAL_SUITE` matches suite `name`s exactly (case-insensitive), and
+accepts a comma-separated list for multiple suites (e.g.
+`EVAL_SUITE=search_replace,edit_file`). Unknown names error out with the
+available list.
+
+### Running a single case
+
+Vitest's `-t` flag filters by test name. Case names are the `name` field in
+the `CASES` array of [tool_use.eval.ts](tool_use.eval.ts).
+
+```bash
+EVAL_SUITE=all EVAL_MODEL=all DYAD_PRO_API_KEY="..." \
+  npm run eval -- -t "Extract a helper function"
+```
+
+`-t` is a regex pattern matched anywhere in the test name, so a short unique
+fragment works too:
+
+```bash
+EVAL_SUITE=all EVAL_MODEL=all DYAD_PRO_API_KEY="..." npm run eval -- -t "zod"
+```
+
+### Running against one model
+
+Set `EVAL_MODEL` to a case-insensitive substring of the model's label or
+model name. It matches against both, so short fragments like `sonnet`, `gpt`,
+or `gemini` work:
+
+```bash
+EVAL_SUITE=all EVAL_MODEL=sonnet DYAD_PRO_API_KEY="..." npm run eval
+```
+
+### Combining filters
+
+`EVAL_SUITE`, `EVAL_MODEL`, and `-t` compose. A tight development loop:
+
+```bash
+EVAL_SUITE=search_replace EVAL_MODEL=sonnet \
+  DYAD_PRO_API_KEY="..." npm run eval -- -t "Extract a helper function"
+```
+
+Note: vitest's `-t` pattern is applied across the full describe/test
+hierarchy as a regex, which makes "model label > case name" style patterns
+brittle across vitest versions. Prefer `EVAL_SUITE` / `EVAL_MODEL` for
+suite and model filtering and reserve `-t` for case-name filtering.
+
+## Where results are stored
+
+Every run writes structured output to `eval-results/` at the repo root. The
+directory is gitignored and never cleaned automatically — delete old runs by
+hand when you want to.
+
+Layout:
+
+```
+eval-results/
+  <suite-name>/                          ← one top-level folder per suite
+    <run-start-ts>__<model-label>/       ← one folder per (run, model)
+      <case-name>/                       ← one folder per case
+        record.json                      ← full structured record
+        record.txt                       ← human-readable render of the same
+        details/                         ← per-record split views
+          file_before.<ext>              ← file at the start of the run
+          file_after.<ext>               ← file at the end of the run
+          diff.patch                     ← cumulative unified diff
+          system_prompt.txt              ← system prompt sent to the model
+          instructions.txt               ← case instructions (no file content)
+          user_prompt.txt                ← full user message (file + instructions)
+          metadata.json                  ← run metadata without big blobs
+          metadata.txt                   ← same info, human-readable
+        tool_calls/
+          01.txt                         ← combined view of tool call #1
+          01/                            ← split view, one piece per file
+            file_before.<ext>
+            file_after.<ext>
+            diff.patch
+            meta.txt
+            <arg_name>.<ext>             ← one file per tool arg (see below)
+          02.txt
+          02/
+          ...
+```
+
+The top-level folder is the suite `name`, so each suite lands in its own
+directory:
+
+- `eval-results/search_replace/`
+- `eval-results/search_replace_few/`
+- `eval-results/edit_file/`
+- `eval-results/basic_agent/`
+- `eval-results/pro_agent/`
+- `eval-results/pro_agent_experimental/`
+
+`<run-start-ts>` is captured once at process start, so every case from the
+same `npm run eval` invocation for a given (suite, model) pair clusters into
+one folder. Folder names sort chronologically under `ls`.
+
+### Record format
+
+`record.json` contains the complete machine-readable record. Key fields:
+
+- `timestamp`, `suite`, `caseName` — identifying metadata.
+- `model` — `{label, provider, modelName, responseModelId}`. `responseModelId`
+  is the exact model string the gateway echoed back, which can differ from
+  `modelName` (e.g. dated snapshots).
+- `prompt` — `{system, instructions, user}`. `system` is the full system
+  prompt sent to the model (including the production agent prompts when the
+  suite uses one). `instructions` is the bare case instruction — useful for
+  scanning what was asked without the fixture file inlined. `user` is the
+  full user message actually sent (file content + instructions).
+- `file` — `{name, before, after}`. The fixture file name plus its content
+  at the start and end of the run. `before` / `after` are also written to
+  `details/file_before.<ext>` / `details/file_after.<ext>` for easy editor
+  opening with matching syntax highlighting.
+- `llm.totalDurationMs`, `llm.totalUsage` — wall-clock time and token totals
+  for the model under test (not the judge).
+- `llm.requests` — per-step breakdown: each entry is one HTTP round-trip with
+  its own duration, usage, and `finishReason`.
+- `toolCalls` — every tool call the model made. Each entry records
+  `toolName`, `filePath`, an `args` map (keyed by the tool's parameter names,
+  so `old_string`/`new_string` for `search_replace`, `content` for
+  `write_file`, `content`/`instructions` for `edit_file`), the file before
+  and after the call, and a unified diff of just that call.
+- `diff` — unified diff from the original fixture to the final file
+  (i.e. the cumulative effect of all tool calls).
+- `judge` — the judge's verdict: `label`, `modelName`, `durationMs`,
+  `usage`, `pass` (boolean), and `explanation` (the judge's written
+  reasoning, with the trailing `PASS`/`FAIL` verdict line stripped).
+- `passed` — the overall test outcome. Requires the judge to say `PASS` *and*
+  all structural checks to pass *and* no exceptions to be thrown.
+- `errorMessage` — set when the test threw (tool-call failure, structural
+  check failure, judge FAIL, etc.); `null` otherwise.
+
+`record.txt` is a readable render of the same information — headers, the
+system prompt and instructions, inline tool-call bodies, usage totals, the
+final diff, and the judge's explanation. Open it when you want a quick
+human-readable summary instead of parsing JSON.
+
+### The `details/` folder
+
+`details/` is a split view of the record, intended for quick inspection and
+diffing without having to parse JSON or scroll through `record.txt`:
+
+- `file_before.<ext>` / `file_after.<ext>` — raw file content before and
+  after the run, with the fixture's extension preserved so editors apply
+  the right syntax highlighting.
+- `diff.patch` — the same unified diff as `record.diff`.
+- `system_prompt.txt`, `instructions.txt`, `user_prompt.txt` — the three
+  views of the prompt input.
+- `metadata.json` / `metadata.txt` — everything from `record.json` minus the
+  large content blobs that already have their own files (no inline file
+  contents and no per-tool-call entries). Useful for skimming token counts,
+  judge verdict, and model identity across many runs.
+
+### The `tool_calls/` folder
+
+One `NN.txt` (combined view) and one `NN/` folder (split view) per tool
+call. The split view contains the raw pieces as standalone files:
+
+- `file_before.<ext>`, `file_after.<ext>`, `diff.patch` — file state around
+  the single call.
+- `meta.txt` — timestamp, tool name, target path, and per-arg length summary.
+- One file per tool argument, named after the arg's key. String args use the
+  target file's extension (for syntax highlighting); non-string args become
+  JSON blobs. So a `search_replace` call produces `old_string.ts` and
+  `new_string.ts`; a `write_file` call produces `content.ts` and
+  `description.ts`; an `edit_file` call produces `content.ts` and
+  `instructions.ts`.
--- a/src/__tests__/evals/fixtures/UserProfile.tsx
+++ b/src/__tests__/evals/fixtures/UserProfile.tsx
+// UserProfile.tsx — class-based user profile component
+
+import React from "react";
+import { fetchUser, updateUser, fetchUserActivity } from "../services/userService";
+import type { User, ActivityEntry } from "../types";
+
+/** Props accepted by the class-based UserProfile component. */
+interface Props {
+  /** Id passed to fetchUser, updateUser and fetchUserActivity. */
+  userId: string;
+  /** Optional callback invoked with the updated user after a successful save. */
+  onProfileUpdated?: (user: User) => void;
+  /** When truthy, the edit button and avatar-change controls are not rendered. */
+  readOnly?: boolean;
+}
+
+/** Internal state of the class-based UserProfile component. */
+interface State {
+  /** Loaded user; null until a fetch succeeds. */
+  user: User | null;
+  /** True while loadUser is in flight (starts out true). */
+  loading: boolean;
+  /** Whether the edit form is shown instead of the read-only view. */
+  editing: boolean;
+  /** Pending field edits; sent to updateUser on save. */
+  draft: Partial<User>;
+  /** Message from the last failed user load, if any. */
+  error: string | null;
+  /** Message from the last failed save, if any. */
+  saveError: string | null;
+  /** True while a save is in flight. */
+  saving: boolean;
+  /** Activity entries loaded by loadActivity. */
+  activity: ActivityEntry[];
+  /** True while the activity feed is being fetched. */
+  activityLoading: boolean;
+  /** Message from the last failed activity load, if any. */
+  activityError: string | null;
+  /** Whether the activity feed is currently expanded. */
+  showActivity: boolean;
+  /** True while an avatar upload is in flight. */
+  uploadingAvatar: boolean;
+  /** Message from the last rejected or failed avatar upload, if any. */
+  avatarError: string | null;
+}
+
+/**
+ * Class-based profile view for a single user.
+ *
+ * Loads the user on mount (and again whenever `userId` changes), renders
+ * either a read-only view or an edit form, supports changing the avatar, and
+ * can show an activity feed that is fetched the first time it is opened.
+ */
+export class UserProfile extends React.Component<Props, State> {
+  // Ref to the hidden file input; clicked programmatically from the avatar.
+  private avatarInputRef = React.createRef<HTMLInputElement>();
+
+  /** Initializes state with no user loaded and `loading` set to true. */
+  constructor(props: Props) {
+    super(props);
+    this.state = {
+      user: null,
+      loading: true,
+      editing: false,
+      draft: {},
+      error: null,
+      saveError: null,
+      saving: false,
+      activity: [],
+      activityLoading: false,
+      activityError: null,
+      showActivity: false,
+      uploadingAvatar: false,
+      avatarError: null,
+    };
+  }
+
+  /** Fetches the user as soon as the component mounts. */
+  async componentDidMount() {
+    await this.loadUser();
+  }
+
+  /**
+   * Reloads the user when `userId` changes, first discarding edit and
+   * activity state that belonged to the previous user.
+   */
+  async componentDidUpdate(prevProps: Props) {
+    if (prevProps.userId !== this.props.userId) {
+      // activityError and the avatar fields are not part of this reset.
+      this.setState({
+        editing: false,
+        draft: {},
+        saveError: null,
+        activity: [],
+        showActivity: false,
+      });
+      await this.loadUser();
+    }
+  }
+
+  // NOTE(review): the body below is empty — nothing in this class cancels
+  // in-flight requests on unmount.
+  componentWillUnmount() {
+    // Clean up any pending state updates
+  }
+
+  /**
+   * Fetches the user for the current `userId` and stores it in state.
+   * On failure the error message is stored in `error`; nothing is rethrown.
+   */
+  async loadUser() {
+    this.setState({ loading: true, error: null });
+    try {
+      const user = await fetchUser(this.props.userId);
+      this.setState({ user, loading: false });
+    } catch (err) {
+      this.setState({
+        error: err instanceof Error ? err.message : "Failed to load user",
+        loading: false,
+      });
+    }
+  }
+
+  /**
+   * Fetches the activity feed for the current `userId` and stores it in state.
+   * On failure the error message is stored in `activityError`.
+   */
+  async loadActivity() {
+    this.setState({ activityLoading: true, activityError: null });
+    try {
+      const activity = await fetchUserActivity(this.props.userId);
+      this.setState({ activity, activityLoading: false });
+    } catch (err) {
+      this.setState({
+        activityError: err instanceof Error ? err.message : "Failed to load activity",
+        activityLoading: false,
+      });
+    }
+  }
+
+  /** Enters edit mode with the draft seeded from a copy of the current user. */
+  handleEdit = () => {
+    this.setState({ editing: true, draft: { ...this.state.user }, saveError: null });
+  };
+
+  /** Leaves edit mode and discards the draft and any save error. */
+  handleCancel = () => {
+    this.setState({ editing: false, draft: {}, saveError: null });
+  };
+
+  /** Records a single field edit in the draft. */
+  handleChange = (field: keyof User, value: string) => {
+    this.setState((prev) => ({ draft: { ...prev.draft, [field]: value } }));
+  };
+
+  /**
+   * Sends the draft to updateUser. On success stores the returned user, exits
+   * edit mode and notifies `onProfileUpdated`; on failure stores the message
+   * in `saveError` and stays in edit mode.
+   */
+  handleSave = async () => {
+    this.setState({ saving: true, saveError: null });
+    try {
+      const updated = await updateUser(this.props.userId, this.state.draft);
+      this.setState({ user: updated, editing: false, draft: {}, saving: false });
+      this.props.onProfileUpdated?.(updated);
+    } catch (err) {
+      this.setState({
+        saveError: err instanceof Error ? err.message : "Failed to save changes",
+        saving: false,
+      });
+    }
+  };
+
+  /**
+   * Toggles the activity feed. When opening it with an empty `activity` list,
+   * the feed is loaded first and the toggle happens after the load settles.
+   */
+  handleToggleActivity = async () => {
+    const { showActivity, activity } = this.state;
+    if (!showActivity && activity.length === 0) {
+      await this.loadActivity();
+    }
+    this.setState((prev) => ({ showActivity: !prev.showActivity }));
+  };
+
+  /** Opens the file picker by clicking the hidden file input. */
+  handleAvatarClick = () => {
+    this.avatarInputRef.current?.click();
+  };
+
+  /**
+   * Handles a file chosen in the avatar input: rejects files over 5 MB,
+   * otherwise runs the (stubbed) upload and saves the resulting URL through
+   * updateUser. Failures are stored in `avatarError`.
+   */
+  handleAvatarChange = async (e: React.ChangeEvent<HTMLInputElement>) => {
+    const file = e.target.files?.[0];
+    if (!file) return;
+
+    if (file.size > 5 * 1024 * 1024) {
+      this.setState({ avatarError: "Avatar must be under 5 MB" });
+      return;
+    }
+
+    this.setState({ uploadingAvatar: true, avatarError: null });
+    try {
+      // Upload stub — real impl would POST to /api/avatars
+      await new Promise((r) => setTimeout(r, 500));
+      // The object URL created here is not revoked anywhere in this method.
+      const fakeUrl = URL.createObjectURL(file);
+      const updated = await updateUser(this.props.userId, { avatarUrl: fakeUrl });
+      this.setState({ user: updated, uploadingAvatar: false });
+    } catch (err) {
+      this.setState({
+        avatarError: err instanceof Error ? err.message : "Failed to upload avatar",
+        uploadingAvatar: false,
+      });
+    }
+  };
+
+  /**
+   * Renders the activity feed body: a loading message, an error message, an
+   * empty-state message, or the list of entries — checked in that order.
+   */
+  renderActivityFeed() {
+    const { activity, activityLoading, activityError } = this.state;
+
+    if (activityLoading) {
+      return <p className="activity-loading">Loading activity…</p>;
+    }
+    if (activityError) {
+      return <p className="activity-error">{activityError}</p>;
+    }
+    if (activity.length === 0) {
+      return <p className="activity-empty">No recent activity.</p>;
+    }
+    return (
+      <ul className="activity-list">
+        {activity.map((entry) => (
+          <li key={entry.id} className="activity-entry">
+            <span className="activity-action">{entry.action}</span>
+            <span className="activity-time">
+              {new Date(entry.timestamp).toLocaleString()}
+            </span>
+          </li>
+        ))}
+      </ul>
+    );
+  }
+
+  /**
+   * Renders, in priority order: the loading placeholder, the load-error view
+   * with a Retry button, nothing when no user is loaded, or the full profile
+   * (header, edit form or read-only view, and the activity section).
+   */
+  render() {
+    const {
+      user,
+      loading,
+      editing,
+      draft,
+      error,
+      saveError,
+      saving,
+      showActivity,
+      uploadingAvatar,
+      avatarError,
+    } = this.state;
+    const { readOnly } = this.props;
+
+    if (loading) {
+      return <div className="profile-loading">Loading profile…</div>;
+    }
+
+    if (error) {
+      return (
+        <div className="profile-error">
+          <p>{error}</p>
+          <button onClick={() => this.loadUser()}>Retry</button>
+        </div>
+      );
+    }
+
+    if (!user) return null;
+
+    return (
+      <div className="user-profile">
+        <div className="profile-header">
+          <div className="avatar-wrapper" onClick={!readOnly ? this.handleAvatarClick : undefined}>
+            {user.avatarUrl ? (
+              <img src={user.avatarUrl} alt={`${user.name}'s avatar`} className="avatar" />
+            ) : (
+              <div className="avatar-placeholder">{user.name.charAt(0).toUpperCase()}</div>
+            )}
+            {!readOnly && (
+              <div className="avatar-overlay">{uploadingAvatar ? "Uploading…" : "Change"}</div>
+            )}
+          </div>
+          {!readOnly && (
+            <input
+              ref={this.avatarInputRef}
+              type="file"
+              accept="image/*"
+              style={{ display: "none" }}
+              onChange={this.handleAvatarChange}
+            />
+          )}
+          {avatarError && <p className="avatar-error">{avatarError}</p>}
+          <h1>{user.name}</h1>
+          <span className={`role-badge role-badge--${user.role}`}>{user.role}</span>
+        </div>
+
+        {editing ? (
+          <form
+            className="profile-form"
+            onSubmit={(e) => {
+              e.preventDefault();
+              this.handleSave();
+            }}
+          >
+            <label>
+              Name
+              <input
+                value={draft.name ?? ""}
+                onChange={(e) => this.handleChange("name", e.target.value)}
+              />
+            </label>
+            <label>
+              Email
+              <input
+                type="email"
+                value={draft.email ?? ""}
+                onChange={(e) => this.handleChange("email", e.target.value)}
+              />
+            </label>
+            <label>
+              Bio
+              <textarea
+                value={draft.bio ?? ""}
+                rows={4}
+                onChange={(e) => this.handleChange("bio", e.target.value)}
+              />
+            </label>
+            {saveError && <p className="error">{saveError}</p>}
+            <div className="form-actions">
+              <button type="submit" disabled={saving}>
+                {saving ? "Saving…" : "Save"}
+              </button>
+              <button type="button" onClick={this.handleCancel} disabled={saving}>
+                Cancel
+              </button>
+            </div>
+          </form>
+        ) : (
+          <div className="profile-view">
+            <p>
+              <strong>Email:</strong> {user.email}
+            </p>
+            <p>
+              <strong>Role:</strong> {user.role}
+            </p>
+            {user.bio && (
+              <p>
+                <strong>Bio:</strong> {user.bio}
+              </p>
+            )}
+            <p>
+              <strong>Member since:</strong>{" "}
+              {new Date(user.createdAt).toLocaleDateString()}
+            </p>
+            {!readOnly && (
+              <button onClick={this.handleEdit}>Edit Profile</button>
+            )}
+          </div>
+        )}
+
+        <div className="activity-section">
+          <button className="toggle-activity" onClick={this.handleToggleActivity}>
+            {showActivity ? "Hide activity" : "Show recent activity"}
+          </button>
+          {showActivity && this.renderActivityFeed()}
+        </div>
+      </div>
+    );
+  }
+}
--- a/src/__tests__/evals/fixtures/UserProfileFull.tsx
+++ b/src/__tests__/evals/fixtures/UserProfileFull.tsx
+// UserProfileFull.tsx — full-featured user profile page component
+
+import React, { useState, useEffect, useCallback, useRef, useMemo } from "react";
+import { useNavigate, useParams } from "react-router-dom";
+import { fetchUser, updateUser, uploadAvatar, fetchUserActivity } from "../services/userService";
+import type { User, ActivityItem } from "../types";
+
+/** Props for the function-component profile page in this file. */
+interface UserProfileFullProps {
+  /** Explicit user id; when omitted the component falls back to the route's `id` param. */
+  userId?: string;
+  /** Gates the initial stats load; defaults to true in the component signature. */
+  showStats?: boolean;
+  /** Defaults to true in the component signature. NOTE(review): presumably toggles the activity section — confirm against the render code. */
+  showActivity?: boolean;
+}
+
+/** One tile in the stats grid. */
+interface StatCard {
+  /** Heading shown on the card. */
+  label: string;
+  /** Metric value; numbers are formatted by formatStatValue, strings are shown as-is. */
+  value: number | string;
+  /** Change figure rendered by formatChangePercent with a "%" suffix. */
+  change?: number;
+  /** Optional unit appended after numeric values. */
+  unit?: string;
+}
+
+// ── Types for internal state ──────────────────────────────────
+
+/** State tracked for the avatar and its upload flow. */
+interface AvatarState {
+  /** Stored avatar URL, or null when none is set. */
+  url: string | null;
+  /** True while an upload is in flight. */
+  uploading: boolean;
+  /** Validation or upload error message, if any. */
+  error: string | null;
+  /** Object URL of the selected file, set when an upload starts and cleared on success; takes precedence over `url` for display. */
+  previewUrl: string | null;
+}
+
+/** State tracked for the stats section. */
+interface StatsState {
+  /** Cards currently displayed. */
+  cards: StatCard[];
+  /** True while stats are being loaded. */
+  loading: boolean;
+  /** Message from the last failed stats load, if any. */
+  error: string | null;
+  /** Time window the stats were requested for. */
+  period: "week" | "month" | "year";
+}
+
+/**
+ * State tracked for the activity list.
+ * NOTE(review): the code that uses this type is outside the visible part of
+ * the file; `page` / `hasMore` look like pagination fields — confirm.
+ */
+interface ActivityState {
+  items: ActivityItem[];
+  loading: boolean;
+  error: string | null;
+  page: number;
+  hasMore: boolean;
+}
+
+// ── Utility functions ─────────────────────────────────────────
+
+/**
+ * Formats a stat card value for display.
+ *
+ * Numbers of 1000 or more are shortened to one decimal place with a "k"
+ * suffix (e.g. 4800 -> "4.8k"); smaller numbers are stringified unchanged.
+ * A non-empty `unit` is appended after a space, for numeric values only.
+ * String values are returned as-is and `unit` is ignored for them.
+ */
+function formatStatValue(value: number | string, unit?: string): string {
+  if (typeof value === "number") {
+    const formatted = value >= 1000 ? `${(value / 1000).toFixed(1)}k` : String(value);
+    return unit ? `${formatted} ${unit}` : formatted;
+  }
+  return value;
+}
+
+/**
+ * Picks the CSS modifier class for a change figure: neutral when `change` is
+ * undefined or 0 (any falsy value), positive when above zero, negative otherwise.
+ */
+function getChangeClass(change: number | undefined): string {
+  if (!change) return "stat-change--neutral";
+  return change > 0 ? "stat-change--positive" : "stat-change--negative";
+}
+
+/**
+ * Formats a change figure as a signed percentage with one decimal place
+ * (e.g. 12.5 -> "+12.5%", -3.2 -> "-3.2%"). Returns "—" when `change` is
+ * undefined or 0 (any falsy value).
+ */
+function formatChangePercent(change: number | undefined): string {
+  if (!change) return "—";
+  // Negative numbers already carry their own "-"; only positives need a sign.
+  const sign = change > 0 ? "+" : "";
+  return `${sign}${change.toFixed(1)}%`;
+}
+
+/**
+ * Formats a timestamp relative to the current time: "just now" under a
+ * minute, then "Nm ago", "Nh ago", "Nd ago" up to 7 days, and the locale
+ * date string beyond that.
+ */
+function formatActivityDate(timestamp: string): string {
+  const date = new Date(timestamp);
+  const now = new Date();
+  const diffMs = now.getTime() - date.getTime();
+  const diffMins = Math.floor(diffMs / 60000);
+  const diffHours = Math.floor(diffMins / 60);
+  const diffDays = Math.floor(diffHours / 24);
+
+  // Timestamps in the future give a negative diff and also land on "just now".
+  if (diffMins < 1) return "just now";
+  if (diffMins < 60) return `${diffMins}m ago`;
+  if (diffHours < 24) return `${diffHours}h ago`;
+  if (diffDays < 7) return `${diffDays}d ago`;
+  return date.toLocaleDateString();
+}
+
+/**
+ * Maps an activity type ("commit", "review", "merge", "comment", "deploy")
+ * to its emoji icon; any other type gets the generic pin icon.
+ */
+function getActivityIcon(type: string): string {
+  switch (type) {
+    case "commit":
+      return "📝";
+    case "review":
+      return "👀";
+    case "merge":
+      return "🔀";
+    case "comment":
+      return "💬";
+    case "deploy":
+      return "🚀";
+    default:
+      return "📌";
+  }
+}
+
+// ── Main component ────────────────────────────────────────────
+
+export function UserProfile({
+  userId: propUserId,
+  showStats = true,
+  showActivity = true,
+}: UserProfileFullProps) {
+  const { id: paramUserId } = useParams<{ id: string }>();
+  const navigate = useNavigate();
+  const resolvedUserId = propUserId ?? paramUserId ?? "";
+
+  const [user, setUser] = useState<User | null>(null);
+  const [loading, setLoading] = useState(true);
+  const [error, setError] = useState<string | null>(null);
+  const [editing, setEditing] = useState(false);
+  const [draft, setDraft] = useState<Partial<User>>({});
+  const [saving, setSaving] = useState(false);
+  const [saveError, setSaveError] = useState<string | null>(null);
+
+  // ── Avatar state and logic ──────────────────────────────────
+  const fileInputRef = useRef<HTMLInputElement>(null);
+  const [avatar, setAvatar] = useState<AvatarState>({
+    url: null,
+    uploading: false,
+    error: null,
+    previewUrl: null,
+  });
+
+  const handleAvatarClick = useCallback(() => {
+    fileInputRef.current?.click();
+  }, []);
+
+  const handleAvatarSelect = useCallback(
+    async (event: React.ChangeEvent<HTMLInputElement>) => {
+      const file = event.target.files?.[0];
+      if (!file) return;
+
+      if (!file.type.startsWith("image/")) {
+        setAvatar((prev) => ({ ...prev, error: "Please select an image file" }));
+        return;
+      }
+
+      if (file.size > 5 * 1024 * 1024) {
+        setAvatar((prev) => ({ ...prev, error: "Image must be under 5MB" }));
+        return;
+      }
+
+      const previewUrl = URL.createObjectURL(file);
+      setAvatar((prev) => ({ ...prev, previewUrl, uploading: true, error: null }));
+
+      try {
+        const result = await uploadAvatar(resolvedUserId, file);
+        setAvatar({
+          url: result.url,
+          uploading: false,
+          error: null,
+          previewUrl: null,
+        });
+        URL.revokeObjectURL(previewUrl);
+      } catch (err) {
+        setAvatar((prev) => ({
+          ...prev,
+          uploading: false,
+          error: err instanceof Error ? err.message : "Upload failed",
+        }));
+        URL.revokeObjectURL(previewUrl);
+      }
+    },
+    [resolvedUserId],
+  );
+
+  const handleAvatarRemove = useCallback(() => {
+    setAvatar({ url: null, uploading: false, error: null, previewUrl: null });
+  }, []);
+
+  const avatarDisplayUrl = avatar.previewUrl ?? avatar.url ?? "/default-avatar.png";
+
+  const renderAvatarBadge = useCallback(() => {
+    if (avatar.uploading) {
+      return <span className="avatar-badge avatar-badge--uploading">⏳</span>;
+    }
+    if (avatar.error) {
+      return <span className="avatar-badge avatar-badge--error">⚠️</span>;
+    }
+    return null;
+  }, [avatar.uploading, avatar.error]);
+
+  // ── Load user data ──────────────────────────────────────────
+
+  const loadUser = useCallback(async () => {
+    if (!resolvedUserId) return;
+    setLoading(true);
+    setError(null);
+    try {
+      const userData = await fetchUser(resolvedUserId);
+      setUser(userData);
+      setAvatar((prev) => ({ ...prev, url: userData.avatarUrl ?? null }));
+    } catch (err) {
+      setError(err instanceof Error ? err.message : "Failed to load user");
+    } finally {
+      setLoading(false);
+    }
+  }, [resolvedUserId]);
+
+  useEffect(() => {
+    loadUser();
+  }, [loadUser]);
+
+  // ── Edit handlers ───────────────────────────────────────────
+
+  const handleEdit = useCallback(() => {
+    if (user) {
+      setEditing(true);
+      setDraft({ ...user });
+      setSaveError(null);
+    }
+  }, [user]);
+
+  const handleCancel = useCallback(() => {
+    setEditing(false);
+    setDraft({});
+    setSaveError(null);
+  }, []);
+
+  const handleFieldChange = useCallback((field: keyof User, value: string) => {
+    setDraft((prev) => ({ ...prev, [field]: value }));
+  }, []);
+
+  const handleSave = useCallback(async () => {
+    setSaving(true);
+    setSaveError(null);
+    try {
+      const updated = await updateUser(resolvedUserId, draft);
+      setUser(updated);
+      setEditing(false);
+      setDraft({});
+    } catch (err) {
+      setSaveError(err instanceof Error ? err.message : "Failed to save");
+    } finally {
+      setSaving(false);
+    }
+  }, [resolvedUserId, draft]);
+
+  // ── Stats state and logic ───────────────────────────────────
+  const INITIAL_STATS_PERIOD: "week" | "month" | "year" = "month";
+  const [stats, setStats] = useState<StatsState>({
+    cards: [],
+    loading: true,
+    error: null,
+    period: INITIAL_STATS_PERIOD,
+  });
+
+  const loadStats = useCallback(
+    async (period: "week" | "month" | "year") => {
+      setStats((prev) => ({ ...prev, loading: true, error: null, period }));
+      try {
+        // Simulate loading stats — in production this hits the analytics API
+        await new Promise((r) => setTimeout(r, 100));
+        const mockCards: StatCard[] = [
+          { label: "Commits", value: period === "week" ? 23 : period === "month" ? 87 : 1042, change: 12.5 },
+          { label: "PRs Merged", value: period === "week" ? 5 : period === "month" ? 18 : 203, change: -3.2 },
+          { label: "Reviews", value: period === "week" ? 11 : period === "month" ? 42 : 498, change: 8.1 },
+          { label: "Lines Changed", value: period === "week" ? 1250 : period === "month" ? 4800 : 58000, unit: "lines", change: 15.7 },
+          { label: "Issues Closed", value: period === "week" ? 7 : period === "month" ? 25 : 312, change: 0 },
+          { label: "Build Success", value: "98.2%", change: 1.1 },
+        ];
+        setStats({ cards: mockCards, loading: false, error: null, period });
+      } catch (err) {
+        setStats((prev) => ({
+          ...prev,
+          loading: false,
+          error: err instanceof Error ? err.message : "Failed to load stats",
+        }));
+      }
+    },
+    [],
+  );
+
+  // Load the initial period's stats whenever `showStats` and `resolvedUserId`
+  // are both truthy; re-runs when either of them (or `loadStats`) changes.
+  useEffect(() => {
+    if (showStats && resolvedUserId) {
+      loadStats(INITIAL_STATS_PERIOD);
+    }
+  }, [showStats, resolvedUserId, loadStats]);
+
+  // Handler used by the period buttons: reloads the stats for the chosen period.
+  const handlePeriodChange = useCallback(
+    (period: "week" | "month" | "year") => {
+      loadStats(period);
+    },
+    [loadStats],
+  );
+
+  // Grid modifier class: `stats-grid--2col` when there are at most three cards,
+  // `stats-grid--3col` otherwise. Recomputed only when the card count changes.
+  const statsGridColumns = useMemo(() => {
+    return stats.cards.length <= 3 ? "stats-grid--2col" : "stats-grid--3col";
+  }, [stats.cards.length]);
+
+  // Numeric value of the card labelled "Commits"; 0 when that card is missing
+  // or its value is not a number.
+  const totalCommits = useMemo(() => {
+    const commitCard = stats.cards.find((c) => c.label === "Commits");
+    return typeof commitCard?.value === "number" ? commitCard.value : 0;
+  }, [stats.cards]);
+
+  // Renders one stat card: label, formatted value and a change indicator whose
+  // CSS class comes from `getChangeClass`. The element is keyed by its list index.
+  // Formatting is delegated to helpers defined outside this block.
+  const renderStatCard = useCallback(
+    (card: StatCard, index: number) => {
+      return (
+        <div key={index} className="stat-card">
+          <div className="stat-card__label">{card.label}</div>
+          <div className="stat-card__value">{formatStatValue(card.value, card.unit)}</div>
+          <div className={`stat-card__change ${getChangeClass(card.change)}`}>
+            {formatChangePercent(card.change)}
+          </div>
+        </div>
+      );
+    },
+    [],
+  );
+
+  // Renders the stats panel title plus one button per period. The button whose
+  // period matches `stats.period` gets the `period-btn--active` class; clicking
+  // any button calls `handlePeriodChange` with that period.
+  const renderStatsHeader = useCallback(() => {
+    const periods: Array<"week" | "month" | "year"> = ["week", "month", "year"];
+    return (
+      <div className="stats-header">
+        <h2 className="stats-header__title">Performance Stats</h2>
+        <div className="stats-header__periods">
+          {periods.map((p) => (
+            <button
+              key={p}
+              className={`period-btn ${stats.period === p ? "period-btn--active" : ""}`}
+              onClick={() => handlePeriodChange(p)}
+            >
+              {p.charAt(0).toUpperCase() + p.slice(1)}
+            </button>
+          ))}
+        </div>
+      </div>
+    );
+  }, [stats.period, handlePeriodChange]);
+
+  // Renders a one-line summary of the commit count for the current period.
+  // Renders nothing when `totalCommits` is 0.
+  const renderStatsSummary = useCallback(() => {
+    if (totalCommits === 0) return null;
+    return (
+      <div className="stats-summary">
+        <p>
+          Total activity: <strong>{totalCommits}</strong> commits this {stats.period}.
+        </p>
+      </div>
+    );
+  }, [totalCommits, stats.period]);
+
+  // ── Activity feed state and logic ───────────────────────────
+  // Activity feed state. Starts loading on page 1 with no items, and assumes
+  // more pages exist until a fetch says otherwise.
+  const [activityState, setActivityState] = useState<ActivityState>({
+    items: [],
+    loading: true,
+    error: null,
+    page: 1,
+    hasMore: true,
+  });
+
+  // Fetches one page of activity for `resolvedUserId` via `fetchUserActivity`.
+  // With `append` true the fetched items are added after the existing ones;
+  // otherwise they replace the list. On failure the existing items and page are
+  // kept and only `error` / `loading` change.
+  const loadActivity = useCallback(
+    async (page: number, append = false) => {
+      setActivityState((prev) => ({ ...prev, loading: true, error: null }));
+      try {
+        const items = await fetchUserActivity(resolvedUserId, page);
+        setActivityState((prev) => ({
+          items: append ? [...prev.items, ...items] : items,
+          loading: false,
+          error: null,
+          page,
+          // A page with fewer than 20 items is treated as the last one.
+          // NOTE(review): presumably 20 is the page size of fetchUserActivity — confirm.
+          hasMore: items.length >= 20,
+        }));
+      } catch (err) {
+        setActivityState((prev) => ({
+          ...prev,
+          loading: false,
+          error: err instanceof Error ? err.message : "Failed to load activity",
+        }));
+      }
+    },
+    [resolvedUserId],
+  );
+
+  // Load the first page of activity whenever `showActivity` and `resolvedUserId`
+  // are both truthy. `loadActivity` changes with the user id, so this re-runs
+  // (and replaces the list) when the user changes.
+  useEffect(() => {
+    if (showActivity && resolvedUserId) {
+      loadActivity(1);
+    }
+  }, [showActivity, resolvedUserId, loadActivity]);
+
+  // "Load More" handler: fetches the page after the current one and appends it.
+  const handleLoadMore = useCallback(() => {
+    loadActivity(activityState.page + 1, true);
+  }, [loadActivity, activityState.page]);
+
+  // Activity items bucketed by calendar date. The bucket key is the item's
+  // timestamp formatted with `toLocaleDateString()`, so it depends on the
+  // runtime locale. Items keep their original relative order inside each bucket.
+  const groupedActivity = useMemo(() => {
+    const groups: Record<string, ActivityItem[]> = {};
+    for (const item of activityState.items) {
+      const dateKey = new Date(item.timestamp).toLocaleDateString();
+      if (!groups[dateKey]) groups[dateKey] = [];
+      groups[dateKey].push(item);
+    }
+    return groups;
+  }, [activityState.items]);
+
+  // Renders one activity row: icon, description and formatted time. When the
+  // item carries `metadata.pr`, a link to that PR is rendered as well.
+  const renderActivityItem = useCallback((item: ActivityItem) => {
+    return (
+      <div key={item.id} className="activity-item">
+        <span className="activity-item__icon">{getActivityIcon(item.type)}</span>
+        <div className="activity-item__content">
+          <p className="activity-item__description">{item.description}</p>
+          <span className="activity-item__time">{formatActivityDate(item.timestamp)}</span>
+          {item.metadata?.pr && (
+            <a
+              className="activity-item__link"
+              href={`/pr/${item.metadata.pr}`}
+              onClick={(e) => {
+                // Cancel the default link navigation and go through `navigate` instead.
+                e.preventDefault();
+                navigate(`/pr/${item.metadata!.pr}`);
+              }}
+            >
+              PR #{item.metadata.pr}
+            </a>
+          )}
+        </div>
+      </div>
+    );
+  }, [navigate]);
+
+  // Renders one date bucket: the date heading followed by its activity rows.
+  const renderActivityGroup = useCallback(
+    (dateKey: string, items: ActivityItem[]) => {
+      return (
+        <div key={dateKey} className="activity-group">
+          <h3 className="activity-group__date">{dateKey}</h3>
+          <div className="activity-group__items">
+            {items.map((item) => renderActivityItem(item))}
+          </div>
+        </div>
+      );
+    },
+    [renderActivityItem],
+  );
+
+  // Placeholder rendered when the activity list is empty.
+  const renderActivityEmpty = useCallback(() => {
+    return (
+      <div className="activity-empty">
+        <p>No recent activity to show.</p>
+      </div>
+    );
+  }, []);
+
+  // Error view for the activity feed: shows the stored error message and a
+  // Retry button that reloads page 1 (replacing the list rather than appending).
+  const renderActivityError = useCallback(() => {
+    return (
+      <div className="activity-error">
+        <p>{activityState.error}</p>
+        <button onClick={() => loadActivity(1)}>Retry</button>
+      </div>
+    );
+  }, [activityState.error, loadActivity]);
+
+  // ── Loading / error states ──────────────────────────────────
+
+  if (loading) {
+    return (
+      <div className="profile-loading">
+        <div className="spinner" />
+        <p>Loading profile…</p>
+      </div>
+    );
+  }
+
+  if (error) {
+    return (
+      <div className="profile-error">
+        <p>{error}</p>
+        <button onClick={loadUser}>Retry</button>
+        <button onClick={() => navigate(-1)}>Go Back</button>
+      </div>
+    );
+  }
+
+  if (!user) return null;
+
+  // ── Render ──────────────────────────────────────────────────
+
+  return (
+    <div className="user-profile">
+      <header className="user-profile__header">
+        <button className="back-btn" onClick={() => navigate(-1)}>
+          ← Back
+        </button>
+        <h1>{user.name}'s Profile</h1>
+      </header>
+
+      {/* ── Avatar section ────────────────────────────────── */}
+      <section className="avatar-section">
+        <div className="avatar-container" onClick={handleAvatarClick}>
+          <img
+            src={avatarDisplayUrl}
+            alt={`${user.name}'s avatar`}
+            className={`avatar-image ${avatar.uploading ? "avatar-image--uploading" : ""}`}
+          />
+          {renderAvatarBadge()}
+          <div className="avatar-overlay">
+            <span>Change Photo</span>
+          </div>
+        </div>
+        <input
+          ref={fileInputRef}
+          type="file"
+          accept="image/*"
+          className="avatar-input"
+          onChange={handleAvatarSelect}
+        />
+        {avatar.url && (
+          <button className="avatar-remove-btn" onClick={handleAvatarRemove}>
+            Remove Photo
+          </button>
+        )}
+        {avatar.error && <p className="avatar-error">{avatar.error}</p>}
+        <div className="avatar-info">
+          <h2>{user.name}</h2>
+          <p className="avatar-info__role">{user.role}</p>
+          <p className="avatar-info__email">{user.email}</p>
+          <p className="avatar-info__joined">
+            Member since {new Date(user.createdAt).toLocaleDateString()}
+          </p>
+        </div>
+      </section>
+
+      {/* ── Profile edit form ─────────────────────────────── */}
+      {editing ? (
+        <section className="edit-section">
+          <h2>Edit Profile</h2>
+          <form
+            onSubmit={(e) => {
+              e.preventDefault();
+              handleSave();
+            }}
+          >
+            <div className="form-field">
+              <label htmlFor="edit-name">Name</label>
+              <input
+                id="edit-name"
+                value={draft.name ?? ""}
+                onChange={(e) => handleFieldChange("name", e.target.value)}
+              />
+            </div>
+            <div className="form-field">
+              <label htmlFor="edit-email">Email</label>
+              <input
+                id="edit-email"
+                type="email"
+                value={draft.email ?? ""}
+                onChange={(e) => handleFieldChange("email", e.target.value)}
+              />
+            </div>
+            <div className="form-field">
+              <label htmlFor="edit-bio">Bio</label>
+              <textarea
+                id="edit-bio"
+                value={draft.bio ?? ""}
+                onChange={(e) => handleFieldChange("bio", e.target.value)}
+                rows={4}
+              />
+            </div>
+            {saveError && <p className="form-error">{saveError}</p>}
+            <div className="form-actions">
+              <button type="submit" disabled={saving} className="btn btn--primary">
+                {saving ? "Saving…" : "Save Changes"}
+              </button>
+              <button type="button" onClick={handleCancel} disabled={saving} className="btn btn--secondary">
+                Cancel
+              </button>
+            </div>
+          </form>
+        </section>
+      ) : (
+        <section className="profile-details">
+          <div className="profile-details__row">
+            <strong>Email:</strong> <span>{user.email}</span>
+          </div>
+          <div className="profile-details__row">
+            <strong>Role:</strong> <span>{user.role}</span>
+          </div>
+          <div className="profile-details__row">
+            <strong>Bio:</strong> <span>{user.bio ?? "No bio provided"}</span>
+          </div>
+          <button onClick={handleEdit} className="btn btn--primary">
+            Edit Profile
+          </button>
+        </section>
+      )}
+
+      {/* ── Stats panel ───────────────────────────────────── */}
+      {showStats && (
+        <section className="stats-panel">
+          {renderStatsHeader()}
+          {stats.loading ? (
+            <div className="stats-loading">
+              <div className="spinner spinner--small" />
+              <p>Loading stats…</p>
+            </div>
+          ) : stats.error ? (
+            <div className="stats-error">
+              <p>{stats.error}</p>
+              <button onClick={() => loadStats(stats.period)}>Retry</button>
+            </div>
+          ) : (
+            <>
+              <div className={`stats-grid ${statsGridColumns}`}>
+                {stats.cards.map((card, i) => renderStatCard(card, i))}
+              </div>
+              {renderStatsSummary()}
+            </>
+          )}
+        </section>
+      )}
+
+      {/* ── Activity feed ─────────────────────────────────── */}
+      {showActivity && (
+        <section className="activity-feed">
+          <h2 className="activity-feed__title">Recent Activity</h2>
+          {activityState.loading && activityState.items.length === 0 ? (
+            <div className="activity-loading">
+              <div className="spinner spinner--small" />
+              <p>Loading activity…</p>
+            </div>
+          ) : activityState.error && activityState.items.length === 0 ? (
+            renderActivityError()
+          ) : activityState.items.length === 0 ? (
+            renderActivityEmpty()
+          ) : (
+            <>
+              {Object.entries(groupedActivity).map(([dateKey, items]) =>
+                renderActivityGroup(dateKey, items),
+              )}
+              {activityState.hasMore && (
+                <div className="activity-load-more">
+                  <button
+                    onClick={handleLoadMore}
+                    disabled={activityState.loading}
+                    className="btn btn--secondary"
+                  >
+                    {activityState.loading ? "Loading…" : "Load More"}
+                  </button>
+                </div>
+              )}
+              {activityState.error && (
+                <div className="activity-inline-error">
+                  <p>Failed to load more: {activityState.error}</p>
+                </div>
+              )}
+            </>
+          )}
+        </section>
+      )}
+
+      {/* ── Footer ────────────────────────────────────────── */}
+      <footer className="user-profile__footer">
+        <p>Profile last updated: {user.updatedAt ? new Date(user.updatedAt).toLocaleString() : "Never"}</p>
+      </footer>
+    </div>
+  );
+}
--- a/src/__tests__/evals/fixtures/analytics.ts
+++ b/src/__tests__/evals/fixtures/analytics.ts
+// Analytics event tracking.
+//
+// Historical note: earlier versions used `console.log` directly, which
+// made output noisy in the browser's console. We now route through a
+// logger abstraction instead.
+
+// One tracked analytics event as it is held in the in-memory queue.
+interface Event {
+  name: string;
+  props: Record<string, unknown>;
+  timestamp: number; // Date.now() at the moment track() was called
+  sessionId: string; // "no-session" when tracked without an active session
+  userId?: string; // copied from the active session, if any
+}
+
+// A tracking session as created by startSession().
+interface Session {
+  id: string;
+  startedAt: number; // Date.now() when the session was created
+  userId?: string; // may be filled in later by identify()
+  userAgent: string; // "" when `navigator` is not defined
+  referrer: string; // "" when `document` is not defined
+}
+
+// Counts returned by flush(). In this file flush() only ever fills in `sent`;
+// `failed` and `requeued` are always returned as 0 because a failed send throws.
+interface FlushResult {
+  sent: number;
+  failed: number;
+  requeued: number;
+}
+
+// Consent levels. Only "none" is treated specially in this file: track() skips
+// events and setConsent() empties the queue; the other levels behave alike here.
+type ConsentLevel = "none" | "essential" | "analytics" | "full";
+
+// Module-level state shared by every function in this file.
+let currentSession: Session | null = null; // set by startSession(), cleared by endSession()
+let consentLevel: ConsentLevel = "none"; // events are skipped until setConsent() changes this
+const queue: Event[] = []; // events waiting to be flushed
+const MAX_BATCH_SIZE = 500; // track() flushes immediately once the queue reaches this length
+const FLUSH_INTERVAL_MS = 30_000; // period of the auto-flush timer
+let flushTimer: ReturnType<typeof setInterval> | null = null; // non-null while auto-flush runs
+
+// ── Session management ─────────────────────────────────────────────────────
+
+// Creates a new session, makes it the current one and returns it.
+// Any previously active session is replaced without being ended.
+export function startSession(userId?: string): Session {
+  const session: Session = {
+    // Base-36 rendering of Math.random() with the leading "0." removed.
+    id: Math.random().toString(36).slice(2),
+    startedAt: Date.now(),
+    userId,
+    // The typeof guards keep this usable where browser globals do not exist.
+    userAgent: typeof navigator !== "undefined" ? navigator.userAgent : "",
+    referrer: typeof document !== "undefined" ? document.referrer : "",
+  };
+  currentSession = session;
+  console.log(`analytics session started: ${session.id}`);
+  return session;
+}
+
+// Ends the current session, logging how long it lasted.
+// Warns and returns without changes when no session is active.
+export function endSession(): void {
+  if (!currentSession) {
+    console.warn("endSession called with no active session");
+    return;
+  }
+  const durationMs = Date.now() - currentSession.startedAt;
+  console.log(`analytics session ended: ${currentSession.id} (${durationMs}ms)`);
+  currentSession = null;
+}
+
+// Changes the consent level. Switching to "none" also discards every event
+// that is still waiting in the queue.
+export function setConsent(level: ConsentLevel): void {
+  console.log(`analytics consent changed: ${consentLevel} → ${level}`);
+  consentLevel = level;
+  if (level === "none") {
+    // Empties the shared array in place (the binding itself is const).
+    queue.splice(0, queue.length);
+    console.log("analytics queue cleared due to consent withdrawal");
+  }
+}
+
+// ── Event tracking ─────────────────────────────────────────────────────────
+
+// Queues an event named `name` with the given props.
+// The event is dropped (with a warning) when `name` is empty or the consent
+// level is "none". Once the queue reaches MAX_BATCH_SIZE it is flushed
+// synchronously, so an error thrown by flush() propagates to the caller.
+export function track(name: string, props: Record<string, unknown> = {}): void {
+  if (!name) {
+    console.warn("track called with empty event name");
+    return;
+  }
+  if (consentLevel === "none") {
+    console.warn(`track("${name}") skipped — no analytics consent`);
+    return;
+  }
+  const event: Event = {
+    name,
+    props,
+    timestamp: Date.now(),
+    // Events tracked outside a session are tagged with a fixed placeholder id.
+    sessionId: currentSession?.id ?? "no-session",
+    userId: currentSession?.userId,
+  };
+  queue.push(event);
+  console.log(`tracked event: ${name}`);
+  if (queue.length >= MAX_BATCH_SIZE) {
+    console.warn(`analytics queue full (${queue.length}), flushing immediately`);
+    flush();
+  }
+}
+
+// Logs the path and tracks a "page_view" event carrying `path` and `title`.
+export function trackPageView(path: string, title: string): void {
+  console.log(`page view: ${path}`);
+  track("page_view", { path, title });
+}
+
+// Logs the error and tracks an "error" event with its message and stack
+// (`null` when the error has no stack).
+export function trackError(err: Error, context: Record<string, unknown> = {}): void {
+  console.error(`analytics error event: ${err.message}`, err);
+  track("error", {
+    message: err.message,
+    stack: err.stack ?? null,
+    // Spread last, so a `message` or `stack` key in `context` overrides the above.
+    ...context,
+  });
+}
+
+// Logs the click and tracks a "click" event carrying `elementId` and `label`.
+export function trackClick(elementId: string, label: string): void {
+  console.log(`click: ${elementId} — ${label}`);
+  track("click", { elementId, label });
+}
+
+// Tracks a "form_submit" event. A field count of 0 only produces a warning;
+// the event is still tracked.
+export function trackFormSubmit(formId: string, fieldCount: number): void {
+  if (fieldCount === 0) {
+    console.warn(`trackFormSubmit("${formId}") called with no fields`);
+  }
+  track("form_submit", { formId, fieldCount });
+}
+
+// Tracks a "search" event. Queries that are empty or whitespace-only are
+// rejected with a warning; the query is otherwise recorded untrimmed.
+export function trackSearch(query: string, resultCount: number): void {
+  if (!query.trim()) {
+    console.warn("trackSearch called with empty query");
+    return;
+  }
+  console.log(`search: "${query}" — ${resultCount} results`);
+  track("search", { query, resultCount });
+}
+
+// Tracks a "timing" event. Negative durations are reported as an error and
+// not tracked; a duration of 0 is accepted.
+export function trackTiming(category: string, variable: string, durationMs: number): void {
+  if (durationMs < 0) {
+    console.error(`trackTiming: negative duration ${durationMs}ms for ${category}/${variable}`);
+    return;
+  }
+  track("timing", { category, variable, durationMs });
+}
+
+// ── Flush ──────────────────────────────────────────────────────────────────
+
+// Sends every queued event through sendToBackend() and returns how many were sent.
+// If the send throws, the drained events are put back at the front of the queue
+// and the error is rethrown — so a failure is reported by the exception, never
+// by the `failed` / `requeued` fields, which are always 0.
+export function flush(): FlushResult {
+  if (queue.length === 0) {
+    console.log("flush: nothing to send");
+    return { sent: 0, failed: 0, requeued: 0 };
+  }
+
+  // Take ownership of the whole queue; it is empty while the send is in progress.
+  const drained = queue.splice(0, queue.length);
+  console.log(`flushing ${drained.length} events`);
+
+  try {
+    sendToBackend(drained);
+  } catch (err) {
+    console.error("flush failed, re-queueing events", err);
+    // unshift keeps the failed events ahead of anything queued in the meantime.
+    queue.unshift(...drained);
+    throw err;
+  }
+
+  return { sent: drained.length, failed: 0, requeued: 0 };
+}
+
+// Starts a timer that calls flush() every FLUSH_INTERVAL_MS.
+// Warns and does nothing when a timer is already running.
+// Note that an error thrown by flush() is not caught inside the timer callback.
+export function startAutoFlush(): void {
+  if (flushTimer !== null) {
+    console.warn("startAutoFlush called while timer already running");
+    return;
+  }
+  flushTimer = setInterval(() => {
+    console.log("auto-flush triggered");
+    flush();
+  }, FLUSH_INTERVAL_MS);
+  console.log(`auto-flush scheduled every ${FLUSH_INTERVAL_MS}ms`);
+}
+
+// Stops the auto-flush timer. Warns and does nothing when none is running.
+// Events still in the queue are left there; no final flush is performed.
+export function stopAutoFlush(): void {
+  if (flushTimer === null) {
+    console.warn("stopAutoFlush called with no active timer");
+    return;
+  }
+  clearInterval(flushTimer);
+  flushTimer = null;
+  console.log("auto-flush stopped");
+}
+
+// Number of events currently waiting to be flushed.
+export function queueSize(): number {
+  return queue.length;
+}
+
+// ── Backend transport ──────────────────────────────────────────────────────
+
+// Transport stub used by flush(). It only warns when handed more than 1000
+// events; the actual network request is left out of this fixture.
+function sendToBackend(events: Event[]): void {
+  // The help text below mentions "console" — do not touch it.
+  const helpText =
+    "Events are buffered. Run `flush()` to send. Check the console for errors.";
+  if (events.length > 1000) {
+    console.warn(`sending large batch of ${events.length} events`);
+  }
+  // XHR omitted for fixture brevity.
+  // `void` marks the otherwise unused string as intentionally referenced.
+  void helpText;
+}
+
+// ── User identification ────────────────────────────────────────────────────
+
+// Id passed to the most recent successful identify() call; cleared by reset().
+let _identifiedUserId: string | null = null;
+
+// Associates subsequent activity with `userId`: remembers the id, stamps it on
+// the active session (if any) and tracks an "identify" event.
+// An empty `userId` is rejected with a warning.
+export function identify(userId: string, traits: Record<string, unknown> = {}): void {
+  if (!userId) {
+    console.warn("identify called with empty userId");
+    return;
+  }
+  _identifiedUserId = userId;
+  if (currentSession) {
+    currentSession.userId = userId;
+  }
+  console.log(`analytics identify: ${userId}`);
+  // `traits` is spread last, so a `userId` key inside it overrides the argument.
+  track("identify", { userId, ...traits });
+}
+
+// Forgets the identified user and ends the current session.
+// Warns when no user was identified, but still proceeds. endSession() emits
+// its own warning if no session is active. Queued events are not discarded.
+export function reset(): void {
+  if (!_identifiedUserId) {
+    console.warn("analytics reset called with no identified user");
+  }
+  _identifiedUserId = null;
+  endSession();
+  console.log("analytics reset");
+}
+
+// ── E-commerce events ──────────────────────────────────────────────────────
+
+// Logs the product and tracks a "product_viewed" event with all four arguments.
+// No validation is applied to any of them.
+export function trackProductViewed(
+  productId: string,
+  name: string,
+  price: number,
+  category: string,
+): void {
+  console.log(`product viewed: ${productId} (${name})`);
+  track("product_viewed", { productId, name, price, category });
+}
+
+// Tracks an "add_to_cart" event. A quantity of zero or less is reported as an
+// error and not tracked; `price` is not validated.
+export function trackAddToCart(
+  productId: string,
+  quantity: number,
+  price: number,
+): void {
+  if (quantity <= 0) {
+    console.error(`trackAddToCart: invalid quantity ${quantity} for product ${productId}`);
+    return;
+  }
+  track("add_to_cart", { productId, quantity, price });
+}
+
+// Tracks a "checkout_started" event. A negative cart value is reported as an
+// error and not tracked; a value of 0 is accepted. The log line formats the
+// value with two decimals, but the event stores it unrounded.
+export function trackCheckoutStarted(cartValue: number, itemCount: number): void {
+  if (cartValue < 0) {
+    console.error(`trackCheckoutStarted: negative cart value ${cartValue}`);
+    return;
+  }
+  console.log(`checkout started — ${itemCount} items, $${cartValue.toFixed(2)}`);
+  track("checkout_started", { cartValue, itemCount });
+}
+
+// Logs the order and tracks an "order_completed" event. Unlike
+// trackCheckoutStarted(), the revenue is not checked for negative values.
+export function trackOrderCompleted(
+  orderId: string,
+  revenue: number,
+  currency: string,
+): void {
+  console.log(`order completed: ${orderId} — ${currency} ${revenue.toFixed(2)}`);
+  track("order_completed", { orderId, revenue, currency });
+}
+
+// Emits a warning-level log line and tracks an "order_cancelled" event.
+export function trackOrderCancelled(orderId: string, reason: string): void {
+  console.warn(`order cancelled: ${orderId} — ${reason}`);
+  track("order_cancelled", { orderId, reason });
+}
+
+// ── Feature flags ──────────────────────────────────────────────────────────
+
+// Local flag overrides keyed by flag name. Only written and deleted in this
+// file; nothing here reads an override back.
+const flagOverrides: Record<string, boolean> = {};
+
+// Records (or replaces) the override for `flag`. No event is tracked.
+export function setFlagOverride(flag: string, value: boolean): void {
+  console.log(`feature flag override: ${flag} = ${value}`);
+  flagOverrides[flag] = value;
+}
+
+// Removes the override for `flag`. Warns and returns when none is set.
+export function clearFlagOverride(flag: string): void {
+  // `in` distinguishes "no override" from an override explicitly set to false.
+  if (!(flag in flagOverrides)) {
+    console.warn(`clearFlagOverride: no override found for flag "${flag}"`);
+    return;
+  }
+  delete flagOverrides[flag];
+  console.log(`feature flag override cleared: ${flag}`);
+}
+
+// Tracks a "feature_flag_evaluated" event with the given values. It emits no
+// log line of its own and does not consult `flagOverrides`.
+export function trackFeatureFlagEvaluated(
+  flag: string,
+  value: boolean,
+  reason: string,
+): void {
+  track("feature_flag_evaluated", { flag, value, reason });
+}
+
+// Tracks an "experiment_exposed" event. An empty `experimentId` is rejected
+// with a warning. The `userId` argument is stored in the event props as given
+// (possibly undefined); it does not affect the session's user id.
+export function trackExperimentExposed(
+  experimentId: string,
+  variant: string,
+  userId?: string,
+): void {
+  if (!experimentId) {
+    console.warn("trackExperimentExposed called with empty experimentId");
+    return;
+  }
+  console.log(`experiment exposure: ${experimentId} variant=${variant}`);
+  track("experiment_exposed", { experimentId, variant, userId });
+}
--- a/src/__tests__/evals/fixtures/cache_manager.ts
+++ b/src/__tests__/evals/fixtures/cache_manager.ts
+// cache_manager.ts — in-memory LRU cache with TTL and size limits
+
+// One stored value together with the bookkeeping the cache keeps for it.
+interface CacheEntry<T> {
+  value: T;
+  expiresAt: number; // Date.now()-based timestamp after which the entry is stale
+  sizeBytes: number; // caller-supplied size; the cache never measures the value
+  lastAccessedAt: number; // refreshed by get(); used to pick the eviction victim
+  accessCount: number; // number of successful get() calls
+  key: string; // copy of the map key, so evictOldest() can delete by entry
+}
+
+// Snapshot returned by getStats(): current contents plus running counters.
+interface CacheStats {
+  entries: number; // number of keys currently in the map
+  totalBytes: number; // running total of the stored entries' sizeBytes
+  hits: number;
+  misses: number;
+  evictions: number; // entries removed by evictOldest() to make room
+  expirations: number; // entries removed because their expiresAt had passed
+}
+
+// Describes one entry that warmCache() should load and store.
+interface WarmingSpec<T> {
+  key: string;
+  fetch: () => Promise<T>; // produces the value to cache
+  sizeEstimate: number; // passed to set() as the entry's sizeBytes
+}
+
+// Module-level cache state: a single cache shared by every importer.
+const entries = new Map<string, CacheEntry<unknown>>();
+let totalBytes = 0; // running sum of sizeBytes, adjusted on every insert/removal
+// Counters reported by getStats() and zeroed by resetStats().
+let hits = 0;
+let misses = 0;
+let evictions = 0;
+let expirations = 0;
+
+// ── Write ──────────────────────────────────────────────────────────────────
+
+/**
+ * Stores `value` under `key` with the default TTL of one hour, replacing any
+ * existing entry for that key.
+ *
+ * Least-recently-accessed entries are evicted until the new entry fits under
+ * the total size cap.
+ *
+ * @param key - Cache key.
+ * @param value - Value to store.
+ * @param sizeBytes - Caller-supplied size of the value; not measured here.
+ * @throws Error when `sizeBytes` exceeds the per-entry limit. The cache is
+ *   left untouched in that case.
+ */
+export function set<T>(key: string, value: T, sizeBytes: number): void {
+  if (sizeBytes > 10 * 1024 * 1024) {
+    throw new Error(`entry too large: ${sizeBytes} bytes`);
+  }
+  // Release the bytes of the entry being overwritten. Without this, totalBytes
+  // grows on every overwrite and stops matching what the map actually holds.
+  const existing = entries.get(key);
+  if (existing) {
+    totalBytes -= existing.sizeBytes;
+    entries.delete(key);
+  }
+  // Stop when the map is empty: evictOldest() cannot free anything further, so
+  // continuing would never terminate if totalBytes were ever out of sync.
+  while (entries.size > 0 && totalBytes + sizeBytes > 100 * 1024 * 1024) {
+    evictOldest();
+  }
+  entries.set(key, {
+    value,
+    expiresAt: Date.now() + 60 * 60 * 1000,
+    sizeBytes,
+    lastAccessedAt: Date.now(),
+    accessCount: 0,
+    key,
+  });
+  totalBytes += sizeBytes;
+}
+
+/**
+ * Stores `value` under `key` with an explicit time-to-live, replacing any
+ * existing entry for that key.
+ *
+ * Least-recently-accessed entries are evicted until the new entry fits under
+ * the total size cap.
+ *
+ * @param key - Cache key.
+ * @param value - Value to store.
+ * @param sizeBytes - Caller-supplied size of the value; not measured here.
+ * @param ttlMs - Lifetime in milliseconds; must be greater than zero.
+ * @throws Error when `ttlMs` is not positive or `sizeBytes` exceeds the
+ *   per-entry limit. The cache is left untouched in both cases.
+ */
+export function setWithTtl<T>(
+  key: string,
+  value: T,
+  sizeBytes: number,
+  ttlMs: number,
+): void {
+  if (ttlMs <= 0) {
+    throw new Error(`ttlMs must be positive, got ${ttlMs}`);
+  }
+  if (sizeBytes > 10 * 1024 * 1024) {
+    throw new Error(`entry too large: ${sizeBytes} bytes`);
+  }
+  // Release the bytes of the entry being overwritten. Without this, totalBytes
+  // grows on every overwrite and stops matching what the map actually holds.
+  const existing = entries.get(key);
+  if (existing) {
+    totalBytes -= existing.sizeBytes;
+    entries.delete(key);
+  }
+  // Stop when the map is empty: evictOldest() cannot free anything further, so
+  // continuing would never terminate if totalBytes were ever out of sync.
+  while (entries.size > 0 && totalBytes + sizeBytes > 100 * 1024 * 1024) {
+    evictOldest();
+  }
+  entries.set(key, {
+    value,
+    expiresAt: Date.now() + ttlMs,
+    sizeBytes,
+    lastAccessedAt: Date.now(),
+    accessCount: 0,
+    key,
+  });
+  totalBytes += sizeBytes;
+}
+
+// Stores each item via set(), in order, with the default TTL.
+// If set() throws for one item, the items before it stay stored and the
+// remaining ones are not processed.
+export function setMany<T>(
+  items: Array<{ key: string; value: T; sizeBytes: number }>,
+): void {
+  for (const item of items) {
+    set(item.key, item.value, item.sizeBytes);
+  }
+}
+
+// ── Read ───────────────────────────────────────────────────────────────────
+
+// Returns the value stored under `key`, or null when it is absent or expired.
+// A hit refreshes the entry's access metadata. An expired entry is removed and
+// counted as both an expiration and a miss.
+export function get<T>(key: string): T | null {
+  const entry = entries.get(key);
+  if (!entry) {
+    misses++;
+    return null;
+  }
+  if (entry.expiresAt < Date.now()) {
+    entries.delete(key);
+    totalBytes -= entry.sizeBytes;
+    expirations++;
+    misses++;
+    return null;
+  }
+  entry.lastAccessedAt = Date.now();
+  entry.accessCount++;
+  hits++;
+  // The map stores `unknown`; the caller chooses T and the cast is unchecked.
+  return entry.value as T;
+}
+
+// Returns the cached value for `key`, or calls `factory`, stores its result
+// with the default TTL and returns it.
+// Because get() signals a miss with null, a cached null is indistinguishable
+// from a miss and causes `factory` to run again.
+export function getOrSet<T>(
+  key: string,
+  factory: () => T,
+  sizeBytes: number,
+): T {
+  const cached = get<T>(key);
+  if (cached !== null) return cached;
+  const value = factory();
+  set(key, value, sizeBytes);
+  return value;
+}
+
+// Reads a value without side effects: no hit/miss counting, no access-metadata
+// update, and an expired entry is reported as null but left in the map.
+export function peek<T>(key: string): T | null {
+  const entry = entries.get(key);
+  if (!entry || entry.expiresAt < Date.now()) return null;
+  return entry.value as T;
+}
+
+// Reports whether a live entry exists for `key`. An expired entry is removed
+// (and counted as an expiration) before returning false. Hit/miss counters and
+// access metadata are not touched.
+export function has(key: string): boolean {
+  const entry = entries.get(key);
+  if (!entry) return false;
+  if (entry.expiresAt < Date.now()) {
+    entries.delete(key);
+    totalBytes -= entry.sizeBytes;
+    expirations++;
+    return false;
+  }
+  return true;
+}
+
+// Milliseconds until `key` expires, or null when the key is absent or has no
+// time left. Purely a read: an expired entry is not removed here.
+export function ttlRemainingMs(key: string): number | null {
+  const entry = entries.get(key);
+  if (!entry) return null;
+  const remaining = entry.expiresAt - Date.now();
+  return remaining > 0 ? remaining : null;
+}
+
+// ── Delete ─────────────────────────────────────────────────────────────────
+
+// Removes `key` and returns whether an entry was present. An expired entry
+// that is still stored counts as present. Eviction and expiration counters are
+// not changed.
+export function del(key: string): boolean {
+  const entry = entries.get(key);
+  if (!entry) return false;
+  totalBytes -= entry.sizeBytes;
+  entries.delete(key);
+  return true;
+}
+
+// Removes every key in `keys` and returns how many were actually present.
+export function delMany(keys: string[]): number {
+  let removed = 0;
+  for (const key of keys) {
+    if (del(key)) removed++;
+  }
+  return removed;
+}
+
+// Drops every entry and zeroes the byte total. The hit/miss/eviction/expiration
+// counters are kept; use resetStats() to clear those.
+export function clear(): void {
+  entries.clear();
+  totalBytes = 0;
+}
+
+// ── Maintenance ────────────────────────────────────────────────────────────
+
+// Removes every entry whose expiry is earlier than now and returns how many
+// were removed. Each removal is counted as an expiration.
+export function pruneExpired(): number {
+  // Read the clock once so every entry is judged against the same instant.
+  const now = Date.now();
+  let removed = 0;
+  // Deleting the current key while iterating a Map is safe in JavaScript.
+  for (const [key, entry] of entries) {
+    if (entry.expiresAt < now) {
+      entries.delete(key);
+      totalBytes -= entry.sizeBytes;
+      expirations++;
+      removed++;
+    }
+  }
+  return removed;
+}
+
+// Runs pruneExpired() every 24 hours. Returns the interval handle; nothing in
+// this file clears it, so stopping the timer is left to the caller.
+export function scheduleDailyCleanup(): NodeJS.Timeout {
+  return setInterval(pruneExpired, 24 * 60 * 60 * 1000);
+}
+
+// Runs pruneExpired() every hour. Returns the interval handle; nothing in this
+// file clears it, so stopping the timer is left to the caller.
+export function scheduleHourlyCleanup(): NodeJS.Timeout {
+  return setInterval(pruneExpired, 60 * 60 * 1000);
+}
+
+// Removes the entry with the smallest lastAccessedAt and counts an eviction.
+// Does nothing when the cache is empty. Each call scans every entry.
+function evictOldest(): void {
+  let oldest: CacheEntry<unknown> | null = null;
+  for (const entry of entries.values()) {
+    // Strict `<` means that on a tie the entry seen first in map order is kept
+    // as the candidate.
+    if (!oldest || entry.lastAccessedAt < oldest.lastAccessedAt) {
+      oldest = entry;
+    }
+  }
+  if (!oldest) return;
+  totalBytes -= oldest.sizeBytes;
+  entries.delete(oldest.key);
+  evictions++;
+}
+
+// ── Warming ────────────────────────────────────────────────────────────────
+
+// Fetches all specs concurrently and stores each result via set() with the
+// default TTL. Promise.allSettled is used, so a spec whose fetch() rejects or
+// whose set() throws is skipped without the caller being told; the returned
+// promise never rejects because of an individual spec.
+export async function warmCache<T>(specs: WarmingSpec<T>[]): Promise<void> {
+  await Promise.allSettled(
+    specs.map(async (spec) => {
+      const value = await spec.fetch();
+      set(spec.key, value, spec.sizeEstimate);
+    }),
+  );
+}
+
+// ── Stats ──────────────────────────────────────────────────────────────────
+
+// Returns a snapshot of the current entry count, byte total and counters.
+// `entries` includes expired entries that have not been removed yet.
+export function getStats(): CacheStats {
+  return {
+    entries: entries.size,
+    totalBytes,
+    hits,
+    misses,
+    evictions,
+    expirations,
+  };
+}
+
+// Zeroes the four counters. Stored entries and the byte total are unaffected.
+export function resetStats(): void {
+  hits = 0;
+  misses = 0;
+  evictions = 0;
+  expirations = 0;
+}
+
+// Fraction of get() lookups that were hits, in the range 0..1.
+// Returns 0 when no lookups have been counted, avoiding a division by zero.
+export function hitRate(): number {
+  const total = hits + misses;
+  return total === 0 ? 0 : hits / total;
+}
+
+// Returns all stored keys in map insertion order, as a new array.
+// Expired entries that have not been removed yet are included.
+export function keys(): string[] {
+  return Array.from(entries.keys());
+}
+
+// Share of the 100 MiB total cap currently in use.
+// NOTE(review): despite the "Pct" suffix the result is a fraction (1 means
+// the cap is reached), not a value on a 0..100 scale — confirm what callers expect.
+export function byteUsagePct(): number {
+  return totalBytes / (100 * 1024 * 1024);
+}
+
+// ── Namespaced sub-cache ───────────────────────────────────────────────────
+
+/**
+ * Returns a cache interface scoped to a namespace prefix. All keys are
+ * stored in the same underlying map as `"<prefix>:<key>"`, so for a
+ * namespace created with prefix "myns", `ns.set("x", ...)` and the
+ * module-level `get("myns:x")` see the same entry.
+ *
+ * The namespace shares the global size cap, eviction and counters; it is a
+ * key-naming convention, not an isolated cache. A prefix that itself contains
+ * ":" can overlap with another namespace (prefix "a" also matches keys of
+ * prefix "a:b" in `keys()` and `clear()`).
+ */
+export function createNamespace(prefix: string) {
+  // Maps a namespace-local key to the key used in the shared map.
+  const ns = (key: string) => `${prefix}:${key}`;
+  return {
+    set<T>(key: string, value: T, sizeBytes: number): void {
+      set(ns(key), value, sizeBytes);
+    },
+    setWithTtl<T>(key: string, value: T, sizeBytes: number, ttlMs: number): void {
+      setWithTtl(ns(key), value, sizeBytes, ttlMs);
+    },
+    get<T>(key: string): T | null {
+      return get<T>(ns(key));
+    },
+    has(key: string): boolean {
+      return has(ns(key));
+    },
+    del(key: string): boolean {
+      return del(ns(key));
+    },
+    // Namespace-local keys, i.e. with the "<prefix>:" part stripped.
+    keys(): string[] {
+      return keys()
+        .filter((k) => k.startsWith(`${prefix}:`))
+        .map((k) => k.slice(prefix.length + 1));
+    },
+    // Removes only this namespace's entries. `k` is already the full key, so
+    // the module-level del() is called without re-prefixing.
+    clear(): void {
+      for (const k of keys().filter((k) => k.startsWith(`${prefix}:`))) {
+        del(k);
+      }
+    },
+  };
+}
+
+// ── Read-through access (fetch on miss) ───────────────────────────────────
+
+/**
+ * Reads a value from cache, calling `fetch` on miss and storing the
+ * result. Concurrent calls for the same key each trigger an independent
+ * fetch; callers that need deduplication should use their own in-flight
+ * map on top of this.
+ *
+ * The result is stored with `ttlMs` when one is given, otherwise with the
+ * default TTL applied by `set`. A rejected `fetch` propagates to the caller
+ * and nothing is stored. A cached `null` is treated as a miss, because `get`
+ * uses `null` to signal absence.
+ */
+export async function getOrFetch<T>(
+  key: string,
+  fetch: () => Promise<T>,
+  sizeBytes: number,
+  ttlMs?: number,
+): Promise<T> {
+  const cached = get<T>(key);
+  if (cached !== null) return cached;
+  const value = await fetch();
+  // Compare against undefined so an explicit 0 still reaches setWithTtl,
+  // which rejects non-positive TTLs.
+  if (ttlMs !== undefined) {
+    setWithTtl(key, value, sizeBytes, ttlMs);
+  } else {
+    set(key, value, sizeBytes);
+  }
+  return value;
+}
+
+// ── Bulk operations ────────────────────────────────────────────────────────
+
+// Looks up each key with get() and returns the results in the same order,
+// with null for every key that is absent or expired. Each lookup counts as a
+// hit or miss. The `keys` parameter shadows the module-level keys() here.
+export function getMany<T>(keys: string[]): Array<T | null> {
+  return keys.map((k) => get<T>(k));
+}
+
+// Removes every entry whose key starts with `prefix` and returns the number
+// removed. The match is on the raw string, so an empty prefix removes
+// everything and no ":" separator is implied.
+export function delByPrefix(prefix: string): number {
+  const matching = keys().filter((k) => k.startsWith(prefix));
+  return delMany(matching);
+}
--- a/src/__tests__/evals/fixtures/config_reader.ts
+++ b/src/__tests__/evals/fixtures/config_reader.ts
+// TLS settings nested under ServerConfig.tls. Every field is optional.
+interface TlsConfig {
+  cert?: string; // getServerUrl() picks "https" when this is truthy
+  key?: string;
+  ca?: string;
+  rejectUnauthorized?: boolean;
+}
+
+// Server section of AppConfig. Every field, including `tls`, is optional.
+interface ServerConfig {
+  host?: string;
+  port?: number;
+  tls?: TlsConfig;
+  keepAliveTimeoutMs?: number;
+  maxRequestBodyBytes?: number;
+}
+
+// Connection-pool settings nested under DatabaseConfig.pool. All optional.
+interface PoolConfig {
+  min?: number;
+  max?: number;
+  idleTimeoutMs?: number;
+  acquireTimeoutMs?: number;
+}
+
+// Database section of AppConfig. Every field, including the nested `pool`
+// and `ssl` objects, is optional.
+interface DatabaseConfig {
+  url?: string;
+  pool?: PoolConfig;
+  statementTimeoutMs?: number;
+  ssl?: {
+    enabled?: boolean;
+    rejectUnauthorized?: boolean;
+  };
+}
+
+// Redis section of AppConfig. Every field is optional.
+interface RedisConfig {
+  host?: string;
+  port?: number;
+  password?: string;
+  db?: number;
+  tls?: { enabled?: boolean };
+  maxRetriesPerRequest?: number;
+}
+
+// Logging section of AppConfig. `level` and `format` are restricted to the
+// listed literals; `destination` and everything below it is optional.
+interface LoggingConfig {
+  level?: "debug" | "info" | "warn" | "error";
+  format?: "json" | "text";
+  destination?: {
+    console?: boolean;
+    file?: { path?: string; maxSizeMb?: number; maxFiles?: number };
+  };
+}
+
+// Queue section of AppConfig. Every field is optional.
+interface QueueConfig {
+  concurrency?: number;
+  maxRetries?: number;
+  backoffMs?: number;
+  visibilityTimeoutMs?: number;
+  deadLetterQueueName?: string;
+}
+
+// Rate-limit section of AppConfig. Every field is optional.
+interface RateLimitConfig {
+  windowMs?: number;
+  maxRequests?: number;
+  keyPrefix?: string;
+  skipSuccessfulRequests?: boolean;
+}
+
+// Feature-flag section of AppConfig: boolean switches under `experimental`
+// and numeric values under `rollout`. Both groups and all members are optional.
+interface FeatureFlagsConfig {
+  experimental?: {
+    newUi?: boolean;
+    betaSearch?: boolean;
+    streamingExport?: boolean;
+  };
+  // NOTE(review): presumably rollout percentages or fractions — the scale is
+  // not visible in this file; confirm before relying on it.
+  rollout?: {
+    newOnboarding?: number;
+    improvedEditor?: number;
+  };
+}
+
+// Root configuration object. Every section is optional, so any of them may be
+// undefined on a given config.
+interface AppConfig {
+  server?: ServerConfig;
+  database?: DatabaseConfig;
+  redis?: RedisConfig;
+  logging?: LoggingConfig;
+  queue?: QueueConfig;
+  rateLimit?: RateLimitConfig;
+  features?: FeatureFlagsConfig;
+}
+
+// ── Server ─────────────────────────────────────────────────────────────────
+
+/**
+ * Builds the server's base URL as `scheme://host:port`.
+ *
+ * The scheme is "https" when a TLS certificate is configured and "http"
+ * otherwise. `tls` is optional, so a config without a `tls` section now
+ * resolves to "http" instead of throwing a TypeError.
+ *
+ * @param cfg - Application config; the `server` section must be present.
+ * @returns The URL string; host and port are interpolated as-is.
+ */
+export function getServerUrl(cfg: AppConfig): string {
+  const { host, port, tls } = cfg.server;
+  const scheme = tls?.cert ? "https" : "http";
+  return `${scheme}://${host}:${port}`;
+}
+
+/** Returns the `min` and `max` values of `database.pool`. */
+export function getDatabasePoolSize(cfg: AppConfig): {
+  min: number;
+  max: number;
+} {
+  const { min, max } = cfg.database.pool;
+  return { min, max };
+}
+
+/** Reads the `features.experimental.newUi` flag. */
+export function isExperimentalUiEnabled(cfg: AppConfig): boolean {
+  const { experimental } = cfg.features;
+  return experimental.newUi;
+}
+
+/**
+ * Summarizes the sizes of the configured TLS certificate and key.
+ *
+ * `server`, `tls`, `cert` and `key` are all optional in the config types, so
+ * each missing piece is reported as 0 bytes rather than throwing a TypeError.
+ *
+ * @param cfg - Application config.
+ * @returns A string of the form `tls cert N bytes, key M bytes`.
+ */
+export function describeServer(cfg: AppConfig): string {
+  const tls = cfg.server?.tls;
+  const certLen = tls?.cert?.length ?? 0;
+  const keyLen = tls?.key?.length ?? 0;
+  return `tls cert ${certLen} bytes, key ${keyLen} bytes`;
+}
+
+/** Reads `server.keepAliveTimeoutMs`. */
+export function getServerKeepAliveMs(cfg: AppConfig): number {
+  const { keepAliveTimeoutMs } = cfg.server;
+  return keepAliveTimeoutMs;
+}
+
+/** Reads `server.maxRequestBodyBytes`. */
+export function getMaxRequestBodyBytes(cfg: AppConfig): number {
+  const { maxRequestBodyBytes } = cfg.server;
+  return maxRequestBodyBytes;
+}
+
+/** Reads `server.tls.rejectUnauthorized`. */
+export function isTlsCaRequired(cfg: AppConfig): boolean {
+  const { tls } = cfg.server;
+  return tls.rejectUnauthorized;
+}
+
+// ── Database ───────────────────────────────────────────────────────────────
+
+/** Reads `database.url`. */
+export function getDatabaseUrl(cfg: AppConfig): string {
+  const { url } = cfg.database;
+  return url;
+}
+
+/** Reads `database.statementTimeoutMs`. */
+export function getDatabaseStatementTimeoutMs(cfg: AppConfig): number {
+  const { statementTimeoutMs } = cfg.database;
+  return statementTimeoutMs;
+}
+
+/** Reads `database.ssl.enabled`. */
+export function isDatabaseSslEnabled(cfg: AppConfig): boolean {
+  const { ssl } = cfg.database;
+  return ssl.enabled;
+}
+
+/** Reads `database.pool.idleTimeoutMs`. */
+export function getDatabasePoolIdleTimeoutMs(cfg: AppConfig): number {
+  const { pool } = cfg.database;
+  return pool.idleTimeoutMs;
+}
+
+/** Reads `database.pool.acquireTimeoutMs`. */
+export function getDatabasePoolAcquireTimeoutMs(cfg: AppConfig): number {
+  const { pool } = cfg.database;
+  return pool.acquireTimeoutMs;
+}
+
+// ── Redis ──────────────────────────────────────────────────────────────────
+
+/** Reads `redis.host`. */
+export function getRedisHost(cfg: AppConfig): string {
+  const { host } = cfg.redis;
+  return host;
+}
+
+/** Reads `redis.port`. */
+export function getRedisPort(cfg: AppConfig): number {
+  const { port } = cfg.redis;
+  return port;
+}
+
+/** Reads `redis.password`. */
+export function getRedisPassword(cfg: AppConfig): string {
+  const { password } = cfg.redis;
+  return password;
+}
+
+/** Reads `redis.db`. */
+export function getRedisDb(cfg: AppConfig): number {
+  const { db } = cfg.redis;
+  return db;
+}
+
+/** Reads `redis.tls.enabled`. */
+export function isRedisTlsEnabled(cfg: AppConfig): boolean {
+  const { tls } = cfg.redis;
+  return tls.enabled;
+}
+
+/** Reads `redis.maxRetriesPerRequest`. */
+export function getRedisMaxRetries(cfg: AppConfig): number {
+  const { maxRetriesPerRequest } = cfg.redis;
+  return maxRetriesPerRequest;
+}
+
+// ── Logging ────────────────────────────────────────────────────────────────
+
+/** Reads `logging.level`. */
+export function getLogLevel(cfg: AppConfig): string {
+  const { level } = cfg.logging;
+  return level;
+}
+
+/** Reads `logging.format`. */
+export function getLogFormat(cfg: AppConfig): string {
+  const { format } = cfg.logging;
+  return format;
+}
+
+/** Reads `logging.destination.console`. */
+export function isConsoleLoggingEnabled(cfg: AppConfig): boolean {
+  const { destination } = cfg.logging;
+  return destination.console;
+}
+
+/** Reads `logging.destination.file.path`. */
+export function getLogFilePath(cfg: AppConfig): string {
+  const { file } = cfg.logging.destination;
+  return file.path;
+}
+
+/** Reads `logging.destination.file.maxSizeMb`. */
+export function getLogFileMaxSizeMb(cfg: AppConfig): number {
+  const { file } = cfg.logging.destination;
+  return file.maxSizeMb;
+}
+
+/** Reads `logging.destination.file.maxFiles`. */
+export function getLogFileMaxFiles(cfg: AppConfig): number {
+  const { file } = cfg.logging.destination;
+  return file.maxFiles;
+}
+
+// ── Queue ──────────────────────────────────────────────────────────────────
+
+/** Reads `queue.concurrency`. */
+export function getQueueConcurrency(cfg: AppConfig): number {
+  const { concurrency } = cfg.queue;
+  return concurrency;
+}
+
+/** Reads `queue.maxRetries`. */
+export function getQueueMaxRetries(cfg: AppConfig): number {
+  const { maxRetries } = cfg.queue;
+  return maxRetries;
+}
+
+/** Reads `queue.backoffMs`. */
+export function getQueueBackoffMs(cfg: AppConfig): number {
+  const { backoffMs } = cfg.queue;
+  return backoffMs;
+}
+
+/** Reads `queue.visibilityTimeoutMs`. */
+export function getQueueVisibilityTimeoutMs(cfg: AppConfig): number {
+  const { visibilityTimeoutMs } = cfg.queue;
+  return visibilityTimeoutMs;
+}
+
+/** Reads `queue.deadLetterQueueName`. */
+export function getDeadLetterQueueName(cfg: AppConfig): string {
+  const { deadLetterQueueName } = cfg.queue;
+  return deadLetterQueueName;
+}
+
+// ── Rate limit ─────────────────────────────────────────────────────────────
+
+/** Reads `rateLimit.windowMs`. */
+export function getRateLimitWindowMs(cfg: AppConfig): number {
+  const { windowMs } = cfg.rateLimit;
+  return windowMs;
+}
+
+/** Reads `rateLimit.maxRequests`. */
+export function getRateLimitMaxRequests(cfg: AppConfig): number {
+  const { maxRequests } = cfg.rateLimit;
+  return maxRequests;
+}
+
+/** Reads `rateLimit.keyPrefix`. */
+export function getRateLimitKeyPrefix(cfg: AppConfig): string {
+  const { keyPrefix } = cfg.rateLimit;
+  return keyPrefix;
+}
+
+/** Reads `rateLimit.skipSuccessfulRequests`. */
+export function isSkipSuccessfulRequestsEnabled(cfg: AppConfig): boolean {
+  const { skipSuccessfulRequests } = cfg.rateLimit;
+  return skipSuccessfulRequests;
+}
+
+// ── Feature flags ──────────────────────────────────────────────────────────
+
+/** Reads the `features.experimental.betaSearch` flag. */
+export function isBetaSearchEnabled(cfg: AppConfig): boolean {
+  const { experimental } = cfg.features;
+  return experimental.betaSearch;
+}
+
+/** Reads the `features.experimental.streamingExport` flag. */
+export function isStreamingExportEnabled(cfg: AppConfig): boolean {
+  const { experimental } = cfg.features;
+  return experimental.streamingExport;
+}
+
+/** Reads `features.rollout.newOnboarding`. */
+export function getNewOnboardingRolloutPct(cfg: AppConfig): number {
+  const { rollout } = cfg.features;
+  return rollout.newOnboarding;
+}
+
+/** Reads `features.rollout.improvedEditor`. */
+export function getImprovedEditorRolloutPct(cfg: AppConfig): number {
+  const { rollout } = cfg.features;
+  return rollout.improvedEditor;
+}
+
+// ── Composite helpers ──────────────────────────────────────────────────────
+
+/**
+ * Produces a one-line summary of the server address, database URL,
+ * log level and Redis host. All four sections must be present.
+ */
+export function describeConfig(cfg: AppConfig): string {
+  const { server, database, logging, redis } = cfg;
+  const address = `${server.host}:${server.port}`;
+  return `server=${address} db=${database.url} log=${logging.level} redis=${redis.host}`;
+}
+
+/**
+ * Lists the log sinks that are switched on: "console" when console logging
+ * is enabled, and `file:<path>` when a non-empty file path is configured.
+ *
+ * `logging`, `destination` and `file` are all optional in the config types.
+ * A console-only config (no `file` section) previously threw a TypeError;
+ * missing sections are now treated as "not configured".
+ *
+ * @param cfg - Application config.
+ * @returns Destination labels, console first; empty when nothing is configured.
+ */
+export function getEffectiveLogDestinations(cfg: AppConfig): string[] {
+  const destination = cfg.logging?.destination;
+  const destinations: string[] = [];
+  if (destination?.console) {
+    destinations.push("console");
+  }
+  const filePath = destination?.file?.path;
+  if (filePath) {
+    destinations.push(`file:${filePath}`);
+  }
+  return destinations;
+}
+
+/** True when `logging.level` is "warn" or "error"; false for any other value. */
+export function isProductionLike(cfg: AppConfig): boolean {
+  switch (cfg.logging.level) {
+    case "warn":
+    case "error":
+      return true;
+    default:
+      return false;
+  }
+}
+
+/**
+ * Builds a Redis connection string of the form `scheme://host:port/db`.
+ *
+ * The scheme is "rediss" when `redis.tls.enabled` is truthy and "redis"
+ * otherwise. `tls` is optional, so a config without it resolves to "redis"
+ * instead of throwing a TypeError.
+ *
+ * @param cfg - Application config; the `redis` section must be present.
+ * @returns The connection string; host, port and db are interpolated as-is.
+ */
+export function getFullRedisConnectionString(cfg: AppConfig): string {
+  const { host, port, db, tls } = cfg.redis;
+  const scheme = tls?.enabled ? "rediss" : "redis";
+  return `${scheme}://${host}:${port}/${db}`;
+}
+
+/** Reads `database.ssl.rejectUnauthorized`. */
+export function getDatabaseSslRejectUnauthorized(cfg: AppConfig): boolean {
+  const { ssl } = cfg.database;
+  return ssl.rejectUnauthorized;
+}
+
+/** Reads `server.tls.ca`. */
+export function getTlsCaPath(cfg: AppConfig): string {
+  const { tls } = cfg.server;
+  return tls.ca;
+}
+
+/**
+ * Copies the four rate-limit settings into a flat object, exposing
+ * `skipSuccessfulRequests` under the shorter key `skipSuccessful`.
+ */
+export function getRateLimitConfig(cfg: AppConfig): {
+  windowMs: number;
+  maxRequests: number;
+  keyPrefix: string;
+  skipSuccessful: boolean;
+} {
+  const {
+    windowMs,
+    maxRequests,
+    keyPrefix,
+    skipSuccessfulRequests: skipSuccessful,
+  } = cfg.rateLimit;
+  return { windowMs, maxRequests, keyPrefix, skipSuccessful };
+}
+
+/** Copies concurrency, retry, backoff and visibility-timeout settings out of `queue`. */
+export function getQueueConfig(cfg: AppConfig): {
+  concurrency: number;
+  maxRetries: number;
+  backoffMs: number;
+  visibilityTimeoutMs: number;
+} {
+  const { concurrency, maxRetries, backoffMs, visibilityTimeoutMs } = cfg.queue;
+  return { concurrency, maxRetries, backoffMs, visibilityTimeoutMs };
+}
--- a/src/__tests__/evals/fixtures/contact_book.ts
+++ b/src/__tests__/evals/fixtures/contact_book.ts
+// contact_book.ts — in-memory contact book with import/export and search.
+
+export interface Contact {
+  id: string;
+  name: string;          // e.g. "Ada Lovelace"
+  email: string;
+  phone: string;
+  tags: string[];
+  starred: boolean;
+  createdAt: string;     // ISO timestamp
+}
+
+export interface ContactBook {
+  contacts: Contact[];
+}
+
+// ── Construction ───────────────────────────────────────────────────────────
+
+/**
+ * Builds a normalized Contact: name and phone are trimmed, the email is
+ * trimmed and lower-cased, tags are copied, and `createdAt` is stamped
+ * with the current time in ISO format.
+ */
+export function createContact(input: {
+  id: string;
+  name: string;
+  email: string;
+  phone?: string;
+  tags?: string[];
+  starred?: boolean;
+}): Contact {
+  const { id, name, email, phone, tags, starred } = input;
+  const normalizedEmail = email.trim().toLowerCase();
+  const normalizedPhone = phone == null ? "" : phone.trim();
+  const ownTags = tags == null ? [] : tags.slice();
+  return {
+    id,
+    name: name.trim(),
+    email: normalizedEmail,
+    phone: normalizedPhone,
+    tags: ownTags,
+    starred: starred ?? false,
+    createdAt: new Date().toISOString(),
+  };
+}
+
+/** Creates a contact book with no contacts. */
+export function emptyBook(): ContactBook {
+  const contacts: Contact[] = [];
+  return { contacts };
+}
+
+/** Returns a new book with `contact` appended; the input book is left untouched. */
+export function addContact(book: ContactBook, contact: Contact): ContactBook {
+  const contacts = book.contacts.concat([contact]);
+  return { contacts };
+}
+
+/** Returns a new book without any contact whose id equals `id`. */
+export function removeContact(book: ContactBook, id: string): ContactBook {
+  const remaining = book.contacts.filter((entry) => entry.id !== id);
+  return { contacts: remaining };
+}
+
+// ── Display ────────────────────────────────────────────────────────────────
+
+/** Returns the contact's name exactly as stored. */
+export function displayName(contact: Contact): string {
+  const { name } = contact;
+  return name;
+}
+
+/**
+ * Formats a name as "Last, Rest", e.g. "Ada Lovelace" → "Lovelace, Ada".
+ * Names with fewer than two words are returned unchanged.
+ */
+export function lastFirstDisplay(contact: Contact): string {
+  const words = contact.name.trim().split(/\s+/);
+  if (words.length >= 2) {
+    // pop() removes the surname, leaving the remaining words for the second half.
+    const surname = words.pop();
+    return `${surname}, ${words.join(" ")}`;
+  }
+  return contact.name;
+}
+
+/**
+ * Upper-cased initials: first letter of the first word plus first letter of
+ * the last word, or a single letter for one-word names ("" for an empty name).
+ */
+export function initials(contact: Contact): string {
+  // split() always yields at least one element, so words[0] is safe.
+  const words = contact.name.trim().split(/\s+/);
+  const leading = words[0].charAt(0).toUpperCase();
+  if (words.length === 1) {
+    return leading;
+  }
+  const trailing = words[words.length - 1].charAt(0).toUpperCase();
+  return leading + trailing;
+}
+
+/** Renders `Name <email>`, prefixed with "★ " for starred contacts. */
+export function formatLine(contact: Contact): string {
+  const entry = `${contact.name} <${contact.email}>`;
+  return contact.starred ? `★ ${entry}` : entry;
+}
+
+// ── Search & filter ────────────────────────────────────────────────────────
+
+/** Returns the first contact whose id equals `id`, or null when there is none. */
+export function findById(book: ContactBook, id: string): Contact | null {
+  for (const candidate of book.contacts) {
+    if (candidate.id === id) {
+      return candidate;
+    }
+  }
+  return null;
+}
+
+/**
+ * Case-insensitive substring search over contact names.
+ * A blank query matches nothing.
+ */
+export function searchByName(book: ContactBook, query: string): Contact[] {
+  const needle = query.trim().toLowerCase();
+  if (needle.length === 0) {
+    return [];
+  }
+  return book.contacts.filter(({ name }) => name.toLowerCase().includes(needle));
+}
+
+/**
+ * Case-insensitive substring search over contact emails.
+ * A blank query matches nothing.
+ */
+export function searchByEmail(book: ContactBook, query: string): Contact[] {
+  const needle = query.trim().toLowerCase();
+  if (needle.length === 0) {
+    return [];
+  }
+  return book.contacts.filter(({ email }) => email.toLowerCase().includes(needle));
+}
+
+/** Returns the contacts whose `starred` flag is set, in book order. */
+export function starredContacts(book: ContactBook): Contact[] {
+  return book.contacts.filter(({ starred }) => starred);
+}
+
+/** Returns the contacts that carry exactly `tag` (case-sensitive match). */
+export function contactsByTag(book: ContactBook, tag: string): Contact[] {
+  return book.contacts.filter(({ tags }) => tags.includes(tag));
+}
+
+// ── Sorting ────────────────────────────────────────────────────────────────
+
+/**
+ * Returns a new book sorted by lower-cased full name using plain
+ * string comparison; the input book is not mutated.
+ */
+export function sortByName(book: ContactBook): ContactBook {
+  const byLowerName = (left: Contact, right: Contact): number => {
+    const l = left.name.toLowerCase();
+    const r = right.name.toLowerCase();
+    return l < r ? -1 : l > r ? 1 : 0;
+  };
+  const contacts = book.contacts.slice();
+  contacts.sort(byLowerName);
+  return { contacts };
+}
+
+/**
+ * Returns a new book sorted by the lower-cased last word of each name
+ * using plain string comparison; the input book is not mutated.
+ */
+export function sortByLastName(book: ContactBook): ContactBook {
+  const lastWord = (c: Contact): string => {
+    // split() always yields at least one element, so the index is valid.
+    const words = c.name.trim().split(/\s+/);
+    return words[words.length - 1].toLowerCase();
+  };
+  // Compute each key once, sort the pairs, then unwrap the contacts.
+  const keyed = book.contacts.map((contact) => ({ contact, key: lastWord(contact) }));
+  keyed.sort((x, y) => (x.key < y.key ? -1 : x.key > y.key ? 1 : 0));
+  return { contacts: keyed.map(({ contact }) => contact) };
+}
+
+// ── CSV import/export ──────────────────────────────────────────────────────
+
+/**
+ * Serializes the book as CSV with the header `name,email,phone,tags,starred`.
+ *
+ * Tags are joined with "|". Any field containing a comma, double quote, CR
+ * or LF is wrapped in double quotes with embedded quotes doubled (RFC 4180),
+ * so such values no longer shift or split columns. Fields without those
+ * characters are written exactly as before.
+ *
+ * @param book - Contact book to export.
+ * @returns CSV text with "\n" row separators and no trailing newline.
+ */
+export function toCsv(book: ContactBook): string {
+  const escapeField = (value: string): string =>
+    /[",\r\n]/.test(value) ? `"${value.replace(/"/g, '""')}"` : value;
+
+  const rows = ["name,email,phone,tags,starred"];
+  for (const c of book.contacts) {
+    const fields = [c.name, c.email, c.phone, c.tags.join("|"), c.starred ? "true" : "false"];
+    rows.push(fields.map(escapeField).join(","));
+  }
+  return rows.join("\n");
+}
+
+/**
+ * Parses CSV text (header row first) into a ContactBook.
+ *
+ * Columns are read positionally as name, email, phone, tags ("|"-separated)
+ * and starred ("true" or anything else). Blank lines are skipped, and each
+ * row gets the id `csv-<n>`, where n is its 1-based position among the
+ * non-blank data lines. Both "\n" and "\r\n" line endings are accepted.
+ *
+ * Double-quoted fields are supported: commas inside quotes do not split the
+ * field and `""` decodes to one quote. Parsing is line-based, so a quoted
+ * field that spans several lines is not supported.
+ *
+ * @param csv - CSV text including the header line.
+ * @returns The parsed book; empty when there is no data row.
+ */
+export function fromCsv(csv: string): ContactBook {
+  // Splits one CSV line into fields, honoring double-quoted sections.
+  const splitLine = (line: string): string[] => {
+    const fields: string[] = [];
+    let current = "";
+    let inQuotes = false;
+    for (let pos = 0; pos < line.length; pos++) {
+      const ch = line[pos];
+      if (inQuotes) {
+        if (ch !== '"') {
+          current += ch;
+        } else if (line[pos + 1] === '"') {
+          current += '"'; // doubled quote → literal quote
+          pos++;
+        } else {
+          inQuotes = false;
+        }
+      } else if (ch === '"' && current === "") {
+        // A quote only opens a quoted section at the start of a field.
+        inQuotes = true;
+      } else if (ch === ",") {
+        fields.push(current);
+        current = "";
+      } else {
+        current += ch;
+      }
+    }
+    fields.push(current);
+    return fields;
+  };
+
+  const lines = csv.split(/\r?\n/).filter((l) => l.trim() !== "");
+  const contacts: Contact[] = [];
+  // Index 0 is the header row, so data starts at 1.
+  for (let i = 1; i < lines.length; i++) {
+    const [name, email, phone, tagsCsv, starredStr] = splitLine(lines[i]);
+    contacts.push({
+      id: `csv-${i}`,
+      name: name?.trim() ?? "",
+      email: email?.trim().toLowerCase() ?? "",
+      phone: phone?.trim() ?? "",
+      tags: tagsCsv ? tagsCsv.split("|").filter((t) => t !== "") : [],
+      starred: starredStr?.trim() === "true",
+      createdAt: new Date().toISOString(),
+    });
+  }
+  return { contacts };
+}
+
+// ── Deduplication ──────────────────────────────────────────────────────────
+
+/**
+ * Keeps the first contact for each lower-cased name and drops later
+ * duplicates, preserving the original order.
+ */
+export function dedupeByName(book: ContactBook): ContactBook {
+  const seenNames = new Set<string>();
+  const unique = book.contacts.filter(({ name }) => {
+    const key = name.toLowerCase();
+    if (seenNames.has(key)) {
+      return false;
+    }
+    seenNames.add(key);
+    return true;
+  });
+  return { contacts: unique };
+}
+
+// ── Validation ─────────────────────────────────────────────────────────────
+
+/**
+ * Checks that the name is non-blank and the email contains "@".
+ * Returns the list of problems found (name first); empty when valid.
+ */
+export function validateContact(contact: Contact): string[] {
+  const nameMissing = contact.name.trim() === "";
+  const emailMalformed = !contact.email.includes("@");
+  return [
+    ...(nameMissing ? ["name is required"] : []),
+    ...(emailMalformed ? ["email must contain @"] : []),
+  ];
+}
+
+// ── Rendering helpers ──────────────────────────────────────────────────────
+
+/**
+ * Renders the book as a bulleted directory, one "Last, First — email"
+ * line per contact, ordered by last name.
+ */
+export function renderDirectory(book: ContactBook): string {
+  const { contacts } = sortByLastName(book);
+  const lines = contacts.map((entry) => `  • ${lastFirstDisplay(entry)} — ${entry.email}`);
+  return lines.join("\n");
+}
+
+/**
+ * Builds a greeting from the contact's first name, falling back to
+ * "Hello, there!" when the name is blank.
+ *
+ * The name is trimmed and split on any run of whitespace, matching how
+ * the other name helpers in this file tokenize names. Splitting on a
+ * single space made a leading space yield "there" and a tab-separated
+ * name yield the whole string.
+ *
+ * @param contact - Contact to greet.
+ * @returns `Hello, <first>!` or `Hello, there!`.
+ */
+export function greetingFor(contact: Contact): string {
+  const [first] = contact.name.trim().split(/\s+/);
+  return `Hello, ${first || "there"}!`;
+}
+
+// ── Merge ──────────────────────────────────────────────────────────────────
+
+/**
+ * Concatenates two books (a's contacts first) and removes name
+ * duplicates via `dedupeByName`.
+ */
+export function mergeBooks(a: ContactBook, b: ContactBook): ContactBook {
+  const combined: ContactBook = { contacts: a.contacts.concat(b.contacts) };
+  return dedupeByName(combined);
+}
--- a/src/__tests__/evals/fixtures/event_handler.ts
+++ b/src/__tests__/evals/fixtures/event_handler.ts
+// event_handler.ts — dispatches application events to their handlers
+
+import { createLogger } from "./logger";
+
+const logger = createLogger("event-handler");
+
+export type EventType =
+  | "user.created"
+  | "user.updated"
+  | "user.deleted"
+  | "user.deactivated"
+  | "user.role_changed"
+  | "project.created"
+  | "project.updated"
+  | "project.archived"
+  | "project.deleted"
+  | "project.member_added"
+  | "project.member_removed"
+  | "payment.succeeded"
+  | "payment.failed"
+  | "payment.refunded"
+  | "subscription.created"
+  | "subscription.cancelled"
+  | "subscription.renewed";
+
+export interface AppEvent {
+  type: EventType;
+  payload: Record<string, unknown>;
+  timestamp: string;
+  correlationId: string;
+  sourceService: string;
+}
+
+// ── Individual handlers ────────────────────────────────────────────────────
+
+/** Logs that a welcome email is being sent to `payload.email`; the send itself is elided. */
+async function notifyUserCreated(payload: Record<string, unknown>): Promise<void> {
+  const { email } = payload;
+  logger.info(`Sending welcome email to ${email}`);
+  // implementation elided
+}
+
+/** Logs a search-index sync for user `payload.id`; the sync itself is elided. */
+async function syncUserToSearchIndex(payload: Record<string, unknown>): Promise<void> {
+  const { id } = payload;
+  logger.info(`Syncing user ${id} to search index`);
+  // implementation elided
+}
+
+/** Logs session revocation for user `payload.id`; the revocation itself is elided. */
+async function revokeUserSessions(payload: Record<string, unknown>): Promise<void> {
+  const { id } = payload;
+  logger.info(`Revoking all sessions for user ${id}`);
+  // implementation elided
+}
+
+/** Logs a role-change notice (old role → new role) for user `payload.id`; delivery is elided. */
+async function notifyUserRoleChanged(payload: Record<string, unknown>): Promise<void> {
+  const { id, oldRole, newRole } = payload;
+  logger.info(`Notifying user ${id} of role change: ${oldRole} → ${newRole}`);
+  // implementation elided
+}
+
+/** Logs a team notification for new project `payload.id`; delivery is elided. */
+async function notifyProjectCreated(payload: Record<string, unknown>): Promise<void> {
+  const { id } = payload;
+  logger.info(`Notifying team about new project ${id}`);
+  // implementation elided
+}
+
+/** Logs asset archival for project `payload.id`; the archival itself is elided. */
+async function archiveProjectAssets(payload: Record<string, unknown>): Promise<void> {
+  const { id } = payload;
+  logger.info(`Archiving assets for project ${id}`);
+  // implementation elided
+}
+
+/** Logs resource cleanup for deleted project `payload.id`; the cleanup itself is elided. */
+async function cleanupProjectResources(payload: Record<string, unknown>): Promise<void> {
+  const { id } = payload;
+  logger.info(`Cleaning up resources for deleted project ${id}`);
+  // implementation elided
+}
+
+/** Logs a "member added" notice for `payload.memberId` on `payload.projectId`; delivery is elided. */
+async function notifyProjectMemberAdded(payload: Record<string, unknown>): Promise<void> {
+  const { memberId, projectId } = payload;
+  logger.info(`Notifying user ${memberId} they were added to project ${projectId}`);
+  // implementation elided
+}
+
+/** Logs a "member removed" notice for `payload.memberId` on `payload.projectId`; delivery is elided. */
+async function notifyProjectMemberRemoved(payload: Record<string, unknown>): Promise<void> {
+  const { memberId, projectId } = payload;
+  logger.info(`Notifying user ${memberId} they were removed from project ${projectId}`);
+  // implementation elided
+}
+
+/** Logs the recording of payment `payload.transactionId`; persistence is elided. */
+async function recordPaymentSuccess(payload: Record<string, unknown>): Promise<void> {
+  const { transactionId } = payload;
+  logger.info(`Recording payment ${transactionId}`);
+  // implementation elided
+}
+
+/** Logs a warning that payment failed for order `payload.orderId`; follow-up handling is elided. */
+async function handlePaymentFailure(payload: Record<string, unknown>): Promise<void> {
+  const { orderId } = payload;
+  logger.warn(`Payment failed for order ${orderId}`);
+  // implementation elided
+}
+
+/** Logs processing of refund `payload.refundId` for `payload.transactionId`; the refund itself is elided. */
+async function processRefund(payload: Record<string, unknown>): Promise<void> {
+  const { refundId, transactionId } = payload;
+  logger.info(`Processing refund ${refundId} for transaction ${transactionId}`);
+  // implementation elided
+}
+
+/** Logs feature provisioning for subscription `payload.subscriptionId`; provisioning is elided. */
+async function provisionSubscriptionFeatures(payload: Record<string, unknown>): Promise<void> {
+  const { subscriptionId } = payload;
+  logger.info(`Provisioning features for subscription ${subscriptionId}`);
+  // implementation elided
+}
+
+/** Logs feature deprovisioning for cancelled subscription `payload.subscriptionId`; the work is elided. */
+async function deprovisionSubscriptionFeatures(payload: Record<string, unknown>): Promise<void> {
+  const { subscriptionId } = payload;
+  logger.info(`Deprovisioning features for cancelled subscription ${subscriptionId}`);
+  // implementation elided
+}
+
+/** Logs the access extension for renewed subscription `payload.subscriptionId`; the work is elided. */
+async function extendSubscriptionAccess(payload: Record<string, unknown>): Promise<void> {
+  const { subscriptionId } = payload;
+  logger.info(`Extending access for renewed subscription ${subscriptionId}`);
+  // implementation elided
+}
+
+/** Signature shared by every entry in the event dispatch table. */
+type EventHandlerFn = (payload: Record<string, unknown>) => Promise<void>;
+
+/**
+ * Dispatch table mapping each event type to the steps that handle it.
+ * Typing it as Record<EventType, …> makes the compiler reject a missing
+ * entry whenever a new EventType member is added.
+ */
+const EVENT_HANDLERS: Record<EventType, EventHandlerFn> = {
+  "user.created": async (payload) => {
+    await notifyUserCreated(payload);
+    await syncUserToSearchIndex(payload);
+  },
+  "user.updated": syncUserToSearchIndex,
+  "user.deleted": async (payload) => {
+    logger.info(`User ${payload.id} deleted — cleaning up`);
+    await revokeUserSessions(payload);
+  },
+  "user.deactivated": async (payload) => {
+    logger.info(`User ${payload.id} deactivated`);
+    await revokeUserSessions(payload);
+  },
+  "user.role_changed": notifyUserRoleChanged,
+  "project.created": notifyProjectCreated,
+  "project.updated": async (payload) => {
+    logger.info(`Project ${payload.id} updated`);
+  },
+  "project.archived": archiveProjectAssets,
+  "project.deleted": cleanupProjectResources,
+  "project.member_added": notifyProjectMemberAdded,
+  "project.member_removed": notifyProjectMemberRemoved,
+  "payment.succeeded": recordPaymentSuccess,
+  "payment.failed": handlePaymentFailure,
+  "payment.refunded": processRefund,
+  "subscription.created": provisionSubscriptionFeatures,
+  "subscription.cancelled": deprovisionSubscriptionFeatures,
+  "subscription.renewed": extendSubscriptionAccess,
+};
+
+/**
+ * Routes an application event to its handler via EVENT_HANDLERS and, once
+ * the handler resolves, increments that type's entry in `handledCounts`.
+ * Previously nothing wrote to `handledCounts`, so the metrics getters in
+ * this module always reported zero.
+ *
+ * An event whose type has no table entry is logged as a warning and
+ * otherwise ignored. Errors thrown by a handler propagate to the caller
+ * and the event is not counted.
+ *
+ * @param event - The event to dispatch.
+ */
+export async function handleEvent(event: AppEvent): Promise<void> {
+  logger.info(`Handling event ${event.type} (correlation: ${event.correlationId}, source: ${event.sourceService})`);
+
+  // Own-property check so a malformed type such as "constructor" cannot
+  // resolve to an inherited Object.prototype member.
+  if (!Object.prototype.hasOwnProperty.call(EVENT_HANDLERS, event.type)) {
+    logger.warn(`Unknown event type: ${event.type}`);
+    return;
+  }
+
+  await EVENT_HANDLERS[event.type](event.payload);
+
+  // Count only after the handler resolved, so failures are not reported as handled.
+  handledCounts.set(event.type, (handledCounts.get(event.type) ?? 0) + 1);
+}
+
+// ── Batch processing ───────────────────────────────────────────────────────
+
+export interface EventBatch {
+  events: AppEvent[];
+  batchId: string;
+  enqueuedAt: string;
+}
+
+/** Outcome summary returned by `handleEventBatch`. */
+export interface BatchResult {
+  /** Copied from the processed batch's `batchId`. */
+  batchId: string;
+  /** Number of events in the batch. */
+  total: number;
+  /** Events for which `handleEvent` resolved. */
+  succeeded: number;
+  /** Events for which `handleEvent` threw. */
+  failed: number;
+  /** One entry per failed event: its correlation id and the error's message (or its string form). */
+  errors: Array<{ correlationId: string; message: string }>;
+}
+
+/**
+ * Runs every event of a batch through `handleEvent`, one at a time and in
+ * order. A failing event is logged and recorded in the result; the
+ * remaining events are still processed.
+ *
+ * @param batch - Batch to process.
+ * @returns Success/failure counts plus one error entry per failed event.
+ */
+export async function handleEventBatch(batch: EventBatch): Promise<BatchResult> {
+  const { batchId, events } = batch;
+  logger.info(`Processing batch ${batchId} with ${events.length} event(s)`);
+
+  const summary: BatchResult = {
+    batchId,
+    total: events.length,
+    succeeded: 0,
+    failed: 0,
+    errors: [],
+  };
+
+  for (const current of events) {
+    try {
+      await handleEvent(current);
+      summary.succeeded += 1;
+    } catch (failure) {
+      const message = failure instanceof Error ? failure.message : String(failure);
+      summary.failed += 1;
+      summary.errors.push({ correlationId: current.correlationId, message });
+      logger.error(
+        `Batch ${batchId}: event ${current.correlationId} (${current.type}) failed`,
+        failure,
+      );
+    }
+  }
+
+  logger.info(
+    `Batch ${batchId} complete — succeeded: ${summary.succeeded}, failed: ${summary.failed}`,
+  );
+  return summary;
+}
+
+// ── Dead-letter retry ──────────────────────────────────────────────────────
+
+/** A previously failed event queued for another attempt via `retryDeadLetter`. */
+interface DeadLetterEntry {
+  /** The event that is passed to `handleEvent` again on retry. */
+  event: AppEvent;
+  // NOTE(review): failedAt and reason are not read in this file — presumably diagnostic metadata; confirm.
+  failedAt: string;
+  reason: string;
+  /** Attempts so far; entries at or above MAX_RETRY_ATTEMPTS are skipped as exhausted. */
+  attempts: number;
+}
+
+const MAX_RETRY_ATTEMPTS = 3;
+
+/**
+ * Re-dispatches dead-letter entries through `handleEvent`.
+ *
+ * Entries that already reached MAX_RETRY_ATTEMPTS are logged and counted as
+ * exhausted without being retried. A retry that throws is logged and is
+ * counted in neither total.
+ *
+ * @param entries - Dead-letter entries to process, in order.
+ * @returns `retried`: entries whose retry succeeded; `exhausted`: entries skipped.
+ */
+export async function retryDeadLetter(
+  entries: DeadLetterEntry[],
+): Promise<{ retried: number; exhausted: number }> {
+  const tally = { retried: 0, exhausted: 0 };
+
+  for (const { event, attempts } of entries) {
+    const outOfAttempts = attempts >= MAX_RETRY_ATTEMPTS;
+    if (outOfAttempts) {
+      logger.warn(
+        `Dead-letter entry for ${event.correlationId} exhausted (${attempts} attempts)`,
+      );
+      tally.exhausted += 1;
+      continue;
+    }
+
+    try {
+      await handleEvent(event);
+      tally.retried += 1;
+      logger.info(`Retried dead-letter event ${event.correlationId} successfully`);
+    } catch (failure) {
+      logger.error(
+        `Retry failed for dead-letter event ${event.correlationId}`,
+        failure,
+      );
+    }
+  }
+
+  return tally;
+}
+
+// ── Metrics ────────────────────────────────────────────────────────────────
+
+const handledCounts = new Map<EventType, number>();
+
+/** Returns the count stored in `handledCounts` for `type`, or 0 when there is no entry. */
+export function getHandledCount(type: EventType): number {
+  const recorded = handledCounts.get(type);
+  return recorded ?? 0;
+}
+
+/** Empties the `handledCounts` map and logs that the counts were reset. */
+export function resetHandledCounts(): void {
+  const resetNotice = "Event handled counts reset";
+  handledCounts.clear();
+  logger.info(resetNotice);
+}
+
+/** Returns a plain-object snapshot of `handledCounts`, keyed by event type. */
+export function getAllHandledCounts(): Record<string, number> {
+  const snapshot: Record<string, number> = {};
+  for (const [type, count] of handledCounts) {
+    snapshot[type] = count;
+  }
+  return snapshot;
+}
--- a/src/__tests__/evals/fixtures/fetch_client.ts
+++ b/src/__tests__/evals/fixtures/fetch_client.ts
+// fetch_client.ts — authenticated fetch wrapper used by all service layers
+
+import { createLogger } from "./logger";
+import { getAuthToken } from "./auth";
+
+const logger = createLogger("fetch-client");
+
+const BASE_URL = process.env.SERVICE_BASE_URL ?? "https://api.internal.example.com";
+const DEFAULT_TIMEOUT_MS = 8_000;
+
+/** Per-call options accepted by `serviceRequest`. */
+export interface FetchClientOptions {
+  /** HTTP verb; `serviceRequest` falls back to "GET". */
+  method?: "GET" | "POST" | "PUT" | "PATCH" | "DELETE";
+  /** Request payload; `serviceRequest` JSON-serializes it unless it is null or undefined. */
+  body?: unknown;
+  /** Extra headers, spread after the default Content-Type and Authorization headers. */
+  headers?: Record<string, string>;
+  /** Abort deadline in milliseconds; `serviceRequest` falls back to DEFAULT_TIMEOUT_MS. */
+  timeoutMs?: number;
+  // NOTE(review): `retries` is not read by `serviceRequest` in this file — confirm whether retry support is planned.
+  retries?: number;
+}
+
+export interface ServiceError {
+  code: string;
+  message: string;
+  status: number;
+}
+
+export interface PagedResponse<T> {
+  items: T[];
+  total: number;
+  page: number;
+  hasMore: boolean;
+}
+
+export interface UserProfile {
+  id: string;
+  name: string;
+  email: string;
+  role: string;
+  avatarUrl: string | null;
+  createdAt: string;
+}
+
+export interface Project {
+  id: string;
+  name: string;
+  status: string;
+  ownerId: string;
+  createdAt: string;
+}
+
+export interface ProjectMember {
+  userId: string;
+  name: string;
+  email: string;
+  role: string;
+}
+
+export interface Subscription {
+  id: string;
+  plan: string;
+  status: string;
+  renewsAt: string | null;
+}
+
+/**
+ * Sends an authenticated JSON request to the internal service API.
+ *
+ * The request carries a bearer token from `getAuthToken()` and is aborted
+ * if no response arrives within `timeoutMs`. The abort timer is cleared in
+ * a `finally` block, so it no longer stays armed when `fetch` itself rejects.
+ *
+ * @param path - Path appended verbatim to BASE_URL.
+ * @param options - Method, body, extra headers and timeout. `options.retries`
+ *   is not read by this function.
+ * @returns The parsed JSON body, or `undefined` for a 204 No Content
+ *   response (which has no body to parse).
+ * @throws An Error that also carries the ServiceError fields (`code`,
+ *   `message`, `status`) on non-2xx responses; `code` and `message` come from
+ *   the JSON error body when present. Errors raised by `fetch` (including
+ *   the timeout abort) propagate unchanged.
+ */
+export async function serviceRequest<T>(
+  path: string,
+  options: FetchClientOptions = {},
+): Promise<T> {
+  const { method = "GET", body, headers = {}, timeoutMs = DEFAULT_TIMEOUT_MS } = options;
+
+  const token = await getAuthToken();
+  const controller = new AbortController();
+  const timer = setTimeout(() => controller.abort(), timeoutMs);
+
+  let response: Response;
+  try {
+    response = await fetch(`${BASE_URL}${path}`, {
+      method,
+      headers: {
+        "Content-Type": "application/json",
+        Authorization: `Bearer ${token}`,
+        ...headers,
+      },
+      body: body != null ? JSON.stringify(body) : undefined,
+      signal: controller.signal,
+    });
+  } finally {
+    clearTimeout(timer);
+  }
+
+  if (!response.ok) {
+    // The error body may be missing or not JSON; fall back to the status text.
+    const errorBody = await response.json().catch(() => ({}));
+    // A real Error gives callers a stack trace and a usable `.message`
+    // while still exposing the ServiceError fields.
+    throw Object.assign(new Error(errorBody?.message ?? response.statusText), {
+      code: errorBody?.code ?? "UNKNOWN_ERROR",
+      status: response.status,
+    }) satisfies ServiceError;
+  }
+
+  // 204 carries no body; response.json() would reject on the empty payload.
+  if (response.status === 204) {
+    return undefined as T;
+  }
+
+  return response.json() as Promise<T>;
+}
+
+// ── Verb wrappers ──────────────────────────────────────────────────────────
+
+/** Issues a GET for `path`, with optional extra headers, via `serviceRequest`. */
+export async function getResource<T>(path: string, headers?: Record<string, string>): Promise<T> {
+  const options: FetchClientOptions = { method: "GET", headers };
+  return await serviceRequest<T>(path, options);
+}
+
+/** Issues a POST to `path` with `body` via `serviceRequest`. */
+export async function postResource<T>(path: string, body: unknown): Promise<T> {
+  const options: FetchClientOptions = { method: "POST", body };
+  return await serviceRequest<T>(path, options);
+}
+
+/** Issues a PUT to `path` with `body` via `serviceRequest`. */
+export async function putResource<T>(path: string, body: unknown): Promise<T> {
+  const options: FetchClientOptions = { method: "PUT", body };
+  return await serviceRequest<T>(path, options);
+}
+
+/** Issues a PATCH to `path` with `body` via `serviceRequest`. */
+export async function patchResource<T>(path: string, body: unknown): Promise<T> {
+  const options: FetchClientOptions = { method: "PATCH", body };
+  return await serviceRequest<T>(path, options);
+}
+
+/** Issues a DELETE for `path` via `serviceRequest`. */
+export async function deleteResource(path: string): Promise<void> {
+  const options: FetchClientOptions = { method: "DELETE" };
+  return await serviceRequest<void>(path, options);
+}
+
+// ── User resources ─────────────────────────────────────────────────────────
+
+/** GETs `/users/{userId}`. */
+export async function getUserProfile(userId: string): Promise<UserProfile> {
+  const path = `/users/${userId}`;
+  return getResource<UserProfile>(path);
+}
+
+/** PATCHes `/users/{userId}` with the given field updates. */
+export async function updateUserProfile(
+  userId: string,
+  updates: { name?: string; email?: string; avatarUrl?: string },
+): Promise<UserProfile> {
+  const path = `/users/${userId}`;
+  return patchResource<UserProfile>(path, updates);
+}
+
+/** DELETEs `/users/{userId}`. */
+export async function deleteUserAccount(userId: string): Promise<void> {
+  const path = `/users/${userId}`;
+  return deleteResource(path);
+}
+
+/** GETs one page of `/users`; defaults to page 1 with 20 items per page. */
+export async function listUsers(
+  page = 1,
+  limit = 20,
+): Promise<PagedResponse<UserProfile>> {
+  const path = `/users?page=${page}&limit=${limit}`;
+  return getResource<PagedResponse<UserProfile>>(path);
+}
+
+/** GETs `/users/{userId}/subscription`. */
+export async function getUserSubscription(userId: string): Promise<Subscription | null> {
+  const path = `/users/${userId}/subscription`;
+  return getResource<Subscription | null>(path);
+}
+
+// ── Project resources ──────────────────────────────────────────────────────
+
+/** GETs `/workspaces/{workspaceId}/projects`. */
+export async function getProjectList(workspaceId: string): Promise<Project[]> {
+  const path = `/workspaces/${workspaceId}/projects`;
+  return getResource<Project[]>(path);
+}
+
+/** POSTs `payload` to `/workspaces/{workspaceId}/projects`. */
+export async function createProject(
+  workspaceId: string,
+  payload: { name: string; template?: string },
+): Promise<Project> {
+  const path = `/workspaces/${workspaceId}/projects`;
+  return postResource<Project>(path, payload);
+}
+
+/** GETs `/projects/{projectId}`. */
+export async function getProject(projectId: string): Promise<Project> {
+  const path = `/projects/${projectId}`;
+  return getResource<Project>(path);
+}
+
+/** PATCHes `/projects/{projectId}` with the given field updates. */
+export async function updateProject(
+  projectId: string,
+  updates: { name?: string; status?: string },
+): Promise<Project> {
+  const path = `/projects/${projectId}`;
+  return patchResource<Project>(path, updates);
+}
+
+/** POSTs an empty object to `/projects/{projectId}/archive`. */
+export async function archiveProject(projectId: string): Promise<void> {
+  const path = `/projects/${projectId}/archive`;
+  return postResource<void>(path, {});
+}
+
+/** DELETEs `/projects/{projectId}`. */
+export async function deleteProject(projectId: string): Promise<void> {
+  const path = `/projects/${projectId}`;
+  return deleteResource(path);
+}
+
+// ── Project membership ─────────────────────────────────────────────────────
+
+/** GETs `/projects/{projectId}/members`. */
+export async function getProjectMembers(projectId: string): Promise<ProjectMember[]> {
+  const path = `/projects/${projectId}/members`;
+  return getResource<ProjectMember[]>(path);
+}
+
+/** POSTs `{ userId, role }` to `/projects/{projectId}/members`. */
+export async function addProjectMember(
+  projectId: string,
+  userId: string,
+  role: string,
+): Promise<void> {
+  const path = `/projects/${projectId}/members`;
+  const membership = { userId, role };
+  return postResource<void>(path, membership);
+}
+
+/** DELETEs `/projects/{projectId}/members/{userId}`. */
+export async function removeProjectMember(
+  projectId: string,
+  userId: string,
+): Promise<void> {
+  const path = `/projects/${projectId}/members/${userId}`;
+  return deleteResource(path);
+}
+
+/** PATCHes `/projects/{projectId}/members/{userId}` with `{ role }`. */
+export async function updateProjectMemberRole(
+  projectId: string,
+  userId: string,
+  role: string,
+): Promise<ProjectMember> {
+  const path = `/projects/${projectId}/members/${userId}`;
+  return patchResource<ProjectMember>(path, { role });
+}
+
+// ── Workspace resources ────────────────────────────────────────────────────
+
+/** GETs `/workspaces/{workspaceId}`. */
+export async function getWorkspace(workspaceId: string): Promise<{ id: string; name: string; plan: string }> {
+  const path = `/workspaces/${workspaceId}`;
+  return getResource(path);
+}
+
+/** PATCHes `/workspaces/{workspaceId}` with the given field updates. */
+export async function updateWorkspace(
+  workspaceId: string,
+  updates: { name?: string },
+): Promise<{ id: string; name: string }> {
+  const path = `/workspaces/${workspaceId}`;
+  return patchResource(path, updates);
+}
+
+// ── Billing resources ──────────────────────────────────────────────────────
+
+export interface Invoice {
+  id: string;
+  amount: number;
+  currency: string;
+  status: "draft" | "open" | "paid" | "void";
+  dueDate: string;
+  createdAt: string;
+}
+
+export interface PaymentMethod {
+  id: string;
+  type: "card" | "bank_account";
+  last4: string;
+  expMonth?: number;
+  expYear?: number;
+  isDefault: boolean;
+}
+
+/** GETs `/workspaces/{workspaceId}/billing/invoices`. */
+export async function listInvoices(workspaceId: string): Promise<Invoice[]> {
+  const path = `/workspaces/${workspaceId}/billing/invoices`;
+  return getResource<Invoice[]>(path);
+}
+
+/** GETs `/billing/invoices/{invoiceId}`. */
+export async function getInvoice(invoiceId: string): Promise<Invoice> {
+  const path = `/billing/invoices/${invoiceId}`;
+  return getResource<Invoice>(path);
+}
+
+/**
+ * Downloads an invoice PDF as a Blob.
+ *
+ * Calls `fetch` directly rather than `serviceRequest` because the response
+ * is read with `blob()` instead of being parsed as JSON. The request is
+ * aborted if no response arrives within DEFAULT_TIMEOUT_MS; the abort timer
+ * is cleared in a `finally` block, so it no longer stays armed when `fetch`
+ * itself rejects.
+ *
+ * @param invoiceId - Invoice whose PDF is requested.
+ * @returns The response body as a Blob.
+ * @throws An Error that also carries the ServiceError fields (code
+ *   "DOWNLOAD_FAILED", `message`, `status`) on non-2xx responses. Errors
+ *   raised by `fetch` (including the timeout abort) propagate unchanged.
+ */
+export async function downloadInvoicePdf(invoiceId: string): Promise<Blob> {
+  const token = await getAuthToken();
+  const controller = new AbortController();
+  const timer = setTimeout(() => controller.abort(), DEFAULT_TIMEOUT_MS);
+
+  let response: Response;
+  try {
+    response = await fetch(`${BASE_URL}/billing/invoices/${invoiceId}/pdf`, {
+      headers: { Authorization: `Bearer ${token}` },
+      signal: controller.signal,
+    });
+  } finally {
+    clearTimeout(timer);
+  }
+
+  if (!response.ok) {
+    // A real Error gives callers a stack trace while keeping the ServiceError fields.
+    throw Object.assign(new Error("Failed to download invoice PDF"), {
+      code: "DOWNLOAD_FAILED",
+      status: response.status,
+    }) satisfies ServiceError;
+  }
+  return response.blob();
+}
+
+/** GETs `/workspaces/{workspaceId}/billing/payment-methods`. */
+export async function listPaymentMethods(workspaceId: string): Promise<PaymentMethod[]> {
+  const path = `/workspaces/${workspaceId}/billing/payment-methods`;
+  return getResource<PaymentMethod[]>(path);
+}
+
+/** POSTs `{ token }` to `/workspaces/{workspaceId}/billing/payment-methods`. */
+export async function addPaymentMethod(
+  workspaceId: string,
+  token: string,
+): Promise<PaymentMethod> {
+  const path = `/workspaces/${workspaceId}/billing/payment-methods`;
+  return postResource<PaymentMethod>(path, { token });
+}
+
+/** DELETEs `/workspaces/{workspaceId}/billing/payment-methods/{paymentMethodId}`. */
+export async function removePaymentMethod(
+  workspaceId: string,
+  paymentMethodId: string,
+): Promise<void> {
+  const path = `/workspaces/${workspaceId}/billing/payment-methods/${paymentMethodId}`;
+  return deleteResource(path);
+}
+
+/** POSTs an empty object to the payment method's `set-default` endpoint. */
+export async function setDefaultPaymentMethod(
+  workspaceId: string,
+  paymentMethodId: string,
+): Promise<void> {
+  const path = `/workspaces/${workspaceId}/billing/payment-methods/${paymentMethodId}/set-default`;
+  return postResource<void>(path, {});
+}
--- a/src/__tests__/evals/fixtures/order_math.ts
+++ b/src/__tests__/evals/fixtures/order_math.ts
+// order_math.ts — order total calculation and related helpers
+
+/** One purchasable line of an order. */
+interface LineItem {
+  sku: string;
+  quantity: number;
+  unitPrice: number;
+  weight: number;       // grams
+  taxable: boolean;
+  discountable: boolean;
+}
+
+/** A coupon that reduces either the merchandise amount or the shipping cost. */
+interface Coupon {
+  code: string;
+  type: "pct" | "fixed";
+  value: number;        // percent (0-100) or absolute USD
+  minimumOrderValue: number;
+  appliesToShipping: boolean;
+}
+
+/** A quoted shipping option. */
+interface ShippingRate {
+  carrier: string;
+  service: string;
+  rateUsd: number;
+  estimatedDays: number;
+}
+
+/** An order as consumed by the calculations in this file. */
+interface Order {
+  items: LineItem[];
+  discountPct: number; // used as a fraction by the math in this file (0.1 = 10% off), despite the name
+  taxRate: number; // likewise used as a fraction (0.08 = 8%)
+  shippingCost: number;
+  coupon?: Coupon;
+  currency: string;
+  notes?: string;
+}
+
+/** Itemised breakdown of an order's total. */
+interface OrderSummary {
+  subtotal: number;
+  discountAmount: number;
+  taxableAmount: number;
+  taxAmount: number;
+  shippingCost: number;
+  couponSavings: number;
+  total: number;
+  itemCount: number;
+}
+
+// ── Core calculation ───────────────────────────────────────────────────────
+
+/**
+ * Computes the order total: the item subtotal reduced by `discountPct`, taxed
+ * at `taxRate` (both applied as fractions), plus the flat `shippingCost`.
+ *
+ * Tax is applied to the whole discounted subtotal regardless of each item's
+ * `taxable` flag, and `order.coupon` is not consulted here.
+ */
+export function calculateTotal(order: Order): number {
+  const subtotal = subtotalOf(order.items);
+  const afterDiscount = subtotal * (1 - order.discountPct);
+  const withTax = afterDiscount * (1 + order.taxRate);
+  return withTax + order.shippingCost;
+}
+
+/** Sums `quantity × unitPrice` over every item; returns 0 for an empty list. */
+export function subtotalOf(items: LineItem[]): number {
+  let sum = 0;
+  for (const item of items) {
+    sum += item.quantity * item.unitPrice;
+  }
+  return sum;
+}
+
+/** Sums `quantity × unitPrice` over only the items flagged `discountable`. */
+export function discountableSubtotal(items: LineItem[]): number {
+  let sum = 0;
+  for (const item of items) {
+    if (item.discountable) {
+      sum += item.quantity * item.unitPrice;
+    }
+  }
+  return sum;
+}
+
+/** Sums `quantity × unitPrice` over only the items flagged `taxable`. */
+export function taxableSubtotal(items: LineItem[]): number {
+  let sum = 0;
+  for (const item of items) {
+    if (item.taxable) {
+      sum += item.quantity * item.unitPrice;
+    }
+  }
+  return sum;
+}
+
+/** Total shipment weight: sums `quantity × weight` (per-unit grams) over every item. */
+export function totalWeightGrams(items: LineItem[]): number {
+  let grams = 0;
+  for (const item of items) {
+    grams += item.quantity * item.weight;
+  }
+  return grams;
+}
+
+// ── Coupon helpers ─────────────────────────────────────────────────────────
+
+/**
+ * Returns the amount the order's coupon takes off the given subtotal.
+ *
+ * Yields 0 when there is no coupon, when the coupon targets shipping, or when
+ * `subtotal` is below the coupon's `minimumOrderValue`. A "fixed" coupon is
+ * capped at the subtotal; any other type is treated as a percentage of it.
+ *
+ * @param order - Order whose `coupon` (if any) is evaluated.
+ * @param subtotal - Amount the coupon is measured against.
+ */
+export function applyCoupon(order: Order, subtotal: number): number {
+  if (!order.coupon) return 0;
+  const { coupon } = order;
+  // Shipping-only coupons are applied via effectiveShippingCost; returning
+  // the coupon value here too would double-count the discount in buildSummary.
+  if (coupon.appliesToShipping) return 0;
+  if (subtotal < coupon.minimumOrderValue) return 0;
+  if (coupon.type === "fixed") return Math.min(coupon.value, subtotal);
+  return subtotal * (coupon.value / 100);
+}
+
+/** True only when the order has a coupon and that coupon is flagged `appliesToShipping`. */
+export function couponAppliestoShipping(order: Order): boolean {
+  return !!order.coupon?.appliesToShipping;
+}
+
+/**
+ * Shipping cost after any shipping coupon.
+ *
+ * Without a shipping coupon the base `shippingCost` is returned unchanged.
+ * A "fixed" coupon subtracts its value, floored at 0; any other type scales
+ * the base by `(1 - value / 100)`. The coupon's `minimumOrderValue` is not
+ * checked here.
+ */
+export function effectiveShippingCost(order: Order): number {
+  const base = order.shippingCost;
+  if (!order.coupon?.appliesToShipping) return base;
+  if (order.coupon.type === "fixed") return Math.max(0, base - order.coupon.value);
+  return base * (1 - order.coupon.value / 100);
+}
+
+// ── Breakdown & description ────────────────────────────────────────────────
+
+/**
+ * Builds an itemised breakdown of the order.
+ *
+ * - The discount is taken from the full subtotal.
+ * - Tax is charged only on taxable items, after the same discount fraction.
+ * - Coupon savings are measured against the discounted subtotal.
+ * - `shippingCost` in the result is the coupon-adjusted shipping cost.
+ * - `total` = discounted subtotal + tax − coupon savings + shipping.
+ * - `itemCount` sums quantities, so it counts units rather than line items.
+ */
+export function buildSummary(order: Order): OrderSummary {
+  const subtotal = subtotalOf(order.items);
+  const discountAmount = subtotal * order.discountPct;
+  const afterDiscount = subtotal - discountAmount;
+  const taxableAmount = taxableSubtotal(order.items) * (1 - order.discountPct);
+  const taxAmount = taxableAmount * order.taxRate;
+  const couponSavings = applyCoupon(order, afterDiscount);
+  const shipping = effectiveShippingCost(order);
+  const total = afterDiscount + taxAmount - couponSavings + shipping;
+
+  return {
+    subtotal,
+    discountAmount,
+    taxableAmount,
+    taxAmount,
+    shippingCost: shipping,
+    couponSavings,
+    total,
+    itemCount: order.items.reduce((n, i) => n + i.quantity, 0),
+  };
+}
+
+/**
+ * One-line human-readable description of the order.
+ * The count is the number of line items (`items.length`), not the summed
+ * quantity; the total is printed with two decimals and a `$` prefix.
+ */
+export function describeOrder(order: Order): string {
+  const total = calculateTotal(order);
+  return `Order of ${order.items.length} items, total $${total.toFixed(2)}`;
+}
+
+/**
+ * Sanity-checks an order.
+ *
+ * @throws Error when the order has no items, or when its computed total is negative.
+ */
+export function validateOrder(order: Order): void {
+  if (order.items.length === 0) {
+    throw new Error("calculateTotal failed: order has no items");
+  }
+  const total = calculateTotal(order);
+  if (total < 0) {
+    throw new Error(`calculateTotal returned negative value: ${total}`);
+  }
+}
+
+/** Compact summary: the number of line items (not units) and the order total. */
+export function summarizeOrder(order: Order): { items: number; total: number } {
+  return {
+    items: order.items.length,
+    total: calculateTotal(order),
+  };
+}
+
+/**
+ * Comparator on order totals: negative when `a` totals less than `b`,
+ * positive when more, 0 when equal — i.e. ascending order when used with sort.
+ */
+export function compareOrders(a: Order, b: Order): number {
+  return calculateTotal(a) - calculateTotal(b);
+}
+
+/**
+ * Returns the rate with the lowest `rateUsd`, or null for an empty list.
+ * The comparison is strict, so on a tie the earliest rate in the list wins.
+ */
+export function cheapestShipping(rates: ShippingRate[]): ShippingRate | null {
+  if (rates.length === 0) return null;
+  return rates.reduce((best, r) => (r.rateUsd < best.rateUsd ? r : best));
+}
+
+/** Formats a line item as `<sku> × <quantity> @ $<unitPrice>` with the price at two decimals. */
+export function formatOrderLine(item: LineItem): string {
+  return `${item.sku} × ${item.quantity} @ $${item.unitPrice.toFixed(2)}`;
+}
+
+/**
+ * Returns the items' subtotal, reduced by `pct` once it reaches `threshold`.
+ *
+ * @param items - Items to total.
+ * @param threshold - Minimum subtotal (inclusive) at which the discount applies.
+ * @param pct - Discount as a fraction (0.1 = 10% off).
+ */
+export function applyBulkDiscount(items: LineItem[], threshold: number, pct: number): number {
+  const sub = subtotalOf(items);
+  if (sub < threshold) return sub;
+  return sub * (1 - pct);
+}
+
+/**
+ * Estimated tax: the taxable items' subtotal, reduced by the order discount
+ * fraction, multiplied by `taxRate`.
+ */
+export function estimateTax(order: Order): number {
+  const taxable = taxableSubtotal(order.items);
+  const afterDiscount = taxable * (1 - order.discountPct);
+  return afterDiscount * order.taxRate;
+}
+
+/** True when at least one line item has exactly the given SKU. */
+export function orderContainsSku(order: Order, sku: string): boolean {
+  return order.items.some((i) => i.sku === sku);
+}
+
+/**
+ * Pre-discount, pre-tax amount for one SKU: sums `quantity × unitPrice` over
+ * every line item with that SKU. Returns 0 when the SKU is absent.
+ */
+export function totalForSku(order: Order, sku: string): number {
+  return order.items
+    .filter((i) => i.sku === sku)
+    .reduce((n, i) => n + i.quantity * i.unitPrice, 0);
+}
+
+/**
+ * Merges several orders into one new order object.
+ *
+ * Items are concatenated in input order and shipping costs are summed. Every
+ * other field (discount, tax rate, coupon, currency, notes) is copied from the
+ * first order; later orders' values for those fields are ignored.
+ *
+ * @throws Error when `orders` is empty.
+ */
+export function mergeOrders(orders: Order[]): Order {
+  if (orders.length === 0) {
+    throw new Error("calculateTotal failed: cannot merge empty order list");
+  }
+  const base = orders[0];
+  return {
+    ...base,
+    items: orders.flatMap((o) => o.items),
+    shippingCost: orders.reduce((n, o) => n + o.shippingCost, 0),
+  };
+}
+
+/**
+ * Logs one indented line per order to the console: the order's currency code
+ * followed by its total at two decimals.
+ */
+export function printOrderTotals(orders: Order[]): void {
+  for (const order of orders) {
+    const total = calculateTotal(order);
+    console.log(`  ${order.currency} ${total.toFixed(2)}`);
+  }
+}
+
+// ── Multi-currency support ─────────────────────────────────────────────────
+
+// Static conversion table keyed by currency code. USD is 1.0, so the values
+// appear to be units of each currency per 1 USD — confirm the source and
+// freshness of these numbers before relying on them.
+const EXCHANGE_RATES: Record<string, number> = {
+  USD: 1.0,
+  EUR: 0.92,
+  GBP: 0.79,
+  CAD: 1.36,
+  AUD: 1.53,
+  JPY: 154.0,
+};
+
+/**
+ * Converts the order total from the order's currency into `targetCurrency`
+ * using the static rate table: divide by the source rate, multiply by the
+ * target rate.
+ *
+ * A currency code missing from the table falls back to a rate of 1, so it is
+ * silently treated like the table's base currency instead of raising an error.
+ */
+export function convertTotal(order: Order, targetCurrency: string): number {
+  const total = calculateTotal(order);
+  const fromRate = EXCHANGE_RATES[order.currency] ?? 1;
+  const toRate = EXCHANGE_RATES[targetCurrency] ?? 1;
+  return (total / fromRate) * toRate;
+}
+
+/**
+ * Formats an amount as currency with `Intl.NumberFormat` in the en-US locale,
+ * showing at least two fraction digits.
+ *
+ * @param amount - Numeric amount to format.
+ * @param currency - Currency code handed straight to `Intl.NumberFormat`.
+ */
+export function formatCurrency(amount: number, currency: string): string {
+  return new Intl.NumberFormat("en-US", {
+    style: "currency",
+    currency,
+    minimumFractionDigits: 2,
+  }).format(amount);
+}
+
+// ── Refund calculations ────────────────────────────────────────────────────
+
+/**
+ * Computes the refund owed for the returned SKUs.
+ *
+ * Every line item whose SKU is listed counts as fully returned. The refund is
+ * the returned items' subtotal after the order discount, plus tax on the
+ * taxable returned items (also after discount). Shipping and coupons are not
+ * part of the calculation.
+ *
+ * @returns 0 when no items match or when the order's subtotal is 0.
+ */
+export function refundAmountForItems(
+  order: Order,
+  returnedSkus: string[],
+): number {
+  const returnedItems = order.items.filter((i) => returnedSkus.includes(i.sku));
+  if (returnedItems.length === 0) return 0;
+
+  const returnedSubtotal = subtotalOf(returnedItems);
+  const totalSubtotal = subtotalOf(order.items);
+  if (totalSubtotal === 0) return 0;
+
+  const discountFraction = order.discountPct;
+  const afterDiscount = returnedSubtotal * (1 - discountFraction);
+  const taxable = returnedItems.filter((i) => i.taxable);
+  const taxableSubtotalReturned = subtotalOf(taxable) * (1 - discountFraction);
+  const tax = taxableSubtotalReturned * order.taxRate;
+
+  return afterDiscount + tax;
+}
+
+/**
+ * True when every SKU in the order appears in `returnedSkus`.
+ * An order with no items yields true, because `every` is vacuously satisfied.
+ */
+export function isFullRefund(order: Order, returnedSkus: string[]): boolean {
+  const allSkus = order.items.map((i) => i.sku);
+  return allSkus.every((sku) => returnedSkus.includes(sku));
+}
+
+// ── Reporting helpers ──────────────────────────────────────────────────────
+
+/**
+ * Groups orders by their `currency` code and sums each group's order totals.
+ * No currency conversion takes place; each bucket stays in its own currency.
+ */
+export function totalsByCurrency(
+  orders: Order[],
+): Record<string, number> {
+  const totals: Record<string, number> = {};
+  for (const order of orders) {
+    const key = order.currency;
+    totals[key] = (totals[key] ?? 0) + calculateTotal(order);
+  }
+  return totals;
+}
+
+/**
+ * Mean order total across the given orders; 0 for an empty list.
+ * Totals are averaged as raw numbers, without regard to each order's currency.
+ */
+export function averageOrderValue(orders: Order[]): number {
+  if (orders.length === 0) return 0;
+  const sum = orders.reduce((n, o) => n + calculateTotal(o), 0);
+  return sum / orders.length;
+}
+
+/**
+ * Returns the `n` orders with the highest totals, largest first.
+ * Sorts a copy, so the caller's array is left untouched.
+ */
+export function topOrdersByValue(orders: Order[], n: number): Order[] {
+  return [...orders].sort((a, b) => calculateTotal(b) - calculateTotal(a)).slice(0, n);
+}
+
+/** Sum of all order totals; 0 for an empty list. Currencies are not converted. */
+export function totalRevenue(orders: Order[]): number {
+  return orders.reduce((sum, o) => sum + calculateTotal(o), 0);
+}
+
+/** Orders whose total is at or above `threshold` (the bound is inclusive). */
+export function ordersAboveThreshold(orders: Order[], threshold: number): Order[] {
+  return orders.filter((o) => calculateTotal(o) >= threshold);
+}
+
+/** Orders whose total is strictly below `threshold` (the bound is exclusive). */
+export function ordersBelowThreshold(orders: Order[], threshold: number): Order[] {
+  return orders.filter((o) => calculateTotal(o) < threshold);
+}
--- a/src/__tests__/evals/fixtures/order_processor.ts
+++ b/src/__tests__/evals/fixtures/order_processor.ts
+// order_processor.ts — core order processing logic
+
+import { createLogger } from "./logger";
+import type { Order, InventoryItem, PaymentMethod, ShippingAddress } from "./types";
+
+const logger = createLogger("order-processor");
+
+/** Outcome of an order operation: `orderId` accompanies success, `error` accompanies failure. */
+interface ProcessResult {
+  success: boolean;
+  orderId?: string;
+  error?: string;
+}
+
+/** Payment and shipment identifiers for a fulfilled order. Not referenced in the visible part of this file. */
+interface FulfillmentResult {
+  transactionId: string;
+  trackingNumber: string;
+  estimatedDelivery: string;
+}
+
+/** Result of a refund request against the payment gateway. */
+interface RefundResult {
+  refundId: string;
+  amount: number;
+  status: "pending" | "completed" | "failed";
+}
+
+// ── External service stubs ─────────────────────────────────────────────────
+
+/** Stub for the inventory lookup. As written it ignores `productId` and always resolves to null. */
+async function getInventory(productId: string): Promise<InventoryItem | null> {
+  // Implementation elided — hits the warehouse API
+  return null;
+}
+
+/** Stub for placing an inventory hold. As written it always resolves to true (hold granted). */
+async function reserveInventory(
+  productId: string,
+  quantity: number,
+): Promise<boolean> {
+  // Implementation elided — locks inventory for the order
+  return true;
+}
+
+/** Stub for releasing an inventory hold. As written it is a no-op. */
+async function releaseInventory(
+  productId: string,
+  quantity: number,
+): Promise<void> {
+  // Implementation elided — releases a previously-reserved hold
+}
+
+/** Stub for charging a payment method. As written it always resolves with a placeholder transaction id. */
+async function chargePayment(
+  method: PaymentMethod,
+  amount: number,
+): Promise<{ transactionId: string }> {
+  // Implementation elided — hits the payment gateway
+  return { transactionId: "txn_placeholder" };
+}
+
+/**
+ * Stub for refunding a transaction. As written it echoes `amount` back with a
+ * placeholder refund id and a "pending" status.
+ */
+async function refundPayment(
+  transactionId: string,
+  amount: number,
+): Promise<RefundResult> {
+  // Implementation elided — issues a refund via the payment gateway
+  return { refundId: "ref_placeholder", amount, status: "pending" };
+}
+
+/**
+ * Stub for creating a shipment. As written it returns a placeholder tracking
+ * number and an estimated delivery five days from now, as an ISO timestamp.
+ */
+async function createShipment(
+  address: ShippingAddress,
+  items: string[],
+): Promise<{ trackingNumber: string; estimatedDelivery: string }> {
+  // Implementation elided — hits the shipping API
+  return { trackingNumber: "track_placeholder", estimatedDelivery: new Date(Date.now() + 5 * 24 * 3600 * 1000).toISOString() };
+}
+
+/** Stub for cancelling a shipment. As written it is a no-op. */
+async function cancelShipment(trackingNumber: string): Promise<void> {
+  // Implementation elided — cancels a shipment before it ships
+}
+
+/** Stub for persisting an order. As written it always resolves to a placeholder order id. */
+async function saveOrder(order: Order): Promise<string> {
+  // Implementation elided — writes to DB
+  return "order_placeholder";
+}
+
+/** Stub for updating a stored order's status. As written it is a no-op. */
+async function updateOrderStatus(
+  orderId: string,
+  status: string,
+): Promise<void> {
+  // Implementation elided — updates DB record
+}
+
+/** Stub for the order-confirmation email. As written it is a no-op. */
+async function notifyOrderConfirmed(
+  orderId: string,
+  email: string,
+): Promise<void> {
+  // Implementation elided — sends confirmation email
+}
+
+/**
+ * Processes an order end-to-end.
+ * Validation, payment, shipment creation, and persistence are all inlined here.
+ * TODO: Extract the validation block into a separate `validateOrder` function.
+ *
+ * Steps, in order: per-item inventory check, payment and shipping-address
+ * validation, inventory reservation, payment charge, shipment creation,
+ * persistence, then a best-effort confirmation email.
+ *
+ * Failure handling: validation and reservation problems return
+ * `{ success: false, error }`. A failed reservation releases the holds made so
+ * far; a failed charge releases all holds; a failed shipment refunds the
+ * charge and then releases all holds. An email failure is only logged.
+ *
+ * NOTE(review): the persistence call is not wrapped in try/catch, so if it
+ * rejects, the returned promise rejects after the charge and shipment already
+ * exist and nothing is rolled back — confirm whether that is intended.
+ *
+ * @param order - Order to process; reads items, payment, shipping.address and customerEmail.
+ * @returns `{ success: true, orderId }` on success, otherwise `{ success: false, error }`.
+ */
+export async function processOrder(order: Order): Promise<ProcessResult> {
+  logger.info(`Processing order for ${order.items.length} item(s)`);
+
+  // ── Validation block (extract this into validateOrder) ────────────────────
+  // Check inventory for each item
+  for (const item of order.items) {
+    const inventory = await getInventory(item.productId);
+    if (!inventory) {
+      logger.warn(`Product ${item.productId} not found in inventory`);
+      return { success: false, error: `Product ${item.productId} not found` };
+    }
+    if (inventory.quantity < item.quantity) {
+      logger.warn(
+        `Insufficient stock for ${item.productId}: need ${item.quantity}, have ${inventory.quantity}`,
+      );
+      return {
+        success: false,
+        error: `Insufficient stock for ${item.productId}`,
+      };
+    }
+  }
+
+  // Validate payment method
+  if (!order.payment.method || !["card", "paypal", "bank"].includes(order.payment.method)) {
+    return { success: false, error: "Invalid payment method" };
+  }
+  if (!order.payment.amount || order.payment.amount <= 0) {
+    return { success: false, error: "Invalid payment amount" };
+  }
+
+  // Validate shipping address
+  if (!order.shipping.address.street || !order.shipping.address.city) {
+    return { success: false, error: "Incomplete shipping address" };
+  }
+  if (!order.shipping.address.postalCode.match(/^\d{5}(-\d{4})?$/)) {
+    return { success: false, error: "Invalid postal code" };
+  }
+  // ── End of validation block ───────────────────────────────────────────────
+
+  // Reserve inventory
+  const reservations: Array<{ productId: string; quantity: number }> = [];
+  for (const item of order.items) {
+    const ok = await reserveInventory(item.productId, item.quantity);
+    if (!ok) {
+      // Roll back any reservations already made
+      for (const r of reservations) {
+        await releaseInventory(r.productId, r.quantity);
+      }
+      return { success: false, error: `Could not reserve stock for ${item.productId}` };
+    }
+    reservations.push({ productId: item.productId, quantity: item.quantity });
+  }
+
+  // Charge the customer
+  let transaction: { transactionId: string };
+  try {
+    transaction = await chargePayment(order.payment.method, order.payment.amount);
+  } catch (err) {
+    logger.error("Payment failed", err);
+    for (const r of reservations) {
+      await releaseInventory(r.productId, r.quantity);
+    }
+    return { success: false, error: "Payment processing failed" };
+  }
+
+  // Create shipment
+  let shipment: { trackingNumber: string; estimatedDelivery: string };
+  try {
+    const itemIds = order.items.map((i) => i.productId);
+    shipment = await createShipment(order.shipping.address, itemIds);
+  } catch (err) {
+    logger.error("Shipment creation failed", err);
+    await refundPayment(transaction.transactionId, order.payment.amount);
+    for (const r of reservations) {
+      await releaseInventory(r.productId, r.quantity);
+    }
+    return { success: false, error: "Shipment creation failed" };
+  }
+
+  // Persist the order
+  const enrichedOrder: Order = {
+    ...order,
+    transactionId: transaction.transactionId,
+    trackingNumber: shipment.trackingNumber,
+    status: "confirmed",
+    confirmedAt: new Date().toISOString(),
+  };
+
+  const orderId = await saveOrder(enrichedOrder);
+  logger.info(`Order ${orderId} confirmed (tracking: ${shipment.trackingNumber})`);
+
+  // Send confirmation email (best-effort — don't fail the order)
+  try {
+    await notifyOrderConfirmed(orderId, order.customerEmail);
+  } catch (err) {
+    logger.warn(`Failed to send confirmation email for order ${orderId}`, err);
+  }
+
+  return { success: true, orderId };
+}
+
+// ── Cancellation ───────────────────────────────────────────────────────────
+
+/**
+ * Cancels an order: attempts to cancel the shipment, refunds the payment,
+ * releases the inventory holds, and records the resulting status.
+ *
+ * Shipment-cancel and refund failures are logged, not thrown. The stored
+ * status is "cancelled" when the refund call succeeded and
+ * "cancellation_pending" otherwise. The function returns `success: true` in
+ * both cases, so callers cannot tell from the result whether the refund went
+ * through.
+ *
+ * NOTE(review): the release loop and the status update are not guarded, so a
+ * rejection there propagates to the caller — confirm that is intended.
+ *
+ * @param orderId - Order being cancelled.
+ * @param transactionId - Payment transaction to refund.
+ * @param trackingNumber - Shipment to cancel.
+ * @param amount - Amount to refund.
+ * @param reservations - Inventory holds to release.
+ */
+export async function cancelOrder(
+  orderId: string,
+  transactionId: string,
+  trackingNumber: string,
+  amount: number,
+  reservations: Array<{ productId: string; quantity: number }>,
+): Promise<ProcessResult> {
+  logger.info(`Cancelling order ${orderId}`);
+
+  try {
+    await cancelShipment(trackingNumber);
+  } catch (err) {
+    logger.warn(`Could not cancel shipment ${trackingNumber} for order ${orderId}`, err);
+  }
+
+  let refundOk = false;
+  try {
+    await refundPayment(transactionId, amount);
+    refundOk = true;
+  } catch (err) {
+    logger.error(`Refund failed for order ${orderId}`, err);
+  }
+
+  for (const r of reservations) {
+    await releaseInventory(r.productId, r.quantity);
+  }
+
+  await updateOrderStatus(orderId, refundOk ? "cancelled" : "cancellation_pending");
+  logger.info(`Order ${orderId} cancelled (refund ok: ${refundOk})`);
+  return { success: true, orderId };
+}
+
+// ── Order lookup helpers ───────────────────────────────────────────────────
+
+/** Looks up an order's status. The lookup is stubbed: it logs the request and always resolves to null. */
+export async function getOrderStatus(orderId: string): Promise<string | null> {
+  logger.info(`Looking up status for order ${orderId}`);
+  // Implementation elided — queries DB
+  return null;
+}
+
+/**
+ * Lists a customer's orders, paginated. The query is stubbed: it logs the
+ * request and always resolves to an empty array; `limit` is currently unused.
+ */
+export async function listOrdersForCustomer(
+  customerEmail: string,
+  page: number,
+  limit: number,
+): Promise<Order[]> {
+  logger.info(`Listing orders for customer ${customerEmail} (page=${page})`);
+  // Implementation elided — queries DB
+  return [];
+}
+
+// ── Order enrichment ───────────────────────────────────────────────────────
+
+/**
+ * Returns a copy of the order with a `trackingUrl` added.
+ *
+ * The carrier is inferred from the tracking number alone: a "UPS" prefix
+ * selects ups, and every other value is treated as fedex.
+ *
+ * @throws Error when the order has no tracking number.
+ */
+export async function enrichOrderWithTracking(order: Order): Promise<Order & { trackingUrl: string }> {
+  if (!order.trackingNumber) {
+    throw new Error("Order has no tracking number");
+  }
+  const carrier = order.trackingNumber.startsWith("UPS") ? "ups" : "fedex";
+  const trackingUrl = `https://tracking.${carrier}.com/${order.trackingNumber}`;
+  logger.info(`Enriched order with tracking URL (carrier: ${carrier})`);
+  return { ...order, trackingUrl };
+}
+
+/**
+ * Estimates a delivery date as an ISO timestamp relative to the current time.
+ * Uses 2 days when expedited and 5 otherwise, plus 7 more days when the
+ * address country is anything other than "US".
+ */
+export async function estimateDeliveryDate(
+  address: ShippingAddress,
+  expedited: boolean,
+): Promise<string> {
+  const baseDays = expedited ? 2 : 5;
+  const regionBuffer = address.country !== "US" ? 7 : 0;
+  const estimate = new Date(Date.now() + (baseDays + regionBuffer) * 24 * 3600 * 1000);
+  return estimate.toISOString();
+}
+
+// ── Duplicate detection ────────────────────────────────────────────────────
+
+/** Links an idempotency key to the order it produced, with the time it was recorded. */
+interface IdempotencyRecord {
+  key: string;
+  orderId: string;
+  createdAt: string;
+}
+
+// Module-level, in-memory store keyed by idempotency key. Nothing in the
+// visible code removes entries or persists them.
+const idempotencyStore = new Map<string, IdempotencyRecord>();
+
+/** Returns the order id recorded for `key`, or null when the key has not been seen. */
+export function checkIdempotency(key: string): string | null {
+  return idempotencyStore.get(key)?.orderId ?? null;
+}
+
+/**
+ * Records that `key` produced `orderId`, stamped with the current time.
+ * An existing entry for the same key is overwritten.
+ */
+export function registerIdempotency(key: string, orderId: string): void {
+  idempotencyStore.set(key, {
+    key,
+    orderId,
+    createdAt: new Date().toISOString(),
+  });
+  logger.info(`Registered idempotency key ${key} → order ${orderId}`);
+}
+
+/**
+ * Processes an order at most once per idempotency key.
+ *
+ * When the key already maps to an order, that order id is returned as a
+ * success without reprocessing. Otherwise the order is processed, and the key
+ * is recorded only when processing succeeds with an order id, so failed
+ * attempts can be retried under the same key.
+ *
+ * NOTE(review): the key is recorded only after the awaited processing
+ * finishes, so two overlapping calls with the same key can both pass the
+ * lookup and both process the order — confirm whether callers serialise.
+ */
+export async function processOrderIdempotent(
+  order: Order,
+  idempotencyKey: string,
+): Promise<ProcessResult> {
+  const existing = checkIdempotency(idempotencyKey);
+  if (existing) {
+    logger.info(`Idempotency hit: returning existing order ${existing} for key ${idempotencyKey}`);
+    return { success: true, orderId: existing };
+  }
+  const result = await processOrder(order);
+  if (result.success && result.orderId) {
+    registerIdempotency(idempotencyKey, result.orderId);
+  }
+  return result;
+}
--- a/src/__tests__/evals/fixtures/permissions.ts
+++ b/src/__tests__/evals/fixtures/permissions.ts
+// permissions.ts — role-based access control policies
+
+// Capability checks every role policy must answer. Each method takes no
+// arguments and returns whether the role is allowed to perform that action.
+interface Policy {
+  canViewContent(): boolean;
+  canCreateContent(): boolean;
+  canEditContent(): boolean;
+  canDeleteContent(): boolean;
+  canViewUsers(): boolean;
+  canManageUsers(): boolean;
+  canViewRoles(): boolean;
+  canManageRoles(): boolean;
+  canViewAuditLog(): boolean;
+  canExportData(): boolean;
+}
+
+/** Policy for the admin role: every capability check returns true. */
+export class AdminPolicy implements Policy {
+  canViewContent(): boolean {
+    return true;
+  }
+
+  canCreateContent(): boolean {
+    return true;
+  }
+
+  canEditContent(): boolean {
+    return true;
+  }
+
+  canDeleteContent(): boolean {
+    return true;
+  }
+
+  canViewUsers(): boolean {
+    return true;
+  }
+
+  canManageUsers(): boolean {
+    return true;
+  }
+
+  canViewRoles(): boolean {
+    return true;
+  }
+
+  canManageRoles(): boolean {
+    return true;
+  }
+
+  canViewAuditLog(): boolean {
+    return true;
+  }
+
+  canExportData(): boolean {
+    return true;
+  }
+}
+
+/** Policy for the moderator role: every capability is granted except managing roles. */
+export class ModeratorPolicy implements Policy {
+  canViewContent(): boolean {
+    return true;
+  }
+
+  canCreateContent(): boolean {
+    return true;
+  }
+
+  canEditContent(): boolean {
+    return true;
+  }
+
+  canDeleteContent(): boolean {
+    return true;
+  }
+
+  canViewUsers(): boolean {
+    return true;
+  }
+
+  canManageUsers(): boolean {
+    return true;
+  }
+
+  canViewRoles(): boolean {
+    return true;
+  }
+
+  canManageRoles(): boolean {
+    return false;
+  }
+
+  canViewAuditLog(): boolean {
+    return true;
+  }
+
+  canExportData(): boolean {
+    return true;
+  }
+}
+
+/**
+ * Builds the policy object for a role name.
+ * Recognises exactly "admin" and "moderator"; matching is case-sensitive.
+ *
+ * @throws Error for any other role name.
+ */
+export function createPolicy(role: string): Policy {
+  switch (role) {
+    case "admin":
+      return new AdminPolicy();
+    case "moderator":
+      return new ModeratorPolicy();
+    default:
+      throw new Error(`Unknown role: ${role}`);
+  }
+}
+
+/**
+ * Maps a snake_case action name onto the matching policy check and returns
+ * its answer. An unrecognised action name is denied (returns false) rather
+ * than raising an error.
+ */
+export function hasPermission(policy: Policy, action: string): boolean {
+  switch (action) {
+    case "view_content":    return policy.canViewContent();
+    case "create_content":  return policy.canCreateContent();
+    case "edit_content":    return policy.canEditContent();
+    case "delete_content":  return policy.canDeleteContent();
+    case "view_users":      return policy.canViewUsers();
+    case "manage_users":    return policy.canManageUsers();
+    case "view_roles":      return policy.canViewRoles();
+    case "manage_roles":    return policy.canManageRoles();
+    case "view_audit_log":  return policy.canViewAuditLog();
+    case "export_data":     return policy.canExportData();
+    default:                return false;
+  }
+}
--- a/src/__tests__/evals/fixtures/report_builders.ts
+++ b/src/__tests__/evals/fixtures/report_builders.ts
+// report_builders.ts — a grab-bag of reporting functions that were each
+// added by a different contributor over time. Many of them repeat the
+// same small chunks of logic.
+
+/** A single sale record; revenue for a sale is `unitPrice × quantity`. */
+interface Sale {
+  id: string;
+  sku: string;
+  category: string;
+  region: string;
+  customerId: string;
+  unitPrice: number;
+  quantity: number;
+  currency: string;
+  soldAt: string; // ISO date
+}
+
+/** A refund issued against a sale. */
+interface Refund {
+  saleId: string;
+  amount: number;
+  currency: string;
+  refundedAt: string;
+  reason: string;
+}
+
+/** A customer subscription; `canceledAt` is null while the subscription is still running. */
+interface Subscription {
+  customerId: string;
+  plan: string;
+  mrr: number;
+  currency: string;
+  startedAt: string;
+  canceledAt: string | null;
+}
+
+/** Half-open reporting window: `from` is included, `to` is excluded. */
+interface ReportRange {
+  from: string; // ISO date, inclusive
+  to: string;   // ISO date, exclusive
+}
+
+// Three-letter English month abbreviations, indexed by zero-based month number.
+const MONTH_NAMES = [
+  "Jan", "Feb", "Mar", "Apr", "May", "Jun",
+  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
+];
+
+// ── Sales reports ──────────────────────────────────────────────────────────
+
+/**
+ * Total sales revenue for the range, as a labelled string of the form
+ * `Sales <from> – <to>: <amount>`.
+ *
+ * Sales count when `soldAt` falls in [from, to). The label is built from the
+ * UTC date parts of both bounds. The amount is always formatted as USD; each
+ * sale's own `currency` field is not consulted.
+ */
+export function totalSalesRevenue(sales: Sale[], range: ReportRange): string {
+  const fromTs = Date.parse(range.from);
+  const toTs = Date.parse(range.to);
+  const inRange = sales.filter((s) => {
+    const t = Date.parse(s.soldAt);
+    return t >= fromTs && t < toTs;
+  });
+
+  let sum = 0;
+  for (const s of inRange) sum += s.unitPrice * s.quantity;
+
+  const d1 = new Date(range.from);
+  const d2 = new Date(range.to);
+  const label = `${MONTH_NAMES[d1.getUTCMonth()]} ${d1.getUTCDate()}, ${d1.getUTCFullYear()} – ${MONTH_NAMES[d2.getUTCMonth()]} ${d2.getUTCDate()}, ${d2.getUTCFullYear()}`;
+
+  const formatted = new Intl.NumberFormat("en-US", {
+    style: "currency",
+    currency: "USD",
+    minimumFractionDigits: 2,
+  }).format(sum);
+
+  return `Sales ${label}: ${formatted}`;
+}
+
+/**
+ * Revenue per category for sales with `soldAt` in [from, to).
+ *
+ * @returns One row per category seen in the range, sorted by category name
+ *   ascending, with revenue formatted as USD (the sale's `currency` is ignored).
+ */
+export function salesByCategory(
+  sales: Sale[],
+  range: ReportRange,
+): Array<{ category: string; revenue: string }> {
+  const fromTs = Date.parse(range.from);
+  const toTs = Date.parse(range.to);
+  const inRange = sales.filter((s) => {
+    const t = Date.parse(s.soldAt);
+    return t >= fromTs && t < toTs;
+  });
+
+  const buckets = new Map<string, number>();
+  for (const s of inRange) {
+    const current = buckets.get(s.category) ?? 0;
+    buckets.set(s.category, current + s.unitPrice * s.quantity);
+  }
+
+  const rows: Array<{ category: string; revenue: string }> = [];
+  for (const [category, amount] of buckets) {
+    rows.push({
+      category,
+      revenue: new Intl.NumberFormat("en-US", {
+        style: "currency",
+        currency: "USD",
+        minimumFractionDigits: 2,
+      }).format(amount),
+    });
+  }
+  rows.sort((a, b) => (a.category < b.category ? -1 : 1));
+  return rows;
+}
+
+/**
+ * Revenue and units sold per region for sales with `soldAt` in [from, to).
+ *
+ * @returns One row per region seen in the range, sorted by region name
+ *   ascending; revenue is formatted as USD and `units` sums the quantities.
+ */
+export function salesByRegion(
+  sales: Sale[],
+  range: ReportRange,
+): Array<{ region: string; revenue: string; units: number }> {
+  const fromTs = Date.parse(range.from);
+  const toTs = Date.parse(range.to);
+  const inRange = sales.filter((s) => {
+    const t = Date.parse(s.soldAt);
+    return t >= fromTs && t < toTs;
+  });
+
+  const revenueByRegion = new Map<string, number>();
+  const unitsByRegion = new Map<string, number>();
+  for (const s of inRange) {
+    revenueByRegion.set(
+      s.region,
+      (revenueByRegion.get(s.region) ?? 0) + s.unitPrice * s.quantity,
+    );
+    unitsByRegion.set(s.region, (unitsByRegion.get(s.region) ?? 0) + s.quantity);
+  }
+
+  const rows: Array<{ region: string; revenue: string; units: number }> = [];
+  for (const [region, amount] of revenueByRegion) {
+    rows.push({
+      region,
+      revenue: new Intl.NumberFormat("en-US", {
+        style: "currency",
+        currency: "USD",
+        minimumFractionDigits: 2,
+      }).format(amount),
+      units: unitsByRegion.get(region) ?? 0,
+    });
+  }
+  rows.sort((a, b) => (a.region < b.region ? -1 : 1));
+  return rows;
+}
+
+/**
+ * The `limit` highest-revenue SKUs among sales with `soldAt` in [from, to).
+ *
+ * @returns Rows sorted by revenue descending; ranking uses the numeric
+ *   amount, and only the final output is formatted as a USD string.
+ */
+export function topSkusByRevenue(
+  sales: Sale[],
+  range: ReportRange,
+  limit: number,
+): Array<{ sku: string; revenue: string }> {
+  const fromTs = Date.parse(range.from);
+  const toTs = Date.parse(range.to);
+  const inRange = sales.filter((s) => {
+    const t = Date.parse(s.soldAt);
+    return t >= fromTs && t < toTs;
+  });
+
+  const buckets = new Map<string, number>();
+  for (const s of inRange) {
+    buckets.set(s.sku, (buckets.get(s.sku) ?? 0) + s.unitPrice * s.quantity);
+  }
+
+  const rows = Array.from(buckets.entries())
+    .map(([sku, amount]) => ({ sku, amount }))
+    .sort((a, b) => b.amount - a.amount)
+    .slice(0, limit);
+
+  return rows.map((r) => ({
+    sku: r.sku,
+    revenue: new Intl.NumberFormat("en-US", {
+      style: "currency",
+      currency: "USD",
+      minimumFractionDigits: 2,
+    }).format(r.amount),
+  }));
+}
+
+// ── Refund reports ─────────────────────────────────────────────────────────
+
+/**
+ * Total refunded amount for the range, as a labelled string of the form
+ * `Refunds <from> – <to>: <amount>`.
+ *
+ * Refunds count when `refundedAt` falls in [from, to). The label is built
+ * from the UTC date parts of both bounds, and the amount is always formatted
+ * as USD regardless of each refund's `currency`.
+ */
+export function refundsTotal(refunds: Refund[], range: ReportRange): string {
+  const fromTs = Date.parse(range.from);
+  const toTs = Date.parse(range.to);
+  const inRange = refunds.filter((r) => {
+    const t = Date.parse(r.refundedAt);
+    return t >= fromTs && t < toTs;
+  });
+
+  let sum = 0;
+  for (const r of inRange) sum += r.amount;
+
+  const d1 = new Date(range.from);
+  const d2 = new Date(range.to);
+  const label = `${MONTH_NAMES[d1.getUTCMonth()]} ${d1.getUTCDate()}, ${d1.getUTCFullYear()} – ${MONTH_NAMES[d2.getUTCMonth()]} ${d2.getUTCDate()}, ${d2.getUTCFullYear()}`;
+
+  const formatted = new Intl.NumberFormat("en-US", {
+    style: "currency",
+    currency: "USD",
+    minimumFractionDigits: 2,
+  }).format(sum);
+
+  return `Refunds ${label}: ${formatted}`;
+}
+
+/**
+ * Refunded amount and refund count per reason, for refunds with `refundedAt`
+ * in [from, to).
+ *
+ * @returns One row per reason seen in the range, sorted by reason ascending,
+ *   with the amount formatted as USD.
+ */
+export function refundsByReason(
+  refunds: Refund[],
+  range: ReportRange,
+): Array<{ reason: string; amount: string; count: number }> {
+  const fromTs = Date.parse(range.from);
+  const toTs = Date.parse(range.to);
+  const inRange = refunds.filter((r) => {
+    const t = Date.parse(r.refundedAt);
+    return t >= fromTs && t < toTs;
+  });
+
+  const amountByReason = new Map<string, number>();
+  const countByReason = new Map<string, number>();
+  for (const r of inRange) {
+    amountByReason.set(r.reason, (amountByReason.get(r.reason) ?? 0) + r.amount);
+    countByReason.set(r.reason, (countByReason.get(r.reason) ?? 0) + 1);
+  }
+
+  const rows: Array<{ reason: string; amount: string; count: number }> = [];
+  for (const [reason, amount] of amountByReason) {
+    rows.push({
+      reason,
+      amount: new Intl.NumberFormat("en-US", {
+        style: "currency",
+        currency: "USD",
+        minimumFractionDigits: 2,
+      }).format(amount),
+      count: countByReason.get(reason) ?? 0,
+    });
+  }
+  rows.sort((a, b) => (a.reason < b.reason ? -1 : 1));
+  return rows;
+}
+
+/**
+ * Refunded amount as a percentage of sales revenue within [from, to),
+ * formatted with one decimal and a trailing `%`.
+ *
+ * Sales are selected by `soldAt` and refunds by `refundedAt`, independently
+ * of each other, so a refund in the range may belong to a sale outside it.
+ * Returns "0.0%" when sales revenue in the range is zero.
+ */
+export function refundRate(
+  sales: Sale[],
+  refunds: Refund[],
+  range: ReportRange,
+): string {
+  const fromTs = Date.parse(range.from);
+  const toTs = Date.parse(range.to);
+
+  const salesInRange = sales.filter((s) => {
+    const t = Date.parse(s.soldAt);
+    return t >= fromTs && t < toTs;
+  });
+  const refundsInRange = refunds.filter((r) => {
+    const t = Date.parse(r.refundedAt);
+    return t >= fromTs && t < toTs;
+  });
+
+  let salesSum = 0;
+  for (const s of salesInRange) salesSum += s.unitPrice * s.quantity;
+  let refundSum = 0;
+  for (const r of refundsInRange) refundSum += r.amount;
+
+  if (salesSum === 0) return "0.0%";
+  const pct = (refundSum / salesSum) * 100;
+  return `${pct.toFixed(1)}%`;
+}
+
+// ── Subscription reports ───────────────────────────────────────────────────
+
+/**
+ * Total MRR of subscriptions active at `asOf`, formatted as USD.
+ *
+ * A subscription is active when it started at or before `asOf` and is either
+ * not canceled or was canceled strictly after `asOf`. Each subscription's
+ * `currency` field is not consulted.
+ */
+export function activeMrr(subs: Subscription[], asOf: string): string {
+  const asOfTs = Date.parse(asOf);
+  let sum = 0;
+  for (const sub of subs) {
+    const started = Date.parse(sub.startedAt);
+    const canceled = sub.canceledAt ? Date.parse(sub.canceledAt) : null;
+    if (started > asOfTs) continue;
+    if (canceled !== null && canceled <= asOfTs) continue;
+    sum += sub.mrr;
+  }
+  return new Intl.NumberFormat("en-US", {
+    style: "currency",
+    currency: "USD",
+    minimumFractionDigits: 2,
+  }).format(sum);
+}
+
+/**
+ * MRR and subscriber count per plan, over subscriptions active at `asOf`
+ * (started at or before `asOf`, and not canceled at or before it).
+ *
+ * @returns One row per plan with at least one active subscription, sorted by
+ *   plan name ascending, with MRR formatted as USD.
+ */
+export function mrrByPlan(
+  subs: Subscription[],
+  asOf: string,
+): Array<{ plan: string; mrr: string; subscribers: number }> {
+  const asOfTs = Date.parse(asOf);
+
+  const mrrTotals = new Map<string, number>();
+  const countByPlan = new Map<string, number>();
+  for (const sub of subs) {
+    const started = Date.parse(sub.startedAt);
+    const canceled = sub.canceledAt ? Date.parse(sub.canceledAt) : null;
+    if (started > asOfTs) continue;
+    if (canceled !== null && canceled <= asOfTs) continue;
+    mrrTotals.set(sub.plan, (mrrTotals.get(sub.plan) ?? 0) + sub.mrr);
+    countByPlan.set(sub.plan, (countByPlan.get(sub.plan) ?? 0) + 1);
+  }
+
+  const rows: Array<{ plan: string; mrr: string; subscribers: number }> = [];
+  for (const [plan, amount] of mrrTotals) {
+    rows.push({
+      plan,
+      mrr: new Intl.NumberFormat("en-US", {
+        style: "currency",
+        currency: "USD",
+        minimumFractionDigits: 2,
+      }).format(amount),
+      subscribers: countByPlan.get(plan) ?? 0,
+    });
+  }
+  rows.sort((a, b) => (a.plan < b.plan ? -1 : 1));
+  return rows;
+}
+
+/**
+ * Churn for the range: cancellations in [from, to) divided by subscriptions
+ * active at the start of the range, as a percentage with one decimal.
+ *
+ * "Active at start" means started strictly before `from` and not canceled
+ * before `from`. The numerator counts every cancellation in the range,
+ * including subscriptions that started inside the range, so the result can
+ * exceed 100%. Returns "0.0%" when nothing was active at the start.
+ */
+export function churnRate(
+  subs: Subscription[],
+  range: ReportRange,
+): string {
+  const fromTs = Date.parse(range.from);
+  const toTs = Date.parse(range.to);
+
+  let activeAtStart = 0;
+  let canceledInRange = 0;
+  for (const sub of subs) {
+    const started = Date.parse(sub.startedAt);
+    const canceled = sub.canceledAt ? Date.parse(sub.canceledAt) : null;
+    const wasActiveAtStart =
+      started < fromTs && (canceled === null || canceled >= fromTs);
+    if (wasActiveAtStart) activeAtStart++;
+    if (canceled !== null && canceled >= fromTs && canceled < toTs) {
+      canceledInRange++;
+    }
+  }
+
+  if (activeAtStart === 0) return "0.0%";
+  const pct = (canceledInRange / activeAtStart) * 100;
+  return `${pct.toFixed(1)}%`;
+}
+
+// ── Customer reports ───────────────────────────────────────────────────────
+
+/**
+ * Ranks customers by revenue (`unitPrice * quantity`, summed per customer)
+ * over the sales whose `soldAt` falls in `[range.from, range.to)`.
+ *
+ * @param sales - Sales to aggregate.
+ * @param range - Range whose `from` / `to` strings are passed to `Date.parse`.
+ * @param limit - Maximum number of rows to return (applied via `slice(0, limit)`).
+ * @returns Rows in descending revenue order, with `revenue` formatted as
+ *   en-US USD currency.
+ */
+export function topCustomersByRevenue(
+  sales: Sale[],
+  range: ReportRange,
+  limit: number,
+): Array<{ customerId: string; revenue: string }> {
+  const fromTs = Date.parse(range.from);
+  const toTs = Date.parse(range.to);
+  const inRange = sales.filter((s) => {
+    const t = Date.parse(s.soldAt);
+    return t >= fromTs && t < toTs;
+  });
+
+  // Accumulate raw numeric revenue per customer; formatting happens last.
+  const buckets = new Map<string, number>();
+  for (const s of inRange) {
+    buckets.set(
+      s.customerId,
+      (buckets.get(s.customerId) ?? 0) + s.unitPrice * s.quantity,
+    );
+  }
+
+  const rows = Array.from(buckets.entries())
+    .map(([customerId, amount]) => ({ customerId, amount }))
+    .sort((a, b) => b.amount - a.amount)
+    .slice(0, limit);
+
+  return rows.map((r) => ({
+    customerId: r.customerId,
+    revenue: new Intl.NumberFormat("en-US", {
+      style: "currency",
+      currency: "USD",
+      minimumFractionDigits: 2,
+    }).format(r.amount),
+  }));
+}
+
+/**
+ * Averages sale value (`unitPrice * quantity`) over the sales whose `soldAt`
+ * falls in `[range.from, range.to)`.
+ *
+ * @param sales - Sales to average.
+ * @param range - Range whose `from` / `to` strings are passed to `Date.parse`.
+ * @returns The average formatted as en-US USD currency; a formatted zero when
+ *   no sale falls in the range.
+ */
+export function averageSaleValue(
+  sales: Sale[],
+  range: ReportRange,
+): string {
+  const fromTs = Date.parse(range.from);
+  const toTs = Date.parse(range.to);
+  const inRange = sales.filter((s) => {
+    const t = Date.parse(s.soldAt);
+    return t >= fromTs && t < toTs;
+  });
+
+  // Guard the empty case so the division below never runs with a zero count.
+  if (inRange.length === 0) {
+    return new Intl.NumberFormat("en-US", {
+      style: "currency",
+      currency: "USD",
+      minimumFractionDigits: 2,
+    }).format(0);
+  }
+
+  let sum = 0;
+  for (const s of inRange) sum += s.unitPrice * s.quantity;
+  const avg = sum / inRange.length;
+  return new Intl.NumberFormat("en-US", {
+    style: "currency",
+    currency: "USD",
+    minimumFractionDigits: 2,
+  }).format(avg);
+}
+
+// ── Combined overview ──────────────────────────────────────────────────────
+
+/**
+ * Builds the overview heading `Overview for <from> – <to>`, where each date is
+ * rendered as `<month name> <day>, <year>` from its UTC components.
+ *
+ * @param range - Range whose `from` / `to` strings are passed to `new Date`.
+ * @returns The heading text.
+ */
+export function overviewHeader(range: ReportRange): string {
+  const d1 = new Date(range.from);
+  const d2 = new Date(range.to);
+  // MONTH_NAMES is defined outside this section; presumably a 12-entry array
+  // indexed by the zero-based UTC month — verify.
+  const label = `${MONTH_NAMES[d1.getUTCMonth()]} ${d1.getUTCDate()}, ${d1.getUTCFullYear()} – ${MONTH_NAMES[d2.getUTCMonth()]} ${d2.getUTCDate()}, ${d2.getUTCFullYear()}`;
+  return `Overview for ${label}`;
+}
--- a/src/__tests__/evals/fixtures/route_handlers.ts
+++ b/src/__tests__/evals/fixtures/route_handlers.ts
+import type { Request, Response } from "express";
+import { db } from "./db";
+import { logger } from "./logger";
+
+/**
+ * Express request carrying an optional `userId`. The handlers in this file
+ * respond 401 when it is missing; where it gets set is not shown here.
+ */
+interface AuthedRequest extends Request {
+  userId?: string;
+}
+
+// ── Project handlers ───────────────────────────────────────────────────────
+
+/**
+ * Returns the project row whose id matches `req.params.id`.
+ *
+ * Responds 401 without `req.userId`, 400 for a missing or non-string id,
+ * 404 when no row matches, otherwise the first row as JSON.
+ *
+ * @param req - Request carrying `userId` and `params.id`.
+ * @param res - Response the result or error is written to.
+ */
+export async function getProject(
+  req: AuthedRequest,
+  res: Response,
+): Promise<void> {
+  const start = Date.now();
+  if (!req.userId) {
+    logger.warn(`getProject called without userId from ${req.ip}`);
+    res.status(401).json({ error: "unauthorized" });
+    return;
+  }
+  const id = req.params.id;
+  if (!id || typeof id !== "string") {
+    logger.warn(`getProject called with invalid id from user ${req.userId}`);
+    res.status(400).json({ error: "invalid id" });
+    return;
+  }
+
+  // Only the presence of userId is checked; the query is not scoped to that user.
+  const rows = await db.query("SELECT * FROM projects WHERE id = ?", [id]);
+  if (rows.length === 0) {
+    res.status(404).json({ error: "not found" });
+    return;
+  }
+  // The timing log is emitted on the success path only.
+  logger.info(`getProject(${id}) took ${Date.now() - start}ms`);
+  res.json(rows[0]);
+}
+
+/**
+ * Sets the name of the project identified by `req.params.id` to
+ * `req.body.name`.
+ *
+ * Responds 401 without `req.userId`, 400 for a missing or non-string id,
+ * otherwise `{ ok: true }`.
+ *
+ * @param req - Request carrying `userId`, `params.id` and `body.name`.
+ * @param res - Response the result or error is written to.
+ */
+export async function updateProject(
+  req: AuthedRequest,
+  res: Response,
+): Promise<void> {
+  const start = Date.now();
+  if (!req.userId) {
+    logger.warn(`updateProject called without userId from ${req.ip}`);
+    res.status(401).json({ error: "unauthorized" });
+    return;
+  }
+  const id = req.params.id;
+  if (!id || typeof id !== "string") {
+    logger.warn(`updateProject called with invalid id from user ${req.userId}`);
+    res.status(400).json({ error: "invalid id" });
+    return;
+  }
+
+  // `name` is only cast, not validated, and the UPDATE result is not inspected,
+  // so `{ ok: true }` is sent whether or not a row matched.
+  const name = req.body.name as string;
+  await db.query("UPDATE projects SET name = ? WHERE id = ?", [name, id]);
+  logger.info(`updateProject(${id}) took ${Date.now() - start}ms`);
+  res.json({ ok: true });
+}
+
+/**
+ * Deletes the project identified by `req.params.id`.
+ *
+ * Responds 401 without `req.userId`, 400 for a missing or non-string id,
+ * otherwise `{ ok: true }`.
+ *
+ * @param req - Request carrying `userId` and `params.id`.
+ * @param res - Response the result or error is written to.
+ */
+export async function deleteProject(
+  req: AuthedRequest,
+  res: Response,
+): Promise<void> {
+  const start = Date.now();
+  if (!req.userId) {
+    logger.warn(`deleteProject called without userId from ${req.ip}`);
+    res.status(401).json({ error: "unauthorized" });
+    return;
+  }
+  const id = req.params.id;
+  if (!id || typeof id !== "string") {
+    logger.warn(`deleteProject called with invalid id from user ${req.userId}`);
+    res.status(400).json({ error: "invalid id" });
+    return;
+  }
+
+  // The DELETE result is not inspected; `{ ok: true }` is sent either way.
+  await db.query("DELETE FROM projects WHERE id = ?", [id]);
+  logger.info(`deleteProject(${id}) took ${Date.now() - start}ms`);
+  res.json({ ok: true });
+}
+
+/**
+ * Marks the project identified by `req.params.id` as archived and stamps
+ * `archived_at` with the current time as an ISO string.
+ *
+ * Responds 401 without `req.userId`, 400 for a missing or non-string id,
+ * otherwise `{ ok: true }`.
+ *
+ * @param req - Request carrying `userId` and `params.id`.
+ * @param res - Response the result or error is written to.
+ */
+export async function archiveProject(
+  req: AuthedRequest,
+  res: Response,
+): Promise<void> {
+  const start = Date.now();
+  if (!req.userId) {
+    logger.warn(`archiveProject called without userId from ${req.ip}`);
+    res.status(401).json({ error: "unauthorized" });
+    return;
+  }
+  const id = req.params.id;
+  if (!id || typeof id !== "string") {
+    logger.warn(`archiveProject called with invalid id from user ${req.userId}`);
+    res.status(400).json({ error: "invalid id" });
+    return;
+  }
+
+  await db.query(
+    "UPDATE projects SET status = 'archived', archived_at = ? WHERE id = ?",
+    [new Date().toISOString(), id],
+  );
+  logger.info(`archiveProject(${id}) took ${Date.now() - start}ms`);
+  res.json({ ok: true });
+}
+
+/**
+ * Lists the members of the project identified by `req.params.id`, joining
+ * `project_members` to `users` to return id, name, email and role.
+ *
+ * Responds 401 without `req.userId`, 400 for a missing or non-string id,
+ * otherwise `{ members: rows }`.
+ *
+ * @param req - Request carrying `userId` and `params.id`.
+ * @param res - Response the result or error is written to.
+ */
+export async function getProjectMembers(
+  req: AuthedRequest,
+  res: Response,
+): Promise<void> {
+  const start = Date.now();
+  if (!req.userId) {
+    logger.warn(`getProjectMembers called without userId from ${req.ip}`);
+    res.status(401).json({ error: "unauthorized" });
+    return;
+  }
+  const id = req.params.id;
+  if (!id || typeof id !== "string") {
+    logger.warn(`getProjectMembers called with invalid id from user ${req.userId}`);
+    res.status(400).json({ error: "invalid id" });
+    return;
+  }
+
+  const rows = await db.query(
+    "SELECT u.id, u.name, u.email, pm.role FROM project_members pm JOIN users u ON u.id = pm.user_id WHERE pm.project_id = ?",
+    [id],
+  );
+  logger.info(`getProjectMembers(${id}) took ${Date.now() - start}ms`);
+  res.json({ members: rows });
+}
+
+/**
+ * Adds `req.body.memberId` with `req.body.role` to the project identified by
+ * `req.params.id`, stamping `added_at` with the current time.
+ *
+ * Responds 401 without `req.userId`, 400 for a missing or non-string id,
+ * otherwise 201 with `{ ok: true }`.
+ *
+ * @param req - Request carrying `userId`, `params.id` and the member body.
+ * @param res - Response the result or error is written to.
+ */
+export async function addProjectMember(
+  req: AuthedRequest,
+  res: Response,
+): Promise<void> {
+  const start = Date.now();
+  if (!req.userId) {
+    logger.warn(`addProjectMember called without userId from ${req.ip}`);
+    res.status(401).json({ error: "unauthorized" });
+    return;
+  }
+  const id = req.params.id;
+  if (!id || typeof id !== "string") {
+    logger.warn(`addProjectMember called with invalid id from user ${req.userId}`);
+    res.status(400).json({ error: "invalid id" });
+    return;
+  }
+
+  // The body is only cast; memberId and role are not validated here.
+  const { memberId, role } = req.body as { memberId: string; role: string };
+  await db.query(
+    "INSERT INTO project_members (project_id, user_id, role, added_at) VALUES (?, ?, ?, ?)",
+    [id, memberId, role, new Date().toISOString()],
+  );
+  logger.info(`addProjectMember(${id}, member=${memberId}) took ${Date.now() - start}ms`);
+  res.status(201).json({ ok: true });
+}
+
+/**
+ * Removes the member `req.params.memberId` from the project identified by
+ * `req.params.id`.
+ *
+ * Responds 401 without `req.userId`, 400 for a missing or non-string id,
+ * otherwise `{ ok: true }`.
+ *
+ * @param req - Request carrying `userId`, `params.id` and `params.memberId`.
+ * @param res - Response the result or error is written to.
+ */
+export async function removeProjectMember(
+  req: AuthedRequest,
+  res: Response,
+): Promise<void> {
+  const start = Date.now();
+  if (!req.userId) {
+    logger.warn(`removeProjectMember called without userId from ${req.ip}`);
+    res.status(401).json({ error: "unauthorized" });
+    return;
+  }
+  const id = req.params.id;
+  if (!id || typeof id !== "string") {
+    logger.warn(`removeProjectMember called with invalid id from user ${req.userId}`);
+    res.status(400).json({ error: "invalid id" });
+    return;
+  }
+
+  // Unlike `id`, `memberId` is taken from the route params without a check.
+  const { memberId } = req.params;
+  await db.query(
+    "DELETE FROM project_members WHERE project_id = ? AND user_id = ?",
+    [id, memberId],
+  );
+  logger.info(`removeProjectMember(${id}, member=${memberId}) took ${Date.now() - start}ms`);
+  res.json({ ok: true });
+}
+
+/**
+ * Sets `owner_id` of the project identified by `req.params.id` to
+ * `req.body.newOwnerId` and refreshes `updated_at`.
+ *
+ * Responds 401 without `req.userId`, 400 for a missing or non-string id,
+ * otherwise `{ ok: true }`.
+ *
+ * @param req - Request carrying `userId`, `params.id` and `body.newOwnerId`.
+ * @param res - Response the result or error is written to.
+ */
+export async function transferProjectOwnership(
+  req: AuthedRequest,
+  res: Response,
+): Promise<void> {
+  const start = Date.now();
+  if (!req.userId) {
+    logger.warn(`transferProjectOwnership called without userId from ${req.ip}`);
+    res.status(401).json({ error: "unauthorized" });
+    return;
+  }
+  const id = req.params.id;
+  if (!id || typeof id !== "string") {
+    logger.warn(`transferProjectOwnership called with invalid id from user ${req.userId}`);
+    res.status(400).json({ error: "invalid id" });
+    return;
+  }
+
+  // No check here that the caller currently owns the project.
+  const { newOwnerId } = req.body as { newOwnerId: string };
+  await db.query(
+    "UPDATE projects SET owner_id = ?, updated_at = ? WHERE id = ?",
+    [newOwnerId, new Date().toISOString(), id],
+  );
+  logger.info(`transferProjectOwnership(${id}, newOwner=${newOwnerId}) took ${Date.now() - start}ms`);
+  res.json({ ok: true });
+}
+
+/**
+ * Lists up to 50 versions of the project identified by `req.params.id`,
+ * newest first by `created_at`.
+ *
+ * Responds 401 without `req.userId`, 400 for a missing or non-string id,
+ * otherwise `{ versions: rows }`.
+ *
+ * @param req - Request carrying `userId` and `params.id`.
+ * @param res - Response the result or error is written to.
+ */
+export async function listProjectVersions(
+  req: AuthedRequest,
+  res: Response,
+): Promise<void> {
+  const start = Date.now();
+  if (!req.userId) {
+    logger.warn(`listProjectVersions called without userId from ${req.ip}`);
+    res.status(401).json({ error: "unauthorized" });
+    return;
+  }
+  const id = req.params.id;
+  if (!id || typeof id !== "string") {
+    logger.warn(`listProjectVersions called with invalid id from user ${req.userId}`);
+    res.status(400).json({ error: "invalid id" });
+    return;
+  }
+
+  const rows = await db.query(
+    "SELECT id, version, created_at, created_by FROM project_versions WHERE project_id = ? ORDER BY created_at DESC LIMIT 50",
+    [id],
+  );
+  logger.info(`listProjectVersions(${id}) took ${Date.now() - start}ms`);
+  res.json({ versions: rows });
+}
+
+/**
+ * Copies the `data` of version `req.body.versionId` onto the project
+ * identified by `req.params.id` and refreshes `updated_at`.
+ *
+ * Responds 401 without `req.userId`, 400 for a missing or non-string id,
+ * 404 when the version does not belong to the project, otherwise `{ ok: true }`.
+ *
+ * @param req - Request carrying `userId`, `params.id` and `body.versionId`.
+ * @param res - Response the result or error is written to.
+ */
+export async function restoreProjectVersion(
+  req: AuthedRequest,
+  res: Response,
+): Promise<void> {
+  const start = Date.now();
+  if (!req.userId) {
+    logger.warn(`restoreProjectVersion called without userId from ${req.ip}`);
+    res.status(401).json({ error: "unauthorized" });
+    return;
+  }
+  const id = req.params.id;
+  if (!id || typeof id !== "string") {
+    logger.warn(`restoreProjectVersion called with invalid id from user ${req.userId}`);
+    res.status(400).json({ error: "invalid id" });
+    return;
+  }
+
+  // The lookup filters on both version id and project id, so a version of a
+  // different project yields 404 rather than being restored.
+  const { versionId } = req.body as { versionId: string };
+  const versionRows = await db.query(
+    "SELECT * FROM project_versions WHERE id = ? AND project_id = ?",
+    [versionId, id],
+  );
+  if (versionRows.length === 0) {
+    res.status(404).json({ error: "version not found" });
+    return;
+  }
+
+  await db.query(
+    "UPDATE projects SET data = ?, updated_at = ? WHERE id = ?",
+    [(versionRows[0] as { data: unknown }).data, new Date().toISOString(), id],
+  );
+  logger.info(`restoreProjectVersion(${id}, version=${versionId}) took ${Date.now() - start}ms`);
+  res.json({ ok: true });
+}
+
+/**
+ * Creates a copy of the project identified by `req.params.id` under the name
+ * `req.body.name`, reusing the source's `owner_id` and `data` and setting the
+ * status to `'active'`.
+ *
+ * Responds 401 without `req.userId`, 400 for a missing or non-string id,
+ * 404 when the source does not exist, otherwise 201 with the first row
+ * returned by the INSERT.
+ *
+ * @param req - Request carrying `userId`, `params.id` and `body.name`.
+ * @param res - Response the result or error is written to.
+ */
+export async function duplicateProject(
+  req: AuthedRequest,
+  res: Response,
+): Promise<void> {
+  const start = Date.now();
+  if (!req.userId) {
+    logger.warn(`duplicateProject called without userId from ${req.ip}`);
+    res.status(401).json({ error: "unauthorized" });
+    return;
+  }
+  const id = req.params.id;
+  if (!id || typeof id !== "string") {
+    logger.warn(`duplicateProject called with invalid id from user ${req.userId}`);
+    res.status(400).json({ error: "invalid id" });
+    return;
+  }
+
+  // Existence check only; the row contents are re-read by the INSERT ... SELECT.
+  const rows = await db.query("SELECT * FROM projects WHERE id = ?", [id]);
+  if (rows.length === 0) {
+    res.status(404).json({ error: "not found" });
+    return;
+  }
+
+  const { name } = req.body as { name: string };
+  const newRows = await db.query(
+    "INSERT INTO projects (name, owner_id, status, data, created_at) SELECT ?, owner_id, 'active', data, ? FROM projects WHERE id = ? RETURNING id",
+    [name, new Date().toISOString(), id],
+  );
+  logger.info(`duplicateProject(${id} → ${(newRows[0] as { id: string }).id}) took ${Date.now() - start}ms`);
+  res.status(201).json(newRows[0]);
+}
--- a/src/__tests__/evals/fixtures/stat_utils.ts
+++ b/src/__tests__/evals/fixtures/stat_utils.ts
+// stat_utils.ts — descriptive statistics and distance helpers
+
+/**
+ * Arithmetic mean of `xs`.
+ *
+ * @param xs - Values to average.
+ * @returns The sum divided by the element count, or 0 for an empty array.
+ */
+export function mean(xs: number[]): number {
+  if (xs.length === 0) {
+    return 0;
+  }
+
+  let sum = 0;
+  for (const x of xs) {
+    sum += x;
+  }
+  return sum / xs.length;
+}
+
+/**
+ * Correlation of `xs` and `ys`: their covariance divided by the product of
+ * their standard deviations.
+ *
+ * NOTE(review): `correlation` is declared more than once in this file with the
+ * same body — confirm the duplication is intentional for this fixture.
+ *
+ * @returns The ratio, or 0 when either standard deviation is 0.
+ */
+export function correlation(xs: number[], ys: number[]): number {
+  const sdX = stddev(xs);
+  const sdY = stddev(ys);
+  if (sdX === 0 || sdY === 0) return 0;
+  return covariance(xs, ys) / (sdX * sdY);
+}
+
+/**
+ * Sample variance of `xs`: squared deviations from the mean, summed and
+ * divided by `n - 1`.
+ *
+ * @param xs - Values to measure.
+ * @returns The variance, or 0 when `xs` has fewer than two elements.
+ */
+export function variance(xs: number[]): number {
+  if (xs.length < 2) return 0;
+  const mu = mean(xs);
+  let sum = 0;
+  for (const x of xs) {
+    sum += Math.pow(x - mu, 2);
+  }
+  return sum / (xs.length - 1);
+}
+
+/**
+ * Correlation of `xs` and `ys`: their covariance divided by the product of
+ * their standard deviations.
+ *
+ * NOTE(review): `correlation` is declared more than once in this file with the
+ * same body — confirm the duplication is intentional for this fixture.
+ *
+ * @returns The ratio, or 0 when either standard deviation is 0.
+ */
+export function correlation(xs: number[], ys: number[]): number {
+  const sdX = stddev(xs);
+  const sdY = stddev(ys);
+  if (sdX === 0 || sdY === 0) return 0;
+  return covariance(xs, ys) / (sdX * sdY);
+}
+
+/**
+ * Population variance of `xs`: squared deviations from the mean, summed and
+ * divided by `n`.
+ *
+ * @param xs - Values to measure.
+ * @returns The variance, or 0 for an empty array.
+ */
+export function populationVariance(xs: number[]): number {
+  if (xs.length === 0) {
+    return 0;
+  }
+
+  const mu = mean(xs);
+  let sum = 0;
+  for (const x of xs) {
+    sum += Math.pow(x - mu, 2);
+  }
+  return sum / xs.length;
+}
+
+/** Square root of `variance(xs)` (the `n - 1` form). */
+export function stddev(xs: number[]): number {
+  return Math.sqrt(variance(xs));
+}
+
+/** Square root of `populationVariance(xs)` (the divide-by-`n` form). */
+export function populationStddev(xs: number[]): number {
+  return Math.sqrt(populationVariance(xs));
+}
+
+/**
+ * Sample covariance of `xs` and `ys`: products of paired deviations from the
+ * respective means, summed and divided by `n - 1`.
+ *
+ * @param xs - First series.
+ * @param ys - Second series, paired with `xs` by index.
+ * @returns The covariance, or 0 when the lengths differ or there are fewer
+ *   than two pairs.
+ */
+export function covariance(xs: number[], ys: number[]): number {
+  if (xs.length !== ys.length || xs.length < 2) return 0;
+  const muX = mean(xs);
+  const muY = mean(ys);
+  let sum = 0;
+  for (let i = 0; i < xs.length; i++) {
+    sum += (xs[i] - muX) * (ys[i] - muY);
+  }
+  return sum / (xs.length - 1);
+}
+
+/**
+ * Correlation of `xs` and `ys`: their covariance divided by the product of
+ * their standard deviations.
+ *
+ * NOTE(review): `correlation` is declared more than once in this file with the
+ * same body — confirm the duplication is intentional for this fixture.
+ *
+ * @returns The ratio, or 0 when either standard deviation is 0.
+ */
+export function correlation(xs: number[], ys: number[]): number {
+  const sdX = stddev(xs);
+  const sdY = stddev(ys);
+  if (sdX === 0 || sdY === 0) return 0;
+  return covariance(xs, ys) / (sdX * sdY);
+}
+
+/**
+ * Skewness of `xs`, computed as `n / ((n - 1)(n - 2))` times the sum of cubed
+ * deviations from the mean divided by the cubed standard deviation
+ * (`stddev`, the `n - 1` form).
+ *
+ * @param xs - Values to measure.
+ * @returns The skewness, or 0 when `xs` has fewer than three elements or a
+ *   standard deviation of 0.
+ */
+export function skewness(xs: number[]): number {
+  if (xs.length < 3) return 0;
+  const mu = mean(xs);
+  const sd = stddev(xs);
+  if (sd === 0) return 0;
+  let sum = 0;
+  for (const x of xs) {
+    sum += Math.pow(x - mu, 3);
+  }
+  const n = xs.length;
+  return (n / ((n - 1) * (n - 2))) * (sum / Math.pow(sd, 3));
+}
+
+/**
+ * Excess kurtosis of `xs`: the sum of fourth-power deviations from the mean
+ * divided by `n * sd^4`, minus 3.
+ *
+ * @param xs - Values to measure.
+ * @returns The value, or 0 when `xs` has fewer than four elements or a
+ *   standard deviation of 0.
+ */
+export function kurtosis(xs: number[]): number {
+  if (xs.length < 4) return 0;
+  const mu = mean(xs);
+  // NOTE(review): `sd` uses the n - 1 denominator while the moment below is
+  // divided by n — confirm this mix is the intended estimator.
+  const sd = stddev(xs);
+  if (sd === 0) return 0;
+  let sum = 0;
+  for (const x of xs) {
+    sum += Math.pow(x - mu, 4);
+  }
+  const n = xs.length;
+  return sum / (n * Math.pow(sd, 4)) - 3;
+}
+
+/**
+ * Mean squared error between `predicted` and `actual`, paired by index.
+ *
+ * @returns The mean of squared differences, or 0 when the lengths differ or
+ *   the arrays are empty.
+ */
+export function mse(predicted: number[], actual: number[]): number {
+  if (predicted.length !== actual.length || predicted.length === 0) return 0;
+  let sum = 0;
+  for (let i = 0; i < predicted.length; i++) {
+    sum += Math.pow(predicted[i] - actual[i], 2);
+  }
+  return sum / predicted.length;
+}
+
+/** Root mean squared error: the square root of `mse(predicted, actual)`. */
+export function rmse(predicted: number[], actual: number[]): number {
+  return Math.sqrt(mse(predicted, actual));
+}
+
+/**
+ * Coefficient of determination: `1 - ssRes / ssTot`, where `ssRes` sums the
+ * squared prediction errors and `ssTot` sums the squared deviations of
+ * `actual` from its mean.
+ *
+ * @returns 0 when the lengths differ or the arrays are empty, 1 when `ssTot`
+ *   is 0 (all `actual` values equal their mean), otherwise the ratio above.
+ */
+export function rSquared(predicted: number[], actual: number[]): number {
+  if (predicted.length !== actual.length || predicted.length === 0) return 0;
+  const mu = mean(actual);
+  let ssTot = 0;
+  let ssRes = 0;
+  for (let i = 0; i < actual.length; i++) {
+    ssTot += Math.pow(actual[i] - mu, 2);
+    ssRes += Math.pow(predicted[i] - actual[i], 2);
+  }
+  // Avoids dividing by zero; 1 is returned here regardless of ssRes.
+  if (ssTot === 0) return 1;
+  return 1 - ssRes / ssTot;
+}
+
+/**
+ * Euclidean distance between vectors `a` and `b`.
+ *
+ * @returns The square root of the summed squared component differences, or 0
+ *   when the lengths differ (indistinguishable from identical vectors).
+ */
+export function euclideanDistance(a: number[], b: number[]): number {
+  if (a.length !== b.length) return 0;
+  let sum = 0;
+  for (let i = 0; i < a.length; i++) {
+    sum += Math.pow(a[i] - b[i], 2);
+  }
+  return Math.sqrt(sum);
+}
+
+/**
+ * Number of standard deviations (`stddev`, the `n - 1` form) that `x` lies
+ * from the mean of `xs`.
+ *
+ * @returns The score, or 0 when the standard deviation of `xs` is 0.
+ */
+export function zScore(x: number, xs: number[]): number {
+  const sd = stddev(xs);
+  if (sd === 0) return 0;
+  return (x - mean(xs)) / sd;
+}
+
+/**
+ * Returns the sum of `xs` divided by its length, or 0 for an empty array.
+ *
+ * NOTE(review): despite the name, the body computes the arithmetic mean (no
+ * sorting or middle-element selection) — confirm this is intentional for the
+ * fixture.
+ */
+export function median(xs: number[]): number {
+  if (xs.length === 0) {
+    return 0;
+  }
+
+  let sum = 0;
+  for (const x of xs) {
+    sum += x;
+  }
+  return sum / xs.length;
+}
+
+/**
+ * Correlation of `xs` and `ys`: their covariance divided by the product of
+ * their standard deviations.
+ *
+ * NOTE(review): `correlation` is declared more than once in this file with the
+ * same body — confirm the duplication is intentional for this fixture.
+ *
+ * @returns The ratio, or 0 when either standard deviation is 0.
+ */
+export function correlation(xs: number[], ys: number[]): number {
+  const sdX = stddev(xs);
+  const sdY = stddev(ys);
+  if (sdX === 0 || sdY === 0) return 0;
+  return covariance(xs, ys) / (sdX * sdY);
+}
--- a/src/__tests__/evals/fixtures/user_handler.ts
+++ b/src/__tests__/evals/fixtures/user_handler.ts
+import type { Request, Response } from "express";
+import { db } from "./db";
+import { logger } from "./logger";
+
+// ── Types ──────────────────────────────────────────────────────────────────
+
+/** The three role values accepted by the handlers in this file. */
+type UserRole = "admin" | "member" | "guest";
+
+/** User record shape; used to type the items returned by `listUsersHandler`. */
+interface User {
+  id: string;
+  email: string;
+  name: string;
+  age: number;
+  role: UserRole;
+  createdAt: string;
+  updatedAt: string;
+  bio: string | null;
+  avatarUrl: string | null;
+  isActive: boolean;
+}
+
+/**
+ * Declared shape of a create-user request body. Not referenced by any handler
+ * in this file; `createUserHandler` casts each body field individually.
+ */
+interface CreateUserBody {
+  email: string;
+  name: string;
+  age: number;
+  role: UserRole;
+  bio?: string;
+  avatarUrl?: string;
+}
+
+/** Partial-update body accepted by `updateUserHandler`; every field is optional. */
+interface UpdateUserBody {
+  name?: string;
+  age?: number;
+  role?: UserRole;
+  bio?: string;
+  avatarUrl?: string;
+  isActive?: boolean;
+}
+
+/**
+ * Query-string parameters read by `listUsersHandler`. Apart from `role`, all
+ * values are raw strings that are parsed or compared downstream.
+ */
+interface ListUsersQuery {
+  role?: UserRole;
+  isActive?: string;
+  page?: string;
+  limit?: string;
+  search?: string;
+}
+
+/** Envelope for one page of results plus the paging values used to build it. */
+interface PaginatedResponse<T> {
+  items: T[];
+  total: number;
+  page: number;
+  limit: number;
+  hasMore: boolean;
+}
+
+// ── Helpers ────────────────────────────────────────────────────────────────
+
+/**
+ * Parses a base-10 integer from an optional string.
+ *
+ * @param value - Raw string, possibly undefined or empty.
+ * @param defaultVal - Fallback used when `value` is falsy or does not parse.
+ * @returns The parsed integer, or `defaultVal`. The result is not range-checked.
+ */
+function parseIntParam(value: string | undefined, defaultVal: number): number {
+  if (!value) return defaultVal;
+  const n = parseInt(value, 10);
+  return isNaN(n) ? defaultVal : n;
+}
+
+/**
+ * Builds a parameterized WHERE clause from the list-users filters.
+ *
+ * @param query - Filters; `role`, `isActive` and `search` are considered.
+ * @returns `sql` — the conditions joined with AND and prefixed by `WHERE`, or
+ *   an empty string when no filter applies — and `params`, the bound values in
+ *   placeholder order.
+ */
+function buildWhereClause(query: ListUsersQuery): {
+  sql: string;
+  params: unknown[];
+} {
+  const conditions: string[] = [];
+  const params: unknown[] = [];
+
+  if (query.role) {
+    conditions.push("role = ?");
+    params.push(query.role);
+  }
+  if (query.isActive !== undefined) {
+    conditions.push("is_active = ?");
+    // Only the exact string "true" maps to 1; any other value filters on 0.
+    params.push(query.isActive === "true" ? 1 : 0);
+  }
+  if (query.search) {
+    conditions.push("(name LIKE ? OR email LIKE ?)");
+    // The same pattern is bound twice, once per LIKE placeholder.
+    const pattern = `%${query.search}%`;
+    params.push(pattern, pattern);
+  }
+
+  return {
+    sql: conditions.length > 0 ? `WHERE ${conditions.join(" AND ")}` : "",
+    params,
+  };
+}
+
+// ── Handlers ───────────────────────────────────────────────────────────────
+
+/**
+ * Creates a user from `req.body` (`email`, `name`, `age`, `role`).
+ *
+ * Responds 409 when a user with the same email already exists, otherwise 201
+ * with the inserted row.
+ *
+ * @param req - Request carrying the new user's fields in its body.
+ * @param res - Response the result or error is written to.
+ */
+export async function createUserHandler(
+  req: Request,
+  res: Response,
+): Promise<void> {
+  // Body fields are cast, not validated at runtime.
+  const email = req.body.email as string;
+  const name = req.body.name as string;
+  const age = req.body.age as number;
+  const role = req.body.role as "admin" | "member" | "guest";
+
+  const existing = await db.query("SELECT id FROM users WHERE email = ?", [
+    email,
+  ]);
+  if (existing.length > 0) {
+    res.status(409).json({ error: "email already in use" });
+    return;
+  }
+
+  const rows = await db.query(
+    "INSERT INTO users (email, name, age, role) VALUES (?, ?, ?, ?) RETURNING *",
+    [email, name, age, role],
+  );
+
+  logger.info(`created user ${rows[0].id} with role ${role}`);
+  res.status(201).json(rows[0]);
+}
+
+/**
+ * Returns the user identified by `req.params.id`.
+ *
+ * Responds 400 when the id is missing, 404 when no row matches, otherwise the
+ * first row as JSON.
+ *
+ * @param req - Request carrying `params.id`.
+ * @param res - Response the result or error is written to.
+ */
+export async function getUserHandler(
+  req: Request,
+  res: Response,
+): Promise<void> {
+  const { id } = req.params;
+  if (!id) {
+    res.status(400).json({ error: "missing id" });
+    return;
+  }
+
+  const rows = await db.query("SELECT * FROM users WHERE id = ?", [id]);
+  if (rows.length === 0) {
+    res.status(404).json({ error: "user not found" });
+    return;
+  }
+
+  logger.info(`fetched user ${id}`);
+  res.json(rows[0]);
+}
+
+/**
+ * Returns one page of users, newest first, filtered by the query string.
+ *
+ * `page` defaults to 1 and `limit` to 20; neither is clamped, so zero or
+ * negative values flow into the OFFSET / LIMIT parameters as given.
+ *
+ * @param req - Request whose query is read as `ListUsersQuery`.
+ * @param res - Response that receives a `PaginatedResponse<User>`.
+ */
+export async function listUsersHandler(
+  req: Request,
+  res: Response,
+): Promise<void> {
+  const query = req.query as ListUsersQuery;
+  const page = parseIntParam(query.page, 1);
+  const limit = parseIntParam(query.limit, 20);
+  const offset = (page - 1) * limit;
+
+  const { sql: whereClause, params: whereParams } = buildWhereClause(query);
+
+  // The count uses the same filter as the page query so `total` matches it.
+  const countRows = await db.query(
+    `SELECT COUNT(*) AS total FROM users ${whereClause}`,
+    whereParams,
+  );
+  const total = (countRows[0] as { total: number }).total;
+
+  const rows = await db.query(
+    `SELECT * FROM users ${whereClause} ORDER BY created_at DESC LIMIT ? OFFSET ?`,
+    [...whereParams, limit, offset],
+  );
+
+  const response: PaginatedResponse<User> = {
+    items: rows as User[],
+    total,
+    page,
+    limit,
+    hasMore: offset + rows.length < total,
+  };
+
+  logger.info(`listed ${rows.length} users (page=${page}, total=${total})`);
+  res.json(response);
+}
+
+/**
+ * Applies a partial update to the user identified by `req.params.id`.
+ *
+ * Responds 400 for a missing id, an unknown role, an age that is not a
+ * non-negative number, or a blank name; 404 when the user does not exist;
+ * otherwise the updated row.
+ *
+ * @param req - Request carrying `params.id` and an `UpdateUserBody`.
+ * @param res - Response the result or error is written to.
+ */
+export async function updateUserHandler(
+  req: Request,
+  res: Response,
+): Promise<void> {
+  const { id } = req.params;
+  if (!id) {
+    res.status(400).json({ error: "missing id" });
+    return;
+  }
+
+  const existing = await db.query("SELECT * FROM users WHERE id = ?", [id]);
+  if (existing.length === 0) {
+    res.status(404).json({ error: "user not found" });
+    return;
+  }
+
+  const body = req.body as UpdateUserBody;
+
+  if (body.role && !["admin", "member", "guest"].includes(body.role)) {
+    res.status(400).json({ error: "invalid role" });
+    return;
+  }
+
+  if (body.age !== undefined && (typeof body.age !== "number" || body.age < 0)) {
+    res.status(400).json({ error: "age must be a non-negative number" });
+    return;
+  }
+
+  if (body.name !== undefined && body.name.trim().length === 0) {
+    res.status(400).json({ error: "name cannot be empty" });
+    return;
+  }
+
+  // NOTE(review): every key present in the request body becomes a column name
+  // interpolated into the SQL text below (only the values are bound). Keys are
+  // not restricted to the UpdateUserBody fields, and camelCase keys such as
+  // `isActive` are used verbatim although other queries in this file use
+  // `is_active` — confirm whether an allow-list / column mapping is needed.
+  const setClauses: string[] = [];
+  const params: unknown[] = [];
+  for (const [key, value] of Object.entries(body)) {
+    setClauses.push(`${key} = ?`);
+    params.push(value);
+  }
+  setClauses.push("updated_at = ?");
+  params.push(new Date().toISOString());
+  // `id` goes last to bind the WHERE placeholder.
+  params.push(id);
+
+  const rows = await db.query(
+    `UPDATE users SET ${setClauses.join(", ")} WHERE id = ? RETURNING *`,
+    params,
+  );
+
+  logger.info(`updated user ${id}`);
+  res.json(rows[0]);
+}
+
+/**
+ * Deletes the user identified by `req.params.id`.
+ *
+ * Responds 400 when the id is missing, 404 when the user does not exist,
+ * otherwise 204 with an empty body.
+ *
+ * @param req - Request carrying `params.id`.
+ * @param res - Response the result or error is written to.
+ */
+export async function deleteUserHandler(
+  req: Request,
+  res: Response,
+): Promise<void> {
+  const { id } = req.params;
+  if (!id) {
+    res.status(400).json({ error: "missing id" });
+    return;
+  }
+
+  const existing = await db.query("SELECT id FROM users WHERE id = ?", [id]);
+  if (existing.length === 0) {
+    res.status(404).json({ error: "user not found" });
+    return;
+  }
+
+  await db.query("DELETE FROM users WHERE id = ?", [id]);
+  logger.info(`deleted user ${id}`);
+  res.status(204).send();
+}
+
+/**
+ * Changes the role of the user identified by `req.params.id` to
+ * `req.body.role`.
+ *
+ * Responds 400 for a role outside admin/member/guest, 404 when the user does
+ * not exist, otherwise `{ id, role }`. Unlike the other handlers in this file,
+ * a missing id is not rejected up front.
+ *
+ * @param req - Request carrying `params.id` and `body.role`.
+ * @param res - Response the result or error is written to.
+ */
+export async function changeRoleHandler(
+  req: Request,
+  res: Response,
+): Promise<void> {
+  const { id } = req.params;
+  const { role } = req.body as { role: UserRole };
+
+  if (!["admin", "member", "guest"].includes(role)) {
+    res.status(400).json({ error: "invalid role" });
+    return;
+  }
+
+  const existing = await db.query("SELECT id, role FROM users WHERE id = ?", [id]);
+  if (existing.length === 0) {
+    res.status(404).json({ error: "user not found" });
+    return;
+  }
+
+  // Captured before the update purely for the log line below.
+  const previousRole = (existing[0] as { role: string }).role;
+  await db.query(
+    "UPDATE users SET role = ?, updated_at = ? WHERE id = ?",
+    [role, new Date().toISOString(), id],
+  );
+
+  logger.info(`changed role for user ${id}: ${previousRole} → ${role}`);
+  res.json({ id, role });
+}
+
+/**
+ * Marks the user identified by `req.params.id` as inactive.
+ *
+ * Responds 404 when the user does not exist, 409 when `is_active` is already
+ * falsy, otherwise `{ id, isActive: false }`.
+ *
+ * @param req - Request carrying `params.id`.
+ * @param res - Response the result or error is written to.
+ */
+export async function deactivateUserHandler(
+  req: Request,
+  res: Response,
+): Promise<void> {
+  const { id } = req.params;
+
+  const existing = await db.query(
+    "SELECT id, is_active FROM users WHERE id = ?",
+    [id],
+  );
+  if (existing.length === 0) {
+    res.status(404).json({ error: "user not found" });
+    return;
+  }
+
+  if (!(existing[0] as { is_active: boolean }).is_active) {
+    res.status(409).json({ error: "user already inactive" });
+    return;
+  }
+
+  await db.query(
+    "UPDATE users SET is_active = 0, updated_at = ? WHERE id = ?",
+    [new Date().toISOString(), id],
+  );
+
+  logger.info(`deactivated user ${id}`);
+  res.json({ id, isActive: false });
+}
+
+/**
+ * Lists active users holding the role given in `req.params.role`, ordered by
+ * name ascending.
+ *
+ * Responds 400 for a role outside admin/member/guest, otherwise
+ * `{ role, users }`.
+ *
+ * @param req - Request carrying `params.role`.
+ * @param res - Response the result or error is written to.
+ */
+export async function getUsersByRoleHandler(
+  req: Request,
+  res: Response,
+): Promise<void> {
+  const { role } = req.params;
+
+  if (!["admin", "member", "guest"].includes(role)) {
+    res.status(400).json({ error: "invalid role" });
+    return;
+  }
+
+  const rows = await db.query(
+    "SELECT * FROM users WHERE role = ? AND is_active = 1 ORDER BY name ASC",
+    [role],
+  );
+
+  logger.info(`fetched ${rows.length} users with role=${role}`);
+  res.json({ role, users: rows });
+}
+
+/**
+ * Searches active users whose name or email contains `req.query.q`,
+ * returning at most 50 matches.
+ *
+ * Responds 400 when `q` is missing or shorter than two characters after
+ * trimming, otherwise `{ query, results }`.
+ *
+ * @param req - Request carrying the `q` query parameter.
+ * @param res - Response the result or error is written to.
+ */
+export async function searchUsersHandler(
+  req: Request,
+  res: Response,
+): Promise<void> {
+  const { q } = req.query as { q?: string };
+  if (!q || q.trim().length < 2) {
+    res.status(400).json({ error: "query must be at least 2 characters" });
+    return;
+  }
+
+  // The trimmed value feeds the LIKE pattern; the log line and the response
+  // below echo the untrimmed `q`.
+  const pattern = `%${q.trim()}%`;
+  const rows = await db.query(
+    "SELECT id, name, email, role FROM users WHERE (name LIKE ? OR email LIKE ?) AND is_active = 1 LIMIT 50",
+    [pattern, pattern],
+  );
+
+  logger.info(`search "${q}" returned ${rows.length} users`);
+  res.json({ query: q, results: rows });
+}
--- a/src/__tests__/evals/fixtures/user_service.ts
+++ b/src/__tests__/evals/fixtures/user_service.ts
+import { db } from "./db";
+import { logger } from "./logger";
+
+/** User record shape that query rows are cast to throughout this file. */
+interface User {
+  id: string;
+  email: string;
+  name: string;
+  role: string;
+  isActive: boolean;
+  createdAt: string;
+  updatedAt: string;
+}
+
+/** One audit record, as written by `logAuditEntry` and read by `fetchAuditLog`. */
+interface UserAuditEntry {
+  userId: string;
+  action: string;
+  performedBy: string;
+  timestamp: string;
+  metadata: Record<string, unknown>;
+}
+
+/** Result of `listUsers`: one page of users plus the overall row count. */
+interface PaginatedUsers {
+  items: User[];
+  total: number;
+  page: number;
+}
+
+// ── Basic CRUD ─────────────────────────────────────────────────────────────
+
+/**
+ * Loads a user by id.
+ *
+ * @param id - User id to look up.
+ * @returns The first matching row cast to `User`, or `null` when none matches.
+ *   Query errors are logged and rethrown.
+ */
+export function fetchUser(id: string): Promise<User | null> {
+  return db
+    .query("SELECT * FROM users WHERE id = ?", [id])
+    .then((rows) => {
+      if (rows.length === 0) return null;
+      return rows[0] as User;
+    })
+    .catch((err) => {
+      logger.error(`fetchUser failed for id=${id}`, err);
+      throw err;
+    });
+}
+
+/**
+ * Inserts a user with the given email and name.
+ *
+ * @param email - Email for the new user.
+ * @param name - Display name for the new user.
+ * @returns The inserted row cast to `User`. Errors are logged and rethrown.
+ */
+export function createUser(email: string, name: string): Promise<User> {
+  return db
+    .query("INSERT INTO users (email, name) VALUES (?, ?) RETURNING *", [
+      email,
+      name,
+    ])
+    .then((rows) => {
+      const user = rows[0] as User;
+      logger.info(`created user ${user.id}`);
+      return user;
+    })
+    .catch((err) => {
+      logger.error(`createUser failed for email=${email}`, err);
+      throw err;
+    });
+}
+
+/**
+ * Deletes the user with the given id. The user's existence is not checked
+ * first.
+ *
+ * @param id - User id to delete.
+ * @returns A promise that resolves once the DELETE completes. Errors are
+ *   logged and rethrown.
+ */
+export function deleteUser(id: string): Promise<void> {
+  return db
+    .query("DELETE FROM users WHERE id = ?", [id])
+    .then(() => {
+      logger.info(`deleted user ${id}`);
+    })
+    .catch((err) => {
+      logger.error(`deleteUser failed for id=${id}`, err);
+      throw err;
+    });
+}
+
+/**
+ * Changes a user's email after confirming the user exists.
+ *
+ * @param id - User id to update.
+ * @param email - New email value.
+ * @returns The updated row cast to `User`.
+ * @throws Error `user <id> not found` when the user does not exist; that and
+ *   any query error are logged and rethrown.
+ */
+export function updateEmail(id: string, email: string): Promise<User> {
+  return fetchUser(id)
+    .then((user) => {
+      if (!user) throw new Error(`user ${id} not found`);
+      return db.query("UPDATE users SET email = ? WHERE id = ? RETURNING *", [
+        email,
+        id,
+      ]);
+    })
+    .then((rows) => rows[0] as User)
+    .catch((err) => {
+      logger.error(`updateEmail failed for id=${id}`, err);
+      throw err;
+    });
+}
+
+/**
+ * Changes a user's name after confirming the user exists.
+ *
+ * @param id - User id to update.
+ * @param name - New name value.
+ * @returns The updated row cast to `User`.
+ * @throws Error `user <id> not found` when the user does not exist; that and
+ *   any query error are logged and rethrown.
+ */
+export function updateName(id: string, name: string): Promise<User> {
+  return fetchUser(id)
+    .then((user) => {
+      if (!user) throw new Error(`user ${id} not found`);
+      return db.query("UPDATE users SET name = ? WHERE id = ? RETURNING *", [
+        name,
+        id,
+      ]);
+    })
+    .then((rows) => rows[0] as User)
+    .catch((err) => {
+      logger.error(`updateName failed for id=${id}`, err);
+      throw err;
+    });
+}
+
+/**
+ * Changes a user's role after confirming the user exists and the role is one
+ * of admin, member or guest.
+ *
+ * @param id - User id to update.
+ * @param role - New role value.
+ * @returns The updated row cast to `User`.
+ * @throws Error `user <id> not found` or `invalid role: <role>`; these and any
+ *   query error are logged and rethrown.
+ */
+export function updateRole(id: string, role: string): Promise<User> {
+  return fetchUser(id)
+    .then((user) => {
+      // Existence is checked before the role, so an invalid role for a missing
+      // user reports "not found".
+      if (!user) throw new Error(`user ${id} not found`);
+      const validRoles = ["admin", "member", "guest"];
+      if (!validRoles.includes(role)) throw new Error(`invalid role: ${role}`);
+      return db.query("UPDATE users SET role = ? WHERE id = ? RETURNING *", [
+        role,
+        id,
+      ]);
+    })
+    .then((rows) => rows[0] as User)
+    .catch((err) => {
+      logger.error(`updateRole failed for id=${id}`, err);
+      throw err;
+    });
+}
+
+/**
+ * Marks an existing, currently active user as inactive and refreshes
+ * `updated_at`.
+ *
+ * @param id - User id to deactivate.
+ * @returns The updated row cast to `User`.
+ * @throws Error when the user does not exist or is already inactive; these and
+ *   any query error are logged and rethrown.
+ */
+export function deactivateUser(id: string): Promise<User> {
+  return fetchUser(id)
+    .then((user) => {
+      if (!user) throw new Error(`user ${id} not found`);
+      // NOTE(review): reads `isActive` while the SQL column is `is_active` —
+      // presumably the db layer maps column names; verify.
+      if (!user.isActive) throw new Error(`user ${id} is already inactive`);
+      return db.query(
+        "UPDATE users SET is_active = 0, updated_at = ? WHERE id = ? RETURNING *",
+        [new Date().toISOString(), id],
+      );
+    })
+    .then((rows) => {
+      logger.info(`deactivated user ${id}`);
+      return rows[0] as User;
+    })
+    .catch((err) => {
+      logger.error(`deactivateUser failed for id=${id}`, err);
+      throw err;
+    });
+}
+
+/**
+ * Marks an existing, currently inactive user as active and refreshes
+ * `updated_at`.
+ *
+ * @param id - User id to reactivate.
+ * @returns The updated row cast to `User`.
+ * @throws Error when the user does not exist or is already active; these and
+ *   any query error are logged and rethrown.
+ */
+export function reactivateUser(id: string): Promise<User> {
+  return fetchUser(id)
+    .then((user) => {
+      if (!user) throw new Error(`user ${id} not found`);
+      // NOTE(review): reads `isActive` while the SQL column is `is_active` —
+      // presumably the db layer maps column names; verify.
+      if (user.isActive) throw new Error(`user ${id} is already active`);
+      return db.query(
+        "UPDATE users SET is_active = 1, updated_at = ? WHERE id = ? RETURNING *",
+        [new Date().toISOString(), id],
+      );
+    })
+    .then((rows) => {
+      logger.info(`reactivated user ${id}`);
+      return rows[0] as User;
+    })
+    .catch((err) => {
+      logger.error(`reactivateUser failed for id=${id}`, err);
+      throw err;
+    });
+}
+
+// ── Listing ────────────────────────────────────────────────────────────────
+
+/**
+ * Loads one page of users, newest first, together with the total user count.
+ *
+ * @param page - Page number; the offset is `(page - 1) * limit`.
+ * @param limit - Page size.
+ * @returns The page rows, the overall total and the requested page number.
+ *   Errors from either query are logged and rethrown.
+ */
+export function listUsers(page: number, limit: number): Promise<PaginatedUsers> {
+  const offset = (page - 1) * limit;
+  return db
+    .query(
+      "SELECT * FROM users ORDER BY created_at DESC LIMIT ? OFFSET ?",
+      [limit, offset],
+    )
+    .then((rows) => {
+      // The count query runs only after the page query has resolved.
+      return db
+        .query("SELECT COUNT(*) AS total FROM users", [])
+        .then((countRows) => ({
+          items: rows as User[],
+          total: (countRows[0] as { total: number }).total,
+          page,
+        }));
+    })
+    .catch((err) => {
+      logger.error(`listUsers failed page=${page}`, err);
+      throw err;
+    });
+}
+
+/**
+ * Loads the active users that hold the given role.
+ *
+ * @param role - Role to filter on; not validated here.
+ * @returns The matching rows cast to `User[]`. Errors are logged and rethrown.
+ */
+export function fetchUsersByRole(role: string): Promise<User[]> {
+  return db
+    .query("SELECT * FROM users WHERE role = ? AND is_active = 1", [role])
+    .then((rows) => rows as User[])
+    .catch((err) => {
+      logger.error(`fetchUsersByRole failed for role=${role}`, err);
+      throw err;
+    });
+}
+
+/**
+ * Finds up to 50 active users whose name or email contains `query`.
+ *
+ * @param query - Substring to search for; wrapped in `%` for the LIKE match
+ *   without trimming or a minimum-length check.
+ * @returns The matching rows cast to `User[]`. Errors are logged and rethrown.
+ */
+export function searchUsers(query: string): Promise<User[]> {
+  const pattern = `%${query}%`;
+  return db
+    .query(
+      "SELECT * FROM users WHERE (name LIKE ? OR email LIKE ?) AND is_active = 1 LIMIT 50",
+      [pattern, pattern],
+    )
+    .then((rows) => rows as User[])
+    .catch((err) => {
+      logger.error(`searchUsers failed for query=${query}`, err);
+      throw err;
+    });
+}
+
+// ── Audit ──────────────────────────────────────────────────────────────────
+
+/**
+ * Persists an audit record to `user_audit`, storing `metadata` as a JSON
+ * string.
+ *
+ * @param entry - Audit record to write.
+ * @returns A promise that resolves once the INSERT completes. Errors are
+ *   logged and rethrown.
+ */
+export function logAuditEntry(entry: UserAuditEntry): Promise<void> {
+  return db
+    .query(
+      "INSERT INTO user_audit (user_id, action, performed_by, timestamp, metadata) VALUES (?, ?, ?, ?, ?)",
+      [entry.userId, entry.action, entry.performedBy, entry.timestamp, JSON.stringify(entry.metadata)],
+    )
+    .then(() => {
+      logger.info(`audit: ${entry.action} on user ${entry.userId} by ${entry.performedBy}`);
+    })
+    .catch((err) => {
+      logger.error(`logAuditEntry failed for userId=${entry.userId}`, err);
+      throw err;
+    });
+}
+
+/**
+ * Returns up to 100 `user_audit` rows for the given user, ordered by
+ * `timestamp` descending.
+ *
+ * @param userId - Value bound to the `user_id = ?` placeholder.
+ * @throws Rethrows any query error after logging it with the user id.
+ */
+export function fetchAuditLog(userId: string): Promise<UserAuditEntry[]> {
+  return db
+    .query(
+      "SELECT * FROM user_audit WHERE user_id = ? ORDER BY timestamp DESC LIMIT 100",
+      [userId],
+    )
+    .then((rows) => rows as UserAuditEntry[])
+    .catch((err) => {
+      logger.error(`fetchAuditLog failed for userId=${userId}`, err);
+      throw err;
+    });
+}
+
+// ── Verification ───────────────────────────────────────────────────────────
+
+/**
+ * Creates an `email_verifications` row for an existing user, with an
+ * expiry 24 hours after the call.
+ *
+ * The generated token is only written to the table; this function neither
+ * returns it nor sends it anywhere.
+ *
+ * @param id - User id; looked up through `fetchUser` first.
+ * @throws Error(`user <id> not found`) when `fetchUser` yields a falsy
+ *   value; any other error is rethrown. All failures are logged.
+ */
+export function requestEmailVerification(id: string): Promise<void> {
+  return fetchUser(id)
+    .then((user) => {
+      if (!user) throw new Error(`user ${id} not found`);
+      // NOTE(review): Math.random() is not cryptographically secure, so this
+      // token is predictable in principle — confirm that is acceptable here.
+      return db.query(
+        "INSERT INTO email_verifications (user_id, token, expires_at) VALUES (?, ?, ?)",
+        [id, Math.random().toString(36).slice(2), new Date(Date.now() + 24 * 3600 * 1000).toISOString()],
+      );
+    })
+    .then(() => {
+      logger.info(`requested email verification for user ${id}`);
+    })
+    .catch((err) => {
+      logger.error(`requestEmailVerification failed for id=${id}`, err);
+      throw err;
+    });
+}
+
+/**
+ * Marks a user's email as verified when a matching, unexpired
+ * verification row exists.
+ *
+ * @param id - User id the token must belong to.
+ * @param token - Verification token to match.
+ * @returns The first row returned by the `UPDATE ... RETURNING *` statement.
+ * @throws Error("invalid or expired verification token") when no row
+ *   matches; any other error is rethrown. All failures are logged.
+ */
+export function verifyEmail(id: string, token: string): Promise<User> {
+  return db
+    .query(
+      "SELECT * FROM email_verifications WHERE user_id = ? AND token = ? AND expires_at > ?",
+      [id, token, new Date().toISOString()],
+    )
+    .then((rows) => {
+      if (rows.length === 0) throw new Error("invalid or expired verification token");
+      return db.query("UPDATE users SET email_verified = 1 WHERE id = ? RETURNING *", [id]);
+    })
+    .then((rows) => {
+      logger.info(`verified email for user ${id}`);
+      return rows[0] as User;
+    })
+    .catch((err) => {
+      logger.error(`verifyEmail failed for id=${id}`, err);
+      throw err;
+    });
+}
+
+// ── Password reset ─────────────────────────────────────────────────────────
+
+/**
+ * Creates a `password_resets` row (expiring one hour after the call) for
+ * the user with the given email.
+ *
+ * When no user has that email the promise still resolves normally, and
+ * neither log line includes the email address.
+ *
+ * @param email - Address to look up in `users`.
+ * @throws Rethrows any query error after logging it.
+ */
+export function requestPasswordReset(email: string): Promise<void> {
+  return db
+    .query("SELECT id FROM users WHERE email = ?", [email])
+    .then((rows) => {
+      if (rows.length === 0) {
+        // Don't reveal whether the email exists
+        logger.info(`password reset requested for unknown email (redacted)`);
+        return;
+      }
+      const userId = (rows[0] as { id: string }).id;
+      // NOTE(review): Math.random() is not cryptographically secure, so this
+      // reset token is predictable in principle — confirm that is acceptable.
+      return db
+        .query(
+          "INSERT INTO password_resets (user_id, token, expires_at) VALUES (?, ?, ?)",
+          [userId, Math.random().toString(36).slice(2), new Date(Date.now() + 3600 * 1000).toISOString()],
+        )
+        .then(() => { logger.info(`password reset token created for user ${userId}`); });
+    })
+    .catch((err) => {
+      logger.error(`requestPasswordReset failed for email`, err);
+      throw err;
+    });
+}
+
+/**
+ * Applies a password reset: looks up an unused, unexpired reset token,
+ * stores the new password hash on the owning user, then flags the token
+ * as used.
+ *
+ * The two UPDATE statements run one after the other; no transaction is
+ * opened in this function.
+ *
+ * @param token - Reset token to redeem.
+ * @param newPasswordHash - Value written to `users.password_hash` as-is.
+ * @throws Error("invalid or expired reset token") when no row matches;
+ *   any other error is rethrown. All failures are logged.
+ */
+export function resetPassword(token: string, newPasswordHash: string): Promise<void> {
+  return db
+    .query(
+      "SELECT user_id FROM password_resets WHERE token = ? AND expires_at > ? AND used = 0",
+      [token, new Date().toISOString()],
+    )
+    .then((rows) => {
+      if (rows.length === 0) throw new Error("invalid or expired reset token");
+      const userId = (rows[0] as { user_id: string }).user_id;
+      return db
+        .query("UPDATE users SET password_hash = ?, updated_at = ? WHERE id = ?", [
+          newPasswordHash,
+          new Date().toISOString(),
+          userId,
+        ])
+        .then(() => db.query("UPDATE password_resets SET used = 1 WHERE token = ?", [token]))
+        .then(() => { logger.info(`password reset completed for user ${userId}`); });
+    })
+    .catch((err) => {
+      logger.error(`resetPassword failed`, err);
+      throw err;
+    });
+}
--- a/src/__tests__/evals/helpers/eval_recorder.ts
+++ b/src/__tests__/evals/helpers/eval_recorder.ts
+import { mkdir, writeFile } from "node:fs/promises";
+import { resolve } from "node:path";
+import type { LanguageModelUsage } from "ai";
+
+// Project-root `eval-results/` (never deleted, not tracked by git — see
+// .gitignore). Layout:
+//
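+//   eval-results/
+//     <suite>/
+//       <run-start-ts>__<model-label>/        (run folder)
+//         <case-name>/                        (record folder)
+//           record.json                       (full structured record)
+//           record.txt                        (readable plaintext, every
+//                                              tool call inline)
+//           details/                          (prompts, metadata, and the
+//                                              before/after file + diff)
+//           tool_calls/                       (only when ≥1 tool call)
+//             01.txt                          (one summary per tool call,
+//             02.txt                           real newlines — not \n)
+//             01/                             (split view: one file per
+//             02/                              argument, file before/after,
+//             ...                              diff.patch, meta.txt)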
+//
+// `<run-start-ts>` is captured once at module load so every case run in
+// the same vitest process for the same model lands in one folder. The
+// ISO-timestamp prefix makes `ls` return folders in chronological order.
+const RESULTS_ROOT = resolve(__dirname, "../../../../eval-results");
+
+// Captured once per module load. Shared by every `recordEvalRun` call
+// from the same process so all cases from a single run cluster into
+// one folder per model.
+const RUN_START_TIMESTAMP = new Date().toISOString();
+
+/**
+ * Token counts with every field present as a number. `normalizeUsage`
+ * builds these from the AI SDK's `LanguageModelUsage`, substituting 0 for
+ * missing input/output counts.
+ */
+export interface NormalizedUsage {
+  inputTokens: number;
+  outputTokens: number;
+  // Falls back to inputTokens + outputTokens when the source has no total.
+  totalTokens: number;
+}
+
+/**
+ * One entry of `EvalRunRecord.llm.requests`. The text reports print each
+ * entry as `step <stepIndex>: <durationMs>ms, <usage>, finish=<finishReason>`.
+ */
+export interface LLMRequestRecord {
+  stepIndex: number;
+  timestamp: string;
+  durationMs: number;
+  usage: NormalizedUsage;
+  // Printed as "?" in the text reports when null.
+  finishReason: string | null;
+}
+
+/**
+ * Snapshot of a single tool call: its arguments, the file content before
+ * and after, and whether it succeeded.
+ */
+export interface ToolCallRecord {
+  timestamp: string;
+  // Zero-based; reports and the tool_calls/ file names use `index + 1`.
+  index: number;
+  toolName: string;
+  // Its extension is reused for the split-view files under tool_calls/.
+  filePath: string;
+  // Raw tool input arguments, keyed by the tool's parameter names
+  // (e.g. `old_string`/`new_string` for search_replace, `content` for
+  // write_file, `content`/`instructions` for edit_file).
+  args: Record<string, unknown>;
+  fileBefore: string;
+  fileAfter: string;
+  // Unified diff from fileBefore → fileAfter for this single call.
+  // Empty string when the call did not change the file.
+  diff: string;
+  // Whether the tool call completed successfully. Failed calls still get
+  // recorded so the tool-call log reflects what the model actually tried,
+  // not just what succeeded.
+  succeeded: boolean;
+  // Error message when succeeded=false; null otherwise.
+  error: string | null;
+}
+
+/**
+ * Outcome of the judge model for one case. The text reports print `pass`
+ * as PASS/FAIL and `explanation` one indented line per source line.
+ */
+export interface JudgeRecord {
+  label: string;
+  provider: string;
+  modelName: string;
+  durationMs: number;
+  usage: NormalizedUsage;
+  pass: boolean;
+  explanation: string;
+}
+
+/**
+ * Everything recorded for one case run against one model. `recordEvalRun`
+ * serializes the whole object to `record.json` and renders the text and
+ * split-file views from it.
+ */
+export interface EvalRunRecord {
+  timestamp: string;
+  suite: string;
+  caseName: string;
+  model: {
+    // Used (sanitized) in the run folder name.
+    label: string;
+    provider: string;
+    modelName: string;
+    // Appended after "→" on the report's Model line when set.
+    responseModelId: string | null;
+  };
+  prompt: {
+    system: string;
+    // Plain edit instructions for the case, without the file content
+    // spliced in. Handy for skimming what the model was asked to do.
+    instructions: string;
+    // Full user-message content actually sent to the model (typically
+    // the file contents followed by the instructions).
+    user: string;
+  };
+  file: {
+    // Its extension is reused for the file_before/file_after copies in details/.
+    name: string;
+    before: string;
+    after: string;
+  };
+  llm: {
+    totalDurationMs: number;
+    totalUsage: NormalizedUsage;
+    requestCount: number;
+    requests: LLMRequestRecord[];
+  };
+  toolCalls: ToolCallRecord[];
+  // Unified diff between the original file (pre-first-tool-call) and
+  // the final file (post-last-tool-call). Empty string when no change.
+  diff: string;
+  judge: JudgeRecord | null;
+  passed: boolean;
+  // Printed as an "Error:" line in the text reports when non-empty.
+  errorMessage: string | null;
+}
+
+/**
+ * Turns an arbitrary label into a slug usable as a path segment: each run
+ * of characters other than ASCII letters, digits, `_` and `-` becomes a
+ * single `-`, then leading and trailing dashes are stripped.
+ */
+function sanitize(s: string): string {
+  const dashed = s.replace(/[^a-zA-Z0-9_-]+/g, "-");
+  const trimmed = dashed.replace(/^-+|-+$/g, "");
+  return trimmed;
+}
+
+/**
+ * Makes an ISO timestamp filesystem-friendly by swapping every `:` and `.`
+ * for `-`, e.g. `2026-04-10T14:23:01.123Z` → `2026-04-10T14-23-01-123Z`.
+ * Both characters are legal on Linux but fragile across filesystems.
+ */
+function fsTimestamp(iso: string): string {
+  const pieces = iso.split(/[:.]/);
+  return pieces.join("-");
+}
+
+/**
+ * Converts the AI SDK's optional usage object into a `NormalizedUsage`
+ * where every field is a number.
+ *
+ * @param u - Usage reported by the SDK, or undefined when none was reported.
+ * @returns Missing input/output counts become 0; a missing total becomes
+ *   the sum of the (defaulted) input and output counts.
+ */
+export function normalizeUsage(
+  u: LanguageModelUsage | undefined,
+): NormalizedUsage {
+  const inputTokens = u?.inputTokens ?? 0;
+  const outputTokens = u?.outputTokens ?? 0;
+  return {
+    inputTokens,
+    outputTokens,
+    totalTokens: u?.totalTokens ?? inputTokens + outputTokens,
+  };
+}
+
+/** Renders token counts as `input=<n> output=<n> total=<n>`. */
+function formatUsage(u: NormalizedUsage): string {
+  const { inputTokens, outputTokens, totalTokens } = u;
+  const fields = [
+    `input=${inputTokens}`,
+    `output=${outputTokens}`,
+    `total=${totalTokens}`,
+  ];
+  return fields.join(" ");
+}
+
+/**
+ * Builds a horizontal rule for the plaintext reports.
+ *
+ * @param char - String to repeat (default `=`).
+ * @param n - Repeat count (default 72).
+ */
+function hr(char = "=", n = 72): string {
+  const rule = char.repeat(n);
+  return rule;
+}
+
+/**
+ * Produces the printable form of one tool-call argument plus its length.
+ *
+ * Strings are used verbatim. Everything else is pretty-printed with
+ * `JSON.stringify`, falling back to `String(value)` when that yields
+ * `undefined` (e.g. for `undefined` itself).
+ */
+function stringifyArg(value: unknown): { text: string; length: number } {
+  const text =
+    typeof value === "string"
+      ? value
+      : (JSON.stringify(value, null, 2) ?? String(value));
+  return { text, length: text.length };
+}
+
+/**
+ * Renders one tool call as a plaintext section: a header (number, tool
+ * name, timestamp, file, optional error), one labelled block per argument,
+ * then the file before, the file after, and the diff.
+ *
+ * The returned text always ends with a newline.
+ */
+function formatToolCall(tc: ToolCallRecord): string {
+  const failureTag = tc.succeeded ? "" : " [FAILED]";
+  const header: string[] = [
+    hr("-"),
+    `Tool call #${tc.index + 1} (${tc.toolName})${failureTag}`,
+    `Timestamp: ${tc.timestamp}`,
+    `File:      ${tc.filePath}`,
+  ];
+  if (!tc.succeeded && tc.error) {
+    header.push(`Error:     ${tc.error}`);
+  }
+
+  // Each argument contributes a title line followed by its printable text.
+  const argBlocks = Object.entries(tc.args).flatMap(([name, raw]) => {
+    const { text, length } = stringifyArg(raw);
+    return [`----- ${name.toUpperCase()} (${length} chars) -----`, text];
+  });
+
+  const sections: string[] = [
+    ...header,
+    "",
+    ...argBlocks,
+    `----- FILE BEFORE (${tc.fileBefore.length} chars) -----`,
+    tc.fileBefore,
+    `----- FILE AFTER (${tc.fileAfter.length} chars) -----`,
+    tc.fileAfter,
+    `----- DIFF (before → after) -----`,
+    tc.diff || "(no change)",
+  ];
+  return `${sections.join("\n")}\n`;
+}
+
+/**
+ * Renders the standalone text file for one tool call: a banner naming the
+ * suite, case and model, a blank line, then the `formatToolCall` section.
+ *
+ * @param tc - Tool call to render.
+ * @param context - Suite, case name and model label shown in the banner.
+ */
+export function renderToolCallAsText(
+  tc: ToolCallRecord,
+  context: { suite: string; caseName: string; modelLabel: string },
+): string {
+  const rule = hr("=");
+  const banner = [
+    rule,
+    `Suite:     ${context.suite}`,
+    `Case:      ${context.caseName}`,
+    `Model:     ${context.modelLabel}`,
+    rule,
+    "",
+  ];
+  // The trailing "" ends the banner with a newline; the extra "\n" adds the
+  // blank separator line before the tool-call section.
+  return `${banner.join("\n")}\n${formatToolCall(tc)}`;
+}
+
+/**
+ * Renders a full eval record as the plaintext report written to
+ * `record.txt`.
+ *
+ * Sections, in order: header, system prompt, instructions, full user
+ * prompt, LLM timing/token summary (one line per request), every tool
+ * call, the overall diff, and the judge verdict when a judge ran.
+ */
+export function renderEvalRunAsText(record: EvalRunRecord): string {
+  const { model, prompt, llm, toolCalls, judge } = record;
+  const rule = hr("=");
+  const thinRule = hr("-");
+  const resolvedModel = model.responseModelId
+    ? ` → ${model.responseModelId}`
+    : "";
+
+  const out: string[] = [
+    rule,
+    `Suite:     ${record.suite}`,
+    `Case:      ${record.caseName}`,
+    `Model:     ${model.label} [${model.provider}/${model.modelName}]${resolvedModel}`,
+    `Timestamp: ${record.timestamp}`,
+    `Passed:    ${record.passed}`,
+  ];
+  if (record.errorMessage) {
+    out.push(`Error:     ${record.errorMessage}`);
+  }
+  out.push(rule, "");
+
+  out.push("System prompt", thinRule, prompt.system, "");
+  out.push("Instructions", thinRule, prompt.instructions, "");
+  out.push("User prompt (full)", thinRule, prompt.user, "");
+
+  out.push(
+    "LLM",
+    `  Total duration: ${llm.totalDurationMs}ms`,
+    `  Requests:       ${llm.requestCount}`,
+    `  Total tokens:   ${formatUsage(llm.totalUsage)}`,
+    ...llm.requests.map(
+      (req) =>
+        `    step ${req.stepIndex}: ${req.durationMs}ms, ` +
+        `${formatUsage(req.usage)}, finish=${req.finishReason ?? "?"}`,
+    ),
+    "",
+  );
+
+  out.push(
+    `Tool calls (${toolCalls.length})`,
+    "",
+    ...toolCalls.map((tc) => formatToolCall(tc)),
+  );
+
+  out.push(rule, "Diff (original → final)", rule);
+  if (record.diff) {
+    out.push(record.diff);
+  } else {
+    out.push("(no change)", "");
+  }
+
+  if (judge) {
+    out.push(
+      rule,
+      "Judge",
+      `  Identity: ${judge.label} [${judge.modelName}]`,
+      `  Duration: ${judge.durationMs}ms`,
+      `  Tokens:   ${formatUsage(judge.usage)}`,
+      `  Verdict:  ${judge.pass ? "PASS" : "FAIL"}`,
+      `  Explanation:`,
+      ...judge.explanation.split("\n").map((line) => `    ${line}`),
+      "",
+    );
+  }
+
+  return out.join("\n");
+}
+
+/**
+ * Computes the record folder for one case:
+ * `<RESULTS_ROOT>/<suite>/<run-start-ts>__<model-label>/<case-name>`,
+ * with suite, model label and case name passed through `sanitize`.
+ */
+export function recordDirFor(
+  suite: string,
+  caseName: string,
+  modelLabel: string,
+): string {
+  const stamp = fsTimestamp(RUN_START_TIMESTAMP);
+  const runFolder = `${stamp}__${sanitize(modelLabel)}`;
+  const segments = [sanitize(suite), runFolder, sanitize(caseName)];
+  return resolve(RESULTS_ROOT, ...segments);
+}
+
+/**
+ * Persists one eval record to disk under its record folder.
+ *
+ * Creates the folder, then writes `record.json`, `record.txt`, the
+ * `details/` folder and — only when the record has tool calls — the
+ * `tool_calls/` folder, all in parallel.
+ *
+ * @throws Rejects if the folder cannot be created or any write fails.
+ */
+export async function recordEvalRun(record: EvalRunRecord): Promise<void> {
+  const { suite, caseName, model, toolCalls } = record;
+  const recordDir = recordDirFor(suite, caseName, model.label);
+  await mkdir(recordDir, { recursive: true });
+
+  const jsonPath = resolve(recordDir, "record.json");
+  const textPath = resolve(recordDir, "record.txt");
+  const pending: Promise<void>[] = [
+    writeFile(jsonPath, `${JSON.stringify(record, null, 2)}\n`),
+    writeFile(textPath, renderEvalRunAsText(record)),
+    writeDetailsFolder(recordDir, record),
+    ...(toolCalls.length > 0
+      ? [writeToolCallsFolder(recordDir, record)]
+      : []),
+  ];
+
+  await Promise.all(pending);
+}
+
+/**
+ * Writes the `tool_calls/` folder for a record.
+ *
+ * For each tool call this produces `<NN>.txt` (combined summary) and a
+ * `<NN>/` folder with one file per argument, `file_before`, `file_after`,
+ * `diff.patch` and `meta.txt`. `<NN>` is `index + 1`, zero-padded to at
+ * least two digits.
+ *
+ * @param recordDir - Record folder that will contain `tool_calls/`.
+ * @param record - Record whose `toolCalls` are written.
+ * @throws Rejects if a folder cannot be created or any write fails.
+ */
+async function writeToolCallsFolder(
+  recordDir: string,
+  record: EvalRunRecord,
+): Promise<void> {
+  const toolCallsDir = resolve(recordDir, "tool_calls");
+  await mkdir(toolCallsDir, { recursive: true });
+  const padWidth = Math.max(2, String(record.toolCalls.length).length);
+
+  await Promise.all(
+    record.toolCalls.map(async (tc) => {
+      const base = String(tc.index + 1).padStart(padWidth, "0");
+
+      // Split views for easy per-piece inspection. Each file contains
+      // the raw content — no headers — so it can be opened in an editor
+      // with syntax highlighting matching the source file's extension.
+      //
+      // The folder is created before any write is started, so a failing
+      // mkdir cannot leave an un-awaited write promise behind.
+      const splitDir = resolve(toolCallsDir, base);
+      await mkdir(splitDir, { recursive: true });
+      const ext = extensionFor(tc.filePath);
+
+      const argLengths: string[] = [];
+      const argWrites: Promise<void>[] = [];
+      // One file per argument. Strings use the target file's extension so
+      // they open with matching syntax highlighting; non-strings become
+      // JSON blobs.
+      for (const [key, value] of Object.entries(tc.args)) {
+        const { text, length } = stringifyArg(value);
+        const argExt = typeof value === "string" ? ext : ".json";
+        // Argument names come from the model's tool call. Slug them so a
+        // key containing path separators or `..` cannot resolve to a path
+        // outside splitDir; ordinary parameter names pass through
+        // unchanged. meta.txt still lists the original key.
+        const fileStem = sanitize(key) || "arg";
+        argWrites.push(
+          writeFile(resolve(splitDir, `${fileStem}${argExt}`), text),
+        );
+        argLengths.push(`${key}: ${length} chars`);
+      }
+
+      await Promise.all([
+        // Combined summary (easy to scan in one file).
+        writeFile(
+          resolve(toolCallsDir, `${base}.txt`),
+          renderToolCallAsText(tc, {
+            suite: record.suite,
+            caseName: record.caseName,
+            modelLabel: record.model.label,
+          }),
+        ),
+        writeFile(resolve(splitDir, `file_before${ext}`), tc.fileBefore),
+        writeFile(resolve(splitDir, `file_after${ext}`), tc.fileAfter),
+        writeFile(resolve(splitDir, "diff.patch"), tc.diff || ""),
+        ...argWrites,
+        writeFile(
+          resolve(splitDir, "meta.txt"),
+          `index:     ${tc.index + 1}\n` +
+            `tool:      ${tc.toolName}\n` +
+            `timestamp: ${tc.timestamp}\n` +
+            `file_path: ${tc.filePath}\n` +
+            `succeeded: ${tc.succeeded}\n` +
+            (tc.succeeded ? "" : `error:     ${tc.error ?? ""}\n`) +
+            argLengths.map((l) => `${l}\n`).join("") +
+            `file_before: ${tc.fileBefore.length} chars\n` +
+            `file_after: ${tc.fileAfter.length} chars\n`,
+        ),
+      ]);
+    }),
+  );
+}
+
+/**
+ * Writes the `details/` folder for a record: the original and final file
+ * (using the source file's extension), the overall diff, the three prompt
+ * pieces, and a trimmed metadata summary as JSON and plaintext.
+ *
+ * @param recordDir - Record folder that will contain `details/`.
+ * @param record - Record to split into files.
+ * @throws Rejects if the folder cannot be created or any write fails.
+ */
+async function writeDetailsFolder(
+  recordDir: string,
+  record: EvalRunRecord,
+): Promise<void> {
+  const detailsDir = resolve(recordDir, "details");
+  await mkdir(detailsDir, { recursive: true });
+  const ext = extensionFor(record.file.name);
+
+  // Metadata mirrors the main record but drops the large content blobs
+  // that already have their own files (file_before, file_after, overall
+  // diff) and the per-tool-call details (tool_calls/ folder has them).
+  const { timestamp, suite, caseName, model, prompt, llm, judge } = record;
+  const metadata = {
+    timestamp,
+    suite,
+    caseName,
+    model,
+    prompt,
+    file: { name: record.file.name },
+    llm,
+    toolCallCount: record.toolCalls.length,
+    judge,
+    passed: record.passed,
+    errorMessage: record.errorMessage,
+  };
+
+  // File name → content, in the order the writes are started.
+  const files: Array<[string, string]> = [
+    [`file_before${ext}`, record.file.before],
+    [`file_after${ext}`, record.file.after],
+    ["diff.patch", record.diff || ""],
+    ["system_prompt.txt", prompt.system],
+    ["instructions.txt", prompt.instructions],
+    ["user_prompt.txt", prompt.user],
+    ["metadata.json", `${JSON.stringify(metadata, null, 2)}\n`],
+    ["metadata.txt", renderMetadataAsText(metadata)],
+  ];
+
+  await Promise.all(
+    files.map(([name, content]) =>
+      writeFile(resolve(detailsDir, name), content),
+    ),
+  );
+}
+
+/**
+ * Renders the trimmed metadata object as the plaintext written to
+ * `details/metadata.txt`.
+ *
+ * Sections, in order: header, LLM timing/token summary (one line per
+ * request), tool call count, system prompt, instructions, and the judge
+ * verdict when a judge ran.
+ */
+function renderMetadataAsText(m: {
+  timestamp: string;
+  suite: string;
+  caseName: string;
+  model: EvalRunRecord["model"];
+  prompt: EvalRunRecord["prompt"];
+  file: { name: string };
+  llm: EvalRunRecord["llm"];
+  toolCallCount: number;
+  judge: JudgeRecord | null;
+  passed: boolean;
+  errorMessage: string | null;
+}): string {
+  const { model, llm, prompt, judge } = m;
+  const rule = hr("=");
+  const thinRule = hr("-");
+  const resolvedModel = model.responseModelId
+    ? ` → ${model.responseModelId}`
+    : "";
+
+  const out: string[] = [
+    rule,
+    `Suite:     ${m.suite}`,
+    `Case:      ${m.caseName}`,
+    `File:      ${m.file.name}`,
+    `Model:     ${model.label} [${model.provider}/${model.modelName}]${resolvedModel}`,
+    `Timestamp: ${m.timestamp}`,
+    `Passed:    ${m.passed}`,
+  ];
+  if (m.errorMessage) {
+    out.push(`Error:     ${m.errorMessage}`);
+  }
+  out.push(rule, "");
+
+  out.push(
+    "LLM",
+    `  Total duration: ${llm.totalDurationMs}ms`,
+    `  Requests:       ${llm.requestCount}`,
+    `  Total tokens:   ${formatUsage(llm.totalUsage)}`,
+    ...llm.requests.map(
+      (req) =>
+        `    step ${req.stepIndex}: ${req.durationMs}ms, ` +
+        `${formatUsage(req.usage)}, finish=${req.finishReason ?? "?"}`,
+    ),
+    "",
+    `Tool call count: ${m.toolCallCount}`,
+    "",
+  );
+
+  out.push("System prompt", thinRule, prompt.system, "");
+  out.push("Instructions", thinRule, prompt.instructions, "");
+
+  if (judge) {
+    out.push(
+      rule,
+      "Judge",
+      `  Identity: ${judge.label} [${judge.modelName}]`,
+      `  Provider: ${judge.provider}`,
+      `  Duration: ${judge.durationMs}ms`,
+      `  Tokens:   ${formatUsage(judge.usage)}`,
+      `  Verdict:  ${judge.pass ? "PASS" : "FAIL"}`,
+      `  Explanation:`,
+      ...judge.explanation.split("\n").map((line) => `    ${line}`),
+      "",
+    );
+  }
+
+  return out.join("\n");
+}
+
+/**
+ * Returns the trailing `.<alphanumerics>` suffix of a path (dot included),
+ * or `.txt` when the path does not end in one.
+ */
+function extensionFor(filePath: string): string {
+  const [suffix = ".txt"] = filePath.match(/\.[A-Za-z0-9]+$/) ?? [];
+  return suffix;
+}
--- a/src/__tests__/evals/helpers/get_eval_model.ts
+++ b/src/__tests__/evals/helpers/get_eval_model.ts
+import {
+  createDyadEngine,
+  type DyadEngineProvider,
+} from "@/ipc/utils/llm_engine_provider";
+import type { LanguageModel } from "ai";
+import type { UserSettings } from "@/lib/schemas";
+
+export type EvalProvider = "anthropic" | "openai" | "google";
+
+// Eval-only model identifier. Lives here (not in production constants)
+// because Dyad's production picker does not currently surface GPT 5.4 —
+// it had refusal/routing issues — but the eval harness still uses it as
+// the judge model.
+export const GPT_5_4 = "gpt-5.4";
+
+// Single source of truth for the Dyad Engine URL across the eval helpers
+// and any out-of-band fetches the harness makes (e.g. turbo-file-edit).
+export const DYAD_ENGINE_URL =
+  process.env.DYAD_ENGINE_URL ?? "https://engine.dyad.sh/v1";
+
+// Gateway prefixes must match CLOUD_PROVIDERS in language_model_constants.ts.
+const GATEWAY_PREFIXES: Record<EvalProvider, string> = {
+  openai: "",
+  anthropic: "anthropic/",
+  google: "gemini/",
+};
+
+/** Reports whether `DYAD_PRO_API_KEY` is set to a non-empty value. */
+export function hasDyadProKey(): boolean {
+  return Boolean(process.env.DYAD_PRO_API_KEY);
+}
+
+let _provider: DyadEngineProvider | null = null;
+
+/**
+ * Reassemble an SSE stream of OpenAI chat-completion chunks into a single
+ * non-streaming JSON response that the AI SDK's `doGenerate` path can parse.
+ *
+ * The Dyad Engine only supports `stream: true`, but the AI SDK sends
+ * non-streaming requests for `generateText`. This adapter bridges the gap.
+ *
+ * Parsing follows the SSE wire format: lines may end in LF, CRLF or CR,
+ * and the single space after `data:` is optional. Lines that are not
+ * `data` fields, the `[DONE]` sentinel, and payloads that are not JSON
+ * objects are skipped.
+ *
+ * @param response - Upstream response whose body is the SSE text.
+ * @returns A 200 JSON `chat.completion` response, or a 400 JSON response
+ *   carrying the first chunk that has an `error` property.
+ */
+async function sseToNonStreamingResponse(
+  response: Response,
+): Promise<Response> {
+  const text = await response.text();
+  const lines = text.split(/\r\n|\r|\n/);
+
+  let id = "";
+  let model = "";
+  // Track usage across the stream. When `stream_options.include_usage` is
+  // set on the request, OpenAI-compatible providers emit a final chunk with
+  // a populated `usage` object. We overwrite on every chunk that carries
+  // one so the last value wins.
+  let usage: {
+    prompt_tokens: number;
+    completion_tokens: number;
+    total_tokens: number;
+  } = { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 };
+  const choices: Map<
+    number,
+    {
+      role: string;
+      content: string;
+      tool_calls: Map<
+        number,
+        {
+          id: string;
+          type: string;
+          function: { name: string; arguments: string };
+        }
+      >;
+      finish_reason: string | null;
+    }
+  > = new Map();
+
+  for (const line of lines) {
+    if (!line.startsWith("data:")) continue;
+    // Drop the field name and, when present, the one optional space.
+    const payload = line.slice(line.startsWith("data: ") ? 6 : 5);
+    if (payload.trim() === "[DONE]") continue;
+
+    let chunk: any;
+    try {
+      chunk = JSON.parse(payload);
+    } catch {
+      continue;
+    }
+    // `null` and primitives are valid JSON but not chunks; skip them
+    // instead of crashing on the property reads below.
+    if (chunk === null || typeof chunk !== "object") continue;
+
+    // If the engine returned an error inside the SSE stream, surface it as
+    // a JSON error response so the caller sees a failure rather than a
+    // synthetic completion.
+    if (chunk.error) {
+      return new Response(JSON.stringify(chunk), {
+        status: 400,
+        headers: { "content-type": "application/json" },
+      });
+    }
+
+    if (chunk.id) id = chunk.id;
+    if (chunk.model) model = chunk.model;
+
+    if (chunk.usage) {
+      usage = {
+        prompt_tokens: chunk.usage.prompt_tokens ?? 0,
+        completion_tokens: chunk.usage.completion_tokens ?? 0,
+        total_tokens:
+          chunk.usage.total_tokens ??
+          (chunk.usage.prompt_tokens ?? 0) +
+            (chunk.usage.completion_tokens ?? 0),
+      };
+    }
+
+    for (const c of chunk.choices ?? []) {
+      const idx = c.index ?? 0;
+      if (!choices.has(idx)) {
+        choices.set(idx, {
+          role: "assistant",
+          content: "",
+          tool_calls: new Map(),
+          finish_reason: null,
+        });
+      }
+      const choice = choices.get(idx)!;
+      const delta = c.delta ?? {};
+
+      if (delta.role) choice.role = delta.role;
+      if (delta.content) choice.content += delta.content;
+      if (c.finish_reason) choice.finish_reason = c.finish_reason;
+
+      // Tool calls arrive as fragments keyed by `index`; name and argument
+      // pieces are concatenated in arrival order.
+      for (const tc of delta.tool_calls ?? []) {
+        const tcIdx = tc.index ?? 0;
+        if (!choice.tool_calls.has(tcIdx)) {
+          choice.tool_calls.set(tcIdx, {
+            id: tc.id ?? "",
+            type: tc.type ?? "function",
+            function: { name: "", arguments: "" },
+          });
+        }
+        const existing = choice.tool_calls.get(tcIdx)!;
+        if (tc.id) existing.id = tc.id;
+        if (tc.type) existing.type = tc.type;
+        if (tc.function?.name) existing.function.name += tc.function.name;
+        if (tc.function?.arguments)
+          existing.function.arguments += tc.function.arguments;
+      }
+    }
+  }
+
+  const assembled = {
+    id,
+    object: "chat.completion",
+    model,
+    choices: Array.from(choices.entries())
+      .sort(([a], [b]) => a - b)
+      .map(([idx, c]) => ({
+        index: idx,
+        message: {
+          role: c.role,
+          // Empty accumulated content is reported as null.
+          content: c.content || null,
+          ...(c.tool_calls.size > 0
+            ? {
+                tool_calls: Array.from(c.tool_calls.entries())
+                  .sort(([a], [b]) => a - b)
+                  .map(([, tc]) => tc),
+              }
+            : {}),
+        },
+        // A choice that never reported a finish reason defaults to "stop".
+        finish_reason: c.finish_reason ?? "stop",
+      })),
+    usage,
+  };
+
+  return new Response(JSON.stringify(assembled), {
+    status: 200,
+    headers: { "content-type": "application/json" },
+  });
+}
+
+/**
+ * Fetch wrapper that adapts requests for the Dyad Engine, which only supports
+ * streaming (`stream: true`). For non-streaming SDK calls (e.g. `generateText`),
+ * this forces `stream: true` in the request and then reassembles the SSE
+ * response into a single JSON object the SDK expects.
+ *
+ * Requests are passed to `fetch` untouched when there is no string body,
+ * when the body is not JSON, or when the JSON is not a plain object.
+ */
+const evalFetch: typeof fetch = async (input, init) => {
+  if (!init?.body || typeof init.body !== "string") {
+    return fetch(input, init);
+  }
+
+  // Only the JSON parse is allowed to fail silently — if the body isn't a
+  // JSON request we don't know how to adapt, so fall through to a plain
+  // fetch with the original init. Network and SSE-adaptation errors must
+  // propagate so the SDK can surface them (and so we don't double-spend
+  // tokens by transparently retrying a request the gateway already saw).
+  let parsed: any;
+  try {
+    parsed = JSON.parse(init.body);
+  } catch {
+    return fetch(input, init);
+  }
+
+  // Valid JSON that is not a plain object (null, a number, a string, an
+  // array) is not a request we can adapt either. Assigning `stream` on a
+  // primitive would throw in strict mode, so pass these through as-is.
+  if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
+    return fetch(input, init);
+  }
+  const wasNonStreaming = !parsed.stream;
+
+  // Force streaming — the Dyad Engine returns 500 for non-streaming requests
+  parsed.stream = true;
+  // Ask OpenAI-compatible providers to include a final usage chunk so
+  // we can surface token counts in the reassembled non-streaming
+  // response instead of hard-coding zeros.
+  parsed.stream_options = {
+    ...(parsed.stream_options ?? {}),
+    include_usage: true,
+  };
+  const modifiedInit = { ...init, body: JSON.stringify(parsed) };
+
+  const response = await fetch(input, modifiedInit);
+
+  // Requests that were already streaming get the upstream response as-is.
+  if (!wasNonStreaming) {
+    return response;
+  }
+
+  // Convert the SSE stream back to a single JSON response for the SDK.
+  // Only reassemble when the upstream response is actually an SSE stream —
+  // otherwise (non-OK status, or a non-SSE body like a JSON error payload)
+  // pass the response through unchanged so the SDK's error/retry path
+  // sees the real failure instead of a synthetic empty 200.
+  const contentType = response.headers.get("content-type") ?? "";
+  if (!response.ok || !contentType.includes("text/event-stream")) {
+    return response;
+  }
+  return sseToNonStreamingResponse(response);
+};
+
+/**
+ * Returns the shared Dyad Engine provider, creating it on first use.
+ *
+ * The provider is built with `DYAD_PRO_API_KEY`, `DYAD_ENGINE_URL`, the
+ * `evalFetch` adapter, and lazy edits, smart files context and web search
+ * all disabled; the instance is cached in `_provider` for later calls.
+ */
+function getProvider(): DyadEngineProvider {
+  if (_provider) {
+    return _provider;
+  }
+
+  const created = createDyadEngine({
+    apiKey: process.env.DYAD_PRO_API_KEY,
+    baseURL: DYAD_ENGINE_URL,
+    dyadOptions: {
+      enableLazyEdits: false,
+      enableSmartFilesContext: false,
+      enableWebSearch: false,
+    },
+    settings: {} as UserSettings,
+    fetch: evalFetch,
+  });
+  _provider = created;
+  return created;
+}
+
+/**
+ * Builds the language model used by an eval for the given provider/model.
+ *
+ * @param provider - Selects the gateway prefix from `GATEWAY_PREFIXES` and
+ *   is passed on as `providerId`.
+ * @param modelName - Model name appended to the gateway prefix.
+ * @returns The model handle produced by the shared Dyad Engine provider.
+ */
+export function getEvalModel(
+  provider: EvalProvider,
+  modelName: string,
+): LanguageModel {
+  const engine = getProvider();
+  const prefix = GATEWAY_PREFIXES[provider];
+
+  // Always use the chat completions model (not .responses()) because:
+  // 1. The Dyad Engine only supports streaming for chat completions, and the
+  //    SSE-to-JSON adapter handles that format. The Responses API uses a
+  //    different SSE event format that would need its own adapter.
+  // 2. The eval tests model quality (correct tool calls), not transport layer.
+  return engine(`${prefix}${modelName}`, { providerId: provider });
+}
--- a/src/__tests__/evals/helpers/prompts.ts
+++ b/src/__tests__/evals/helpers/prompts.ts
+// System prompts used by the eval suites.
+//
+// The "simple" prompts below are minimal baselines for comparing raw tool
+// selection behavior. The "experimental" pro agent prompt is a standalone
+// copy of the production Pro agent prompt (see src/prompts/local_agent_prompt.ts)
+// so that prompt variations can be iterated here and recorded by evals
+// without modifying the prompt that ships in production.
+//
+// When the production prompt meaningfully changes, re-sync this file by
+// hand — intentional drift is the point.
+
+// ── Simple baselines ─────────────────────────────────────────────
+
+// Minimal baseline: tells the model to edit via the search_replace tool,
+// allows repeated calls, and asks for no explanation.
+export const SIMPLE_SEARCH_REPLACE_SYSTEM_PROMPT =
+  "You are a precise code editor. When asked to change a file, " +
+  "use the search_replace tool. You may call it multiple times " +
+  "to make sequential edits. Do not explain.";
+
+// Same baseline wording as the search_replace prompt, but naming the
+// edit_file tool instead.
+export const SIMPLE_EDIT_FILE_SYSTEM_PROMPT =
+  "You are a precise code editor. When asked to change a file, " +
+  "use the edit_file tool. You may call it multiple times " +
+  "to make sequential edits. Do not explain.";
+
+// The search_replace baseline plus an extra instruction to minimize the
+// number of tool calls (ideally one call bundling every change).
+export const SEARCH_REPLACE_FEW_SYSTEM_PROMPT =
+  SIMPLE_SEARCH_REPLACE_SYSTEM_PROMPT +
+  " Aim to use as few tool calls as possible — ideally a single call " +
+  "that bundles all the required changes. Only make additional calls " +
+  "if the edit genuinely cannot be expressed in one call.";
+
+// ── Experimental Pro agent prompt ────────────────────────────────
+//
+// Standalone copy of the Pro agent system prompt. Edit freely to test
+// prompt variations without touching the production prompt module.
+
+// `<role>` section of the experimental prompt: introduces the assistant
+// persona and the live-preview setup. Prompt text — edit deliberately.
+const ROLE_BLOCK = `<role>
+You are Dyad, an AI assistant that creates and modifies web applications. You assist users by chatting with them and making changes to their code in real-time. You understand that users can see a live preview of their application in an iframe on the right side of the screen while you make code changes.
+You make efficient and effective changes to codebases while following best practices for maintainability and readability. You take pride in keeping things simple and elegant. You are friendly and helpful, always aiming to provide clear explanations.
+</role>`;
+
+// `<app_commands>` section of the experimental prompt: forbids telling the
+// user to run shell commands and lists the Rebuild / Restart / Refresh UI
+// commands with their `<dyad-command>` tags.
+const APP_COMMANDS_BLOCK = `<app_commands>
+Do *not* tell the user to run shell commands. Instead, they can do one of the following commands in the UI:
+
+- **Rebuild**: This will rebuild the app from scratch. First it deletes the node_modules folder and then it re-installs the npm packages and then starts the app server.
+- **Restart**: This will restart the app server.
+- **Refresh**: This will refresh the app preview page.
+
+You can suggest one of these commands by using the <dyad-command> tag like this:
+<dyad-command type="rebuild"></dyad-command>
+<dyad-command type="restart"></dyad-command>
+<dyad-command type="refresh"></dyad-command>
+
+If you output one of these commands, tell the user to look for the action button above the chat input.
+</app_commands>`;
+
+// Bullet list of communication guidelines; interpolated at the top of
+// GENERAL_GUIDELINES_BLOCK.
+const COMMON_GUIDELINES = `- All text you output outside of tool use is displayed to the user. Output text to communicate with the user. You can use Github-flavored markdown for formatting.
+- Always reply to the user in the same language they are using.
+- Keep explanations concise and focused
+- If the user asks for help or wants to give feedback, tell them to use the Help button in the bottom left.`;
+
+// <general_guidelines> section: COMMON_GUIDELINES followed by rules on
+// security, checking for already-implemented requests, touching only related
+// files, shipping complete (never partial) features, setting a chat summary,
+// and avoiding over-engineering.
+const GENERAL_GUIDELINES_BLOCK = `<general_guidelines>
+${COMMON_GUIDELINES}
+- Be careful not to introduce security vulnerabilities such as command injection, XSS, SQL injection, and other OWASP top 10 vulnerabilities. If you notice that you wrote insecure code, immediately fix it. Prioritize writing safe, secure, and correct code.
+- Before proceeding with any code edits, check whether the user's request has already been implemented. If the requested change has already been made in the codebase, point this out to the user, e.g., "This feature is already implemented as described."
+- Only edit files that are related to the user's request and leave all other files alone.
+- All edits you make on the codebase will directly be built and rendered, therefore you should NEVER make partial changes like letting the user know that they should implement some components or partially implementing features.
+- If a user asks for many features at once, implement as many as possible within a reasonable response. Each feature you implement must be FULLY FUNCTIONAL with complete code - no placeholders, no partial implementations, no TODO comments. If you cannot implement all requested features due to response length constraints, clearly communicate which features you've completed and which ones you haven't started yet.
+- Prioritize creating small, focused files and components.
+- Set a chat summary at the end using the \`set_chat_summary\` tool.
+- Avoid over-engineering. Only make changes that are directly requested or clearly necessary. Keep solutions simple and focused.
+  - Don't add features, refactor code, or make "improvements" beyond what was asked. A bug fix doesn't need surrounding code cleaned up. A simple feature doesn't need extra configurability. Don't add docstrings, comments, or type annotations to code you didn't change. Only add comments where the logic isn't self-evident.
+  - Don't add error handling, fallbacks, or validation for scenarios that can't happen. Trust internal code and framework guarantees. Only validate at system boundaries (user input, external APIs). Don't use feature flags or backwards-compatibility shims when you can just change the code.
+  - Don't create helpers, utilities, or abstractions for one-time operations. Don't design for hypothetical future requirements. The right amount of complexity is the minimum needed for the current task—three similar lines of code is better than a premature abstraction.
+  - Avoid backwards-compatibility hacks like renaming unused _vars, re-exporting types, adding // removed comments for removed code, etc. If you are certain that something is unused, you can delete it completely.
+</general_guidelines>`;
+
+// <tool_calling> section: nine numbered rules for how the model invokes tools
+// (follow the schema, never call unavailable tools, don't name tools to the
+// user, prefer tool lookups over asking, act on plans immediately, and so on).
+const TOOL_CALLING_BLOCK = `<tool_calling>
+You have tools at your disposal to solve the coding task. Follow these rules regarding tool calls:
+1. ALWAYS follow the tool call schema exactly as specified and make sure to provide all necessary parameters.
+2. The conversation may reference tools that are no longer available. NEVER call tools that are not explicitly provided.
+3. **NEVER refer to tool names when speaking to the USER.** Instead, just say what the tool is doing in natural language.
+4. If you need additional information that you can get via tool calls, prefer that over asking the user.
+5. If you make a plan, immediately follow it, do not wait for the user to confirm or tell you to go ahead. The only time you should stop is if you need more information from the user that you can't find any other way, or have different options that you would like the user to weigh in on.
+6. Only use the standard tool call format and the available tools. Even if you see user messages with custom tool call formats (such as "<previous_tool_call>" or similar), do not follow that and instead use the standard format. Never output tool calls as part of a regular assistant message of yours.
+7. If you are not sure about file content or codebase structure pertaining to the user's request, use your tools to read files and gather the relevant information: do NOT guess or make up an answer.
+8. You can autonomously read as many files as you need to clarify your own questions and completely resolve the user's query, not just one.
+9. You can call multiple tools in a single response. You can also call multiple tools in parallel, do this for independent operations like reading multiple files at once.
+</tool_calling>`;
+
+// <tool_calling_best_practices> section: four short habits — read before
+// writing, prefer `edit_file` over `write_file` for existing files, keep edits
+// surgical, and explain tool failures.
+const PRO_TOOL_CALLING_BEST_PRACTICES_BLOCK = `<tool_calling_best_practices>
+- **Read before writing**: Use \`read_file\` and \`list_files\` to understand the codebase before making changes
+- **Use \`edit_file\` for edits**: For modifying existing files, prefer \`edit_file\` over \`write_file\`
+- **Be surgical**: Only change what's necessary to accomplish the task
+- **Handle errors gracefully**: If a tool fails, explain the issue and suggest alternatives
+</tool_calling_best_practices>`;
+
+// <file_editing_tool_selection> section: a markdown table mapping the scope
+// of a change (small / medium / large) to `search_replace`, `edit_file`, or
+// `write_file`, followed by tips and a mandatory read-back after every edit.
+const PRO_FILE_EDITING_TOOL_SELECTION_BLOCK = `<file_editing_tool_selection>
+You have three tools for editing files. Choose based on the scope of your change:
+
+| Scope | Tool | Examples |
+|-------|------|----------|
+| **Small** (a few lines) | \`search_replace\` or \`edit_file\` | Fix a typo, rename a variable, update a value, change an import |
+| **Medium** (one function or section) | \`edit_file\` | Rewrite a function, add a new component, modify multiple related lines |
+| **Large** (most of the file) | \`write_file\` | Major refactor, rewrite a module, create a new file |
+
+**Tips:**
+- \`edit_file\` supports \`// ... existing code ...\` markers to skip unchanged sections
+- When in doubt, prefer \`search_replace\` for precision or \`write_file\` for simplicity
+
+**Post-edit verification (REQUIRED):**
+After every edit, read the file to verify changes applied correctly. If something went wrong, try a different tool and verify again.
+</file_editing_tool_selection>`;
+
+// <development_workflow> section: the six-step loop the model is told to
+// follow — understand, clarify (when needed), plan, implement, verify,
+// finalize — naming the tool to use at each step.
+const PRO_DEVELOPMENT_WORKFLOW_BLOCK = `<development_workflow>
+1. **Understand:** Think about the user's request and the relevant codebase context. Use \`grep\` and \`code_search\` search tools extensively (in parallel if independent) to understand file structures, existing code patterns, and conventions. Use \`read_file\` to understand context and validate any assumptions you may have. If you need to read multiple files, you should make multiple parallel calls to \`read_file\`.
+2. **Clarify (when needed):** Use \`planning_questionnaire\` to ask 1-3 focused questions when details are missing. Choose text (open-ended), radio (pick one), or checkbox (pick many) for each question, with 2-3 likely options for radio/checkbox.
+   **Use when:** creating a new app/project, the request is vague (e.g. "Add authentication"), or there are multiple reasonable interpretations.
+   **Skip when:** the request is specific and concrete (e.g. "Fix the login button", "Change color from blue to green").
+   The tool accepts ONLY a \`questions\` array (no empty objects). It returns the user's answers as the tool result.
+3. **Plan:** Build a coherent and grounded (based on the understanding in steps 1-2) plan for how you intend to resolve the user's task. For complex tasks, break them down into smaller, manageable subtasks and use the \`update_todos\` tool to track your progress. Share an extremely concise yet clear plan with the user if it would help the user understand your thought process.
+4. **Implement:** Use the available tools (e.g., \`edit_file\`, \`write_file\`, ...) to act on the plan, strictly adhering to the project's established conventions. When debugging, add targeted console.log statements to trace data flow and identify root causes. **Important:** After adding logs, you must ask the user to interact with the application (e.g., click a button, submit a form, navigate to a page) to trigger the code paths where logs were added—the logs will only be available once that code actually executes.
+5. **Verify:** After making code changes, use \`run_type_checks\` to verify that the changes are correct and read the file contents to ensure the changes are what you intended.
+6. **Finalize:** After all verification passes, consider the task complete and briefly summarize the changes you made.
+</development_workflow>`;
+
+// <image_generation_guidelines> section: when to call `generate_image`, when
+// not to, how to write the prompt, and how to copy the result out of
+// `.dyad/media/` into the project before referencing it.
+const IMAGE_GENERATION_BLOCK = `<image_generation_guidelines>
+When a user explicitly requests custom images, illustrations, or visual media for their app:
+- Use the \`generate_image\` tool instead of using placeholder images or broken external URLs
+- Do NOT generate images when an existing asset, SVG, or icon library (e.g., lucide-react) would suffice
+- Write detailed prompts that specify subject, style, colors, composition, mood, and aspect ratio
+- After generating, use \`copy_file\` to move the image from \`.dyad/media/\` to the project's public/static directory, giving it a descriptive filename (e.g., \`public/assets/hero-banner.png\`)
+- Reference the copied path in code (e.g., \`<img src="/assets/hero-banner.png" />\`)
+</image_generation_guidelines>`;
+
+// Project-level rules appended last to the experimental prompt: the assumed
+// tech stack (React, TypeScript, React Router, shadcn/ui, Tailwind), the
+// source layout, and which packages are already installed. Note that the
+// string ends with a newline of its own.
+const DEFAULT_AI_RULES = `# Tech Stack
+- You are building a React application.
+- Use TypeScript.
+- Use React Router. KEEP the routes in src/App.tsx
+- Always put source code in the src folder.
+- Put pages into src/pages/
+- Put components into src/components/
+- The main page (default page) is src/pages/Index.tsx
+- UPDATE the main page to include the new components. OTHERWISE, the user can NOT see any components!
+- ALWAYS try to use the shadcn/ui library.
+- Tailwind CSS: always use Tailwind CSS for styling components. Utilize Tailwind classes extensively for layout, spacing, colors, and other design aspects.
+
+Available packages and libraries:
+- The lucide-react package is installed for icons.
+- You ALREADY have ALL the shadcn/ui components and their dependencies installed. So you don't need to install them again.
+- You have ALL the necessary Radix UI components installed.
+- Use prebuilt components from the shadcn/ui library after importing them. Note that these files shouldn't be edited, so make new components if you need to change them.
+`;
+
+/**
+ * Experimental Pro agent system prompt.
+ *
+ * Built from the section constants in this module, in the order listed,
+ * separated by one blank line each, with a single leading and a single
+ * trailing newline around the whole text.
+ */
+export const PRO_AGENT_EXPERIMENTAL_SYSTEM_PROMPT =
+  "\n" +
+  [
+    ROLE_BLOCK,
+    APP_COMMANDS_BLOCK,
+    GENERAL_GUIDELINES_BLOCK,
+    TOOL_CALLING_BLOCK,
+    PRO_TOOL_CALLING_BEST_PRACTICES_BLOCK,
+    PRO_FILE_EDITING_TOOL_SELECTION_BLOCK,
+    PRO_DEVELOPMENT_WORKFLOW_BLOCK,
+    IMAGE_GENERATION_BLOCK,
+    DEFAULT_AI_RULES,
+  ].join("\n\n") +
+  "\n";
--- a/src/__tests__/evals/helpers/unified_diff.ts
+++ b/src/__tests__/evals/helpers/unified_diff.ts
+// Minimal unified-diff generator — no third-party deps.
+//
+// Uses an LCS dynamic-programming table to align two files line by line,
+// backtracks it into a sequence of keep/add/remove ops, and groups those
+// into hunks with a fixed number of context lines. Output matches the
+// unified diff format consumed by `patch -p0`.
+//
+// Performance: O(m*n) time and space, where m and n are line counts.
+// Fine for eval fixtures (hundreds of lines). Callers that need to diff
+// multi-thousand-line files should reach for a real Myers implementation.
+
+// Number of unchanged lines shown on each side of a change when the caller
+// of createUnifiedDiff does not pass `options.context`.
+const DEFAULT_CONTEXT = 3;
+
+// How one line takes part in the diff: "keep" = present in both files,
+// "add" = only in the new file, "remove" = only in the old file.
+type OpType = "keep" | "add" | "remove";
+
+// A single edit-script entry: what happens to a line, and the line's text
+// (without any diff prefix character).
+interface DiffOp {
+  type: OpType;
+  line: string;
+}
+
+// A DiffOp annotated with the old/new line cursors at the moment the op is
+// reached. For an "add" op, oldPos is the next old line still to be consumed
+// (and likewise newPos for a "remove" op), not a line the op itself occupies.
+interface PositionedOp {
+  op: DiffOp;
+  oldPos: number; // 1-indexed line number in the old file
+  newPos: number; // 1-indexed line number in the new file
+}
+
+// One `@@ -oldStart,oldLen +newStart,newLen @@` section of the output.
+// `lines` holds the body rows, each already prefixed with " ", "-" or "+".
+interface Hunk {
+  oldStart: number;
+  oldLen: number;
+  newStart: number;
+  newLen: number;
+  lines: string[];
+}
+
+// Fills the classic longest-common-subsequence table for two line arrays.
+// Cell [r][c] holds the LCS length of the first r old lines and the first c
+// new lines; row 0 and column 0 stay at zero. The table has
+// (oldLines.length + 1) rows and (newLines.length + 1) columns.
+function computeLcsTable(
+  oldLines: readonly string[],
+  newLines: readonly string[],
+): number[][] {
+  const width = newLines.length + 1;
+  const table: number[][] = [];
+  for (let row = 0; row <= oldLines.length; row++) {
+    table.push(new Array<number>(width).fill(0));
+  }
+
+  for (const [oldIdx, oldLine] of oldLines.entries()) {
+    const above = table[oldIdx];
+    const current = table[oldIdx + 1];
+    for (const [newIdx, newLine] of newLines.entries()) {
+      current[newIdx + 1] =
+        oldLine === newLine
+          ? above[newIdx] + 1
+          : Math.max(above[newIdx + 1], current[newIdx]);
+    }
+  }
+  return table;
+}
+
+// Walks the LCS table from its bottom-right corner back to the origin and
+// returns the edit script in forward (top-of-file first) order.
+//
+// Equal lines are always kept. On a mismatch, an "add" is taken when the
+// cell to the left is at least as good as the cell above (or the old side is
+// exhausted); because the walk runs backwards, this puts removals before
+// additions in the returned script.
+function backtrackOps(
+  dp: number[][],
+  oldLines: readonly string[],
+  newLines: readonly string[],
+): DiffOp[] {
+  const reversed: DiffOp[] = [];
+  let oldIdx = oldLines.length;
+  let newIdx = newLines.length;
+
+  while (oldIdx > 0 || newIdx > 0) {
+    const linesMatch =
+      oldIdx > 0 && newIdx > 0 && oldLines[oldIdx - 1] === newLines[newIdx - 1];
+    if (linesMatch) {
+      reversed.push({ type: "keep", line: oldLines[oldIdx - 1] });
+      oldIdx -= 1;
+      newIdx -= 1;
+      continue;
+    }
+
+    const takeAdd =
+      newIdx > 0 &&
+      (oldIdx === 0 || dp[oldIdx][newIdx - 1] >= dp[oldIdx - 1][newIdx]);
+    if (takeAdd) {
+      reversed.push({ type: "add", line: newLines[newIdx - 1] });
+      newIdx -= 1;
+      continue;
+    }
+
+    reversed.push({ type: "remove", line: oldLines[oldIdx - 1] });
+    oldIdx -= 1;
+  }
+
+  reversed.reverse();
+  return reversed;
+}
+
+// Tags each op with the 1-indexed old/new line cursors as they stand when the
+// op is reached, then advances the cursors: "keep" moves both, "remove" moves
+// only the old cursor, anything else moves only the new cursor.
+function assignPositions(ops: readonly DiffOp[]): PositionedOp[] {
+  let oldLineNo = 1;
+  let newLineNo = 1;
+  return ops.map((op) => {
+    const entry: PositionedOp = { op, oldPos: oldLineNo, newPos: newLineNo };
+    if (op.type === "keep" || op.type === "remove") oldLineNo += 1;
+    if (op.type !== "remove") newLineNo += 1;
+    return entry;
+  });
+}
+
+// Groups positioned ops into hunks.
+//
+// Pass 1 flags every non-"keep" op together with up to `context` ops on each
+// side of it. Pass 2 turns each maximal run of flagged ops into one hunk, so
+// changes whose context windows touch or overlap share a hunk. Returns an
+// empty array when nothing is flagged.
+function buildHunks(
+  positioned: readonly PositionedOp[],
+  context: number,
+): Hunk[] {
+  const total = positioned.length;
+
+  const selected = new Array<boolean>(total).fill(false);
+  positioned.forEach((entry, idx) => {
+    if (entry.op.type === "keep") return;
+    const from = Math.max(0, idx - context);
+    const to = Math.min(total - 1, idx + context);
+    for (let k = from; k <= to; k++) selected[k] = true;
+  });
+
+  const result: Hunk[] = [];
+  let start = 0;
+  while (start < total) {
+    if (!selected[start]) {
+      start += 1;
+      continue;
+    }
+    let end = start + 1;
+    while (end < total && selected[end]) end += 1;
+
+    const run = positioned.slice(start, end);
+    const lines = run.map(({ op }) => {
+      if (op.type === "keep") return ` ${op.line}`;
+      return op.type === "remove" ? `-${op.line}` : `+${op.line}`;
+    });
+    // Kept and removed lines exist in the old file; everything except
+    // removed lines exists in the new file.
+    const oldLen = run.filter(
+      ({ op }) => op.type === "keep" || op.type === "remove",
+    ).length;
+    const newLen = run.filter(({ op }) => op.type !== "remove").length;
+
+    // Unified diff convention: a side with zero lines reports the number of
+    // the line just before the hunk (0 when there is none).
+    const anchor = run[0];
+    const oldStart =
+      oldLen === 0 ? Math.max(0, anchor.oldPos - 1) : anchor.oldPos;
+    const newStart =
+      newLen === 0 ? Math.max(0, anchor.newPos - 1) : anchor.newPos;
+
+    result.push({ oldStart, oldLen, newStart, newLen, lines });
+    start = end;
+  }
+
+  return result;
+}
+
+// Marker line that unified diff prints after a line lacking a terminating
+// newline. It is glued onto the last line of such a file, behind a "\n", so
+// that (a) the line compares unequal to the same text that *is*
+// newline-terminated, and (b) the marker is emitted directly after that line
+// when the hunk body is joined with "\n".
+const NO_EOL_MARKER = "\n\\ No newline at end of file";
+
+// Splits file content into the lines a unified diff counts.
+// - An empty string has zero lines.
+// - A trailing "\n" terminates the last line; it does not start another one.
+// - A final line without "\n" is tagged with NO_EOL_MARKER.
+function splitDiffLines(content: string): string[] {
+  if (content === "") return [];
+  const lines = content.split("\n");
+  if (content.endsWith("\n")) {
+    lines.pop();
+  } else {
+    lines[lines.length - 1] += NO_EOL_MARKER;
+  }
+  return lines;
+}
+
+/**
+ * Produce a unified diff between two strings.
+ *
+ * Line counting follows the unified diff convention: a trailing newline ends
+ * the last line rather than adding an empty one, an empty string has no
+ * lines, and a final line with no newline is followed by
+ * `\ No newline at end of file`. Hunk headers therefore describe real line
+ * ranges of the inputs. Only "\n" is treated as a line terminator.
+ *
+ * @param oldContent Full text of the original file.
+ * @param newContent Full text of the modified file.
+ * @param options.oldLabel Text after `--- ` (default "original").
+ * @param options.newLabel Text after `+++ ` (default "modified").
+ * @param options.context Unchanged lines shown around each change
+ *   (default DEFAULT_CONTEXT).
+ * @returns The diff text ending in a newline, or an empty string when the
+ *   inputs are identical.
+ */
+export function createUnifiedDiff(
+  oldContent: string,
+  newContent: string,
+  options: {
+    oldLabel?: string;
+    newLabel?: string;
+    context?: number;
+  } = {},
+): string {
+  if (oldContent === newContent) return "";
+
+  const oldLabel = options.oldLabel ?? "original";
+  const newLabel = options.newLabel ?? "modified";
+  const context = options.context ?? DEFAULT_CONTEXT;
+
+  const oldLines = splitDiffLines(oldContent);
+  const newLines = splitDiffLines(newContent);
+  const dp = computeLcsTable(oldLines, newLines);
+  const ops = backtrackOps(dp, oldLines, newLines);
+  const positioned = assignPositions(ops);
+  const hunks = buildHunks(positioned, context);
+
+  if (hunks.length === 0) return "";
+
+  const out: string[] = [`--- ${oldLabel}`, `+++ ${newLabel}`];
+  for (const h of hunks) {
+    out.push(`@@ -${h.oldStart},${h.oldLen} +${h.newStart},${h.newLen} @@`);
+    out.push(...h.lines);
+  }
+  return out.join("\n") + "\n";
+}
--- a/src/__tests__/evals/tool_use.eval.ts
+++ b/src/__tests__/evals/tool_use.eval.ts
+import { describe, it } from "vitest";
+import { generateText, stepCountIs, type Tool } from "ai";
+import { readFileSync } from "node:fs";
+import { basename, resolve } from "node:path";
+import { randomUUID } from "node:crypto";
+import { searchReplaceTool } from "@/pro/main/ipc/handlers/local_agent/tools/search_replace";
+import { writeFileTool } from "@/pro/main/ipc/handlers/local_agent/tools/write_file";
+import { editFileTool } from "@/pro/main/ipc/handlers/local_agent/tools/edit_file";
+import { applySearchReplace } from "@/pro/main/ipc/processors/search_replace_processor";
+import { escapeSearchReplaceMarkers } from "@/pro/shared/search_replace_markers";
+import { constructLocalAgentPrompt } from "@/prompts/local_agent_prompt";
+import {
+  SONNET_4_6,
+  GEMINI_3_FLASH,
+} from "@/ipc/shared/language_model_constants";
+import {
+  DYAD_ENGINE_URL,
+  GPT_5_4,
+  getEvalModel,
+  hasDyadProKey,
+  type EvalProvider,
+} from "./helpers/get_eval_model";
+import {
+  normalizeUsage,
+  recordDirFor,
+  recordEvalRun,
+  type LLMRequestRecord,
+  type ToolCallRecord,
+  type JudgeRecord,
+} from "./helpers/eval_recorder";
+import { createUnifiedDiff } from "./helpers/unified_diff";
+import {
+  SIMPLE_SEARCH_REPLACE_SYSTEM_PROMPT,
+  SIMPLE_EDIT_FILE_SYSTEM_PROMPT,
+  SEARCH_REPLACE_FEW_SYSTEM_PROMPT,
+  PRO_AGENT_EXPERIMENTAL_SYSTEM_PROMPT,
+} from "./helpers/prompts";
+
+// ── Fixture loader ─────────────────────────────────────────────
+
+// Directory holding the fixture files, resolved next to this eval file.
+const FIXTURES_DIR = resolve(__dirname, "fixtures");
+
+// Reads one fixture from FIXTURES_DIR and returns its text decoded as UTF-8.
+function loadFixture(filename: string): string {
+  const fixturePath = resolve(FIXTURES_DIR, filename);
+  return readFileSync(fixturePath, { encoding: "utf-8" });
+}
+
+// Decides whether a model-supplied path refers to the case's fixture file by
+// comparing final path segments only. Models may write `./foo.ts`,
+// `src/foo.ts`, or Windows-style `.\foo.ts` / `src\foo.ts` rather than the
+// bare filename; since every case targets one known file, those differences
+// are harmless and should not count against the model. Backslashes are
+// turned into forward slashes before taking the basename because node's
+// posix `basename` treats `\` as an ordinary filename character.
+// A missing or empty `got` never matches.
+function pathMatchesCase(got: string | undefined, expected: string): boolean {
+  if (!got) return false;
+  const [gotName, expectedName] = [got, expected].map((p) =>
+    basename(p.split("\\").join("/")),
+  );
+  return gotName === expectedName;
+}
+
+// ── Case type ──────────────────────────────────────────────────
+
+// One eval scenario: a single fixture file plus the edit the model is asked
+// to make to it.
+interface EvalCase {
+  // Human-readable label for the scenario.
+  name: string;
+  // Filename of the fixture the edit targets (the entries in CASES pass the
+  // same name to loadFixture).
+  fileName: string;
+  // Full original text of that fixture.
+  fileContent: string;
+  // The editing instruction for the model.
+  // NOTE(review): presumably sent as the user request — confirm in the runner.
+  prompt: string;
+  // Optional cheap post-edit sanity checks. The authoritative verdict
+  // comes from the LLM judge; these guard against the model hallucinating
+  // a passing diff that obviously doesn't contain the expected symbols.
+  // NOTE(review): the values in CASES look like literal substrings expected
+  // in the edited file — confirm against the code that evaluates them.
+  structuralChecks?: string[];
+}
+
+// ── Cases ──────────────────────────────────────────────────────
+
+const CASES: EvalCase[] = [
+  {
+    name: "Extract a helper function",
+    fileName: "order_processor.ts",
+    fileContent: loadFixture("order_processor.ts"),
+    prompt:
+      "Extract the validation logic in `processOrder` (the block that checks inventory, " +
+      "validates payment, and verifies shipping) into a separate `validateOrder` function. " +
+      "The new function should accept the same `order` parameter and return the same " +
+      "`ProcessResult` type on validation failure, or `null` if validation passes. " +
+      "`processOrder` should call `validateOrder` and return early if it returns a non-null result.",
+    structuralChecks: ["function validateOrder", "validateOrder("],
+  },
+  {
+    name: "Add error handling to multiple call sites",
+    fileName: "fetch_client.ts",
+    fileContent: loadFixture("fetch_client.ts"),
+    prompt:
+      "Wrap each call to `serviceRequest` in the convenience functions (`getResource`, " +
+      "`postResource`, `putResource`, `patchResource`, `deleteResource`) with a try/catch " +
+      "that logs `logger.error(`${method} ${path} failed`, err)` (where method and path " +
+      "come from the function context) and re-throws the error. Do not modify `serviceRequest` itself.",
+    structuralChecks: ["try {", "catch"],
+  },
+  {
+    name: "Convert class component to function component",
+    fileName: "UserProfile.tsx",
+    fileContent: loadFixture("UserProfile.tsx"),
+    prompt:
+      "Convert `UserProfile` from a class component to a function component using React hooks. " +
+      "Replace `this.state` with `useState` hooks, `componentDidMount`/`componentDidUpdate` " +
+      "with `useEffect`, and class methods with regular functions or `useCallback`. " +
+      "Keep the same external behavior and JSX structure.",
+    structuralChecks: ["useState", "useEffect"],
+  },
+  {
+    name: "Refactor giant component into 3 smaller ones",
+    fileName: "UserProfileFull.tsx",
+    fileContent: loadFixture("UserProfileFull.tsx"),
+    prompt:
+      "Extract `AvatarSection` (the avatar/upload logic and its JSX around the avatar-section), " +
+      "`StatsPanel` (the stats grid, header, and summary around the stats-panel section), " +
+      "and `ActivityFeed` (the activity list, grouping, and load-more around the activity-feed section) " +
+      "into their own function components in the same file. Pass the necessary props to each. " +
+      "Then use `<AvatarSection>`, `<StatsPanel>`, and `<ActivityFeed>` in the main `UserProfile` component.",
+    structuralChecks: [
+      "function AvatarSection",
+      "function StatsPanel",
+      "function ActivityFeed",
+      "<AvatarSection",
+      "<StatsPanel",
+      "<ActivityFeed",
+    ],
+  },
+  {
+    name: "Reorganize switch into strategy map",
+    fileName: "event_handler.ts",
+    fileContent: loadFixture("event_handler.ts"),
+    prompt:
+      "Refactor the `handleEvent` function's switch statement into a " +
+      "`Record<EventType, (payload: Record<string, unknown>) => Promise<void>>` handler map " +
+      "and a dispatch function. The `handleEvent` function should look up the handler in the map " +
+      "and call it, falling back to a warning log for unknown types. Remove the switch statement entirely.",
+    structuralChecks: ["Record<", "handleEvent"],
+  },
+  {
+    name: "Convert Promise chains to async/await",
+    fileName: "user_service.ts",
+    fileContent: loadFixture("user_service.ts"),
+    prompt:
+      "Rewrite every exported function in this file to use `async`/`await` with a " +
+      "`try`/`catch` block instead of `.then()`/`.catch()` chains. Preserve the existing " +
+      "error-logging behavior (each catch block should still log and re-throw). Do not " +
+      "change any function signatures or return types. Do not add or remove log calls.",
+    structuralChecks: ["async function", "await", "try {", "catch"],
+  },
+  {
+    name: "Replace console.* calls with logger.*",
+    fileName: "analytics.ts",
+    fileContent: loadFixture("analytics.ts"),
+    prompt:
+      "Replace every real call to `console.log`, `console.warn`, and `console.error` " +
+      "with `logger.info`, `logger.warn`, and `logger.error` respectively. Add a new " +
+      'import at the top of the file: `import { logger } from "./logger";`. Do NOT ' +
+      "modify the word `console` when it appears inside comments or inside string " +
+      "literals (for example the help text shown to the user).",
+    structuralChecks: [
+      "logger.info",
+      "logger.warn",
+      "logger.error",
+      "./logger",
+    ],
+  },
+  {
+    name: "Add optional chaining and defaults for nested config access",
+    fileName: "config_reader.ts",
+    fileContent: loadFixture("config_reader.ts"),
+    prompt:
+      "Make every nested property access on the `cfg` argument safe against missing " +
+      "intermediate objects by using optional chaining (`?.`). For accesses that " +
+      "produce the function's return value, use the `??` nullish-coalescing operator to " +
+      "supply sensible defaults: empty string for string results, 0 for number results, " +
+      "and `false` for boolean results. Do not change any function signatures or the " +
+      "`AppConfig` interface.",
+    structuralChecks: ["?.", "??"],
+  },
+  {
+    name: "Extract magic numbers into named constants",
+    fileName: "cache_manager.ts",
+    fileContent: loadFixture("cache_manager.ts"),
+    prompt:
+      "Extract the duration and size magic numbers in this file into named `const` " +
+      "declarations at the top of the module (below any imports and interfaces). " +
+      "Use descriptive SCREAMING_SNAKE_CASE names that convey units (e.g. " +
+      "`MAX_ENTRY_BYTES`, `MAX_TOTAL_BYTES`, `DEFAULT_TTL_MS`, `CLEANUP_INTERVAL_MS`). " +
+      "Replace each occurrence with the new constant. Do not extract ordinary " +
+      "integers that are not magic (for example loop counters or `0` initializers).",
+    structuralChecks: ["const ", "= "],
+  },
+  {
+    name: "Add zod validation to API handler",
+    fileName: "user_handler.ts",
+    fileContent: loadFixture("user_handler.ts"),
+    prompt:
+      'Add an `import { z } from "zod";` statement to this file and define a ' +
+      "`createUserBodySchema` that validates the shape of `req.body`: `email` is a " +
+      "string email, `name` is a non-empty string, `age` is a non-negative integer, " +
+      'and `role` is one of `"admin"`, `"member"`, `"guest"`. At the top of ' +
+      "`createUserHandler`, parse `req.body` with the schema inside a try/catch. On a " +
+      '`ZodError`, respond with status 400 and a JSON body of `{ error: "invalid ' +
+      'body", details: err.issues }`. Read the validated fields from the parsed ' +
+      "object instead of from `req.body` directly. Do not change the rest of the " +
+      "handler's logic.",
+    structuralChecks: [
+      'from "zod"',
+      "createUserBodySchema",
+      ".parse(",
+      "ZodError",
+    ],
+  },
+  {
+    name: "Dedupe redundant guard/logging block across handlers",
+    fileName: "route_handlers.ts",
+    fileContent: loadFixture("route_handlers.ts"),
+    prompt:
+      "All the handlers in this file repeat the same `userId` + `id` validation " +
+      "block and the same `logger.info` timing log. Extract the validation into a " +
+      "helper `requireAuthedIdParam(req, res)` that returns the validated `id` string " +
+      "on success or `null` after writing the 401/400 response. Extract the timing " +
+      "log into a helper `logHandlerTiming(name, id, startMs)`. Replace the redundant " +
+      "logic in all handlers with these two helpers. Do not change the handlers' " +
+      "exported signatures or their response bodies for the success path.",
+    structuralChecks: [
+      "function requireAuthedIdParam",
+      "function logHandlerTiming",
+      "requireAuthedIdParam(",
+      "logHandlerTiming(",
+    ],
+  },
+  {
+    name: "Extract multiple shared helpers from duplicated reporting logic",
+    fileName: "report_builders.ts",
+    fileContent: loadFixture("report_builders.ts"),
+    prompt:
+      "The exported report functions in this file repeat several patterns. Extract these " +
+      "into named helper functions at the top of the module (below the interfaces and " +
+      "MONTH_NAMES) and reuse them throughout:\n\n" +
+      "1. A helper `filterByDateField<T>(items: T[], range: ReportRange, getDate: (item: T) => string): T[]` " +
+      "that filters items whose ISO date (extracted via `getDate`) falls in `[range.from, range.to)`. " +
+      "Every `Date.parse` range-filter block should call this helper.\n" +
+      "2. A helper `formatUsd(amount: number): string` that returns the USD-formatted string " +
+      "currently produced by the repeated `new Intl.NumberFormat(...).format(amount)` calls. " +
+      "Every such call should go through this helper.\n" +
+      "3. A helper `formatRangeLabel(range: ReportRange): string` that returns the " +
+      '`"Jan 1, 2025 – Feb 1, 2025"`-style label built from MONTH_NAMES. Every occurrence ' +
+      "of that block should go through this helper.\n" +
+      "4. A helper `sumBy<T>(items: T[], get: (item: T) => number): number` that returns the " +
+      "sum, and a helper `groupSumBy<T>(items: T[], getKey: (item: T) => string, getValue: (item: T) => number): Map<string, number>` " +
+      "that builds a keyed-sum Map. Use them wherever a manual `for`-loop *revenue/amount " +
+      "sum* or *keyed-sum* Map accumulation appears (for example summing `unitPrice * quantity`, " +
+      "`amount`, or `mrr`). Do NOT force count-accumulation or conditional-tally loops " +
+      "through these helpers — leave counts (e.g. `countByReason`, `countByPlan`) and " +
+      "conditional counters (e.g. `churnRate`'s `activeAtStart` / `canceledInRange`) as " +
+      "manual loops.\n\n" +
+      "Preserve every exported function's signature and return type exactly. Do not change " +
+      "sort order, rounding, or numeric results. Do not remove any exported function.",
+    structuralChecks: [
+      "function filterByDateField",
+      "function formatUsd",
+      "function formatRangeLabel",
+      "function sumBy",
+      "function groupSumBy",
+      "filterByDateField(",
+      "formatUsd(",
+      "formatRangeLabel(",
+    ],
+  },
+  {
+    name: "Migrate Contact schema: split name into firstName and lastName",
+    fileName: "contact_book.ts",
+    fileContent: loadFixture("contact_book.ts"),
+    prompt:
+      "Replace the `name: string` field on the `Contact` interface with two separate " +
+      "fields: `firstName: string` and `lastName: string`. Update every function in the " +
+      "file to use the new fields. Specifically:\n\n" +
+      "- `createContact` must accept `firstName` and `lastName` in its input (instead of " +
+      "`name`), each trimmed.\n" +
+      "- `fromCsv` must read two columns `firstName,lastName` from the header instead of " +
+      "a single `name` column. `toCsv` must emit the same two columns. The CSV header must " +
+      "start with `firstName,lastName,email,phone,tags,starred`.\n" +
+      '- `displayName` must return `"${firstName} ${lastName}"` (single space, no trim ' +
+      "beyond what is already stored).\n" +
+      '- `lastFirstDisplay` must return `"${lastName}, ${firstName}"` directly — no more ' +
+      "string splitting.\n" +
+      "- `initials` must return `firstName.charAt(0).toUpperCase() + lastName.charAt(0).toUpperCase()` " +
+      "— no splitting, no length guards.\n" +
+      '- `greetingFor` must greet using `firstName` directly (fall back to `"there"` if ' +
+      "`firstName` is empty). \n" +
+      "- `searchByName` must match the query (case-insensitive) against either `firstName` " +
+      "OR `lastName` (not the concatenation).\n" +
+      "- `sortByName` must sort by `lastName` then `firstName` (both case-insensitive).\n" +
+      "- `sortByLastName` must sort by `lastName` (case-insensitive), no more splitting.\n" +
+      "- `dedupeByName` must treat two contacts as duplicates when both their `firstName` " +
+      "and `lastName` match case-insensitively.\n" +
+      '- `validateContact` must report `"firstName is required"` if `firstName.trim()` is ' +
+      'empty, and `"lastName is required"` if `lastName.trim()` is empty (keep the email ' +
+      "check unchanged).\n" +
+      '- `formatLine` must render `"${firstName} ${lastName} <${email}>"` (with the ' +
+      "existing star prefix).\n\n" +
+      "Do not leave any reference to a `.name` property on a Contact anywhere in the file. " +
+      "Do not change any other public API (function names, return types, other fields).",
+    structuralChecks: [
+      "firstName: string",
+      "lastName: string",
+      "firstName,lastName,email,phone,tags,starred",
+      "firstName is required",
+      "lastName is required",
+    ],
+  },
+  {
+    // Multi-part case: operator rewrite, duplicate-declaration removal and a
+    // bug fix are all requested in one prompt. Each sentence fragment below
+    // must end with a space so the concatenated prompt keeps sentences apart.
+    name: "Replace Math.pow with exponentiation operator",
+    fileName: "stat_utils.ts",
+    fileContent: loadFixture("stat_utils.ts"),
+    prompt:
+      "Replace every `Math.pow(base, exponent)` call in this file with the " +
+      "JavaScript exponentiation operator `**`. When `base` is a compound " +
+      "expression (i.e. anything other than a bare identifier or numeric literal), " +
+      "wrap it in parentheses so operator precedence is preserved. Single " +
+      "identifiers and numeric literals do not need extra parentheses. " +
+      "The `correlation` function is currently duplicated. Delete all but one declaration. " +
+      "Additionally, fix the currently incorrect `median` function. " +
+      "Do not change any other code.",
+    structuralChecks: ["** 2", "** 3", "** 4"],
+  },
+  {
+    name: "Rename exported function but preserve references in string literals",
+    fileName: "order_math.ts",
+    fileContent: loadFixture("order_math.ts"),
+    prompt:
+      "Rename the exported function `calculateTotal` to `computeOrderTotal`. Update " +
+      "every call site inside this file to use the new name. Do NOT modify any " +
+      "occurrences of the old name `calculateTotal` that appear inside string " +
+      "literals (for example inside `throw new Error(...)` messages) — those " +
+      "diagnostic strings must keep referring to the historical name.",
+    structuralChecks: [
+      "function computeOrderTotal",
+      "computeOrderTotal(",
+      "calculateTotal failed",
+    ],
+  },
+  {
+    name: "Restrict moderator from managing users",
+    fileName: "permissions.ts",
+    fileContent: loadFixture("permissions.ts"),
+    prompt:
+      "In `ModeratorPolicy`, change `canManageUsers` to return `false` instead of `true`. " +
+      "Do not modify any other methods or classes.",
+    structuralChecks: [],
+  },
+];
+
+// ── Judge helper ───────────────────────────────────────────────
+
+// Model used to grade every run. JUDGE_LABEL is the human-readable name
+// written into judge records and failure messages; JUDGE_PROVIDER and
+// JUDGE_MODEL select the judge through getEvalModel.
+const JUDGE_LABEL = "GPT 5.4";
+const JUDGE_PROVIDER: EvalProvider = "openai";
+const JUDGE_MODEL = GPT_5_4;
+
+/**
+ * Asks the judge model whether `resultFile` correctly applies `prompt` to
+ * `originalFile`.
+ *
+ * The judge is instructed to write its reasoning and then put a bare `PASS`
+ * or `FAIL` on the very last line. Only an exact `PASS` on that line counts
+ * as a pass; anything else is recorded as a failure.
+ *
+ * @param originalFile - File content before the edit.
+ * @param prompt - The edit instruction given to the model under test.
+ * @param resultFile - File content after the edit.
+ * @param abortSignal - Optional signal forwarded to `generateText`.
+ * @returns A `JudgeRecord` holding the verdict, explanation, duration and usage.
+ */
+async function judgeResult(
+  originalFile: string,
+  prompt: string,
+  resultFile: string,
+  abortSignal?: AbortSignal,
+): Promise<JudgeRecord> {
+  const startMs = Date.now();
+  const result = await generateText({
+    model: getEvalModel(JUDGE_PROVIDER, JUDGE_MODEL),
+    temperature: 1,
+    abortSignal,
+    system:
+      "You are a code-review judge. You will be given an original file, " +
+      "an edit instruction, and the resulting file after the edit was applied. " +
+      "Evaluate whether the result correctly implements the requested change " +
+      "without introducing bugs, removing unrelated code, or breaking the " +
+      "file's existing behavior.\n\n" +
+      "Format your response as follows (do NOT keep reasoning private — write " +
+      "it in your visible output):\n\n" +
+      "1. Write a concise written explanation of what you observed and why you " +
+      "are passing or failing the edit. This explanation MUST appear in your " +
+      "visible output, not in any hidden reasoning channel.\n" +
+      "2. On the VERY LAST line, write exactly `PASS` or `FAIL` and nothing else.",
+    messages: [
+      {
+        role: "user",
+        content:
+          `## Edit instruction\n${prompt}\n\n` +
+          `## Original file\n\`\`\`\n${originalFile}\n\`\`\`\n\n` +
+          `## Result file\n\`\`\`\n${resultFile}\n\`\`\``,
+      },
+    ],
+  });
+  const durationMs = Date.now() - startMs;
+
+  const text = result.text.trim();
+  const lines = text.split("\n");
+  const lastLine = lines.at(-1)?.trim() ?? "";
+  const pass = lastLine === "PASS";
+  // Strip the trailing line only when it really is a bare verdict. If the
+  // judge ignored the format (e.g. "Verdict: PASS" or a final sentence), the
+  // last line is part of the reasoning; dropping it would hide exactly the
+  // text a reviewer needs to see why the run was counted as a failure.
+  const hasVerdictLine = pass || lastLine === "FAIL";
+  const explanationLines = hasVerdictLine ? lines.slice(0, -1) : lines;
+  // If the model emitted only a verdict (no explanation), record a clear
+  // marker instead of an empty string so reviewers can tell "no explanation
+  // given" apart from "explanation missing due to a bug in the recorder".
+  const explanationBody = explanationLines.join("\n").trim();
+  const explanation =
+    explanationBody.length > 0
+      ? explanationBody
+      : `(no explanation emitted — raw model output was: ${JSON.stringify(text)})`;
+
+  return {
+    label: JUDGE_LABEL,
+    provider: JUDGE_PROVIDER,
+    modelName: JUDGE_MODEL,
+    durationMs,
+    usage: normalizeUsage(result.totalUsage),
+    pass,
+    explanation,
+  };
+}
+
+// ── Tool apply helpers ─────────────────────────────────────────
+
+/**
+ * Applies one `old_string` -> `new_string` replacement to `fileContent`.
+ *
+ * Both strings go through `escapeSearchReplaceMarkers`, are wrapped in a
+ * single SEARCH/REPLACE block, and that block is handed to
+ * `applySearchReplace`.
+ *
+ * @returns The edited file content.
+ * @throws Error when `applySearchReplace` reports a failure.
+ */
+function applySearchReplaceEdit(
+  fileContent: string,
+  args: { old_string: string; new_string: string },
+): string {
+  const searchText = escapeSearchReplaceMarkers(args.old_string);
+  const replaceText = escapeSearchReplaceMarkers(args.new_string);
+  const block = [
+    "<<<<<<< SEARCH",
+    searchText,
+    "=======",
+    replaceText,
+    ">>>>>>> REPLACE",
+  ].join("\n");
+
+  const outcome = applySearchReplace(fileContent, block);
+  if (outcome.success) {
+    return outcome.content!;
+  }
+  throw new Error(`applySearchReplace failed: ${outcome.error}`);
+}
+
+// Stand-in for the production `edit_file` tool's engine call. Mirrors
+// `callTurboFileEdit` in src/pro/main/ipc/handlers/local_agent/tools/edit_file.ts
+// but reaches the engine directly (no AgentContext required). The base URL is
+// imported from `helpers/get_eval_model` so this and the SDK provider can't
+// drift apart.
+
+/**
+ * POSTs an edit request to the engine's `/tools/turbo-file-edit` endpoint
+ * and returns the `result` string from the JSON response.
+ *
+ * @param params - Target path, the edit content, the current file content,
+ *   optional instructions (sent as "" when absent) and an optional abort signal.
+ * @returns The `result` field of the response payload.
+ * @throws Error when `DYAD_PRO_API_KEY` is unset, when the response status is
+ *   not OK, or when the payload has no string `result`.
+ */
+async function turboFileEdit(params: {
+  path: string;
+  content: string;
+  originalContent: string;
+  instructions?: string;
+  signal?: AbortSignal;
+}): Promise<string> {
+  const proApiKey = process.env.DYAD_PRO_API_KEY;
+  if (!proApiKey) {
+    throw new Error(
+      "DYAD_PRO_API_KEY is required to run eval suites that use edit_file",
+    );
+  }
+
+  const endpoint = `${DYAD_ENGINE_URL}/tools/turbo-file-edit`;
+  const requestHeaders = {
+    "Content-Type": "application/json",
+    Authorization: `Bearer ${proApiKey}`,
+    "X-Dyad-Request-Id": randomUUID(),
+  };
+  const requestBody = JSON.stringify({
+    path: params.path,
+    content: params.content,
+    originalContent: params.originalContent,
+    instructions: params.instructions ?? "",
+  });
+
+  const response = await fetch(endpoint, {
+    method: "POST",
+    headers: requestHeaders,
+    body: requestBody,
+    signal: params.signal,
+  });
+  if (!response.ok) {
+    const detail = await response.text();
+    throw new Error(
+      `turbo-file-edit failed: ${response.status} ${response.statusText} - ${detail}`,
+    );
+  }
+
+  const data = (await response.json()) as { result?: unknown };
+  const edited = data.result;
+  if (typeof edited === "string") {
+    return edited;
+  }
+  throw new Error("turbo-file-edit returned unexpected payload (no result)");
+}
+
+// ── Tool factories ─────────────────────────────────────────────
+//
+// Each factory returns an AI-SDK tool whose `execute` mutates the
+// shared `state.content` box and appends a `ToolCallRecord` (for failed
+// calls too). Each factory closes over `state` and the case, so the
+// returned tool stays bound to this single run.
+
+// Mutable per-run state shared by every harness tool built for one case.
+interface ToolRunState {
+  // Current file content; a successful tool call overwrites it.
+  content: string;
+  // One record per tool invocation, appended on success and on failure.
+  toolCalls: ToolCallRecord[];
+  // Forwarded to the turbo-file-edit request by the edit_file harness tool.
+  abortSignal?: AbortSignal;
+}
+
+/**
+ * Builds the `ToolCallRecord` describing one tool invocation.
+ *
+ * @param toolName - Name of the tool that was called.
+ * @param filePath - Path the call targeted.
+ * @param args - Arguments to store on the record.
+ * @param fileBefore - File content before the call.
+ * @param fileAfter - File content after the call.
+ * @param index - Zero-based position of the call; diff labels show `index + 1`.
+ * @param opts - `succeeded` defaults to true; `error` is kept only for failed calls.
+ * @returns The record, including a unified diff of before vs. after.
+ */
+function makeRecord(
+  toolName: string,
+  filePath: string,
+  args: Record<string, unknown>,
+  fileBefore: string,
+  fileAfter: string,
+  index: number,
+  opts: { succeeded?: boolean; error?: string | null } = {},
+): ToolCallRecord {
+  const didSucceed = opts.succeeded ?? true;
+  const callNumber = index + 1;
+  const timestamp = new Date().toISOString();
+  const diff = createUnifiedDiff(fileBefore, fileAfter, {
+    oldLabel: `${filePath} (before call ${callNumber})`,
+    newLabel: `${filePath} (after call ${callNumber})`,
+  });
+
+  // A successful call never carries an error, whatever the caller passed.
+  let error: string | null = null;
+  if (!didSucceed) {
+    error = opts.error ?? null;
+  }
+
+  return {
+    timestamp,
+    index,
+    toolName,
+    filePath,
+    args,
+    fileBefore,
+    fileAfter,
+    diff,
+    succeeded: didSucceed,
+    error,
+  };
+}
+
+/**
+ * Builds the harness version of `search_replace` for one run.
+ *
+ * The tool reuses the production description and input schema, applies the
+ * edit to `state.content`, and appends a `ToolCallRecord` for every call.
+ * A call that targets a file other than the case's file, or whose edit
+ * cannot be applied, is recorded as failed and the error is rethrown.
+ */
+function searchReplaceHarnessTool(
+  state: ToolRunState,
+  c: EvalCase,
+  label: string,
+): Tool {
+  const toolName = "search_replace";
+  return {
+    description: searchReplaceTool.description,
+    inputSchema: searchReplaceTool.inputSchema,
+    execute: async (args) => {
+      const { file_path, old_string, new_string } = args;
+      const snapshot = state.content;
+      const recordArgs = { file_path, old_string, new_string };
+      // The call index is read at push time so it counts records already stored.
+      const log = (
+        filePath: string,
+        fileAfter: string,
+        opts?: { succeeded?: boolean; error?: string | null },
+      ) => {
+        state.toolCalls.push(
+          makeRecord(
+            toolName,
+            filePath,
+            recordArgs,
+            snapshot,
+            fileAfter,
+            state.toolCalls.length,
+            opts,
+          ),
+        );
+      };
+
+      try {
+        const targetsCaseFile = pathMatchesCase(file_path, c.fileName);
+        if (!targetsCaseFile) {
+          throw new Error(
+            `${label} / ${c.name} search_replace targeted wrong file: ` +
+              `got "${file_path}", expected "${c.fileName}"`,
+          );
+        }
+        state.content = applySearchReplaceEdit(state.content, args);
+        log(file_path, state.content);
+        return `Successfully applied edits to ${file_path}`;
+      } catch (err) {
+        const message = err instanceof Error ? err.message : String(err);
+        // Failed calls leave the file untouched in the record: before === after.
+        log(file_path ?? c.fileName, snapshot, {
+          succeeded: false,
+          error: message,
+        });
+        throw err;
+      }
+    },
+  };
+}
+
+/**
+ * Builds the harness version of `write_file` for one run.
+ *
+ * The tool reuses the production description and input schema, replaces
+ * `state.content` with the supplied content, and appends a `ToolCallRecord`
+ * for every call. A call that targets a file other than the case's file is
+ * recorded as failed and the error is rethrown.
+ */
+function writeFileHarnessTool(
+  state: ToolRunState,
+  c: EvalCase,
+  label: string,
+): Tool {
+  const toolName = "write_file";
+  return {
+    description: writeFileTool.description,
+    inputSchema: writeFileTool.inputSchema,
+    execute: async (args) => {
+      const snapshot = state.content;
+      const recordArgs = {
+        path: args.path,
+        content: args.content,
+        description: args.description ?? "",
+      };
+      // The call index is read at push time so it counts records already stored.
+      const log = (
+        filePath: string,
+        fileAfter: string,
+        opts?: { succeeded?: boolean; error?: string | null },
+      ) => {
+        state.toolCalls.push(
+          makeRecord(
+            toolName,
+            filePath,
+            recordArgs,
+            snapshot,
+            fileAfter,
+            state.toolCalls.length,
+            opts,
+          ),
+        );
+      };
+
+      try {
+        const targetsCaseFile = pathMatchesCase(args.path, c.fileName);
+        if (!targetsCaseFile) {
+          throw new Error(
+            `${label} / ${c.name} write_file targeted wrong file: ` +
+              `got "${args.path}", expected "${c.fileName}"`,
+          );
+        }
+        state.content = args.content;
+        log(args.path, state.content);
+        return `Successfully wrote ${args.path}`;
+      } catch (err) {
+        const message = err instanceof Error ? err.message : String(err);
+        // Failed calls leave the file untouched in the record: before === after.
+        log(args.path ?? c.fileName, snapshot, {
+          succeeded: false,
+          error: message,
+        });
+        throw err;
+      }
+    },
+  };
+}
+
+/**
+ * Builds the harness version of `edit_file` for one run.
+ *
+ * The tool reuses the production description and input schema, sends the
+ * edit to `turboFileEdit` together with the current `state.content`, stores
+ * the returned content back on `state`, and appends a `ToolCallRecord` for
+ * every call. A wrong-file call or a failed engine request is recorded as
+ * failed and the error is rethrown.
+ */
+function editFileHarnessTool(
+  state: ToolRunState,
+  c: EvalCase,
+  label: string,
+): Tool {
+  const toolName = "edit_file";
+  return {
+    description: editFileTool.description,
+    inputSchema: editFileTool.inputSchema,
+    execute: async (args) => {
+      const snapshot = state.content;
+      const recordArgs = {
+        path: args.path,
+        content: args.content,
+        instructions: args.instructions ?? "",
+      };
+      // The call index is read at push time so it counts records already stored.
+      const log = (
+        filePath: string,
+        fileAfter: string,
+        opts?: { succeeded?: boolean; error?: string | null },
+      ) => {
+        state.toolCalls.push(
+          makeRecord(
+            toolName,
+            filePath,
+            recordArgs,
+            snapshot,
+            fileAfter,
+            state.toolCalls.length,
+            opts,
+          ),
+        );
+      };
+
+      try {
+        const targetsCaseFile = pathMatchesCase(args.path, c.fileName);
+        if (!targetsCaseFile) {
+          throw new Error(
+            `${label} / ${c.name} edit_file targeted wrong file: ` +
+              `got "${args.path}", expected "${c.fileName}"`,
+          );
+        }
+        // NOTE(review): state.content is read before this await and written
+        // after it. If two edit_file calls for the same run were ever in
+        // flight at once, the later write would win — confirm how the SDK
+        // schedules tool calls within a step.
+        const edited = await turboFileEdit({
+          path: args.path,
+          content: args.content,
+          originalContent: state.content,
+          instructions: args.instructions,
+          signal: state.abortSignal,
+        });
+        state.content = edited;
+        log(args.path, state.content);
+        return `Successfully edited ${args.path}`;
+      } catch (err) {
+        const message = err instanceof Error ? err.message : String(err);
+        // Failed calls leave the file untouched in the record: before === after.
+        log(args.path ?? c.fileName, snapshot, {
+          succeeded: false,
+          error: message,
+        });
+        throw err;
+      }
+    },
+  };
+}
+
+// ── Suite configs ──────────────────────────────────────────────
+
+// One evaluation suite: a system prompt plus the set of tools the model may call.
+interface SuiteConfig {
+  // Exact identifier matched against EVAL_SUITE and used in logs and records.
+  name: string;
+  // Human-readable title used in the vitest describe name.
+  displayName: string;
+  // System prompt passed to generateText for every case in the suite.
+  systemPrompt: string;
+  // Builds the tool map for a single run, bound to that run's state and case.
+  buildTools: (
+    state: ToolRunState,
+    c: EvalCase,
+    label: string,
+  ) => Record<string, Tool>;
+}
+
+// Registry of suites selectable through EVAL_SUITE. Entries differ only in
+// their system prompt and in which harness tools are exposed to the model.
+const SUITES: SuiteConfig[] = [
+  {
+    name: "search_replace",
+    displayName: "search_replace",
+    systemPrompt: SIMPLE_SEARCH_REPLACE_SYSTEM_PROMPT,
+    buildTools: (state, c, label) => ({
+      search_replace: searchReplaceHarnessTool(state, c, label),
+    }),
+  },
+  {
+    name: "search_replace_few",
+    displayName: "search_replace_few (minimize call count)",
+    systemPrompt: SEARCH_REPLACE_FEW_SYSTEM_PROMPT,
+    buildTools: (state, c, label) => ({
+      search_replace: searchReplaceHarnessTool(state, c, label),
+    }),
+  },
+  {
+    name: "edit_file",
+    displayName: "edit_file",
+    systemPrompt: SIMPLE_EDIT_FILE_SYSTEM_PROMPT,
+    buildTools: (state, c, label) => ({
+      edit_file: editFileHarnessTool(state, c, label),
+    }),
+  },
+  {
+    name: "basic_agent",
+    displayName: "basic_agent (search_replace + write_file)",
+    systemPrompt: constructLocalAgentPrompt(undefined, undefined, {
+      basicAgentMode: true,
+    }),
+    buildTools: (state, c, label) => ({
+      search_replace: searchReplaceHarnessTool(state, c, label),
+      write_file: writeFileHarnessTool(state, c, label),
+    }),
+  },
+  {
+    name: "pro_agent",
+    displayName: "pro_agent (search_replace + edit_file + write_file)",
+    systemPrompt: constructLocalAgentPrompt(undefined),
+    buildTools: (state, c, label) => ({
+      search_replace: searchReplaceHarnessTool(state, c, label),
+      edit_file: editFileHarnessTool(state, c, label),
+      write_file: writeFileHarnessTool(state, c, label),
+    }),
+  },
+  {
+    // Mirrors pro_agent but uses a standalone copy of the prompt
+    // (see helpers/prompts.ts) so prompt variations can be recorded
+    // without modifying the production prompt.
+    name: "pro_agent_experimental",
+    displayName:
+      "pro_agent_experimental (pro_agent with editable prompt copy)",
+    systemPrompt: PRO_AGENT_EXPERIMENTAL_SYSTEM_PROMPT,
+    buildTools: (state, c, label) => ({
+      search_replace: searchReplaceHarnessTool(state, c, label),
+      edit_file: editFileHarnessTool(state, c, label),
+      write_file: writeFileHarnessTool(state, c, label),
+    }),
+  },
+];
+
+// ── Model matrix ───────────────────────────────────────────────
+
+// Models the suites can be run against. `label` is the display name (and,
+// like `modelName`, is matched by the EVAL_MODEL filter); `temperature` is
+// passed to generateText for that model.
+const ALL_MODELS: Array<{
+  provider: EvalProvider;
+  modelName: string;
+  label: string;
+  temperature: number;
+}> = [
+  {
+    provider: "anthropic",
+    modelName: SONNET_4_6,
+    label: "Claude Sonnet 4.6",
+    temperature: 0,
+  },
+  {
+    provider: "openai",
+    modelName: GPT_5_4,
+    label: "GPT 5.4",
+    temperature: 1,
+  },
+  {
+    provider: "google",
+    modelName: GEMINI_3_FLASH,
+    label: "Gemini 3 Flash",
+    temperature: 1,
+  },
+];
+
+// ── Case runner ────────────────────────────────────────────────
+
+/**
+ * Runs one eval case for one suite/model pair and always writes a run record.
+ *
+ * Steps: call the model with the suite's tools, require at least one
+ * successful tool call, run the case's substring structural checks against
+ * the final content, then ask the judge. Any failure is rethrown; the record
+ * is written from the `finally` block either way.
+ *
+ * @param suite - Suite supplying the system prompt and tool set.
+ * @param c - Case supplying the fixture file, instruction and structural checks.
+ * @param provider - Provider passed to `getEvalModel`.
+ * @param modelName - Model id passed to `getEvalModel`.
+ * @param label - Display name used in logs, errors and the record.
+ * @param temperature - Sampling temperature passed to `generateText`.
+ * @throws Rethrows any error from the model call, the checks or the judge.
+ */
+async function runCase(
+  suite: SuiteConfig,
+  c: EvalCase,
+  provider: EvalProvider,
+  modelName: string,
+  label: string,
+  temperature: number,
+): Promise<void> {
+  const runTimestamp = new Date().toISOString();
+  const llmStartMs = Date.now();
+  // End time of the previous step, so each step's duration is measured
+  // from where the last one finished.
+  let lastStepEndMs = llmStartMs;
+  const requests: LLMRequestRecord[] = [];
+  let totalUsage = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
+  let totalDurationMs = 0;
+  let responseModelId: string | null = null;
+  let judgeRecord: JudgeRecord | null = null;
+  let passed = false;
+  let errorMessage: string | null = null;
+
+  const systemPrompt = suite.systemPrompt;
+  const userPrompt = `File: ${c.fileName}\n\`\`\`\n${c.fileContent}\n\`\`\`\n\n${c.prompt}`;
+
+  // Internal timeout fires slightly before vitest's testTimeout so the
+  // finally block still runs and we capture a partial record (tool calls,
+  // LLM requests so far, current file state) instead of losing everything
+  // to a hard vitest timeout. Keep this strictly less than testTimeout in
+  // vitest.eval.config.ts.
+  const INTERNAL_TIMEOUT_MS = 330_000;
+  const abortController = new AbortController();
+
+  const state: ToolRunState = {
+    content: c.fileContent,
+    toolCalls: [],
+    abortSignal: abortController.signal,
+  };
+  const timeoutHandle = setTimeout(() => {
+    abortController.abort(
+      new Error(
+        `runCase internal timeout: exceeded ${INTERNAL_TIMEOUT_MS}ms budget`,
+      ),
+    );
+  }, INTERNAL_TIMEOUT_MS);
+
+  try {
+    const result = await generateText({
+      model: getEvalModel(provider, modelName),
+      temperature,
+      stopWhen: stepCountIs(100),
+      abortSignal: abortController.signal,
+      system: systemPrompt,
+      messages: [
+        {
+          role: "user",
+          content: userPrompt,
+        },
+      ],
+      tools: suite.buildTools(state, c, label),
+      // Record each step as it completes so a later failure still leaves
+      // per-request timing and usage behind.
+      onStepFinish: (step) => {
+        const now = Date.now();
+        requests.push({
+          stepIndex: requests.length,
+          timestamp: step.response.timestamp.toISOString(),
+          durationMs: now - lastStepEndMs,
+          usage: normalizeUsage(step.usage),
+          finishReason: step.finishReason ?? null,
+        });
+        lastStepEndMs = now;
+      },
+    });
+
+    totalDurationMs = Date.now() - llmStartMs;
+    totalUsage = normalizeUsage(result.totalUsage);
+    responseModelId = result.response.modelId ?? null;
+
+    // Calls the model attempted (from the SDK steps); state.toolCalls below
+    // holds the harness's own record of each executed call.
+    const totalCalls = result.steps.reduce((n, s) => n + s.toolCalls.length, 0);
+    console.log(
+      `\n[${suite.name} / ${label}] ${c.name} — ${totalCalls} tool call(s):`,
+    );
+    for (const [i, tc] of state.toolCalls.entries()) {
+      // String arguments are summarized by length to keep the log short.
+      const argSummary = Object.entries(tc.args)
+        .map(([k, v]) =>
+          typeof v === "string" ? `${k} (${v.length} chars)` : `${k}=${v}`,
+        )
+        .join(", ");
+      console.log(
+        `  Call ${i + 1}: ${tc.toolName} file=${tc.filePath}, ${argSummary}`,
+      );
+    }
+
+    const successfulCalls = state.toolCalls.filter((tc) => tc.succeeded).length;
+    if (successfulCalls === 0) {
+      throw new Error(
+        `${label} made no successful tool calls (attempted ${totalCalls})`,
+      );
+    }
+
+    // Structural checks are plain substring tests; the first miss fails the run.
+    for (const check of c.structuralChecks ?? []) {
+      const ok = state.content.includes(check);
+      console.log(`  Structural check "${check}": ${ok ? "PASS" : "FAIL"}`);
+      if (!ok) {
+        throw new Error(
+          `Structural check failed: expected output to contain "${check}"`,
+        );
+      }
+    }
+
+    const recordDir = recordDirFor(suite.name, c.name, label);
+    console.log(
+      `\n[${suite.name} / ${label}] ${c.name} — final content (${state.content.length} chars, first 500):\n${state.content.slice(0, 500)}...\n` +
+        `  Full record will be written to: ${recordDir}`,
+    );
+
+    console.log(`\n[${suite.name} / ${label}] ${c.name} — calling judge...`);
+    // The judge call shares the run's abort signal, so it counts against the
+    // same internal timeout budget.
+    judgeRecord = await judgeResult(
+      c.fileContent,
+      c.prompt,
+      state.content,
+      abortController.signal,
+    );
+    console.log(
+      `\n[${suite.name} / ${label}] ${c.name} — judge verdict: ${judgeRecord.pass ? "PASS" : "FAIL"}\n${judgeRecord.explanation}`,
+    );
+
+    if (!judgeRecord.pass) {
+      throw new Error(
+        `Judge (${JUDGE_LABEL}) said FAIL for ${label}:\n${judgeRecord.explanation}`,
+      );
+    }
+    passed = true;
+  } catch (err) {
+    errorMessage = err instanceof Error ? err.message : String(err);
+    // Still zero means generateText itself threw before the duration was set.
+    if (totalDurationMs === 0) totalDurationMs = Date.now() - llmStartMs;
+    // generateText throws before we can read result.totalUsage, but any
+    // already-completed steps were captured in `requests` via onStepFinish.
+    // Sum those so failed runs still report real token consumption instead
+    // of zeros — otherwise cost and per-model comparisons get skewed for
+    // exactly the failure cases we most care about analyzing.
+    if (totalUsage.totalTokens === 0 && requests.length > 0) {
+      totalUsage = requests.reduce(
+        (acc, r) => ({
+          inputTokens: acc.inputTokens + r.usage.inputTokens,
+          outputTokens: acc.outputTokens + r.usage.outputTokens,
+          totalTokens: acc.totalTokens + r.usage.totalTokens,
+        }),
+        { inputTokens: 0, outputTokens: 0, totalTokens: 0 },
+      );
+    }
+    throw err;
+  } finally {
+    clearTimeout(timeoutHandle);
+    // Written for passing and failing runs alike; `judge` stays null when the
+    // run failed before the judge was reached.
+    await recordEvalRun({
+      timestamp: runTimestamp,
+      suite: suite.name,
+      caseName: c.name,
+      model: { label, provider, modelName, responseModelId },
+      prompt: {
+        system: systemPrompt,
+        instructions: c.prompt,
+        user: userPrompt,
+      },
+      file: {
+        name: c.fileName,
+        before: c.fileContent,
+        after: state.content,
+      },
+      llm: {
+        totalDurationMs,
+        totalUsage,
+        requestCount: requests.length,
+        requests,
+      },
+      toolCalls: state.toolCalls,
+      diff: createUnifiedDiff(c.fileContent, state.content, {
+        oldLabel: `${c.fileName} (original)`,
+        newLabel: `${c.fileName} (modified)`,
+      }),
+      judge: judgeRecord,
+      passed,
+      errorMessage,
+    });
+  }
+}
+
+// ── Filters + test runner ──────────────────────────────────────
+//
+// `EVAL_SUITE` and `EVAL_MODEL` are both required — running every suite
+// against every model by accident is expensive, so the caller must opt
+// in explicitly. Use `all` to mean "run everything". `EVAL_SUITE` matches
+// suite names exactly (comma-separated for multiple, e.g.
+// `EVAL_SUITE=search_replace,edit_file`) so that `search_replace` does
+// not also pick up `search_replace_few`. `EVAL_MODEL` is a
+// case-insensitive substring match against model label or id.
+
+// Raw filter values; both must be set (use "all" to run everything).
+const SUITE_FILTER_RAW = process.env.EVAL_SUITE?.trim();
+const MODEL_FILTER_RAW = process.env.EVAL_MODEL?.trim();
+
+if (!SUITE_FILTER_RAW || !MODEL_FILTER_RAW) {
+  const missingEnv: string[] = [];
+  if (!SUITE_FILTER_RAW) missingEnv.push("EVAL_SUITE");
+  if (!MODEL_FILTER_RAW) missingEnv.push("EVAL_MODEL");
+  const suiteOptions = SUITES.map((s) => s.name).join(", ");
+  const modelOptions = ALL_MODELS.map((m) => m.label).join(", ");
+  console.warn(
+    `\n⚠️  Eval suite not running: ${missingEnv.join(" and ")} not set.\n` +
+      `  Set EVAL_SUITE to "all" or an exact name (comma-separated for multiple) from: ${suiteOptions}\n` +
+      `  Set EVAL_MODEL to "all" or a substring of a label: ${modelOptions}\n` +
+      `  Example:\n` +
+      `    EVAL_SUITE=all EVAL_MODEL=all DYAD_PRO_API_KEY="..." npm run eval\n`,
+  );
+  // Register a single skipped describe so vitest still reports something
+  // coherent (rather than "no tests found").
+  describe.skip("eval suite — configuration required", () => {
+    it("set EVAL_SUITE and EVAL_MODEL (use 'all' to run every suite/model)", () => {});
+  });
+} else {
+  const availableSuites = SUITES.map((s) => s.name).join(", ");
+  const availableModels = ALL_MODELS.map((m) => m.label).join(", ");
+
+  // Suite names are matched exactly (case-insensitively); `null` means "all".
+  const suiteFilter = SUITE_FILTER_RAW.toLowerCase();
+  const requestedSuiteNames =
+    suiteFilter === "all"
+      ? null
+      : new Set(
+          suiteFilter
+            .split(",")
+            .map((s) => s.trim())
+            .filter((s) => s !== ""),
+        );
+  const ACTIVE_SUITES =
+    requestedSuiteNames === null
+      ? SUITES
+      : SUITES.filter((s) => requestedSuiteNames.has(s.name.toLowerCase()));
+
+  // Surface filter misconfiguration as a clean failing test rather than
+  // crashing module load with an opaque stack trace. The describe block
+  // gives vitest a place to attach the (carefully written) error message.
+  //
+  // The test title is stored separately from the full message instead of
+  // being cut out of it at the first ".": filter values routinely contain
+  // periods (e.g. EVAL_MODEL=5.4), which would truncate the title mid-value.
+  const configErrors: Array<{ title: string; message: string }> = [];
+  if (ACTIVE_SUITES.length === 0) {
+    const title = `EVAL_SUITE="${SUITE_FILTER_RAW}" matched no suites`;
+    configErrors.push({
+      title,
+      message:
+        `${title}. ` +
+        `Available: ${availableSuites} (or "all"). ` +
+        `Use exact names, comma-separated for multiple.`,
+    });
+  } else if (requestedSuiteNames !== null) {
+    // Some names matched; report the ones that did not so typos are not
+    // silently ignored.
+    const matched = new Set(ACTIVE_SUITES.map((s) => s.name.toLowerCase()));
+    const unknown = [...requestedSuiteNames].filter((n) => !matched.has(n));
+    if (unknown.length > 0) {
+      const title = `EVAL_SUITE contains unknown suite name(s): ${unknown.join(", ")}`;
+      configErrors.push({
+        title,
+        message: `${title}. ` + `Available: ${availableSuites} (or "all").`,
+      });
+    }
+  }
+
+  // Models are matched by case-insensitive substring of label or model id.
+  const modelFilter = MODEL_FILTER_RAW.toLowerCase();
+  const MODELS =
+    modelFilter === "all"
+      ? ALL_MODELS
+      : ALL_MODELS.filter(
+          (m) =>
+            m.label.toLowerCase().includes(modelFilter) ||
+            m.modelName.toLowerCase().includes(modelFilter),
+        );
+
+  if (MODELS.length === 0) {
+    const title = `EVAL_MODEL="${MODEL_FILTER_RAW}" matched no models`;
+    configErrors.push({
+      title,
+      message: `${title}. ` + `Available labels: ${availableModels} (or "all")`,
+    });
+  }
+
+  if (configErrors.length > 0) {
+    describe("eval suite — configuration error", () => {
+      for (const { title, message } of configErrors) {
+        it(title, () => {
+          throw new Error(message);
+        });
+      }
+    });
+  } else {
+    // One describe per suite/model pair, one concurrent test per case.
+    for (const suite of ACTIVE_SUITES) {
+      for (const { provider, modelName, label, temperature } of MODELS) {
+        describe.skipIf(!hasDyadProKey())(
+          `${suite.displayName} — ${label}`,
+          () => {
+            for (const c of CASES) {
+              it.concurrent(c.name, async () => {
+                try {
+                  await runCase(
+                    suite,
+                    c,
+                    provider,
+                    modelName,
+                    label,
+                    temperature,
+                  );
+                } catch (err) {
+                  // Log with suite/model context, then rethrow so the test fails.
+                  console.error(
+                    `\n[${suite.name} / ${label}] ${c.name} — ERROR: ${err instanceof Error ? err.message : String(err)}`,
+                  );
+                  throw err;
+                }
+              });
+            }
+          },
+        );
+      }
+    }
+  }
+}
--- a/tsconfig.app.json
+++ b/tsconfig.app.json
@@ -28,5 +28,9 @@
    }
  },
  "include": ["src", "e2e-tests", "shared"],
-  "exclude": ["e2e-tests/fixtures"]
+  "exclude": [
+    "e2e-tests/fixtures",
+    "src/__tests__/evals/fixtures",
+    "eval-results"
+  ]
 }
--- a/vitest.eval.config.ts
+++ b/vitest.eval.config.ts
+import { defineConfig } from "vitest/config";
+import { resolve } from "path";
+
+// Vitest configuration used only by `npm run eval`; it picks up `*.eval.ts`
+// files under src/__tests__/evals.
+export default defineConfig({
+  test: {
+    environment: "node",
+    include: ["src/__tests__/evals/**/*.eval.ts"],
+    globals: true,
+    // Must stay strictly greater than INTERNAL_TIMEOUT_MS in the eval
+    // runner's runCase, so its internal abort fires first and a partial
+    // record can still be written.
+    testTimeout: 360_000,
+    maxConcurrency: 5,
+  },
+  resolve: {
+    // Lets eval files import from "@/..." as an alias for ./src.
+    alias: { "@": resolve(__dirname, "./src") },
+  },
+});