Unverified 提交 dd1e4881 authored 作者: Mohamed Aziz Mejri's avatar Mohamed Aziz Mejri 提交者: GitHub

Web fetching tool (#2920)

closes #2809 <!-- devin-review-badge-begin --> --- <a href="https://app.devin.ai/review/dyad-sh/dyad/pull/2920" target="_blank"> <picture> <source media="(prefers-color-scheme: dark)" srcset="https://static.devin.ai/assets/gh-open-in-devin-review-dark.svg?v=1"> <img src="https://static.devin.ai/assets/gh-open-in-devin-review-light.svg?v=1" alt="Open with Devin"> </picture> </a> <!-- devin-review-badge-end --> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
上级 7db53e5a
import type { LocalAgentFixture } from "../../../../testing/fake-llm-server/localAgentTypes";
/**
 * Scripted local-agent conversation exercising the `web_fetch` tool.
 *
 * Turn 1 emits assistant text together with a single `web_fetch` tool call
 * for a fixed example URL; turn 2 is the assistant's closing text-only reply.
 */
export const fixture: LocalAgentFixture = {
description: "Fetch and read web page content using web_fetch tool",
turns: [
{
// Turn 1: announce the fetch and issue the tool call.
text: "I'll fetch the content of that page for you.",
toolCalls: [
{
name: "web_fetch",
args: {
url: "https://example.com/docs/getting-started",
},
},
],
},
{
// Turn 2: final text-only answer, no further tool calls.
text: "Here's a summary of the page content. The getting started guide covers three main items. Let me know if you need more details!",
},
],
};
import { testSkipIfWindows } from "./helpers/test_helper";
/**
 * E2E test for the web_fetch tool in local-agent mode.
 *
 * Sets up Dyad Pro with the local agent enabled, imports the "minimal" app,
 * switches the chat to local-agent mode, sends the web-fetch test prompt and
 * snapshots the resulting messages.
 *
 * Note: web_fetch has defaultConsent: "always", so no consent flow is tested.
 */
testSkipIfWindows("local-agent - web fetch", async ({ po }) => {
await po.setUpDyadPro({ localAgent: true });
await po.importApp("minimal");
await po.chatActions.selectLocalAgentMode();
// NOTE(review): the "tc=" prefix presumably selects the local-agent/web-fetch
// fixture on the fake LLM server — confirm against the test server.
await po.sendPrompt("tc=local-agent/web-fetch");
await po.snapshotMessages();
});
...@@ -276,6 +276,27 @@ ...@@ -276,6 +276,27 @@
} }
} }
}, },
{
"type": "function",
"function": {
"name": "web_fetch",
"description": "Fetch and read the content of a web page as markdown given its URL.\n\n### When to Use This Tool\nUse this tool when the user's message contains a URL (or domain name) and they want to:\n- **Read** the page's content (e.g. documentation, blog post, article)\n- **Reference** information from the page (e.g. API docs, tutorials, guides)\n- **Extract** data or context from a live web page to inform their code\n- **Follow a link** someone shared to understand its contents\n\nExamples:\n- \"Use the docs at docs.example.com/api to set up the client\"\n- \"What does this page say? https://example.com/blog/post\"\n- \"Follow the guide at example.com/tutorial\"\n\n### When NOT to Use This Tool\n- The user wants to **visually clone or replicate** a website → use `web_crawl` instead\n- The user needs to **search the web** for information without a specific URL → use `web_search` instead\n",
"parameters": {
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "URL to fetch content from"
}
},
"required": [
"url"
],
"additionalProperties": false
}
}
},
{ {
"type": "function", "type": "function",
"function": { "function": {
......
...@@ -451,6 +451,25 @@ ...@@ -451,6 +451,25 @@
"additionalProperties": false "additionalProperties": false
} }
}, },
{
"type": "function",
"name": "web_fetch",
"description": "Fetch and read the content of a web page as markdown given its URL.\n\n### When to Use This Tool\nUse this tool when the user's message contains a URL (or domain name) and they want to:\n- **Read** the page's content (e.g. documentation, blog post, article)\n- **Reference** information from the page (e.g. API docs, tutorials, guides)\n- **Extract** data or context from a live web page to inform their code\n- **Follow a link** someone shared to understand its contents\n\nExamples:\n- \"Use the docs at docs.example.com/api to set up the client\"\n- \"What does this page say? https://example.com/blog/post\"\n- \"Follow the guide at example.com/tutorial\"\n\n### When NOT to Use This Tool\n- The user wants to **visually clone or replicate** a website → use `web_crawl` instead\n- The user needs to **search the web** for information without a specific URL → use `web_search` instead\n",
"parameters": {
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "URL to fetch content from"
}
},
"required": [
"url"
],
"additionalProperties": false
}
},
{ {
"type": "function", "type": "function",
"name": "generate_image", "name": "generate_image",
......
...@@ -465,6 +465,27 @@ ...@@ -465,6 +465,27 @@
} }
} }
}, },
{
"type": "function",
"function": {
"name": "web_fetch",
"description": "Fetch and read the content of a web page as markdown given its URL.\n\n### When to Use This Tool\nUse this tool when the user's message contains a URL (or domain name) and they want to:\n- **Read** the page's content (e.g. documentation, blog post, article)\n- **Reference** information from the page (e.g. API docs, tutorials, guides)\n- **Extract** data or context from a live web page to inform their code\n- **Follow a link** someone shared to understand its contents\n\nExamples:\n- \"Use the docs at docs.example.com/api to set up the client\"\n- \"What does this page say? https://example.com/blog/post\"\n- \"Follow the guide at example.com/tutorial\"\n\n### When NOT to Use This Tool\n- The user wants to **visually clone or replicate** a website → use `web_crawl` instead\n- The user needs to **search the web** for information without a specific URL → use `web_search` instead\n",
"parameters": {
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "URL to fetch content from"
}
},
"required": [
"url"
],
"additionalProperties": false
}
}
},
{ {
"type": "function", "type": "function",
"function": { "function": {
......
- paragraph: /Generate an AI_RULES\.md file for this app\. Describe the tech stack in 5-\d+ bullet points and describe clear rules about what libraries to use for what\./
- button "file1.txt file1.txt Edit":
- img
- text: ""
- button "Edit":
- img
- text: ""
- img
- paragraph: More EOM
- button "Copy":
- img
- img
- text: Approved
- img
- text: claude-opus-4-5
- img
- text: less than a minute ago
- img
- text: (1 files changed)
- button "Copy Request ID":
- img
- text: ""
- paragraph: tc=local-agent/web-fetch
- paragraph: I'll fetch the content of that page for you.
- img
- text: Web Fetch
- img
- text: Done https://example.com/docs/getting-started
- paragraph: Here's a summary of the page content. The getting started guide covers three main items. Let me know if you need more details!
- button "Copy":
- img
- img
- text: claude-opus-4-5
- img
- text: less than a minute ago
- button "Copy Request ID":
- img
- text: ""
- button "Undo":
- img
- text: ""
- button "Retry":
- img
- text: ""
\ No newline at end of file
...@@ -27,6 +27,7 @@ import { DyadMcpToolResult } from "./DyadMcpToolResult"; ...@@ -27,6 +27,7 @@ import { DyadMcpToolResult } from "./DyadMcpToolResult";
import { DyadWebSearchResult } from "./DyadWebSearchResult"; import { DyadWebSearchResult } from "./DyadWebSearchResult";
import { DyadWebSearch } from "./DyadWebSearch"; import { DyadWebSearch } from "./DyadWebSearch";
import { DyadWebCrawl } from "./DyadWebCrawl"; import { DyadWebCrawl } from "./DyadWebCrawl";
import { DyadWebFetch } from "./DyadWebFetch";
import { DyadImageGeneration } from "./DyadImageGeneration"; import { DyadImageGeneration } from "./DyadImageGeneration";
import { DyadCodeSearchResult } from "./DyadCodeSearchResult"; import { DyadCodeSearchResult } from "./DyadCodeSearchResult";
import { DyadCodeSearch } from "./DyadCodeSearch"; import { DyadCodeSearch } from "./DyadCodeSearch";
...@@ -64,6 +65,7 @@ const DYAD_CUSTOM_TAGS = [ ...@@ -64,6 +65,7 @@ const DYAD_CUSTOM_TAGS = [
"dyad-web-search-result", "dyad-web-search-result",
"dyad-web-search", "dyad-web-search",
"dyad-web-crawl", "dyad-web-crawl",
"dyad-web-fetch",
"dyad-code-search-result", "dyad-code-search-result",
"dyad-code-search", "dyad-code-search",
"dyad-read", "dyad-read",
...@@ -392,6 +394,18 @@ function renderCustomTag( ...@@ -392,6 +394,18 @@ function renderCustomTag(
{content} {content}
</DyadWebCrawl> </DyadWebCrawl>
); );
case "dyad-web-fetch":
return (
<DyadWebFetch
node={{
properties: {
state: getState({ isStreaming, inProgress }),
},
}}
>
{content}
</DyadWebFetch>
);
case "dyad-code-search": case "dyad-code-search":
return ( return (
<DyadCodeSearch <DyadCodeSearch
......
import type { FC, ReactNode } from "react";
import { Globe } from "lucide-react";
import {
DyadCard,
DyadCardHeader,
DyadBadge,
DyadStateIndicator,
} from "./DyadCardPrimitives";
import { CustomTagState } from "./stateTypes";
/**
 * Props for the {@link DyadWebFetch} card.
 */
interface DyadWebFetchProps {
// Content rendered in the card body below the header (optional).
children?: ReactNode;
// Wrapper carrying the tag's properties; only `state` is read by the component.
node?: {
properties: {
// Current state of the tag; when absent, no state indicator is rendered.
state?: CustomTagState;
};
};
}
/**
 * Chat card for a web fetch: a blue "Web Fetch" badge with an optional state
 * indicator in the header, and the children (if any) rendered in an italic,
 * muted body underneath.
 */
export const DyadWebFetch: FC<DyadWebFetchProps> = (props) => {
  const { children, node } = props;
  const fetchState = node?.properties?.state as CustomTagState;

  // Only show the indicator when a state was actually supplied.
  const indicator = fetchState && (
    <DyadStateIndicator
      state={fetchState}
      pendingLabel="Fetching..."
      finishedLabel="Done"
      abortedLabel="Aborted"
    />
  );

  // Body is omitted entirely when there is nothing to display.
  const body = children && (
    <div className="px-3 pb-2 text-sm italic text-muted-foreground">
      {children}
    </div>
  );

  return (
    <DyadCard state={fetchState} accentColor="blue">
      <DyadCardHeader icon={<Globe size={15} />} accentColor="blue">
        <DyadBadge color="blue">Web Fetch</DyadBadge>
        {indicator}
      </DyadCardHeader>
      {body}
    </DyadCard>
  );
};
...@@ -24,6 +24,7 @@ import { editFileTool } from "./tools/edit_file"; ...@@ -24,6 +24,7 @@ import { editFileTool } from "./tools/edit_file";
import { searchReplaceTool } from "./tools/search_replace"; import { searchReplaceTool } from "./tools/search_replace";
import { webSearchTool } from "./tools/web_search"; import { webSearchTool } from "./tools/web_search";
import { webCrawlTool } from "./tools/web_crawl"; import { webCrawlTool } from "./tools/web_crawl";
import { webFetchTool } from "./tools/web_fetch";
import { generateImageTool } from "./tools/generate_image"; import { generateImageTool } from "./tools/generate_image";
import { updateTodosTool } from "./tools/update_todos"; import { updateTodosTool } from "./tools/update_todos";
import { runTypeChecksTool } from "./tools/run_type_checks"; import { runTypeChecksTool } from "./tools/run_type_checks";
...@@ -65,6 +66,7 @@ export const TOOL_DEFINITIONS: readonly ToolDefinition[] = [ ...@@ -65,6 +66,7 @@ export const TOOL_DEFINITIONS: readonly ToolDefinition[] = [
readLogsTool, readLogsTool,
webSearchTool, webSearchTool,
webCrawlTool, webCrawlTool,
webFetchTool,
generateImageTool, generateImageTool,
updateTodosTool, updateTodosTool,
runTypeChecksTool, runTypeChecksTool,
......
import { z } from "zod";
import log from "electron-log";
import { ToolDefinition, escapeXmlContent, AgentContext } from "./types";
import { engineFetch } from "./engine_fetch";
const logger = log.scope("web_fetch");
/**
 * Asserts that `url` is a parseable absolute URL using the http or https scheme.
 *
 * @param url - Candidate URL string.
 * @throws Error when the string cannot be parsed by the `URL` constructor, or
 *   when its protocol is anything other than `http:` / `https:`.
 */
function validateHttpUrl(url: string): void {
  let protocol: string;
  try {
    ({ protocol } = new URL(url));
  } catch {
    throw new Error(`Invalid URL: ${url}`);
  }

  const isHttpFamily = ["http:", "https:"].includes(protocol);
  if (isHttpFamily) return;

  throw new Error(
    `Unsupported URL scheme "${protocol}" — only http and https are allowed`,
  );
}
/** Maximum number of UTF-16 code units of fetched content returned to the caller. */
const MAX_CONTENT_LENGTH = 80_000;

/**
 * Caps `value` at {@link MAX_CONTENT_LENGTH} code units, appending a
 * `<!-- truncated -->` marker when anything was cut.
 *
 * If the cut would land between the two halves of a surrogate pair, the
 * dangling high surrogate is dropped as well so the result never ends the
 * kept content with a lone (invalid) surrogate.
 *
 * @param value - Text to cap.
 * @returns `value` unchanged when it fits, otherwise the truncated text plus marker.
 */
function truncateContent(value: string): string {
  if (value.length <= MAX_CONTENT_LENGTH) return value;

  let end = MAX_CONTENT_LENGTH;
  const lastKept = value.charCodeAt(end - 1);
  // 0xD800–0xDBFF is the high-surrogate range; its partner would be cut off.
  if (lastKept >= 0xd800 && lastKept <= 0xdbff) {
    end -= 1;
  }

  return `${value.slice(0, end)}\n\n<!-- truncated -->`;
}
/** Input schema for the `web_fetch` tool: a single `url` string argument. */
const webFetchUrlField = z.string().describe("URL to fetch content from");
const webFetchSchema = z.object({ url: webFetchUrlField });
/** One fetched page: its URL and the page content as markdown. */
const webFetchPageSchema = z.object({
  url: z.string(),
  markdown: z.string(),
});

/**
 * Expected shape of the engine response: the root URL, an optional top-level
 * markdown string, and the list of fetched pages.
 */
const webFetchResponseSchema = z.object({
  rootUrl: z.string(),
  markdown: z.string().optional(),
  pages: z.array(webFetchPageSchema),
});
/**
 * Tool description for `web_fetch`, used as the `description` of the tool
 * definition below. It explains when to use the tool and when to prefer
 * `web_crawl` or `web_search` instead. Backticks are escaped because the
 * text lives inside a template literal.
 */
const DESCRIPTION = `Fetch and read the content of a web page as markdown given its URL.
### When to Use This Tool
Use this tool when the user's message contains a URL (or domain name) and they want to:
- **Read** the page's content (e.g. documentation, blog post, article)
- **Reference** information from the page (e.g. API docs, tutorials, guides)
- **Extract** data or context from a live web page to inform their code
- **Follow a link** someone shared to understand its contents
Examples:
- "Use the docs at docs.example.com/api to set up the client"
- "What does this page say? https://example.com/blog/post"
- "Follow the guide at example.com/tutorial"
### When NOT to Use This Tool
- The user wants to **visually clone or replicate** a website → use \`web_crawl\` instead
- The user needs to **search the web** for information without a specific URL → use \`web_search\` instead
`;
/**
 * POSTs `url` to the engine's `/tools/web-crawl` endpoint with
 * `markdownOnly: true` and returns the schema-validated response.
 *
 * @param url - Page to fetch.
 * @param ctx - Agent context; only `dyadRequestId` is required.
 * @returns The parsed response matching `webFetchResponseSchema`.
 * @throws Error with status, status text and response body when the HTTP
 *   response is not ok; schema validation errors propagate from `parse`.
 */
async function callWebFetch(
  url: string,
  ctx: Pick<AgentContext, "dyadRequestId">,
): Promise<z.infer<typeof webFetchResponseSchema>> {
  const res = await engineFetch(ctx, "/tools/web-crawl", {
    method: "POST",
    body: JSON.stringify({ url, markdownOnly: true }),
  });

  if (res.ok) {
    const payload = await res.json();
    return webFetchResponseSchema.parse(payload);
  }

  const detail = await res.text();
  throw new Error(
    `Web fetch failed: ${res.status} ${res.statusText} - ${detail}`,
  );
}
/**
 * `web_fetch` tool definition: fetches a web page through the engine and
 * returns its content as (possibly truncated) markdown.
 *
 * - Enabled only when `ctx.isDyadPro` is set.
 * - `buildXml` emits an opening `<dyad-web-fetch>` tag while arguments stream
 *   and defers the final XML to `execute`.
 * - `execute` always emits the closed tag via `onXmlComplete`, whether the
 *   fetch succeeds, the URL is rejected, or the request fails.
 */
export const webFetchTool: ToolDefinition<z.infer<typeof webFetchSchema>> = {
  name: "web_fetch",
  description: DESCRIPTION,
  inputSchema: webFetchSchema,
  defaultConsent: "always",
  // Requires Dyad Pro engine API
  isEnabled: (ctx) => ctx.isDyadPro,
  getConsentPreview: (args) => `Fetch URL: "${args.url}"`,
  buildXml: (args, isComplete) => {
    if (!args.url) return undefined;
    // When complete, return undefined so execute's onXmlComplete provides the final XML
    if (isComplete) return undefined;
    return `<dyad-web-fetch>${escapeXmlContent(args.url)}`;
  },
  execute: async (args, ctx) => {
    logger.log(`Executing web fetch: ${args.url}`);
    const openTag = `<dyad-web-fetch>${escapeXmlContent(args.url)}`;

    try {
      // Validation lives inside the try so a rejected URL still gets its
      // final XML emitted (buildXml relies on execute for that).
      validateHttpUrl(args.url);
      ctx.onXmlStream(openTag);

      // callWebFetch either returns a schema-validated object or throws,
      // so no null check on the result is needed.
      const result = await callWebFetch(args.url, ctx);

      // Combine markdown from all pages
      const allContent = result.pages
        .map((page) => `## ${page.url}\n\n${page.markdown}`)
        .join("\n\n---\n\n");
      if (!allContent) {
        throw new Error("No content available from web fetch");
      }

      logger.log(
        `Web fetch completed for URL: ${args.url} (${result.pages.length} pages)`,
      );
      return truncateContent(allContent);
    } finally {
      // Close the tag on every exit path; errors continue to propagate.
      ctx.onXmlComplete(`${openTag}</dyad-web-fetch>`);
    }
  },
};
...@@ -274,6 +274,28 @@ app.post("/engine/v1/images/generations", (req, res) => { ...@@ -274,6 +274,28 @@ app.post("/engine/v1/images/generations", (req, res) => {
} }
}); });
// Fake Dyad Engine web-crawl endpoint backing the web_fetch tool.
// Echoes the requested URL back as rootUrl with one canned markdown page.
app.post("/engine/v1/tools/web-crawl", (req, res) => {
  const { url, markdownOnly } = req.body;
  console.log(`* web-crawl: url="${url}", markdownOnly=${markdownOnly}`);

  try {
    const heading = `# Page content from ${url}`;
    const cannedPage = {
      url,
      markdown: `${heading}\n\nThis is the fetched content of the web page.\n\n- Item 1\n- Item 2\n- Item 3`,
    };
    res.json({
      rootUrl: url,
      markdown: heading,
      pages: [cannedPage],
    });
  } catch (error) {
    console.error(`* web-crawl error:`, error);
    res.status(400).json({ error: String(error) });
  }
});
// Start the server // Start the server
const server = createServer(app); const server = createServer(app);
server.listen(PORT, () => { server.listen(PORT, () => {
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论