Skip to main content

codetether_agent/tool/
webfetch.rs

1//! Web Fetch Tool - Fetches content from URLs and converts HTML to markdown/text/html.
2
3use anyhow::{Context, Result};
4use async_trait::async_trait;
5use serde::Deserialize;
6use serde_json::{json, Value};
7use std::time::Duration;
8use super::{Tool, ToolResult};
9
/// Size ceiling for fetched content, in bytes (10 MiB).
// `dead_code` is allowed so the build stays warning-free in configurations
// where nothing references this limit.
#[allow(dead_code)]
const MAX_CONTENT_LENGTH: usize = 10 << 20;
/// Time limit handed to the HTTP client builder for each request (30 seconds).
const REQUEST_TIMEOUT: Duration = Duration::new(30, 0);
13
14pub struct WebFetchTool {
15    client: reqwest::Client,
16}
17
18impl Default for WebFetchTool {
19    fn default() -> Self { Self::new() }
20}
21
22impl WebFetchTool {
23    pub fn new() -> Self {
24        let client = reqwest::Client::builder()
25            .timeout(REQUEST_TIMEOUT)
26            .user_agent("CodeTether-Agent/1.0")
27            .redirect(reqwest::redirect::Policy::limited(5))
28            .build()
29            .expect("Failed to build HTTP client");
30        Self { client }
31    }
32
33    fn html_to_markdown(&self, html: &str) -> String {
34        let mut result = html.to_string();
35        let patterns = [
36            (r"<h1[^>]*>(.*?)</h1>", "# $1\n"),
37            (r"<h2[^>]*>(.*?)</h2>", "## $1\n"),
38            (r"<h3[^>]*>(.*?)</h3>", "### $1\n"),
39            (r"<p[^>]*>(.*?)</p>", "$1\n\n"),
40            (r"<(strong|b)[^>]*>(.*?)</\1>", "**$2**"),
41            (r"<(em|i)[^>]*>(.*?)</\1>", "*$2*"),
42            (r"<code[^>]*>(.*?)</code>", "`$1`"),
43            (r"<li[^>]*>(.*?)</li>", "- $1\n"),
44        ];
45        for (pat, rep) in patterns {
46            if let Ok(re) = regex::Regex::new(pat) {
47                result = re.replace_all(&result, rep).to_string();
48            }
49        }
50        result = regex::Regex::new(r"<[^>]+>").unwrap().replace_all(&result, "").to_string();
51        result.replace("&nbsp;", " ").replace("&amp;", "&").replace("&lt;", "<").replace("&gt;", ">")
52    }
53
54    fn html_to_text(&self, html: &str) -> String {
55        let md = self.html_to_markdown(html);
56        md.replace("**", "").replace("*", "").replace("`", "").replace("# ", "")
57    }
58
59    fn is_html(&self, ct: &str, body: &str) -> bool {
60        ct.contains("text/html") || body.trim().starts_with('<')
61    }
62}
63
64#[derive(Deserialize)]
65struct Params { url: String, #[serde(default = "default_fmt")] format: String }
/// Output format used when the caller does not specify one.
fn default_fmt() -> String {
    String::from("markdown")
}
67
68#[async_trait]
69impl Tool for WebFetchTool {
70    fn id(&self) -> &str { "webfetch" }
71    fn name(&self) -> &str { "Web Fetch" }
72    fn description(&self) -> &str { "Fetch content from URL, convert HTML to markdown/text/html." }
73    fn parameters(&self) -> Value {
74        json!({"type":"object","properties":{"url":{"type":"string"},"format":{"type":"string","enum":["markdown","text","html"],"default":"markdown"}},"required":["url"]})
75    }
76
77    async fn execute(&self, params: Value) -> Result<ToolResult> {
78        let p: Params = serde_json::from_value(params).context("Invalid params")?;
79        let url = p.url.parse::<reqwest::Url>().context("Invalid URL")?;
80        if url.scheme() != "http" && url.scheme() != "https" {
81            return Ok(ToolResult::error("Only HTTP/HTTPS supported"));
82        }
83        let resp = self.client.get(url).send().await.map_err(|e| anyhow::anyhow!("{}", e))?;
84        if !resp.status().is_success() {
85            return Ok(ToolResult::error(format!("HTTP {}", resp.status())));
86        }
87        let ct = resp.headers().get("content-type").and_then(|v| v.to_str().ok()).unwrap_or("").to_lowercase();
88        let body = resp.text().await.context("Failed to read body")?;
89        let content = match p.format.as_str() {
90            "html" => body,
91            "text" => if self.is_html(&ct, &body) { self.html_to_text(&body) } else { body },
92            _ => if self.is_html(&ct, &body) { self.html_to_markdown(&body) } else { body },
93        };
94        Ok(ToolResult::success(content).with_metadata("url", json!(p.url)))
95    }
96}