Skip to main content

codetether_agent/tool/
webfetch.rs

1//! Web Fetch Tool - Fetches content from URLs and converts HTML to markdown/text/html.
2
3use super::{Tool, ToolResult};
4use anyhow::{Context, Result};
5use async_trait::async_trait;
6use once_cell::sync::Lazy;
7use regex::Regex;
8use serde::Deserialize;
9use serde_json::{Value, json};
10use std::time::Duration;
11
/// Largest response body, in bytes, that `execute` accepts before reporting
/// a "Content too large" error (10 MiB).
// No `#[allow(dead_code)]` here: the constant is used by `execute`, and a
// stale suppression would only hide a real warning if that use ever went away.
const MAX_CONTENT_LENGTH: usize = 10 * 1024 * 1024;
/// Timeout handed to the HTTP client builder in `WebFetchTool::new`.
const REQUEST_TIMEOUT: Duration = Duration::from_secs(30);

/// Value of `max_chars` when the caller does not supply one; also advertised
/// as the default in the tool's parameter schema.
const DEFAULT_MAX_CHARS: usize = 200_000;
17
18#[allow(dead_code)]
19static RE_STRIP_SCRIPT_STYLE: Lazy<Regex> = Lazy::new(|| {
20    // DEPRECATED: kept for backwards compatibility with old builds.
21    // NOTE: The Rust `regex` crate does NOT support backreferences, so we do
22    // not rely on this regex for correctness. See `preprocess_html()`.
23    Regex::new(r"(?is)<script[^>]*>.*?</script>").expect("invalid regex")
24});
25
26static RE_STRIP_HTML_COMMENTS: Lazy<Regex> =
27    Lazy::new(|| Regex::new(r"(?is)<!--.*?-->").expect("invalid regex"));
28
29static RE_STRIP_SCRIPT: Lazy<Regex> =
30    Lazy::new(|| Regex::new(r"(?is)<script[^>]*>.*?</script>").expect("invalid regex"));
31static RE_STRIP_STYLE: Lazy<Regex> =
32    Lazy::new(|| Regex::new(r"(?is)<style[^>]*>.*?</style>").expect("invalid regex"));
33static RE_STRIP_NOSCRIPT: Lazy<Regex> =
34    Lazy::new(|| Regex::new(r"(?is)<noscript[^>]*>.*?</noscript>").expect("invalid regex"));
35static RE_STRIP_SVG: Lazy<Regex> =
36    Lazy::new(|| Regex::new(r"(?is)<svg[^>]*>.*?</svg>").expect("invalid regex"));
37static RE_STRIP_CANVAS: Lazy<Regex> =
38    Lazy::new(|| Regex::new(r"(?is)<canvas[^>]*>.*?</canvas>").expect("invalid regex"));
39static RE_STRIP_IFRAME: Lazy<Regex> =
40    Lazy::new(|| Regex::new(r"(?is)<iframe[^>]*>.*?</iframe>").expect("invalid regex"));
41
42pub struct WebFetchTool {
43    client: reqwest::Client,
44}
45
46impl Default for WebFetchTool {
47    fn default() -> Self {
48        Self::new()
49    }
50}
51
52impl WebFetchTool {
53    pub fn new() -> Self {
54        let client = reqwest::Client::builder()
55            .timeout(REQUEST_TIMEOUT)
56            .user_agent("CodeTether-Agent/1.0")
57            .redirect(reqwest::redirect::Policy::limited(5))
58            .build()
59            .expect("Failed to build HTTP client");
60        Self { client }
61    }
62
63    fn preprocess_html(&self, html: &str) -> String {
64        // Remove the highest-noise/highest-token sections first.
65        // Many docs sites embed large JS bundles and assistant widgets inside
66        // <script> tags; we do not want that in agent context.
67        let mut s = html.to_string();
68
69        // Strip common "noise" blocks that otherwise dominate the output.
70        // (Rust regex has no backreferences, so we do them explicitly.)
71        for re in [
72            &*RE_STRIP_SCRIPT,
73            &*RE_STRIP_STYLE,
74            &*RE_STRIP_NOSCRIPT,
75            &*RE_STRIP_SVG,
76            &*RE_STRIP_CANVAS,
77            &*RE_STRIP_IFRAME,
78        ] {
79            s = re.replace_all(&s, "").to_string();
80        }
81
82        s = RE_STRIP_HTML_COMMENTS.replace_all(&s, "").to_string();
83        s
84    }
85
86    fn html_to_markdown(&self, html: &str) -> String {
87        let html = self.preprocess_html(html);
88        let mut result = html;
89        let patterns = [
90            (r"<h1[^>]*>(.*?)</h1>", "# $1\n"),
91            (r"<h2[^>]*>(.*?)</h2>", "## $1\n"),
92            (r"<h3[^>]*>(.*?)</h3>", "### $1\n"),
93            (r"<p[^>]*>(.*?)</p>", "$1\n\n"),
94            (r"<(strong|b)[^>]*>(.*?)</\1>", "**$2**"),
95            (r"<(em|i)[^>]*>(.*?)</\1>", "*$2*"),
96            (r"<code[^>]*>(.*?)</code>", "`$1`"),
97            (r"<li[^>]*>(.*?)</li>", "- $1\n"),
98        ];
99        for (pat, rep) in patterns {
100            if let Ok(re) = regex::Regex::new(pat) {
101                result = re.replace_all(&result, rep).to_string();
102            }
103        }
104        result = regex::Regex::new(r"<[^>]+>")
105            .unwrap()
106            .replace_all(&result, "")
107            .to_string();
108        result
109            .replace("&nbsp;", " ")
110            .replace("&amp;", "&")
111            .replace("&lt;", "<")
112            .replace("&gt;", ">")
113    }
114
115    fn html_to_text(&self, html: &str) -> String {
116        let md = self.html_to_markdown(html);
117        md.replace("**", "")
118            .replace("*", "")
119            .replace("`", "")
120            .replace("# ", "")
121    }
122
123    fn is_html(&self, ct: &str, body: &str) -> bool {
124        ct.contains("text/html") || body.trim().starts_with('<')
125    }
126}
127
128#[derive(Deserialize)]
129struct Params {
130    url: String,
131    #[serde(default = "default_fmt")]
132    format: String,
133    #[serde(default = "default_max_chars")]
134    max_chars: usize,
135}
/// Serde default for `Params::format`: the `"markdown"` output format.
fn default_fmt() -> String {
    String::from("markdown")
}
139
140fn default_max_chars() -> usize {
141    DEFAULT_MAX_CHARS
142}
143
144#[async_trait]
145impl Tool for WebFetchTool {
146    fn id(&self) -> &str {
147        "webfetch"
148    }
149    fn name(&self) -> &str {
150        "Web Fetch"
151    }
152    fn description(&self) -> &str {
153        "Fetch content from URL, convert HTML to markdown/text/html."
154    }
155    fn parameters(&self) -> Value {
156        json!({
157            "type": "object",
158            "properties": {
159                "url": {"type": "string"},
160                "format": {"type": "string", "enum": ["markdown", "text", "html"], "default": "markdown"},
161                "max_chars": {
162                    "type": "integer",
163                    "minimum": 1000,
164                    "default": DEFAULT_MAX_CHARS,
165                    "description": "Maximum number of characters to return (safety limit to avoid overflowing the model context window)."
166                }
167            },
168            "required": ["url"]
169        })
170    }
171
172    async fn execute(&self, params: Value) -> Result<ToolResult> {
173        let p: Params = serde_json::from_value(params).context("Invalid params")?;
174        let url = p.url.parse::<reqwest::Url>().context("Invalid URL")?;
175        if url.scheme() != "http" && url.scheme() != "https" {
176            return Ok(ToolResult::error("Only HTTP/HTTPS supported"));
177        }
178
179        crate::tls::ensure_rustls_crypto_provider();
180
181        let resp = self
182            .client
183            .get(url)
184            .send()
185            .await
186            .map_err(|e| anyhow::anyhow!("{}", e))?;
187        if !resp.status().is_success() {
188            return Ok(ToolResult::error(format!("HTTP {}", resp.status())));
189        }
190        let ct = resp
191            .headers()
192            .get("content-type")
193            .and_then(|v| v.to_str().ok())
194            .unwrap_or("")
195            .to_lowercase();
196        let bytes = resp.bytes().await.context("Failed to read body")?;
197        if bytes.len() > MAX_CONTENT_LENGTH {
198            return Ok(ToolResult::error(format!(
199                "Content too large ({} bytes > {} max)",
200                bytes.len(),
201                MAX_CONTENT_LENGTH
202            )));
203        }
204        let body = String::from_utf8_lossy(&bytes).to_string();
205        let content = match p.format.as_str() {
206            "html" => body,
207            "text" => {
208                if self.is_html(&ct, &body) {
209                    self.html_to_text(&body)
210                } else {
211                    body
212                }
213            }
214            _ => {
215                if self.is_html(&ct, &body) {
216                    self.html_to_markdown(&body)
217                } else {
218                    body
219                }
220            }
221        };
222
223        let mut out = content;
224        let truncated = if out.chars().count() > p.max_chars {
225            // Keep head + tail so navigation/footer and disclaimers can still be spotted.
226            let head_chars = (p.max_chars as f64 * 0.70) as usize;
227            let tail_chars = (p.max_chars as f64 * 0.20) as usize;
228
229            let head: String = out.chars().take(head_chars).collect();
230            let tail: String = out
231                .chars()
232                .rev()
233                .take(tail_chars)
234                .collect::<String>()
235                .chars()
236                .rev()
237                .collect();
238
239            let total_chars = out.chars().count();
240            out = format!(
241                "{}\n\n[... truncated {} chars (max_chars={}) ...]\n\n{}",
242                head,
243                total_chars.saturating_sub(head_chars + tail_chars),
244                p.max_chars,
245                tail
246            );
247            true
248        } else {
249            false
250        };
251
252        Ok(ToolResult::success(out)
253            .with_metadata("url", json!(p.url))
254            .with_metadata("format", json!(p.format))
255            .with_metadata("truncated", json!(truncated))
256            .with_metadata("max_chars", json!(p.max_chars)))
257    }
258}
259
#[cfg(test)]
mod tests {
    use super::*;

    /// Inline `<script>` bodies must not leak into the markdown output, while
    /// the surrounding heading and paragraph text must survive.
    #[test]
    fn html_to_markdown_strips_script_content() {
        let page = r#"<html><head><title>x</title></head><body>
<h1>Hello</h1>
<script>window.__assistant_state = { open: true }; function big(){ return 1; }</script>
<p>World</p>
</body></html>"#;

        let rendered = WebFetchTool::new().html_to_markdown(page);

        for kept in ["Hello", "World"] {
            assert!(rendered.contains(kept));
        }
        for dropped in ["__assistant_state", "function big"] {
            assert!(!rendered.contains(dropped));
        }
    }
}