codetether_agent/tool/
webfetch.rs1use super::{Tool, ToolResult};
4use anyhow::{Context, Result};
5use async_trait::async_trait;
6use once_cell::sync::Lazy;
7use regex::Regex;
8use serde::Deserialize;
9use serde_json::{Value, json};
10use std::time::Duration;
11
/// Largest response body, in bytes, that the tool accepts (10 MiB).
///
/// `execute` compares the downloaded body against this limit, so the constant
/// is live and needs no `dead_code` suppression.
const MAX_CONTENT_LENGTH: usize = 10 * 1024 * 1024;
/// Timeout handed to the HTTP client builder in `WebFetchTool::new`.
const REQUEST_TIMEOUT: Duration = Duration::from_secs(30);

/// Value used for the `max_chars` parameter when the caller omits it.
const DEFAULT_MAX_CHARS: usize = 200_000;
17
18#[allow(dead_code)]
19static RE_STRIP_SCRIPT_STYLE: Lazy<Regex> = Lazy::new(|| {
20 Regex::new(r"(?is)<script[^>]*>.*?</script>").expect("invalid regex")
24});
25
26static RE_STRIP_HTML_COMMENTS: Lazy<Regex> =
27 Lazy::new(|| Regex::new(r"(?is)<!--.*?-->").expect("invalid regex"));
28
29static RE_STRIP_SCRIPT: Lazy<Regex> =
30 Lazy::new(|| Regex::new(r"(?is)<script[^>]*>.*?</script>").expect("invalid regex"));
31static RE_STRIP_STYLE: Lazy<Regex> =
32 Lazy::new(|| Regex::new(r"(?is)<style[^>]*>.*?</style>").expect("invalid regex"));
33static RE_STRIP_NOSCRIPT: Lazy<Regex> =
34 Lazy::new(|| Regex::new(r"(?is)<noscript[^>]*>.*?</noscript>").expect("invalid regex"));
35static RE_STRIP_SVG: Lazy<Regex> =
36 Lazy::new(|| Regex::new(r"(?is)<svg[^>]*>.*?</svg>").expect("invalid regex"));
37static RE_STRIP_CANVAS: Lazy<Regex> =
38 Lazy::new(|| Regex::new(r"(?is)<canvas[^>]*>.*?</canvas>").expect("invalid regex"));
39static RE_STRIP_IFRAME: Lazy<Regex> =
40 Lazy::new(|| Regex::new(r"(?is)<iframe[^>]*>.*?</iframe>").expect("invalid regex"));
41
42pub struct WebFetchTool {
43 client: reqwest::Client,
44}
45
46impl Default for WebFetchTool {
47 fn default() -> Self {
48 Self::new()
49 }
50}
51
52impl WebFetchTool {
53 pub fn new() -> Self {
54 let client = reqwest::Client::builder()
55 .timeout(REQUEST_TIMEOUT)
56 .user_agent("CodeTether-Agent/1.0")
57 .redirect(reqwest::redirect::Policy::limited(5))
58 .build()
59 .expect("Failed to build HTTP client");
60 Self { client }
61 }
62
63 fn preprocess_html(&self, html: &str) -> String {
64 let mut s = html.to_string();
68
69 for re in [
72 &*RE_STRIP_SCRIPT,
73 &*RE_STRIP_STYLE,
74 &*RE_STRIP_NOSCRIPT,
75 &*RE_STRIP_SVG,
76 &*RE_STRIP_CANVAS,
77 &*RE_STRIP_IFRAME,
78 ] {
79 s = re.replace_all(&s, "").to_string();
80 }
81
82 s = RE_STRIP_HTML_COMMENTS.replace_all(&s, "").to_string();
83 s
84 }
85
86 fn html_to_markdown(&self, html: &str) -> String {
87 let html = self.preprocess_html(html);
88 let mut result = html;
89 let patterns = [
90 (r"<h1[^>]*>(.*?)</h1>", "# $1\n"),
91 (r"<h2[^>]*>(.*?)</h2>", "## $1\n"),
92 (r"<h3[^>]*>(.*?)</h3>", "### $1\n"),
93 (r"<p[^>]*>(.*?)</p>", "$1\n\n"),
94 (r"<(strong|b)[^>]*>(.*?)</\1>", "**$2**"),
95 (r"<(em|i)[^>]*>(.*?)</\1>", "*$2*"),
96 (r"<code[^>]*>(.*?)</code>", "`$1`"),
97 (r"<li[^>]*>(.*?)</li>", "- $1\n"),
98 ];
99 for (pat, rep) in patterns {
100 if let Ok(re) = regex::Regex::new(pat) {
101 result = re.replace_all(&result, rep).to_string();
102 }
103 }
104 result = regex::Regex::new(r"<[^>]+>")
105 .unwrap()
106 .replace_all(&result, "")
107 .to_string();
108 result
109 .replace(" ", " ")
110 .replace("&", "&")
111 .replace("<", "<")
112 .replace(">", ">")
113 }
114
115 fn html_to_text(&self, html: &str) -> String {
116 let md = self.html_to_markdown(html);
117 md.replace("**", "")
118 .replace("*", "")
119 .replace("`", "")
120 .replace("# ", "")
121 }
122
123 fn is_html(&self, ct: &str, body: &str) -> bool {
124 ct.contains("text/html") || body.trim().starts_with('<')
125 }
126}
127
128#[derive(Deserialize)]
129struct Params {
130 url: String,
131 #[serde(default = "default_fmt")]
132 format: String,
133 #[serde(default = "default_max_chars")]
134 max_chars: usize,
135}
/// Serde default for `Params::format`: the string `"markdown"`.
fn default_fmt() -> String {
    String::from("markdown")
}
139
140fn default_max_chars() -> usize {
141 DEFAULT_MAX_CHARS
142}
143
144#[async_trait]
145impl Tool for WebFetchTool {
146 fn id(&self) -> &str {
147 "webfetch"
148 }
149 fn name(&self) -> &str {
150 "Web Fetch"
151 }
152 fn description(&self) -> &str {
153 "Fetch content from URL, convert HTML to markdown/text/html."
154 }
155 fn parameters(&self) -> Value {
156 json!({
157 "type": "object",
158 "properties": {
159 "url": {"type": "string"},
160 "format": {"type": "string", "enum": ["markdown", "text", "html"], "default": "markdown"},
161 "max_chars": {
162 "type": "integer",
163 "minimum": 1000,
164 "default": DEFAULT_MAX_CHARS,
165 "description": "Maximum number of characters to return (safety limit to avoid overflowing the model context window)."
166 }
167 },
168 "required": ["url"]
169 })
170 }
171
172 async fn execute(&self, params: Value) -> Result<ToolResult> {
173 let p: Params = serde_json::from_value(params).context("Invalid params")?;
174 let url = p.url.parse::<reqwest::Url>().context("Invalid URL")?;
175 if url.scheme() != "http" && url.scheme() != "https" {
176 return Ok(ToolResult::error("Only HTTP/HTTPS supported"));
177 }
178
179 crate::tls::ensure_rustls_crypto_provider();
180
181 let resp = self
182 .client
183 .get(url)
184 .send()
185 .await
186 .map_err(|e| anyhow::anyhow!("{}", e))?;
187 if !resp.status().is_success() {
188 return Ok(ToolResult::error(format!("HTTP {}", resp.status())));
189 }
190 let ct = resp
191 .headers()
192 .get("content-type")
193 .and_then(|v| v.to_str().ok())
194 .unwrap_or("")
195 .to_lowercase();
196 let bytes = resp.bytes().await.context("Failed to read body")?;
197 if bytes.len() > MAX_CONTENT_LENGTH {
198 return Ok(ToolResult::error(format!(
199 "Content too large ({} bytes > {} max)",
200 bytes.len(),
201 MAX_CONTENT_LENGTH
202 )));
203 }
204 let body = String::from_utf8_lossy(&bytes).to_string();
205 let content = match p.format.as_str() {
206 "html" => body,
207 "text" => {
208 if self.is_html(&ct, &body) {
209 self.html_to_text(&body)
210 } else {
211 body
212 }
213 }
214 _ => {
215 if self.is_html(&ct, &body) {
216 self.html_to_markdown(&body)
217 } else {
218 body
219 }
220 }
221 };
222
223 let mut out = content;
224 let truncated = if out.chars().count() > p.max_chars {
225 let head_chars = (p.max_chars as f64 * 0.70) as usize;
227 let tail_chars = (p.max_chars as f64 * 0.20) as usize;
228
229 let head: String = out.chars().take(head_chars).collect();
230 let tail: String = out
231 .chars()
232 .rev()
233 .take(tail_chars)
234 .collect::<String>()
235 .chars()
236 .rev()
237 .collect();
238
239 let total_chars = out.chars().count();
240 out = format!(
241 "{}\n\n[... truncated {} chars (max_chars={}) ...]\n\n{}",
242 head,
243 total_chars.saturating_sub(head_chars + tail_chars),
244 p.max_chars,
245 tail
246 );
247 true
248 } else {
249 false
250 };
251
252 Ok(ToolResult::success(out)
253 .with_metadata("url", json!(p.url))
254 .with_metadata("format", json!(p.format))
255 .with_metadata("truncated", json!(truncated))
256 .with_metadata("max_chars", json!(p.max_chars)))
257 }
258}
259
#[cfg(test)]
mod tests {
    use super::*;

    /// Heading and paragraph text must survive the markdown conversion while
    /// the body of a `<script>` element must not appear in the output.
    #[test]
    fn html_to_markdown_strips_script_content() {
        let page = r#"<html><head><title>x</title></head><body>
<h1>Hello</h1>
<script>window.__assistant_state = { open: true }; function big(){ return 1; }</script>
<p>World</p>
</body></html>"#;

        let rendered = WebFetchTool::new().html_to_markdown(page);

        for kept in ["Hello", "World"] {
            assert!(rendered.contains(kept));
        }
        for dropped in ["__assistant_state", "function big"] {
            assert!(!rendered.contains(dropped));
        }
    }
}