1use async_trait::async_trait;
2use limit_agent::error::AgentError;
3use limit_agent::Tool;
4use regex::Regex;
5use reqwest::Client;
6use serde_json::Value;
7use std::time::Duration;
8
9pub struct WebFetchTool {
11 client: Client,
12}
13
14impl WebFetchTool {
15 pub fn new() -> Self {
16 Self {
17 client: Client::builder()
18 .timeout(Duration::from_secs(30))
19 .user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
20 .build()
21 .unwrap_or_else(|_| Client::new()),
22 }
23 }
24
25 const MAX_SIZE: usize = 5 * 1024 * 1024; }
27
28impl Default for WebFetchTool {
29 fn default() -> Self {
30 Self::new()
31 }
32}
33
34#[async_trait]
35impl Tool for WebFetchTool {
36 fn name(&self) -> &str {
37 "web_fetch"
38 }
39
40 async fn execute(&self, args: Value) -> Result<Value, AgentError> {
41 let url = args
42 .get("url")
43 .and_then(|v| v.as_str())
44 .ok_or_else(|| AgentError::ToolError("Missing 'url' argument".to_string()))?;
45
46 let format = args
47 .get("format")
48 .and_then(|v| v.as_str())
49 .unwrap_or("markdown");
50
51 if !url.starts_with("http://") && !url.starts_with("https://") {
53 return Err(AgentError::ToolError(
54 "URL must start with http:// or https://".to_string(),
55 ));
56 }
57
58 let response = self
60 .client
61 .get(url)
62 .header(
63 "Accept",
64 "text/html,application/xhtml+xml,text/markdown,text/plain,*/*;q=0.8",
65 )
66 .send()
67 .await
68 .map_err(|e| AgentError::ToolError(format!("Request failed: {}", e)))?;
69
70 if !response.status().is_success() {
72 return Err(AgentError::ToolError(format!(
73 "HTTP error: {}",
74 response.status()
75 )));
76 }
77
78 if let Some(content_length) = response.headers().get("content-length") {
80 if let Ok(length_str) = content_length.to_str() {
81 if let Ok(length) = length_str.parse::<usize>() {
82 if length > Self::MAX_SIZE {
83 return Err(AgentError::ToolError(format!(
84 "Response too large: {} bytes (max: {})",
85 length,
86 Self::MAX_SIZE
87 )));
88 }
89 }
90 }
91 }
92
93 let content_type = response
95 .headers()
96 .get("content-type")
97 .and_then(|v| v.to_str().ok())
98 .unwrap_or("text/plain")
99 .to_string();
100
101 let body = response
103 .text()
104 .await
105 .map_err(|e| AgentError::ToolError(format!("Failed to read response: {}", e)))?;
106
107 if body.len() > Self::MAX_SIZE {
109 return Err(AgentError::ToolError(format!(
110 "Response too large: {} bytes (max: {})",
111 body.len(),
112 Self::MAX_SIZE
113 )));
114 }
115
116 let output = if content_type.contains("text/html") {
118 match format {
119 "markdown" => html_to_markdown(&body),
120 "text" => html_to_text(&body),
121 "html" => body,
122 _ => html_to_markdown(&body),
123 }
124 } else {
125 body
126 };
127
128 Ok(serde_json::json!({
129 "url": url,
130 "content_type": content_type,
131 "format": format,
132 "content": output
133 }))
134 }
135}
136
137fn html_to_markdown(html: &str) -> String {
139 let mut text = html.to_string();
140
141 let remove_patterns = [
143 r"<script[^>]*>.*?</script>",
144 r"<style[^>]*>.*?</style>",
145 r"<nav[^>]*>.*?</nav>",
146 r"<footer[^>]*>.*?</footer>",
147 r"<header[^>]*>.*?</header>",
148 r"<!--.*?-->",
149 ];
150
151 for pattern in &remove_patterns {
152 if let Ok(re) = Regex::new(pattern) {
153 text = re.replace_all(&text, "").to_string();
154 }
155 }
156
157 for i in 1..=6 {
159 if let Ok(re) = Regex::new(&format!(r"<h{0}[^>]*>(.*?)</h{0}>", i)) {
160 text = re
161 .replace_all(&text, |caps: ®ex::Captures| {
162 format!("{} {}\n\n", "#".repeat(i), &caps[1])
163 })
164 .to_string();
165 }
166 }
167
168 if let Ok(re) = Regex::new(r#"<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>"#) {
170 text = re
171 .replace_all(&text, |caps: ®ex::Captures| {
172 format!("[{}]({})", &caps[2], &caps[1])
173 })
174 .to_string();
175 }
176
177 if let Ok(re) = Regex::new(r"<p[^>]*>(.*?)</p>") {
179 text = re
180 .replace_all(&text, |caps: ®ex::Captures| format!("{}\n\n", &caps[1]))
181 .to_string();
182 }
183
184 if let Ok(re) = Regex::new(r"<br\s*/?>") {
186 text = re.replace_all(&text, "\n").to_string();
187 }
188
189 if let Ok(re) = Regex::new(r"<pre[^>]*><code[^>]*>(.*?)</code></pre>") {
191 text = re
192 .replace_all(&text, |caps: ®ex::Captures| {
193 format!("```\n{}\n```\n\n", &caps[1])
194 })
195 .to_string();
196 }
197
198 if let Ok(re) = Regex::new(r"<code[^>]*>(.*?)</code>") {
200 text = re
201 .replace_all(&text, |caps: ®ex::Captures| format!("`{}`", &caps[1]))
202 .to_string();
203 }
204
205 if let Ok(re) = Regex::new(r"<strong[^>]*>(.*?)</strong>") {
207 text = re
208 .replace_all(&text, |caps: ®ex::Captures| format!("**{}**", &caps[1]))
209 .to_string();
210 }
211 if let Ok(re) = Regex::new(r"<b[^>]*>(.*?)</b>") {
212 text = re
213 .replace_all(&text, |caps: ®ex::Captures| format!("**{}**", &caps[1]))
214 .to_string();
215 }
216
217 if let Ok(re) = Regex::new(r"<em[^>]*>(.*?)</em>") {
219 text = re
220 .replace_all(&text, |caps: ®ex::Captures| format!("*{}*", &caps[1]))
221 .to_string();
222 }
223 if let Ok(re) = Regex::new(r"<i[^>]*>(.*?)</i>") {
224 text = re
225 .replace_all(&text, |caps: ®ex::Captures| format!("*{}*", &caps[1]))
226 .to_string();
227 }
228
229 if let Ok(re) = Regex::new(r"<li[^>]*>(.*?)</li>") {
231 text = re
232 .replace_all(&text, |caps: ®ex::Captures| format!("- {}\n", &caps[1]))
233 .to_string();
234 }
235
236 if let Ok(re) = Regex::new(r"<[^>]+>") {
238 text = re.replace_all(&text, "").to_string();
239 }
240
241 text = text
243 .replace(" ", " ")
244 .replace("&", "&")
245 .replace("<", "<")
246 .replace(">", ">")
247 .replace(""", "\"")
248 .replace("'", "'");
249
250 clean_whitespace(&text)
252}
253
254fn html_to_text(html: &str) -> String {
256 let mut text = html.to_string();
257
258 let remove_patterns = [
260 r"<script[^>]*>.*?</script>",
261 r"<style[^>]*>.*?</style>",
262 r"<nav[^>]*>.*?</nav>",
263 r"<footer[^>]*>.*?</footer>",
264 r"<header[^>]*>.*?</header>",
265 r"<!--.*?-->",
266 ];
267
268 for pattern in &remove_patterns {
269 if let Ok(re) = Regex::new(pattern) {
270 text = re.replace_all(&text, "").to_string();
271 }
272 }
273
274 let block_patterns = [r"</p>", r"</div>", r"</h[1-6]>", r"</li>", r"<br\s*/?>"];
276 for pattern in &block_patterns {
277 if let Ok(re) = Regex::new(pattern) {
278 text = re.replace_all(&text, "\n").to_string();
279 }
280 }
281
282 if let Ok(re) = Regex::new(r"<[^>]+>") {
284 text = re.replace_all(&text, "").to_string();
285 }
286
287 text = text
289 .replace(" ", " ")
290 .replace("&", "&")
291 .replace("<", "<")
292 .replace(">", ">")
293 .replace(""", "\"")
294 .replace("'", "'");
295
296 clean_whitespace(&text)
297}
298
/// Normalises whitespace in converted text.
///
/// * Every run of two or more ASCII spaces becomes a single space.
/// * Every run of three or more `\n` becomes exactly two (one blank line).
/// * Leading and trailing whitespace is trimmed.
///
/// Implemented as a single pass over the input so that no regular expression
/// has to be compiled (and unwrapped) on every call.
fn clean_whitespace(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    // Lengths of the space / newline run that ends at the current character.
    let mut space_run = 0usize;
    let mut newline_run = 0usize;

    for ch in text.chars() {
        match ch {
            ' ' => {
                newline_run = 0;
                space_run += 1;
                // Keep only the first space of a run.
                if space_run == 1 {
                    out.push(' ');
                }
            }
            '\n' => {
                space_run = 0;
                newline_run += 1;
                // Keep at most two consecutive newlines.
                if newline_run <= 2 {
                    out.push('\n');
                }
            }
            other => {
                space_run = 0;
                newline_run = 0;
                out.push(other);
            }
        }
    }

    out.trim().to_string()
}
311
#[cfg(test)]
mod tests {
    use super::*;

    /// The tool advertises itself under the expected name.
    #[test]
    fn test_web_fetch_tool_name() {
        let tool = WebFetchTool::new();
        assert_eq!(tool.name(), "web_fetch");
    }

    /// `Default` must yield a usable tool, so go through `default()` rather
    /// than `new()`.
    #[test]
    fn test_web_fetch_tool_default() {
        let tool = WebFetchTool::default();
        assert_eq!(tool.name(), "web_fetch");
    }

    /// A call without `url` is rejected before any network access.
    #[tokio::test]
    async fn test_web_fetch_missing_url() {
        let tool = WebFetchTool::new();
        let args = serde_json::json!({});

        let result = tool.execute(args).await;
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("Missing 'url'"));
    }

    /// Only http:// and https:// URLs are accepted.
    #[tokio::test]
    async fn test_web_fetch_invalid_url() {
        let tool = WebFetchTool::new();
        let args = serde_json::json!({
            "url": "ftp://example.com"
        });

        let result = tool.execute(args).await;
        assert!(result.is_err());
        assert!(result
            .unwrap_err()
            .to_string()
            .contains("http:// or https://"));
    }

    /// Headings and bold text are rewritten to Markdown.
    #[test]
    fn test_html_to_markdown() {
        let html = r#"<h1>Title</h1><p>This is <strong>bold</strong> text.</p>"#;
        let markdown = html_to_markdown(html);
        assert!(markdown.contains("# Title"));
        assert!(markdown.contains("**bold**"));
    }

    /// Paragraph content survives tag stripping.
    #[test]
    fn test_html_to_text() {
        let html = r#"<p>Hello</p><p>World</p>"#;
        let text = html_to_text(html);
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
    }

    /// Runs of spaces collapse to one space and runs of newlines to one
    /// blank line.
    #[test]
    fn test_clean_whitespace() {
        let text = "Hello    World\n\n\n\nTest";
        let cleaned = clean_whitespace(text);
        assert!(!cleaned.contains("  "));
        assert!(!cleaned.contains("\n\n\n"));
        assert_eq!(cleaned, "Hello World\n\nTest");
    }
}