Skip to main content

limit_cli/tools/
web_fetch.rs

1use async_trait::async_trait;
2use limit_agent::error::AgentError;
3use limit_agent::Tool;
4use regex::Regex;
5use reqwest::Client;
6use serde_json::Value;
7use std::time::Duration;
8
9/// Web fetch tool for retrieving and parsing web content
10pub struct WebFetchTool {
11    client: Client,
12}
13
14impl WebFetchTool {
15    pub fn new() -> Self {
16        Self {
17            client: Client::builder()
18                .timeout(Duration::from_secs(30))
19                .user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
20                .build()
21                .unwrap_or_else(|_| Client::new()),
22        }
23    }
24
25    const MAX_SIZE: usize = 5 * 1024 * 1024; // 5MB
26}
27
28impl Default for WebFetchTool {
29    fn default() -> Self {
30        Self::new()
31    }
32}
33
34#[async_trait]
35impl Tool for WebFetchTool {
36    fn name(&self) -> &str {
37        "web_fetch"
38    }
39
40    async fn execute(&self, args: Value) -> Result<Value, AgentError> {
41        let url = args
42            .get("url")
43            .and_then(|v| v.as_str())
44            .ok_or_else(|| AgentError::ToolError("Missing 'url' argument".to_string()))?;
45
46        let format = args
47            .get("format")
48            .and_then(|v| v.as_str())
49            .unwrap_or("markdown");
50
51        // Validate URL
52        if !url.starts_with("http://") && !url.starts_with("https://") {
53            return Err(AgentError::ToolError(
54                "URL must start with http:// or https://".to_string(),
55            ));
56        }
57
58        // Fetch the URL
59        let response = self
60            .client
61            .get(url)
62            .header(
63                "Accept",
64                "text/html,application/xhtml+xml,text/markdown,text/plain,*/*;q=0.8",
65            )
66            .send()
67            .await
68            .map_err(|e| AgentError::ToolError(format!("Request failed: {}", e)))?;
69
70        // Check status
71        if !response.status().is_success() {
72            return Err(AgentError::ToolError(format!(
73                "HTTP error: {}",
74                response.status()
75            )));
76        }
77
78        // Check content length
79        if let Some(content_length) = response.headers().get("content-length") {
80            if let Ok(length_str) = content_length.to_str() {
81                if let Ok(length) = length_str.parse::<usize>() {
82                    if length > Self::MAX_SIZE {
83                        return Err(AgentError::ToolError(format!(
84                            "Response too large: {} bytes (max: {})",
85                            length,
86                            Self::MAX_SIZE
87                        )));
88                    }
89                }
90            }
91        }
92
93        // Get content type
94        let content_type = response
95            .headers()
96            .get("content-type")
97            .and_then(|v| v.to_str().ok())
98            .unwrap_or("text/plain")
99            .to_string();
100
101        // Get body
102        let body = response
103            .text()
104            .await
105            .map_err(|e| AgentError::ToolError(format!("Failed to read response: {}", e)))?;
106
107        // Check actual size
108        if body.len() > Self::MAX_SIZE {
109            return Err(AgentError::ToolError(format!(
110                "Response too large: {} bytes (max: {})",
111                body.len(),
112                Self::MAX_SIZE
113            )));
114        }
115
116        // Process based on format and content type
117        let output = if content_type.contains("text/html") {
118            match format {
119                "markdown" => html_to_markdown(&body),
120                "text" => html_to_text(&body),
121                "html" => body,
122                _ => html_to_markdown(&body),
123            }
124        } else {
125            body
126        };
127
128        Ok(serde_json::json!({
129            "url": url,
130            "content_type": content_type,
131            "format": format,
132            "content": output
133        }))
134    }
135}
136
137/// Convert HTML to Markdown (simple implementation)
138fn html_to_markdown(html: &str) -> String {
139    let mut text = html.to_string();
140
141    // Remove script, style, nav, footer, header tags
142    let remove_patterns = [
143        r"<script[^>]*>.*?</script>",
144        r"<style[^>]*>.*?</style>",
145        r"<nav[^>]*>.*?</nav>",
146        r"<footer[^>]*>.*?</footer>",
147        r"<header[^>]*>.*?</header>",
148        r"<!--.*?-->",
149    ];
150
151    for pattern in &remove_patterns {
152        if let Ok(re) = Regex::new(pattern) {
153            text = re.replace_all(&text, "").to_string();
154        }
155    }
156
157    // Convert headings
158    for i in 1..=6 {
159        if let Ok(re) = Regex::new(&format!(r"<h{0}[^>]*>(.*?)</h{0}>", i)) {
160            text = re
161                .replace_all(&text, |caps: &regex::Captures| {
162                    format!("{} {}\n\n", "#".repeat(i), &caps[1])
163                })
164                .to_string();
165        }
166    }
167
168    // Convert links
169    if let Ok(re) = Regex::new(r#"<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>"#) {
170        text = re
171            .replace_all(&text, |caps: &regex::Captures| {
172                format!("[{}]({})", &caps[2], &caps[1])
173            })
174            .to_string();
175    }
176
177    // Convert paragraphs
178    if let Ok(re) = Regex::new(r"<p[^>]*>(.*?)</p>") {
179        text = re
180            .replace_all(&text, |caps: &regex::Captures| format!("{}\n\n", &caps[1]))
181            .to_string();
182    }
183
184    // Convert line breaks
185    if let Ok(re) = Regex::new(r"<br\s*/?>") {
186        text = re.replace_all(&text, "\n").to_string();
187    }
188
189    // Convert code blocks
190    if let Ok(re) = Regex::new(r"<pre[^>]*><code[^>]*>(.*?)</code></pre>") {
191        text = re
192            .replace_all(&text, |caps: &regex::Captures| {
193                format!("```\n{}\n```\n\n", &caps[1])
194            })
195            .to_string();
196    }
197
198    // Convert inline code
199    if let Ok(re) = Regex::new(r"<code[^>]*>(.*?)</code>") {
200        text = re
201            .replace_all(&text, |caps: &regex::Captures| format!("`{}`", &caps[1]))
202            .to_string();
203    }
204
205    // Convert strong/bold
206    if let Ok(re) = Regex::new(r"<strong[^>]*>(.*?)</strong>") {
207        text = re
208            .replace_all(&text, |caps: &regex::Captures| format!("**{}**", &caps[1]))
209            .to_string();
210    }
211    if let Ok(re) = Regex::new(r"<b[^>]*>(.*?)</b>") {
212        text = re
213            .replace_all(&text, |caps: &regex::Captures| format!("**{}**", &caps[1]))
214            .to_string();
215    }
216
217    // Convert em/italic
218    if let Ok(re) = Regex::new(r"<em[^>]*>(.*?)</em>") {
219        text = re
220            .replace_all(&text, |caps: &regex::Captures| format!("*{}*", &caps[1]))
221            .to_string();
222    }
223    if let Ok(re) = Regex::new(r"<i[^>]*>(.*?)</i>") {
224        text = re
225            .replace_all(&text, |caps: &regex::Captures| format!("*{}*", &caps[1]))
226            .to_string();
227    }
228
229    // Convert lists
230    if let Ok(re) = Regex::new(r"<li[^>]*>(.*?)</li>") {
231        text = re
232            .replace_all(&text, |caps: &regex::Captures| format!("- {}\n", &caps[1]))
233            .to_string();
234    }
235
236    // Remove remaining HTML tags
237    if let Ok(re) = Regex::new(r"<[^>]+>") {
238        text = re.replace_all(&text, "").to_string();
239    }
240
241    // Decode HTML entities
242    text = text
243        .replace("&nbsp;", " ")
244        .replace("&amp;", "&")
245        .replace("&lt;", "<")
246        .replace("&gt;", ">")
247        .replace("&quot;", "\"")
248        .replace("&#39;", "'");
249
250    // Clean up whitespace
251    clean_whitespace(&text)
252}
253
254/// Convert HTML to plain text
255fn html_to_text(html: &str) -> String {
256    let mut text = html.to_string();
257
258    // Remove script, style, nav, footer, header tags
259    let remove_patterns = [
260        r"<script[^>]*>.*?</script>",
261        r"<style[^>]*>.*?</style>",
262        r"<nav[^>]*>.*?</nav>",
263        r"<footer[^>]*>.*?</footer>",
264        r"<header[^>]*>.*?</header>",
265        r"<!--.*?-->",
266    ];
267
268    for pattern in &remove_patterns {
269        if let Ok(re) = Regex::new(pattern) {
270            text = re.replace_all(&text, "").to_string();
271        }
272    }
273
274    // Convert block elements to newlines
275    let block_patterns = [r"</p>", r"</div>", r"</h[1-6]>", r"</li>", r"<br\s*/?>"];
276    for pattern in &block_patterns {
277        if let Ok(re) = Regex::new(pattern) {
278            text = re.replace_all(&text, "\n").to_string();
279        }
280    }
281
282    // Remove remaining HTML tags
283    if let Ok(re) = Regex::new(r"<[^>]+>") {
284        text = re.replace_all(&text, "").to_string();
285    }
286
287    // Decode HTML entities
288    text = text
289        .replace("&nbsp;", " ")
290        .replace("&amp;", "&")
291        .replace("&lt;", "<")
292        .replace("&gt;", ">")
293        .replace("&quot;", "\"")
294        .replace("&#39;", "'");
295
296    clean_whitespace(&text)
297}
298
/// Clean up whitespace in text.
///
/// - Every run of two or more spaces becomes a single space.
/// - Every run of three or more newlines becomes exactly two newlines.
/// - Leading and trailing whitespace is trimmed.
///
/// Only the space character and `\n` are collapsed; tabs and other
/// whitespace inside the text are left as they are. The work is done in a
/// single pass over the input, with no regex compilation and no panic path.
fn clean_whitespace(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    // True while the previous input character was a space.
    let mut in_space_run = false;
    // Number of consecutive newlines seen so far in the current run.
    let mut newline_run = 0usize;

    for ch in text.chars() {
        match ch {
            ' ' => {
                // Keep only the first space of a run.
                if !in_space_run {
                    cleaned.push(' ');
                }
                in_space_run = true;
                newline_run = 0;
            }
            '\n' => {
                // Keep at most two newlines of a run.
                newline_run += 1;
                if newline_run <= 2 {
                    cleaned.push('\n');
                }
                in_space_run = false;
            }
            other => {
                cleaned.push(other);
                in_space_run = false;
                newline_run = 0;
            }
        }
    }

    cleaned.trim().to_string()
}
311
#[cfg(test)]
mod tests {
    use super::*;

    /// The tool reports its fixed name.
    #[test]
    fn test_web_fetch_tool_name() {
        let tool = WebFetchTool::new();
        assert_eq!(tool.name(), "web_fetch");
    }

    /// A tool built through `Default` behaves like one built with `new`.
    #[test]
    fn test_web_fetch_tool_default() {
        // Construct through `Default` so the trait impl is actually covered.
        let tool = WebFetchTool::default();
        assert_eq!(tool.name(), "web_fetch");
    }

    /// Calling `execute` without a `url` argument is rejected.
    #[tokio::test]
    async fn test_web_fetch_missing_url() {
        let tool = WebFetchTool::new();
        let args = serde_json::json!({});

        let result = tool.execute(args).await;
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("Missing 'url'"));
    }

    /// URLs with a scheme other than http/https are rejected before any
    /// request is made.
    #[tokio::test]
    async fn test_web_fetch_invalid_url() {
        let tool = WebFetchTool::new();
        let args = serde_json::json!({
            "url": "ftp://example.com"
        });

        let result = tool.execute(args).await;
        assert!(result.is_err());
        assert!(result
            .unwrap_err()
            .to_string()
            .contains("http:// or https://"));
    }

    /// Headings and bold text are converted to their Markdown form.
    #[test]
    fn test_html_to_markdown() {
        let html = r#"<h1>Title</h1><p>This is <strong>bold</strong> text.</p>"#;
        let markdown = html_to_markdown(html);
        assert!(markdown.contains("# Title"));
        assert!(markdown.contains("**bold**"));
    }

    /// Paragraph content survives the conversion to plain text.
    #[test]
    fn test_html_to_text() {
        let html = r#"<p>Hello</p><p>World</p>"#;
        let text = html_to_text(html);
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
    }

    /// Runs of spaces and newlines are collapsed.
    #[test]
    fn test_clean_whitespace() {
        let text = "Hello   World\n\n\n\nTest";
        let cleaned = clean_whitespace(text);
        assert!(!cleaned.contains("   "));
        assert!(!cleaned.contains("\n\n\n"));
    }
}