syncable_cli/agent/tools/fetch.rs

//! Web fetch tool for retrieving online content
//!
//! Provides the agent with the ability to fetch content from URLs and convert
//! HTML to readable markdown. Inspired by Forge's NetFetch tool.
//!
//! Features:
//! - Fetches HTTP/HTTPS URLs
//! - Converts HTML to markdown for readability
//! - Respects robots.txt (basic check)
//! - Truncates large responses to prevent context overflow
//! - Returns raw content when requested
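//!
//! A minimal invocation sketch (in real use the tool is registered with a rig
//! agent; calling it directly as below is illustrative only):
//!
//! ```ignore
//! let tool = WebFetchTool::new();
//! let args = WebFetchArgs { url: "https://example.com".to_string(), raw: None };
//! let json_output = tool.call(args).await?;
//! ```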

use reqwest::{Client, Url};
use rig::completion::ToolDefinition;
use rig::tool::Tool;
use serde::{Deserialize, Serialize};
use serde_json::json;

/// Maximum content length to return, in bytes (truncation backs up to the
/// nearest char boundary)
const MAX_CONTENT_LENGTH: usize = 40_000;

// ============================================================================
// Web Fetch Tool
// ============================================================================

#[derive(Debug, Deserialize)]
pub struct WebFetchArgs {
    /// URL to fetch
    pub url: String,
    /// If true, return raw content without markdown conversion (default: false)
    pub raw: Option<bool>,
}
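
// Arguments arrive from the model as JSON, e.g.:
//   {"url": "https://docs.rs/reqwest", "raw": false}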

#[derive(Debug, thiserror::Error)]
#[error("Web fetch error: {0}")]
pub struct WebFetchError(String);

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WebFetchTool {
    #[serde(skip)]
    client: Option<Client>,
}

impl Default for WebFetchTool {
    fn default() -> Self {
        Self::new()
    }
}

impl WebFetchTool {
    pub fn new() -> Self {
        Self {
            client: Some(
                Client::builder()
                    .user_agent("Mozilla/5.0 (compatible; SyncableCLI/0.1; +https://syncable.dev)")
                    .timeout(std::time::Duration::from_secs(30))
                    .build()
                    .unwrap_or_default(),
            ),
        }
    }

    fn client(&self) -> Client {
        self.client.clone().unwrap_or_default()
    }

    /// Check robots.txt for disallowed paths (basic check)
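    ///
    /// Deliberately naive: every `Disallow:` line is applied regardless of
    /// its `User-agent:` section. For example, given `Disallow: /private`,
    /// a fetch of `https://example.com/private/page` is rejected while
    /// `https://example.com/docs` is allowed.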
    async fn check_robots_txt(&self, url: &Url) -> Result<(), WebFetchError> {
        let robots_url = format!("{}://{}/robots.txt", url.scheme(), url.authority());

        // Try to fetch robots.txt (ignore errors - many sites don't have one)
        if let Ok(response) = self.client().get(&robots_url).send().await {
            if response.status().is_success() {
                if let Ok(robots_content) = response.text().await {
                    let path = url.path();
                    for line in robots_content.lines() {
                        if let Some(disallowed) = line.strip_prefix("Disallow: ") {
                            let disallowed = disallowed.trim();
                            if !disallowed.is_empty() {
                                let disallowed = if !disallowed.starts_with('/') {
                                    format!("/{}", disallowed)
                                } else {
                                    disallowed.to_string()
                                };
                                let check_path = if !path.starts_with('/') {
                                    format!("/{}", path)
                                } else {
                                    path.to_string()
                                };
                                if check_path.starts_with(&disallowed) {
                                    return Err(WebFetchError(format!(
                                        "URL {} cannot be fetched due to robots.txt restrictions",
                                        url
                                    )));
                                }
                            }
                        }
                    }
                }
            }
        }
        Ok(())
    }

    /// Fetch URL content and optionally convert HTML to markdown
    async fn fetch_url(&self, url: &Url, force_raw: bool) -> Result<FetchResult, WebFetchError> {
        // Check robots.txt first
        self.check_robots_txt(url).await?;

        let response = self
            .client()
            .get(url.as_str())
            .send()
            .await
            .map_err(|e| WebFetchError(format!("Failed to fetch URL {}: {}", url, e)))?;

        let status = response.status();
        if !status.is_success() {
            return Err(WebFetchError(format!(
                "Failed to fetch {} - status code {}",
                url, status
            )));
        }

        let content_type = response
            .headers()
            .get("content-type")
            .and_then(|v| v.to_str().ok())
            .unwrap_or("")
            .to_string();

        let raw_content = response
            .text()
            .await
            .map_err(|e| WebFetchError(format!("Failed to read response from {}: {}", url, e)))?;

        // Determine if the body looks like HTML. Sniff a short prefix using
        // chars rather than a byte slice (a byte slice could panic midway
        // through a multi-byte UTF-8 sequence), falling back to Content-Type.
        let head: String = raw_content.chars().take(100).collect();
        let is_html = head.contains("<html")
            || head.contains("<!DOCTYPE")
            || head.contains("<!doctype")
            || content_type.contains("text/html")
            || (content_type.is_empty() && raw_content.contains("<body"));

        // Convert HTML to markdown unless raw is requested
        let content = if is_html && !force_raw {
            html_to_markdown(&raw_content)
        } else {
            raw_content
        };

        // Truncate if too long, backing up to a char boundary so slicing a
        // multi-byte UTF-8 sequence can't panic
        let (content, was_truncated) = if content.len() > MAX_CONTENT_LENGTH {
            let mut end = MAX_CONTENT_LENGTH;
            while !content.is_char_boundary(end) {
                end -= 1;
            }
            (format!("{}\n\n[Content truncated...]", &content[..end]), true)
        } else {
            (content, false)
        };

        Ok(FetchResult {
            content,
            content_type,
            status_code: status.as_u16(),
            was_truncated,
            was_html: is_html && !force_raw,
        })
    }
}

#[derive(Debug)]
struct FetchResult {
    content: String,
    content_type: String,
    status_code: u16,
    was_truncated: bool,
    was_html: bool,
}

impl Tool for WebFetchTool {
    const NAME: &'static str = "web_fetch";

    type Error = WebFetchError;
    type Args = WebFetchArgs;
    type Output = String;

    async fn definition(&self, _prompt: String) -> ToolDefinition {
        ToolDefinition {
            name: Self::NAME.to_string(),
            description: r#"Fetch content from a URL and return it as text or markdown.

Use this tool to:
- Look up documentation for libraries, frameworks, or APIs
- Check official guides and tutorials
- Verify information from authoritative sources
- Research best practices and patterns
- Access API reference documentation
- Get current information beyond training data

The tool automatically converts HTML pages to readable markdown format.
For API endpoints returning JSON/XML, use raw=true to get the unprocessed response.

Limitations:
- Cannot access pages requiring authentication
- Respects robots.txt restrictions
- Large pages are truncated to ~40,000 characters
- Some sites may block automated requests"#
                .to_string(),
            parameters: json!({
                "type": "object",
                "properties": {
                    "url": {
                        "type": "string",
                        "description": "The URL to fetch (must be http:// or https://)"
                    },
                    "raw": {
                        "type": "boolean",
                        "description": "If true, return raw content without HTML-to-markdown conversion. Default: false"
                    }
                },
                "required": ["url"]
            }),
        }
    }

    async fn call(&self, args: Self::Args) -> Result<Self::Output, Self::Error> {
        // Parse and validate URL
        let url = Url::parse(&args.url)
            .map_err(|e| WebFetchError(format!("Invalid URL '{}': {}", args.url, e)))?;

        // Only allow http/https
        if url.scheme() != "http" && url.scheme() != "https" {
            return Err(WebFetchError(format!(
                "Unsupported URL scheme '{}'. Only http and https are supported.",
                url.scheme()
            )));
        }

        let force_raw = args.raw.unwrap_or(false);
        let result = self.fetch_url(&url, force_raw).await?;

        let output = json!({
            "url": args.url,
            "status_code": result.status_code,
            "content_type": result.content_type,
            "converted_to_markdown": result.was_html,
            "truncated": result.was_truncated,
            "content": result.content
        });

        serde_json::to_string_pretty(&output)
            .map_err(|e| WebFetchError(format!("Failed to serialize response: {}", e)))
    }
}
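
// A successful call returns pretty-printed JSON along these lines (values are
// illustrative):
//
//   {
//     "url": "https://example.com",
//     "status_code": 200,
//     "content_type": "text/html; charset=utf-8",
//     "converted_to_markdown": true,
//     "truncated": false,
//     "content": "# Example Domain\n..."
//   }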

/// Convert HTML content to Markdown
///
/// Uses a simple regex-based approach for common HTML elements.
/// For more complex HTML, consider using a proper HTML parser.
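///
/// A quick before/after sketch:
///
/// ```ignore
/// let md = html_to_markdown(r#"<h1>Hi</h1><p>See <a href="/x">this</a>.</p>"#);
/// assert_eq!(md, "# Hi\n\nSee [this](/x).");
/// ```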
fn html_to_markdown(html: &str) -> String {
    use regex::Regex;

    let mut content = html.to_string();

    // Remove script and style tags entirely
    let script_re = Regex::new(r"(?is)<script[^>]*>.*?</script>").unwrap();
    content = script_re.replace_all(&content, "").to_string();

    let style_re = Regex::new(r"(?is)<style[^>]*>.*?</style>").unwrap();
    content = style_re.replace_all(&content, "").to_string();

    // Remove comments
    let comment_re = Regex::new(r"(?is)<!--.*?-->").unwrap();
    content = comment_re.replace_all(&content, "").to_string();

    // Convert headers <h1> through <h6> to the matching markdown level
    for level in 1usize..=6 {
        let re = Regex::new(&format!(r"(?is)<h{0}[^>]*>(.*?)</h{0}>", level)).unwrap();
        let replacement = format!("\n{} $1\n", "#".repeat(level));
        content = re.replace_all(&content, replacement.as_str()).to_string();
    }

    // Convert paragraphs
    let p_re = Regex::new(r"(?is)<p[^>]*>(.*?)</p>").unwrap();
    content = p_re.replace_all(&content, "\n$1\n").to_string();

    // Convert links
    let a_re = Regex::new(r#"(?is)<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>"#).unwrap();
    content = a_re.replace_all(&content, "[$2]($1)").to_string();

    // Convert bold/strong
    let strong_re = Regex::new(r"(?is)<(?:strong|b)[^>]*>(.*?)</(?:strong|b)>").unwrap();
    content = strong_re.replace_all(&content, "**$1**").to_string();

    // Convert italic/em
    let em_re = Regex::new(r"(?is)<(?:em|i)[^>]*>(.*?)</(?:em|i)>").unwrap();
    content = em_re.replace_all(&content, "*$1*").to_string();

    // Convert code blocks (match <pre><code> first so the inner <code> is not
    // later treated as inline code)
    let pre_re = Regex::new(r"(?is)<pre[^>]*><code[^>]*>(.*?)</code></pre>").unwrap();
    content = pre_re.replace_all(&content, "\n```\n$1\n```\n").to_string();

    let pre_only_re = Regex::new(r"(?is)<pre[^>]*>(.*?)</pre>").unwrap();
    content = pre_only_re
        .replace_all(&content, "\n```\n$1\n```\n")
        .to_string();

    // Convert inline code
    let code_re = Regex::new(r"(?is)<code[^>]*>(.*?)</code>").unwrap();
    content = code_re.replace_all(&content, "`$1`").to_string();

    // Convert lists (ordered list items are flattened to "-" bullets)
    let ul_re = Regex::new(r"(?is)<ul[^>]*>(.*?)</ul>").unwrap();
    content = ul_re.replace_all(&content, "\n$1\n").to_string();

    let ol_re = Regex::new(r"(?is)<ol[^>]*>(.*?)</ol>").unwrap();
    content = ol_re.replace_all(&content, "\n$1\n").to_string();

    let li_re = Regex::new(r"(?is)<li[^>]*>(.*?)</li>").unwrap();
    content = li_re.replace_all(&content, "- $1\n").to_string();

    // Convert blockquotes
    let bq_re = Regex::new(r"(?is)<blockquote[^>]*>(.*?)</blockquote>").unwrap();
    content = bq_re.replace_all(&content, "\n> $1\n").to_string();

    // Convert line breaks
    let br_re = Regex::new(r"(?i)<br\s*/?>").unwrap();
    content = br_re.replace_all(&content, "\n").to_string();

    // Convert horizontal rules
    let hr_re = Regex::new(r"(?i)<hr\s*/?>").unwrap();
    content = hr_re.replace_all(&content, "\n---\n").to_string();

    // Remove remaining HTML tags
    let tag_re = Regex::new(r"<[^>]+>").unwrap();
    content = tag_re.replace_all(&content, "").to_string();

    // Decode common HTML entities. `&amp;` is decoded last so that escaped
    // entities such as `&amp;quot;` become the literal text `&quot;` instead
    // of being double-decoded to `"`.
    content = content
        .replace("&nbsp;", " ")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&apos;", "'")
        .replace("&copy;", "©")
        .replace("&reg;", "®")
        .replace("&trade;", "™")
        .replace("&mdash;", "—")
        .replace("&ndash;", "–")
        .replace("&hellip;", "…")
        .replace("&amp;", "&");

    // Clean up excessive whitespace
    let multiline_re = Regex::new(r"\n{3,}").unwrap();
    content = multiline_re.replace_all(&content, "\n\n").to_string();

    let space_re = Regex::new(r" {2,}").unwrap();
    content = space_re.replace_all(&content, " ").to_string();

    content.trim().to_string()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_html_to_markdown_headers() {
        let html = "<h1>Title</h1><h2>Subtitle</h2><h3>Section</h3>";
        let md = html_to_markdown(html);
        assert!(md.contains("# Title"));
        assert!(md.contains("## Subtitle"));
        assert!(md.contains("### Section"));
    }

    #[test]
    fn test_html_to_markdown_links() {
        let html = r#"<a href="https://example.com">Example</a>"#;
        let md = html_to_markdown(html);
        assert!(md.contains("[Example](https://example.com)"));
    }

    #[test]
    fn test_html_to_markdown_formatting() {
        let html = "<strong>bold</strong> and <em>italic</em>";
        let md = html_to_markdown(html);
        assert!(md.contains("**bold**"));
        assert!(md.contains("*italic*"));
    }

    #[test]
    fn test_html_to_markdown_code() {
        let html = "<code>inline</code> and <pre><code>block</code></pre>";
        let md = html_to_markdown(html);
        assert!(md.contains("`inline`"));
        assert!(md.contains("```"));
    }

    #[test]
    fn test_html_to_markdown_lists() {
        let html = "<ul><li>Item 1</li><li>Item 2</li></ul>";
        let md = html_to_markdown(html);
        assert!(md.contains("- Item 1"));
        assert!(md.contains("- Item 2"));
    }

    #[test]
    fn test_html_to_markdown_removes_scripts() {
        let html = "<p>Content</p><script>alert('xss')</script><p>More</p>";
        let md = html_to_markdown(html);
        assert!(!md.contains("script"));
        assert!(!md.contains("alert"));
        assert!(md.contains("Content"));
        assert!(md.contains("More"));
    }
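
    // Added check: the entity pass decodes `&amp;` last, so escaped entities
    // like `&amp;lt;` stay as the literal text `&lt;` rather than decoding
    // twice.
    #[test]
    fn test_html_to_markdown_entities() {
        let html = "<p>Fish &amp; chips, 1 &lt; 2, &amp;lt; stays escaped</p>";
        let md = html_to_markdown(html);
        assert!(md.contains("Fish & chips"));
        assert!(md.contains("1 < 2"));
        assert!(md.contains("&lt; stays escaped"));
    }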
}