syncable_cli/agent/tools/
fetch.rs1use reqwest::{Client, Url};
14use rig::completion::ToolDefinition;
15use rig::tool::Tool;
16use serde::{Deserialize, Serialize};
17use serde_json::json;
18
/// Maximum number of bytes of fetched content returned to the model;
/// longer bodies are truncated in `fetch_url` with a trailing marker.
const MAX_CONTENT_LENGTH: usize = 40_000;
/// Arguments for the `web_fetch` tool, deserialized from the model's
/// JSON tool-call payload (see the schema in `definition`).
#[derive(Debug, Deserialize)]
pub struct WebFetchArgs {
    // URL to fetch; validated in `call` to be http:// or https://.
    pub url: String,
    // When Some(true), skip HTML-to-markdown conversion and return the
    // body as-is. Treated as false when absent.
    pub raw: Option<bool>,
}
33
/// Error type for [`WebFetchTool`]; wraps a human-readable message that is
/// surfaced to the model as `"Web fetch error: ..."`.
#[derive(Debug, thiserror::Error)]
#[error("Web fetch error: {0}")]
pub struct WebFetchError(String);
37
/// Tool that fetches a URL and returns its content as text or markdown.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WebFetchTool {
    // Skipped during (de)serialization, so a deserialized tool holds None;
    // `client()` falls back to Client::default() in that case.
    #[serde(skip)]
    client: Option<Client>,
}
43
impl Default for WebFetchTool {
    /// Equivalent to [`WebFetchTool::new`].
    fn default() -> Self {
        Self::new()
    }
}
49
50impl WebFetchTool {
51 pub fn new() -> Self {
52 Self {
53 client: Some(
54 Client::builder()
55 .user_agent("Mozilla/5.0 (compatible; SyncableCLI/0.1; +https://syncable.dev)")
56 .timeout(std::time::Duration::from_secs(30))
57 .build()
58 .unwrap_or_default(),
59 ),
60 }
61 }
62
63 fn client(&self) -> Client {
64 self.client.clone().unwrap_or_default()
65 }
66
67 async fn check_robots_txt(&self, url: &Url) -> Result<(), WebFetchError> {
69 let robots_url = format!("{}://{}/robots.txt", url.scheme(), url.authority());
70
71 if let Ok(response) = self.client().get(&robots_url).send().await {
73 if response.status().is_success() {
74 if let Ok(robots_content) = response.text().await {
75 let path = url.path();
76 for line in robots_content.lines() {
77 if let Some(disallowed) = line.strip_prefix("Disallow: ") {
78 let disallowed = disallowed.trim();
79 if !disallowed.is_empty() {
80 let disallowed = if !disallowed.starts_with('/') {
81 format!("/{}", disallowed)
82 } else {
83 disallowed.to_string()
84 };
85 let check_path = if !path.starts_with('/') {
86 format!("/{}", path)
87 } else {
88 path.to_string()
89 };
90 if check_path.starts_with(&disallowed) {
91 return Err(WebFetchError(format!(
92 "URL {} cannot be fetched due to robots.txt restrictions",
93 url
94 )));
95 }
96 }
97 }
98 }
99 }
100 }
101 }
102 Ok(())
103 }
104
105 async fn fetch_url(&self, url: &Url, force_raw: bool) -> Result<FetchResult, WebFetchError> {
107 self.check_robots_txt(url).await?;
109
110 let response = self
111 .client()
112 .get(url.as_str())
113 .send()
114 .await
115 .map_err(|e| WebFetchError(format!("Failed to fetch URL {}: {}", url, e)))?;
116
117 let status = response.status();
118 if !status.is_success() {
119 return Err(WebFetchError(format!(
120 "Failed to fetch {} - status code {}",
121 url, status
122 )));
123 }
124
125 let content_type = response
126 .headers()
127 .get("content-type")
128 .and_then(|v| v.to_str().ok())
129 .unwrap_or("")
130 .to_string();
131
132 let raw_content = response
133 .text()
134 .await
135 .map_err(|e| WebFetchError(format!("Failed to read response from {}: {}", url, e)))?;
136
137 let is_html = raw_content[..100.min(raw_content.len())].contains("<html")
139 || raw_content[..100.min(raw_content.len())].contains("<!DOCTYPE")
140 || raw_content[..100.min(raw_content.len())].contains("<!doctype")
141 || content_type.contains("text/html")
142 || (content_type.is_empty() && raw_content.contains("<body"));
143
144 let content = if is_html && !force_raw {
146 html_to_markdown(&raw_content)
147 } else {
148 raw_content
149 };
150
151 let (content, was_truncated) = if content.len() > MAX_CONTENT_LENGTH {
153 (
154 content[..MAX_CONTENT_LENGTH].to_string() + "\n\n[Content truncated...]",
155 true,
156 )
157 } else {
158 (content, false)
159 };
160
161 Ok(FetchResult {
162 content,
163 content_type,
164 status_code: status.as_u16(),
165 was_truncated,
166 was_html: is_html && !force_raw,
167 })
168 }
169}
170
/// Internal result of a fetch, before serialization into the tool output.
#[derive(Debug)]
struct FetchResult {
    // Body text, possibly converted to markdown and/or truncated.
    content: String,
    // Value of the Content-Type response header ("" when absent/unreadable).
    content_type: String,
    // HTTP status code of the response.
    status_code: u16,
    // True when the body exceeded MAX_CONTENT_LENGTH and was cut.
    was_truncated: bool,
    // True when HTML-to-markdown conversion was applied.
    was_html: bool,
}
179
180impl Tool for WebFetchTool {
181 const NAME: &'static str = "web_fetch";
182
183 type Error = WebFetchError;
184 type Args = WebFetchArgs;
185 type Output = String;
186
187 async fn definition(&self, _prompt: String) -> ToolDefinition {
188 ToolDefinition {
189 name: Self::NAME.to_string(),
190 description: r#"Fetch content from a URL and return it as text or markdown.
191
192Use this tool to:
193- Look up documentation for libraries, frameworks, or APIs
194- Check official guides and tutorials
195- Verify information from authoritative sources
196- Research best practices and patterns
197- Access API reference documentation
198- Get current information beyond training data
199
200The tool automatically converts HTML pages to readable markdown format.
201For API endpoints returning JSON/XML, use raw=true to get the unprocessed response.
202
203Limitations:
204- Cannot access pages requiring authentication
205- Respects robots.txt restrictions
206- Large pages are truncated to ~40,000 characters
207- Some sites may block automated requests"#
208 .to_string(),
209 parameters: json!({
210 "type": "object",
211 "properties": {
212 "url": {
213 "type": "string",
214 "description": "The URL to fetch (must be http:// or https://)"
215 },
216 "raw": {
217 "type": "boolean",
218 "description": "If true, return raw content without HTML-to-markdown conversion. Default: false"
219 }
220 },
221 "required": ["url"]
222 }),
223 }
224 }
225
226 async fn call(&self, args: Self::Args) -> Result<Self::Output, Self::Error> {
227 let url = Url::parse(&args.url)
229 .map_err(|e| WebFetchError(format!("Invalid URL '{}': {}", args.url, e)))?;
230
231 if url.scheme() != "http" && url.scheme() != "https" {
233 return Err(WebFetchError(format!(
234 "Unsupported URL scheme '{}'. Only http and https are supported.",
235 url.scheme()
236 )));
237 }
238
239 let force_raw = args.raw.unwrap_or(false);
240 let result = self.fetch_url(&url, force_raw).await?;
241
242 let output = json!({
243 "url": args.url,
244 "status_code": result.status_code,
245 "content_type": result.content_type,
246 "converted_to_markdown": result.was_html,
247 "truncated": result.was_truncated,
248 "content": result.content
249 });
250
251 serde_json::to_string_pretty(&output)
252 .map_err(|e| WebFetchError(format!("Failed to serialize response: {}", e)))
253 }
254}
255
256fn html_to_markdown(html: &str) -> String {
261 use regex::Regex;
262
263 let mut content = html.to_string();
264
265 let script_re = Regex::new(r"(?is)<script[^>]*>.*?</script>").unwrap();
267 content = script_re.replace_all(&content, "").to_string();
268
269 let style_re = Regex::new(r"(?is)<style[^>]*>.*?</style>").unwrap();
270 content = style_re.replace_all(&content, "").to_string();
271
272 let comment_re = Regex::new(r"(?is)<!--.*?-->").unwrap();
274 content = comment_re.replace_all(&content, "").to_string();
275
276 let h1_re = Regex::new(r"(?is)<h1[^>]*>(.*?)</h1>").unwrap();
278 content = h1_re.replace_all(&content, "\n# $1\n").to_string();
279
280 let h2_re = Regex::new(r"(?is)<h2[^>]*>(.*?)</h2>").unwrap();
281 content = h2_re.replace_all(&content, "\n## $1\n").to_string();
282
283 let h3_re = Regex::new(r"(?is)<h3[^>]*>(.*?)</h3>").unwrap();
284 content = h3_re.replace_all(&content, "\n### $1\n").to_string();
285
286 let h4_re = Regex::new(r"(?is)<h4[^>]*>(.*?)</h4>").unwrap();
287 content = h4_re.replace_all(&content, "\n#### $1\n").to_string();
288
289 let h5_re = Regex::new(r"(?is)<h5[^>]*>(.*?)</h5>").unwrap();
290 content = h5_re.replace_all(&content, "\n##### $1\n").to_string();
291
292 let h6_re = Regex::new(r"(?is)<h6[^>]*>(.*?)</h6>").unwrap();
293 content = h6_re.replace_all(&content, "\n###### $1\n").to_string();
294
295 let p_re = Regex::new(r"(?is)<p[^>]*>(.*?)</p>").unwrap();
297 content = p_re.replace_all(&content, "\n$1\n").to_string();
298
299 let a_re = Regex::new(r#"(?is)<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>"#).unwrap();
301 content = a_re.replace_all(&content, "[$2]($1)").to_string();
302
303 let strong_re = Regex::new(r"(?is)<(?:strong|b)[^>]*>(.*?)</(?:strong|b)>").unwrap();
305 content = strong_re.replace_all(&content, "**$1**").to_string();
306
307 let em_re = Regex::new(r"(?is)<(?:em|i)[^>]*>(.*?)</(?:em|i)>").unwrap();
309 content = em_re.replace_all(&content, "*$1*").to_string();
310
311 let pre_re = Regex::new(r"(?is)<pre[^>]*><code[^>]*>(.*?)</code></pre>").unwrap();
313 content = pre_re.replace_all(&content, "\n```\n$1\n```\n").to_string();
314
315 let pre_only_re = Regex::new(r"(?is)<pre[^>]*>(.*?)</pre>").unwrap();
316 content = pre_only_re
317 .replace_all(&content, "\n```\n$1\n```\n")
318 .to_string();
319
320 let code_re = Regex::new(r"(?is)<code[^>]*>(.*?)</code>").unwrap();
322 content = code_re.replace_all(&content, "`$1`").to_string();
323
324 let ul_re = Regex::new(r"(?is)<ul[^>]*>(.*?)</ul>").unwrap();
326 content = ul_re.replace_all(&content, "\n$1\n").to_string();
327
328 let ol_re = Regex::new(r"(?is)<ol[^>]*>(.*?)</ol>").unwrap();
329 content = ol_re.replace_all(&content, "\n$1\n").to_string();
330
331 let li_re = Regex::new(r"(?is)<li[^>]*>(.*?)</li>").unwrap();
332 content = li_re.replace_all(&content, "- $1\n").to_string();
333
334 let bq_re = Regex::new(r"(?is)<blockquote[^>]*>(.*?)</blockquote>").unwrap();
336 content = bq_re.replace_all(&content, "\n> $1\n").to_string();
337
338 let br_re = Regex::new(r"(?i)<br\s*/?>").unwrap();
340 content = br_re.replace_all(&content, "\n").to_string();
341
342 let hr_re = Regex::new(r"(?i)<hr\s*/?>").unwrap();
344 content = hr_re.replace_all(&content, "\n---\n").to_string();
345
346 let tag_re = Regex::new(r"<[^>]+>").unwrap();
348 content = tag_re.replace_all(&content, "").to_string();
349
350 content = content
352 .replace(" ", " ")
353 .replace("<", "<")
354 .replace(">", ">")
355 .replace("&", "&")
356 .replace(""", "\"")
357 .replace("'", "'")
358 .replace("'", "'")
359 .replace("©", "©")
360 .replace("®", "®")
361 .replace("™", "™")
362 .replace("—", "—")
363 .replace("–", "–")
364 .replace("…", "…");
365
366 let multiline_re = Regex::new(r"\n{3,}").unwrap();
368 content = multiline_re.replace_all(&content, "\n\n").to_string();
369
370 let space_re = Regex::new(r" {2,}").unwrap();
371 content = space_re.replace_all(&content, " ").to_string();
372
373 content.trim().to_string()
374}
375
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_html_to_markdown_headers() {
        let md = html_to_markdown("<h1>Title</h1><h2>Subtitle</h2><h3>Section</h3>");
        for expected in ["# Title", "## Subtitle", "### Section"] {
            assert!(md.contains(expected), "missing {expected:?} in {md:?}");
        }
    }

    #[test]
    fn test_html_to_markdown_links() {
        let input = r#"<a href="https://example.com">Example</a>"#;
        assert!(html_to_markdown(input).contains("[Example](https://example.com)"));
    }

    #[test]
    fn test_html_to_markdown_formatting() {
        let md = html_to_markdown("<strong>bold</strong> and <em>italic</em>");
        assert!(md.contains("**bold**") && md.contains("*italic*"));
    }

    #[test]
    fn test_html_to_markdown_code() {
        let md = html_to_markdown("<code>inline</code> and <pre><code>block</code></pre>");
        assert!(md.contains("`inline`"));
        assert!(md.contains("```"));
    }

    #[test]
    fn test_html_to_markdown_lists() {
        let md = html_to_markdown("<ul><li>Item 1</li><li>Item 2</li></ul>");
        for item in ["- Item 1", "- Item 2"] {
            assert!(md.contains(item), "missing {item:?} in {md:?}");
        }
    }

    #[test]
    fn test_html_to_markdown_removes_scripts() {
        let md = html_to_markdown("<p>Content</p><script>alert('xss')</script><p>More</p>");
        // Script tag and its body must be gone; surrounding text must survive.
        assert!(!md.contains("script") && !md.contains("alert"));
        assert!(md.contains("Content") && md.contains("More"));
    }
}