syncable-cli 0.37.1

A Rust-based CLI that analyzes code repositories and generates Infrastructure as Code configurations
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
//! Web fetch tool for retrieving online content
//!
//! Provides the agent with the ability to fetch content from URLs and convert
//! HTML to readable markdown. Inspired by Forge's NetFetch tool.
//!
//! Features:
//! - Fetches HTTP/HTTPS URLs
//! - Converts HTML to markdown for readability
//! - Respects robots.txt (basic check)
//! - Truncates large responses to prevent context overflow
//! - Returns raw content when requested

use reqwest::{Client, Url};
use rig::completion::ToolDefinition;
use rig::tool::Tool;
use serde::{Deserialize, Serialize};
use serde_json::json;

/// Maximum content length to return, in bytes (it is compared against `String::len`, not a character count)
const MAX_CONTENT_LENGTH: usize = 40_000;

// ============================================================================
// Web Fetch Tool
// ============================================================================

/// Arguments accepted by the `web_fetch` tool, deserialized from the
/// tool-call payload.
#[derive(Debug, Deserialize)]
pub struct WebFetchArgs {
    /// URL to fetch
    pub url: String,
    /// If true, return raw content without markdown conversion (default: false)
    pub raw: Option<bool>,
}

/// Error returned by the web fetch tool.
///
/// Wraps a human-readable message; it is displayed as
/// `Web fetch error: <message>`.
#[derive(Debug, thiserror::Error)]
#[error("Web fetch error: {0}")]
pub struct WebFetchError(String);

/// Tool that fetches a URL and returns its content as text or markdown.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WebFetchTool {
    // The HTTP client is excluded from (de)serialization. When it is absent,
    // `client()` falls back to `Client::default()`.
    // NOTE(review): a deserialized tool presumably ends up with `None` here and
    // so loses the custom user agent and timeout set in `new()` — confirm.
    #[serde(skip)]
    client: Option<Client>,
}

impl Default for WebFetchTool {
    fn default() -> Self {
        Self::new()
    }
}

impl WebFetchTool {
    /// Creates a tool whose HTTP client sends a SyncableCLI user agent and
    /// times out after 30 seconds.
    ///
    /// If the configured client cannot be built, a default `Client` is used
    /// instead of failing.
    pub fn new() -> Self {
        let client = Client::builder()
            .user_agent("Mozilla/5.0 (compatible; SyncableCLI/0.1; +https://syncable.dev)")
            .timeout(std::time::Duration::from_secs(30))
            .build()
            .unwrap_or_default();

        Self {
            client: Some(client),
        }
    }

    /// Returns the configured HTTP client, or a default one when none is stored.
    fn client(&self) -> Client {
        self.client.clone().unwrap_or_default()
    }

    /// Returns the longest prefix of `text` that is at most `max_bytes` bytes
    /// long and ends on a UTF-8 character boundary.
    ///
    /// Slicing a `str` at an arbitrary byte offset panics when the offset falls
    /// inside a multi-byte character, so every length cap in this impl goes
    /// through this helper.
    fn prefix_at_char_boundary(text: &str, max_bytes: usize) -> &str {
        if text.len() <= max_bytes {
            return text;
        }
        let mut end = max_bytes;
        // Offset 0 is always a boundary, so this loop terminates.
        while !text.is_char_boundary(end) {
            end -= 1;
        }
        &text[..end]
    }

    /// Check robots.txt for disallowed paths (basic check)
    ///
    /// Every `Disallow: ` line is treated as applying to this client:
    /// `User-agent` groups, `Allow` rules and wildcards are not interpreted.
    /// A missing or unreadable robots.txt permits the fetch.
    ///
    /// # Errors
    ///
    /// Returns a [`WebFetchError`] when the URL path starts with a disallowed
    /// prefix.
    async fn check_robots_txt(&self, url: &Url) -> Result<(), WebFetchError> {
        let robots_url = format!("{}://{}/robots.txt", url.scheme(), url.authority());

        // Try to fetch robots.txt (ignore errors - many sites don't have one)
        if let Ok(response) = self.client().get(&robots_url).send().await
            && response.status().is_success()
            && let Ok(robots_content) = response.text().await
        {
            // The request path does not change per rule, so normalise it once.
            let path = url.path();
            let check_path = if path.starts_with('/') {
                path.to_string()
            } else {
                format!("/{}", path)
            };

            for line in robots_content.lines() {
                let Some(disallowed) = line.strip_prefix("Disallow: ") else {
                    continue;
                };
                let disallowed = disallowed.trim();
                // An empty Disallow value means "nothing is disallowed".
                if disallowed.is_empty() {
                    continue;
                }
                let disallowed = if disallowed.starts_with('/') {
                    disallowed.to_string()
                } else {
                    format!("/{}", disallowed)
                };
                if check_path.starts_with(&disallowed) {
                    return Err(WebFetchError(format!(
                        "URL {} cannot be fetched due to robots.txt restrictions",
                        url
                    )));
                }
            }
        }
        Ok(())
    }

    /// Fetch URL content and optionally convert HTML to markdown
    ///
    /// When `force_raw` is false and the body looks like HTML, it is converted
    /// to markdown. The result is capped at `MAX_CONTENT_LENGTH` bytes (cut on
    /// a character boundary) with a truncation marker appended.
    ///
    /// # Errors
    ///
    /// Returns a [`WebFetchError`] when robots.txt disallows the path, the
    /// request fails, the status is not a success, or the body cannot be read
    /// as text.
    async fn fetch_url(&self, url: &Url, force_raw: bool) -> Result<FetchResult, WebFetchError> {
        // Check robots.txt first
        self.check_robots_txt(url).await?;

        let response = self
            .client()
            .get(url.as_str())
            .send()
            .await
            .map_err(|e| WebFetchError(format!("Failed to fetch URL {}: {}", url, e)))?;

        let status = response.status();
        if !status.is_success() {
            return Err(WebFetchError(format!(
                "Failed to fetch {} - status code {}",
                url, status
            )));
        }

        let content_type = response
            .headers()
            .get("content-type")
            .and_then(|v| v.to_str().ok())
            .unwrap_or("")
            .to_string();

        let raw_content = response
            .text()
            .await
            .map_err(|e| WebFetchError(format!("Failed to read response from {}: {}", url, e)))?;

        // Determine if content is HTML. Only the first ~100 bytes are sniffed
        // for markers; the window is cut on a char boundary so that non-ASCII
        // text near the start of the body cannot cause a slicing panic.
        let head = Self::prefix_at_char_boundary(&raw_content, 100);
        let is_html = head.contains("<html")
            || head.contains("<!DOCTYPE")
            || head.contains("<!doctype")
            || content_type.contains("text/html")
            || (content_type.is_empty() && raw_content.contains("<body"));

        // Convert HTML to markdown unless raw is requested
        let converted = is_html && !force_raw;
        let content = if converted {
            html_to_markdown(&raw_content)
        } else {
            raw_content
        };

        // Truncate if too long, never splitting a multi-byte character.
        let (content, was_truncated) = if content.len() > MAX_CONTENT_LENGTH {
            let mut truncated =
                Self::prefix_at_char_boundary(&content, MAX_CONTENT_LENGTH).to_string();
            truncated.push_str("\n\n[Content truncated...]");
            (truncated, true)
        } else {
            (content, false)
        };

        Ok(FetchResult {
            content,
            content_type,
            status_code: status.as_u16(),
            was_truncated,
            was_html: converted,
        })
    }
}

/// Outcome of a successful fetch, as produced by `WebFetchTool::fetch_url`.
#[derive(Debug)]
struct FetchResult {
    /// Response body: markdown when converted, otherwise the raw text;
    /// possibly truncated with a trailing marker.
    content: String,
    /// Value of the `content-type` response header, or empty when it is
    /// missing or not valid text.
    content_type: String,
    /// Numeric HTTP status code of the response.
    status_code: u16,
    /// True when `content` was cut down to the maximum length.
    was_truncated: bool,
    /// True when the body was detected as HTML and converted to markdown
    /// (i.e. raw output was not requested).
    was_html: bool,
}

impl Tool for WebFetchTool {
    const NAME: &'static str = "web_fetch";

    type Error = WebFetchError;
    type Args = WebFetchArgs;
    type Output = String;

    async fn definition(&self, _prompt: String) -> ToolDefinition {
        ToolDefinition {
            name: Self::NAME.to_string(),
            description: r#"Fetch content from a URL and return it as text or markdown.

Use this tool to:
- Look up documentation for libraries, frameworks, or APIs
- Check official guides and tutorials
- Verify information from authoritative sources
- Research best practices and patterns
- Access API reference documentation
- Get current information beyond training data

The tool automatically converts HTML pages to readable markdown format.
For API endpoints returning JSON/XML, use raw=true to get the unprocessed response.

Limitations:
- Cannot access pages requiring authentication
- Respects robots.txt restrictions
- Large pages are truncated to ~40,000 characters
- Some sites may block automated requests"#
                .to_string(),
            parameters: json!({
                "type": "object",
                "properties": {
                    "url": {
                        "type": "string",
                        "description": "The URL to fetch (must be http:// or https://)"
                    },
                    "raw": {
                        "type": "boolean",
                        "description": "If true, return raw content without HTML-to-markdown conversion. Default: false"
                    }
                },
                "required": ["url"]
            }),
        }
    }

    async fn call(&self, args: Self::Args) -> Result<Self::Output, Self::Error> {
        // Parse and validate URL
        let url = Url::parse(&args.url)
            .map_err(|e| WebFetchError(format!("Invalid URL '{}': {}", args.url, e)))?;

        // Only allow http/https
        if url.scheme() != "http" && url.scheme() != "https" {
            return Err(WebFetchError(format!(
                "Unsupported URL scheme '{}'. Only http and https are supported.",
                url.scheme()
            )));
        }

        let force_raw = args.raw.unwrap_or(false);
        let result = self.fetch_url(&url, force_raw).await?;

        let output = json!({
            "url": args.url,
            "status_code": result.status_code,
            "content_type": result.content_type,
            "converted_to_markdown": result.was_html,
            "truncated": result.was_truncated,
            "content": result.content
        });

        serde_json::to_string_pretty(&output)
            .map_err(|e| WebFetchError(format!("Failed to serialize response: {}", e)))
    }
}

/// Convert HTML content to Markdown
///
/// Uses a simple regex-based approach for common HTML elements.
/// For more complex HTML, consider using a proper HTML parser.
///
/// Processing order:
/// 1. Drop `<script>`, `<style>` and comments, then rewrite headings,
///    paragraphs, links, emphasis, code, lists, blockquotes, `<br>` and `<hr>`.
/// 2. Strip every remaining tag.
/// 3. Decode a small set of common HTML entities.
/// 4. Collapse runs of blank lines and spaces, and trim the result.
///
/// All regexes are compiled once on first use and reused across calls.
fn html_to_markdown(html: &str) -> String {
    use regex::Regex;
    use std::sync::LazyLock;

    // Ordered (pattern, replacement) rules. Every opening-tag pattern puts a
    // word boundary (`\b`) after the tag name so that, for example, the `<b>`
    // rule cannot swallow `<body>`/`<blockquote>`, `<p>` cannot match `<pre>`,
    // `<a>` cannot match `<article>`, and `<li>` cannot match `<link>`.
    static TAG_RULES: LazyLock<Vec<(Regex, &'static str)>> = LazyLock::new(|| {
        [
            // Remove script and style tags entirely
            (r"(?is)<script\b[^>]*>.*?</script>", ""),
            (r"(?is)<style\b[^>]*>.*?</style>", ""),
            // Remove comments
            (r"(?is)<!--.*?-->", ""),
            // Convert headers
            (r"(?is)<h1\b[^>]*>(.*?)</h1>", "\n# $1\n"),
            (r"(?is)<h2\b[^>]*>(.*?)</h2>", "\n## $1\n"),
            (r"(?is)<h3\b[^>]*>(.*?)</h3>", "\n### $1\n"),
            (r"(?is)<h4\b[^>]*>(.*?)</h4>", "\n#### $1\n"),
            (r"(?is)<h5\b[^>]*>(.*?)</h5>", "\n##### $1\n"),
            (r"(?is)<h6\b[^>]*>(.*?)</h6>", "\n###### $1\n"),
            // Convert paragraphs
            (r"(?is)<p\b[^>]*>(.*?)</p>", "\n$1\n"),
            // Convert links
            (
                r#"(?is)<a\b[^>]*href="([^"]*)"[^>]*>(.*?)</a>"#,
                "[$2]($1)",
            ),
            // Convert bold/strong
            (
                r"(?is)<(?:strong|b)\b[^>]*>(.*?)</(?:strong|b)>",
                "**$1**",
            ),
            // Convert italic/em
            (r"(?is)<(?:em|i)\b[^>]*>(.*?)</(?:em|i)>", "*$1*"),
            // Convert code blocks (must run before the inline-code rule)
            (
                r"(?is)<pre\b[^>]*><code\b[^>]*>(.*?)</code></pre>",
                "\n```\n$1\n```\n",
            ),
            (r"(?is)<pre\b[^>]*>(.*?)</pre>", "\n```\n$1\n```\n"),
            // Convert inline code
            (r"(?is)<code\b[^>]*>(.*?)</code>", "`$1`"),
            // Convert lists
            (r"(?is)<ul\b[^>]*>(.*?)</ul>", "\n$1\n"),
            (r"(?is)<ol\b[^>]*>(.*?)</ol>", "\n$1\n"),
            (r"(?is)<li\b[^>]*>(.*?)</li>", "- $1\n"),
            // Convert blockquotes
            (r"(?is)<blockquote\b[^>]*>(.*?)</blockquote>", "\n> $1\n"),
            // Convert line breaks
            (r"(?i)<br\s*/?>", "\n"),
            // Convert horizontal rules
            (r"(?i)<hr\s*/?>", "\n---\n"),
            // Remove remaining HTML tags
            (r"<[^>]+>", ""),
        ]
        .into_iter()
        .map(|(pattern, replacement)| {
            (
                Regex::new(pattern).expect("hard-coded tag regex is valid"),
                replacement,
            )
        })
        .collect()
    });

    static BLANK_LINES_RE: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"\n{3,}").expect("hard-coded regex is valid"));
    static SPACE_RUN_RE: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r" {2,}").expect("hard-coded regex is valid"));

    let mut content = html.to_string();
    for (re, replacement) in TAG_RULES.iter() {
        content = re.replace_all(&content, *replacement).into_owned();
    }

    // Decode common HTML entities. `&amp;` is decoded last so that an escaped
    // entity such as `&amp;quot;` becomes the literal text `&quot;` instead of
    // being decoded twice.
    content = content
        .replace("&nbsp;", " ")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&apos;", "'")
        .replace("&copy;", "\u{00A9}")
        .replace("&reg;", "\u{00AE}")
        .replace("&trade;", "\u{2122}")
        .replace("&mdash;", "\u{2014}")
        .replace("&ndash;", "\u{2013}")
        .replace("&hellip;", "\u{2026}")
        .replace("&amp;", "&");

    // Clean up excessive whitespace
    content = BLANK_LINES_RE.replace_all(&content, "\n\n").into_owned();
    content = SPACE_RUN_RE.replace_all(&content, " ").into_owned();

    content.trim().to_string()
}

// Unit tests for `html_to_markdown`. Each test feeds a small HTML fragment
// and checks that the expected markdown markers appear in the output.
#[cfg(test)]
mod tests {
    use super::*;

    // <h1>..<h3> become lines prefixed with one to three '#'.
    #[test]
    fn test_html_to_markdown_headers() {
        let html = "<h1>Title</h1><h2>Subtitle</h2><h3>Section</h3>";
        let md = html_to_markdown(html);
        assert!(md.contains("# Title"));
        assert!(md.contains("## Subtitle"));
        assert!(md.contains("### Section"));
    }

    // An anchor with a double-quoted href becomes a markdown link.
    #[test]
    fn test_html_to_markdown_links() {
        let html = r#"<a href="https://example.com">Example</a>"#;
        let md = html_to_markdown(html);
        assert!(md.contains("[Example](https://example.com)"));
    }

    // <strong> maps to **..** and <em> maps to *..*.
    #[test]
    fn test_html_to_markdown_formatting() {
        let html = "<strong>bold</strong> and <em>italic</em>";
        let md = html_to_markdown(html);
        assert!(md.contains("**bold**"));
        assert!(md.contains("*italic*"));
    }

    // Inline <code> becomes backticks; <pre><code> becomes a fenced block.
    #[test]
    fn test_html_to_markdown_code() {
        let html = "<code>inline</code> and <pre><code>block</code></pre>";
        let md = html_to_markdown(html);
        assert!(md.contains("`inline`"));
        assert!(md.contains("```"));
    }

    // Each <li> becomes a "- " bullet.
    #[test]
    fn test_html_to_markdown_lists() {
        let html = "<ul><li>Item 1</li><li>Item 2</li></ul>";
        let md = html_to_markdown(html);
        assert!(md.contains("- Item 1"));
        assert!(md.contains("- Item 2"));
    }

    // <script> elements are dropped together with their contents, while the
    // surrounding paragraph text is kept.
    #[test]
    fn test_html_to_markdown_removes_scripts() {
        let html = "<p>Content</p><script>alert('xss')</script><p>More</p>";
        let md = html_to_markdown(html);
        assert!(!md.contains("script"));
        assert!(!md.contains("alert"));
        assert!(md.contains("Content"));
        assert!(md.contains("More"));
    }
}