llm_coding_tools_core/operations/webfetch/
mod.rs

1//! Web content fetching operation.
2
3use crate::error::{ToolError, ToolResult};
4use html_to_markdown_rs::{convert, ConversionOptions, PreprocessingOptions, PreprocessingPreset};
5
/// Largest response body, in bytes, that a fetch will accept (5 MiB).
pub(crate) const MAX_RESPONSE_SIZE: usize = 5 * (1 << 20);
8
/// Result from a URL fetch operation.
///
/// Derives [`PartialEq`] and [`Eq`] in addition to [`Debug`] and [`Clone`] so
/// that two outputs can be compared directly (for example in assertions).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct WebFetchOutput {
    /// The processed content (HTML converted to markdown, JSON prettified).
    pub content: String,
    /// The Content-Type header value.
    pub content_type: String,
    /// Original byte length before processing.
    pub byte_length: usize,
}
19
20/// Processes raw response content based on content type.
21pub(crate) fn process_content(raw_content: &str, content_type: &str) -> String {
22    if content_type.contains("text/html") {
23        html_to_markdown(raw_content)
24    } else if content_type.contains("application/json") {
25        format_json(raw_content)
26    } else {
27        raw_content.to_owned()
28    }
29}
30
31/// Categorizes reqwest errors into appropriate [`ToolError`] variants.
32pub(crate) fn categorize_reqwest_error(e: reqwest::Error, url: &str) -> ToolError {
33    if e.is_timeout() {
34        ToolError::Timeout(format!("Request timed out for {}", url))
35    } else if e.is_connect() {
36        ToolError::Http(format!("Connection failed for {}: {}", url, e))
37    } else if e.is_redirect() {
38        ToolError::Http(format!("Too many redirects for {}", url))
39    } else {
40        ToolError::Http(e.to_string())
41    }
42}
43
44/// Returns an error if the response size exceeds the maximum.
45#[inline]
46pub(crate) fn check_size(len: usize, url: &str) -> ToolResult<()> {
47    if len > MAX_RESPONSE_SIZE {
48        return Err(ToolError::Http(format!(
49            "Response too large: {} bytes (max {}) for {}",
50            len, MAX_RESPONSE_SIZE, url
51        )));
52    }
53    Ok(())
54}
55
56/// Converts HTML to markdown for LLM-friendly output.
57pub fn html_to_markdown(html: &str) -> String {
58    let options = ConversionOptions {
59        preprocessing: PreprocessingOptions {
60            enabled: true,
61            preset: PreprocessingPreset::Aggressive,
62            remove_navigation: true,
63            remove_forms: true,
64        },
65        strip_tags: vec![
66            "img".into(),
67            "svg".into(),
68            "script".into(),
69            "style".into(),
70            "noscript".into(),
71        ],
72        ..Default::default()
73    };
74
75    convert(html, Some(options)).unwrap_or_else(|_| html.to_string())
76}
77
78/// Formats JSON content for readability.
79pub fn format_json(json_str: &str) -> String {
80    match serde_json::from_str::<serde_json::Value>(json_str) {
81        Ok(value) => serde_json::to_string_pretty(&value).unwrap_or_else(|_| json_str.to_string()),
82        Err(_) => json_str.to_string(),
83    }
84}
85
86#[cfg(not(feature = "blocking"))]
87mod async_impl;
88#[cfg(not(feature = "blocking"))]
89pub use async_impl::fetch_url;
90
91#[cfg(feature = "blocking")]
92mod blocking_impl;
93#[cfg(feature = "blocking")]
94pub use blocking_impl::fetch_url;
95
#[cfg(test)]
mod tests {
    use super::*;

    /// Script bodies must not leak into the markdown output.
    #[test]
    fn html_to_markdown_strips_scripts() {
        let html = "<p>Before</p><script>alert('xss')</script><p>After</p>";
        let result = html_to_markdown(html);
        assert!(!result.contains("alert"));
    }

    /// Valid JSON is re-serialized in pretty form.
    #[test]
    fn format_json_prettifies() {
        let json = r#"{"a":1}"#;
        let result = format_json(json);
        assert!(result.contains("\"a\": 1"));
    }

    /// Input that does not parse as JSON is returned untouched.
    #[test]
    fn format_json_returns_original_on_invalid() {
        let invalid = "not json";
        assert_eq!(format_json(invalid), "not json");
    }

    #[test]
    fn check_size_ok_for_small_content() {
        assert!(check_size(1000, "http://example.com").is_ok());
    }

    /// The limit is inclusive: exactly `MAX_RESPONSE_SIZE` bytes is accepted.
    #[test]
    fn check_size_ok_at_exact_limit() {
        assert!(check_size(MAX_RESPONSE_SIZE, "http://example.com").is_ok());
    }

    #[test]
    fn check_size_fails_for_large_content() {
        assert!(check_size(MAX_RESPONSE_SIZE + 1, "http://example.com").is_err());
    }

    /// Content types with no dedicated handling are passed through verbatim.
    #[test]
    fn process_content_passes_through_unknown_types() {
        assert_eq!(process_content("plain text", "text/plain"), "plain text");
    }

    /// JSON content types are prettified even when parameters follow the type.
    #[test]
    fn process_content_prettifies_json() {
        let result = process_content(r#"{"a":1}"#, "application/json; charset=utf-8");
        assert!(result.contains("\"a\": 1"));
    }
}