llm_coding_tools_core/operations/webfetch/
mod.rs1use crate::error::{ToolError, ToolResult};
4use html_to_markdown_rs::{convert, ConversionOptions, PreprocessingOptions, PreprocessingPreset};
5
/// Upper bound, in bytes, on an accepted response size (5 MiB).
///
/// [`check_size`] rejects any length strictly greater than this value.
pub(crate) const MAX_RESPONSE_SIZE: usize = 5 * 1024 * 1024;
8
/// Result of a web fetch operation.
///
/// Derives `PartialEq`/`Eq` in addition to `Debug`/`Clone` so that outputs can
/// be compared directly (for example with `assert_eq!` in tests).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct WebFetchOutput {
    /// Textual content handed back to the caller.
    // NOTE(review): presumably the body after content-type specific
    // processing (HTML -> Markdown, JSON pretty-printing) - confirm in `fetch_url`.
    pub content: String,
    /// Content type associated with the response.
    // NOTE(review): presumably the raw `Content-Type` header value - confirm in `fetch_url`.
    pub content_type: String,
    /// Length in bytes.
    // NOTE(review): unclear from this file whether this counts the raw body or
    // the processed `content` - confirm in `fetch_url`.
    pub byte_length: usize,
}
19
20pub(crate) fn process_content(raw_content: &str, content_type: &str) -> String {
22 if content_type.contains("text/html") {
23 html_to_markdown(raw_content)
24 } else if content_type.contains("application/json") {
25 format_json(raw_content)
26 } else {
27 raw_content.to_owned()
28 }
29}
30
31pub(crate) fn categorize_reqwest_error(e: reqwest::Error, url: &str) -> ToolError {
33 if e.is_timeout() {
34 ToolError::Timeout(format!("Request timed out for {}", url))
35 } else if e.is_connect() {
36 ToolError::Http(format!("Connection failed for {}: {}", url, e))
37 } else if e.is_redirect() {
38 ToolError::Http(format!("Too many redirects for {}", url))
39 } else {
40 ToolError::Http(e.to_string())
41 }
42}
43
44#[inline]
46pub(crate) fn check_size(len: usize, url: &str) -> ToolResult<()> {
47 if len > MAX_RESPONSE_SIZE {
48 return Err(ToolError::Http(format!(
49 "Response too large: {} bytes (max {}) for {}",
50 len, MAX_RESPONSE_SIZE, url
51 )));
52 }
53 Ok(())
54}
55
56pub fn html_to_markdown(html: &str) -> String {
58 let options = ConversionOptions {
59 preprocessing: PreprocessingOptions {
60 enabled: true,
61 preset: PreprocessingPreset::Aggressive,
62 remove_navigation: true,
63 remove_forms: true,
64 },
65 strip_tags: vec![
66 "img".into(),
67 "svg".into(),
68 "script".into(),
69 "style".into(),
70 "noscript".into(),
71 ],
72 ..Default::default()
73 };
74
75 convert(html, Some(options)).unwrap_or_else(|_| html.to_string())
76}
77
78pub fn format_json(json_str: &str) -> String {
80 match serde_json::from_str::<serde_json::Value>(json_str) {
81 Ok(value) => serde_json::to_string_pretty(&value).unwrap_or_else(|_| json_str.to_string()),
82 Err(_) => json_str.to_string(),
83 }
84}
85
86#[cfg(not(feature = "blocking"))]
87mod async_impl;
88#[cfg(not(feature = "blocking"))]
89pub use async_impl::fetch_url;
90
91#[cfg(feature = "blocking")]
92mod blocking_impl;
93#[cfg(feature = "blocking")]
94pub use blocking_impl::fetch_url;
95
#[cfg(test)]
mod tests {
    use super::{check_size, format_json, html_to_markdown, MAX_RESPONSE_SIZE};

    /// URL used wherever a test only needs some address for error messages.
    const SAMPLE_URL: &str = "http://example.com";

    /// Script bodies must not show up in the Markdown output.
    #[test]
    fn html_to_markdown_strips_scripts() {
        let markdown = html_to_markdown("<p>Before</p><script>alert('xss')</script><p>After</p>");
        assert!(
            !markdown.contains("alert"),
            "script body leaked into output: {}",
            markdown
        );
    }

    /// Compact JSON is expanded so that keys and values are space-separated.
    #[test]
    fn format_json_prettifies() {
        let pretty = format_json(r#"{"a":1}"#);
        assert!(
            pretty.contains(r#""a": 1"#),
            "unexpected formatting: {}",
            pretty
        );
    }

    /// Input that does not parse as JSON is handed back untouched.
    #[test]
    fn format_json_returns_original_on_invalid() {
        let garbage = "not json";
        assert_eq!(format_json(garbage), garbage);
    }

    /// A length well under the limit is accepted.
    #[test]
    fn check_size_ok_for_small_content() {
        let outcome = check_size(1000, SAMPLE_URL);
        assert!(outcome.is_ok());
    }

    /// One byte over the limit is rejected.
    #[test]
    fn check_size_fails_for_large_content() {
        let oversized = MAX_RESPONSE_SIZE + 1;
        assert!(check_size(oversized, SAMPLE_URL).is_err());
    }
}