Skip to main content

web_retrieval/
fetch.rs

1// TODO(1): Add optional JS-rendering sidecar for dynamic pages (Playwright/etc.)
2
3use agentic_tools_core::error::ToolError;
4use chrono::Utc;
5
6use crate::WebTools;
7use crate::types::WebFetchInput;
8use crate::types::WebFetchOutput;
9
/// Upper bound a caller may request for `max_bytes`: 20 MB (20 × 1024 × 1024 bytes).
pub const HARD_MAX_BYTES: usize = 20 << 20; // 1 << 20 bytes == 1 MB
12
13/// Execute a web fetch: download URL, convert content, optionally summarize.
14///
15/// # Errors
16/// Returns `ToolError` if the HTTP request fails, the content type is unsupported, or summarization fails.
17pub async fn web_fetch(
18    tools: &WebTools,
19    input: WebFetchInput,
20) -> Result<WebFetchOutput, ToolError> {
21    #[expect(clippy::cast_possible_truncation)]
22    let default_max_bytes = tools.cfg.default_max_bytes as usize;
23    let max_bytes = input.max_bytes.unwrap_or(default_max_bytes);
24
25    if max_bytes > HARD_MAX_BYTES {
26        return Err(ToolError::invalid_input(format!(
27            "max_bytes must be <= {HARD_MAX_BYTES} (20MB)"
28        )));
29    }
30
31    // Send GET request
32    let mut response = tools
33        .http
34        .get(&input.url)
35        .send()
36        .await
37        .map_err(|e| ToolError::external(format!("HTTP request failed: {e}")))?;
38
39    let status = response.status();
40    if !status.is_success() {
41        return Err(ToolError::external(format!(
42            "HTTP request failed with status {status} for {}",
43            response.url()
44        )));
45    }
46
47    let final_url = response.url().to_string();
48    let content_type = response
49        .headers()
50        .get(reqwest::header::CONTENT_TYPE)
51        .and_then(|v| v.to_str().ok())
52        .unwrap_or("")
53        .to_string();
54
55    // Download body with size cap (streaming)
56    #[expect(clippy::cast_possible_truncation)]
57    // max_bytes is already bounded by HARD_MAX_BYTES (20MB)
58    let initial_capacity = response
59        .content_length()
60        .map_or(8 * 1024, |len| len.min(max_bytes as u64) as usize)
61        .min(max_bytes);
62
63    let mut bytes: Vec<u8> = Vec::with_capacity(initial_capacity);
64    let mut truncated = false;
65
66    loop {
67        // Conservative: once we reach the cap, stop without attempting to read more
68        if bytes.len() >= max_bytes {
69            truncated = true;
70            break;
71        }
72
73        let Some(chunk) = response
74            .chunk()
75            .await
76            .map_err(|e| ToolError::external(format!("Failed to read response body: {e}")))?
77        else {
78            break;
79        };
80
81        let remaining = max_bytes - bytes.len();
82        if chunk.len() > remaining {
83            bytes.extend_from_slice(&chunk[..remaining]);
84            truncated = true;
85            break;
86        }
87
88        bytes.extend_from_slice(&chunk);
89    }
90
91    // Convert based on content-type
92    let (title, content) = decode_and_convert(&bytes, &content_type)?;
93
94    let word_count = content.split_whitespace().count();
95
96    // Optional summarization
97    let summary = if input.summarize {
98        Some(
99            crate::haiku::summarize_markdown(tools, &content)
100                .await
101                .map_err(|e| ToolError::external(format!("Summarization failed: {e}")))?,
102        )
103    } else {
104        None
105    };
106
107    Ok(WebFetchOutput {
108        final_url,
109        title,
110        content_type,
111        word_count,
112        truncated,
113        retrieved_at: Utc::now(),
114        content,
115        summary,
116    })
117}
118
119/// Decode bytes and convert to a useful text format based on content-type.
120///
121/// # Errors
122/// Returns `ToolError` if the content type is unsupported or HTML conversion fails.
123pub fn decode_and_convert(
124    bytes: &[u8],
125    content_type: &str,
126) -> Result<(Option<String>, String), ToolError> {
127    let ct_lower = content_type.to_lowercase();
128
129    // Try to decode as UTF-8
130    let text = String::from_utf8_lossy(bytes);
131
132    if ct_lower.contains("text/html") || (ct_lower.is_empty() && looks_like_html(&text)) {
133        let title = extract_title(&text);
134        let md = htmd::convert(&text)
135            .map_err(|e| ToolError::internal(format!("HTML conversion failed: {e}")))?;
136        Ok((title, md))
137    } else if ct_lower.contains("application/json") || ct_lower.contains("+json") {
138        // Pretty-print JSON
139        match serde_json::from_str::<serde_json::Value>(&text) {
140            Ok(val) => {
141                let pretty =
142                    serde_json::to_string_pretty(&val).unwrap_or_else(|_| text.into_owned());
143                Ok((None, pretty))
144            }
145            Err(_) => Ok((None, text.into_owned())),
146        }
147    } else if ct_lower.starts_with("text/") || ct_lower.is_empty() {
148        Ok((None, text.into_owned()))
149    } else {
150        // Binary or unsupported content type
151        Err(ToolError::invalid_input(format!(
152            "Unsupported content type: {content_type}. Only HTML, text, and JSON are supported."
153        )))
154    }
155}
156
/// Best-effort `<title>` extraction from HTML.
///
/// Finds the first opening `title` tag (ASCII case-insensitive, attributes allowed) and
/// returns the trimmed text up to the next `</title>`. Tags that merely start with
/// `title` (for example `<titles>` or `<title-bar>`) are skipped rather than mistaken for
/// the title element.
///
/// Returns `None` when no complete `<title>…</title>` pair exists or the title is blank.
#[must_use]
pub fn extract_title(html: &str) -> Option<String> {
    const OPEN: &str = "<title";
    const CLOSE: &str = "</title>";

    // ASCII-only lowercasing never changes byte lengths, so offsets found in `lower`
    // are valid offsets (and char boundaries) in `html` as well.
    let lower = html.to_ascii_lowercase();

    // Locate the first `<title` that is followed by a tag-name boundary.
    let mut search_from = 0;
    let name_end = loop {
        let candidate = search_from + lower[search_from..].find(OPEN)? + OPEN.len();
        match lower.as_bytes().get(candidate).copied() {
            Some(b'>' | b'/') => break candidate,
            Some(byte) if byte.is_ascii_whitespace() => break candidate,
            // A longer tag name (or end of input): keep scanning after this match.
            _ => search_from = candidate,
        }
    };

    let title_start = name_end + lower[name_end..].find('>')? + 1;
    let title_end = title_start + lower[title_start..].find(CLOSE)?;

    let title = html[title_start..title_end].trim();
    (!title.is_empty()).then(|| title.to_string())
}
172
/// Simple heuristic to detect HTML content.
///
/// Returns `true` when the text, after an optional UTF-8 byte-order mark and leading
/// whitespace, starts with `<!doctype` or `<html` in any ASCII letter case.
fn looks_like_html(text: &str) -> bool {
    // U+FEFF is not whitespace, so it has to be stripped explicitly before trimming.
    let head = text
        .trim_start_matches('\u{feff}')
        .trim_start()
        .as_bytes();

    // Compare on bytes so a short or non-ASCII prefix can never cause a slicing panic.
    ["<!doctype", "<html"].iter().any(|prefix| {
        head.get(..prefix.len())
            .is_some_and(|start| start.eq_ignore_ascii_case(prefix.as_bytes()))
    })
}
180
#[cfg(test)]
mod tests {
    use super::*;

    /// HTML input yields the page title and markdown that still contains the body text.
    #[test]
    fn test_decode_html() {
        let page = b"<html><head><title>Test Page</title></head><body><h1>Hello</h1><p>World</p></body></html>";
        let (title, markdown) = decode_and_convert(page, "text/html").unwrap();
        assert_eq!(title.as_deref(), Some("Test Page"));
        for expected in ["Hello", "World"] {
            assert!(markdown.contains(expected));
        }
    }

    /// Compact JSON comes back pretty-printed and without a title.
    #[test]
    fn test_decode_json() {
        let payload = br#"{"key":"value","num":42}"#;
        let (title, rendered) = decode_and_convert(payload, "application/json").unwrap();
        assert!(title.is_none());
        assert!(rendered.contains("\"key\": \"value\""));
    }

    /// Plain text passes through unchanged.
    #[test]
    fn test_decode_plain_text() {
        let raw = b"Hello, world!";
        let (title, body) = decode_and_convert(raw, "text/plain").unwrap();
        assert!(title.is_none());
        assert_eq!(body, "Hello, world!");
    }

    /// Binary content types are rejected.
    #[test]
    fn test_decode_binary_errors() {
        let outcome = decode_and_convert(b"\x00\x01\x02", "application/octet-stream");
        assert!(outcome.is_err());
    }

    /// Title present, title tag missing, and empty title.
    #[test]
    fn test_extract_title() {
        let cases = [
            (
                "<html><head><title>My Page</title></head></html>",
                Some("My Page".to_string()),
            ),
            ("<html><head></head></html>", None),
            ("<title></title>", None),
        ];
        for (html, expected) in cases {
            assert_eq!(extract_title(html), expected);
        }
    }

    /// Doctype and `<html` prefixes are detected; ordinary text is not.
    #[test]
    fn test_looks_like_html() {
        for html in ["<!DOCTYPE html><html>", "  <html>"] {
            assert!(looks_like_html(html));
        }
        assert!(!looks_like_html("Hello, world!"));
    }

    /// Regression guard: Unicode lowercasing of Turkish İ changes its byte length
    /// (2→3 bytes), which would shift offsets; ASCII-only lowercasing must not.
    #[test]
    fn test_extract_title_unicode_before_tag() {
        let found = extract_title("İ<title>Test Page</title>");
        assert_eq!(found, Some("Test Page".to_string()));
    }

    /// Tag matching stays ASCII case-insensitive.
    #[test]
    fn test_extract_title_mixed_case_tags() {
        let cases = [
            ("<TITLE>Upper</TITLE>", "Upper"),
            ("<TiTlE>Mixed</TiTlE>", "Mixed"),
        ];
        for (html, expected) in cases {
            assert_eq!(extract_title(html), Some(expected.to_string()));
        }
    }

    mod integration {
        use super::*;
        use crate::WebTools;
        use crate::types::WebFetchInput;
        use wiremock::Mock;
        use wiremock::MockServer;
        use wiremock::ResponseTemplate;
        use wiremock::matchers::method;

        /// Start a mock server that answers every GET with `template`.
        async fn serve(template: ResponseTemplate) -> MockServer {
            let server = MockServer::start().await;
            Mock::given(method("GET"))
                .respond_with(template)
                .mount(&server)
                .await;
            server
        }

        /// Tools backed by a fresh default `reqwest` client.
        fn default_tools() -> WebTools {
            WebTools::with_http_client(reqwest::Client::new())
        }

        /// Fetch input aimed at the mock server: no summary, default byte cap.
        fn request_for(server: &MockServer) -> WebFetchInput {
            WebFetchInput {
                url: server.uri(),
                summarize: false,
                max_bytes: None,
            }
        }

        #[tokio::test]
        async fn web_fetch_returns_error_on_404() {
            let server = serve(ResponseTemplate::new(404).set_body_string("Not Found")).await;
            let tools = default_tools();

            let outcome = web_fetch(&tools, request_for(&server)).await;
            assert!(outcome.is_err(), "Expected error for 404 response");
            let failure = outcome.unwrap_err();
            assert!(
                failure.to_string().contains("404"),
                "Error message should mention 404 status"
            );
        }

        #[tokio::test]
        async fn web_fetch_returns_error_on_500() {
            let server =
                serve(ResponseTemplate::new(500).set_body_string("Internal Server Error")).await;
            let tools = default_tools();

            let outcome = web_fetch(&tools, request_for(&server)).await;
            assert!(outcome.is_err(), "Expected error for 500 response");
            let failure = outcome.unwrap_err();
            assert!(
                failure.to_string().contains("500"),
                "Error message should mention 500 status"
            );
        }

        #[tokio::test]
        async fn web_fetch_succeeds_on_200() {
            let template = ResponseTemplate::new(200)
                .set_body_string("Hello, world!")
                .insert_header("Content-Type", "text/plain");
            let server = serve(template).await;
            let tools = default_tools();

            let outcome = web_fetch(&tools, request_for(&server)).await;
            assert!(outcome.is_ok(), "Expected success for 200 response");
            let fetched = outcome.unwrap();
            assert_eq!(fetched.content, "Hello, world!");
        }

        #[tokio::test]
        async fn web_fetch_detects_html_without_content_type() {
            let html = b"<!DOCTYPE html><html><head><title>Test Page</title></head><body><p>Hello</p></body></html>";

            // Misconfigured server: HTML body with NO Content-Type header.
            // set_body_bytes avoids wiremock's automatic text/plain Content-Type.
            let server =
                serve(ResponseTemplate::new(200).set_body_bytes(html.as_slice())).await;
            let tools = default_tools();

            let outcome = web_fetch(&tools, request_for(&server)).await;
            assert!(
                outcome.is_ok(),
                "Expected success for HTML without Content-Type"
            );
            let fetched = outcome.unwrap();

            // No header means an empty content_type in the output
            assert!(
                fetched.content_type.is_empty(),
                "Content-Type should be empty, got: {}",
                fetched.content_type
            );

            // The HTML heuristic must have kicked in and produced markdown
            assert_eq!(
                fetched.title.as_deref(),
                Some("Test Page"),
                "Should extract title via looks_like_html heuristic"
            );
            assert!(
                fetched.content.contains("Hello"),
                "Content should be converted"
            );
            assert!(
                !fetched.content.contains("<p>"),
                "HTML tags should be removed by markdown conversion"
            );
        }
    }
}
399}