Skip to main content

web_retrieval/
fetch.rs

1// TODO(1): Add optional JS-rendering sidecar for dynamic pages (Playwright/etc.)
2
3use agentic_tools_core::error::ToolError;
4use chrono::Utc;
5
6use crate::WebTools;
7use crate::types::{WebFetchInput, WebFetchOutput};
8
/// Download cap used when the caller does not supply `max_bytes`: 5 MB (5 × 2^20 bytes).
const DEFAULT_MAX_BYTES: usize = 5 * (1 << 20);

/// Largest `max_bytes` a caller may request: 20 MB (20 × 2^20 bytes).
pub const HARD_MAX_BYTES: usize = 20 * (1 << 20);
14
15/// Execute a web fetch: download URL, convert content, optionally summarize.
16///
17/// # Errors
18/// Returns `ToolError` if the HTTP request fails, the content type is unsupported, or summarization fails.
19pub async fn web_fetch(
20    tools: &WebTools,
21    input: WebFetchInput,
22) -> Result<WebFetchOutput, ToolError> {
23    let max_bytes = input.max_bytes.unwrap_or(DEFAULT_MAX_BYTES);
24
25    if max_bytes > HARD_MAX_BYTES {
26        return Err(ToolError::invalid_input(format!(
27            "max_bytes must be <= {HARD_MAX_BYTES} (20MB)"
28        )));
29    }
30
31    // Send GET request
32    let mut response = tools
33        .http
34        .get(&input.url)
35        .send()
36        .await
37        .map_err(|e| ToolError::external(format!("HTTP request failed: {e}")))?;
38
39    let status = response.status();
40    if !status.is_success() {
41        return Err(ToolError::external(format!(
42            "HTTP request failed with status {status} for {}",
43            response.url()
44        )));
45    }
46
47    let final_url = response.url().to_string();
48    let content_type = response
49        .headers()
50        .get(reqwest::header::CONTENT_TYPE)
51        .and_then(|v| v.to_str().ok())
52        .unwrap_or("")
53        .to_string();
54
55    // Download body with size cap (streaming)
56    #[allow(clippy::cast_possible_truncation)]
57    // max_bytes is already bounded by HARD_MAX_BYTES (20MB)
58    let initial_capacity = response
59        .content_length()
60        .map_or(8 * 1024, |len| len.min(max_bytes as u64) as usize)
61        .min(max_bytes);
62
63    let mut bytes: Vec<u8> = Vec::with_capacity(initial_capacity);
64    let mut truncated = false;
65
66    loop {
67        // Conservative: once we reach the cap, stop without attempting to read more
68        if bytes.len() >= max_bytes {
69            truncated = true;
70            break;
71        }
72
73        let Some(chunk) = response
74            .chunk()
75            .await
76            .map_err(|e| ToolError::external(format!("Failed to read response body: {e}")))?
77        else {
78            break;
79        };
80
81        let remaining = max_bytes - bytes.len();
82        if chunk.len() > remaining {
83            bytes.extend_from_slice(&chunk[..remaining]);
84            truncated = true;
85            break;
86        }
87
88        bytes.extend_from_slice(&chunk);
89    }
90
91    // Convert based on content-type
92    let (title, content) = decode_and_convert(&bytes, &content_type)?;
93
94    let word_count = content.split_whitespace().count();
95
96    // Optional summarization
97    let summary = if input.summarize {
98        Some(
99            crate::haiku::summarize_markdown(tools, &content)
100                .await
101                .map_err(|e| ToolError::external(format!("Summarization failed: {e}")))?,
102        )
103    } else {
104        None
105    };
106
107    Ok(WebFetchOutput {
108        final_url,
109        title,
110        content_type,
111        word_count,
112        truncated,
113        retrieved_at: Utc::now(),
114        content,
115        summary,
116    })
117}
118
119/// Decode bytes and convert to a useful text format based on content-type.
120///
121/// # Errors
122/// Returns `ToolError` if the content type is unsupported or HTML conversion fails.
123pub fn decode_and_convert(
124    bytes: &[u8],
125    content_type: &str,
126) -> Result<(Option<String>, String), ToolError> {
127    let ct_lower = content_type.to_lowercase();
128
129    // Try to decode as UTF-8
130    let text = String::from_utf8_lossy(bytes);
131
132    if ct_lower.contains("text/html") || (ct_lower.is_empty() && looks_like_html(&text)) {
133        let title = extract_title(&text);
134        let md = htmd::convert(&text)
135            .map_err(|e| ToolError::internal(format!("HTML conversion failed: {e}")))?;
136        Ok((title, md))
137    } else if ct_lower.contains("application/json") || ct_lower.contains("+json") {
138        // Pretty-print JSON
139        match serde_json::from_str::<serde_json::Value>(&text) {
140            Ok(val) => {
141                let pretty =
142                    serde_json::to_string_pretty(&val).unwrap_or_else(|_| text.into_owned());
143                Ok((None, pretty))
144            }
145            Err(_) => Ok((None, text.into_owned())),
146        }
147    } else if ct_lower.starts_with("text/") || ct_lower.is_empty() {
148        Ok((None, text.into_owned()))
149    } else {
150        // Binary or unsupported content type
151        Err(ToolError::invalid_input(format!(
152            "Unsupported content type: {content_type}. Only HTML, text, and JSON are supported."
153        )))
154    }
155}
156
/// Best-effort `<title>` extraction from HTML.
///
/// Finds the first `<title ...>` opening tag (ASCII case-insensitive, attributes
/// allowed) and returns the trimmed text up to the next `</title>`. Returns `None`
/// when there is no complete title element or the title is empty after trimming.
///
/// Only a real `title` tag is accepted: the characters `<title` must be followed by
/// `>`, `/`, or ASCII whitespace. Tags that merely start with the same letters, such
/// as `<titlebar>` or `<title-card>`, are skipped instead of being mistaken for the
/// title element.
#[must_use]
pub fn extract_title(html: &str) -> Option<String> {
    const OPEN: &str = "<title";
    const CLOSE: &str = "</title>";

    // ASCII-only lowercasing keeps byte offsets identical between `lower` and `html`,
    // so indices found in one can be used to slice the other.
    let lower = html.to_ascii_lowercase();

    // Scan for an opening tag whose name is exactly `title`.
    let mut search_from = 0;
    let tag_start = loop {
        let candidate = search_from + lower[search_from..].find(OPEN)?;
        let name_end = candidate + OPEN.len();
        match lower.as_bytes().get(name_end).copied() {
            Some(b'>' | b'/') => break candidate,
            Some(byte) if byte.is_ascii_whitespace() => break candidate,
            // A longer tag name (e.g. `<titlebar`): keep looking after it.
            // `name_end` follows ASCII text, so it is a valid char boundary.
            Some(_) => search_from = name_end,
            // Input ends right after `<title`: no complete tag.
            None => return None,
        }
    };

    let text_start = tag_start + lower[tag_start..].find('>')? + 1;
    let text_len = lower[text_start..].find(CLOSE)?;
    let title = html[text_start..text_start + text_len].trim();

    if title.is_empty() {
        None
    } else {
        Some(title.to_string())
    }
}
172
/// Simple heuristic to detect HTML content.
///
/// Returns `true` when the text, after leading whitespace, starts with a doctype
/// declaration (`<!doctype`) or an `<html` tag. The comparison ignores ASCII case,
/// so `<HTML>`, `<Html>` and `<!Doctype html>` are all recognised.
fn looks_like_html(text: &str) -> bool {
    const MARKERS: [&str; 2] = ["<!doctype", "<html"];

    // Compare raw bytes so a multi-byte character near the start can never cause a
    // slice to land off a char boundary.
    let head = text.trim_start().as_bytes();
    MARKERS.iter().any(|marker| {
        head.len() >= marker.len()
            && head[..marker.len()].eq_ignore_ascii_case(marker.as_bytes())
    })
}
180
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_decode_html() {
        let page = b"<html><head><title>Test Page</title></head><body><h1>Hello</h1><p>World</p></body></html>";
        let (title, markdown) = decode_and_convert(page, "text/html").unwrap();
        assert_eq!(title, Some("Test Page".to_string()));
        for expected in ["Hello", "World"] {
            assert!(markdown.contains(expected));
        }
    }

    #[test]
    fn test_decode_json() {
        let compact = br#"{"key":"value","num":42}"#;
        let (title, pretty) = decode_and_convert(compact, "application/json").unwrap();
        assert_eq!(title, None);
        // Pretty-printing puts a space after the colon.
        assert!(pretty.contains("\"key\": \"value\""));
    }

    #[test]
    fn test_decode_plain_text() {
        let (title, content) = decode_and_convert(b"Hello, world!", "text/plain").unwrap();
        assert_eq!(title, None);
        assert_eq!(content, "Hello, world!");
    }

    #[test]
    fn test_decode_binary_errors() {
        let outcome = decode_and_convert(b"\x00\x01\x02", "application/octet-stream");
        assert!(outcome.is_err());
    }

    #[test]
    fn test_extract_title() {
        let with_title = "<html><head><title>My Page</title></head></html>";
        assert_eq!(extract_title(with_title), Some("My Page".into()));

        // Missing and empty titles both yield `None`.
        for page in ["<html><head></head></html>", "<title></title>"] {
            assert_eq!(extract_title(page), None);
        }
    }

    #[test]
    fn test_looks_like_html() {
        for page in ["<!DOCTYPE html><html>", "  <html>"] {
            assert!(looks_like_html(page));
        }
        assert!(!looks_like_html("Hello, world!"));
    }

    #[test]
    fn test_extract_title_unicode_before_tag() {
        // Turkish İ changes byte length under full Unicode lowercasing (2→3 bytes),
        // which would shift offsets and panic or corrupt the slice; ASCII-only
        // lowercasing must keep the offsets aligned.
        let title = extract_title("İ<title>Test Page</title>");
        assert_eq!(title, Some("Test Page".to_string()));
    }

    #[test]
    fn test_extract_title_mixed_case_tags() {
        // ASCII case-insensitive tag matching must keep working.
        let cases = [
            ("<TITLE>Upper</TITLE>", "Upper"),
            ("<TiTlE>Mixed</TiTlE>", "Mixed"),
        ];
        for (page, expected) in cases {
            assert_eq!(extract_title(page), Some(expected.to_string()));
        }
    }

    mod integration {
        use super::*;
        use crate::WebTools;
        use crate::types::WebFetchInput;
        use wiremock::matchers::method;
        use wiremock::{Mock, MockServer, ResponseTemplate};

        /// Start a mock server that answers every GET with `template`.
        async fn serve(template: ResponseTemplate) -> MockServer {
            let server = MockServer::start().await;
            Mock::given(method("GET"))
                .respond_with(template)
                .mount(&server)
                .await;
            server
        }

        /// Tools backed by a fresh default HTTP client.
        fn tools() -> WebTools {
            WebTools::with_http_client(reqwest::Client::new())
        }

        /// Fetch input pointing at the server root, no summary, default size cap.
        fn input_for(server: &MockServer) -> WebFetchInput {
            WebFetchInput {
                url: server.uri(),
                summarize: false,
                max_bytes: None,
            }
        }

        #[tokio::test]
        async fn web_fetch_returns_error_on_404() {
            let server = serve(ResponseTemplate::new(404).set_body_string("Not Found")).await;

            let err = web_fetch(&tools(), input_for(&server))
                .await
                .expect_err("Expected error for 404 response");
            assert!(
                err.to_string().contains("404"),
                "Error message should mention 404 status"
            );
        }

        #[tokio::test]
        async fn web_fetch_returns_error_on_500() {
            let server =
                serve(ResponseTemplate::new(500).set_body_string("Internal Server Error")).await;

            let err = web_fetch(&tools(), input_for(&server))
                .await
                .expect_err("Expected error for 500 response");
            assert!(
                err.to_string().contains("500"),
                "Error message should mention 500 status"
            );
        }

        #[tokio::test]
        async fn web_fetch_succeeds_on_200() {
            let template = ResponseTemplate::new(200)
                .set_body_string("Hello, world!")
                .insert_header("Content-Type", "text/plain");
            let server = serve(template).await;

            let output = web_fetch(&tools(), input_for(&server))
                .await
                .expect("Expected success for 200 response");
            assert_eq!(output.content, "Hello, world!");
        }

        #[tokio::test]
        async fn web_fetch_detects_html_without_content_type() {
            let html = b"<!DOCTYPE html><html><head><title>Test Page</title></head><body><p>Hello</p></body></html>";

            // HTML response with NO Content-Type header (misconfigured server).
            // set_body_bytes avoids wiremock's automatic text/plain Content-Type.
            let server =
                serve(ResponseTemplate::new(200).set_body_bytes(html.as_slice())).await;

            let output = web_fetch(&tools(), input_for(&server))
                .await
                .expect("Expected success for HTML without Content-Type");

            // No header was sent, so the reported content type must be empty.
            assert!(
                output.content_type.is_empty(),
                "Content-Type should be empty, got: {}",
                output.content_type
            );

            // The HTML heuristic must have kicked in and produced Markdown.
            assert_eq!(
                output.title.as_deref(),
                Some("Test Page"),
                "Should extract title via looks_like_html heuristic"
            );
            assert!(
                output.content.contains("Hello"),
                "Content should be converted"
            );
            assert!(
                !output.content.contains("<p>"),
                "HTML tags should be removed by markdown conversion"
            );
        }
    }
}