Skip to main content

web_retrieval/
fetch.rs

1// TODO(1): Add optional JS-rendering sidecar for dynamic pages (Playwright/etc.)
2
3use agentic_tools_core::ToolContext;
4use agentic_tools_core::error::ToolError;
5use chrono::Utc;
6
7use crate::WebTools;
8use crate::types::WebFetchInput;
9use crate::types::WebFetchOutput;
10
/// Upper bound a caller may request for `max_bytes`: 20 MiB (20 * 2^20 bytes).
pub const HARD_MAX_BYTES: usize = 20 << 20;
13
14/// Execute a web fetch: download URL, convert content, optionally summarize.
15///
16/// # Errors
17/// Returns `ToolError` if the HTTP request fails, the content type is unsupported, or summarization fails.
18pub async fn web_fetch(
19    tools: &WebTools,
20    input: WebFetchInput,
21    ctx: &ToolContext,
22) -> Result<WebFetchOutput, ToolError> {
23    #[expect(clippy::cast_possible_truncation)]
24    let default_max_bytes = tools.cfg.default_max_bytes as usize;
25    let max_bytes = input.max_bytes.unwrap_or(default_max_bytes);
26
27    if max_bytes > HARD_MAX_BYTES {
28        return Err(ToolError::invalid_input(format!(
29            "max_bytes must be <= {HARD_MAX_BYTES} (20MB)"
30        )));
31    }
32
33    if ctx.is_cancelled() {
34        return Err(ToolError::cancelled(None));
35    }
36
37    // Send GET request
38    let mut response = ctx
39        .run_cancellable(async {
40            tools
41                .http
42                .get(&input.url)
43                .send()
44                .await
45                .map_err(|e| ToolError::external(format!("HTTP request failed: {e}")))
46        })
47        .await?;
48
49    let status = response.status();
50    if !status.is_success() {
51        return Err(ToolError::external(format!(
52            "HTTP request failed with status {status} for {}",
53            response.url()
54        )));
55    }
56
57    let final_url = response.url().to_string();
58    let content_type = response
59        .headers()
60        .get(reqwest::header::CONTENT_TYPE)
61        .and_then(|v| v.to_str().ok())
62        .unwrap_or("")
63        .to_string();
64
65    // Download body with size cap (streaming)
66    #[expect(clippy::cast_possible_truncation)]
67    // max_bytes is already bounded by HARD_MAX_BYTES (20MB)
68    let initial_capacity = response
69        .content_length()
70        .map_or(8 * 1024, |len| len.min(max_bytes as u64) as usize)
71        .min(max_bytes);
72
73    let mut bytes: Vec<u8> = Vec::with_capacity(initial_capacity);
74    let mut truncated = false;
75
76    loop {
77        // Conservative: once we reach the cap, stop without attempting to read more
78        if bytes.len() >= max_bytes {
79            truncated = true;
80            break;
81        }
82
83        if ctx.is_cancelled() {
84            return Err(ToolError::cancelled(None));
85        }
86
87        let Some(chunk) = ctx
88            .run_cancellable(async {
89                response
90                    .chunk()
91                    .await
92                    .map_err(|e| ToolError::external(format!("Failed to read response body: {e}")))
93            })
94            .await?
95        else {
96            break;
97        };
98
99        let remaining = max_bytes - bytes.len();
100        if chunk.len() > remaining {
101            bytes.extend_from_slice(&chunk[..remaining]);
102            truncated = true;
103            break;
104        }
105
106        bytes.extend_from_slice(&chunk);
107    }
108
109    // Convert based on content-type
110    let (title, content) = decode_and_convert(&bytes, &content_type)?;
111
112    let word_count = content.split_whitespace().count();
113
114    // Optional summarization
115    let summary = summarize_content_if_requested(tools, &content, input.summarize, ctx).await?;
116
117    if ctx.is_cancelled() {
118        return Err(ToolError::cancelled(None));
119    }
120
121    Ok(WebFetchOutput {
122        final_url,
123        title,
124        content_type,
125        word_count,
126        truncated,
127        retrieved_at: Utc::now(),
128        content,
129        summary,
130    })
131}
132
133async fn summarize_content_if_requested(
134    tools: &WebTools,
135    content: &str,
136    summarize: bool,
137    ctx: &ToolContext,
138) -> Result<Option<String>, ToolError> {
139    if !summarize {
140        return Ok(None);
141    }
142
143    match crate::haiku::summarize_markdown(tools, content, ctx).await {
144        Ok(summary) => Ok(Some(summary)),
145        Err(ToolError::Cancelled { reason }) => Err(ToolError::Cancelled { reason }),
146        Err(e) => Err(ToolError::external(format!("Summarization failed: {e}"))),
147    }
148}
149
150/// Decode bytes and convert to a useful text format based on content-type.
151///
152/// # Errors
153/// Returns `ToolError` if the content type is unsupported or HTML conversion fails.
154pub fn decode_and_convert(
155    bytes: &[u8],
156    content_type: &str,
157) -> Result<(Option<String>, String), ToolError> {
158    let ct_lower = content_type.to_lowercase();
159
160    // Try to decode as UTF-8
161    let text = String::from_utf8_lossy(bytes);
162
163    if ct_lower.contains("text/html") || (ct_lower.is_empty() && looks_like_html(&text)) {
164        let title = extract_title(&text);
165        let md = htmd::convert(&text)
166            .map_err(|e| ToolError::internal(format!("HTML conversion failed: {e}")))?;
167        Ok((title, md))
168    } else if ct_lower.contains("application/json") || ct_lower.contains("+json") {
169        // Pretty-print JSON
170        match serde_json::from_str::<serde_json::Value>(&text) {
171            Ok(val) => {
172                let pretty =
173                    serde_json::to_string_pretty(&val).unwrap_or_else(|_| text.into_owned());
174                Ok((None, pretty))
175            }
176            Err(_) => Ok((None, text.into_owned())),
177        }
178    } else if ct_lower.starts_with("text/") || ct_lower.is_empty() {
179        Ok((None, text.into_owned()))
180    } else {
181        // Binary or unsupported content type
182        Err(ToolError::invalid_input(format!(
183            "Unsupported content type: {content_type}. Only HTML, text, and JSON are supported."
184        )))
185    }
186}
187
/// Best-effort `<title>` extraction from HTML.
///
/// Tag matching is ASCII case-insensitive and respects tag-name boundaries, so elements
/// whose names merely start with `title` (for example `<title-bar>`) are skipped. The
/// closing tag may carry whitespace before `>` (`</title >`). The returned text is trimmed;
/// `None` is returned when no complete title element is found or the title is blank.
#[must_use]
pub fn extract_title(html: &str) -> Option<String> {
    // ASCII lowercasing keeps byte offsets identical to `html`, so indices found in
    // `lower` can be used to slice the original text safely.
    let lower = html.to_ascii_lowercase();
    let open = find_tag_start(&lower, "<title", 0)?;
    let content_start = open + lower[open..].find('>')? + 1;
    let content_end = find_tag_start(&lower, "</title", content_start)?;
    let title = html[content_start..content_end].trim();
    if title.is_empty() {
        None
    } else {
        Some(title.to_string())
    }
}

/// Find the first occurrence of the ASCII `tag` prefix at or after `from` that ends the tag
/// name, i.e. is directly followed by `>`, `/`, or ASCII whitespace.
fn find_tag_start(lower: &str, tag: &str, from: usize) -> Option<usize> {
    let mut offset = from;
    while let Some(rel) = lower[offset..].find(tag) {
        let at = offset + rel;
        let after = at + tag.len();
        match lower.as_bytes().get(after).copied() {
            Some(b'>' | b'/') => return Some(at),
            Some(byte) if byte.is_ascii_whitespace() => return Some(at),
            // Longer tag name (or end of input): keep scanning after this match
            _ => offset = after,
        }
    }
    None
}
203
/// Simple heuristic to detect HTML content.
///
/// Returns `true` when the text, after an optional leading byte-order mark and leading
/// whitespace, starts with `<!doctype` or `<html`. The comparison is ASCII
/// case-insensitive, so `<HTML>` and `<!DocType html>` are recognized as well.
fn looks_like_html(text: &str) -> bool {
    let head = text.trim_start_matches('\u{feff}').trim_start().as_bytes();
    ["<!doctype", "<html"].iter().any(|prefix| {
        // Compare bytes so a multi-byte character near the cut cannot cause a slicing panic
        head.get(..prefix.len())
            .is_some_and(|start| start.eq_ignore_ascii_case(prefix.as_bytes()))
    })
}
211
#[cfg(test)]
mod tests {
    use super::*;

    /// HTML input yields the extracted title and converted body text.
    #[test]
    fn test_decode_html() {
        let page = b"<html><head><title>Test Page</title></head><body><h1>Hello</h1><p>World</p></body></html>";
        let (title, markdown) = decode_and_convert(page, "text/html").unwrap();
        assert_eq!(title.as_deref(), Some("Test Page"));
        for expected in ["Hello", "World"] {
            assert!(markdown.contains(expected));
        }
    }

    /// JSON input is pretty-printed and carries no title.
    #[test]
    fn test_decode_json() {
        let payload = br#"{"key":"value","num":42}"#;
        let (title, rendered) = decode_and_convert(payload, "application/json").unwrap();
        assert!(title.is_none());
        assert!(rendered.contains(r#""key": "value""#));
    }

    /// Plain text passes through untouched.
    #[test]
    fn test_decode_plain_text() {
        let (title, body) = decode_and_convert(b"Hello, world!", "text/plain").unwrap();
        assert!(title.is_none());
        assert_eq!(body, "Hello, world!");
    }

    /// Binary content types are rejected.
    #[test]
    fn test_decode_binary_errors() {
        assert!(decode_and_convert(b"\x00\x01\x02", "application/octet-stream").is_err());
    }

    #[test]
    fn test_extract_title() {
        let with_title = extract_title("<html><head><title>My Page</title></head></html>");
        assert_eq!(with_title, Some("My Page".to_string()));

        let without_title = extract_title("<html><head></head></html>");
        assert_eq!(without_title, None);

        let empty_title = extract_title("<title></title>");
        assert_eq!(empty_title, None);
    }

    #[test]
    fn test_looks_like_html() {
        for html in ["<!DOCTYPE html><html>", "  <html>"] {
            assert!(looks_like_html(html));
        }
        assert!(!looks_like_html("Hello, world!"));
    }

    #[test]
    fn test_extract_title_unicode_before_tag() {
        // Turkish İ changes byte length under full Unicode lowercasing; offsets must stay valid
        let title = extract_title("İ<title>Test Page</title>");
        assert_eq!(title, Some("Test Page".to_string()));
    }

    #[test]
    fn test_extract_title_mixed_case_tags() {
        // ASCII case-insensitivity of the tag name must keep working
        let upper = extract_title("<TITLE>Upper</TITLE>");
        assert_eq!(upper, Some("Upper".to_string()));

        let mixed = extract_title("<TiTlE>Mixed</TiTlE>");
        assert_eq!(mixed, Some("Mixed".to_string()));
    }

    mod integration {
        use super::*;
        use crate::WebTools;
        use crate::types::WebFetchInput;
        use agentic_tools_core::ToolContext;
        use wiremock::Mock;
        use wiremock::MockServer;
        use wiremock::ResponseTemplate;
        use wiremock::matchers::method;

        /// Start a mock server that answers every GET request with `template`.
        async fn serve_get(template: ResponseTemplate) -> MockServer {
            let server = MockServer::start().await;
            Mock::given(method("GET"))
                .respond_with(template)
                .mount(&server)
                .await;
            server
        }

        /// Tools backed by a fresh default HTTP client.
        fn http_tools() -> WebTools {
            WebTools::with_http_client(reqwest::Client::new())
        }

        /// Fetch request for the server root: no summarization, default size cap.
        fn request_for(server: &MockServer) -> WebFetchInput {
            WebFetchInput {
                url: server.uri(),
                summarize: false,
                max_bytes: None,
            }
        }

        #[tokio::test]
        async fn web_fetch_returns_error_on_404() {
            let server = serve_get(ResponseTemplate::new(404).set_body_string("Not Found")).await;
            let tools = http_tools();

            let result = web_fetch(&tools, request_for(&server), &ToolContext::default()).await;
            assert!(result.is_err(), "Expected error for 404 response");
            let message = result.unwrap_err().to_string();
            assert!(
                message.contains("404"),
                "Error message should mention 404 status"
            );
        }

        #[tokio::test]
        async fn web_fetch_returns_error_on_500() {
            let server =
                serve_get(ResponseTemplate::new(500).set_body_string("Internal Server Error"))
                    .await;
            let tools = http_tools();

            let result = web_fetch(&tools, request_for(&server), &ToolContext::default()).await;
            assert!(result.is_err(), "Expected error for 500 response");
            let message = result.unwrap_err().to_string();
            assert!(
                message.contains("500"),
                "Error message should mention 500 status"
            );
        }

        #[tokio::test]
        async fn web_fetch_succeeds_on_200() {
            let template = ResponseTemplate::new(200)
                .set_body_string("Hello, world!")
                .insert_header("Content-Type", "text/plain");
            let server = serve_get(template).await;
            let tools = http_tools();

            let result = web_fetch(&tools, request_for(&server), &ToolContext::default()).await;
            assert!(result.is_ok(), "Expected success for 200 response");
            assert_eq!(result.unwrap().content, "Hello, world!");
        }

        #[tokio::test]
        async fn web_fetch_detects_html_without_content_type() {
            let html = b"<!DOCTYPE html><html><head><title>Test Page</title></head><body><p>Hello</p></body></html>";

            // Simulate a misconfigured server: HTML body but NO Content-Type header.
            // set_body_bytes is used because set_body_string would add text/plain automatically.
            let server =
                serve_get(ResponseTemplate::new(200).set_body_bytes(html.as_slice())).await;
            let tools = http_tools();

            let result = web_fetch(&tools, request_for(&server), &ToolContext::default()).await;
            assert!(
                result.is_ok(),
                "Expected success for HTML without Content-Type"
            );
            let output = result.unwrap();

            // No header was sent, so the reported content type must be empty
            assert!(
                output.content_type.is_empty(),
                "Content-Type should be empty, got: {}",
                output.content_type
            );

            // The HTML heuristic must have kicked in and produced Markdown
            assert_eq!(
                output.title.as_deref(),
                Some("Test Page"),
                "Should extract title via looks_like_html heuristic"
            );
            assert!(
                output.content.contains("Hello"),
                "Content should be converted"
            );
            assert!(
                !output.content.contains("<p>"),
                "HTML tags should be removed by markdown conversion"
            );
        }

        #[tokio::test]
        async fn web_fetch_returns_cancelled_before_sending_request() {
            let server = MockServer::start().await;
            let tools = http_tools();

            // Cancel up front: the fetch must fail without ever reaching the server
            let ctx = ToolContext::default();
            ctx.cancellation_token().cancel();

            let result = web_fetch(&tools, request_for(&server), &ctx).await;
            assert!(matches!(result, Err(ToolError::Cancelled { .. })));
            assert!(server.received_requests().await.unwrap().is_empty());
        }

        #[tokio::test]
        async fn summarization_preserves_cancelled_error() {
            let tools = http_tools();
            let ctx = ToolContext::default();
            ctx.cancellation_token().cancel();

            let outcome = summarize_content_if_requested(&tools, "content", true, &ctx).await;

            assert!(matches!(outcome, Err(ToolError::Cancelled { .. })));
        }
    }
}