1use agentic_tools_core::ToolContext;
4use agentic_tools_core::error::ToolError;
5use chrono::Utc;
6
7use crate::WebTools;
8use crate::types::WebFetchInput;
9use crate::types::WebFetchOutput;
10
/// Largest `max_bytes` value [`web_fetch`] accepts: 20 MiB.
///
/// Requests asking for more than this are rejected as invalid input.
pub const HARD_MAX_BYTES: usize = 20 * (1 << 20);
13
14pub async fn web_fetch(
19 tools: &WebTools,
20 input: WebFetchInput,
21 ctx: &ToolContext,
22) -> Result<WebFetchOutput, ToolError> {
23 #[expect(clippy::cast_possible_truncation)]
24 let default_max_bytes = tools.cfg.default_max_bytes as usize;
25 let max_bytes = input.max_bytes.unwrap_or(default_max_bytes);
26
27 if max_bytes > HARD_MAX_BYTES {
28 return Err(ToolError::invalid_input(format!(
29 "max_bytes must be <= {HARD_MAX_BYTES} (20MB)"
30 )));
31 }
32
33 if ctx.is_cancelled() {
34 return Err(ToolError::cancelled(None));
35 }
36
37 let mut response = ctx
39 .run_cancellable(async {
40 tools
41 .http
42 .get(&input.url)
43 .send()
44 .await
45 .map_err(|e| ToolError::external(format!("HTTP request failed: {e}")))
46 })
47 .await?;
48
49 let status = response.status();
50 if !status.is_success() {
51 return Err(ToolError::external(format!(
52 "HTTP request failed with status {status} for {}",
53 response.url()
54 )));
55 }
56
57 let final_url = response.url().to_string();
58 let content_type = response
59 .headers()
60 .get(reqwest::header::CONTENT_TYPE)
61 .and_then(|v| v.to_str().ok())
62 .unwrap_or("")
63 .to_string();
64
65 #[expect(clippy::cast_possible_truncation)]
67 let initial_capacity = response
69 .content_length()
70 .map_or(8 * 1024, |len| len.min(max_bytes as u64) as usize)
71 .min(max_bytes);
72
73 let mut bytes: Vec<u8> = Vec::with_capacity(initial_capacity);
74 let mut truncated = false;
75
76 loop {
77 if bytes.len() >= max_bytes {
79 truncated = true;
80 break;
81 }
82
83 if ctx.is_cancelled() {
84 return Err(ToolError::cancelled(None));
85 }
86
87 let Some(chunk) = ctx
88 .run_cancellable(async {
89 response
90 .chunk()
91 .await
92 .map_err(|e| ToolError::external(format!("Failed to read response body: {e}")))
93 })
94 .await?
95 else {
96 break;
97 };
98
99 let remaining = max_bytes - bytes.len();
100 if chunk.len() > remaining {
101 bytes.extend_from_slice(&chunk[..remaining]);
102 truncated = true;
103 break;
104 }
105
106 bytes.extend_from_slice(&chunk);
107 }
108
109 let (title, content) = decode_and_convert(&bytes, &content_type)?;
111
112 let word_count = content.split_whitespace().count();
113
114 let summary = summarize_content_if_requested(tools, &content, input.summarize, ctx).await?;
116
117 if ctx.is_cancelled() {
118 return Err(ToolError::cancelled(None));
119 }
120
121 Ok(WebFetchOutput {
122 final_url,
123 title,
124 content_type,
125 word_count,
126 truncated,
127 retrieved_at: Utc::now(),
128 content,
129 summary,
130 })
131}
132
133async fn summarize_content_if_requested(
134 tools: &WebTools,
135 content: &str,
136 summarize: bool,
137 ctx: &ToolContext,
138) -> Result<Option<String>, ToolError> {
139 if !summarize {
140 return Ok(None);
141 }
142
143 match crate::haiku::summarize_markdown(tools, content, ctx).await {
144 Ok(summary) => Ok(Some(summary)),
145 Err(ToolError::Cancelled { reason }) => Err(ToolError::Cancelled { reason }),
146 Err(e) => Err(ToolError::external(format!("Summarization failed: {e}"))),
147 }
148}
149
150pub fn decode_and_convert(
155 bytes: &[u8],
156 content_type: &str,
157) -> Result<(Option<String>, String), ToolError> {
158 let ct_lower = content_type.to_lowercase();
159
160 let text = String::from_utf8_lossy(bytes);
162
163 if ct_lower.contains("text/html") || (ct_lower.is_empty() && looks_like_html(&text)) {
164 let title = extract_title(&text);
165 let md = htmd::convert(&text)
166 .map_err(|e| ToolError::internal(format!("HTML conversion failed: {e}")))?;
167 Ok((title, md))
168 } else if ct_lower.contains("application/json") || ct_lower.contains("+json") {
169 match serde_json::from_str::<serde_json::Value>(&text) {
171 Ok(val) => {
172 let pretty =
173 serde_json::to_string_pretty(&val).unwrap_or_else(|_| text.into_owned());
174 Ok((None, pretty))
175 }
176 Err(_) => Ok((None, text.into_owned())),
177 }
178 } else if ct_lower.starts_with("text/") || ct_lower.is_empty() {
179 Ok((None, text.into_owned()))
180 } else {
181 Err(ToolError::invalid_input(format!(
183 "Unsupported content type: {content_type}. Only HTML, text, and JSON are supported."
184 )))
185 }
186}
187
/// Extracts the trimmed text of the first `<title>` element in `html`.
///
/// Tag matching is ASCII-case-insensitive and tolerates attributes on the
/// opening tag. Tags that merely start with `title` (for example `<titles>`
/// or `<title-bar>`) are skipped. Returns `None` when there is no complete
/// `<title>…</title>` pair or when the first title is empty after trimming.
#[must_use]
pub fn extract_title(html: &str) -> Option<String> {
    const OPEN: &str = "<title";
    const CLOSE: &str = "</title>";

    // ASCII-only lowercasing leaves every byte offset unchanged, so indices
    // found in `lower` can safely be used to slice `html`.
    let lower = html.to_ascii_lowercase();
    let mut search_from = 0;

    loop {
        let open = search_from + lower[search_from..].find(OPEN)?;
        let after_name = open + OPEN.len();

        // The tag name must end right after "title"; otherwise this is a
        // different element and the scan continues behind it.
        let is_title_tag = matches!(
            lower.as_bytes().get(after_name),
            Some(b'>' | b'/' | b' ' | b'\t' | b'\n' | b'\r' | b'\x0C')
        );
        if !is_title_tag {
            search_from = after_name;
            continue;
        }

        let content_start = after_name + lower[after_name..].find('>')? + 1;
        let content_len = lower[content_start..].find(CLOSE)?;
        let title = html[content_start..content_start + content_len].trim();

        return if title.is_empty() {
            None
        } else {
            Some(title.to_string())
        };
    }
}
203
/// Heuristically reports whether `text` starts like an HTML document.
///
/// Leading whitespace and a leading byte-order mark (U+FEFF) are ignored.
/// The remainder must begin with `<!doctype` or `<html`, compared
/// ASCII-case-insensitively so `<!DocType` and `<HTML` are also recognized.
fn looks_like_html(text: &str) -> bool {
    const MARKERS: [&[u8]; 2] = [b"<!doctype", b"<html"];

    // A BOM survives lossy UTF-8 decoding as U+FEFF, which `trim_start`
    // alone would not remove because it is not whitespace.
    let head = text
        .trim_start_matches(|c: char| c == '\u{feff}' || c.is_whitespace())
        .as_bytes();

    MARKERS.iter().any(|marker| {
        head.get(..marker.len())
            .is_some_and(|prefix| prefix.eq_ignore_ascii_case(marker))
    })
}
211
#[cfg(test)]
mod tests {
    use super::*;

    // ---- decode_and_convert -------------------------------------------------

    /// HTML input yields the page title and Markdown containing the body text.
    #[test]
    fn test_decode_html() {
        let page = b"<html><head><title>Test Page</title></head><body><h1>Hello</h1><p>World</p></body></html>";
        let (title, markdown) = decode_and_convert(page, "text/html").unwrap();
        assert_eq!(title.as_deref(), Some("Test Page"));
        assert!(markdown.contains("Hello"));
        assert!(markdown.contains("World"));
    }

    /// Compact JSON is re-emitted pretty-printed and never has a title.
    #[test]
    fn test_decode_json() {
        let compact = br#"{"key":"value","num":42}"#;
        let (title, pretty) = decode_and_convert(compact, "application/json").unwrap();
        assert!(title.is_none());
        assert!(pretty.contains("\"key\": \"value\""));
    }

    /// Plain text passes through untouched.
    #[test]
    fn test_decode_plain_text() {
        let body = b"Hello, world!";
        let (title, text) = decode_and_convert(body, "text/plain").unwrap();
        assert!(title.is_none());
        assert_eq!(text, "Hello, world!");
    }

    /// Binary content types are rejected.
    #[test]
    fn test_decode_binary_errors() {
        let raw = b"\x00\x01\x02";
        let outcome = decode_and_convert(raw, "application/octet-stream");
        assert!(outcome.is_err());
    }

    // ---- extract_title / looks_like_html ------------------------------------

    /// Present, missing and empty titles.
    #[test]
    fn test_extract_title() {
        assert_eq!(
            extract_title("<html><head><title>My Page</title></head></html>"),
            Some("My Page".into())
        );
        assert_eq!(extract_title("<html><head></head></html>"), None);
        assert_eq!(extract_title("<title></title>"), None);
    }

    /// Doctype and `<html` prefixes are detected; ordinary text is not.
    #[test]
    fn test_looks_like_html() {
        assert!(looks_like_html("<!DOCTYPE html><html>"));
        assert!(looks_like_html("  <html>"));
        assert!(!looks_like_html("Hello, world!"));
    }

    /// A multi-byte character before the tag must not shift the byte offsets
    /// used to slice the title out of the original string.
    #[test]
    fn test_extract_title_unicode_before_tag() {
        assert_eq!(
            extract_title("İ<title>Test Page</title>"),
            Some("Test Page".to_string())
        );
    }

    /// Tag names are matched regardless of ASCII case.
    #[test]
    fn test_extract_title_mixed_case_tags() {
        assert_eq!(
            extract_title("<TITLE>Upper</TITLE>"),
            Some("Upper".to_string())
        );
        assert_eq!(
            extract_title("<TiTlE>Mixed</TiTlE>"),
            Some("Mixed".to_string())
        );
    }

    /// End-to-end tests of `web_fetch` against a local mock HTTP server.
    mod integration {
        use super::*;
        use crate::WebTools;
        use crate::types::WebFetchInput;
        use agentic_tools_core::ToolContext;
        use wiremock::Mock;
        use wiremock::MockServer;
        use wiremock::ResponseTemplate;
        use wiremock::matchers::method;

        /// Starts a mock server that answers every GET request with `template`.
        async fn serve_get(template: ResponseTemplate) -> MockServer {
            let server = MockServer::start().await;
            Mock::given(method("GET"))
                .respond_with(template)
                .mount(&server)
                .await;
            server
        }

        /// Builds `WebTools` around a fresh HTTP client.
        fn fresh_tools() -> WebTools {
            WebTools::with_http_client(reqwest::Client::new())
        }

        /// Builds a fetch request for the server root: no summary, default size cap.
        fn plain_input(server: &MockServer) -> WebFetchInput {
            WebFetchInput {
                url: server.uri(),
                summarize: false,
                max_bytes: None,
            }
        }

        #[tokio::test]
        async fn web_fetch_returns_error_on_404() {
            let server =
                serve_get(ResponseTemplate::new(404).set_body_string("Not Found")).await;
            let tools = fresh_tools();

            let outcome = web_fetch(&tools, plain_input(&server), &ToolContext::default()).await;
            assert!(outcome.is_err(), "Expected error for 404 response");
            let failure = outcome.unwrap_err();
            assert!(
                failure.to_string().contains("404"),
                "Error message should mention 404 status"
            );
        }

        #[tokio::test]
        async fn web_fetch_returns_error_on_500() {
            let server =
                serve_get(ResponseTemplate::new(500).set_body_string("Internal Server Error"))
                    .await;
            let tools = fresh_tools();

            let outcome = web_fetch(&tools, plain_input(&server), &ToolContext::default()).await;
            assert!(outcome.is_err(), "Expected error for 500 response");
            let failure = outcome.unwrap_err();
            assert!(
                failure.to_string().contains("500"),
                "Error message should mention 500 status"
            );
        }

        #[tokio::test]
        async fn web_fetch_succeeds_on_200() {
            let server = serve_get(
                ResponseTemplate::new(200)
                    .set_body_string("Hello, world!")
                    .insert_header("Content-Type", "text/plain"),
            )
            .await;
            let tools = fresh_tools();

            let outcome = web_fetch(&tools, plain_input(&server), &ToolContext::default()).await;
            assert!(outcome.is_ok(), "Expected success for 200 response");
            let output = outcome.unwrap();
            assert_eq!(output.content, "Hello, world!");
        }

        #[tokio::test]
        async fn web_fetch_detects_html_without_content_type() {
            let html = b"<!DOCTYPE html><html><head><title>Test Page</title></head><body><p>Hello</p></body></html>";

            // Raw bytes with no Content-Type header set on the template.
            let server =
                serve_get(ResponseTemplate::new(200).set_body_bytes(html.as_slice())).await;
            let tools = fresh_tools();

            let outcome = web_fetch(&tools, plain_input(&server), &ToolContext::default()).await;
            assert!(
                outcome.is_ok(),
                "Expected success for HTML without Content-Type"
            );
            let output = outcome.unwrap();

            assert!(
                output.content_type.is_empty(),
                "Content-Type should be empty, got: {}",
                output.content_type
            );

            assert_eq!(
                output.title.as_deref(),
                Some("Test Page"),
                "Should extract title via looks_like_html heuristic"
            );
            assert!(
                output.content.contains("Hello"),
                "Content should be converted"
            );
            assert!(
                !output.content.contains("<p>"),
                "HTML tags should be removed by markdown conversion"
            );
        }

        #[tokio::test]
        async fn web_fetch_returns_cancelled_before_sending_request() {
            // No mock is mounted: the request must never reach the server.
            let server = MockServer::start().await;
            let tools = fresh_tools();
            let ctx = ToolContext::default();
            ctx.cancellation_token().cancel();

            let outcome = web_fetch(&tools, plain_input(&server), &ctx).await;
            assert!(matches!(outcome, Err(ToolError::Cancelled { .. })));
            assert!(server.received_requests().await.unwrap().is_empty());
        }

        #[tokio::test]
        async fn summarization_preserves_cancelled_error() {
            let tools = fresh_tools();
            let ctx = ToolContext::default();
            ctx.cancellation_token().cancel();

            let outcome = summarize_content_if_requested(&tools, "content", true, &ctx).await;

            assert!(matches!(outcome, Err(ToolError::Cancelled { .. })));
        }
    }
}