1use agentic_tools_core::error::ToolError;
4use chrono::Utc;
5
6use crate::WebTools;
7use crate::types::WebFetchInput;
8use crate::types::WebFetchOutput;
9
/// Absolute ceiling, in bytes (`20 * 1024 * 1024`), on the byte limit that `web_fetch` accepts.
pub const HARD_MAX_BYTES: usize = 20 * 1024 * 1024;
12
13pub async fn web_fetch(
18 tools: &WebTools,
19 input: WebFetchInput,
20) -> Result<WebFetchOutput, ToolError> {
21 #[expect(clippy::cast_possible_truncation)]
22 let default_max_bytes = tools.cfg.default_max_bytes as usize;
23 let max_bytes = input.max_bytes.unwrap_or(default_max_bytes);
24
25 if max_bytes > HARD_MAX_BYTES {
26 return Err(ToolError::invalid_input(format!(
27 "max_bytes must be <= {HARD_MAX_BYTES} (20MB)"
28 )));
29 }
30
31 let mut response = tools
33 .http
34 .get(&input.url)
35 .send()
36 .await
37 .map_err(|e| ToolError::external(format!("HTTP request failed: {e}")))?;
38
39 let status = response.status();
40 if !status.is_success() {
41 return Err(ToolError::external(format!(
42 "HTTP request failed with status {status} for {}",
43 response.url()
44 )));
45 }
46
47 let final_url = response.url().to_string();
48 let content_type = response
49 .headers()
50 .get(reqwest::header::CONTENT_TYPE)
51 .and_then(|v| v.to_str().ok())
52 .unwrap_or("")
53 .to_string();
54
55 #[expect(clippy::cast_possible_truncation)]
57 let initial_capacity = response
59 .content_length()
60 .map_or(8 * 1024, |len| len.min(max_bytes as u64) as usize)
61 .min(max_bytes);
62
63 let mut bytes: Vec<u8> = Vec::with_capacity(initial_capacity);
64 let mut truncated = false;
65
66 loop {
67 if bytes.len() >= max_bytes {
69 truncated = true;
70 break;
71 }
72
73 let Some(chunk) = response
74 .chunk()
75 .await
76 .map_err(|e| ToolError::external(format!("Failed to read response body: {e}")))?
77 else {
78 break;
79 };
80
81 let remaining = max_bytes - bytes.len();
82 if chunk.len() > remaining {
83 bytes.extend_from_slice(&chunk[..remaining]);
84 truncated = true;
85 break;
86 }
87
88 bytes.extend_from_slice(&chunk);
89 }
90
91 let (title, content) = decode_and_convert(&bytes, &content_type)?;
93
94 let word_count = content.split_whitespace().count();
95
96 let summary = if input.summarize {
98 Some(
99 crate::haiku::summarize_markdown(tools, &content)
100 .await
101 .map_err(|e| ToolError::external(format!("Summarization failed: {e}")))?,
102 )
103 } else {
104 None
105 };
106
107 Ok(WebFetchOutput {
108 final_url,
109 title,
110 content_type,
111 word_count,
112 truncated,
113 retrieved_at: Utc::now(),
114 content,
115 summary,
116 })
117}
118
119pub fn decode_and_convert(
124 bytes: &[u8],
125 content_type: &str,
126) -> Result<(Option<String>, String), ToolError> {
127 let ct_lower = content_type.to_lowercase();
128
129 let text = String::from_utf8_lossy(bytes);
131
132 if ct_lower.contains("text/html") || (ct_lower.is_empty() && looks_like_html(&text)) {
133 let title = extract_title(&text);
134 let md = htmd::convert(&text)
135 .map_err(|e| ToolError::internal(format!("HTML conversion failed: {e}")))?;
136 Ok((title, md))
137 } else if ct_lower.contains("application/json") || ct_lower.contains("+json") {
138 match serde_json::from_str::<serde_json::Value>(&text) {
140 Ok(val) => {
141 let pretty =
142 serde_json::to_string_pretty(&val).unwrap_or_else(|_| text.into_owned());
143 Ok((None, pretty))
144 }
145 Err(_) => Ok((None, text.into_owned())),
146 }
147 } else if ct_lower.starts_with("text/") || ct_lower.is_empty() {
148 Ok((None, text.into_owned()))
149 } else {
150 Err(ToolError::invalid_input(format!(
152 "Unsupported content type: {content_type}. Only HTML, text, and JSON are supported."
153 )))
154 }
155}
156
/// Extracts the trimmed text of the first `<title>` element in `html`.
///
/// Tag matching is ASCII case-insensitive and tolerates attributes on the
/// opening tag. Returns `None` when there is no opening tag, no closing
/// `</title>`, or the title is empty after trimming.
#[must_use]
pub fn extract_title(html: &str) -> Option<String> {
    // ASCII-only lowercasing leaves every byte offset unchanged, so positions
    // found in `haystack` can be used to slice the original `html`.
    let haystack = html.to_ascii_lowercase();

    let tag_at = haystack.find("<title")?;
    let text_at = tag_at + haystack[tag_at..].find('>')? + 1;
    let text_len = haystack[text_at..].find("</title>")?;

    let text = html[text_at..text_at + text_len].trim();
    (!text.is_empty()).then(|| text.to_string())
}
172
/// Heuristically reports whether `text` begins like an HTML document.
///
/// Leading whitespace is skipped; the remainder must start with `<!doctype`
/// or `<html`. The comparison is ASCII case-insensitive, so `<HTML>` and
/// `<!DocType html>` are recognised as well.
fn looks_like_html(text: &str) -> bool {
    const MARKERS: [&str; 2] = ["<!doctype", "<html"];

    // Compare raw bytes so that slicing by marker length can never split a
    // multi-byte character and panic.
    let head = text.trim_start().as_bytes();
    MARKERS.iter().any(|marker| {
        head.get(..marker.len())
            .is_some_and(|prefix| prefix.eq_ignore_ascii_case(marker.as_bytes()))
    })
}
180
#[cfg(test)]
mod tests {
    use super::*;

    // --- Unit tests: pure decoding helpers, no network involved. ---

    #[test]
    fn test_decode_html() {
        let html = b"<html><head><title>Test Page</title></head><body><h1>Hello</h1><p>World</p></body></html>";
        let (title, content) = decode_and_convert(html, "text/html").unwrap();
        assert_eq!(title.as_deref(), Some("Test Page"));
        assert!(content.contains("Hello"));
        assert!(content.contains("World"));
    }

    #[test]
    fn test_decode_json() {
        let json = br#"{"key":"value","num":42}"#;
        let (title, content) = decode_and_convert(json, "application/json").unwrap();
        assert!(title.is_none());
        assert!(content.contains("\"key\": \"value\""));
    }

    #[test]
    fn test_decode_plain_text() {
        let text = b"Hello, world!";
        let (title, content) = decode_and_convert(text, "text/plain").unwrap();
        assert!(title.is_none());
        assert_eq!(content, "Hello, world!");
    }

    #[test]
    fn test_decode_binary_errors() {
        let bytes = b"\x00\x01\x02";
        let result = decode_and_convert(bytes, "application/octet-stream");
        assert!(result.is_err());
    }

    #[test]
    fn test_extract_title() {
        assert_eq!(
            extract_title("<html><head><title>My Page</title></head></html>"),
            Some("My Page".into())
        );
        assert_eq!(extract_title("<html><head></head></html>"), None);
        assert_eq!(extract_title("<title></title>"), None);
    }

    #[test]
    fn test_looks_like_html() {
        assert!(looks_like_html("<!DOCTYPE html><html>"));
        assert!(looks_like_html("  <html>"));
        assert!(!looks_like_html("Hello, world!"));
    }

    #[test]
    fn test_extract_title_unicode_before_tag() {
        // A multi-byte character ahead of the tag must not shift the byte
        // offsets used to slice the title out of the original string.
        assert_eq!(
            extract_title("İ<title>Test Page</title>"),
            Some("Test Page".to_string())
        );
    }

    #[test]
    fn test_extract_title_mixed_case_tags() {
        assert_eq!(
            extract_title("<TITLE>Upper</TITLE>"),
            Some("Upper".to_string())
        );
        assert_eq!(
            extract_title("<TiTlE>Mixed</TiTlE>"),
            Some("Mixed".to_string())
        );
    }

    // --- Integration tests: `web_fetch` against a local wiremock server. ---
    mod integration {
        use super::*;
        use crate::WebTools;
        use crate::types::WebFetchInput;
        use crate::types::WebFetchOutput;
        use agentic_tools_core::error::ToolError;
        use wiremock::Mock;
        use wiremock::MockServer;
        use wiremock::ResponseTemplate;
        use wiremock::matchers::method;

        /// Starts a mock server that answers every GET with `template`, then
        /// runs `web_fetch` against its root URL with summarization disabled.
        ///
        /// The server lives until this function returns, i.e. for the whole
        /// duration of the fetch.
        async fn fetch_from_mock(
            template: ResponseTemplate,
            max_bytes: Option<usize>,
        ) -> Result<WebFetchOutput, ToolError> {
            let mock_server = MockServer::start().await;

            Mock::given(method("GET"))
                .respond_with(template)
                .mount(&mock_server)
                .await;

            let http = reqwest::Client::new();
            let tools = WebTools::with_http_client(http);

            let input = WebFetchInput {
                url: mock_server.uri(),
                summarize: false,
                max_bytes,
            };

            web_fetch(&tools, input).await
        }

        #[tokio::test]
        async fn web_fetch_returns_error_on_404() {
            let template = ResponseTemplate::new(404).set_body_string("Not Found");

            let result = fetch_from_mock(template, None).await;
            assert!(result.is_err(), "Expected error for 404 response");
            let err = result.unwrap_err();
            assert!(
                err.to_string().contains("404"),
                "Error message should mention 404 status"
            );
        }

        #[tokio::test]
        async fn web_fetch_returns_error_on_500() {
            let template = ResponseTemplate::new(500).set_body_string("Internal Server Error");

            let result = fetch_from_mock(template, None).await;
            assert!(result.is_err(), "Expected error for 500 response");
            let err = result.unwrap_err();
            assert!(
                err.to_string().contains("500"),
                "Error message should mention 500 status"
            );
        }

        #[tokio::test]
        async fn web_fetch_succeeds_on_200() {
            let template = ResponseTemplate::new(200)
                .set_body_string("Hello, world!")
                .insert_header("Content-Type", "text/plain");

            let result = fetch_from_mock(template, None).await;
            assert!(result.is_ok(), "Expected success for 200 response");
            let output = result.unwrap();
            assert_eq!(output.content, "Hello, world!");
        }

        #[tokio::test]
        async fn web_fetch_detects_html_without_content_type() {
            let html = b"<!DOCTYPE html><html><head><title>Test Page</title></head><body><p>Hello</p></body></html>";

            // No Content-Type header is set on purpose, so detection has to
            // fall back to sniffing the body.
            let template = ResponseTemplate::new(200).set_body_bytes(html.as_slice());

            let result = fetch_from_mock(template, None).await;
            assert!(
                result.is_ok(),
                "Expected success for HTML without Content-Type"
            );
            let output = result.unwrap();

            assert!(
                output.content_type.is_empty(),
                "Content-Type should be empty, got: {}",
                output.content_type
            );

            assert_eq!(
                output.title.as_deref(),
                Some("Test Page"),
                "Should extract title via looks_like_html heuristic"
            );
            assert!(
                output.content.contains("Hello"),
                "Content should be converted"
            );
            assert!(
                !output.content.contains("<p>"),
                "HTML tags should be removed by markdown conversion"
            );
        }

        #[tokio::test]
        async fn web_fetch_truncates_body_beyond_max_bytes() {
            let template = ResponseTemplate::new(200)
                .set_body_string("Hello, world!")
                .insert_header("Content-Type", "text/plain");

            let result = fetch_from_mock(template, Some(5)).await;
            assert!(result.is_ok(), "Expected success for 200 response");
            let output = result.unwrap();
            assert_eq!(output.content, "Hello");
            assert!(
                output.truncated,
                "Body longer than max_bytes should be flagged as truncated"
            );
        }

        #[tokio::test]
        async fn web_fetch_rejects_max_bytes_above_hard_limit() {
            let template = ResponseTemplate::new(200).set_body_string("unused");

            let result = fetch_from_mock(template, Some(HARD_MAX_BYTES + 1)).await;
            assert!(result.is_err(), "Expected error for oversized max_bytes");
            let err = result.unwrap_err();
            assert!(
                err.to_string().contains("max_bytes"),
                "Error message should mention max_bytes"
            );
        }
    }
}