1use std::net::{IpAddr, SocketAddr};
5use std::time::Duration;
6
7use schemars::JsonSchema;
8use serde::Deserialize;
9use url::Url;
10
11use crate::config::ScrapeConfig;
12use crate::executor::{ToolCall, ToolError, ToolExecutor, ToolOutput, deserialize_params};
13
/// A single scrape request, parsed from a fenced `scrape` JSON block or from
/// a structured `web_scrape` tool call.
#[derive(Debug, Deserialize, JsonSchema)]
struct ScrapeInstruction {
    /// Target page URL; must be `https` and pass SSRF validation.
    url: String,
    /// CSS selector identifying the elements to extract.
    select: String,
    /// Extraction mode: "text", "html", or "attr:<name>"; defaults to "text".
    #[serde(default = "default_extract")]
    extract: String,
    /// Maximum number of matched elements to return (10 when absent).
    limit: Option<usize>,
}
26
/// Serde default for `ScrapeInstruction::extract`: the "text" mode.
fn default_extract() -> String {
    String::from("text")
}
30
/// How to pull a value out of each matched element.
#[derive(Debug)]
enum ExtractMode {
    /// The element's visible text content.
    Text,
    /// The element's inner HTML markup.
    Html,
    /// The value of a named attribute (spelled `attr:<name>`).
    Attr(String),
}

impl ExtractMode {
    /// Parses a mode string; anything unrecognized falls back to `Text`.
    fn parse(s: &str) -> Self {
        // `attr:` takes whatever follows the colon as the attribute name,
        // including an empty name for the bare string "attr:".
        if let Some(name) = s.strip_prefix("attr:") {
            return Self::Attr(name.to_owned());
        }
        match s {
            "html" => Self::Html,
            _ => Self::Text,
        }
    }
}
50
/// Tool executor that fetches HTTPS pages and extracts data via CSS
/// selectors, with SSRF protections: scheme/host validation, private-IP
/// blocking, and DNS results pinned onto the HTTP client.
#[derive(Debug)]
pub struct WebScrapeExecutor {
    /// Per-request HTTP timeout.
    timeout: Duration,
    /// Maximum allowed response body size in bytes.
    max_body_bytes: usize,
}
60
61impl WebScrapeExecutor {
62 #[must_use]
63 pub fn new(config: &ScrapeConfig) -> Self {
64 Self {
65 timeout: Duration::from_secs(config.timeout),
66 max_body_bytes: config.max_body_bytes,
67 }
68 }
69
70 fn build_client(&self, host: &str, addrs: &[SocketAddr]) -> reqwest::Client {
71 let mut builder = reqwest::Client::builder()
72 .timeout(self.timeout)
73 .redirect(reqwest::redirect::Policy::none());
74 builder = builder.resolve_to_addrs(host, addrs);
75 builder.build().unwrap_or_default()
76 }
77}
78
impl ToolExecutor for WebScrapeExecutor {
    /// Advertises the single `web_scrape` tool, invoked via fenced `scrape`
    /// blocks containing a JSON-encoded `ScrapeInstruction`.
    fn tool_definitions(&self) -> Vec<crate::registry::ToolDef> {
        use crate::registry::{InvocationHint, ToolDef};
        vec![ToolDef {
            id: "web_scrape".into(),
            description: "Scrape data from a web page via CSS selectors".into(),
            schema: schemars::schema_for!(ScrapeInstruction),
            invocation: InvocationHint::FencedBlock("scrape"),
        }]
    }

    /// Scans a free-form model response for `scrape` blocks and executes each
    /// in order, joining per-block results with blank lines.
    ///
    /// Returns `Ok(None)` when the response contains no scrape blocks. Fails
    /// fast: the first block that is invalid JSON or fails to scrape aborts
    /// the whole batch.
    async fn execute(&self, response: &str) -> Result<Option<ToolOutput>, ToolError> {
        let blocks = extract_scrape_blocks(response);
        if blocks.is_empty() {
            return Ok(None);
        }

        let mut outputs = Vec::with_capacity(blocks.len());
        #[allow(clippy::cast_possible_truncation)]
        let blocks_executed = blocks.len() as u32;

        for block in &blocks {
            // Malformed JSON in any block is surfaced as InvalidData.
            let instruction: ScrapeInstruction = serde_json::from_str(block).map_err(|e| {
                ToolError::Execution(std::io::Error::new(
                    std::io::ErrorKind::InvalidData,
                    e.to_string(),
                ))
            })?;
            outputs.push(self.scrape_instruction(&instruction).await?);
        }

        // NOTE(review): `tool_name` is "web-scrape" (hyphen) while the tool id
        // is "web_scrape" (underscore) — confirm the mismatch is intentional.
        Ok(Some(ToolOutput {
            tool_name: "web-scrape".to_owned(),
            summary: outputs.join("\n\n"),
            blocks_executed,
            filter_stats: None,
            diff: None,
            streamed: false,
            terminal_id: None,
        }))
    }

    /// Executes a structured tool call addressed to `web_scrape`; returns
    /// `Ok(None)` for calls addressed to any other tool.
    async fn execute_tool_call(&self, call: &ToolCall) -> Result<Option<ToolOutput>, ToolError> {
        if call.tool_id != "web_scrape" {
            return Ok(None);
        }

        let instruction: ScrapeInstruction = deserialize_params(&call.params)?;

        let result = self.scrape_instruction(&instruction).await?;

        Ok(Some(ToolOutput {
            tool_name: "web-scrape".to_owned(),
            summary: result,
            blocks_executed: 1,
            filter_stats: None,
            diff: None,
            streamed: false,
            terminal_id: None,
        }))
    }
}
141
142impl WebScrapeExecutor {
143 async fn scrape_instruction(
144 &self,
145 instruction: &ScrapeInstruction,
146 ) -> Result<String, ToolError> {
147 let parsed = validate_url(&instruction.url)?;
148 let (host, addrs) = resolve_and_validate(&parsed).await?;
149 let html = self.fetch_html(&instruction.url, &host, &addrs).await?;
150 let selector = instruction.select.clone();
151 let extract = ExtractMode::parse(&instruction.extract);
152 let limit = instruction.limit.unwrap_or(10);
153 tokio::task::spawn_blocking(move || parse_and_extract(&html, &selector, &extract, limit))
154 .await
155 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?
156 }
157
158 async fn fetch_html(
168 &self,
169 url: &str,
170 host: &str,
171 addrs: &[SocketAddr],
172 ) -> Result<String, ToolError> {
173 const MAX_REDIRECTS: usize = 3;
174
175 let mut current_url = url.to_owned();
176 let mut current_host = host.to_owned();
177 let mut current_addrs = addrs.to_vec();
178
179 for hop in 0..=MAX_REDIRECTS {
180 let client = self.build_client(¤t_host, ¤t_addrs);
182 let resp = client
183 .get(¤t_url)
184 .send()
185 .await
186 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
187
188 let status = resp.status();
189
190 if status.is_redirection() {
191 if hop == MAX_REDIRECTS {
192 return Err(ToolError::Execution(std::io::Error::other(
193 "too many redirects",
194 )));
195 }
196
197 let location = resp
198 .headers()
199 .get(reqwest::header::LOCATION)
200 .and_then(|v| v.to_str().ok())
201 .ok_or_else(|| {
202 ToolError::Execution(std::io::Error::other("redirect with no Location"))
203 })?;
204
205 let base = Url::parse(¤t_url)
207 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
208 let next_url = base
209 .join(location)
210 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
211
212 let validated = validate_url(next_url.as_str())?;
213 let (next_host, next_addrs) = resolve_and_validate(&validated).await?;
214
215 current_url = next_url.to_string();
216 current_host = next_host;
217 current_addrs = next_addrs;
218 continue;
219 }
220
221 if !status.is_success() {
222 return Err(ToolError::Execution(std::io::Error::other(format!(
223 "HTTP {status}",
224 ))));
225 }
226
227 let bytes = resp
228 .bytes()
229 .await
230 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
231
232 if bytes.len() > self.max_body_bytes {
233 return Err(ToolError::Execution(std::io::Error::other(format!(
234 "response too large: {} bytes (max: {})",
235 bytes.len(),
236 self.max_body_bytes,
237 ))));
238 }
239
240 return String::from_utf8(bytes.to_vec())
241 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())));
242 }
243
244 Err(ToolError::Execution(std::io::Error::other(
245 "too many redirects",
246 )))
247 }
248}
249
/// Returns the contents of every fenced `scrape` code block in `text`.
fn extract_scrape_blocks(text: &str) -> Vec<&str> {
    crate::executor::extract_fenced_blocks(text, "scrape")
}
253
254fn validate_url(raw: &str) -> Result<Url, ToolError> {
255 let parsed = Url::parse(raw).map_err(|_| ToolError::Blocked {
256 command: format!("invalid URL: {raw}"),
257 })?;
258
259 if parsed.scheme() != "https" {
260 return Err(ToolError::Blocked {
261 command: format!("scheme not allowed: {}", parsed.scheme()),
262 });
263 }
264
265 if let Some(host) = parsed.host()
266 && is_private_host(&host)
267 {
268 return Err(ToolError::Blocked {
269 command: format!(
270 "private/local host blocked: {}",
271 parsed.host_str().unwrap_or("")
272 ),
273 });
274 }
275
276 Ok(parsed)
277}
278
/// Returns `true` for addresses that must never be scraped: loopback,
/// RFC 1918 private, link-local, unspecified, broadcast, IPv6 ULA
/// (`fc00::/7`), and IPv4-mapped IPv6 forms of any of the above.
pub(crate) fn is_private_ip(ip: IpAddr) -> bool {
    // Shared IPv4 predicate, also applied to IPv4-mapped IPv6 addresses.
    fn v4_is_private(v4: std::net::Ipv4Addr) -> bool {
        v4.is_loopback()
            || v4.is_private()
            || v4.is_link_local()
            || v4.is_unspecified()
            || v4.is_broadcast()
    }

    match ip {
        IpAddr::V4(v4) => v4_is_private(v4),
        IpAddr::V6(v6) => {
            let seg = v6.segments();
            v6.is_loopback()
                || v6.is_unspecified()
                // fe80::/10 — link-local.
                || (seg[0] & 0xffc0) == 0xfe80
                // fc00::/7 — unique local.
                || (seg[0] & 0xfe00) == 0xfc00
                // ::ffff:a.b.c.d — judge by the embedded IPv4 address.
                || v6.to_ipv4_mapped().is_some_and(v4_is_private)
        }
    }
}
316
317fn is_private_host(host: &url::Host<&str>) -> bool {
318 match host {
319 url::Host::Domain(d) => {
320 #[allow(clippy::case_sensitive_file_extension_comparisons)]
323 {
324 *d == "localhost"
325 || d.ends_with(".localhost")
326 || d.ends_with(".internal")
327 || d.ends_with(".local")
328 }
329 }
330 url::Host::Ipv4(v4) => is_private_ip(IpAddr::V4(*v4)),
331 url::Host::Ipv6(v6) => is_private_ip(IpAddr::V6(*v6)),
332 }
333}
334
335async fn resolve_and_validate(url: &Url) -> Result<(String, Vec<SocketAddr>), ToolError> {
341 let Some(host) = url.host_str() else {
342 return Ok((String::new(), vec![]));
343 };
344 let port = url.port_or_known_default().unwrap_or(443);
345 let addrs: Vec<SocketAddr> = tokio::net::lookup_host(format!("{host}:{port}"))
346 .await
347 .map_err(|e| ToolError::Blocked {
348 command: format!("DNS resolution failed: {e}"),
349 })?
350 .collect();
351 for addr in &addrs {
352 if is_private_ip(addr.ip()) {
353 return Err(ToolError::Blocked {
354 command: format!("SSRF protection: private IP {} for host {host}", addr.ip()),
355 });
356 }
357 }
358 Ok((host.to_owned(), addrs))
359}
360
361fn parse_and_extract(
362 html: &str,
363 selector: &str,
364 extract: &ExtractMode,
365 limit: usize,
366) -> Result<String, ToolError> {
367 let soup = scrape_core::Soup::parse(html);
368
369 let tags = soup.find_all(selector).map_err(|e| {
370 ToolError::Execution(std::io::Error::new(
371 std::io::ErrorKind::InvalidData,
372 format!("invalid selector: {e}"),
373 ))
374 })?;
375
376 let mut results = Vec::new();
377
378 for tag in tags.into_iter().take(limit) {
379 let value = match extract {
380 ExtractMode::Text => tag.text(),
381 ExtractMode::Html => tag.inner_html(),
382 ExtractMode::Attr(name) => tag.get(name).unwrap_or_default().to_owned(),
383 };
384 if !value.trim().is_empty() {
385 results.push(value.trim().to_owned());
386 }
387 }
388
389 if results.is_empty() {
390 Ok(format!("No results for selector: {selector}"))
391 } else {
392 Ok(results.join("\n"))
393 }
394}
395
396#[cfg(test)]
397mod tests {
398 use super::*;
399
400 #[test]
403 fn extract_single_block() {
404 let text =
405 "Here:\n```scrape\n{\"url\":\"https://example.com\",\"select\":\"h1\"}\n```\nDone.";
406 let blocks = extract_scrape_blocks(text);
407 assert_eq!(blocks.len(), 1);
408 assert!(blocks[0].contains("example.com"));
409 }
410
411 #[test]
412 fn extract_multiple_blocks() {
413 let text = "```scrape\n{\"url\":\"https://a.com\",\"select\":\"h1\"}\n```\ntext\n```scrape\n{\"url\":\"https://b.com\",\"select\":\"p\"}\n```";
414 let blocks = extract_scrape_blocks(text);
415 assert_eq!(blocks.len(), 2);
416 }
417
418 #[test]
419 fn no_blocks_returns_empty() {
420 let blocks = extract_scrape_blocks("plain text, no code blocks");
421 assert!(blocks.is_empty());
422 }
423
424 #[test]
425 fn unclosed_block_ignored() {
426 let blocks = extract_scrape_blocks("```scrape\n{\"url\":\"https://x.com\"}");
427 assert!(blocks.is_empty());
428 }
429
430 #[test]
431 fn non_scrape_block_ignored() {
432 let text =
433 "```bash\necho hi\n```\n```scrape\n{\"url\":\"https://x.com\",\"select\":\"h1\"}\n```";
434 let blocks = extract_scrape_blocks(text);
435 assert_eq!(blocks.len(), 1);
436 assert!(blocks[0].contains("x.com"));
437 }
438
439 #[test]
440 fn multiline_json_block() {
441 let text =
442 "```scrape\n{\n \"url\": \"https://example.com\",\n \"select\": \"h1\"\n}\n```";
443 let blocks = extract_scrape_blocks(text);
444 assert_eq!(blocks.len(), 1);
445 let instr: ScrapeInstruction = serde_json::from_str(blocks[0]).unwrap();
446 assert_eq!(instr.url, "https://example.com");
447 }
448
449 #[test]
452 fn parse_valid_instruction() {
453 let json = r#"{"url":"https://example.com","select":"h1","extract":"text","limit":5}"#;
454 let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
455 assert_eq!(instr.url, "https://example.com");
456 assert_eq!(instr.select, "h1");
457 assert_eq!(instr.extract, "text");
458 assert_eq!(instr.limit, Some(5));
459 }
460
461 #[test]
462 fn parse_minimal_instruction() {
463 let json = r#"{"url":"https://example.com","select":"p"}"#;
464 let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
465 assert_eq!(instr.extract, "text");
466 assert!(instr.limit.is_none());
467 }
468
469 #[test]
470 fn parse_attr_extract() {
471 let json = r#"{"url":"https://example.com","select":"a","extract":"attr:href"}"#;
472 let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
473 assert_eq!(instr.extract, "attr:href");
474 }
475
476 #[test]
477 fn parse_invalid_json_errors() {
478 let result = serde_json::from_str::<ScrapeInstruction>("not json");
479 assert!(result.is_err());
480 }
481
482 #[test]
485 fn extract_mode_text() {
486 assert!(matches!(ExtractMode::parse("text"), ExtractMode::Text));
487 }
488
489 #[test]
490 fn extract_mode_html() {
491 assert!(matches!(ExtractMode::parse("html"), ExtractMode::Html));
492 }
493
494 #[test]
495 fn extract_mode_attr() {
496 let mode = ExtractMode::parse("attr:href");
497 assert!(matches!(mode, ExtractMode::Attr(ref s) if s == "href"));
498 }
499
500 #[test]
501 fn extract_mode_unknown_defaults_to_text() {
502 assert!(matches!(ExtractMode::parse("unknown"), ExtractMode::Text));
503 }
504
505 #[test]
508 fn valid_https_url() {
509 assert!(validate_url("https://example.com").is_ok());
510 }
511
512 #[test]
513 fn http_rejected() {
514 let err = validate_url("http://example.com").unwrap_err();
515 assert!(matches!(err, ToolError::Blocked { .. }));
516 }
517
518 #[test]
519 fn ftp_rejected() {
520 let err = validate_url("ftp://files.example.com").unwrap_err();
521 assert!(matches!(err, ToolError::Blocked { .. }));
522 }
523
524 #[test]
525 fn file_rejected() {
526 let err = validate_url("file:///etc/passwd").unwrap_err();
527 assert!(matches!(err, ToolError::Blocked { .. }));
528 }
529
530 #[test]
531 fn invalid_url_rejected() {
532 let err = validate_url("not a url").unwrap_err();
533 assert!(matches!(err, ToolError::Blocked { .. }));
534 }
535
536 #[test]
537 fn localhost_blocked() {
538 let err = validate_url("https://localhost/path").unwrap_err();
539 assert!(matches!(err, ToolError::Blocked { .. }));
540 }
541
542 #[test]
543 fn loopback_ip_blocked() {
544 let err = validate_url("https://127.0.0.1/path").unwrap_err();
545 assert!(matches!(err, ToolError::Blocked { .. }));
546 }
547
548 #[test]
549 fn private_10_blocked() {
550 let err = validate_url("https://10.0.0.1/api").unwrap_err();
551 assert!(matches!(err, ToolError::Blocked { .. }));
552 }
553
554 #[test]
555 fn private_172_blocked() {
556 let err = validate_url("https://172.16.0.1/api").unwrap_err();
557 assert!(matches!(err, ToolError::Blocked { .. }));
558 }
559
560 #[test]
561 fn private_192_blocked() {
562 let err = validate_url("https://192.168.1.1/api").unwrap_err();
563 assert!(matches!(err, ToolError::Blocked { .. }));
564 }
565
566 #[test]
567 fn ipv6_loopback_blocked() {
568 let err = validate_url("https://[::1]/path").unwrap_err();
569 assert!(matches!(err, ToolError::Blocked { .. }));
570 }
571
572 #[test]
573 fn public_ip_allowed() {
574 assert!(validate_url("https://93.184.216.34/page").is_ok());
575 }
576
577 #[test]
580 fn extract_text_from_html() {
581 let html = "<html><body><h1>Hello World</h1><p>Content</p></body></html>";
582 let result = parse_and_extract(html, "h1", &ExtractMode::Text, 10).unwrap();
583 assert_eq!(result, "Hello World");
584 }
585
586 #[test]
587 fn extract_multiple_elements() {
588 let html = "<ul><li>A</li><li>B</li><li>C</li></ul>";
589 let result = parse_and_extract(html, "li", &ExtractMode::Text, 10).unwrap();
590 assert_eq!(result, "A\nB\nC");
591 }
592
593 #[test]
594 fn extract_with_limit() {
595 let html = "<ul><li>A</li><li>B</li><li>C</li></ul>";
596 let result = parse_and_extract(html, "li", &ExtractMode::Text, 2).unwrap();
597 assert_eq!(result, "A\nB");
598 }
599
600 #[test]
601 fn extract_attr_href() {
602 let html = r#"<a href="https://example.com">Link</a>"#;
603 let result =
604 parse_and_extract(html, "a", &ExtractMode::Attr("href".to_owned()), 10).unwrap();
605 assert_eq!(result, "https://example.com");
606 }
607
608 #[test]
609 fn extract_inner_html() {
610 let html = "<div><span>inner</span></div>";
611 let result = parse_and_extract(html, "div", &ExtractMode::Html, 10).unwrap();
612 assert!(result.contains("<span>inner</span>"));
613 }
614
615 #[test]
616 fn no_matches_returns_message() {
617 let html = "<html><body><p>text</p></body></html>";
618 let result = parse_and_extract(html, "h1", &ExtractMode::Text, 10).unwrap();
619 assert!(result.starts_with("No results for selector:"));
620 }
621
622 #[test]
623 fn empty_text_skipped() {
624 let html = "<ul><li> </li><li>A</li></ul>";
625 let result = parse_and_extract(html, "li", &ExtractMode::Text, 10).unwrap();
626 assert_eq!(result, "A");
627 }
628
629 #[test]
630 fn invalid_selector_errors() {
631 let html = "<html><body></body></html>";
632 let result = parse_and_extract(html, "[[[invalid", &ExtractMode::Text, 10);
633 assert!(result.is_err());
634 }
635
636 #[test]
637 fn empty_html_returns_no_results() {
638 let result = parse_and_extract("", "h1", &ExtractMode::Text, 10).unwrap();
639 assert!(result.starts_with("No results for selector:"));
640 }
641
642 #[test]
643 fn nested_selector() {
644 let html = "<div><span>inner</span></div><span>outer</span>";
645 let result = parse_and_extract(html, "div > span", &ExtractMode::Text, 10).unwrap();
646 assert_eq!(result, "inner");
647 }
648
649 #[test]
650 fn attr_missing_returns_empty() {
651 let html = r#"<a>No href</a>"#;
652 let result =
653 parse_and_extract(html, "a", &ExtractMode::Attr("href".to_owned()), 10).unwrap();
654 assert!(result.starts_with("No results for selector:"));
655 }
656
657 #[test]
658 fn extract_html_mode() {
659 let html = "<div><b>bold</b> text</div>";
660 let result = parse_and_extract(html, "div", &ExtractMode::Html, 10).unwrap();
661 assert!(result.contains("<b>bold</b>"));
662 }
663
664 #[test]
665 fn limit_zero_returns_no_results() {
666 let html = "<ul><li>A</li><li>B</li></ul>";
667 let result = parse_and_extract(html, "li", &ExtractMode::Text, 0).unwrap();
668 assert!(result.starts_with("No results for selector:"));
669 }
670
671 #[test]
674 fn url_with_port_allowed() {
675 assert!(validate_url("https://example.com:8443/path").is_ok());
676 }
677
678 #[test]
679 fn link_local_ip_blocked() {
680 let err = validate_url("https://169.254.1.1/path").unwrap_err();
681 assert!(matches!(err, ToolError::Blocked { .. }));
682 }
683
684 #[test]
685 fn url_no_scheme_rejected() {
686 let err = validate_url("example.com/path").unwrap_err();
687 assert!(matches!(err, ToolError::Blocked { .. }));
688 }
689
690 #[test]
691 fn unspecified_ipv4_blocked() {
692 let err = validate_url("https://0.0.0.0/path").unwrap_err();
693 assert!(matches!(err, ToolError::Blocked { .. }));
694 }
695
696 #[test]
697 fn broadcast_ipv4_blocked() {
698 let err = validate_url("https://255.255.255.255/path").unwrap_err();
699 assert!(matches!(err, ToolError::Blocked { .. }));
700 }
701
702 #[test]
703 fn ipv6_link_local_blocked() {
704 let err = validate_url("https://[fe80::1]/path").unwrap_err();
705 assert!(matches!(err, ToolError::Blocked { .. }));
706 }
707
708 #[test]
709 fn ipv6_unique_local_blocked() {
710 let err = validate_url("https://[fd12::1]/path").unwrap_err();
711 assert!(matches!(err, ToolError::Blocked { .. }));
712 }
713
714 #[test]
715 fn ipv4_mapped_ipv6_loopback_blocked() {
716 let err = validate_url("https://[::ffff:127.0.0.1]/path").unwrap_err();
717 assert!(matches!(err, ToolError::Blocked { .. }));
718 }
719
720 #[test]
721 fn ipv4_mapped_ipv6_private_blocked() {
722 let err = validate_url("https://[::ffff:10.0.0.1]/path").unwrap_err();
723 assert!(matches!(err, ToolError::Blocked { .. }));
724 }
725
726 #[tokio::test]
729 async fn executor_no_blocks_returns_none() {
730 let config = ScrapeConfig::default();
731 let executor = WebScrapeExecutor::new(&config);
732 let result = executor.execute("plain text").await;
733 assert!(result.unwrap().is_none());
734 }
735
736 #[tokio::test]
737 async fn executor_invalid_json_errors() {
738 let config = ScrapeConfig::default();
739 let executor = WebScrapeExecutor::new(&config);
740 let response = "```scrape\nnot json\n```";
741 let result = executor.execute(response).await;
742 assert!(matches!(result, Err(ToolError::Execution(_))));
743 }
744
745 #[tokio::test]
746 async fn executor_blocked_url_errors() {
747 let config = ScrapeConfig::default();
748 let executor = WebScrapeExecutor::new(&config);
749 let response = "```scrape\n{\"url\":\"http://example.com\",\"select\":\"h1\"}\n```";
750 let result = executor.execute(response).await;
751 assert!(matches!(result, Err(ToolError::Blocked { .. })));
752 }
753
754 #[tokio::test]
755 async fn executor_private_ip_blocked() {
756 let config = ScrapeConfig::default();
757 let executor = WebScrapeExecutor::new(&config);
758 let response = "```scrape\n{\"url\":\"https://192.168.1.1/api\",\"select\":\"h1\"}\n```";
759 let result = executor.execute(response).await;
760 assert!(matches!(result, Err(ToolError::Blocked { .. })));
761 }
762
763 #[tokio::test]
764 async fn executor_unreachable_host_returns_error() {
765 let config = ScrapeConfig {
766 timeout: 1,
767 max_body_bytes: 1_048_576,
768 };
769 let executor = WebScrapeExecutor::new(&config);
770 let response = "```scrape\n{\"url\":\"https://192.0.2.1:1/page\",\"select\":\"h1\"}\n```";
771 let result = executor.execute(response).await;
772 assert!(matches!(result, Err(ToolError::Execution(_))));
773 }
774
775 #[tokio::test]
776 async fn executor_localhost_url_blocked() {
777 let config = ScrapeConfig::default();
778 let executor = WebScrapeExecutor::new(&config);
779 let response = "```scrape\n{\"url\":\"https://localhost:9999/api\",\"select\":\"h1\"}\n```";
780 let result = executor.execute(response).await;
781 assert!(matches!(result, Err(ToolError::Blocked { .. })));
782 }
783
784 #[tokio::test]
785 async fn executor_empty_text_returns_none() {
786 let config = ScrapeConfig::default();
787 let executor = WebScrapeExecutor::new(&config);
788 let result = executor.execute("").await;
789 assert!(result.unwrap().is_none());
790 }
791
792 #[tokio::test]
793 async fn executor_multiple_blocks_first_blocked() {
794 let config = ScrapeConfig::default();
795 let executor = WebScrapeExecutor::new(&config);
796 let response = "```scrape\n{\"url\":\"http://evil.com\",\"select\":\"h1\"}\n```\n\
797 ```scrape\n{\"url\":\"https://ok.com\",\"select\":\"h1\"}\n```";
798 let result = executor.execute(response).await;
799 assert!(result.is_err());
800 }
801
802 #[test]
803 fn validate_url_empty_string() {
804 let err = validate_url("").unwrap_err();
805 assert!(matches!(err, ToolError::Blocked { .. }));
806 }
807
808 #[test]
809 fn validate_url_javascript_scheme_blocked() {
810 let err = validate_url("javascript:alert(1)").unwrap_err();
811 assert!(matches!(err, ToolError::Blocked { .. }));
812 }
813
814 #[test]
815 fn validate_url_data_scheme_blocked() {
816 let err = validate_url("data:text/html,<h1>hi</h1>").unwrap_err();
817 assert!(matches!(err, ToolError::Blocked { .. }));
818 }
819
820 #[test]
821 fn is_private_host_public_domain_is_false() {
822 let host: url::Host<&str> = url::Host::Domain("example.com");
823 assert!(!is_private_host(&host));
824 }
825
826 #[test]
827 fn is_private_host_localhost_is_true() {
828 let host: url::Host<&str> = url::Host::Domain("localhost");
829 assert!(is_private_host(&host));
830 }
831
832 #[test]
833 fn is_private_host_ipv6_unspecified_is_true() {
834 let host = url::Host::Ipv6(std::net::Ipv6Addr::UNSPECIFIED);
835 assert!(is_private_host(&host));
836 }
837
838 #[test]
839 fn is_private_host_public_ipv6_is_false() {
840 let host = url::Host::Ipv6("2001:db8::1".parse().unwrap());
841 assert!(!is_private_host(&host));
842 }
843
844 async fn mock_server_executor() -> (WebScrapeExecutor, wiremock::MockServer) {
855 let server = wiremock::MockServer::start().await;
856 let executor = WebScrapeExecutor {
857 timeout: Duration::from_secs(5),
858 max_body_bytes: 1_048_576,
859 };
860 (executor, server)
861 }
862
863 fn server_host_and_addr(server: &wiremock::MockServer) -> (String, Vec<std::net::SocketAddr>) {
865 let uri = server.uri();
866 let url = Url::parse(&uri).unwrap();
867 let host = url.host_str().unwrap_or("127.0.0.1").to_owned();
868 let port = url.port().unwrap_or(80);
869 let addr: std::net::SocketAddr = format!("{host}:{port}").parse().unwrap();
870 (host, vec![addr])
871 }
872
873 async fn follow_redirects_raw(
877 executor: &WebScrapeExecutor,
878 start_url: &str,
879 host: &str,
880 addrs: &[std::net::SocketAddr],
881 ) -> Result<String, ToolError> {
882 const MAX_REDIRECTS: usize = 3;
883 let mut current_url = start_url.to_owned();
884 let mut current_host = host.to_owned();
885 let mut current_addrs = addrs.to_vec();
886
887 for hop in 0..=MAX_REDIRECTS {
888 let client = executor.build_client(¤t_host, ¤t_addrs);
889 let resp = client
890 .get(¤t_url)
891 .send()
892 .await
893 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
894
895 let status = resp.status();
896
897 if status.is_redirection() {
898 if hop == MAX_REDIRECTS {
899 return Err(ToolError::Execution(std::io::Error::other(
900 "too many redirects",
901 )));
902 }
903
904 let location = resp
905 .headers()
906 .get(reqwest::header::LOCATION)
907 .and_then(|v| v.to_str().ok())
908 .ok_or_else(|| {
909 ToolError::Execution(std::io::Error::other("redirect with no Location"))
910 })?;
911
912 let base = Url::parse(¤t_url)
913 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
914 let next_url = base
915 .join(location)
916 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
917
918 current_url = next_url.to_string();
920 let _ = &mut current_host;
922 let _ = &mut current_addrs;
923 continue;
924 }
925
926 if !status.is_success() {
927 return Err(ToolError::Execution(std::io::Error::other(format!(
928 "HTTP {status}",
929 ))));
930 }
931
932 let bytes = resp
933 .bytes()
934 .await
935 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
936
937 if bytes.len() > executor.max_body_bytes {
938 return Err(ToolError::Execution(std::io::Error::other(format!(
939 "response too large: {} bytes (max: {})",
940 bytes.len(),
941 executor.max_body_bytes,
942 ))));
943 }
944
945 return String::from_utf8(bytes.to_vec())
946 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())));
947 }
948
949 Err(ToolError::Execution(std::io::Error::other(
950 "too many redirects",
951 )))
952 }
953
954 #[tokio::test]
955 async fn fetch_html_success_returns_body() {
956 use wiremock::matchers::{method, path};
957 use wiremock::{Mock, ResponseTemplate};
958
959 let (executor, server) = mock_server_executor().await;
960 Mock::given(method("GET"))
961 .and(path("/page"))
962 .respond_with(ResponseTemplate::new(200).set_body_string("<h1>OK</h1>"))
963 .mount(&server)
964 .await;
965
966 let (host, addrs) = server_host_and_addr(&server);
967 let url = format!("{}/page", server.uri());
968 let result = executor.fetch_html(&url, &host, &addrs).await;
969 assert!(result.is_ok(), "expected Ok, got: {result:?}");
970 assert_eq!(result.unwrap(), "<h1>OK</h1>");
971 }
972
973 #[tokio::test]
974 async fn fetch_html_non_2xx_returns_error() {
975 use wiremock::matchers::{method, path};
976 use wiremock::{Mock, ResponseTemplate};
977
978 let (executor, server) = mock_server_executor().await;
979 Mock::given(method("GET"))
980 .and(path("/forbidden"))
981 .respond_with(ResponseTemplate::new(403))
982 .mount(&server)
983 .await;
984
985 let (host, addrs) = server_host_and_addr(&server);
986 let url = format!("{}/forbidden", server.uri());
987 let result = executor.fetch_html(&url, &host, &addrs).await;
988 assert!(result.is_err());
989 let msg = result.unwrap_err().to_string();
990 assert!(msg.contains("403"), "expected 403 in error: {msg}");
991 }
992
993 #[tokio::test]
994 async fn fetch_html_404_returns_error() {
995 use wiremock::matchers::{method, path};
996 use wiremock::{Mock, ResponseTemplate};
997
998 let (executor, server) = mock_server_executor().await;
999 Mock::given(method("GET"))
1000 .and(path("/missing"))
1001 .respond_with(ResponseTemplate::new(404))
1002 .mount(&server)
1003 .await;
1004
1005 let (host, addrs) = server_host_and_addr(&server);
1006 let url = format!("{}/missing", server.uri());
1007 let result = executor.fetch_html(&url, &host, &addrs).await;
1008 assert!(result.is_err());
1009 let msg = result.unwrap_err().to_string();
1010 assert!(msg.contains("404"), "expected 404 in error: {msg}");
1011 }
1012
1013 #[tokio::test]
1014 async fn fetch_html_redirect_no_location_returns_error() {
1015 use wiremock::matchers::{method, path};
1016 use wiremock::{Mock, ResponseTemplate};
1017
1018 let (executor, server) = mock_server_executor().await;
1019 Mock::given(method("GET"))
1021 .and(path("/redirect-no-loc"))
1022 .respond_with(ResponseTemplate::new(302))
1023 .mount(&server)
1024 .await;
1025
1026 let (host, addrs) = server_host_and_addr(&server);
1027 let url = format!("{}/redirect-no-loc", server.uri());
1028 let result = executor.fetch_html(&url, &host, &addrs).await;
1029 assert!(result.is_err());
1030 let msg = result.unwrap_err().to_string();
1031 assert!(
1032 msg.contains("Location") || msg.contains("location"),
1033 "expected Location-related error: {msg}"
1034 );
1035 }
1036
1037 #[tokio::test]
1038 async fn fetch_html_single_redirect_followed() {
1039 use wiremock::matchers::{method, path};
1040 use wiremock::{Mock, ResponseTemplate};
1041
1042 let (executor, server) = mock_server_executor().await;
1043 let final_url = format!("{}/final", server.uri());
1044
1045 Mock::given(method("GET"))
1046 .and(path("/start"))
1047 .respond_with(ResponseTemplate::new(302).insert_header("location", final_url.as_str()))
1048 .mount(&server)
1049 .await;
1050
1051 Mock::given(method("GET"))
1052 .and(path("/final"))
1053 .respond_with(ResponseTemplate::new(200).set_body_string("<p>final</p>"))
1054 .mount(&server)
1055 .await;
1056
1057 let (host, addrs) = server_host_and_addr(&server);
1058 let url = format!("{}/start", server.uri());
1059 let result = follow_redirects_raw(&executor, &url, &host, &addrs).await;
1060 assert!(result.is_ok(), "single redirect should succeed: {result:?}");
1061 assert_eq!(result.unwrap(), "<p>final</p>");
1062 }
1063
1064 #[tokio::test]
1065 async fn fetch_html_three_redirects_allowed() {
1066 use wiremock::matchers::{method, path};
1067 use wiremock::{Mock, ResponseTemplate};
1068
1069 let (executor, server) = mock_server_executor().await;
1070 let hop2 = format!("{}/hop2", server.uri());
1071 let hop3 = format!("{}/hop3", server.uri());
1072 let final_dest = format!("{}/done", server.uri());
1073
1074 Mock::given(method("GET"))
1075 .and(path("/hop1"))
1076 .respond_with(ResponseTemplate::new(301).insert_header("location", hop2.as_str()))
1077 .mount(&server)
1078 .await;
1079 Mock::given(method("GET"))
1080 .and(path("/hop2"))
1081 .respond_with(ResponseTemplate::new(301).insert_header("location", hop3.as_str()))
1082 .mount(&server)
1083 .await;
1084 Mock::given(method("GET"))
1085 .and(path("/hop3"))
1086 .respond_with(ResponseTemplate::new(301).insert_header("location", final_dest.as_str()))
1087 .mount(&server)
1088 .await;
1089 Mock::given(method("GET"))
1090 .and(path("/done"))
1091 .respond_with(ResponseTemplate::new(200).set_body_string("<p>done</p>"))
1092 .mount(&server)
1093 .await;
1094
1095 let (host, addrs) = server_host_and_addr(&server);
1096 let url = format!("{}/hop1", server.uri());
1097 let result = follow_redirects_raw(&executor, &url, &host, &addrs).await;
1098 assert!(result.is_ok(), "3 redirects should succeed: {result:?}");
1099 assert_eq!(result.unwrap(), "<p>done</p>");
1100 }
1101
1102 #[tokio::test]
1103 async fn fetch_html_four_redirects_rejected() {
1104 use wiremock::matchers::{method, path};
1105 use wiremock::{Mock, ResponseTemplate};
1106
1107 let (executor, server) = mock_server_executor().await;
1108 let hop2 = format!("{}/r2", server.uri());
1109 let hop3 = format!("{}/r3", server.uri());
1110 let hop4 = format!("{}/r4", server.uri());
1111 let hop5 = format!("{}/r5", server.uri());
1112
1113 for (from, to) in [
1114 ("/r1", &hop2),
1115 ("/r2", &hop3),
1116 ("/r3", &hop4),
1117 ("/r4", &hop5),
1118 ] {
1119 Mock::given(method("GET"))
1120 .and(path(from))
1121 .respond_with(ResponseTemplate::new(301).insert_header("location", to.as_str()))
1122 .mount(&server)
1123 .await;
1124 }
1125
1126 let (host, addrs) = server_host_and_addr(&server);
1127 let url = format!("{}/r1", server.uri());
1128 let result = follow_redirects_raw(&executor, &url, &host, &addrs).await;
1129 assert!(result.is_err(), "4 redirects should be rejected");
1130 let msg = result.unwrap_err().to_string();
1131 assert!(
1132 msg.contains("redirect"),
1133 "expected redirect-related error: {msg}"
1134 );
1135 }
1136
1137 #[tokio::test]
1138 async fn fetch_html_body_too_large_returns_error() {
1139 use wiremock::matchers::{method, path};
1140 use wiremock::{Mock, ResponseTemplate};
1141
1142 let small_limit_executor = WebScrapeExecutor {
1143 timeout: Duration::from_secs(5),
1144 max_body_bytes: 10,
1145 };
1146 let server = wiremock::MockServer::start().await;
1147 Mock::given(method("GET"))
1148 .and(path("/big"))
1149 .respond_with(
1150 ResponseTemplate::new(200)
1151 .set_body_string("this body is definitely longer than ten bytes"),
1152 )
1153 .mount(&server)
1154 .await;
1155
1156 let (host, addrs) = server_host_and_addr(&server);
1157 let url = format!("{}/big", server.uri());
1158 let result = small_limit_executor.fetch_html(&url, &host, &addrs).await;
1159 assert!(result.is_err());
1160 let msg = result.unwrap_err().to_string();
1161 assert!(msg.contains("too large"), "expected too-large error: {msg}");
1162 }
1163
1164 #[test]
1165 fn extract_scrape_blocks_empty_block_content() {
1166 let text = "```scrape\n\n```";
1167 let blocks = extract_scrape_blocks(text);
1168 assert_eq!(blocks.len(), 1);
1169 assert!(blocks[0].is_empty());
1170 }
1171
1172 #[test]
1173 fn extract_scrape_blocks_whitespace_only() {
1174 let text = "```scrape\n \n```";
1175 let blocks = extract_scrape_blocks(text);
1176 assert_eq!(blocks.len(), 1);
1177 }
1178
1179 #[test]
1180 fn parse_and_extract_multiple_selectors() {
1181 let html = "<div><h1>Title</h1><p>Para</p></div>";
1182 let result = parse_and_extract(html, "h1, p", &ExtractMode::Text, 10).unwrap();
1183 assert!(result.contains("Title"));
1184 assert!(result.contains("Para"));
1185 }
1186
1187 #[test]
1188 fn webscrape_executor_new_with_custom_config() {
1189 let config = ScrapeConfig {
1190 timeout: 60,
1191 max_body_bytes: 512,
1192 };
1193 let executor = WebScrapeExecutor::new(&config);
1194 assert_eq!(executor.max_body_bytes, 512);
1195 }
1196
1197 #[test]
1198 fn webscrape_executor_debug() {
1199 let config = ScrapeConfig::default();
1200 let executor = WebScrapeExecutor::new(&config);
1201 let dbg = format!("{executor:?}");
1202 assert!(dbg.contains("WebScrapeExecutor"));
1203 }
1204
1205 #[test]
1206 fn extract_mode_attr_empty_name() {
1207 let mode = ExtractMode::parse("attr:");
1208 assert!(matches!(mode, ExtractMode::Attr(ref s) if s.is_empty()));
1209 }
1210
1211 #[test]
1212 fn default_extract_returns_text() {
1213 assert_eq!(default_extract(), "text");
1214 }
1215
1216 #[test]
1217 fn scrape_instruction_debug() {
1218 let json = r#"{"url":"https://example.com","select":"h1"}"#;
1219 let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
1220 let dbg = format!("{instr:?}");
1221 assert!(dbg.contains("ScrapeInstruction"));
1222 }
1223
1224 #[test]
1225 fn extract_mode_debug() {
1226 let mode = ExtractMode::Text;
1227 let dbg = format!("{mode:?}");
1228 assert!(dbg.contains("Text"));
1229 }
1230
    #[test]
    fn max_redirects_constant_is_three() {
        // NOTE(review): this is a documentation/change-detector test only —
        // the constant below is declared locally, so it cannot catch drift in
        // the real limit inside fetch_html. TODO: reference the actual
        // constant (or expose it pub(crate)) so that changing the redirect
        // budget actually fails this test.
        const MAX_REDIRECTS: usize = 3;
        assert_eq!(MAX_REDIRECTS, 3, "fetch_html allows exactly 3 redirects");
    }
1243
1244 #[test]
1247 fn redirect_no_location_error_message() {
1248 let err = std::io::Error::other("redirect with no Location");
1249 assert!(err.to_string().contains("redirect with no Location"));
1250 }
1251
1252 #[test]
1254 fn too_many_redirects_error_message() {
1255 let err = std::io::Error::other("too many redirects");
1256 assert!(err.to_string().contains("too many redirects"));
1257 }
1258
1259 #[test]
1261 fn non_2xx_status_error_format() {
1262 let status = reqwest::StatusCode::FORBIDDEN;
1263 let msg = format!("HTTP {status}");
1264 assert!(msg.contains("403"));
1265 }
1266
1267 #[test]
1269 fn not_found_status_error_format() {
1270 let status = reqwest::StatusCode::NOT_FOUND;
1271 let msg = format!("HTTP {status}");
1272 assert!(msg.contains("404"));
1273 }
1274
1275 #[test]
1277 fn relative_redirect_same_host_path() {
1278 let base = Url::parse("https://example.com/current").unwrap();
1279 let resolved = base.join("/other").unwrap();
1280 assert_eq!(resolved.as_str(), "https://example.com/other");
1281 }
1282
1283 #[test]
1285 fn relative_redirect_relative_path() {
1286 let base = Url::parse("https://example.com/a/b").unwrap();
1287 let resolved = base.join("c").unwrap();
1288 assert_eq!(resolved.as_str(), "https://example.com/a/c");
1289 }
1290
1291 #[test]
1293 fn absolute_redirect_overrides_base() {
1294 let base = Url::parse("https://example.com/page").unwrap();
1295 let resolved = base.join("https://other.com/target").unwrap();
1296 assert_eq!(resolved.as_str(), "https://other.com/target");
1297 }
1298
1299 #[test]
1301 fn redirect_http_downgrade_rejected() {
1302 let location = "http://example.com/page";
1303 let base = Url::parse("https://example.com/start").unwrap();
1304 let next = base.join(location).unwrap();
1305 let err = validate_url(next.as_str()).unwrap_err();
1306 assert!(matches!(err, ToolError::Blocked { .. }));
1307 }
1308
1309 #[test]
1311 fn redirect_location_private_ip_blocked() {
1312 let location = "https://192.168.100.1/admin";
1313 let base = Url::parse("https://example.com/start").unwrap();
1314 let next = base.join(location).unwrap();
1315 let err = validate_url(next.as_str()).unwrap_err();
1316 assert!(matches!(err, ToolError::Blocked { .. }));
1317 let cmd = match err {
1318 ToolError::Blocked { command } => command,
1319 _ => panic!("expected Blocked"),
1320 };
1321 assert!(
1322 cmd.contains("private") || cmd.contains("scheme"),
1323 "error message should describe the block reason: {cmd}"
1324 );
1325 }
1326
1327 #[test]
1329 fn redirect_location_internal_domain_blocked() {
1330 let location = "https://metadata.internal/latest/meta-data/";
1331 let base = Url::parse("https://example.com/start").unwrap();
1332 let next = base.join(location).unwrap();
1333 let err = validate_url(next.as_str()).unwrap_err();
1334 assert!(matches!(err, ToolError::Blocked { .. }));
1335 }
1336
1337 #[test]
1339 fn redirect_chain_three_hops_all_public() {
1340 let hops = [
1341 "https://redirect1.example.com/hop1",
1342 "https://redirect2.example.com/hop2",
1343 "https://destination.example.com/final",
1344 ];
1345 for hop in hops {
1346 assert!(validate_url(hop).is_ok(), "expected ok for {hop}");
1347 }
1348 }
1349
1350 #[test]
1355 fn redirect_to_private_ip_rejected_by_validate_url() {
1356 let private_targets = [
1358 "https://127.0.0.1/secret",
1359 "https://10.0.0.1/internal",
1360 "https://192.168.1.1/admin",
1361 "https://172.16.0.1/data",
1362 "https://[::1]/path",
1363 "https://[fe80::1]/path",
1364 "https://localhost/path",
1365 "https://service.internal/api",
1366 ];
1367 for target in private_targets {
1368 let result = validate_url(target);
1369 assert!(result.is_err(), "expected error for {target}");
1370 assert!(
1371 matches!(result.unwrap_err(), ToolError::Blocked { .. }),
1372 "expected Blocked for {target}"
1373 );
1374 }
1375 }
1376
1377 #[test]
1379 fn redirect_relative_url_resolves_correctly() {
1380 let base = Url::parse("https://example.com/page").unwrap();
1381 let relative = "/other";
1382 let resolved = base.join(relative).unwrap();
1383 assert_eq!(resolved.as_str(), "https://example.com/other");
1384 }
1385
1386 #[test]
1388 fn redirect_to_http_rejected() {
1389 let err = validate_url("http://example.com/page").unwrap_err();
1390 assert!(matches!(err, ToolError::Blocked { .. }));
1391 }
1392
1393 #[test]
1394 fn ipv4_mapped_ipv6_link_local_blocked() {
1395 let err = validate_url("https://[::ffff:169.254.0.1]/path").unwrap_err();
1396 assert!(matches!(err, ToolError::Blocked { .. }));
1397 }
1398
1399 #[test]
1400 fn ipv4_mapped_ipv6_public_allowed() {
1401 assert!(validate_url("https://[::ffff:93.184.216.34]/path").is_ok());
1402 }
1403
1404 #[test]
1405 fn tool_definitions_returns_web_scrape() {
1406 let config = ScrapeConfig::default();
1407 let executor = WebScrapeExecutor::new(&config);
1408 let defs = executor.tool_definitions();
1409 assert_eq!(defs.len(), 1);
1410 assert_eq!(defs[0].id, "web_scrape");
1411 assert_eq!(
1412 defs[0].invocation,
1413 crate::registry::InvocationHint::FencedBlock("scrape")
1414 );
1415 }
1416
1417 #[test]
1418 fn tool_definitions_schema_has_all_params() {
1419 let config = ScrapeConfig::default();
1420 let executor = WebScrapeExecutor::new(&config);
1421 let defs = executor.tool_definitions();
1422 let obj = defs[0].schema.as_object().unwrap();
1423 let props = obj["properties"].as_object().unwrap();
1424 assert!(props.contains_key("url"));
1425 assert!(props.contains_key("select"));
1426 assert!(props.contains_key("extract"));
1427 assert!(props.contains_key("limit"));
1428 let req = obj["required"].as_array().unwrap();
1429 assert!(req.iter().any(|v| v.as_str() == Some("url")));
1430 assert!(req.iter().any(|v| v.as_str() == Some("select")));
1431 assert!(!req.iter().any(|v| v.as_str() == Some("extract")));
1432 }
1433
1434 #[test]
1437 fn subdomain_localhost_blocked() {
1438 let host: url::Host<&str> = url::Host::Domain("foo.localhost");
1439 assert!(is_private_host(&host));
1440 }
1441
1442 #[test]
1443 fn internal_tld_blocked() {
1444 let host: url::Host<&str> = url::Host::Domain("service.internal");
1445 assert!(is_private_host(&host));
1446 }
1447
1448 #[test]
1449 fn local_tld_blocked() {
1450 let host: url::Host<&str> = url::Host::Domain("printer.local");
1451 assert!(is_private_host(&host));
1452 }
1453
1454 #[test]
1455 fn public_domain_not_blocked() {
1456 let host: url::Host<&str> = url::Host::Domain("example.com");
1457 assert!(!is_private_host(&host));
1458 }
1459
1460 #[tokio::test]
1463 async fn resolve_loopback_rejected() {
1464 let url = url::Url::parse("https://127.0.0.1/path").unwrap();
1466 let result = resolve_and_validate(&url).await;
1468 assert!(
1469 result.is_err(),
1470 "loopback IP must be rejected by resolve_and_validate"
1471 );
1472 let err = result.unwrap_err();
1473 assert!(matches!(err, crate::executor::ToolError::Blocked { .. }));
1474 }
1475
1476 #[tokio::test]
1477 async fn resolve_private_10_rejected() {
1478 let url = url::Url::parse("https://10.0.0.1/path").unwrap();
1479 let result = resolve_and_validate(&url).await;
1480 assert!(result.is_err());
1481 assert!(matches!(
1482 result.unwrap_err(),
1483 crate::executor::ToolError::Blocked { .. }
1484 ));
1485 }
1486
1487 #[tokio::test]
1488 async fn resolve_private_192_rejected() {
1489 let url = url::Url::parse("https://192.168.1.1/path").unwrap();
1490 let result = resolve_and_validate(&url).await;
1491 assert!(result.is_err());
1492 assert!(matches!(
1493 result.unwrap_err(),
1494 crate::executor::ToolError::Blocked { .. }
1495 ));
1496 }
1497
1498 #[tokio::test]
1499 async fn resolve_ipv6_loopback_rejected() {
1500 let url = url::Url::parse("https://[::1]/path").unwrap();
1501 let result = resolve_and_validate(&url).await;
1502 assert!(result.is_err());
1503 assert!(matches!(
1504 result.unwrap_err(),
1505 crate::executor::ToolError::Blocked { .. }
1506 ));
1507 }
1508
1509 #[tokio::test]
1510 async fn resolve_no_host_returns_ok() {
1511 let url = url::Url::parse("https://example.com/path").unwrap();
1513 let url_no_host = url::Url::parse("data:text/plain,hello").unwrap();
1515 let result = resolve_and_validate(&url_no_host).await;
1517 assert!(result.is_ok());
1518 let (host, addrs) = result.unwrap();
1519 assert!(host.is_empty());
1520 assert!(addrs.is_empty());
1521 drop(url);
1522 drop(url_no_host);
1523 }
1524}