1use std::net::{IpAddr, SocketAddr};
5use std::time::Duration;
6
7use schemars::JsonSchema;
8use serde::Deserialize;
9use url::Url;
10
11use crate::config::ScrapeConfig;
12use crate::executor::{ToolCall, ToolError, ToolExecutor, ToolOutput, deserialize_params};
13
/// JSON payload of a single fenced ```scrape instruction block.
#[derive(Debug, Deserialize, JsonSchema)]
struct ScrapeInstruction {
    /// Page to fetch; must be an `https` URL (enforced by `validate_url`).
    url: String,
    /// CSS selector choosing which elements to extract.
    select: String,
    /// Extraction mode: "text", "html", or "attr:<name>" (defaults to "text").
    #[serde(default = "default_extract")]
    extract: String,
    /// Maximum number of matches to return; `None` means the default of 10.
    limit: Option<usize>,
}
26
/// Serde default for `ScrapeInstruction::extract`: plain text content.
fn default_extract() -> String {
    String::from("text")
}
30
/// How to pull a value out of each matched element.
#[derive(Debug)]
enum ExtractMode {
    /// Concatenated text content of the element.
    Text,
    /// Inner HTML markup of the element.
    Html,
    /// Value of the named attribute.
    Attr(String),
}

impl ExtractMode {
    /// Parse the `extract` field of an instruction.
    ///
    /// Recognizes `"text"`, `"html"`, and `"attr:<name>"`; any other
    /// string silently falls back to `Text`.
    fn parse(s: &str) -> Self {
        if let Some(name) = s.strip_prefix("attr:") {
            return Self::Attr(name.to_owned());
        }
        match s {
            "html" => Self::Html,
            _ => Self::Text,
        }
    }
}
50
/// Tool executor that fetches pages over HTTPS and extracts data with
/// CSS selectors, applying SSRF protections (scheme, host, and resolved
/// IP checks) before every request and on every redirect hop.
#[derive(Debug)]
pub struct WebScrapeExecutor {
    // Per-request HTTP timeout.
    timeout: Duration,
    // Hard cap on the response body size, in bytes.
    max_body_bytes: usize,
}
60
61impl WebScrapeExecutor {
62 #[must_use]
63 pub fn new(config: &ScrapeConfig) -> Self {
64 Self {
65 timeout: Duration::from_secs(config.timeout),
66 max_body_bytes: config.max_body_bytes,
67 }
68 }
69
70 fn build_client(&self, host: &str, addrs: &[SocketAddr]) -> reqwest::Client {
71 let mut builder = reqwest::Client::builder()
72 .timeout(self.timeout)
73 .redirect(reqwest::redirect::Policy::none());
74 builder = builder.resolve_to_addrs(host, addrs);
75 builder.build().unwrap_or_default()
76 }
77}
78
impl ToolExecutor for WebScrapeExecutor {
    /// Advertise the `web_scrape` tool: JSON instructions supplied in a
    /// fenced ```scrape block, schema-checked against `ScrapeInstruction`.
    fn tool_definitions(&self) -> Vec<crate::registry::ToolDef> {
        use crate::registry::{InvocationHint, ToolDef};
        vec![ToolDef {
            id: "web_scrape".into(),
            description: "Scrape data from a web page via CSS selectors".into(),
            schema: schemars::schema_for!(ScrapeInstruction),
            invocation: InvocationHint::FencedBlock("scrape"),
        }]
    }

    /// Scan a model response for ```scrape blocks and run each one.
    ///
    /// Returns `Ok(None)` when the response contains no blocks. Any
    /// malformed JSON or failed scrape aborts the whole batch with an
    /// error (the `?` below short-circuits before later blocks run).
    async fn execute(&self, response: &str) -> Result<Option<ToolOutput>, ToolError> {
        let blocks = extract_scrape_blocks(response);
        if blocks.is_empty() {
            return Ok(None);
        }

        let mut outputs = Vec::with_capacity(blocks.len());
        #[allow(clippy::cast_possible_truncation)]
        let blocks_executed = blocks.len() as u32;

        for block in &blocks {
            // Invalid JSON in any block fails the entire execute call.
            let instruction: ScrapeInstruction = serde_json::from_str(block).map_err(|e| {
                ToolError::Execution(std::io::Error::new(
                    std::io::ErrorKind::InvalidData,
                    e.to_string(),
                ))
            })?;
            outputs.push(self.scrape_instruction(&instruction).await?);
        }

        // NOTE(review): output tool_name "web-scrape" differs from the tool
        // id "web_scrape" — confirm downstream consumers expect the hyphen.
        Ok(Some(ToolOutput {
            tool_name: "web-scrape".to_owned(),
            summary: outputs.join("\n\n"),
            blocks_executed,
            filter_stats: None,
            diff: None,
            streamed: false,
            terminal_id: None,
            locations: None,
        }))
    }

    /// Structured tool-call entry point: runs a single instruction.
    ///
    /// Returns `Ok(None)` for calls addressed to other tools so the
    /// dispatcher can try the next executor.
    async fn execute_tool_call(&self, call: &ToolCall) -> Result<Option<ToolOutput>, ToolError> {
        if call.tool_id != "web_scrape" {
            return Ok(None);
        }

        let instruction: ScrapeInstruction = deserialize_params(&call.params)?;

        let result = self.scrape_instruction(&instruction).await?;

        Ok(Some(ToolOutput {
            tool_name: "web-scrape".to_owned(),
            summary: result,
            blocks_executed: 1,
            filter_stats: None,
            diff: None,
            streamed: false,
            terminal_id: None,
            locations: None,
        }))
    }
}
143
144impl WebScrapeExecutor {
145 async fn scrape_instruction(
146 &self,
147 instruction: &ScrapeInstruction,
148 ) -> Result<String, ToolError> {
149 let parsed = validate_url(&instruction.url)?;
150 let (host, addrs) = resolve_and_validate(&parsed).await?;
151 let html = self.fetch_html(&instruction.url, &host, &addrs).await?;
152 let selector = instruction.select.clone();
153 let extract = ExtractMode::parse(&instruction.extract);
154 let limit = instruction.limit.unwrap_or(10);
155 tokio::task::spawn_blocking(move || parse_and_extract(&html, &selector, &extract, limit))
156 .await
157 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?
158 }
159
160 async fn fetch_html(
170 &self,
171 url: &str,
172 host: &str,
173 addrs: &[SocketAddr],
174 ) -> Result<String, ToolError> {
175 const MAX_REDIRECTS: usize = 3;
176
177 let mut current_url = url.to_owned();
178 let mut current_host = host.to_owned();
179 let mut current_addrs = addrs.to_vec();
180
181 for hop in 0..=MAX_REDIRECTS {
182 let client = self.build_client(¤t_host, ¤t_addrs);
184 let resp = client
185 .get(¤t_url)
186 .send()
187 .await
188 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
189
190 let status = resp.status();
191
192 if status.is_redirection() {
193 if hop == MAX_REDIRECTS {
194 return Err(ToolError::Execution(std::io::Error::other(
195 "too many redirects",
196 )));
197 }
198
199 let location = resp
200 .headers()
201 .get(reqwest::header::LOCATION)
202 .and_then(|v| v.to_str().ok())
203 .ok_or_else(|| {
204 ToolError::Execution(std::io::Error::other("redirect with no Location"))
205 })?;
206
207 let base = Url::parse(¤t_url)
209 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
210 let next_url = base
211 .join(location)
212 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
213
214 let validated = validate_url(next_url.as_str())?;
215 let (next_host, next_addrs) = resolve_and_validate(&validated).await?;
216
217 current_url = next_url.to_string();
218 current_host = next_host;
219 current_addrs = next_addrs;
220 continue;
221 }
222
223 if !status.is_success() {
224 return Err(ToolError::Execution(std::io::Error::other(format!(
225 "HTTP {status}",
226 ))));
227 }
228
229 let bytes = resp
230 .bytes()
231 .await
232 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
233
234 if bytes.len() > self.max_body_bytes {
235 return Err(ToolError::Execution(std::io::Error::other(format!(
236 "response too large: {} bytes (max: {})",
237 bytes.len(),
238 self.max_body_bytes,
239 ))));
240 }
241
242 return String::from_utf8(bytes.to_vec())
243 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())));
244 }
245
246 Err(ToolError::Execution(std::io::Error::other(
247 "too many redirects",
248 )))
249 }
250}
251
/// Pull the contents of every fenced ```scrape code block out of `text`.
fn extract_scrape_blocks(text: &str) -> Vec<&str> {
    crate::executor::extract_fenced_blocks(text, "scrape")
}
255
256fn validate_url(raw: &str) -> Result<Url, ToolError> {
257 let parsed = Url::parse(raw).map_err(|_| ToolError::Blocked {
258 command: format!("invalid URL: {raw}"),
259 })?;
260
261 if parsed.scheme() != "https" {
262 return Err(ToolError::Blocked {
263 command: format!("scheme not allowed: {}", parsed.scheme()),
264 });
265 }
266
267 if let Some(host) = parsed.host()
268 && is_private_host(&host)
269 {
270 return Err(ToolError::Blocked {
271 command: format!(
272 "private/local host blocked: {}",
273 parsed.host_str().unwrap_or("")
274 ),
275 });
276 }
277
278 Ok(parsed)
279}
280
/// Return `true` for IPs that must never be scraped: loopback, RFC 1918
/// private ranges, link-local, unspecified, broadcast, IPv6 unique-local,
/// and IPv4-mapped IPv6 forms of any of the above.
pub(crate) fn is_private_ip(ip: IpAddr) -> bool {
    // Shared IPv4 policy, reused for plain V4 and for V4-mapped V6.
    fn v4_blocked(v4: std::net::Ipv4Addr) -> bool {
        v4.is_loopback()
            || v4.is_private()
            || v4.is_link_local()
            || v4.is_unspecified()
            || v4.is_broadcast()
    }

    match ip {
        IpAddr::V4(v4) => v4_blocked(v4),
        IpAddr::V6(v6) => {
            if v6.is_loopback() || v6.is_unspecified() {
                return true;
            }
            let first = v6.segments()[0];
            // fe80::/10 — link-local.
            if first & 0xffc0 == 0xfe80 {
                return true;
            }
            // fc00::/7 — unique-local.
            if first & 0xfe00 == 0xfc00 {
                return true;
            }
            // ::ffff:a.b.c.d — apply the IPv4 policy to the mapped address.
            match v6.to_ipv4_mapped() {
                Some(mapped) => v4_blocked(mapped),
                None => false,
            }
        }
    }
}
318
319fn is_private_host(host: &url::Host<&str>) -> bool {
320 match host {
321 url::Host::Domain(d) => {
322 #[allow(clippy::case_sensitive_file_extension_comparisons)]
325 {
326 *d == "localhost"
327 || d.ends_with(".localhost")
328 || d.ends_with(".internal")
329 || d.ends_with(".local")
330 }
331 }
332 url::Host::Ipv4(v4) => is_private_ip(IpAddr::V4(*v4)),
333 url::Host::Ipv6(v6) => is_private_ip(IpAddr::V6(*v6)),
334 }
335}
336
337async fn resolve_and_validate(url: &Url) -> Result<(String, Vec<SocketAddr>), ToolError> {
343 let Some(host) = url.host_str() else {
344 return Ok((String::new(), vec![]));
345 };
346 let port = url.port_or_known_default().unwrap_or(443);
347 let addrs: Vec<SocketAddr> = tokio::net::lookup_host(format!("{host}:{port}"))
348 .await
349 .map_err(|e| ToolError::Blocked {
350 command: format!("DNS resolution failed: {e}"),
351 })?
352 .collect();
353 for addr in &addrs {
354 if is_private_ip(addr.ip()) {
355 return Err(ToolError::Blocked {
356 command: format!("SSRF protection: private IP {} for host {host}", addr.ip()),
357 });
358 }
359 }
360 Ok((host.to_owned(), addrs))
361}
362
363fn parse_and_extract(
364 html: &str,
365 selector: &str,
366 extract: &ExtractMode,
367 limit: usize,
368) -> Result<String, ToolError> {
369 let soup = scrape_core::Soup::parse(html);
370
371 let tags = soup.find_all(selector).map_err(|e| {
372 ToolError::Execution(std::io::Error::new(
373 std::io::ErrorKind::InvalidData,
374 format!("invalid selector: {e}"),
375 ))
376 })?;
377
378 let mut results = Vec::new();
379
380 for tag in tags.into_iter().take(limit) {
381 let value = match extract {
382 ExtractMode::Text => tag.text(),
383 ExtractMode::Html => tag.inner_html(),
384 ExtractMode::Attr(name) => tag.get(name).unwrap_or_default().to_owned(),
385 };
386 if !value.trim().is_empty() {
387 results.push(value.trim().to_owned());
388 }
389 }
390
391 if results.is_empty() {
392 Ok(format!("No results for selector: {selector}"))
393 } else {
394 Ok(results.join("\n"))
395 }
396}
397
398#[cfg(test)]
399mod tests {
400 use super::*;
401
402 #[test]
405 fn extract_single_block() {
406 let text =
407 "Here:\n```scrape\n{\"url\":\"https://example.com\",\"select\":\"h1\"}\n```\nDone.";
408 let blocks = extract_scrape_blocks(text);
409 assert_eq!(blocks.len(), 1);
410 assert!(blocks[0].contains("example.com"));
411 }
412
413 #[test]
414 fn extract_multiple_blocks() {
415 let text = "```scrape\n{\"url\":\"https://a.com\",\"select\":\"h1\"}\n```\ntext\n```scrape\n{\"url\":\"https://b.com\",\"select\":\"p\"}\n```";
416 let blocks = extract_scrape_blocks(text);
417 assert_eq!(blocks.len(), 2);
418 }
419
420 #[test]
421 fn no_blocks_returns_empty() {
422 let blocks = extract_scrape_blocks("plain text, no code blocks");
423 assert!(blocks.is_empty());
424 }
425
426 #[test]
427 fn unclosed_block_ignored() {
428 let blocks = extract_scrape_blocks("```scrape\n{\"url\":\"https://x.com\"}");
429 assert!(blocks.is_empty());
430 }
431
432 #[test]
433 fn non_scrape_block_ignored() {
434 let text =
435 "```bash\necho hi\n```\n```scrape\n{\"url\":\"https://x.com\",\"select\":\"h1\"}\n```";
436 let blocks = extract_scrape_blocks(text);
437 assert_eq!(blocks.len(), 1);
438 assert!(blocks[0].contains("x.com"));
439 }
440
441 #[test]
442 fn multiline_json_block() {
443 let text =
444 "```scrape\n{\n \"url\": \"https://example.com\",\n \"select\": \"h1\"\n}\n```";
445 let blocks = extract_scrape_blocks(text);
446 assert_eq!(blocks.len(), 1);
447 let instr: ScrapeInstruction = serde_json::from_str(blocks[0]).unwrap();
448 assert_eq!(instr.url, "https://example.com");
449 }
450
451 #[test]
454 fn parse_valid_instruction() {
455 let json = r#"{"url":"https://example.com","select":"h1","extract":"text","limit":5}"#;
456 let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
457 assert_eq!(instr.url, "https://example.com");
458 assert_eq!(instr.select, "h1");
459 assert_eq!(instr.extract, "text");
460 assert_eq!(instr.limit, Some(5));
461 }
462
463 #[test]
464 fn parse_minimal_instruction() {
465 let json = r#"{"url":"https://example.com","select":"p"}"#;
466 let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
467 assert_eq!(instr.extract, "text");
468 assert!(instr.limit.is_none());
469 }
470
471 #[test]
472 fn parse_attr_extract() {
473 let json = r#"{"url":"https://example.com","select":"a","extract":"attr:href"}"#;
474 let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
475 assert_eq!(instr.extract, "attr:href");
476 }
477
478 #[test]
479 fn parse_invalid_json_errors() {
480 let result = serde_json::from_str::<ScrapeInstruction>("not json");
481 assert!(result.is_err());
482 }
483
484 #[test]
487 fn extract_mode_text() {
488 assert!(matches!(ExtractMode::parse("text"), ExtractMode::Text));
489 }
490
491 #[test]
492 fn extract_mode_html() {
493 assert!(matches!(ExtractMode::parse("html"), ExtractMode::Html));
494 }
495
496 #[test]
497 fn extract_mode_attr() {
498 let mode = ExtractMode::parse("attr:href");
499 assert!(matches!(mode, ExtractMode::Attr(ref s) if s == "href"));
500 }
501
502 #[test]
503 fn extract_mode_unknown_defaults_to_text() {
504 assert!(matches!(ExtractMode::parse("unknown"), ExtractMode::Text));
505 }
506
507 #[test]
510 fn valid_https_url() {
511 assert!(validate_url("https://example.com").is_ok());
512 }
513
514 #[test]
515 fn http_rejected() {
516 let err = validate_url("http://example.com").unwrap_err();
517 assert!(matches!(err, ToolError::Blocked { .. }));
518 }
519
520 #[test]
521 fn ftp_rejected() {
522 let err = validate_url("ftp://files.example.com").unwrap_err();
523 assert!(matches!(err, ToolError::Blocked { .. }));
524 }
525
526 #[test]
527 fn file_rejected() {
528 let err = validate_url("file:///etc/passwd").unwrap_err();
529 assert!(matches!(err, ToolError::Blocked { .. }));
530 }
531
532 #[test]
533 fn invalid_url_rejected() {
534 let err = validate_url("not a url").unwrap_err();
535 assert!(matches!(err, ToolError::Blocked { .. }));
536 }
537
538 #[test]
539 fn localhost_blocked() {
540 let err = validate_url("https://localhost/path").unwrap_err();
541 assert!(matches!(err, ToolError::Blocked { .. }));
542 }
543
544 #[test]
545 fn loopback_ip_blocked() {
546 let err = validate_url("https://127.0.0.1/path").unwrap_err();
547 assert!(matches!(err, ToolError::Blocked { .. }));
548 }
549
550 #[test]
551 fn private_10_blocked() {
552 let err = validate_url("https://10.0.0.1/api").unwrap_err();
553 assert!(matches!(err, ToolError::Blocked { .. }));
554 }
555
556 #[test]
557 fn private_172_blocked() {
558 let err = validate_url("https://172.16.0.1/api").unwrap_err();
559 assert!(matches!(err, ToolError::Blocked { .. }));
560 }
561
562 #[test]
563 fn private_192_blocked() {
564 let err = validate_url("https://192.168.1.1/api").unwrap_err();
565 assert!(matches!(err, ToolError::Blocked { .. }));
566 }
567
568 #[test]
569 fn ipv6_loopback_blocked() {
570 let err = validate_url("https://[::1]/path").unwrap_err();
571 assert!(matches!(err, ToolError::Blocked { .. }));
572 }
573
574 #[test]
575 fn public_ip_allowed() {
576 assert!(validate_url("https://93.184.216.34/page").is_ok());
577 }
578
579 #[test]
582 fn extract_text_from_html() {
583 let html = "<html><body><h1>Hello World</h1><p>Content</p></body></html>";
584 let result = parse_and_extract(html, "h1", &ExtractMode::Text, 10).unwrap();
585 assert_eq!(result, "Hello World");
586 }
587
588 #[test]
589 fn extract_multiple_elements() {
590 let html = "<ul><li>A</li><li>B</li><li>C</li></ul>";
591 let result = parse_and_extract(html, "li", &ExtractMode::Text, 10).unwrap();
592 assert_eq!(result, "A\nB\nC");
593 }
594
595 #[test]
596 fn extract_with_limit() {
597 let html = "<ul><li>A</li><li>B</li><li>C</li></ul>";
598 let result = parse_and_extract(html, "li", &ExtractMode::Text, 2).unwrap();
599 assert_eq!(result, "A\nB");
600 }
601
602 #[test]
603 fn extract_attr_href() {
604 let html = r#"<a href="https://example.com">Link</a>"#;
605 let result =
606 parse_and_extract(html, "a", &ExtractMode::Attr("href".to_owned()), 10).unwrap();
607 assert_eq!(result, "https://example.com");
608 }
609
610 #[test]
611 fn extract_inner_html() {
612 let html = "<div><span>inner</span></div>";
613 let result = parse_and_extract(html, "div", &ExtractMode::Html, 10).unwrap();
614 assert!(result.contains("<span>inner</span>"));
615 }
616
617 #[test]
618 fn no_matches_returns_message() {
619 let html = "<html><body><p>text</p></body></html>";
620 let result = parse_and_extract(html, "h1", &ExtractMode::Text, 10).unwrap();
621 assert!(result.starts_with("No results for selector:"));
622 }
623
624 #[test]
625 fn empty_text_skipped() {
626 let html = "<ul><li> </li><li>A</li></ul>";
627 let result = parse_and_extract(html, "li", &ExtractMode::Text, 10).unwrap();
628 assert_eq!(result, "A");
629 }
630
631 #[test]
632 fn invalid_selector_errors() {
633 let html = "<html><body></body></html>";
634 let result = parse_and_extract(html, "[[[invalid", &ExtractMode::Text, 10);
635 assert!(result.is_err());
636 }
637
638 #[test]
639 fn empty_html_returns_no_results() {
640 let result = parse_and_extract("", "h1", &ExtractMode::Text, 10).unwrap();
641 assert!(result.starts_with("No results for selector:"));
642 }
643
644 #[test]
645 fn nested_selector() {
646 let html = "<div><span>inner</span></div><span>outer</span>";
647 let result = parse_and_extract(html, "div > span", &ExtractMode::Text, 10).unwrap();
648 assert_eq!(result, "inner");
649 }
650
651 #[test]
652 fn attr_missing_returns_empty() {
653 let html = r#"<a>No href</a>"#;
654 let result =
655 parse_and_extract(html, "a", &ExtractMode::Attr("href".to_owned()), 10).unwrap();
656 assert!(result.starts_with("No results for selector:"));
657 }
658
659 #[test]
660 fn extract_html_mode() {
661 let html = "<div><b>bold</b> text</div>";
662 let result = parse_and_extract(html, "div", &ExtractMode::Html, 10).unwrap();
663 assert!(result.contains("<b>bold</b>"));
664 }
665
666 #[test]
667 fn limit_zero_returns_no_results() {
668 let html = "<ul><li>A</li><li>B</li></ul>";
669 let result = parse_and_extract(html, "li", &ExtractMode::Text, 0).unwrap();
670 assert!(result.starts_with("No results for selector:"));
671 }
672
673 #[test]
676 fn url_with_port_allowed() {
677 assert!(validate_url("https://example.com:8443/path").is_ok());
678 }
679
680 #[test]
681 fn link_local_ip_blocked() {
682 let err = validate_url("https://169.254.1.1/path").unwrap_err();
683 assert!(matches!(err, ToolError::Blocked { .. }));
684 }
685
686 #[test]
687 fn url_no_scheme_rejected() {
688 let err = validate_url("example.com/path").unwrap_err();
689 assert!(matches!(err, ToolError::Blocked { .. }));
690 }
691
692 #[test]
693 fn unspecified_ipv4_blocked() {
694 let err = validate_url("https://0.0.0.0/path").unwrap_err();
695 assert!(matches!(err, ToolError::Blocked { .. }));
696 }
697
698 #[test]
699 fn broadcast_ipv4_blocked() {
700 let err = validate_url("https://255.255.255.255/path").unwrap_err();
701 assert!(matches!(err, ToolError::Blocked { .. }));
702 }
703
704 #[test]
705 fn ipv6_link_local_blocked() {
706 let err = validate_url("https://[fe80::1]/path").unwrap_err();
707 assert!(matches!(err, ToolError::Blocked { .. }));
708 }
709
710 #[test]
711 fn ipv6_unique_local_blocked() {
712 let err = validate_url("https://[fd12::1]/path").unwrap_err();
713 assert!(matches!(err, ToolError::Blocked { .. }));
714 }
715
716 #[test]
717 fn ipv4_mapped_ipv6_loopback_blocked() {
718 let err = validate_url("https://[::ffff:127.0.0.1]/path").unwrap_err();
719 assert!(matches!(err, ToolError::Blocked { .. }));
720 }
721
722 #[test]
723 fn ipv4_mapped_ipv6_private_blocked() {
724 let err = validate_url("https://[::ffff:10.0.0.1]/path").unwrap_err();
725 assert!(matches!(err, ToolError::Blocked { .. }));
726 }
727
728 #[tokio::test]
731 async fn executor_no_blocks_returns_none() {
732 let config = ScrapeConfig::default();
733 let executor = WebScrapeExecutor::new(&config);
734 let result = executor.execute("plain text").await;
735 assert!(result.unwrap().is_none());
736 }
737
738 #[tokio::test]
739 async fn executor_invalid_json_errors() {
740 let config = ScrapeConfig::default();
741 let executor = WebScrapeExecutor::new(&config);
742 let response = "```scrape\nnot json\n```";
743 let result = executor.execute(response).await;
744 assert!(matches!(result, Err(ToolError::Execution(_))));
745 }
746
747 #[tokio::test]
748 async fn executor_blocked_url_errors() {
749 let config = ScrapeConfig::default();
750 let executor = WebScrapeExecutor::new(&config);
751 let response = "```scrape\n{\"url\":\"http://example.com\",\"select\":\"h1\"}\n```";
752 let result = executor.execute(response).await;
753 assert!(matches!(result, Err(ToolError::Blocked { .. })));
754 }
755
756 #[tokio::test]
757 async fn executor_private_ip_blocked() {
758 let config = ScrapeConfig::default();
759 let executor = WebScrapeExecutor::new(&config);
760 let response = "```scrape\n{\"url\":\"https://192.168.1.1/api\",\"select\":\"h1\"}\n```";
761 let result = executor.execute(response).await;
762 assert!(matches!(result, Err(ToolError::Blocked { .. })));
763 }
764
765 #[tokio::test]
766 async fn executor_unreachable_host_returns_error() {
767 let config = ScrapeConfig {
768 timeout: 1,
769 max_body_bytes: 1_048_576,
770 };
771 let executor = WebScrapeExecutor::new(&config);
772 let response = "```scrape\n{\"url\":\"https://192.0.2.1:1/page\",\"select\":\"h1\"}\n```";
773 let result = executor.execute(response).await;
774 assert!(matches!(result, Err(ToolError::Execution(_))));
775 }
776
777 #[tokio::test]
778 async fn executor_localhost_url_blocked() {
779 let config = ScrapeConfig::default();
780 let executor = WebScrapeExecutor::new(&config);
781 let response = "```scrape\n{\"url\":\"https://localhost:9999/api\",\"select\":\"h1\"}\n```";
782 let result = executor.execute(response).await;
783 assert!(matches!(result, Err(ToolError::Blocked { .. })));
784 }
785
786 #[tokio::test]
787 async fn executor_empty_text_returns_none() {
788 let config = ScrapeConfig::default();
789 let executor = WebScrapeExecutor::new(&config);
790 let result = executor.execute("").await;
791 assert!(result.unwrap().is_none());
792 }
793
794 #[tokio::test]
795 async fn executor_multiple_blocks_first_blocked() {
796 let config = ScrapeConfig::default();
797 let executor = WebScrapeExecutor::new(&config);
798 let response = "```scrape\n{\"url\":\"http://evil.com\",\"select\":\"h1\"}\n```\n\
799 ```scrape\n{\"url\":\"https://ok.com\",\"select\":\"h1\"}\n```";
800 let result = executor.execute(response).await;
801 assert!(result.is_err());
802 }
803
804 #[test]
805 fn validate_url_empty_string() {
806 let err = validate_url("").unwrap_err();
807 assert!(matches!(err, ToolError::Blocked { .. }));
808 }
809
810 #[test]
811 fn validate_url_javascript_scheme_blocked() {
812 let err = validate_url("javascript:alert(1)").unwrap_err();
813 assert!(matches!(err, ToolError::Blocked { .. }));
814 }
815
816 #[test]
817 fn validate_url_data_scheme_blocked() {
818 let err = validate_url("data:text/html,<h1>hi</h1>").unwrap_err();
819 assert!(matches!(err, ToolError::Blocked { .. }));
820 }
821
822 #[test]
823 fn is_private_host_public_domain_is_false() {
824 let host: url::Host<&str> = url::Host::Domain("example.com");
825 assert!(!is_private_host(&host));
826 }
827
828 #[test]
829 fn is_private_host_localhost_is_true() {
830 let host: url::Host<&str> = url::Host::Domain("localhost");
831 assert!(is_private_host(&host));
832 }
833
834 #[test]
835 fn is_private_host_ipv6_unspecified_is_true() {
836 let host = url::Host::Ipv6(std::net::Ipv6Addr::UNSPECIFIED);
837 assert!(is_private_host(&host));
838 }
839
840 #[test]
841 fn is_private_host_public_ipv6_is_false() {
842 let host = url::Host::Ipv6("2001:db8::1".parse().unwrap());
843 assert!(!is_private_host(&host));
844 }
845
    /// Start a fresh wiremock server and pair it with an executor that
    /// has generous limits (5 s timeout, 1 MiB body cap).
    async fn mock_server_executor() -> (WebScrapeExecutor, wiremock::MockServer) {
        let server = wiremock::MockServer::start().await;
        let executor = WebScrapeExecutor {
            timeout: Duration::from_secs(5),
            max_body_bytes: 1_048_576,
        };
        (executor, server)
    }
864
    /// Extract `(host, socket addrs)` from the mock server's URI so
    /// `fetch_html` can be called with pre-resolved addresses, matching
    /// how production code pins the client.
    fn server_host_and_addr(server: &wiremock::MockServer) -> (String, Vec<std::net::SocketAddr>) {
        let uri = server.uri();
        let url = Url::parse(&uri).unwrap();
        let host = url.host_str().unwrap_or("127.0.0.1").to_owned();
        let port = url.port().unwrap_or(80);
        let addr: std::net::SocketAddr = format!("{host}:{port}").parse().unwrap();
        (host, vec![addr])
    }
874
875 async fn follow_redirects_raw(
879 executor: &WebScrapeExecutor,
880 start_url: &str,
881 host: &str,
882 addrs: &[std::net::SocketAddr],
883 ) -> Result<String, ToolError> {
884 const MAX_REDIRECTS: usize = 3;
885 let mut current_url = start_url.to_owned();
886 let mut current_host = host.to_owned();
887 let mut current_addrs = addrs.to_vec();
888
889 for hop in 0..=MAX_REDIRECTS {
890 let client = executor.build_client(¤t_host, ¤t_addrs);
891 let resp = client
892 .get(¤t_url)
893 .send()
894 .await
895 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
896
897 let status = resp.status();
898
899 if status.is_redirection() {
900 if hop == MAX_REDIRECTS {
901 return Err(ToolError::Execution(std::io::Error::other(
902 "too many redirects",
903 )));
904 }
905
906 let location = resp
907 .headers()
908 .get(reqwest::header::LOCATION)
909 .and_then(|v| v.to_str().ok())
910 .ok_or_else(|| {
911 ToolError::Execution(std::io::Error::other("redirect with no Location"))
912 })?;
913
914 let base = Url::parse(¤t_url)
915 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
916 let next_url = base
917 .join(location)
918 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
919
920 current_url = next_url.to_string();
922 let _ = &mut current_host;
924 let _ = &mut current_addrs;
925 continue;
926 }
927
928 if !status.is_success() {
929 return Err(ToolError::Execution(std::io::Error::other(format!(
930 "HTTP {status}",
931 ))));
932 }
933
934 let bytes = resp
935 .bytes()
936 .await
937 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
938
939 if bytes.len() > executor.max_body_bytes {
940 return Err(ToolError::Execution(std::io::Error::other(format!(
941 "response too large: {} bytes (max: {})",
942 bytes.len(),
943 executor.max_body_bytes,
944 ))));
945 }
946
947 return String::from_utf8(bytes.to_vec())
948 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())));
949 }
950
951 Err(ToolError::Execution(std::io::Error::other(
952 "too many redirects",
953 )))
954 }
955
956 #[tokio::test]
957 async fn fetch_html_success_returns_body() {
958 use wiremock::matchers::{method, path};
959 use wiremock::{Mock, ResponseTemplate};
960
961 let (executor, server) = mock_server_executor().await;
962 Mock::given(method("GET"))
963 .and(path("/page"))
964 .respond_with(ResponseTemplate::new(200).set_body_string("<h1>OK</h1>"))
965 .mount(&server)
966 .await;
967
968 let (host, addrs) = server_host_and_addr(&server);
969 let url = format!("{}/page", server.uri());
970 let result = executor.fetch_html(&url, &host, &addrs).await;
971 assert!(result.is_ok(), "expected Ok, got: {result:?}");
972 assert_eq!(result.unwrap(), "<h1>OK</h1>");
973 }
974
975 #[tokio::test]
976 async fn fetch_html_non_2xx_returns_error() {
977 use wiremock::matchers::{method, path};
978 use wiremock::{Mock, ResponseTemplate};
979
980 let (executor, server) = mock_server_executor().await;
981 Mock::given(method("GET"))
982 .and(path("/forbidden"))
983 .respond_with(ResponseTemplate::new(403))
984 .mount(&server)
985 .await;
986
987 let (host, addrs) = server_host_and_addr(&server);
988 let url = format!("{}/forbidden", server.uri());
989 let result = executor.fetch_html(&url, &host, &addrs).await;
990 assert!(result.is_err());
991 let msg = result.unwrap_err().to_string();
992 assert!(msg.contains("403"), "expected 403 in error: {msg}");
993 }
994
995 #[tokio::test]
996 async fn fetch_html_404_returns_error() {
997 use wiremock::matchers::{method, path};
998 use wiremock::{Mock, ResponseTemplate};
999
1000 let (executor, server) = mock_server_executor().await;
1001 Mock::given(method("GET"))
1002 .and(path("/missing"))
1003 .respond_with(ResponseTemplate::new(404))
1004 .mount(&server)
1005 .await;
1006
1007 let (host, addrs) = server_host_and_addr(&server);
1008 let url = format!("{}/missing", server.uri());
1009 let result = executor.fetch_html(&url, &host, &addrs).await;
1010 assert!(result.is_err());
1011 let msg = result.unwrap_err().to_string();
1012 assert!(msg.contains("404"), "expected 404 in error: {msg}");
1013 }
1014
1015 #[tokio::test]
1016 async fn fetch_html_redirect_no_location_returns_error() {
1017 use wiremock::matchers::{method, path};
1018 use wiremock::{Mock, ResponseTemplate};
1019
1020 let (executor, server) = mock_server_executor().await;
1021 Mock::given(method("GET"))
1023 .and(path("/redirect-no-loc"))
1024 .respond_with(ResponseTemplate::new(302))
1025 .mount(&server)
1026 .await;
1027
1028 let (host, addrs) = server_host_and_addr(&server);
1029 let url = format!("{}/redirect-no-loc", server.uri());
1030 let result = executor.fetch_html(&url, &host, &addrs).await;
1031 assert!(result.is_err());
1032 let msg = result.unwrap_err().to_string();
1033 assert!(
1034 msg.contains("Location") || msg.contains("location"),
1035 "expected Location-related error: {msg}"
1036 );
1037 }
1038
1039 #[tokio::test]
1040 async fn fetch_html_single_redirect_followed() {
1041 use wiremock::matchers::{method, path};
1042 use wiremock::{Mock, ResponseTemplate};
1043
1044 let (executor, server) = mock_server_executor().await;
1045 let final_url = format!("{}/final", server.uri());
1046
1047 Mock::given(method("GET"))
1048 .and(path("/start"))
1049 .respond_with(ResponseTemplate::new(302).insert_header("location", final_url.as_str()))
1050 .mount(&server)
1051 .await;
1052
1053 Mock::given(method("GET"))
1054 .and(path("/final"))
1055 .respond_with(ResponseTemplate::new(200).set_body_string("<p>final</p>"))
1056 .mount(&server)
1057 .await;
1058
1059 let (host, addrs) = server_host_and_addr(&server);
1060 let url = format!("{}/start", server.uri());
1061 let result = follow_redirects_raw(&executor, &url, &host, &addrs).await;
1062 assert!(result.is_ok(), "single redirect should succeed: {result:?}");
1063 assert_eq!(result.unwrap(), "<p>final</p>");
1064 }
1065
1066 #[tokio::test]
1067 async fn fetch_html_three_redirects_allowed() {
1068 use wiremock::matchers::{method, path};
1069 use wiremock::{Mock, ResponseTemplate};
1070
1071 let (executor, server) = mock_server_executor().await;
1072 let hop2 = format!("{}/hop2", server.uri());
1073 let hop3 = format!("{}/hop3", server.uri());
1074 let final_dest = format!("{}/done", server.uri());
1075
1076 Mock::given(method("GET"))
1077 .and(path("/hop1"))
1078 .respond_with(ResponseTemplate::new(301).insert_header("location", hop2.as_str()))
1079 .mount(&server)
1080 .await;
1081 Mock::given(method("GET"))
1082 .and(path("/hop2"))
1083 .respond_with(ResponseTemplate::new(301).insert_header("location", hop3.as_str()))
1084 .mount(&server)
1085 .await;
1086 Mock::given(method("GET"))
1087 .and(path("/hop3"))
1088 .respond_with(ResponseTemplate::new(301).insert_header("location", final_dest.as_str()))
1089 .mount(&server)
1090 .await;
1091 Mock::given(method("GET"))
1092 .and(path("/done"))
1093 .respond_with(ResponseTemplate::new(200).set_body_string("<p>done</p>"))
1094 .mount(&server)
1095 .await;
1096
1097 let (host, addrs) = server_host_and_addr(&server);
1098 let url = format!("{}/hop1", server.uri());
1099 let result = follow_redirects_raw(&executor, &url, &host, &addrs).await;
1100 assert!(result.is_ok(), "3 redirects should succeed: {result:?}");
1101 assert_eq!(result.unwrap(), "<p>done</p>");
1102 }
1103
1104 #[tokio::test]
1105 async fn fetch_html_four_redirects_rejected() {
1106 use wiremock::matchers::{method, path};
1107 use wiremock::{Mock, ResponseTemplate};
1108
1109 let (executor, server) = mock_server_executor().await;
1110 let hop2 = format!("{}/r2", server.uri());
1111 let hop3 = format!("{}/r3", server.uri());
1112 let hop4 = format!("{}/r4", server.uri());
1113 let hop5 = format!("{}/r5", server.uri());
1114
1115 for (from, to) in [
1116 ("/r1", &hop2),
1117 ("/r2", &hop3),
1118 ("/r3", &hop4),
1119 ("/r4", &hop5),
1120 ] {
1121 Mock::given(method("GET"))
1122 .and(path(from))
1123 .respond_with(ResponseTemplate::new(301).insert_header("location", to.as_str()))
1124 .mount(&server)
1125 .await;
1126 }
1127
1128 let (host, addrs) = server_host_and_addr(&server);
1129 let url = format!("{}/r1", server.uri());
1130 let result = follow_redirects_raw(&executor, &url, &host, &addrs).await;
1131 assert!(result.is_err(), "4 redirects should be rejected");
1132 let msg = result.unwrap_err().to_string();
1133 assert!(
1134 msg.contains("redirect"),
1135 "expected redirect-related error: {msg}"
1136 );
1137 }
1138
1139 #[tokio::test]
1140 async fn fetch_html_body_too_large_returns_error() {
1141 use wiremock::matchers::{method, path};
1142 use wiremock::{Mock, ResponseTemplate};
1143
1144 let small_limit_executor = WebScrapeExecutor {
1145 timeout: Duration::from_secs(5),
1146 max_body_bytes: 10,
1147 };
1148 let server = wiremock::MockServer::start().await;
1149 Mock::given(method("GET"))
1150 .and(path("/big"))
1151 .respond_with(
1152 ResponseTemplate::new(200)
1153 .set_body_string("this body is definitely longer than ten bytes"),
1154 )
1155 .mount(&server)
1156 .await;
1157
1158 let (host, addrs) = server_host_and_addr(&server);
1159 let url = format!("{}/big", server.uri());
1160 let result = small_limit_executor.fetch_html(&url, &host, &addrs).await;
1161 assert!(result.is_err());
1162 let msg = result.unwrap_err().to_string();
1163 assert!(msg.contains("too large"), "expected too-large error: {msg}");
1164 }
1165
1166 #[test]
1167 fn extract_scrape_blocks_empty_block_content() {
1168 let text = "```scrape\n\n```";
1169 let blocks = extract_scrape_blocks(text);
1170 assert_eq!(blocks.len(), 1);
1171 assert!(blocks[0].is_empty());
1172 }
1173
1174 #[test]
1175 fn extract_scrape_blocks_whitespace_only() {
1176 let text = "```scrape\n \n```";
1177 let blocks = extract_scrape_blocks(text);
1178 assert_eq!(blocks.len(), 1);
1179 }
1180
1181 #[test]
1182 fn parse_and_extract_multiple_selectors() {
1183 let html = "<div><h1>Title</h1><p>Para</p></div>";
1184 let result = parse_and_extract(html, "h1, p", &ExtractMode::Text, 10).unwrap();
1185 assert!(result.contains("Title"));
1186 assert!(result.contains("Para"));
1187 }
1188
1189 #[test]
1190 fn webscrape_executor_new_with_custom_config() {
1191 let config = ScrapeConfig {
1192 timeout: 60,
1193 max_body_bytes: 512,
1194 };
1195 let executor = WebScrapeExecutor::new(&config);
1196 assert_eq!(executor.max_body_bytes, 512);
1197 }
1198
1199 #[test]
1200 fn webscrape_executor_debug() {
1201 let config = ScrapeConfig::default();
1202 let executor = WebScrapeExecutor::new(&config);
1203 let dbg = format!("{executor:?}");
1204 assert!(dbg.contains("WebScrapeExecutor"));
1205 }
1206
1207 #[test]
1208 fn extract_mode_attr_empty_name() {
1209 let mode = ExtractMode::parse("attr:");
1210 assert!(matches!(mode, ExtractMode::Attr(ref s) if s.is_empty()));
1211 }
1212
1213 #[test]
1214 fn default_extract_returns_text() {
1215 assert_eq!(default_extract(), "text");
1216 }
1217
1218 #[test]
1219 fn scrape_instruction_debug() {
1220 let json = r#"{"url":"https://example.com","select":"h1"}"#;
1221 let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
1222 let dbg = format!("{instr:?}");
1223 assert!(dbg.contains("ScrapeInstruction"));
1224 }
1225
1226 #[test]
1227 fn extract_mode_debug() {
1228 let mode = ExtractMode::Text;
1229 let dbg = format!("{mode:?}");
1230 assert!(dbg.contains("Text"));
1231 }
1232
    // NOTE(review): this test is a tautology — it declares its own local
    // MAX_REDIRECTS and then asserts it against the same literal, so it can
    // never detect a change to the limit fetch_html actually enforces.
    // Consider exposing the real constant (e.g. `pub(crate) const
    // MAX_REDIRECTS`) and asserting on that instead — TODO confirm where the
    // redirect limit is defined. Behavioral coverage of the limit itself is
    // provided by fetch_html_three_redirects_allowed /
    // fetch_html_four_redirects_rejected above.
    #[test]
    fn max_redirects_constant_is_three() {
        const MAX_REDIRECTS: usize = 3;
        assert_eq!(MAX_REDIRECTS, 3, "fetch_html allows exactly 3 redirects");
    }
1245
    // NOTE(review): tautological — this builds an io::Error from a string
    // literal and checks for the same literal, exercising std rather than the
    // scrape code. At best it documents the expected wording of the
    // missing-Location error; TODO confirm the message matches what
    // fetch_html actually emits, or replace with a wiremock-based test that
    // serves a 301 without a Location header.
    #[test]
    fn redirect_no_location_error_message() {
        let err = std::io::Error::other("redirect with no Location");
        assert!(err.to_string().contains("redirect with no Location"));
    }
1253
    // NOTE(review): tautological, same pattern as
    // redirect_no_location_error_message — asserts a literal against itself.
    // The real "too many redirects" behavior is covered by
    // fetch_html_four_redirects_rejected; TODO confirm the wording matches
    // fetch_html's actual error, or fold this into that test.
    #[test]
    fn too_many_redirects_error_message() {
        let err = std::io::Error::other("too many redirects");
        assert!(err.to_string().contains("too many redirects"));
    }
1260
1261 #[test]
1263 fn non_2xx_status_error_format() {
1264 let status = reqwest::StatusCode::FORBIDDEN;
1265 let msg = format!("HTTP {status}");
1266 assert!(msg.contains("403"));
1267 }
1268
1269 #[test]
1271 fn not_found_status_error_format() {
1272 let status = reqwest::StatusCode::NOT_FOUND;
1273 let msg = format!("HTTP {status}");
1274 assert!(msg.contains("404"));
1275 }
1276
1277 #[test]
1279 fn relative_redirect_same_host_path() {
1280 let base = Url::parse("https://example.com/current").unwrap();
1281 let resolved = base.join("/other").unwrap();
1282 assert_eq!(resolved.as_str(), "https://example.com/other");
1283 }
1284
1285 #[test]
1287 fn relative_redirect_relative_path() {
1288 let base = Url::parse("https://example.com/a/b").unwrap();
1289 let resolved = base.join("c").unwrap();
1290 assert_eq!(resolved.as_str(), "https://example.com/a/c");
1291 }
1292
1293 #[test]
1295 fn absolute_redirect_overrides_base() {
1296 let base = Url::parse("https://example.com/page").unwrap();
1297 let resolved = base.join("https://other.com/target").unwrap();
1298 assert_eq!(resolved.as_str(), "https://other.com/target");
1299 }
1300
1301 #[test]
1303 fn redirect_http_downgrade_rejected() {
1304 let location = "http://example.com/page";
1305 let base = Url::parse("https://example.com/start").unwrap();
1306 let next = base.join(location).unwrap();
1307 let err = validate_url(next.as_str()).unwrap_err();
1308 assert!(matches!(err, ToolError::Blocked { .. }));
1309 }
1310
1311 #[test]
1313 fn redirect_location_private_ip_blocked() {
1314 let location = "https://192.168.100.1/admin";
1315 let base = Url::parse("https://example.com/start").unwrap();
1316 let next = base.join(location).unwrap();
1317 let err = validate_url(next.as_str()).unwrap_err();
1318 assert!(matches!(err, ToolError::Blocked { .. }));
1319 let cmd = match err {
1320 ToolError::Blocked { command } => command,
1321 _ => panic!("expected Blocked"),
1322 };
1323 assert!(
1324 cmd.contains("private") || cmd.contains("scheme"),
1325 "error message should describe the block reason: {cmd}"
1326 );
1327 }
1328
1329 #[test]
1331 fn redirect_location_internal_domain_blocked() {
1332 let location = "https://metadata.internal/latest/meta-data/";
1333 let base = Url::parse("https://example.com/start").unwrap();
1334 let next = base.join(location).unwrap();
1335 let err = validate_url(next.as_str()).unwrap_err();
1336 assert!(matches!(err, ToolError::Blocked { .. }));
1337 }
1338
1339 #[test]
1341 fn redirect_chain_three_hops_all_public() {
1342 let hops = [
1343 "https://redirect1.example.com/hop1",
1344 "https://redirect2.example.com/hop2",
1345 "https://destination.example.com/final",
1346 ];
1347 for hop in hops {
1348 assert!(validate_url(hop).is_ok(), "expected ok for {hop}");
1349 }
1350 }
1351
1352 #[test]
1357 fn redirect_to_private_ip_rejected_by_validate_url() {
1358 let private_targets = [
1360 "https://127.0.0.1/secret",
1361 "https://10.0.0.1/internal",
1362 "https://192.168.1.1/admin",
1363 "https://172.16.0.1/data",
1364 "https://[::1]/path",
1365 "https://[fe80::1]/path",
1366 "https://localhost/path",
1367 "https://service.internal/api",
1368 ];
1369 for target in private_targets {
1370 let result = validate_url(target);
1371 assert!(result.is_err(), "expected error for {target}");
1372 assert!(
1373 matches!(result.unwrap_err(), ToolError::Blocked { .. }),
1374 "expected Blocked for {target}"
1375 );
1376 }
1377 }
1378
1379 #[test]
1381 fn redirect_relative_url_resolves_correctly() {
1382 let base = Url::parse("https://example.com/page").unwrap();
1383 let relative = "/other";
1384 let resolved = base.join(relative).unwrap();
1385 assert_eq!(resolved.as_str(), "https://example.com/other");
1386 }
1387
1388 #[test]
1390 fn redirect_to_http_rejected() {
1391 let err = validate_url("http://example.com/page").unwrap_err();
1392 assert!(matches!(err, ToolError::Blocked { .. }));
1393 }
1394
1395 #[test]
1396 fn ipv4_mapped_ipv6_link_local_blocked() {
1397 let err = validate_url("https://[::ffff:169.254.0.1]/path").unwrap_err();
1398 assert!(matches!(err, ToolError::Blocked { .. }));
1399 }
1400
1401 #[test]
1402 fn ipv4_mapped_ipv6_public_allowed() {
1403 assert!(validate_url("https://[::ffff:93.184.216.34]/path").is_ok());
1404 }
1405
1406 #[test]
1407 fn tool_definitions_returns_web_scrape() {
1408 let config = ScrapeConfig::default();
1409 let executor = WebScrapeExecutor::new(&config);
1410 let defs = executor.tool_definitions();
1411 assert_eq!(defs.len(), 1);
1412 assert_eq!(defs[0].id, "web_scrape");
1413 assert_eq!(
1414 defs[0].invocation,
1415 crate::registry::InvocationHint::FencedBlock("scrape")
1416 );
1417 }
1418
1419 #[test]
1420 fn tool_definitions_schema_has_all_params() {
1421 let config = ScrapeConfig::default();
1422 let executor = WebScrapeExecutor::new(&config);
1423 let defs = executor.tool_definitions();
1424 let obj = defs[0].schema.as_object().unwrap();
1425 let props = obj["properties"].as_object().unwrap();
1426 assert!(props.contains_key("url"));
1427 assert!(props.contains_key("select"));
1428 assert!(props.contains_key("extract"));
1429 assert!(props.contains_key("limit"));
1430 let req = obj["required"].as_array().unwrap();
1431 assert!(req.iter().any(|v| v.as_str() == Some("url")));
1432 assert!(req.iter().any(|v| v.as_str() == Some("select")));
1433 assert!(!req.iter().any(|v| v.as_str() == Some("extract")));
1434 }
1435
1436 #[test]
1439 fn subdomain_localhost_blocked() {
1440 let host: url::Host<&str> = url::Host::Domain("foo.localhost");
1441 assert!(is_private_host(&host));
1442 }
1443
1444 #[test]
1445 fn internal_tld_blocked() {
1446 let host: url::Host<&str> = url::Host::Domain("service.internal");
1447 assert!(is_private_host(&host));
1448 }
1449
1450 #[test]
1451 fn local_tld_blocked() {
1452 let host: url::Host<&str> = url::Host::Domain("printer.local");
1453 assert!(is_private_host(&host));
1454 }
1455
1456 #[test]
1457 fn public_domain_not_blocked() {
1458 let host: url::Host<&str> = url::Host::Domain("example.com");
1459 assert!(!is_private_host(&host));
1460 }
1461
1462 #[tokio::test]
1465 async fn resolve_loopback_rejected() {
1466 let url = url::Url::parse("https://127.0.0.1/path").unwrap();
1468 let result = resolve_and_validate(&url).await;
1470 assert!(
1471 result.is_err(),
1472 "loopback IP must be rejected by resolve_and_validate"
1473 );
1474 let err = result.unwrap_err();
1475 assert!(matches!(err, crate::executor::ToolError::Blocked { .. }));
1476 }
1477
1478 #[tokio::test]
1479 async fn resolve_private_10_rejected() {
1480 let url = url::Url::parse("https://10.0.0.1/path").unwrap();
1481 let result = resolve_and_validate(&url).await;
1482 assert!(result.is_err());
1483 assert!(matches!(
1484 result.unwrap_err(),
1485 crate::executor::ToolError::Blocked { .. }
1486 ));
1487 }
1488
1489 #[tokio::test]
1490 async fn resolve_private_192_rejected() {
1491 let url = url::Url::parse("https://192.168.1.1/path").unwrap();
1492 let result = resolve_and_validate(&url).await;
1493 assert!(result.is_err());
1494 assert!(matches!(
1495 result.unwrap_err(),
1496 crate::executor::ToolError::Blocked { .. }
1497 ));
1498 }
1499
1500 #[tokio::test]
1501 async fn resolve_ipv6_loopback_rejected() {
1502 let url = url::Url::parse("https://[::1]/path").unwrap();
1503 let result = resolve_and_validate(&url).await;
1504 assert!(result.is_err());
1505 assert!(matches!(
1506 result.unwrap_err(),
1507 crate::executor::ToolError::Blocked { .. }
1508 ));
1509 }
1510
1511 #[tokio::test]
1512 async fn resolve_no_host_returns_ok() {
1513 let url = url::Url::parse("https://example.com/path").unwrap();
1515 let url_no_host = url::Url::parse("data:text/plain,hello").unwrap();
1517 let result = resolve_and_validate(&url_no_host).await;
1519 assert!(result.is_ok());
1520 let (host, addrs) = result.unwrap();
1521 assert!(host.is_empty());
1522 assert!(addrs.is_empty());
1523 drop(url);
1524 drop(url_no_host);
1525 }
1526}