1use std::net::{IpAddr, SocketAddr};
5use std::time::Duration;
6
7use schemars::JsonSchema;
8use serde::Deserialize;
9use url::Url;
10
11use crate::config::ScrapeConfig;
12use crate::executor::{ToolCall, ToolError, ToolExecutor, ToolOutput, deserialize_params};
13
/// A single scrape request, parsed from a fenced ```scrape JSON block or a
/// structured tool call.
#[derive(Debug, Deserialize, JsonSchema)]
struct ScrapeInstruction {
    /// Page to fetch. Must be HTTPS and pass the SSRF checks in `validate_url`.
    url: String,
    /// CSS selector choosing which elements to extract.
    select: String,
    /// Extraction mode: "text", "html", or "attr:<name>"; defaults to "text".
    #[serde(default = "default_extract")]
    extract: String,
    /// Maximum number of matched elements to return (10 when omitted).
    limit: Option<usize>,
}
26
/// Serde default for `ScrapeInstruction::extract` when the field is omitted.
fn default_extract() -> String {
    String::from("text")
}
30
/// What to pull out of each element matched by the CSS selector.
#[derive(Debug)]
enum ExtractMode {
    /// The element's text content.
    Text,
    /// The element's inner HTML.
    Html,
    /// The value of the named attribute.
    Attr(String),
}

impl ExtractMode {
    /// Parse a user-supplied mode string. Unrecognized values fall back to
    /// `Text`; `"attr:<name>"` selects the named attribute (name may be empty).
    fn parse(s: &str) -> Self {
        if let Some(name) = s.strip_prefix("attr:") {
            Self::Attr(name.to_owned())
        } else if s == "html" {
            Self::Html
        } else {
            // Covers "text" and every unknown mode alike.
            Self::Text
        }
    }
}
50
/// Tool executor that fetches HTTPS pages and extracts data via CSS
/// selectors, with SSRF protections: scheme/host validation, DNS
/// resolution checks with address pinning, and manual redirect handling.
#[derive(Debug)]
pub struct WebScrapeExecutor {
    /// Per-request HTTP timeout.
    timeout: Duration,
    /// Upper bound on accepted response body size, in bytes.
    max_body_bytes: usize,
}
60
61impl WebScrapeExecutor {
62 #[must_use]
63 pub fn new(config: &ScrapeConfig) -> Self {
64 Self {
65 timeout: Duration::from_secs(config.timeout),
66 max_body_bytes: config.max_body_bytes,
67 }
68 }
69
70 fn build_client(&self, host: &str, addrs: &[SocketAddr]) -> reqwest::Client {
71 let mut builder = reqwest::Client::builder()
72 .timeout(self.timeout)
73 .redirect(reqwest::redirect::Policy::none());
74 builder = builder.resolve_to_addrs(host, addrs);
75 builder.build().unwrap_or_default()
76 }
77}
78
79impl ToolExecutor for WebScrapeExecutor {
80 fn tool_definitions(&self) -> Vec<crate::registry::ToolDef> {
81 use crate::registry::{InvocationHint, ToolDef};
82 vec![ToolDef {
83 id: "web_scrape",
84 description: "Scrape data from a web page via CSS selectors",
85 schema: schemars::schema_for!(ScrapeInstruction),
86 invocation: InvocationHint::FencedBlock("scrape"),
87 }]
88 }
89
90 async fn execute(&self, response: &str) -> Result<Option<ToolOutput>, ToolError> {
91 let blocks = extract_scrape_blocks(response);
92 if blocks.is_empty() {
93 return Ok(None);
94 }
95
96 let mut outputs = Vec::with_capacity(blocks.len());
97 #[allow(clippy::cast_possible_truncation)]
98 let blocks_executed = blocks.len() as u32;
99
100 for block in &blocks {
101 let instruction: ScrapeInstruction = serde_json::from_str(block).map_err(|e| {
102 ToolError::Execution(std::io::Error::new(
103 std::io::ErrorKind::InvalidData,
104 e.to_string(),
105 ))
106 })?;
107 outputs.push(self.scrape_instruction(&instruction).await?);
108 }
109
110 Ok(Some(ToolOutput {
111 tool_name: "web-scrape".to_owned(),
112 summary: outputs.join("\n\n"),
113 blocks_executed,
114 filter_stats: None,
115 diff: None,
116 streamed: false,
117 }))
118 }
119
120 async fn execute_tool_call(&self, call: &ToolCall) -> Result<Option<ToolOutput>, ToolError> {
121 if call.tool_id != "web_scrape" {
122 return Ok(None);
123 }
124
125 let instruction: ScrapeInstruction = deserialize_params(&call.params)?;
126
127 let result = self.scrape_instruction(&instruction).await?;
128
129 Ok(Some(ToolOutput {
130 tool_name: "web-scrape".to_owned(),
131 summary: result,
132 blocks_executed: 1,
133 filter_stats: None,
134 diff: None,
135 streamed: false,
136 }))
137 }
138}
139
140impl WebScrapeExecutor {
141 async fn scrape_instruction(
142 &self,
143 instruction: &ScrapeInstruction,
144 ) -> Result<String, ToolError> {
145 let parsed = validate_url(&instruction.url)?;
146 let (host, addrs) = resolve_and_validate(&parsed).await?;
147 let html = self.fetch_html(&instruction.url, &host, &addrs).await?;
148 let selector = instruction.select.clone();
149 let extract = ExtractMode::parse(&instruction.extract);
150 let limit = instruction.limit.unwrap_or(10);
151 tokio::task::spawn_blocking(move || parse_and_extract(&html, &selector, &extract, limit))
152 .await
153 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?
154 }
155
156 async fn fetch_html(
166 &self,
167 url: &str,
168 host: &str,
169 addrs: &[SocketAddr],
170 ) -> Result<String, ToolError> {
171 const MAX_REDIRECTS: usize = 3;
172
173 let mut current_url = url.to_owned();
174 let mut current_host = host.to_owned();
175 let mut current_addrs = addrs.to_vec();
176
177 for hop in 0..=MAX_REDIRECTS {
178 let client = self.build_client(¤t_host, ¤t_addrs);
180 let resp = client
181 .get(¤t_url)
182 .send()
183 .await
184 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
185
186 let status = resp.status();
187
188 if status.is_redirection() {
189 if hop == MAX_REDIRECTS {
190 return Err(ToolError::Execution(std::io::Error::other(
191 "too many redirects",
192 )));
193 }
194
195 let location = resp
196 .headers()
197 .get(reqwest::header::LOCATION)
198 .and_then(|v| v.to_str().ok())
199 .ok_or_else(|| {
200 ToolError::Execution(std::io::Error::other("redirect with no Location"))
201 })?;
202
203 let base = Url::parse(¤t_url)
205 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
206 let next_url = base
207 .join(location)
208 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
209
210 let validated = validate_url(next_url.as_str())?;
211 let (next_host, next_addrs) = resolve_and_validate(&validated).await?;
212
213 current_url = next_url.to_string();
214 current_host = next_host;
215 current_addrs = next_addrs;
216 continue;
217 }
218
219 if !status.is_success() {
220 return Err(ToolError::Execution(std::io::Error::other(format!(
221 "HTTP {status}",
222 ))));
223 }
224
225 let bytes = resp
226 .bytes()
227 .await
228 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
229
230 if bytes.len() > self.max_body_bytes {
231 return Err(ToolError::Execution(std::io::Error::other(format!(
232 "response too large: {} bytes (max: {})",
233 bytes.len(),
234 self.max_body_bytes,
235 ))));
236 }
237
238 return String::from_utf8(bytes.to_vec())
239 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())));
240 }
241
242 Err(ToolError::Execution(std::io::Error::other(
243 "too many redirects",
244 )))
245 }
246}
247
/// Collect the contents of every fenced ```scrape block in `text`.
fn extract_scrape_blocks(text: &str) -> Vec<&str> {
    crate::executor::extract_fenced_blocks(text, "scrape")
}
251
252fn validate_url(raw: &str) -> Result<Url, ToolError> {
253 let parsed = Url::parse(raw).map_err(|_| ToolError::Blocked {
254 command: format!("invalid URL: {raw}"),
255 })?;
256
257 if parsed.scheme() != "https" {
258 return Err(ToolError::Blocked {
259 command: format!("scheme not allowed: {}", parsed.scheme()),
260 });
261 }
262
263 if let Some(host) = parsed.host()
264 && is_private_host(&host)
265 {
266 return Err(ToolError::Blocked {
267 command: format!(
268 "private/local host blocked: {}",
269 parsed.host_str().unwrap_or("")
270 ),
271 });
272 }
273
274 Ok(parsed)
275}
276
/// True when `ip` must not be scraped: loopback, RFC 1918 private,
/// link-local, unspecified, or broadcast for IPv4; loopback, unspecified,
/// link-local (fe80::/10), unique-local (fc00::/7), and IPv4-mapped
/// addresses whose embedded IPv4 is blocked, for IPv6.
pub(crate) fn is_private_ip(ip: IpAddr) -> bool {
    // Shared IPv4 policy, applied both to plain V4 and to IPv4-mapped V6.
    fn v4_blocked(v4: std::net::Ipv4Addr) -> bool {
        v4.is_loopback()
            || v4.is_private()
            || v4.is_link_local()
            || v4.is_unspecified()
            || v4.is_broadcast()
    }

    match ip {
        IpAddr::V4(v4) => v4_blocked(v4),
        IpAddr::V6(v6) => {
            if v6.is_loopback() || v6.is_unspecified() {
                return true;
            }
            let first = v6.segments()[0];
            // fe80::/10 link-local, fc00::/7 unique-local.
            if (first & 0xffc0) == 0xfe80 || (first & 0xfe00) == 0xfc00 {
                return true;
            }
            // ::ffff:a.b.c.d — judge by the embedded IPv4 address.
            v6.to_ipv4_mapped().map_or(false, v4_blocked)
        }
    }
}
314
315fn is_private_host(host: &url::Host<&str>) -> bool {
316 match host {
317 url::Host::Domain(d) => {
318 #[allow(clippy::case_sensitive_file_extension_comparisons)]
321 {
322 *d == "localhost"
323 || d.ends_with(".localhost")
324 || d.ends_with(".internal")
325 || d.ends_with(".local")
326 }
327 }
328 url::Host::Ipv4(v4) => is_private_ip(IpAddr::V4(*v4)),
329 url::Host::Ipv6(v6) => is_private_ip(IpAddr::V6(*v6)),
330 }
331}
332
333async fn resolve_and_validate(url: &Url) -> Result<(String, Vec<SocketAddr>), ToolError> {
339 let Some(host) = url.host_str() else {
340 return Ok((String::new(), vec![]));
341 };
342 let port = url.port_or_known_default().unwrap_or(443);
343 let addrs: Vec<SocketAddr> = tokio::net::lookup_host(format!("{host}:{port}"))
344 .await
345 .map_err(|e| ToolError::Blocked {
346 command: format!("DNS resolution failed: {e}"),
347 })?
348 .collect();
349 for addr in &addrs {
350 if is_private_ip(addr.ip()) {
351 return Err(ToolError::Blocked {
352 command: format!("SSRF protection: private IP {} for host {host}", addr.ip()),
353 });
354 }
355 }
356 Ok((host.to_owned(), addrs))
357}
358
359fn parse_and_extract(
360 html: &str,
361 selector: &str,
362 extract: &ExtractMode,
363 limit: usize,
364) -> Result<String, ToolError> {
365 let soup = scrape_core::Soup::parse(html);
366
367 let tags = soup.find_all(selector).map_err(|e| {
368 ToolError::Execution(std::io::Error::new(
369 std::io::ErrorKind::InvalidData,
370 format!("invalid selector: {e}"),
371 ))
372 })?;
373
374 let mut results = Vec::new();
375
376 for tag in tags.into_iter().take(limit) {
377 let value = match extract {
378 ExtractMode::Text => tag.text(),
379 ExtractMode::Html => tag.inner_html(),
380 ExtractMode::Attr(name) => tag.get(name).unwrap_or_default().to_owned(),
381 };
382 if !value.trim().is_empty() {
383 results.push(value.trim().to_owned());
384 }
385 }
386
387 if results.is_empty() {
388 Ok(format!("No results for selector: {selector}"))
389 } else {
390 Ok(results.join("\n"))
391 }
392}
393
394#[cfg(test)]
395mod tests {
396 use super::*;
397
398 #[test]
401 fn extract_single_block() {
402 let text =
403 "Here:\n```scrape\n{\"url\":\"https://example.com\",\"select\":\"h1\"}\n```\nDone.";
404 let blocks = extract_scrape_blocks(text);
405 assert_eq!(blocks.len(), 1);
406 assert!(blocks[0].contains("example.com"));
407 }
408
409 #[test]
410 fn extract_multiple_blocks() {
411 let text = "```scrape\n{\"url\":\"https://a.com\",\"select\":\"h1\"}\n```\ntext\n```scrape\n{\"url\":\"https://b.com\",\"select\":\"p\"}\n```";
412 let blocks = extract_scrape_blocks(text);
413 assert_eq!(blocks.len(), 2);
414 }
415
416 #[test]
417 fn no_blocks_returns_empty() {
418 let blocks = extract_scrape_blocks("plain text, no code blocks");
419 assert!(blocks.is_empty());
420 }
421
422 #[test]
423 fn unclosed_block_ignored() {
424 let blocks = extract_scrape_blocks("```scrape\n{\"url\":\"https://x.com\"}");
425 assert!(blocks.is_empty());
426 }
427
428 #[test]
429 fn non_scrape_block_ignored() {
430 let text =
431 "```bash\necho hi\n```\n```scrape\n{\"url\":\"https://x.com\",\"select\":\"h1\"}\n```";
432 let blocks = extract_scrape_blocks(text);
433 assert_eq!(blocks.len(), 1);
434 assert!(blocks[0].contains("x.com"));
435 }
436
437 #[test]
438 fn multiline_json_block() {
439 let text =
440 "```scrape\n{\n \"url\": \"https://example.com\",\n \"select\": \"h1\"\n}\n```";
441 let blocks = extract_scrape_blocks(text);
442 assert_eq!(blocks.len(), 1);
443 let instr: ScrapeInstruction = serde_json::from_str(blocks[0]).unwrap();
444 assert_eq!(instr.url, "https://example.com");
445 }
446
447 #[test]
450 fn parse_valid_instruction() {
451 let json = r#"{"url":"https://example.com","select":"h1","extract":"text","limit":5}"#;
452 let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
453 assert_eq!(instr.url, "https://example.com");
454 assert_eq!(instr.select, "h1");
455 assert_eq!(instr.extract, "text");
456 assert_eq!(instr.limit, Some(5));
457 }
458
459 #[test]
460 fn parse_minimal_instruction() {
461 let json = r#"{"url":"https://example.com","select":"p"}"#;
462 let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
463 assert_eq!(instr.extract, "text");
464 assert!(instr.limit.is_none());
465 }
466
467 #[test]
468 fn parse_attr_extract() {
469 let json = r#"{"url":"https://example.com","select":"a","extract":"attr:href"}"#;
470 let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
471 assert_eq!(instr.extract, "attr:href");
472 }
473
474 #[test]
475 fn parse_invalid_json_errors() {
476 let result = serde_json::from_str::<ScrapeInstruction>("not json");
477 assert!(result.is_err());
478 }
479
480 #[test]
483 fn extract_mode_text() {
484 assert!(matches!(ExtractMode::parse("text"), ExtractMode::Text));
485 }
486
487 #[test]
488 fn extract_mode_html() {
489 assert!(matches!(ExtractMode::parse("html"), ExtractMode::Html));
490 }
491
492 #[test]
493 fn extract_mode_attr() {
494 let mode = ExtractMode::parse("attr:href");
495 assert!(matches!(mode, ExtractMode::Attr(ref s) if s == "href"));
496 }
497
498 #[test]
499 fn extract_mode_unknown_defaults_to_text() {
500 assert!(matches!(ExtractMode::parse("unknown"), ExtractMode::Text));
501 }
502
503 #[test]
506 fn valid_https_url() {
507 assert!(validate_url("https://example.com").is_ok());
508 }
509
510 #[test]
511 fn http_rejected() {
512 let err = validate_url("http://example.com").unwrap_err();
513 assert!(matches!(err, ToolError::Blocked { .. }));
514 }
515
516 #[test]
517 fn ftp_rejected() {
518 let err = validate_url("ftp://files.example.com").unwrap_err();
519 assert!(matches!(err, ToolError::Blocked { .. }));
520 }
521
522 #[test]
523 fn file_rejected() {
524 let err = validate_url("file:///etc/passwd").unwrap_err();
525 assert!(matches!(err, ToolError::Blocked { .. }));
526 }
527
528 #[test]
529 fn invalid_url_rejected() {
530 let err = validate_url("not a url").unwrap_err();
531 assert!(matches!(err, ToolError::Blocked { .. }));
532 }
533
534 #[test]
535 fn localhost_blocked() {
536 let err = validate_url("https://localhost/path").unwrap_err();
537 assert!(matches!(err, ToolError::Blocked { .. }));
538 }
539
540 #[test]
541 fn loopback_ip_blocked() {
542 let err = validate_url("https://127.0.0.1/path").unwrap_err();
543 assert!(matches!(err, ToolError::Blocked { .. }));
544 }
545
546 #[test]
547 fn private_10_blocked() {
548 let err = validate_url("https://10.0.0.1/api").unwrap_err();
549 assert!(matches!(err, ToolError::Blocked { .. }));
550 }
551
552 #[test]
553 fn private_172_blocked() {
554 let err = validate_url("https://172.16.0.1/api").unwrap_err();
555 assert!(matches!(err, ToolError::Blocked { .. }));
556 }
557
558 #[test]
559 fn private_192_blocked() {
560 let err = validate_url("https://192.168.1.1/api").unwrap_err();
561 assert!(matches!(err, ToolError::Blocked { .. }));
562 }
563
564 #[test]
565 fn ipv6_loopback_blocked() {
566 let err = validate_url("https://[::1]/path").unwrap_err();
567 assert!(matches!(err, ToolError::Blocked { .. }));
568 }
569
570 #[test]
571 fn public_ip_allowed() {
572 assert!(validate_url("https://93.184.216.34/page").is_ok());
573 }
574
575 #[test]
578 fn extract_text_from_html() {
579 let html = "<html><body><h1>Hello World</h1><p>Content</p></body></html>";
580 let result = parse_and_extract(html, "h1", &ExtractMode::Text, 10).unwrap();
581 assert_eq!(result, "Hello World");
582 }
583
584 #[test]
585 fn extract_multiple_elements() {
586 let html = "<ul><li>A</li><li>B</li><li>C</li></ul>";
587 let result = parse_and_extract(html, "li", &ExtractMode::Text, 10).unwrap();
588 assert_eq!(result, "A\nB\nC");
589 }
590
591 #[test]
592 fn extract_with_limit() {
593 let html = "<ul><li>A</li><li>B</li><li>C</li></ul>";
594 let result = parse_and_extract(html, "li", &ExtractMode::Text, 2).unwrap();
595 assert_eq!(result, "A\nB");
596 }
597
598 #[test]
599 fn extract_attr_href() {
600 let html = r#"<a href="https://example.com">Link</a>"#;
601 let result =
602 parse_and_extract(html, "a", &ExtractMode::Attr("href".to_owned()), 10).unwrap();
603 assert_eq!(result, "https://example.com");
604 }
605
606 #[test]
607 fn extract_inner_html() {
608 let html = "<div><span>inner</span></div>";
609 let result = parse_and_extract(html, "div", &ExtractMode::Html, 10).unwrap();
610 assert!(result.contains("<span>inner</span>"));
611 }
612
613 #[test]
614 fn no_matches_returns_message() {
615 let html = "<html><body><p>text</p></body></html>";
616 let result = parse_and_extract(html, "h1", &ExtractMode::Text, 10).unwrap();
617 assert!(result.starts_with("No results for selector:"));
618 }
619
620 #[test]
621 fn empty_text_skipped() {
622 let html = "<ul><li> </li><li>A</li></ul>";
623 let result = parse_and_extract(html, "li", &ExtractMode::Text, 10).unwrap();
624 assert_eq!(result, "A");
625 }
626
627 #[test]
628 fn invalid_selector_errors() {
629 let html = "<html><body></body></html>";
630 let result = parse_and_extract(html, "[[[invalid", &ExtractMode::Text, 10);
631 assert!(result.is_err());
632 }
633
634 #[test]
635 fn empty_html_returns_no_results() {
636 let result = parse_and_extract("", "h1", &ExtractMode::Text, 10).unwrap();
637 assert!(result.starts_with("No results for selector:"));
638 }
639
640 #[test]
641 fn nested_selector() {
642 let html = "<div><span>inner</span></div><span>outer</span>";
643 let result = parse_and_extract(html, "div > span", &ExtractMode::Text, 10).unwrap();
644 assert_eq!(result, "inner");
645 }
646
647 #[test]
648 fn attr_missing_returns_empty() {
649 let html = r#"<a>No href</a>"#;
650 let result =
651 parse_and_extract(html, "a", &ExtractMode::Attr("href".to_owned()), 10).unwrap();
652 assert!(result.starts_with("No results for selector:"));
653 }
654
655 #[test]
656 fn extract_html_mode() {
657 let html = "<div><b>bold</b> text</div>";
658 let result = parse_and_extract(html, "div", &ExtractMode::Html, 10).unwrap();
659 assert!(result.contains("<b>bold</b>"));
660 }
661
662 #[test]
663 fn limit_zero_returns_no_results() {
664 let html = "<ul><li>A</li><li>B</li></ul>";
665 let result = parse_and_extract(html, "li", &ExtractMode::Text, 0).unwrap();
666 assert!(result.starts_with("No results for selector:"));
667 }
668
669 #[test]
672 fn url_with_port_allowed() {
673 assert!(validate_url("https://example.com:8443/path").is_ok());
674 }
675
676 #[test]
677 fn link_local_ip_blocked() {
678 let err = validate_url("https://169.254.1.1/path").unwrap_err();
679 assert!(matches!(err, ToolError::Blocked { .. }));
680 }
681
682 #[test]
683 fn url_no_scheme_rejected() {
684 let err = validate_url("example.com/path").unwrap_err();
685 assert!(matches!(err, ToolError::Blocked { .. }));
686 }
687
688 #[test]
689 fn unspecified_ipv4_blocked() {
690 let err = validate_url("https://0.0.0.0/path").unwrap_err();
691 assert!(matches!(err, ToolError::Blocked { .. }));
692 }
693
694 #[test]
695 fn broadcast_ipv4_blocked() {
696 let err = validate_url("https://255.255.255.255/path").unwrap_err();
697 assert!(matches!(err, ToolError::Blocked { .. }));
698 }
699
700 #[test]
701 fn ipv6_link_local_blocked() {
702 let err = validate_url("https://[fe80::1]/path").unwrap_err();
703 assert!(matches!(err, ToolError::Blocked { .. }));
704 }
705
706 #[test]
707 fn ipv6_unique_local_blocked() {
708 let err = validate_url("https://[fd12::1]/path").unwrap_err();
709 assert!(matches!(err, ToolError::Blocked { .. }));
710 }
711
712 #[test]
713 fn ipv4_mapped_ipv6_loopback_blocked() {
714 let err = validate_url("https://[::ffff:127.0.0.1]/path").unwrap_err();
715 assert!(matches!(err, ToolError::Blocked { .. }));
716 }
717
718 #[test]
719 fn ipv4_mapped_ipv6_private_blocked() {
720 let err = validate_url("https://[::ffff:10.0.0.1]/path").unwrap_err();
721 assert!(matches!(err, ToolError::Blocked { .. }));
722 }
723
724 #[tokio::test]
727 async fn executor_no_blocks_returns_none() {
728 let config = ScrapeConfig::default();
729 let executor = WebScrapeExecutor::new(&config);
730 let result = executor.execute("plain text").await;
731 assert!(result.unwrap().is_none());
732 }
733
734 #[tokio::test]
735 async fn executor_invalid_json_errors() {
736 let config = ScrapeConfig::default();
737 let executor = WebScrapeExecutor::new(&config);
738 let response = "```scrape\nnot json\n```";
739 let result = executor.execute(response).await;
740 assert!(matches!(result, Err(ToolError::Execution(_))));
741 }
742
743 #[tokio::test]
744 async fn executor_blocked_url_errors() {
745 let config = ScrapeConfig::default();
746 let executor = WebScrapeExecutor::new(&config);
747 let response = "```scrape\n{\"url\":\"http://example.com\",\"select\":\"h1\"}\n```";
748 let result = executor.execute(response).await;
749 assert!(matches!(result, Err(ToolError::Blocked { .. })));
750 }
751
752 #[tokio::test]
753 async fn executor_private_ip_blocked() {
754 let config = ScrapeConfig::default();
755 let executor = WebScrapeExecutor::new(&config);
756 let response = "```scrape\n{\"url\":\"https://192.168.1.1/api\",\"select\":\"h1\"}\n```";
757 let result = executor.execute(response).await;
758 assert!(matches!(result, Err(ToolError::Blocked { .. })));
759 }
760
761 #[tokio::test]
762 async fn executor_unreachable_host_returns_error() {
763 let config = ScrapeConfig {
764 timeout: 1,
765 max_body_bytes: 1_048_576,
766 };
767 let executor = WebScrapeExecutor::new(&config);
768 let response = "```scrape\n{\"url\":\"https://192.0.2.1:1/page\",\"select\":\"h1\"}\n```";
769 let result = executor.execute(response).await;
770 assert!(matches!(result, Err(ToolError::Execution(_))));
771 }
772
773 #[tokio::test]
774 async fn executor_localhost_url_blocked() {
775 let config = ScrapeConfig::default();
776 let executor = WebScrapeExecutor::new(&config);
777 let response = "```scrape\n{\"url\":\"https://localhost:9999/api\",\"select\":\"h1\"}\n```";
778 let result = executor.execute(response).await;
779 assert!(matches!(result, Err(ToolError::Blocked { .. })));
780 }
781
782 #[tokio::test]
783 async fn executor_empty_text_returns_none() {
784 let config = ScrapeConfig::default();
785 let executor = WebScrapeExecutor::new(&config);
786 let result = executor.execute("").await;
787 assert!(result.unwrap().is_none());
788 }
789
790 #[tokio::test]
791 async fn executor_multiple_blocks_first_blocked() {
792 let config = ScrapeConfig::default();
793 let executor = WebScrapeExecutor::new(&config);
794 let response = "```scrape\n{\"url\":\"http://evil.com\",\"select\":\"h1\"}\n```\n\
795 ```scrape\n{\"url\":\"https://ok.com\",\"select\":\"h1\"}\n```";
796 let result = executor.execute(response).await;
797 assert!(result.is_err());
798 }
799
800 #[test]
801 fn validate_url_empty_string() {
802 let err = validate_url("").unwrap_err();
803 assert!(matches!(err, ToolError::Blocked { .. }));
804 }
805
806 #[test]
807 fn validate_url_javascript_scheme_blocked() {
808 let err = validate_url("javascript:alert(1)").unwrap_err();
809 assert!(matches!(err, ToolError::Blocked { .. }));
810 }
811
812 #[test]
813 fn validate_url_data_scheme_blocked() {
814 let err = validate_url("data:text/html,<h1>hi</h1>").unwrap_err();
815 assert!(matches!(err, ToolError::Blocked { .. }));
816 }
817
818 #[test]
819 fn is_private_host_public_domain_is_false() {
820 let host: url::Host<&str> = url::Host::Domain("example.com");
821 assert!(!is_private_host(&host));
822 }
823
824 #[test]
825 fn is_private_host_localhost_is_true() {
826 let host: url::Host<&str> = url::Host::Domain("localhost");
827 assert!(is_private_host(&host));
828 }
829
830 #[test]
831 fn is_private_host_ipv6_unspecified_is_true() {
832 let host = url::Host::Ipv6(std::net::Ipv6Addr::UNSPECIFIED);
833 assert!(is_private_host(&host));
834 }
835
836 #[test]
837 fn is_private_host_public_ipv6_is_false() {
838 let host = url::Host::Ipv6("2001:db8::1".parse().unwrap());
839 assert!(!is_private_host(&host));
840 }
841
842 async fn mock_server_executor() -> (WebScrapeExecutor, wiremock::MockServer) {
853 let server = wiremock::MockServer::start().await;
854 let executor = WebScrapeExecutor {
855 timeout: Duration::from_secs(5),
856 max_body_bytes: 1_048_576,
857 };
858 (executor, server)
859 }
860
861 fn server_host_and_addr(server: &wiremock::MockServer) -> (String, Vec<std::net::SocketAddr>) {
863 let uri = server.uri();
864 let url = Url::parse(&uri).unwrap();
865 let host = url.host_str().unwrap_or("127.0.0.1").to_owned();
866 let port = url.port().unwrap_or(80);
867 let addr: std::net::SocketAddr = format!("{host}:{port}").parse().unwrap();
868 (host, vec![addr])
869 }
870
871 async fn follow_redirects_raw(
875 executor: &WebScrapeExecutor,
876 start_url: &str,
877 host: &str,
878 addrs: &[std::net::SocketAddr],
879 ) -> Result<String, ToolError> {
880 const MAX_REDIRECTS: usize = 3;
881 let mut current_url = start_url.to_owned();
882 let mut current_host = host.to_owned();
883 let mut current_addrs = addrs.to_vec();
884
885 for hop in 0..=MAX_REDIRECTS {
886 let client = executor.build_client(¤t_host, ¤t_addrs);
887 let resp = client
888 .get(¤t_url)
889 .send()
890 .await
891 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
892
893 let status = resp.status();
894
895 if status.is_redirection() {
896 if hop == MAX_REDIRECTS {
897 return Err(ToolError::Execution(std::io::Error::other(
898 "too many redirects",
899 )));
900 }
901
902 let location = resp
903 .headers()
904 .get(reqwest::header::LOCATION)
905 .and_then(|v| v.to_str().ok())
906 .ok_or_else(|| {
907 ToolError::Execution(std::io::Error::other("redirect with no Location"))
908 })?;
909
910 let base = Url::parse(¤t_url)
911 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
912 let next_url = base
913 .join(location)
914 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
915
916 current_url = next_url.to_string();
918 let _ = &mut current_host;
920 let _ = &mut current_addrs;
921 continue;
922 }
923
924 if !status.is_success() {
925 return Err(ToolError::Execution(std::io::Error::other(format!(
926 "HTTP {status}",
927 ))));
928 }
929
930 let bytes = resp
931 .bytes()
932 .await
933 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
934
935 if bytes.len() > executor.max_body_bytes {
936 return Err(ToolError::Execution(std::io::Error::other(format!(
937 "response too large: {} bytes (max: {})",
938 bytes.len(),
939 executor.max_body_bytes,
940 ))));
941 }
942
943 return String::from_utf8(bytes.to_vec())
944 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())));
945 }
946
947 Err(ToolError::Execution(std::io::Error::other(
948 "too many redirects",
949 )))
950 }
951
952 #[tokio::test]
953 async fn fetch_html_success_returns_body() {
954 use wiremock::matchers::{method, path};
955 use wiremock::{Mock, ResponseTemplate};
956
957 let (executor, server) = mock_server_executor().await;
958 Mock::given(method("GET"))
959 .and(path("/page"))
960 .respond_with(ResponseTemplate::new(200).set_body_string("<h1>OK</h1>"))
961 .mount(&server)
962 .await;
963
964 let (host, addrs) = server_host_and_addr(&server);
965 let url = format!("{}/page", server.uri());
966 let result = executor.fetch_html(&url, &host, &addrs).await;
967 assert!(result.is_ok(), "expected Ok, got: {result:?}");
968 assert_eq!(result.unwrap(), "<h1>OK</h1>");
969 }
970
971 #[tokio::test]
972 async fn fetch_html_non_2xx_returns_error() {
973 use wiremock::matchers::{method, path};
974 use wiremock::{Mock, ResponseTemplate};
975
976 let (executor, server) = mock_server_executor().await;
977 Mock::given(method("GET"))
978 .and(path("/forbidden"))
979 .respond_with(ResponseTemplate::new(403))
980 .mount(&server)
981 .await;
982
983 let (host, addrs) = server_host_and_addr(&server);
984 let url = format!("{}/forbidden", server.uri());
985 let result = executor.fetch_html(&url, &host, &addrs).await;
986 assert!(result.is_err());
987 let msg = result.unwrap_err().to_string();
988 assert!(msg.contains("403"), "expected 403 in error: {msg}");
989 }
990
991 #[tokio::test]
992 async fn fetch_html_404_returns_error() {
993 use wiremock::matchers::{method, path};
994 use wiremock::{Mock, ResponseTemplate};
995
996 let (executor, server) = mock_server_executor().await;
997 Mock::given(method("GET"))
998 .and(path("/missing"))
999 .respond_with(ResponseTemplate::new(404))
1000 .mount(&server)
1001 .await;
1002
1003 let (host, addrs) = server_host_and_addr(&server);
1004 let url = format!("{}/missing", server.uri());
1005 let result = executor.fetch_html(&url, &host, &addrs).await;
1006 assert!(result.is_err());
1007 let msg = result.unwrap_err().to_string();
1008 assert!(msg.contains("404"), "expected 404 in error: {msg}");
1009 }
1010
1011 #[tokio::test]
1012 async fn fetch_html_redirect_no_location_returns_error() {
1013 use wiremock::matchers::{method, path};
1014 use wiremock::{Mock, ResponseTemplate};
1015
1016 let (executor, server) = mock_server_executor().await;
1017 Mock::given(method("GET"))
1019 .and(path("/redirect-no-loc"))
1020 .respond_with(ResponseTemplate::new(302))
1021 .mount(&server)
1022 .await;
1023
1024 let (host, addrs) = server_host_and_addr(&server);
1025 let url = format!("{}/redirect-no-loc", server.uri());
1026 let result = executor.fetch_html(&url, &host, &addrs).await;
1027 assert!(result.is_err());
1028 let msg = result.unwrap_err().to_string();
1029 assert!(
1030 msg.contains("Location") || msg.contains("location"),
1031 "expected Location-related error: {msg}"
1032 );
1033 }
1034
1035 #[tokio::test]
1036 async fn fetch_html_single_redirect_followed() {
1037 use wiremock::matchers::{method, path};
1038 use wiremock::{Mock, ResponseTemplate};
1039
1040 let (executor, server) = mock_server_executor().await;
1041 let final_url = format!("{}/final", server.uri());
1042
1043 Mock::given(method("GET"))
1044 .and(path("/start"))
1045 .respond_with(ResponseTemplate::new(302).insert_header("location", final_url.as_str()))
1046 .mount(&server)
1047 .await;
1048
1049 Mock::given(method("GET"))
1050 .and(path("/final"))
1051 .respond_with(ResponseTemplate::new(200).set_body_string("<p>final</p>"))
1052 .mount(&server)
1053 .await;
1054
1055 let (host, addrs) = server_host_and_addr(&server);
1056 let url = format!("{}/start", server.uri());
1057 let result = follow_redirects_raw(&executor, &url, &host, &addrs).await;
1058 assert!(result.is_ok(), "single redirect should succeed: {result:?}");
1059 assert_eq!(result.unwrap(), "<p>final</p>");
1060 }
1061
1062 #[tokio::test]
1063 async fn fetch_html_three_redirects_allowed() {
1064 use wiremock::matchers::{method, path};
1065 use wiremock::{Mock, ResponseTemplate};
1066
1067 let (executor, server) = mock_server_executor().await;
1068 let hop2 = format!("{}/hop2", server.uri());
1069 let hop3 = format!("{}/hop3", server.uri());
1070 let final_dest = format!("{}/done", server.uri());
1071
1072 Mock::given(method("GET"))
1073 .and(path("/hop1"))
1074 .respond_with(ResponseTemplate::new(301).insert_header("location", hop2.as_str()))
1075 .mount(&server)
1076 .await;
1077 Mock::given(method("GET"))
1078 .and(path("/hop2"))
1079 .respond_with(ResponseTemplate::new(301).insert_header("location", hop3.as_str()))
1080 .mount(&server)
1081 .await;
1082 Mock::given(method("GET"))
1083 .and(path("/hop3"))
1084 .respond_with(ResponseTemplate::new(301).insert_header("location", final_dest.as_str()))
1085 .mount(&server)
1086 .await;
1087 Mock::given(method("GET"))
1088 .and(path("/done"))
1089 .respond_with(ResponseTemplate::new(200).set_body_string("<p>done</p>"))
1090 .mount(&server)
1091 .await;
1092
1093 let (host, addrs) = server_host_and_addr(&server);
1094 let url = format!("{}/hop1", server.uri());
1095 let result = follow_redirects_raw(&executor, &url, &host, &addrs).await;
1096 assert!(result.is_ok(), "3 redirects should succeed: {result:?}");
1097 assert_eq!(result.unwrap(), "<p>done</p>");
1098 }
1099
1100 #[tokio::test]
1101 async fn fetch_html_four_redirects_rejected() {
1102 use wiremock::matchers::{method, path};
1103 use wiremock::{Mock, ResponseTemplate};
1104
1105 let (executor, server) = mock_server_executor().await;
1106 let hop2 = format!("{}/r2", server.uri());
1107 let hop3 = format!("{}/r3", server.uri());
1108 let hop4 = format!("{}/r4", server.uri());
1109 let hop5 = format!("{}/r5", server.uri());
1110
1111 for (from, to) in [
1112 ("/r1", &hop2),
1113 ("/r2", &hop3),
1114 ("/r3", &hop4),
1115 ("/r4", &hop5),
1116 ] {
1117 Mock::given(method("GET"))
1118 .and(path(from))
1119 .respond_with(ResponseTemplate::new(301).insert_header("location", to.as_str()))
1120 .mount(&server)
1121 .await;
1122 }
1123
1124 let (host, addrs) = server_host_and_addr(&server);
1125 let url = format!("{}/r1", server.uri());
1126 let result = follow_redirects_raw(&executor, &url, &host, &addrs).await;
1127 assert!(result.is_err(), "4 redirects should be rejected");
1128 let msg = result.unwrap_err().to_string();
1129 assert!(
1130 msg.contains("redirect"),
1131 "expected redirect-related error: {msg}"
1132 );
1133 }
1134
1135 #[tokio::test]
1136 async fn fetch_html_body_too_large_returns_error() {
1137 use wiremock::matchers::{method, path};
1138 use wiremock::{Mock, ResponseTemplate};
1139
1140 let small_limit_executor = WebScrapeExecutor {
1141 timeout: Duration::from_secs(5),
1142 max_body_bytes: 10,
1143 };
1144 let server = wiremock::MockServer::start().await;
1145 Mock::given(method("GET"))
1146 .and(path("/big"))
1147 .respond_with(
1148 ResponseTemplate::new(200)
1149 .set_body_string("this body is definitely longer than ten bytes"),
1150 )
1151 .mount(&server)
1152 .await;
1153
1154 let (host, addrs) = server_host_and_addr(&server);
1155 let url = format!("{}/big", server.uri());
1156 let result = small_limit_executor.fetch_html(&url, &host, &addrs).await;
1157 assert!(result.is_err());
1158 let msg = result.unwrap_err().to_string();
1159 assert!(msg.contains("too large"), "expected too-large error: {msg}");
1160 }
1161
1162 #[test]
1163 fn extract_scrape_blocks_empty_block_content() {
1164 let text = "```scrape\n\n```";
1165 let blocks = extract_scrape_blocks(text);
1166 assert_eq!(blocks.len(), 1);
1167 assert!(blocks[0].is_empty());
1168 }
1169
1170 #[test]
1171 fn extract_scrape_blocks_whitespace_only() {
1172 let text = "```scrape\n \n```";
1173 let blocks = extract_scrape_blocks(text);
1174 assert_eq!(blocks.len(), 1);
1175 }
1176
1177 #[test]
1178 fn parse_and_extract_multiple_selectors() {
1179 let html = "<div><h1>Title</h1><p>Para</p></div>";
1180 let result = parse_and_extract(html, "h1, p", &ExtractMode::Text, 10).unwrap();
1181 assert!(result.contains("Title"));
1182 assert!(result.contains("Para"));
1183 }
1184
1185 #[test]
1186 fn webscrape_executor_new_with_custom_config() {
1187 let config = ScrapeConfig {
1188 timeout: 60,
1189 max_body_bytes: 512,
1190 };
1191 let executor = WebScrapeExecutor::new(&config);
1192 assert_eq!(executor.max_body_bytes, 512);
1193 }
1194
1195 #[test]
1196 fn webscrape_executor_debug() {
1197 let config = ScrapeConfig::default();
1198 let executor = WebScrapeExecutor::new(&config);
1199 let dbg = format!("{executor:?}");
1200 assert!(dbg.contains("WebScrapeExecutor"));
1201 }
1202
1203 #[test]
1204 fn extract_mode_attr_empty_name() {
1205 let mode = ExtractMode::parse("attr:");
1206 assert!(matches!(mode, ExtractMode::Attr(ref s) if s.is_empty()));
1207 }
1208
1209 #[test]
1210 fn default_extract_returns_text() {
1211 assert_eq!(default_extract(), "text");
1212 }
1213
1214 #[test]
1215 fn scrape_instruction_debug() {
1216 let json = r#"{"url":"https://example.com","select":"h1"}"#;
1217 let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
1218 let dbg = format!("{instr:?}");
1219 assert!(dbg.contains("ScrapeInstruction"));
1220 }
1221
1222 #[test]
1223 fn extract_mode_debug() {
1224 let mode = ExtractMode::Text;
1225 let dbg = format!("{mode:?}");
1226 assert!(dbg.contains("Text"));
1227 }
1228
    // NOTE(review): this test is tautological — it defines a local constant
    // and asserts it against itself; it never references the value actually
    // used inside `fetch_html`, so it cannot catch drift if the
    // implementation's limit changes. Consider exporting the real constant
    // from the implementation and asserting against that instead.
    #[test]
    fn max_redirects_constant_is_three() {
        const MAX_REDIRECTS: usize = 3;
        assert_eq!(MAX_REDIRECTS, 3, "fetch_html allows exactly 3 redirects");
    }
1241
1242 #[test]
1245 fn redirect_no_location_error_message() {
1246 let err = std::io::Error::other("redirect with no Location");
1247 assert!(err.to_string().contains("redirect with no Location"));
1248 }
1249
1250 #[test]
1252 fn too_many_redirects_error_message() {
1253 let err = std::io::Error::other("too many redirects");
1254 assert!(err.to_string().contains("too many redirects"));
1255 }
1256
1257 #[test]
1259 fn non_2xx_status_error_format() {
1260 let status = reqwest::StatusCode::FORBIDDEN;
1261 let msg = format!("HTTP {status}");
1262 assert!(msg.contains("403"));
1263 }
1264
1265 #[test]
1267 fn not_found_status_error_format() {
1268 let status = reqwest::StatusCode::NOT_FOUND;
1269 let msg = format!("HTTP {status}");
1270 assert!(msg.contains("404"));
1271 }
1272
1273 #[test]
1275 fn relative_redirect_same_host_path() {
1276 let base = Url::parse("https://example.com/current").unwrap();
1277 let resolved = base.join("/other").unwrap();
1278 assert_eq!(resolved.as_str(), "https://example.com/other");
1279 }
1280
1281 #[test]
1283 fn relative_redirect_relative_path() {
1284 let base = Url::parse("https://example.com/a/b").unwrap();
1285 let resolved = base.join("c").unwrap();
1286 assert_eq!(resolved.as_str(), "https://example.com/a/c");
1287 }
1288
1289 #[test]
1291 fn absolute_redirect_overrides_base() {
1292 let base = Url::parse("https://example.com/page").unwrap();
1293 let resolved = base.join("https://other.com/target").unwrap();
1294 assert_eq!(resolved.as_str(), "https://other.com/target");
1295 }
1296
1297 #[test]
1299 fn redirect_http_downgrade_rejected() {
1300 let location = "http://example.com/page";
1301 let base = Url::parse("https://example.com/start").unwrap();
1302 let next = base.join(location).unwrap();
1303 let err = validate_url(next.as_str()).unwrap_err();
1304 assert!(matches!(err, ToolError::Blocked { .. }));
1305 }
1306
1307 #[test]
1309 fn redirect_location_private_ip_blocked() {
1310 let location = "https://192.168.100.1/admin";
1311 let base = Url::parse("https://example.com/start").unwrap();
1312 let next = base.join(location).unwrap();
1313 let err = validate_url(next.as_str()).unwrap_err();
1314 assert!(matches!(err, ToolError::Blocked { .. }));
1315 let cmd = match err {
1316 ToolError::Blocked { command } => command,
1317 _ => panic!("expected Blocked"),
1318 };
1319 assert!(
1320 cmd.contains("private") || cmd.contains("scheme"),
1321 "error message should describe the block reason: {cmd}"
1322 );
1323 }
1324
1325 #[test]
1327 fn redirect_location_internal_domain_blocked() {
1328 let location = "https://metadata.internal/latest/meta-data/";
1329 let base = Url::parse("https://example.com/start").unwrap();
1330 let next = base.join(location).unwrap();
1331 let err = validate_url(next.as_str()).unwrap_err();
1332 assert!(matches!(err, ToolError::Blocked { .. }));
1333 }
1334
1335 #[test]
1337 fn redirect_chain_three_hops_all_public() {
1338 let hops = [
1339 "https://redirect1.example.com/hop1",
1340 "https://redirect2.example.com/hop2",
1341 "https://destination.example.com/final",
1342 ];
1343 for hop in hops {
1344 assert!(validate_url(hop).is_ok(), "expected ok for {hop}");
1345 }
1346 }
1347
1348 #[test]
1353 fn redirect_to_private_ip_rejected_by_validate_url() {
1354 let private_targets = [
1356 "https://127.0.0.1/secret",
1357 "https://10.0.0.1/internal",
1358 "https://192.168.1.1/admin",
1359 "https://172.16.0.1/data",
1360 "https://[::1]/path",
1361 "https://[fe80::1]/path",
1362 "https://localhost/path",
1363 "https://service.internal/api",
1364 ];
1365 for target in private_targets {
1366 let result = validate_url(target);
1367 assert!(result.is_err(), "expected error for {target}");
1368 assert!(
1369 matches!(result.unwrap_err(), ToolError::Blocked { .. }),
1370 "expected Blocked for {target}"
1371 );
1372 }
1373 }
1374
1375 #[test]
1377 fn redirect_relative_url_resolves_correctly() {
1378 let base = Url::parse("https://example.com/page").unwrap();
1379 let relative = "/other";
1380 let resolved = base.join(relative).unwrap();
1381 assert_eq!(resolved.as_str(), "https://example.com/other");
1382 }
1383
1384 #[test]
1386 fn redirect_to_http_rejected() {
1387 let err = validate_url("http://example.com/page").unwrap_err();
1388 assert!(matches!(err, ToolError::Blocked { .. }));
1389 }
1390
1391 #[test]
1392 fn ipv4_mapped_ipv6_link_local_blocked() {
1393 let err = validate_url("https://[::ffff:169.254.0.1]/path").unwrap_err();
1394 assert!(matches!(err, ToolError::Blocked { .. }));
1395 }
1396
1397 #[test]
1398 fn ipv4_mapped_ipv6_public_allowed() {
1399 assert!(validate_url("https://[::ffff:93.184.216.34]/path").is_ok());
1400 }
1401
1402 #[test]
1403 fn tool_definitions_returns_web_scrape() {
1404 let config = ScrapeConfig::default();
1405 let executor = WebScrapeExecutor::new(&config);
1406 let defs = executor.tool_definitions();
1407 assert_eq!(defs.len(), 1);
1408 assert_eq!(defs[0].id, "web_scrape");
1409 assert_eq!(
1410 defs[0].invocation,
1411 crate::registry::InvocationHint::FencedBlock("scrape")
1412 );
1413 }
1414
1415 #[test]
1416 fn tool_definitions_schema_has_all_params() {
1417 let config = ScrapeConfig::default();
1418 let executor = WebScrapeExecutor::new(&config);
1419 let defs = executor.tool_definitions();
1420 let obj = defs[0].schema.as_object().unwrap();
1421 let props = obj["properties"].as_object().unwrap();
1422 assert!(props.contains_key("url"));
1423 assert!(props.contains_key("select"));
1424 assert!(props.contains_key("extract"));
1425 assert!(props.contains_key("limit"));
1426 let req = obj["required"].as_array().unwrap();
1427 assert!(req.iter().any(|v| v.as_str() == Some("url")));
1428 assert!(req.iter().any(|v| v.as_str() == Some("select")));
1429 assert!(!req.iter().any(|v| v.as_str() == Some("extract")));
1430 }
1431
1432 #[test]
1435 fn subdomain_localhost_blocked() {
1436 let host: url::Host<&str> = url::Host::Domain("foo.localhost");
1437 assert!(is_private_host(&host));
1438 }
1439
1440 #[test]
1441 fn internal_tld_blocked() {
1442 let host: url::Host<&str> = url::Host::Domain("service.internal");
1443 assert!(is_private_host(&host));
1444 }
1445
1446 #[test]
1447 fn local_tld_blocked() {
1448 let host: url::Host<&str> = url::Host::Domain("printer.local");
1449 assert!(is_private_host(&host));
1450 }
1451
1452 #[test]
1453 fn public_domain_not_blocked() {
1454 let host: url::Host<&str> = url::Host::Domain("example.com");
1455 assert!(!is_private_host(&host));
1456 }
1457
1458 #[tokio::test]
1461 async fn resolve_loopback_rejected() {
1462 let url = url::Url::parse("https://127.0.0.1/path").unwrap();
1464 let result = resolve_and_validate(&url).await;
1466 assert!(
1467 result.is_err(),
1468 "loopback IP must be rejected by resolve_and_validate"
1469 );
1470 let err = result.unwrap_err();
1471 assert!(matches!(err, crate::executor::ToolError::Blocked { .. }));
1472 }
1473
1474 #[tokio::test]
1475 async fn resolve_private_10_rejected() {
1476 let url = url::Url::parse("https://10.0.0.1/path").unwrap();
1477 let result = resolve_and_validate(&url).await;
1478 assert!(result.is_err());
1479 assert!(matches!(
1480 result.unwrap_err(),
1481 crate::executor::ToolError::Blocked { .. }
1482 ));
1483 }
1484
1485 #[tokio::test]
1486 async fn resolve_private_192_rejected() {
1487 let url = url::Url::parse("https://192.168.1.1/path").unwrap();
1488 let result = resolve_and_validate(&url).await;
1489 assert!(result.is_err());
1490 assert!(matches!(
1491 result.unwrap_err(),
1492 crate::executor::ToolError::Blocked { .. }
1493 ));
1494 }
1495
1496 #[tokio::test]
1497 async fn resolve_ipv6_loopback_rejected() {
1498 let url = url::Url::parse("https://[::1]/path").unwrap();
1499 let result = resolve_and_validate(&url).await;
1500 assert!(result.is_err());
1501 assert!(matches!(
1502 result.unwrap_err(),
1503 crate::executor::ToolError::Blocked { .. }
1504 ));
1505 }
1506
1507 #[tokio::test]
1508 async fn resolve_no_host_returns_ok() {
1509 let url = url::Url::parse("https://example.com/path").unwrap();
1511 let url_no_host = url::Url::parse("data:text/plain,hello").unwrap();
1513 let result = resolve_and_validate(&url_no_host).await;
1515 assert!(result.is_ok());
1516 let (host, addrs) = result.unwrap();
1517 assert!(host.is_empty());
1518 assert!(addrs.is_empty());
1519 drop(url);
1520 drop(url_no_host);
1521 }
1522}