1use std::net::{IpAddr, SocketAddr};
5use std::time::Duration;
6
7use schemars::JsonSchema;
8use serde::Deserialize;
9use url::Url;
10
11use crate::config::ScrapeConfig;
12use crate::executor::{ToolCall, ToolError, ToolExecutor, ToolOutput, deserialize_params};
13
// A single scrape request, deserialized from JSON found in a ```scrape
// fenced block or supplied as structured tool-call parameters.
//
// NOTE: plain `//` comments (not `///`) are intentional — `///` doc comments
// would be picked up by the JsonSchema derive and change the emitted schema.
#[derive(Debug, Deserialize, JsonSchema)]
struct ScrapeInstruction {
    // Target page; must be https and pass SSRF validation (see validate_url).
    url: String,
    // CSS selector applied to the fetched document.
    select: String,
    // "text" | "html" | "attr:<name>"; anything else falls back to text
    // (see ExtractMode::parse).
    #[serde(default = "default_extract")]
    extract: String,
    // Maximum number of matches returned; scrape_instruction defaults to 10
    // when absent.
    limit: Option<usize>,
}
26
/// Serde default for `ScrapeInstruction::extract`: plain text extraction.
fn default_extract() -> String {
    String::from("text")
}
30
/// How a matched element is turned into an output string.
#[derive(Debug)]
enum ExtractMode {
    Text,
    Html,
    Attr(String),
}

impl ExtractMode {
    /// Parses the `extract` field of an instruction.
    ///
    /// Recognizes `"text"`, `"html"`, and `"attr:<name>"`; any unknown value
    /// (including `"text"` itself) falls back to `Text`.
    fn parse(s: &str) -> Self {
        if let Some(attr_name) = s.strip_prefix("attr:") {
            Self::Attr(attr_name.to_owned())
        } else if s == "html" {
            Self::Html
        } else {
            Self::Text
        }
    }
}
50
/// Tool executor that fetches pages over HTTPS (with SSRF protections) and
/// extracts data via CSS selectors.
#[derive(Debug)]
pub struct WebScrapeExecutor {
    // Per-request timeout applied to the HTTP client.
    timeout: Duration,
    // Upper bound on an accepted response body, in bytes.
    max_body_bytes: usize,
}
60
61impl WebScrapeExecutor {
62 #[must_use]
63 pub fn new(config: &ScrapeConfig) -> Self {
64 Self {
65 timeout: Duration::from_secs(config.timeout),
66 max_body_bytes: config.max_body_bytes,
67 }
68 }
69
70 fn build_client(&self, host: &str, addrs: &[SocketAddr]) -> reqwest::Client {
71 let mut builder = reqwest::Client::builder()
72 .timeout(self.timeout)
73 .redirect(reqwest::redirect::Policy::limited(3));
74 builder = builder.resolve_to_addrs(host, addrs);
75 builder.build().unwrap_or_default()
76 }
77}
78
79impl ToolExecutor for WebScrapeExecutor {
80 fn tool_definitions(&self) -> Vec<crate::registry::ToolDef> {
81 use crate::registry::{InvocationHint, ToolDef};
82 vec![ToolDef {
83 id: "web_scrape",
84 description: "Scrape data from a web page via CSS selectors",
85 schema: schemars::schema_for!(ScrapeInstruction),
86 invocation: InvocationHint::FencedBlock("scrape"),
87 }]
88 }
89
90 async fn execute(&self, response: &str) -> Result<Option<ToolOutput>, ToolError> {
91 let blocks = extract_scrape_blocks(response);
92 if blocks.is_empty() {
93 return Ok(None);
94 }
95
96 let mut outputs = Vec::with_capacity(blocks.len());
97 #[allow(clippy::cast_possible_truncation)]
98 let blocks_executed = blocks.len() as u32;
99
100 for block in &blocks {
101 let instruction: ScrapeInstruction = serde_json::from_str(block).map_err(|e| {
102 ToolError::Execution(std::io::Error::new(
103 std::io::ErrorKind::InvalidData,
104 e.to_string(),
105 ))
106 })?;
107 outputs.push(self.scrape_instruction(&instruction).await?);
108 }
109
110 Ok(Some(ToolOutput {
111 tool_name: "web-scrape".to_owned(),
112 summary: outputs.join("\n\n"),
113 blocks_executed,
114 filter_stats: None,
115 diff: None,
116 streamed: false,
117 }))
118 }
119
120 async fn execute_tool_call(&self, call: &ToolCall) -> Result<Option<ToolOutput>, ToolError> {
121 if call.tool_id != "web_scrape" {
122 return Ok(None);
123 }
124
125 let instruction: ScrapeInstruction = deserialize_params(&call.params)?;
126
127 let result = self.scrape_instruction(&instruction).await?;
128
129 Ok(Some(ToolOutput {
130 tool_name: "web-scrape".to_owned(),
131 summary: result,
132 blocks_executed: 1,
133 filter_stats: None,
134 diff: None,
135 streamed: false,
136 }))
137 }
138}
139
140impl WebScrapeExecutor {
141 async fn scrape_instruction(
142 &self,
143 instruction: &ScrapeInstruction,
144 ) -> Result<String, ToolError> {
145 let parsed = validate_url(&instruction.url)?;
146 let (host, addrs) = resolve_and_validate(&parsed).await?;
147 let client = self.build_client(&host, &addrs);
150 let html = self.fetch_html(&client, &instruction.url).await?;
151 let selector = instruction.select.clone();
152 let extract = ExtractMode::parse(&instruction.extract);
153 let limit = instruction.limit.unwrap_or(10);
154 tokio::task::spawn_blocking(move || parse_and_extract(&html, &selector, &extract, limit))
155 .await
156 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?
157 }
158
159 async fn fetch_html(&self, client: &reqwest::Client, url: &str) -> Result<String, ToolError> {
160 let resp = client
161 .get(url)
162 .send()
163 .await
164 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
165
166 if !resp.status().is_success() {
167 return Err(ToolError::Execution(std::io::Error::other(format!(
168 "HTTP {}",
169 resp.status(),
170 ))));
171 }
172
173 let bytes = resp
174 .bytes()
175 .await
176 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
177
178 if bytes.len() > self.max_body_bytes {
179 return Err(ToolError::Execution(std::io::Error::other(format!(
180 "response too large: {} bytes (max: {})",
181 bytes.len(),
182 self.max_body_bytes,
183 ))));
184 }
185
186 String::from_utf8(bytes.to_vec())
187 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))
188 }
189}
190
// Pulls the raw contents of every ```scrape fenced block out of a response.
fn extract_scrape_blocks(text: &str) -> Vec<&str> {
    crate::executor::extract_fenced_blocks(text, "scrape")
}
194
195fn validate_url(raw: &str) -> Result<Url, ToolError> {
196 let parsed = Url::parse(raw).map_err(|_| ToolError::Blocked {
197 command: format!("invalid URL: {raw}"),
198 })?;
199
200 if parsed.scheme() != "https" {
201 return Err(ToolError::Blocked {
202 command: format!("scheme not allowed: {}", parsed.scheme()),
203 });
204 }
205
206 if let Some(host) = parsed.host()
207 && is_private_host(&host)
208 {
209 return Err(ToolError::Blocked {
210 command: format!(
211 "private/local host blocked: {}",
212 parsed.host_str().unwrap_or("")
213 ),
214 });
215 }
216
217 Ok(parsed)
218}
219
/// True when `ip` must never be scraped: loopback, RFC 1918 private ranges,
/// link-local, unspecified, broadcast, and their IPv6 / IPv4-mapped
/// equivalents.
pub(crate) fn is_private_ip(ip: IpAddr) -> bool {
    // Shared IPv4 policy, reused for bare V4 and for IPv4-mapped V6.
    fn v4_blocked(v4: std::net::Ipv4Addr) -> bool {
        v4.is_loopback()
            || v4.is_private()
            || v4.is_link_local()
            || v4.is_unspecified()
            || v4.is_broadcast()
    }

    match ip {
        IpAddr::V4(v4) => v4_blocked(v4),
        IpAddr::V6(v6) => {
            if v6.is_loopback() || v6.is_unspecified() {
                return true;
            }
            let first = v6.segments()[0];
            // fe80::/10 — link-local.
            if first & 0xffc0 == 0xfe80 {
                return true;
            }
            // fc00::/7 — unique-local.
            if first & 0xfe00 == 0xfc00 {
                return true;
            }
            // ::ffff:a.b.c.d — apply the IPv4 policy to the embedded address;
            // any other IPv6 address is treated as public.
            v6.to_ipv4_mapped().is_some_and(v4_blocked)
        }
    }
}
257
258fn is_private_host(host: &url::Host<&str>) -> bool {
259 match host {
260 url::Host::Domain(d) => {
261 #[allow(clippy::case_sensitive_file_extension_comparisons)]
264 {
265 *d == "localhost"
266 || d.ends_with(".localhost")
267 || d.ends_with(".internal")
268 || d.ends_with(".local")
269 }
270 }
271 url::Host::Ipv4(v4) => is_private_ip(IpAddr::V4(*v4)),
272 url::Host::Ipv6(v6) => is_private_ip(IpAddr::V6(*v6)),
273 }
274}
275
276async fn resolve_and_validate(url: &Url) -> Result<(String, Vec<SocketAddr>), ToolError> {
282 let Some(host) = url.host_str() else {
283 return Ok((String::new(), vec![]));
284 };
285 let port = url.port_or_known_default().unwrap_or(443);
286 let addrs: Vec<SocketAddr> = tokio::net::lookup_host(format!("{host}:{port}"))
287 .await
288 .map_err(|e| ToolError::Blocked {
289 command: format!("DNS resolution failed: {e}"),
290 })?
291 .collect();
292 for addr in &addrs {
293 if is_private_ip(addr.ip()) {
294 return Err(ToolError::Blocked {
295 command: format!("SSRF protection: private IP {} for host {host}", addr.ip()),
296 });
297 }
298 }
299 Ok((host.to_owned(), addrs))
300}
301
302fn parse_and_extract(
303 html: &str,
304 selector: &str,
305 extract: &ExtractMode,
306 limit: usize,
307) -> Result<String, ToolError> {
308 let soup = scrape_core::Soup::parse(html);
309
310 let tags = soup.find_all(selector).map_err(|e| {
311 ToolError::Execution(std::io::Error::new(
312 std::io::ErrorKind::InvalidData,
313 format!("invalid selector: {e}"),
314 ))
315 })?;
316
317 let mut results = Vec::new();
318
319 for tag in tags.into_iter().take(limit) {
320 let value = match extract {
321 ExtractMode::Text => tag.text(),
322 ExtractMode::Html => tag.inner_html(),
323 ExtractMode::Attr(name) => tag.get(name).unwrap_or_default().to_owned(),
324 };
325 if !value.trim().is_empty() {
326 results.push(value.trim().to_owned());
327 }
328 }
329
330 if results.is_empty() {
331 Ok(format!("No results for selector: {selector}"))
332 } else {
333 Ok(results.join("\n"))
334 }
335}
336
#[cfg(test)]
mod tests {
    // Unit tests for the scrape tool: fenced-block extraction, instruction
    // deserialization, extract-mode parsing, URL/SSRF validation, HTML
    // extraction, and async executor behavior. No test touches the network
    // except the DNS-resolution group at the bottom and the deliberately
    // unreachable-host case.
    use super::*;

    // --- fenced-block extraction ---

    #[test]
    fn extract_single_block() {
        let text =
            "Here:\n```scrape\n{\"url\":\"https://example.com\",\"select\":\"h1\"}\n```\nDone.";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 1);
        assert!(blocks[0].contains("example.com"));
    }

    #[test]
    fn extract_multiple_blocks() {
        let text = "```scrape\n{\"url\":\"https://a.com\",\"select\":\"h1\"}\n```\ntext\n```scrape\n{\"url\":\"https://b.com\",\"select\":\"p\"}\n```";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 2);
    }

    #[test]
    fn no_blocks_returns_empty() {
        let blocks = extract_scrape_blocks("plain text, no code blocks");
        assert!(blocks.is_empty());
    }

    #[test]
    fn unclosed_block_ignored() {
        let blocks = extract_scrape_blocks("```scrape\n{\"url\":\"https://x.com\"}");
        assert!(blocks.is_empty());
    }

    #[test]
    fn non_scrape_block_ignored() {
        let text =
            "```bash\necho hi\n```\n```scrape\n{\"url\":\"https://x.com\",\"select\":\"h1\"}\n```";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 1);
        assert!(blocks[0].contains("x.com"));
    }

    #[test]
    fn multiline_json_block() {
        let text =
            "```scrape\n{\n  \"url\": \"https://example.com\",\n  \"select\": \"h1\"\n}\n```";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 1);
        let instr: ScrapeInstruction = serde_json::from_str(blocks[0]).unwrap();
        assert_eq!(instr.url, "https://example.com");
    }

    // --- instruction deserialization ---

    #[test]
    fn parse_valid_instruction() {
        let json = r#"{"url":"https://example.com","select":"h1","extract":"text","limit":5}"#;
        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
        assert_eq!(instr.url, "https://example.com");
        assert_eq!(instr.select, "h1");
        assert_eq!(instr.extract, "text");
        assert_eq!(instr.limit, Some(5));
    }

    #[test]
    fn parse_minimal_instruction() {
        let json = r#"{"url":"https://example.com","select":"p"}"#;
        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
        assert_eq!(instr.extract, "text");
        assert!(instr.limit.is_none());
    }

    #[test]
    fn parse_attr_extract() {
        let json = r#"{"url":"https://example.com","select":"a","extract":"attr:href"}"#;
        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
        assert_eq!(instr.extract, "attr:href");
    }

    #[test]
    fn parse_invalid_json_errors() {
        let result = serde_json::from_str::<ScrapeInstruction>("not json");
        assert!(result.is_err());
    }

    // --- ExtractMode parsing ---

    #[test]
    fn extract_mode_text() {
        assert!(matches!(ExtractMode::parse("text"), ExtractMode::Text));
    }

    #[test]
    fn extract_mode_html() {
        assert!(matches!(ExtractMode::parse("html"), ExtractMode::Html));
    }

    #[test]
    fn extract_mode_attr() {
        let mode = ExtractMode::parse("attr:href");
        assert!(matches!(mode, ExtractMode::Attr(ref s) if s == "href"));
    }

    #[test]
    fn extract_mode_unknown_defaults_to_text() {
        assert!(matches!(ExtractMode::parse("unknown"), ExtractMode::Text));
    }

    // --- URL validation and SSRF guards (no network) ---

    #[test]
    fn valid_https_url() {
        assert!(validate_url("https://example.com").is_ok());
    }

    #[test]
    fn http_rejected() {
        let err = validate_url("http://example.com").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ftp_rejected() {
        let err = validate_url("ftp://files.example.com").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn file_rejected() {
        let err = validate_url("file:///etc/passwd").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn invalid_url_rejected() {
        let err = validate_url("not a url").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn localhost_blocked() {
        let err = validate_url("https://localhost/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn loopback_ip_blocked() {
        let err = validate_url("https://127.0.0.1/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn private_10_blocked() {
        let err = validate_url("https://10.0.0.1/api").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn private_172_blocked() {
        let err = validate_url("https://172.16.0.1/api").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn private_192_blocked() {
        let err = validate_url("https://192.168.1.1/api").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv6_loopback_blocked() {
        let err = validate_url("https://[::1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn public_ip_allowed() {
        assert!(validate_url("https://93.184.216.34/page").is_ok());
    }

    // --- HTML parsing and extraction ---

    #[test]
    fn extract_text_from_html() {
        let html = "<html><body><h1>Hello World</h1><p>Content</p></body></html>";
        let result = parse_and_extract(html, "h1", &ExtractMode::Text, 10).unwrap();
        assert_eq!(result, "Hello World");
    }

    #[test]
    fn extract_multiple_elements() {
        let html = "<ul><li>A</li><li>B</li><li>C</li></ul>";
        let result = parse_and_extract(html, "li", &ExtractMode::Text, 10).unwrap();
        assert_eq!(result, "A\nB\nC");
    }

    #[test]
    fn extract_with_limit() {
        let html = "<ul><li>A</li><li>B</li><li>C</li></ul>";
        let result = parse_and_extract(html, "li", &ExtractMode::Text, 2).unwrap();
        assert_eq!(result, "A\nB");
    }

    #[test]
    fn extract_attr_href() {
        let html = r#"<a href="https://example.com">Link</a>"#;
        let result =
            parse_and_extract(html, "a", &ExtractMode::Attr("href".to_owned()), 10).unwrap();
        assert_eq!(result, "https://example.com");
    }

    #[test]
    fn extract_inner_html() {
        let html = "<div><span>inner</span></div>";
        let result = parse_and_extract(html, "div", &ExtractMode::Html, 10).unwrap();
        assert!(result.contains("<span>inner</span>"));
    }

    #[test]
    fn no_matches_returns_message() {
        let html = "<html><body><p>text</p></body></html>";
        let result = parse_and_extract(html, "h1", &ExtractMode::Text, 10).unwrap();
        assert!(result.starts_with("No results for selector:"));
    }

    #[test]
    fn empty_text_skipped() {
        let html = "<ul><li> </li><li>A</li></ul>";
        let result = parse_and_extract(html, "li", &ExtractMode::Text, 10).unwrap();
        assert_eq!(result, "A");
    }

    #[test]
    fn invalid_selector_errors() {
        let html = "<html><body></body></html>";
        let result = parse_and_extract(html, "[[[invalid", &ExtractMode::Text, 10);
        assert!(result.is_err());
    }

    #[test]
    fn empty_html_returns_no_results() {
        let result = parse_and_extract("", "h1", &ExtractMode::Text, 10).unwrap();
        assert!(result.starts_with("No results for selector:"));
    }

    #[test]
    fn nested_selector() {
        let html = "<div><span>inner</span></div><span>outer</span>";
        let result = parse_and_extract(html, "div > span", &ExtractMode::Text, 10).unwrap();
        assert_eq!(result, "inner");
    }

    #[test]
    fn attr_missing_returns_empty() {
        let html = r#"<a>No href</a>"#;
        let result =
            parse_and_extract(html, "a", &ExtractMode::Attr("href".to_owned()), 10).unwrap();
        assert!(result.starts_with("No results for selector:"));
    }

    #[test]
    fn extract_html_mode() {
        let html = "<div><b>bold</b> text</div>";
        let result = parse_and_extract(html, "div", &ExtractMode::Html, 10).unwrap();
        assert!(result.contains("<b>bold</b>"));
    }

    #[test]
    fn limit_zero_returns_no_results() {
        let html = "<ul><li>A</li><li>B</li></ul>";
        let result = parse_and_extract(html, "li", &ExtractMode::Text, 0).unwrap();
        assert!(result.starts_with("No results for selector:"));
    }

    // --- additional URL validation edge cases ---

    #[test]
    fn url_with_port_allowed() {
        assert!(validate_url("https://example.com:8443/path").is_ok());
    }

    #[test]
    fn link_local_ip_blocked() {
        let err = validate_url("https://169.254.1.1/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn url_no_scheme_rejected() {
        let err = validate_url("example.com/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn unspecified_ipv4_blocked() {
        let err = validate_url("https://0.0.0.0/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn broadcast_ipv4_blocked() {
        let err = validate_url("https://255.255.255.255/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv6_link_local_blocked() {
        let err = validate_url("https://[fe80::1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv6_unique_local_blocked() {
        let err = validate_url("https://[fd12::1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv4_mapped_ipv6_loopback_blocked() {
        let err = validate_url("https://[::ffff:127.0.0.1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv4_mapped_ipv6_private_blocked() {
        let err = validate_url("https://[::ffff:10.0.0.1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    // --- executor-level behavior (async) ---

    #[tokio::test]
    async fn executor_no_blocks_returns_none() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let result = executor.execute("plain text").await;
        assert!(result.unwrap().is_none());
    }

    #[tokio::test]
    async fn executor_invalid_json_errors() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\nnot json\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Execution(_))));
    }

    #[tokio::test]
    async fn executor_blocked_url_errors() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"http://example.com\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Blocked { .. })));
    }

    #[tokio::test]
    async fn executor_private_ip_blocked() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"https://192.168.1.1/api\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Blocked { .. })));
    }

    #[tokio::test]
    async fn executor_unreachable_host_returns_error() {
        // 192.0.2.1 is TEST-NET-1 (RFC 5737): public (not blocked) but
        // unroutable, so this exercises the connection-failure path.
        let config = ScrapeConfig {
            timeout: 1,
            max_body_bytes: 1_048_576,
        };
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"https://192.0.2.1:1/page\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Execution(_))));
    }

    #[tokio::test]
    async fn executor_localhost_url_blocked() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"https://localhost:9999/api\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Blocked { .. })));
    }

    #[tokio::test]
    async fn executor_empty_text_returns_none() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let result = executor.execute("").await;
        assert!(result.unwrap().is_none());
    }

    #[tokio::test]
    async fn executor_multiple_blocks_first_blocked() {
        // A failing block aborts the whole batch, even when later blocks
        // would be valid.
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"http://evil.com\",\"select\":\"h1\"}\n```\n\
                        ```scrape\n{\"url\":\"https://ok.com\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(result.is_err());
    }

    #[test]
    fn validate_url_empty_string() {
        let err = validate_url("").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn validate_url_javascript_scheme_blocked() {
        let err = validate_url("javascript:alert(1)").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn validate_url_data_scheme_blocked() {
        let err = validate_url("data:text/html,<h1>hi</h1>").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn is_private_host_public_domain_is_false() {
        let host: url::Host<&str> = url::Host::Domain("example.com");
        assert!(!is_private_host(&host));
    }

    #[test]
    fn is_private_host_localhost_is_true() {
        let host: url::Host<&str> = url::Host::Domain("localhost");
        assert!(is_private_host(&host));
    }

    #[test]
    fn is_private_host_ipv6_unspecified_is_true() {
        let host = url::Host::Ipv6(std::net::Ipv6Addr::UNSPECIFIED);
        assert!(is_private_host(&host));
    }

    #[test]
    fn is_private_host_public_ipv6_is_false() {
        let host = url::Host::Ipv6("2001:db8::1".parse().unwrap());
        assert!(!is_private_host(&host));
    }

    #[test]
    fn extract_scrape_blocks_empty_block_content() {
        let text = "```scrape\n\n```";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 1);
        assert!(blocks[0].is_empty());
    }

    #[test]
    fn extract_scrape_blocks_whitespace_only() {
        let text = "```scrape\n   \n```";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 1);
    }

    #[test]
    fn parse_and_extract_multiple_selectors() {
        let html = "<div><h1>Title</h1><p>Para</p></div>";
        let result = parse_and_extract(html, "h1, p", &ExtractMode::Text, 10).unwrap();
        assert!(result.contains("Title"));
        assert!(result.contains("Para"));
    }

    #[test]
    fn webscrape_executor_new_with_custom_config() {
        let config = ScrapeConfig {
            timeout: 60,
            max_body_bytes: 512,
        };
        let executor = WebScrapeExecutor::new(&config);
        assert_eq!(executor.max_body_bytes, 512);
    }

    #[test]
    fn webscrape_executor_debug() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let dbg = format!("{executor:?}");
        assert!(dbg.contains("WebScrapeExecutor"));
    }

    #[test]
    fn extract_mode_attr_empty_name() {
        let mode = ExtractMode::parse("attr:");
        assert!(matches!(mode, ExtractMode::Attr(ref s) if s.is_empty()));
    }

    #[test]
    fn default_extract_returns_text() {
        assert_eq!(default_extract(), "text");
    }

    #[test]
    fn scrape_instruction_debug() {
        let json = r#"{"url":"https://example.com","select":"h1"}"#;
        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
        let dbg = format!("{instr:?}");
        assert!(dbg.contains("ScrapeInstruction"));
    }

    #[test]
    fn extract_mode_debug() {
        let mode = ExtractMode::Text;
        let dbg = format!("{mode:?}");
        assert!(dbg.contains("Text"));
    }

    #[test]
    fn ipv4_mapped_ipv6_link_local_blocked() {
        let err = validate_url("https://[::ffff:169.254.0.1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv4_mapped_ipv6_public_allowed() {
        assert!(validate_url("https://[::ffff:93.184.216.34]/path").is_ok());
    }

    #[test]
    fn tool_definitions_returns_web_scrape() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let defs = executor.tool_definitions();
        assert_eq!(defs.len(), 1);
        assert_eq!(defs[0].id, "web_scrape");
        assert_eq!(
            defs[0].invocation,
            crate::registry::InvocationHint::FencedBlock("scrape")
        );
    }

    #[test]
    fn tool_definitions_schema_has_all_params() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let defs = executor.tool_definitions();
        let obj = defs[0].schema.as_object().unwrap();
        let props = obj["properties"].as_object().unwrap();
        assert!(props.contains_key("url"));
        assert!(props.contains_key("select"));
        assert!(props.contains_key("extract"));
        assert!(props.contains_key("limit"));
        let req = obj["required"].as_array().unwrap();
        assert!(req.iter().any(|v| v.as_str() == Some("url")));
        assert!(req.iter().any(|v| v.as_str() == Some("select")));
        assert!(!req.iter().any(|v| v.as_str() == Some("extract")));
    }

    // --- host classification ---

    #[test]
    fn subdomain_localhost_blocked() {
        let host: url::Host<&str> = url::Host::Domain("foo.localhost");
        assert!(is_private_host(&host));
    }

    #[test]
    fn internal_tld_blocked() {
        let host: url::Host<&str> = url::Host::Domain("service.internal");
        assert!(is_private_host(&host));
    }

    #[test]
    fn local_tld_blocked() {
        let host: url::Host<&str> = url::Host::Domain("printer.local");
        assert!(is_private_host(&host));
    }

    #[test]
    fn public_domain_not_blocked() {
        let host: url::Host<&str> = url::Host::Domain("example.com");
        assert!(!is_private_host(&host));
    }

    // --- DNS resolution and SSRF enforcement (async; resolves IP literals
    //     only, so no real DNS lookups are needed) ---

    #[tokio::test]
    async fn resolve_loopback_rejected() {
        let url = url::Url::parse("https://127.0.0.1/path").unwrap();
        let result = resolve_and_validate(&url).await;
        assert!(
            result.is_err(),
            "loopback IP must be rejected by resolve_and_validate"
        );
        let err = result.unwrap_err();
        assert!(matches!(err, crate::executor::ToolError::Blocked { .. }));
    }

    #[tokio::test]
    async fn resolve_private_10_rejected() {
        let url = url::Url::parse("https://10.0.0.1/path").unwrap();
        let result = resolve_and_validate(&url).await;
        assert!(result.is_err());
        assert!(matches!(
            result.unwrap_err(),
            crate::executor::ToolError::Blocked { .. }
        ));
    }

    #[tokio::test]
    async fn resolve_private_192_rejected() {
        let url = url::Url::parse("https://192.168.1.1/path").unwrap();
        let result = resolve_and_validate(&url).await;
        assert!(result.is_err());
        assert!(matches!(
            result.unwrap_err(),
            crate::executor::ToolError::Blocked { .. }
        ));
    }

    #[tokio::test]
    async fn resolve_ipv6_loopback_rejected() {
        let url = url::Url::parse("https://[::1]/path").unwrap();
        let result = resolve_and_validate(&url).await;
        assert!(result.is_err());
        assert!(matches!(
            result.unwrap_err(),
            crate::executor::ToolError::Blocked { .. }
        ));
    }

    #[tokio::test]
    async fn resolve_no_host_returns_ok() {
        let url = url::Url::parse("https://example.com/path").unwrap();
        let url_no_host = url::Url::parse("data:text/plain,hello").unwrap();
        let result = resolve_and_validate(&url_no_host).await;
        assert!(result.is_ok());
        let (host, addrs) = result.unwrap();
        assert!(host.is_empty());
        assert!(addrs.is_empty());
        drop(url);
        drop(url_no_host);
    }
}