1use std::net::{IpAddr, SocketAddr};
2use std::time::Duration;
3
4use schemars::JsonSchema;
5use serde::Deserialize;
6use url::Url;
7
8use crate::config::ScrapeConfig;
9use crate::executor::{ToolCall, ToolError, ToolExecutor, ToolOutput, deserialize_params};
10
// A single scrape request, parsed from a ```scrape fenced block or a
// structured tool call.
// NOTE: plain `//` comments on purpose — `///` doc comments would be picked
// up by the JsonSchema derive and alter the generated tool schema.
#[derive(Debug, Deserialize, JsonSchema)]
struct ScrapeInstruction {
    // Target page. Must be https and pass the SSRF checks in `validate_url`
    // and `resolve_and_validate` before any request is made.
    url: String,
    // CSS selector handed to the HTML parser.
    select: String,
    // Extraction mode: "text", "html", or "attr:<name>" (see `ExtractMode`).
    // Defaults to "text" via serde when omitted.
    #[serde(default = "default_extract")]
    extract: String,
    // Maximum number of matched elements to emit; `scrape_instruction`
    // falls back to 10 when absent.
    limit: Option<usize>,
}
23
/// Serde default for `ScrapeInstruction::extract`.
fn default_extract() -> String {
    String::from("text")
}
27
/// How a matched HTML element is turned into output text.
#[derive(Debug)]
enum ExtractMode {
    /// Concatenated text content of the element.
    Text,
    /// Inner HTML of the element.
    Html,
    /// Value of the named attribute (spelled `attr:<name>` in instructions).
    Attr(String),
}

impl ExtractMode {
    /// Parse the instruction's `extract` field.
    ///
    /// Unrecognized values fall back to `Text`, matching the serde default,
    /// so a malformed mode degrades gracefully instead of erroring.
    fn parse(s: &str) -> Self {
        if let Some(attr_name) = s.strip_prefix("attr:") {
            Self::Attr(attr_name.to_owned())
        } else if s == "html" {
            Self::Html
        } else {
            Self::Text
        }
    }
}
47
/// Tool executor that fetches a page over HTTPS and extracts data via CSS
/// selectors, with SSRF protections (scheme/host checks plus DNS address
/// pinning) applied before any request is made.
#[derive(Debug)]
pub struct WebScrapeExecutor {
    /// Per-request HTTP timeout.
    timeout: Duration,
    /// Maximum allowed response body size in bytes.
    max_body_bytes: usize,
}
57
58impl WebScrapeExecutor {
59 #[must_use]
60 pub fn new(config: &ScrapeConfig) -> Self {
61 Self {
62 timeout: Duration::from_secs(config.timeout),
63 max_body_bytes: config.max_body_bytes,
64 }
65 }
66
67 fn build_client(&self, host: &str, addrs: &[SocketAddr]) -> reqwest::Client {
68 let mut builder = reqwest::Client::builder()
69 .timeout(self.timeout)
70 .redirect(reqwest::redirect::Policy::limited(3));
71 builder = builder.resolve_to_addrs(host, addrs);
72 builder.build().unwrap_or_default()
73 }
74}
75
76impl ToolExecutor for WebScrapeExecutor {
77 fn tool_definitions(&self) -> Vec<crate::registry::ToolDef> {
78 use crate::registry::{InvocationHint, ToolDef};
79 vec![ToolDef {
80 id: "web_scrape",
81 description: "Scrape data from a web page via CSS selectors",
82 schema: schemars::schema_for!(ScrapeInstruction),
83 invocation: InvocationHint::FencedBlock("scrape"),
84 }]
85 }
86
87 async fn execute(&self, response: &str) -> Result<Option<ToolOutput>, ToolError> {
88 let blocks = extract_scrape_blocks(response);
89 if blocks.is_empty() {
90 return Ok(None);
91 }
92
93 let mut outputs = Vec::with_capacity(blocks.len());
94 #[allow(clippy::cast_possible_truncation)]
95 let blocks_executed = blocks.len() as u32;
96
97 for block in &blocks {
98 let instruction: ScrapeInstruction = serde_json::from_str(block).map_err(|e| {
99 ToolError::Execution(std::io::Error::new(
100 std::io::ErrorKind::InvalidData,
101 e.to_string(),
102 ))
103 })?;
104 outputs.push(self.scrape_instruction(&instruction).await?);
105 }
106
107 Ok(Some(ToolOutput {
108 tool_name: "web-scrape".to_owned(),
109 summary: outputs.join("\n\n"),
110 blocks_executed,
111 filter_stats: None,
112 diff: None,
113 streamed: false,
114 }))
115 }
116
117 async fn execute_tool_call(&self, call: &ToolCall) -> Result<Option<ToolOutput>, ToolError> {
118 if call.tool_id != "web_scrape" {
119 return Ok(None);
120 }
121
122 let instruction: ScrapeInstruction = deserialize_params(&call.params)?;
123
124 let result = self.scrape_instruction(&instruction).await?;
125
126 Ok(Some(ToolOutput {
127 tool_name: "web-scrape".to_owned(),
128 summary: result,
129 blocks_executed: 1,
130 filter_stats: None,
131 diff: None,
132 streamed: false,
133 }))
134 }
135}
136
137impl WebScrapeExecutor {
138 async fn scrape_instruction(
139 &self,
140 instruction: &ScrapeInstruction,
141 ) -> Result<String, ToolError> {
142 let parsed = validate_url(&instruction.url)?;
143 let (host, addrs) = resolve_and_validate(&parsed).await?;
144 let client = self.build_client(&host, &addrs);
147 let html = self.fetch_html(&client, &instruction.url).await?;
148 let selector = instruction.select.clone();
149 let extract = ExtractMode::parse(&instruction.extract);
150 let limit = instruction.limit.unwrap_or(10);
151 tokio::task::spawn_blocking(move || parse_and_extract(&html, &selector, &extract, limit))
152 .await
153 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?
154 }
155
156 async fn fetch_html(&self, client: &reqwest::Client, url: &str) -> Result<String, ToolError> {
157 let resp = client
158 .get(url)
159 .send()
160 .await
161 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
162
163 if !resp.status().is_success() {
164 return Err(ToolError::Execution(std::io::Error::other(format!(
165 "HTTP {}",
166 resp.status(),
167 ))));
168 }
169
170 let bytes = resp
171 .bytes()
172 .await
173 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
174
175 if bytes.len() > self.max_body_bytes {
176 return Err(ToolError::Execution(std::io::Error::other(format!(
177 "response too large: {} bytes (max: {})",
178 bytes.len(),
179 self.max_body_bytes,
180 ))));
181 }
182
183 String::from_utf8(bytes.to_vec())
184 .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))
185 }
186}
187
/// Pull the contents of every ```scrape fenced block out of `text`.
fn extract_scrape_blocks(text: &str) -> Vec<&str> {
    crate::executor::extract_fenced_blocks(text, "scrape")
}
191
192fn validate_url(raw: &str) -> Result<Url, ToolError> {
193 let parsed = Url::parse(raw).map_err(|_| ToolError::Blocked {
194 command: format!("invalid URL: {raw}"),
195 })?;
196
197 if parsed.scheme() != "https" {
198 return Err(ToolError::Blocked {
199 command: format!("scheme not allowed: {}", parsed.scheme()),
200 });
201 }
202
203 if let Some(host) = parsed.host()
204 && is_private_host(&host)
205 {
206 return Err(ToolError::Blocked {
207 command: format!(
208 "private/local host blocked: {}",
209 parsed.host_str().unwrap_or("")
210 ),
211 });
212 }
213
214 Ok(parsed)
215}
216
/// Classify an IP address as private/internal for SSRF filtering.
///
/// IPv4: loopback, RFC 1918 private, link-local, unspecified, broadcast.
/// IPv6: loopback, unspecified, link-local (fe80::/10), unique-local
/// (fc00::/7), and IPv4-mapped addresses (`::ffff:a.b.c.d`), which are
/// checked against the IPv4 rules. All other IPv6 addresses are public.
pub(crate) fn is_private_ip(ip: IpAddr) -> bool {
    // Shared IPv4 classification, also applied to IPv4-mapped IPv6.
    fn v4_is_private(v4: std::net::Ipv4Addr) -> bool {
        v4.is_loopback()
            || v4.is_private()
            || v4.is_link_local()
            || v4.is_unspecified()
            || v4.is_broadcast()
    }

    match ip {
        IpAddr::V4(v4) => v4_is_private(v4),
        IpAddr::V6(v6) => {
            if v6.is_loopback() || v6.is_unspecified() {
                return true;
            }
            let first = v6.segments()[0];
            // fe80::/10 — link-local.
            if first & 0xffc0 == 0xfe80 {
                return true;
            }
            // fc00::/7 — unique-local.
            if first & 0xfe00 == 0xfc00 {
                return true;
            }
            // `to_ipv4_mapped` yields Some exactly for ::ffff:a.b.c.d.
            v6.to_ipv4_mapped().is_some_and(v4_is_private)
        }
    }
}
254
255fn is_private_host(host: &url::Host<&str>) -> bool {
256 match host {
257 url::Host::Domain(d) => {
258 #[allow(clippy::case_sensitive_file_extension_comparisons)]
261 {
262 *d == "localhost"
263 || d.ends_with(".localhost")
264 || d.ends_with(".internal")
265 || d.ends_with(".local")
266 }
267 }
268 url::Host::Ipv4(v4) => is_private_ip(IpAddr::V4(*v4)),
269 url::Host::Ipv6(v6) => is_private_ip(IpAddr::V6(*v6)),
270 }
271}
272
273async fn resolve_and_validate(url: &Url) -> Result<(String, Vec<SocketAddr>), ToolError> {
279 let Some(host) = url.host_str() else {
280 return Ok((String::new(), vec![]));
281 };
282 let port = url.port_or_known_default().unwrap_or(443);
283 let addrs: Vec<SocketAddr> = tokio::net::lookup_host(format!("{host}:{port}"))
284 .await
285 .map_err(|e| ToolError::Blocked {
286 command: format!("DNS resolution failed: {e}"),
287 })?
288 .collect();
289 for addr in &addrs {
290 if is_private_ip(addr.ip()) {
291 return Err(ToolError::Blocked {
292 command: format!("SSRF protection: private IP {} for host {host}", addr.ip()),
293 });
294 }
295 }
296 Ok((host.to_owned(), addrs))
297}
298
299fn parse_and_extract(
300 html: &str,
301 selector: &str,
302 extract: &ExtractMode,
303 limit: usize,
304) -> Result<String, ToolError> {
305 let soup = scrape_core::Soup::parse(html);
306
307 let tags = soup.find_all(selector).map_err(|e| {
308 ToolError::Execution(std::io::Error::new(
309 std::io::ErrorKind::InvalidData,
310 format!("invalid selector: {e}"),
311 ))
312 })?;
313
314 let mut results = Vec::new();
315
316 for tag in tags.into_iter().take(limit) {
317 let value = match extract {
318 ExtractMode::Text => tag.text(),
319 ExtractMode::Html => tag.inner_html(),
320 ExtractMode::Attr(name) => tag.get(name).unwrap_or_default().to_owned(),
321 };
322 if !value.trim().is_empty() {
323 results.push(value.trim().to_owned());
324 }
325 }
326
327 if results.is_empty() {
328 Ok(format!("No results for selector: {selector}"))
329 } else {
330 Ok(results.join("\n"))
331 }
332}
333
#[cfg(test)]
mod tests {
    use super::*;

    // --- fenced ```scrape block extraction ---

    #[test]
    fn extract_single_block() {
        let text =
            "Here:\n```scrape\n{\"url\":\"https://example.com\",\"select\":\"h1\"}\n```\nDone.";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 1);
        assert!(blocks[0].contains("example.com"));
    }

    #[test]
    fn extract_multiple_blocks() {
        let text = "```scrape\n{\"url\":\"https://a.com\",\"select\":\"h1\"}\n```\ntext\n```scrape\n{\"url\":\"https://b.com\",\"select\":\"p\"}\n```";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 2);
    }

    #[test]
    fn no_blocks_returns_empty() {
        let blocks = extract_scrape_blocks("plain text, no code blocks");
        assert!(blocks.is_empty());
    }

    #[test]
    fn unclosed_block_ignored() {
        let blocks = extract_scrape_blocks("```scrape\n{\"url\":\"https://x.com\"}");
        assert!(blocks.is_empty());
    }

    #[test]
    fn non_scrape_block_ignored() {
        let text =
            "```bash\necho hi\n```\n```scrape\n{\"url\":\"https://x.com\",\"select\":\"h1\"}\n```";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 1);
        assert!(blocks[0].contains("x.com"));
    }

    #[test]
    fn multiline_json_block() {
        let text =
            "```scrape\n{\n \"url\": \"https://example.com\",\n \"select\": \"h1\"\n}\n```";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 1);
        let instr: ScrapeInstruction = serde_json::from_str(blocks[0]).unwrap();
        assert_eq!(instr.url, "https://example.com");
    }

    // --- ScrapeInstruction JSON parsing ---

    #[test]
    fn parse_valid_instruction() {
        let json = r#"{"url":"https://example.com","select":"h1","extract":"text","limit":5}"#;
        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
        assert_eq!(instr.url, "https://example.com");
        assert_eq!(instr.select, "h1");
        assert_eq!(instr.extract, "text");
        assert_eq!(instr.limit, Some(5));
    }

    #[test]
    fn parse_minimal_instruction() {
        let json = r#"{"url":"https://example.com","select":"p"}"#;
        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
        assert_eq!(instr.extract, "text");
        assert!(instr.limit.is_none());
    }

    #[test]
    fn parse_attr_extract() {
        let json = r#"{"url":"https://example.com","select":"a","extract":"attr:href"}"#;
        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
        assert_eq!(instr.extract, "attr:href");
    }

    #[test]
    fn parse_invalid_json_errors() {
        let result = serde_json::from_str::<ScrapeInstruction>("not json");
        assert!(result.is_err());
    }

    // --- ExtractMode parsing ---

    #[test]
    fn extract_mode_text() {
        assert!(matches!(ExtractMode::parse("text"), ExtractMode::Text));
    }

    #[test]
    fn extract_mode_html() {
        assert!(matches!(ExtractMode::parse("html"), ExtractMode::Html));
    }

    #[test]
    fn extract_mode_attr() {
        let mode = ExtractMode::parse("attr:href");
        assert!(matches!(mode, ExtractMode::Attr(ref s) if s == "href"));
    }

    #[test]
    fn extract_mode_unknown_defaults_to_text() {
        assert!(matches!(ExtractMode::parse("unknown"), ExtractMode::Text));
    }

    // --- URL validation / SSRF guards (literal-level checks) ---

    #[test]
    fn valid_https_url() {
        assert!(validate_url("https://example.com").is_ok());
    }

    #[test]
    fn http_rejected() {
        let err = validate_url("http://example.com").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ftp_rejected() {
        let err = validate_url("ftp://files.example.com").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn file_rejected() {
        let err = validate_url("file:///etc/passwd").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn invalid_url_rejected() {
        let err = validate_url("not a url").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn localhost_blocked() {
        let err = validate_url("https://localhost/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn loopback_ip_blocked() {
        let err = validate_url("https://127.0.0.1/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn private_10_blocked() {
        let err = validate_url("https://10.0.0.1/api").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn private_172_blocked() {
        let err = validate_url("https://172.16.0.1/api").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn private_192_blocked() {
        let err = validate_url("https://192.168.1.1/api").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv6_loopback_blocked() {
        let err = validate_url("https://[::1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn public_ip_allowed() {
        assert!(validate_url("https://93.184.216.34/page").is_ok());
    }

    // --- HTML parsing and extraction ---

    #[test]
    fn extract_text_from_html() {
        let html = "<html><body><h1>Hello World</h1><p>Content</p></body></html>";
        let result = parse_and_extract(html, "h1", &ExtractMode::Text, 10).unwrap();
        assert_eq!(result, "Hello World");
    }

    #[test]
    fn extract_multiple_elements() {
        let html = "<ul><li>A</li><li>B</li><li>C</li></ul>";
        let result = parse_and_extract(html, "li", &ExtractMode::Text, 10).unwrap();
        assert_eq!(result, "A\nB\nC");
    }

    #[test]
    fn extract_with_limit() {
        let html = "<ul><li>A</li><li>B</li><li>C</li></ul>";
        let result = parse_and_extract(html, "li", &ExtractMode::Text, 2).unwrap();
        assert_eq!(result, "A\nB");
    }

    #[test]
    fn extract_attr_href() {
        let html = r#"<a href="https://example.com">Link</a>"#;
        let result =
            parse_and_extract(html, "a", &ExtractMode::Attr("href".to_owned()), 10).unwrap();
        assert_eq!(result, "https://example.com");
    }

    #[test]
    fn extract_inner_html() {
        let html = "<div><span>inner</span></div>";
        let result = parse_and_extract(html, "div", &ExtractMode::Html, 10).unwrap();
        assert!(result.contains("<span>inner</span>"));
    }

    #[test]
    fn no_matches_returns_message() {
        let html = "<html><body><p>text</p></body></html>";
        let result = parse_and_extract(html, "h1", &ExtractMode::Text, 10).unwrap();
        assert!(result.starts_with("No results for selector:"));
    }

    #[test]
    fn empty_text_skipped() {
        let html = "<ul><li> </li><li>A</li></ul>";
        let result = parse_and_extract(html, "li", &ExtractMode::Text, 10).unwrap();
        assert_eq!(result, "A");
    }

    #[test]
    fn invalid_selector_errors() {
        let html = "<html><body></body></html>";
        let result = parse_and_extract(html, "[[[invalid", &ExtractMode::Text, 10);
        assert!(result.is_err());
    }

    #[test]
    fn empty_html_returns_no_results() {
        let result = parse_and_extract("", "h1", &ExtractMode::Text, 10).unwrap();
        assert!(result.starts_with("No results for selector:"));
    }

    #[test]
    fn nested_selector() {
        let html = "<div><span>inner</span></div><span>outer</span>";
        let result = parse_and_extract(html, "div > span", &ExtractMode::Text, 10).unwrap();
        assert_eq!(result, "inner");
    }

    #[test]
    fn attr_missing_returns_empty() {
        // A missing attribute extracts as "" and is dropped as whitespace-only.
        let html = r#"<a>No href</a>"#;
        let result =
            parse_and_extract(html, "a", &ExtractMode::Attr("href".to_owned()), 10).unwrap();
        assert!(result.starts_with("No results for selector:"));
    }

    #[test]
    fn extract_html_mode() {
        let html = "<div><b>bold</b> text</div>";
        let result = parse_and_extract(html, "div", &ExtractMode::Html, 10).unwrap();
        assert!(result.contains("<b>bold</b>"));
    }

    #[test]
    fn limit_zero_returns_no_results() {
        let html = "<ul><li>A</li><li>B</li></ul>";
        let result = parse_and_extract(html, "li", &ExtractMode::Text, 0).unwrap();
        assert!(result.starts_with("No results for selector:"));
    }

    // --- more URL validation edge cases ---

    #[test]
    fn url_with_port_allowed() {
        assert!(validate_url("https://example.com:8443/path").is_ok());
    }

    #[test]
    fn link_local_ip_blocked() {
        let err = validate_url("https://169.254.1.1/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn url_no_scheme_rejected() {
        let err = validate_url("example.com/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn unspecified_ipv4_blocked() {
        let err = validate_url("https://0.0.0.0/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn broadcast_ipv4_blocked() {
        let err = validate_url("https://255.255.255.255/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv6_link_local_blocked() {
        let err = validate_url("https://[fe80::1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv6_unique_local_blocked() {
        let err = validate_url("https://[fd12::1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv4_mapped_ipv6_loopback_blocked() {
        let err = validate_url("https://[::ffff:127.0.0.1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv4_mapped_ipv6_private_blocked() {
        let err = validate_url("https://[::ffff:10.0.0.1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    // --- executor end-to-end (no network: all inputs are rejected before
    //     any request is made, or target unroutable TEST-NET addresses) ---

    #[tokio::test]
    async fn executor_no_blocks_returns_none() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let result = executor.execute("plain text").await;
        assert!(result.unwrap().is_none());
    }

    #[tokio::test]
    async fn executor_invalid_json_errors() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\nnot json\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Execution(_))));
    }

    #[tokio::test]
    async fn executor_blocked_url_errors() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"http://example.com\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Blocked { .. })));
    }

    #[tokio::test]
    async fn executor_private_ip_blocked() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"https://192.168.1.1/api\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Blocked { .. })));
    }

    #[tokio::test]
    async fn executor_unreachable_host_returns_error() {
        // 192.0.2.1 is TEST-NET-1 (RFC 5737): public but unroutable, so the
        // request fails at connect time within the 1-second timeout.
        let config = ScrapeConfig {
            timeout: 1,
            max_body_bytes: 1_048_576,
        };
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"https://192.0.2.1:1/page\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Execution(_))));
    }

    #[tokio::test]
    async fn executor_localhost_url_blocked() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"https://localhost:9999/api\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Blocked { .. })));
    }

    #[tokio::test]
    async fn executor_empty_text_returns_none() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let result = executor.execute("").await;
        assert!(result.unwrap().is_none());
    }

    #[tokio::test]
    async fn executor_multiple_blocks_first_blocked() {
        // Execution fails fast: the second (valid) block is never fetched.
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"http://evil.com\",\"select\":\"h1\"}\n```\n\
            ```scrape\n{\"url\":\"https://ok.com\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(result.is_err());
    }

    #[test]
    fn validate_url_empty_string() {
        let err = validate_url("").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn validate_url_javascript_scheme_blocked() {
        let err = validate_url("javascript:alert(1)").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn validate_url_data_scheme_blocked() {
        let err = validate_url("data:text/html,<h1>hi</h1>").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn is_private_host_public_domain_is_false() {
        let host: url::Host<&str> = url::Host::Domain("example.com");
        assert!(!is_private_host(&host));
    }

    #[test]
    fn is_private_host_localhost_is_true() {
        let host: url::Host<&str> = url::Host::Domain("localhost");
        assert!(is_private_host(&host));
    }

    #[test]
    fn is_private_host_ipv6_unspecified_is_true() {
        let host = url::Host::Ipv6(std::net::Ipv6Addr::UNSPECIFIED);
        assert!(is_private_host(&host));
    }

    #[test]
    fn is_private_host_public_ipv6_is_false() {
        let host = url::Host::Ipv6("2001:db8::1".parse().unwrap());
        assert!(!is_private_host(&host));
    }

    #[test]
    fn extract_scrape_blocks_empty_block_content() {
        let text = "```scrape\n\n```";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 1);
        assert!(blocks[0].is_empty());
    }

    #[test]
    fn extract_scrape_blocks_whitespace_only() {
        let text = "```scrape\n \n```";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 1);
    }

    #[test]
    fn parse_and_extract_multiple_selectors() {
        let html = "<div><h1>Title</h1><p>Para</p></div>";
        let result = parse_and_extract(html, "h1, p", &ExtractMode::Text, 10).unwrap();
        assert!(result.contains("Title"));
        assert!(result.contains("Para"));
    }

    #[test]
    fn webscrape_executor_new_with_custom_config() {
        let config = ScrapeConfig {
            timeout: 60,
            max_body_bytes: 512,
        };
        let executor = WebScrapeExecutor::new(&config);
        assert_eq!(executor.max_body_bytes, 512);
    }

    #[test]
    fn webscrape_executor_debug() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let dbg = format!("{executor:?}");
        assert!(dbg.contains("WebScrapeExecutor"));
    }

    #[test]
    fn extract_mode_attr_empty_name() {
        let mode = ExtractMode::parse("attr:");
        assert!(matches!(mode, ExtractMode::Attr(ref s) if s.is_empty()));
    }

    #[test]
    fn default_extract_returns_text() {
        assert_eq!(default_extract(), "text");
    }

    #[test]
    fn scrape_instruction_debug() {
        let json = r#"{"url":"https://example.com","select":"h1"}"#;
        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
        let dbg = format!("{instr:?}");
        assert!(dbg.contains("ScrapeInstruction"));
    }

    #[test]
    fn extract_mode_debug() {
        let mode = ExtractMode::Text;
        let dbg = format!("{mode:?}");
        assert!(dbg.contains("Text"));
    }

    #[test]
    fn ipv4_mapped_ipv6_link_local_blocked() {
        let err = validate_url("https://[::ffff:169.254.0.1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv4_mapped_ipv6_public_allowed() {
        assert!(validate_url("https://[::ffff:93.184.216.34]/path").is_ok());
    }

    #[test]
    fn tool_definitions_returns_web_scrape() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let defs = executor.tool_definitions();
        assert_eq!(defs.len(), 1);
        assert_eq!(defs[0].id, "web_scrape");
        assert_eq!(
            defs[0].invocation,
            crate::registry::InvocationHint::FencedBlock("scrape")
        );
    }

    #[test]
    fn tool_definitions_schema_has_all_params() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let defs = executor.tool_definitions();
        let obj = defs[0].schema.as_object().unwrap();
        let props = obj["properties"].as_object().unwrap();
        assert!(props.contains_key("url"));
        assert!(props.contains_key("select"));
        assert!(props.contains_key("extract"));
        assert!(props.contains_key("limit"));
        let req = obj["required"].as_array().unwrap();
        assert!(req.iter().any(|v| v.as_str() == Some("url")));
        assert!(req.iter().any(|v| v.as_str() == Some("select")));
        assert!(!req.iter().any(|v| v.as_str() == Some("extract")));
    }

    // --- host classification ---

    #[test]
    fn subdomain_localhost_blocked() {
        let host: url::Host<&str> = url::Host::Domain("foo.localhost");
        assert!(is_private_host(&host));
    }

    #[test]
    fn internal_tld_blocked() {
        let host: url::Host<&str> = url::Host::Domain("service.internal");
        assert!(is_private_host(&host));
    }

    #[test]
    fn local_tld_blocked() {
        let host: url::Host<&str> = url::Host::Domain("printer.local");
        assert!(is_private_host(&host));
    }

    #[test]
    fn public_domain_not_blocked() {
        let host: url::Host<&str> = url::Host::Domain("example.com");
        assert!(!is_private_host(&host));
    }

    // --- DNS resolution checks (IP-literal hosts resolve locally, so these
    //     run without real DNS traffic) ---

    #[tokio::test]
    async fn resolve_loopback_rejected() {
        let url = url::Url::parse("https://127.0.0.1/path").unwrap();
        let result = resolve_and_validate(&url).await;
        assert!(
            result.is_err(),
            "loopback IP must be rejected by resolve_and_validate"
        );
        let err = result.unwrap_err();
        assert!(matches!(err, crate::executor::ToolError::Blocked { .. }));
    }

    #[tokio::test]
    async fn resolve_private_10_rejected() {
        let url = url::Url::parse("https://10.0.0.1/path").unwrap();
        let result = resolve_and_validate(&url).await;
        assert!(result.is_err());
        assert!(matches!(
            result.unwrap_err(),
            crate::executor::ToolError::Blocked { .. }
        ));
    }

    #[tokio::test]
    async fn resolve_private_192_rejected() {
        let url = url::Url::parse("https://192.168.1.1/path").unwrap();
        let result = resolve_and_validate(&url).await;
        assert!(result.is_err());
        assert!(matches!(
            result.unwrap_err(),
            crate::executor::ToolError::Blocked { .. }
        ));
    }

    #[tokio::test]
    async fn resolve_ipv6_loopback_rejected() {
        let url = url::Url::parse("https://[::1]/path").unwrap();
        let result = resolve_and_validate(&url).await;
        assert!(result.is_err());
        assert!(matches!(
            result.unwrap_err(),
            crate::executor::ToolError::Blocked { .. }
        ));
    }

    #[tokio::test]
    async fn resolve_no_host_returns_ok() {
        let url = url::Url::parse("https://example.com/path").unwrap();
        // A data: URL has no host component, so resolution is skipped.
        let url_no_host = url::Url::parse("data:text/plain,hello").unwrap();
        let result = resolve_and_validate(&url_no_host).await;
        assert!(result.is_ok());
        let (host, addrs) = result.unwrap();
        assert!(host.is_empty());
        assert!(addrs.is_empty());
        drop(url);
        drop(url_no_host);
    }
}