// zeph_tools/scrape.rs
1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4use std::net::{IpAddr, SocketAddr};
5use std::time::Duration;
6
7use schemars::JsonSchema;
8use serde::Deserialize;
9use url::Url;
10
11use crate::config::ScrapeConfig;
12use crate::executor::{ToolCall, ToolError, ToolExecutor, ToolOutput, deserialize_params};
13
// One scrape request, parsed from a ```scrape JSON block or a structured tool call.
// Field doc comments double as JSON-schema descriptions via `schemars`.
#[derive(Debug, Deserialize, JsonSchema)]
struct ScrapeInstruction {
    /// HTTPS URL to scrape
    url: String,
    /// CSS selector
    select: String,
    /// Extract mode: text, html, or attr:<name>
    // Defaults to "text" when the field is omitted (see `default_extract`).
    #[serde(default = "default_extract")]
    extract: String,
    /// Max results to return
    // `None` falls back to 10 in `scrape_instruction`.
    limit: Option<usize>,
}
26
/// Serde default for `ScrapeInstruction::extract`.
fn default_extract() -> String {
    String::from("text")
}
30
/// How a value is pulled out of each matched HTML element.
#[derive(Debug)]
enum ExtractMode {
    Text,
    Html,
    Attr(String),
}

impl ExtractMode {
    /// Interprets the `extract` field of a `ScrapeInstruction`.
    ///
    /// Recognizes `"text"`, `"html"`, and `"attr:<name>"`; any other value
    /// silently falls back to `Text`.
    fn parse(s: &str) -> Self {
        if let Some(name) = s.strip_prefix("attr:") {
            return Self::Attr(name.to_owned());
        }
        match s {
            "html" => Self::Html,
            _ => Self::Text,
        }
    }
}
50
/// Extracts data from web pages via CSS selectors.
///
/// Detects ` ```scrape ` blocks in LLM responses containing JSON instructions,
/// fetches the URL, and parses HTML with `scrape-core`.
#[derive(Debug)]
pub struct WebScrapeExecutor {
    // Per-request timeout applied to the HTTP client.
    timeout: Duration,
    // Hard cap on the downloaded response body size, in bytes.
    max_body_bytes: usize,
}
60
61impl WebScrapeExecutor {
62    #[must_use]
63    pub fn new(config: &ScrapeConfig) -> Self {
64        Self {
65            timeout: Duration::from_secs(config.timeout),
66            max_body_bytes: config.max_body_bytes,
67        }
68    }
69
70    fn build_client(&self, host: &str, addrs: &[SocketAddr]) -> reqwest::Client {
71        let mut builder = reqwest::Client::builder()
72            .timeout(self.timeout)
73            .redirect(reqwest::redirect::Policy::none());
74        builder = builder.resolve_to_addrs(host, addrs);
75        builder.build().unwrap_or_default()
76    }
77}
78
impl ToolExecutor for WebScrapeExecutor {
    /// Advertises the single `web_scrape` tool, invocable either as a native
    /// tool call or via a ```scrape fenced block in free-form model output.
    fn tool_definitions(&self) -> Vec<crate::registry::ToolDef> {
        use crate::registry::{InvocationHint, ToolDef};
        vec![ToolDef {
            id: "web_scrape".into(),
            description: "Scrape data from a web page via CSS selectors".into(),
            schema: schemars::schema_for!(ScrapeInstruction),
            invocation: InvocationHint::FencedBlock("scrape"),
        }]
    }

    /// Scans a free-form LLM response for ```scrape blocks and runs each one.
    ///
    /// Returns `Ok(None)` when the response contains no scrape blocks.
    /// Fail-fast: the first malformed JSON block or blocked/failed fetch
    /// aborts the whole batch with an error.
    async fn execute(&self, response: &str) -> Result<Option<ToolOutput>, ToolError> {
        let blocks = extract_scrape_blocks(response);
        if blocks.is_empty() {
            return Ok(None);
        }

        let mut outputs = Vec::with_capacity(blocks.len());
        #[allow(clippy::cast_possible_truncation)]
        let blocks_executed = blocks.len() as u32;

        for block in &blocks {
            // Each block must be a standalone JSON `ScrapeInstruction`.
            let instruction: ScrapeInstruction = serde_json::from_str(block).map_err(|e| {
                ToolError::Execution(std::io::Error::new(
                    std::io::ErrorKind::InvalidData,
                    e.to_string(),
                ))
            })?;
            outputs.push(self.scrape_instruction(&instruction).await?);
        }

        Ok(Some(ToolOutput {
            tool_name: "web-scrape".to_owned(),
            summary: outputs.join("\n\n"),
            blocks_executed,
            filter_stats: None,
            diff: None,
            streamed: false,
            terminal_id: None,
        }))
    }

    /// Handles a structured tool call; returns `Ok(None)` for other tool ids
    /// so the dispatcher can try the next executor.
    async fn execute_tool_call(&self, call: &ToolCall) -> Result<Option<ToolOutput>, ToolError> {
        if call.tool_id != "web_scrape" {
            return Ok(None);
        }

        let instruction: ScrapeInstruction = deserialize_params(&call.params)?;

        let result = self.scrape_instruction(&instruction).await?;

        Ok(Some(ToolOutput {
            tool_name: "web-scrape".to_owned(),
            summary: result,
            blocks_executed: 1,
            filter_stats: None,
            diff: None,
            streamed: false,
            terminal_id: None,
        }))
    }
}
141
142impl WebScrapeExecutor {
143    async fn scrape_instruction(
144        &self,
145        instruction: &ScrapeInstruction,
146    ) -> Result<String, ToolError> {
147        let parsed = validate_url(&instruction.url)?;
148        let (host, addrs) = resolve_and_validate(&parsed).await?;
149        let html = self.fetch_html(&instruction.url, &host, &addrs).await?;
150        let selector = instruction.select.clone();
151        let extract = ExtractMode::parse(&instruction.extract);
152        let limit = instruction.limit.unwrap_or(10);
153        tokio::task::spawn_blocking(move || parse_and_extract(&html, &selector, &extract, limit))
154            .await
155            .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?
156    }
157
158    /// Fetches the HTML at `url`, manually following up to 3 redirects.
159    ///
160    /// Each redirect target is validated with `validate_url` and `resolve_and_validate`
161    /// before following, preventing SSRF via redirect chains.
162    ///
163    /// # Errors
164    ///
165    /// Returns `ToolError::Blocked` if any redirect target resolves to a private IP.
166    /// Returns `ToolError::Execution` on HTTP errors, too-large bodies, or too many redirects.
167    async fn fetch_html(
168        &self,
169        url: &str,
170        host: &str,
171        addrs: &[SocketAddr],
172    ) -> Result<String, ToolError> {
173        const MAX_REDIRECTS: usize = 3;
174
175        let mut current_url = url.to_owned();
176        let mut current_host = host.to_owned();
177        let mut current_addrs = addrs.to_vec();
178
179        for hop in 0..=MAX_REDIRECTS {
180            // Build a per-hop client pinned to the current hop's validated addresses.
181            let client = self.build_client(&current_host, &current_addrs);
182            let resp = client
183                .get(&current_url)
184                .send()
185                .await
186                .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
187
188            let status = resp.status();
189
190            if status.is_redirection() {
191                if hop == MAX_REDIRECTS {
192                    return Err(ToolError::Execution(std::io::Error::other(
193                        "too many redirects",
194                    )));
195                }
196
197                let location = resp
198                    .headers()
199                    .get(reqwest::header::LOCATION)
200                    .and_then(|v| v.to_str().ok())
201                    .ok_or_else(|| {
202                        ToolError::Execution(std::io::Error::other("redirect with no Location"))
203                    })?;
204
205                // Resolve relative redirect URLs against the current URL.
206                let base = Url::parse(&current_url)
207                    .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
208                let next_url = base
209                    .join(location)
210                    .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
211
212                let validated = validate_url(next_url.as_str())?;
213                let (next_host, next_addrs) = resolve_and_validate(&validated).await?;
214
215                current_url = next_url.to_string();
216                current_host = next_host;
217                current_addrs = next_addrs;
218                continue;
219            }
220
221            if !status.is_success() {
222                return Err(ToolError::Execution(std::io::Error::other(format!(
223                    "HTTP {status}",
224                ))));
225            }
226
227            let bytes = resp
228                .bytes()
229                .await
230                .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
231
232            if bytes.len() > self.max_body_bytes {
233                return Err(ToolError::Execution(std::io::Error::other(format!(
234                    "response too large: {} bytes (max: {})",
235                    bytes.len(),
236                    self.max_body_bytes,
237                ))));
238            }
239
240            return String::from_utf8(bytes.to_vec())
241                .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())));
242        }
243
244        Err(ToolError::Execution(std::io::Error::other(
245            "too many redirects",
246        )))
247    }
248}
249
/// Returns the raw contents of every ```scrape fenced block found in `text`.
fn extract_scrape_blocks(text: &str) -> Vec<&str> {
    crate::executor::extract_fenced_blocks(text, "scrape")
}
253
254fn validate_url(raw: &str) -> Result<Url, ToolError> {
255    let parsed = Url::parse(raw).map_err(|_| ToolError::Blocked {
256        command: format!("invalid URL: {raw}"),
257    })?;
258
259    if parsed.scheme() != "https" {
260        return Err(ToolError::Blocked {
261            command: format!("scheme not allowed: {}", parsed.scheme()),
262        });
263    }
264
265    if let Some(host) = parsed.host()
266        && is_private_host(&host)
267    {
268        return Err(ToolError::Blocked {
269            command: format!(
270                "private/local host blocked: {}",
271                parsed.host_str().unwrap_or("")
272            ),
273        });
274    }
275
276    Ok(parsed)
277}
278
/// Returns `true` for addresses that must never be scraped: loopback,
/// RFC 1918 private, link-local, unspecified, broadcast, IPv6 unique-local,
/// and IPv4-mapped IPv6 forms of any of the above.
pub(crate) fn is_private_ip(ip: IpAddr) -> bool {
    // Shared IPv4 classification, reused for IPv4-mapped IPv6 addresses.
    fn v4_is_private(v4: std::net::Ipv4Addr) -> bool {
        v4.is_loopback()
            || v4.is_private()
            || v4.is_link_local()
            || v4.is_unspecified()
            || v4.is_broadcast()
    }

    match ip {
        IpAddr::V4(v4) => v4_is_private(v4),
        IpAddr::V6(v6) => {
            if v6.is_loopback() || v6.is_unspecified() {
                return true;
            }
            let first = v6.segments()[0];
            // fe80::/10 link-local, fc00::/7 unique-local.
            if first & 0xffc0 == 0xfe80 || first & 0xfe00 == 0xfc00 {
                return true;
            }
            // ::ffff:a.b.c.d (IPv4-mapped) — judge by the embedded IPv4.
            match v6.to_ipv4_mapped() {
                Some(v4) => v4_is_private(v4),
                None => false,
            }
        }
    }
}
316
317fn is_private_host(host: &url::Host<&str>) -> bool {
318    match host {
319        url::Host::Domain(d) => {
320            // Exact match or subdomain of localhost (e.g. foo.localhost)
321            // and .internal/.local TLDs used in cloud/k8s environments.
322            #[allow(clippy::case_sensitive_file_extension_comparisons)]
323            {
324                *d == "localhost"
325                    || d.ends_with(".localhost")
326                    || d.ends_with(".internal")
327                    || d.ends_with(".local")
328            }
329        }
330        url::Host::Ipv4(v4) => is_private_ip(IpAddr::V4(*v4)),
331        url::Host::Ipv6(v6) => is_private_ip(IpAddr::V6(*v6)),
332    }
333}
334
335/// Resolves DNS for the URL host, validates all resolved IPs against private ranges,
336/// and returns the hostname and validated socket addresses.
337///
338/// Returning the addresses allows the caller to pin the HTTP client to these exact
339/// addresses, eliminating TOCTOU between DNS validation and the actual connection.
340async fn resolve_and_validate(url: &Url) -> Result<(String, Vec<SocketAddr>), ToolError> {
341    let Some(host) = url.host_str() else {
342        return Ok((String::new(), vec![]));
343    };
344    let port = url.port_or_known_default().unwrap_or(443);
345    let addrs: Vec<SocketAddr> = tokio::net::lookup_host(format!("{host}:{port}"))
346        .await
347        .map_err(|e| ToolError::Blocked {
348            command: format!("DNS resolution failed: {e}"),
349        })?
350        .collect();
351    for addr in &addrs {
352        if is_private_ip(addr.ip()) {
353            return Err(ToolError::Blocked {
354                command: format!("SSRF protection: private IP {} for host {host}", addr.ip()),
355            });
356        }
357    }
358    Ok((host.to_owned(), addrs))
359}
360
361fn parse_and_extract(
362    html: &str,
363    selector: &str,
364    extract: &ExtractMode,
365    limit: usize,
366) -> Result<String, ToolError> {
367    let soup = scrape_core::Soup::parse(html);
368
369    let tags = soup.find_all(selector).map_err(|e| {
370        ToolError::Execution(std::io::Error::new(
371            std::io::ErrorKind::InvalidData,
372            format!("invalid selector: {e}"),
373        ))
374    })?;
375
376    let mut results = Vec::new();
377
378    for tag in tags.into_iter().take(limit) {
379        let value = match extract {
380            ExtractMode::Text => tag.text(),
381            ExtractMode::Html => tag.inner_html(),
382            ExtractMode::Attr(name) => tag.get(name).unwrap_or_default().to_owned(),
383        };
384        if !value.trim().is_empty() {
385            results.push(value.trim().to_owned());
386        }
387    }
388
389    if results.is_empty() {
390        Ok(format!("No results for selector: {selector}"))
391    } else {
392        Ok(results.join("\n"))
393    }
394}
395
396#[cfg(test)]
397mod tests {
398    use super::*;
399
    // --- extract_scrape_blocks ---
    //
    // Pure fenced-block detection; no JSON validation or network involved.

    #[test]
    fn extract_single_block() {
        let text =
            "Here:\n```scrape\n{\"url\":\"https://example.com\",\"select\":\"h1\"}\n```\nDone.";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 1);
        assert!(blocks[0].contains("example.com"));
    }

    #[test]
    fn extract_multiple_blocks() {
        let text = "```scrape\n{\"url\":\"https://a.com\",\"select\":\"h1\"}\n```\ntext\n```scrape\n{\"url\":\"https://b.com\",\"select\":\"p\"}\n```";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 2);
    }

    #[test]
    fn no_blocks_returns_empty() {
        let blocks = extract_scrape_blocks("plain text, no code blocks");
        assert!(blocks.is_empty());
    }

    // A block missing its closing fence must not be picked up.
    #[test]
    fn unclosed_block_ignored() {
        let blocks = extract_scrape_blocks("```scrape\n{\"url\":\"https://x.com\"}");
        assert!(blocks.is_empty());
    }

    // Fences with other info strings (e.g. ```bash) are skipped.
    #[test]
    fn non_scrape_block_ignored() {
        let text =
            "```bash\necho hi\n```\n```scrape\n{\"url\":\"https://x.com\",\"select\":\"h1\"}\n```";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 1);
        assert!(blocks[0].contains("x.com"));
    }

    // Block contents may span multiple lines and still parse as one JSON value.
    #[test]
    fn multiline_json_block() {
        let text =
            "```scrape\n{\n  \"url\": \"https://example.com\",\n  \"select\": \"h1\"\n}\n```";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 1);
        let instr: ScrapeInstruction = serde_json::from_str(blocks[0]).unwrap();
        assert_eq!(instr.url, "https://example.com");
    }
448
    // --- ScrapeInstruction parsing ---

    #[test]
    fn parse_valid_instruction() {
        let json = r#"{"url":"https://example.com","select":"h1","extract":"text","limit":5}"#;
        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
        assert_eq!(instr.url, "https://example.com");
        assert_eq!(instr.select, "h1");
        assert_eq!(instr.extract, "text");
        assert_eq!(instr.limit, Some(5));
    }

    // `extract` defaults to "text" and `limit` to `None` when omitted.
    #[test]
    fn parse_minimal_instruction() {
        let json = r#"{"url":"https://example.com","select":"p"}"#;
        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
        assert_eq!(instr.extract, "text");
        assert!(instr.limit.is_none());
    }

    // `extract` is stored verbatim; `ExtractMode::parse` interprets it later.
    #[test]
    fn parse_attr_extract() {
        let json = r#"{"url":"https://example.com","select":"a","extract":"attr:href"}"#;
        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
        assert_eq!(instr.extract, "attr:href");
    }

    #[test]
    fn parse_invalid_json_errors() {
        let result = serde_json::from_str::<ScrapeInstruction>("not json");
        assert!(result.is_err());
    }
481
    // --- ExtractMode ---

    #[test]
    fn extract_mode_text() {
        assert!(matches!(ExtractMode::parse("text"), ExtractMode::Text));
    }

    #[test]
    fn extract_mode_html() {
        assert!(matches!(ExtractMode::parse("html"), ExtractMode::Html));
    }

    // "attr:<name>" carries the attribute name through to the variant.
    #[test]
    fn extract_mode_attr() {
        let mode = ExtractMode::parse("attr:href");
        assert!(matches!(mode, ExtractMode::Attr(ref s) if s == "href"));
    }

    // Unrecognized modes fall back to `Text` rather than erroring.
    #[test]
    fn extract_mode_unknown_defaults_to_text() {
        assert!(matches!(ExtractMode::parse("unknown"), ExtractMode::Text));
    }
504
    // --- validate_url ---
    //
    // Covers the scheme allow-list and the literal-host private-range screen.
    // No DNS lookups happen here — that is `resolve_and_validate`'s job.

    #[test]
    fn valid_https_url() {
        assert!(validate_url("https://example.com").is_ok());
    }

    // Only https is allowed; plain http would leak the request in cleartext.
    #[test]
    fn http_rejected() {
        let err = validate_url("http://example.com").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ftp_rejected() {
        let err = validate_url("ftp://files.example.com").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn file_rejected() {
        let err = validate_url("file:///etc/passwd").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn invalid_url_rejected() {
        let err = validate_url("not a url").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn localhost_blocked() {
        let err = validate_url("https://localhost/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn loopback_ip_blocked() {
        let err = validate_url("https://127.0.0.1/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    // RFC 1918 ranges: 10/8, 172.16/12, 192.168/16.
    #[test]
    fn private_10_blocked() {
        let err = validate_url("https://10.0.0.1/api").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn private_172_blocked() {
        let err = validate_url("https://172.16.0.1/api").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn private_192_blocked() {
        let err = validate_url("https://192.168.1.1/api").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv6_loopback_blocked() {
        let err = validate_url("https://[::1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    // A routable public literal IP passes the pre-DNS screen.
    #[test]
    fn public_ip_allowed() {
        assert!(validate_url("https://93.184.216.34/page").is_ok());
    }
576
    // --- parse_and_extract ---

    #[test]
    fn extract_text_from_html() {
        let html = "<html><body><h1>Hello World</h1><p>Content</p></body></html>";
        let result = parse_and_extract(html, "h1", &ExtractMode::Text, 10).unwrap();
        assert_eq!(result, "Hello World");
    }

    // Multiple matches are newline-joined in document order.
    #[test]
    fn extract_multiple_elements() {
        let html = "<ul><li>A</li><li>B</li><li>C</li></ul>";
        let result = parse_and_extract(html, "li", &ExtractMode::Text, 10).unwrap();
        assert_eq!(result, "A\nB\nC");
    }

    #[test]
    fn extract_with_limit() {
        let html = "<ul><li>A</li><li>B</li><li>C</li></ul>";
        let result = parse_and_extract(html, "li", &ExtractMode::Text, 2).unwrap();
        assert_eq!(result, "A\nB");
    }

    #[test]
    fn extract_attr_href() {
        let html = r#"<a href="https://example.com">Link</a>"#;
        let result =
            parse_and_extract(html, "a", &ExtractMode::Attr("href".to_owned()), 10).unwrap();
        assert_eq!(result, "https://example.com");
    }

    // Html mode returns inner markup, not outer tags.
    #[test]
    fn extract_inner_html() {
        let html = "<div><span>inner</span></div>";
        let result = parse_and_extract(html, "div", &ExtractMode::Html, 10).unwrap();
        assert!(result.contains("<span>inner</span>"));
    }

    // Zero matches is not an error — a readable message is returned instead.
    #[test]
    fn no_matches_returns_message() {
        let html = "<html><body><p>text</p></body></html>";
        let result = parse_and_extract(html, "h1", &ExtractMode::Text, 10).unwrap();
        assert!(result.starts_with("No results for selector:"));
    }

    // Whitespace-only extractions are dropped from the result set.
    #[test]
    fn empty_text_skipped() {
        let html = "<ul><li>  </li><li>A</li></ul>";
        let result = parse_and_extract(html, "li", &ExtractMode::Text, 10).unwrap();
        assert_eq!(result, "A");
    }

    #[test]
    fn invalid_selector_errors() {
        let html = "<html><body></body></html>";
        let result = parse_and_extract(html, "[[[invalid", &ExtractMode::Text, 10);
        assert!(result.is_err());
    }

    #[test]
    fn empty_html_returns_no_results() {
        let result = parse_and_extract("", "h1", &ExtractMode::Text, 10).unwrap();
        assert!(result.starts_with("No results for selector:"));
    }

    // Child combinator: only the nested span matches `div > span`.
    #[test]
    fn nested_selector() {
        let html = "<div><span>inner</span></div><span>outer</span>";
        let result = parse_and_extract(html, "div > span", &ExtractMode::Text, 10).unwrap();
        assert_eq!(result, "inner");
    }

    // A missing attribute extracts as "", which is then dropped as empty,
    // leaving the "no results" message.
    #[test]
    fn attr_missing_returns_empty() {
        let html = r#"<a>No href</a>"#;
        let result =
            parse_and_extract(html, "a", &ExtractMode::Attr("href".to_owned()), 10).unwrap();
        assert!(result.starts_with("No results for selector:"));
    }

    #[test]
    fn extract_html_mode() {
        let html = "<div><b>bold</b> text</div>";
        let result = parse_and_extract(html, "div", &ExtractMode::Html, 10).unwrap();
        assert!(result.contains("<b>bold</b>"));
    }

    // `limit: 0` takes no tags at all, producing the "no results" message.
    #[test]
    fn limit_zero_returns_no_results() {
        let html = "<ul><li>A</li><li>B</li></ul>";
        let result = parse_and_extract(html, "li", &ExtractMode::Text, 0).unwrap();
        assert!(result.starts_with("No results for selector:"));
    }
670
    // --- validate_url edge cases ---

    #[test]
    fn url_with_port_allowed() {
        assert!(validate_url("https://example.com:8443/path").is_ok());
    }

    // 169.254.0.0/16 — IPv4 link-local (includes cloud metadata endpoints).
    #[test]
    fn link_local_ip_blocked() {
        let err = validate_url("https://169.254.1.1/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    // `Url::parse` requires an explicit scheme; bare host/path fails to parse.
    #[test]
    fn url_no_scheme_rejected() {
        let err = validate_url("example.com/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn unspecified_ipv4_blocked() {
        let err = validate_url("https://0.0.0.0/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn broadcast_ipv4_blocked() {
        let err = validate_url("https://255.255.255.255/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    // fe80::/10 — IPv6 link-local.
    #[test]
    fn ipv6_link_local_blocked() {
        let err = validate_url("https://[fe80::1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    // fc00::/7 — IPv6 unique-local (fd12:: falls inside it).
    #[test]
    fn ipv6_unique_local_blocked() {
        let err = validate_url("https://[fd12::1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    // IPv4-mapped IPv6 must be judged by the embedded IPv4 address.
    #[test]
    fn ipv4_mapped_ipv6_loopback_blocked() {
        let err = validate_url("https://[::ffff:127.0.0.1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv4_mapped_ipv6_private_blocked() {
        let err = validate_url("https://[::ffff:10.0.0.1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }
725
    // --- WebScrapeExecutor (no-network) ---
    //
    // These exercise the executor's validation paths, which all fail before
    // any outbound connection is attempted (except the unreachable-host test,
    // which targets a reserved documentation address with a short timeout).

    #[tokio::test]
    async fn executor_no_blocks_returns_none() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let result = executor.execute("plain text").await;
        assert!(result.unwrap().is_none());
    }

    #[tokio::test]
    async fn executor_invalid_json_errors() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\nnot json\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Execution(_))));
    }

    // http:// fails the scheme allow-list before any network activity.
    #[tokio::test]
    async fn executor_blocked_url_errors() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"http://example.com\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Blocked { .. })));
    }

    #[tokio::test]
    async fn executor_private_ip_blocked() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"https://192.168.1.1/api\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Blocked { .. })));
    }

    // 192.0.2.1 is TEST-NET-1 (reserved for documentation): classified as
    // public, so it passes validation but the connection itself fails fast.
    #[tokio::test]
    async fn executor_unreachable_host_returns_error() {
        let config = ScrapeConfig {
            timeout: 1,
            max_body_bytes: 1_048_576,
        };
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"https://192.0.2.1:1/page\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Execution(_))));
    }

    #[tokio::test]
    async fn executor_localhost_url_blocked() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"https://localhost:9999/api\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Blocked { .. })));
    }

    #[tokio::test]
    async fn executor_empty_text_returns_none() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let result = executor.execute("").await;
        assert!(result.unwrap().is_none());
    }

    // Fail-fast semantics: the first blocked URL aborts the whole batch.
    #[tokio::test]
    async fn executor_multiple_blocks_first_blocked() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"http://evil.com\",\"select\":\"h1\"}\n```\n\
             ```scrape\n{\"url\":\"https://ok.com\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(result.is_err());
    }
801
    #[test]
    fn validate_url_empty_string() {
        let err = validate_url("").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    // Dangerous non-fetch schemes are rejected by the https-only rule.
    #[test]
    fn validate_url_javascript_scheme_blocked() {
        let err = validate_url("javascript:alert(1)").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn validate_url_data_scheme_blocked() {
        let err = validate_url("data:text/html,<h1>hi</h1>").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    // --- is_private_host (direct) ---

    #[test]
    fn is_private_host_public_domain_is_false() {
        let host: url::Host<&str> = url::Host::Domain("example.com");
        assert!(!is_private_host(&host));
    }

    #[test]
    fn is_private_host_localhost_is_true() {
        let host: url::Host<&str> = url::Host::Domain("localhost");
        assert!(is_private_host(&host));
    }

    #[test]
    fn is_private_host_ipv6_unspecified_is_true() {
        let host = url::Host::Ipv6(std::net::Ipv6Addr::UNSPECIFIED);
        assert!(is_private_host(&host));
    }

    // 2001:db8::/32 is the IPv6 documentation range — public by classification.
    #[test]
    fn is_private_host_public_ipv6_is_false() {
        let host = url::Host::Ipv6("2001:db8::1".parse().unwrap());
        assert!(!is_private_host(&host));
    }
843
    // --- fetch_html redirect logic: wiremock HTTP server tests ---
    //
    // These tests use a local wiremock server to exercise the redirect-following logic
    // in `fetch_html` without requiring an external HTTPS connection. The server binds to
    // 127.0.0.1, and tests call `fetch_html` directly (bypassing `validate_url`) to avoid
    // the SSRF guard that would otherwise block loopback connections.

    /// Helper: returns executor + (server_url, server_addr) from a running wiremock mock server.
    /// The server address is passed to `fetch_html` via `resolve_to_addrs` so the client
    /// connects to the mock instead of doing a real DNS lookup.
    // Constructs the executor directly (not via `new`) so no `ScrapeConfig` is needed.
    async fn mock_server_executor() -> (WebScrapeExecutor, wiremock::MockServer) {
        let server = wiremock::MockServer::start().await;
        let executor = WebScrapeExecutor {
            timeout: Duration::from_secs(5),
            max_body_bytes: 1_048_576,
        };
        (executor, server)
    }

    /// Parses the mock server's URI into (host_str, socket_addr) for use with `build_client`.
    // NOTE(review): the port 80 fallback assumes wiremock always embeds an
    // explicit port in its URI (it binds an ephemeral port) — confirm.
    fn server_host_and_addr(server: &wiremock::MockServer) -> (String, Vec<std::net::SocketAddr>) {
        let uri = server.uri();
        let url = Url::parse(&uri).unwrap();
        let host = url.host_str().unwrap_or("127.0.0.1").to_owned();
        let port = url.port().unwrap_or(80);
        let addr: std::net::SocketAddr = format!("{host}:{port}").parse().unwrap();
        (host, vec![addr])
    }
872
873    /// Test-only redirect follower that mimics `fetch_html`'s loop but skips `validate_url` /
874    /// `resolve_and_validate`. This lets us exercise the redirect-counting and
875    /// missing-Location logic against a plain HTTP wiremock server.
876    async fn follow_redirects_raw(
877        executor: &WebScrapeExecutor,
878        start_url: &str,
879        host: &str,
880        addrs: &[std::net::SocketAddr],
881    ) -> Result<String, ToolError> {
882        const MAX_REDIRECTS: usize = 3;
883        let mut current_url = start_url.to_owned();
884        let mut current_host = host.to_owned();
885        let mut current_addrs = addrs.to_vec();
886
887        for hop in 0..=MAX_REDIRECTS {
888            let client = executor.build_client(&current_host, &current_addrs);
889            let resp = client
890                .get(&current_url)
891                .send()
892                .await
893                .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
894
895            let status = resp.status();
896
897            if status.is_redirection() {
898                if hop == MAX_REDIRECTS {
899                    return Err(ToolError::Execution(std::io::Error::other(
900                        "too many redirects",
901                    )));
902                }
903
904                let location = resp
905                    .headers()
906                    .get(reqwest::header::LOCATION)
907                    .and_then(|v| v.to_str().ok())
908                    .ok_or_else(|| {
909                        ToolError::Execution(std::io::Error::other("redirect with no Location"))
910                    })?;
911
912                let base = Url::parse(&current_url)
913                    .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
914                let next_url = base
915                    .join(location)
916                    .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
917
918                // Re-use same host/addrs (mock server is always the same endpoint).
919                current_url = next_url.to_string();
920                // Preserve host/addrs as-is since the mock server doesn't change.
921                let _ = &mut current_host;
922                let _ = &mut current_addrs;
923                continue;
924            }
925
926            if !status.is_success() {
927                return Err(ToolError::Execution(std::io::Error::other(format!(
928                    "HTTP {status}",
929                ))));
930            }
931
932            let bytes = resp
933                .bytes()
934                .await
935                .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
936
937            if bytes.len() > executor.max_body_bytes {
938                return Err(ToolError::Execution(std::io::Error::other(format!(
939                    "response too large: {} bytes (max: {})",
940                    bytes.len(),
941                    executor.max_body_bytes,
942                ))));
943            }
944
945            return String::from_utf8(bytes.to_vec())
946                .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())));
947        }
948
949        Err(ToolError::Execution(std::io::Error::other(
950            "too many redirects",
951        )))
952    }
953
954    #[tokio::test]
955    async fn fetch_html_success_returns_body() {
956        use wiremock::matchers::{method, path};
957        use wiremock::{Mock, ResponseTemplate};
958
959        let (executor, server) = mock_server_executor().await;
960        Mock::given(method("GET"))
961            .and(path("/page"))
962            .respond_with(ResponseTemplate::new(200).set_body_string("<h1>OK</h1>"))
963            .mount(&server)
964            .await;
965
966        let (host, addrs) = server_host_and_addr(&server);
967        let url = format!("{}/page", server.uri());
968        let result = executor.fetch_html(&url, &host, &addrs).await;
969        assert!(result.is_ok(), "expected Ok, got: {result:?}");
970        assert_eq!(result.unwrap(), "<h1>OK</h1>");
971    }
972
973    #[tokio::test]
974    async fn fetch_html_non_2xx_returns_error() {
975        use wiremock::matchers::{method, path};
976        use wiremock::{Mock, ResponseTemplate};
977
978        let (executor, server) = mock_server_executor().await;
979        Mock::given(method("GET"))
980            .and(path("/forbidden"))
981            .respond_with(ResponseTemplate::new(403))
982            .mount(&server)
983            .await;
984
985        let (host, addrs) = server_host_and_addr(&server);
986        let url = format!("{}/forbidden", server.uri());
987        let result = executor.fetch_html(&url, &host, &addrs).await;
988        assert!(result.is_err());
989        let msg = result.unwrap_err().to_string();
990        assert!(msg.contains("403"), "expected 403 in error: {msg}");
991    }
992
993    #[tokio::test]
994    async fn fetch_html_404_returns_error() {
995        use wiremock::matchers::{method, path};
996        use wiremock::{Mock, ResponseTemplate};
997
998        let (executor, server) = mock_server_executor().await;
999        Mock::given(method("GET"))
1000            .and(path("/missing"))
1001            .respond_with(ResponseTemplate::new(404))
1002            .mount(&server)
1003            .await;
1004
1005        let (host, addrs) = server_host_and_addr(&server);
1006        let url = format!("{}/missing", server.uri());
1007        let result = executor.fetch_html(&url, &host, &addrs).await;
1008        assert!(result.is_err());
1009        let msg = result.unwrap_err().to_string();
1010        assert!(msg.contains("404"), "expected 404 in error: {msg}");
1011    }
1012
1013    #[tokio::test]
1014    async fn fetch_html_redirect_no_location_returns_error() {
1015        use wiremock::matchers::{method, path};
1016        use wiremock::{Mock, ResponseTemplate};
1017
1018        let (executor, server) = mock_server_executor().await;
1019        // 302 with no Location header
1020        Mock::given(method("GET"))
1021            .and(path("/redirect-no-loc"))
1022            .respond_with(ResponseTemplate::new(302))
1023            .mount(&server)
1024            .await;
1025
1026        let (host, addrs) = server_host_and_addr(&server);
1027        let url = format!("{}/redirect-no-loc", server.uri());
1028        let result = executor.fetch_html(&url, &host, &addrs).await;
1029        assert!(result.is_err());
1030        let msg = result.unwrap_err().to_string();
1031        assert!(
1032            msg.contains("Location") || msg.contains("location"),
1033            "expected Location-related error: {msg}"
1034        );
1035    }
1036
1037    #[tokio::test]
1038    async fn fetch_html_single_redirect_followed() {
1039        use wiremock::matchers::{method, path};
1040        use wiremock::{Mock, ResponseTemplate};
1041
1042        let (executor, server) = mock_server_executor().await;
1043        let final_url = format!("{}/final", server.uri());
1044
1045        Mock::given(method("GET"))
1046            .and(path("/start"))
1047            .respond_with(ResponseTemplate::new(302).insert_header("location", final_url.as_str()))
1048            .mount(&server)
1049            .await;
1050
1051        Mock::given(method("GET"))
1052            .and(path("/final"))
1053            .respond_with(ResponseTemplate::new(200).set_body_string("<p>final</p>"))
1054            .mount(&server)
1055            .await;
1056
1057        let (host, addrs) = server_host_and_addr(&server);
1058        let url = format!("{}/start", server.uri());
1059        let result = follow_redirects_raw(&executor, &url, &host, &addrs).await;
1060        assert!(result.is_ok(), "single redirect should succeed: {result:?}");
1061        assert_eq!(result.unwrap(), "<p>final</p>");
1062    }
1063
1064    #[tokio::test]
1065    async fn fetch_html_three_redirects_allowed() {
1066        use wiremock::matchers::{method, path};
1067        use wiremock::{Mock, ResponseTemplate};
1068
1069        let (executor, server) = mock_server_executor().await;
1070        let hop2 = format!("{}/hop2", server.uri());
1071        let hop3 = format!("{}/hop3", server.uri());
1072        let final_dest = format!("{}/done", server.uri());
1073
1074        Mock::given(method("GET"))
1075            .and(path("/hop1"))
1076            .respond_with(ResponseTemplate::new(301).insert_header("location", hop2.as_str()))
1077            .mount(&server)
1078            .await;
1079        Mock::given(method("GET"))
1080            .and(path("/hop2"))
1081            .respond_with(ResponseTemplate::new(301).insert_header("location", hop3.as_str()))
1082            .mount(&server)
1083            .await;
1084        Mock::given(method("GET"))
1085            .and(path("/hop3"))
1086            .respond_with(ResponseTemplate::new(301).insert_header("location", final_dest.as_str()))
1087            .mount(&server)
1088            .await;
1089        Mock::given(method("GET"))
1090            .and(path("/done"))
1091            .respond_with(ResponseTemplate::new(200).set_body_string("<p>done</p>"))
1092            .mount(&server)
1093            .await;
1094
1095        let (host, addrs) = server_host_and_addr(&server);
1096        let url = format!("{}/hop1", server.uri());
1097        let result = follow_redirects_raw(&executor, &url, &host, &addrs).await;
1098        assert!(result.is_ok(), "3 redirects should succeed: {result:?}");
1099        assert_eq!(result.unwrap(), "<p>done</p>");
1100    }
1101
1102    #[tokio::test]
1103    async fn fetch_html_four_redirects_rejected() {
1104        use wiremock::matchers::{method, path};
1105        use wiremock::{Mock, ResponseTemplate};
1106
1107        let (executor, server) = mock_server_executor().await;
1108        let hop2 = format!("{}/r2", server.uri());
1109        let hop3 = format!("{}/r3", server.uri());
1110        let hop4 = format!("{}/r4", server.uri());
1111        let hop5 = format!("{}/r5", server.uri());
1112
1113        for (from, to) in [
1114            ("/r1", &hop2),
1115            ("/r2", &hop3),
1116            ("/r3", &hop4),
1117            ("/r4", &hop5),
1118        ] {
1119            Mock::given(method("GET"))
1120                .and(path(from))
1121                .respond_with(ResponseTemplate::new(301).insert_header("location", to.as_str()))
1122                .mount(&server)
1123                .await;
1124        }
1125
1126        let (host, addrs) = server_host_and_addr(&server);
1127        let url = format!("{}/r1", server.uri());
1128        let result = follow_redirects_raw(&executor, &url, &host, &addrs).await;
1129        assert!(result.is_err(), "4 redirects should be rejected");
1130        let msg = result.unwrap_err().to_string();
1131        assert!(
1132            msg.contains("redirect"),
1133            "expected redirect-related error: {msg}"
1134        );
1135    }
1136
1137    #[tokio::test]
1138    async fn fetch_html_body_too_large_returns_error() {
1139        use wiremock::matchers::{method, path};
1140        use wiremock::{Mock, ResponseTemplate};
1141
1142        let small_limit_executor = WebScrapeExecutor {
1143            timeout: Duration::from_secs(5),
1144            max_body_bytes: 10,
1145        };
1146        let server = wiremock::MockServer::start().await;
1147        Mock::given(method("GET"))
1148            .and(path("/big"))
1149            .respond_with(
1150                ResponseTemplate::new(200)
1151                    .set_body_string("this body is definitely longer than ten bytes"),
1152            )
1153            .mount(&server)
1154            .await;
1155
1156        let (host, addrs) = server_host_and_addr(&server);
1157        let url = format!("{}/big", server.uri());
1158        let result = small_limit_executor.fetch_html(&url, &host, &addrs).await;
1159        assert!(result.is_err());
1160        let msg = result.unwrap_err().to_string();
1161        assert!(msg.contains("too large"), "expected too-large error: {msg}");
1162    }
1163
1164    #[test]
1165    fn extract_scrape_blocks_empty_block_content() {
1166        let text = "```scrape\n\n```";
1167        let blocks = extract_scrape_blocks(text);
1168        assert_eq!(blocks.len(), 1);
1169        assert!(blocks[0].is_empty());
1170    }
1171
1172    #[test]
1173    fn extract_scrape_blocks_whitespace_only() {
1174        let text = "```scrape\n   \n```";
1175        let blocks = extract_scrape_blocks(text);
1176        assert_eq!(blocks.len(), 1);
1177    }
1178
1179    #[test]
1180    fn parse_and_extract_multiple_selectors() {
1181        let html = "<div><h1>Title</h1><p>Para</p></div>";
1182        let result = parse_and_extract(html, "h1, p", &ExtractMode::Text, 10).unwrap();
1183        assert!(result.contains("Title"));
1184        assert!(result.contains("Para"));
1185    }
1186
1187    #[test]
1188    fn webscrape_executor_new_with_custom_config() {
1189        let config = ScrapeConfig {
1190            timeout: 60,
1191            max_body_bytes: 512,
1192        };
1193        let executor = WebScrapeExecutor::new(&config);
1194        assert_eq!(executor.max_body_bytes, 512);
1195    }
1196
1197    #[test]
1198    fn webscrape_executor_debug() {
1199        let config = ScrapeConfig::default();
1200        let executor = WebScrapeExecutor::new(&config);
1201        let dbg = format!("{executor:?}");
1202        assert!(dbg.contains("WebScrapeExecutor"));
1203    }
1204
1205    #[test]
1206    fn extract_mode_attr_empty_name() {
1207        let mode = ExtractMode::parse("attr:");
1208        assert!(matches!(mode, ExtractMode::Attr(ref s) if s.is_empty()));
1209    }
1210
1211    #[test]
1212    fn default_extract_returns_text() {
1213        assert_eq!(default_extract(), "text");
1214    }
1215
1216    #[test]
1217    fn scrape_instruction_debug() {
1218        let json = r#"{"url":"https://example.com","select":"h1"}"#;
1219        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
1220        let dbg = format!("{instr:?}");
1221        assert!(dbg.contains("ScrapeInstruction"));
1222    }
1223
1224    #[test]
1225    fn extract_mode_debug() {
1226        let mode = ExtractMode::Text;
1227        let dbg = format!("{mode:?}");
1228        assert!(dbg.contains("Text"));
1229    }
1230
1231    // --- fetch_html redirect logic: constant and validation unit tests ---
1232
    /// MAX_REDIRECTS is 3; the 4th redirect attempt must be rejected.
    /// Verify the boundary is correct by inspecting the constant value.
    #[test]
    fn max_redirects_constant_is_three() {
        // fetch_html iterates `for hop in 0..=MAX_REDIRECTS` and errors when a redirect
        // arrives at hop == MAX_REDIRECTS: hops 0, 1 and 2 may redirect; hop 3 fails.
        // The real constant is private to fetch_html, so this local copy only documents
        // the agreed budget — keep the two in sync if the limit ever changes.
        const MAX_REDIRECTS: usize = 3;
        assert_eq!(MAX_REDIRECTS, 3, "fetch_html allows exactly 3 redirects");
    }
1243
1244    /// Verifies that a Location-less redirect would produce an error string containing the
1245    /// expected message, matching the error path in fetch_html.
1246    #[test]
1247    fn redirect_no_location_error_message() {
1248        let err = std::io::Error::other("redirect with no Location");
1249        assert!(err.to_string().contains("redirect with no Location"));
1250    }
1251
1252    /// Verifies that a too-many-redirects condition produces the expected error string.
1253    #[test]
1254    fn too_many_redirects_error_message() {
1255        let err = std::io::Error::other("too many redirects");
1256        assert!(err.to_string().contains("too many redirects"));
1257    }
1258
1259    /// Verifies that a non-2xx HTTP status produces an error message with the status code.
1260    #[test]
1261    fn non_2xx_status_error_format() {
1262        let status = reqwest::StatusCode::FORBIDDEN;
1263        let msg = format!("HTTP {status}");
1264        assert!(msg.contains("403"));
1265    }
1266
1267    /// Verifies that a 404 response status code formats into the expected error message.
1268    #[test]
1269    fn not_found_status_error_format() {
1270        let status = reqwest::StatusCode::NOT_FOUND;
1271        let msg = format!("HTTP {status}");
1272        assert!(msg.contains("404"));
1273    }
1274
1275    /// Verifies relative redirect resolution for same-host paths (simulates Location: /other).
1276    #[test]
1277    fn relative_redirect_same_host_path() {
1278        let base = Url::parse("https://example.com/current").unwrap();
1279        let resolved = base.join("/other").unwrap();
1280        assert_eq!(resolved.as_str(), "https://example.com/other");
1281    }
1282
1283    /// Verifies relative redirect resolution preserves scheme and host.
1284    #[test]
1285    fn relative_redirect_relative_path() {
1286        let base = Url::parse("https://example.com/a/b").unwrap();
1287        let resolved = base.join("c").unwrap();
1288        assert_eq!(resolved.as_str(), "https://example.com/a/c");
1289    }
1290
1291    /// Verifies that an absolute redirect URL overrides base URL completely.
1292    #[test]
1293    fn absolute_redirect_overrides_base() {
1294        let base = Url::parse("https://example.com/page").unwrap();
1295        let resolved = base.join("https://other.com/target").unwrap();
1296        assert_eq!(resolved.as_str(), "https://other.com/target");
1297    }
1298
1299    /// Verifies that a redirect Location of http:// (downgrade) is rejected.
1300    #[test]
1301    fn redirect_http_downgrade_rejected() {
1302        let location = "http://example.com/page";
1303        let base = Url::parse("https://example.com/start").unwrap();
1304        let next = base.join(location).unwrap();
1305        let err = validate_url(next.as_str()).unwrap_err();
1306        assert!(matches!(err, ToolError::Blocked { .. }));
1307    }
1308
1309    /// Verifies that a redirect to a private IP literal is blocked.
1310    #[test]
1311    fn redirect_location_private_ip_blocked() {
1312        let location = "https://192.168.100.1/admin";
1313        let base = Url::parse("https://example.com/start").unwrap();
1314        let next = base.join(location).unwrap();
1315        let err = validate_url(next.as_str()).unwrap_err();
1316        assert!(matches!(err, ToolError::Blocked { .. }));
1317        let cmd = match err {
1318            ToolError::Blocked { command } => command,
1319            _ => panic!("expected Blocked"),
1320        };
1321        assert!(
1322            cmd.contains("private") || cmd.contains("scheme"),
1323            "error message should describe the block reason: {cmd}"
1324        );
1325    }
1326
1327    /// Verifies that a redirect to a .internal domain is blocked.
1328    #[test]
1329    fn redirect_location_internal_domain_blocked() {
1330        let location = "https://metadata.internal/latest/meta-data/";
1331        let base = Url::parse("https://example.com/start").unwrap();
1332        let next = base.join(location).unwrap();
1333        let err = validate_url(next.as_str()).unwrap_err();
1334        assert!(matches!(err, ToolError::Blocked { .. }));
1335    }
1336
1337    /// Verifies that a chain of 3 valid public redirects passes validate_url at every hop.
1338    #[test]
1339    fn redirect_chain_three_hops_all_public() {
1340        let hops = [
1341            "https://redirect1.example.com/hop1",
1342            "https://redirect2.example.com/hop2",
1343            "https://destination.example.com/final",
1344        ];
1345        for hop in hops {
1346            assert!(validate_url(hop).is_ok(), "expected ok for {hop}");
1347        }
1348    }
1349
1350    // --- SSRF redirect chain defense ---
1351
1352    /// Verifies that a redirect Location pointing to a private IP is rejected by validate_url
1353    /// before any connection attempt — simulating the validation step inside fetch_html.
1354    #[test]
1355    fn redirect_to_private_ip_rejected_by_validate_url() {
1356        // These would appear as Location headers in a redirect response.
1357        let private_targets = [
1358            "https://127.0.0.1/secret",
1359            "https://10.0.0.1/internal",
1360            "https://192.168.1.1/admin",
1361            "https://172.16.0.1/data",
1362            "https://[::1]/path",
1363            "https://[fe80::1]/path",
1364            "https://localhost/path",
1365            "https://service.internal/api",
1366        ];
1367        for target in private_targets {
1368            let result = validate_url(target);
1369            assert!(result.is_err(), "expected error for {target}");
1370            assert!(
1371                matches!(result.unwrap_err(), ToolError::Blocked { .. }),
1372                "expected Blocked for {target}"
1373            );
1374        }
1375    }
1376
1377    /// Verifies that relative redirect URLs are resolved correctly before validation.
1378    #[test]
1379    fn redirect_relative_url_resolves_correctly() {
1380        let base = Url::parse("https://example.com/page").unwrap();
1381        let relative = "/other";
1382        let resolved = base.join(relative).unwrap();
1383        assert_eq!(resolved.as_str(), "https://example.com/other");
1384    }
1385
    /// Verifies that a redirect Location using plain http:// (a TLS downgrade) is rejected
    /// by the scheme check. (The URL here is an absolute URL, not a protocol-relative one.)
    #[test]
    fn redirect_to_http_rejected() {
        let err = validate_url("http://example.com/page").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }
1392
1393    #[test]
1394    fn ipv4_mapped_ipv6_link_local_blocked() {
1395        let err = validate_url("https://[::ffff:169.254.0.1]/path").unwrap_err();
1396        assert!(matches!(err, ToolError::Blocked { .. }));
1397    }
1398
1399    #[test]
1400    fn ipv4_mapped_ipv6_public_allowed() {
1401        assert!(validate_url("https://[::ffff:93.184.216.34]/path").is_ok());
1402    }
1403
1404    #[test]
1405    fn tool_definitions_returns_web_scrape() {
1406        let config = ScrapeConfig::default();
1407        let executor = WebScrapeExecutor::new(&config);
1408        let defs = executor.tool_definitions();
1409        assert_eq!(defs.len(), 1);
1410        assert_eq!(defs[0].id, "web_scrape");
1411        assert_eq!(
1412            defs[0].invocation,
1413            crate::registry::InvocationHint::FencedBlock("scrape")
1414        );
1415    }
1416
1417    #[test]
1418    fn tool_definitions_schema_has_all_params() {
1419        let config = ScrapeConfig::default();
1420        let executor = WebScrapeExecutor::new(&config);
1421        let defs = executor.tool_definitions();
1422        let obj = defs[0].schema.as_object().unwrap();
1423        let props = obj["properties"].as_object().unwrap();
1424        assert!(props.contains_key("url"));
1425        assert!(props.contains_key("select"));
1426        assert!(props.contains_key("extract"));
1427        assert!(props.contains_key("limit"));
1428        let req = obj["required"].as_array().unwrap();
1429        assert!(req.iter().any(|v| v.as_str() == Some("url")));
1430        assert!(req.iter().any(|v| v.as_str() == Some("select")));
1431        assert!(!req.iter().any(|v| v.as_str() == Some("extract")));
1432    }
1433
1434    // --- is_private_host: new domain checks (AUD-02) ---
1435
1436    #[test]
1437    fn subdomain_localhost_blocked() {
1438        let host: url::Host<&str> = url::Host::Domain("foo.localhost");
1439        assert!(is_private_host(&host));
1440    }
1441
1442    #[test]
1443    fn internal_tld_blocked() {
1444        let host: url::Host<&str> = url::Host::Domain("service.internal");
1445        assert!(is_private_host(&host));
1446    }
1447
1448    #[test]
1449    fn local_tld_blocked() {
1450        let host: url::Host<&str> = url::Host::Domain("printer.local");
1451        assert!(is_private_host(&host));
1452    }
1453
1454    #[test]
1455    fn public_domain_not_blocked() {
1456        let host: url::Host<&str> = url::Host::Domain("example.com");
1457        assert!(!is_private_host(&host));
1458    }
1459
1460    // --- resolve_and_validate: private IP rejection ---
1461
1462    #[tokio::test]
1463    async fn resolve_loopback_rejected() {
1464        // 127.0.0.1 resolves directly (literal IP in DNS query)
1465        let url = url::Url::parse("https://127.0.0.1/path").unwrap();
1466        // validate_url catches this before resolve_and_validate, but test directly
1467        let result = resolve_and_validate(&url).await;
1468        assert!(
1469            result.is_err(),
1470            "loopback IP must be rejected by resolve_and_validate"
1471        );
1472        let err = result.unwrap_err();
1473        assert!(matches!(err, crate::executor::ToolError::Blocked { .. }));
1474    }
1475
1476    #[tokio::test]
1477    async fn resolve_private_10_rejected() {
1478        let url = url::Url::parse("https://10.0.0.1/path").unwrap();
1479        let result = resolve_and_validate(&url).await;
1480        assert!(result.is_err());
1481        assert!(matches!(
1482            result.unwrap_err(),
1483            crate::executor::ToolError::Blocked { .. }
1484        ));
1485    }
1486
1487    #[tokio::test]
1488    async fn resolve_private_192_rejected() {
1489        let url = url::Url::parse("https://192.168.1.1/path").unwrap();
1490        let result = resolve_and_validate(&url).await;
1491        assert!(result.is_err());
1492        assert!(matches!(
1493            result.unwrap_err(),
1494            crate::executor::ToolError::Blocked { .. }
1495        ));
1496    }
1497
1498    #[tokio::test]
1499    async fn resolve_ipv6_loopback_rejected() {
1500        let url = url::Url::parse("https://[::1]/path").unwrap();
1501        let result = resolve_and_validate(&url).await;
1502        assert!(result.is_err());
1503        assert!(matches!(
1504            result.unwrap_err(),
1505            crate::executor::ToolError::Blocked { .. }
1506        ));
1507    }
1508
1509    #[tokio::test]
1510    async fn resolve_no_host_returns_ok() {
1511        // URL without a resolvable host — should pass through
1512        let url = url::Url::parse("https://example.com/path").unwrap();
1513        // We can't do a live DNS test, but we can verify a URL with no host
1514        let url_no_host = url::Url::parse("data:text/plain,hello").unwrap();
1515        // data: URLs have no host; resolve_and_validate should return Ok with empty addrs
1516        let result = resolve_and_validate(&url_no_host).await;
1517        assert!(result.is_ok());
1518        let (host, addrs) = result.unwrap();
1519        assert!(host.is_empty());
1520        assert!(addrs.is_empty());
1521        drop(url);
1522        drop(url_no_host);
1523    }
1524}