Skip to main content

zeph_tools/
scrape.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4use std::net::{IpAddr, SocketAddr};
5use std::time::Duration;
6
7use schemars::JsonSchema;
8use serde::Deserialize;
9use url::Url;
10
11use crate::config::ScrapeConfig;
12use crate::executor::{ToolCall, ToolError, ToolExecutor, ToolOutput, deserialize_params};
13
// A single scrape request, deserialized either from a ```scrape fenced
// block in an LLM response or from structured tool-call params.
// NOTE: field doc comments below double as schema descriptions via
// `schemars`, so they are kept short and user-facing.
#[derive(Debug, Deserialize, JsonSchema)]
struct ScrapeInstruction {
    /// HTTPS URL to scrape
    url: String,
    /// CSS selector
    select: String,
    /// Extract mode: text, html, or attr:<name>
    #[serde(default = "default_extract")]
    extract: String,
    /// Max results to return
    limit: Option<usize>,
}
26
/// Serde default for `ScrapeInstruction::extract`: the `"text"` mode.
fn default_extract() -> String {
    String::from("text")
}
30
/// How to pull a value out of each element matched by the CSS selector.
#[derive(Debug)]
enum ExtractMode {
    /// Concatenated text content of the element.
    Text,
    /// Inner HTML markup of the element.
    Html,
    /// Value of the named attribute.
    Attr(String),
}

impl ExtractMode {
    /// Parses an extract-mode string.
    ///
    /// `"attr:<name>"` selects the named attribute; `"html"` selects inner
    /// HTML; anything else (including `"text"`) falls back to `Text`.
    fn parse(s: &str) -> Self {
        if let Some(attr_name) = s.strip_prefix("attr:") {
            return Self::Attr(attr_name.to_owned());
        }
        if s == "html" { Self::Html } else { Self::Text }
    }
}
50
/// Extracts data from web pages via CSS selectors.
///
/// Detects ` ```scrape ` blocks in LLM responses containing JSON instructions,
/// fetches the URL, and parses HTML with `scrape-core`.
#[derive(Debug)]
pub struct WebScrapeExecutor {
    // Per-request HTTP timeout, built from `ScrapeConfig::timeout` (seconds).
    timeout: Duration,
    // Upper bound on accepted response body size; larger bodies are rejected.
    max_body_bytes: usize,
}
60
impl WebScrapeExecutor {
    /// Builds an executor from scrape configuration: `timeout` is in
    /// seconds, `max_body_bytes` caps the accepted response size.
    #[must_use]
    pub fn new(config: &ScrapeConfig) -> Self {
        Self {
            timeout: Duration::from_secs(config.timeout),
            max_body_bytes: config.max_body_bytes,
        }
    }

    /// Builds a one-shot HTTP client for a single request hop.
    ///
    /// Redirects are disabled so each hop can be re-validated by the caller,
    /// and `resolve_to_addrs` pins connections for `host` to the already
    /// validated `addrs`, closing the DNS TOCTOU window.
    fn build_client(&self, host: &str, addrs: &[SocketAddr]) -> reqwest::Client {
        let mut builder = reqwest::Client::builder()
            .timeout(self.timeout)
            .redirect(reqwest::redirect::Policy::none());
        builder = builder.resolve_to_addrs(host, addrs);
        // NOTE(review): if `build()` fails, this falls back to a *default*
        // client — one that follows redirects and is NOT pinned to `addrs`,
        // silently dropping the SSRF protections above. Build failure is rare
        // (e.g. TLS backend init), but consider propagating the error instead
        // of falling back.
        builder.build().unwrap_or_default()
    }
}
78
impl ToolExecutor for WebScrapeExecutor {
    /// Advertises the single `web_scrape` tool, invoked either as a
    /// structured tool call or via a ```scrape fenced block.
    fn tool_definitions(&self) -> Vec<crate::registry::ToolDef> {
        use crate::registry::{InvocationHint, ToolDef};
        vec![ToolDef {
            id: "web_scrape".into(),
            description: "Scrape data from a web page via CSS selectors".into(),
            schema: schemars::schema_for!(ScrapeInstruction),
            invocation: InvocationHint::FencedBlock("scrape"),
        }]
    }

    /// Scans `response` for ```scrape blocks and runs each instruction in
    /// order, joining the per-block results with blank lines.
    ///
    /// Returns `Ok(None)` when no scrape blocks are present. Fails fast:
    /// the first invalid JSON block or failed scrape aborts the whole batch.
    async fn execute(&self, response: &str) -> Result<Option<ToolOutput>, ToolError> {
        let blocks = extract_scrape_blocks(response);
        if blocks.is_empty() {
            return Ok(None);
        }

        let mut outputs = Vec::with_capacity(blocks.len());
        #[allow(clippy::cast_possible_truncation)]
        let blocks_executed = blocks.len() as u32;

        for block in &blocks {
            // Invalid JSON is surfaced as an execution error rather than
            // silently skipped, so the model gets feedback on bad blocks.
            let instruction: ScrapeInstruction = serde_json::from_str(block).map_err(|e| {
                ToolError::Execution(std::io::Error::new(
                    std::io::ErrorKind::InvalidData,
                    e.to_string(),
                ))
            })?;
            outputs.push(self.scrape_instruction(&instruction).await?);
        }

        // NOTE(review): `tool_name` is "web-scrape" while the registered tool
        // id is "web_scrape" — confirm consumers expect the hyphenated form.
        Ok(Some(ToolOutput {
            tool_name: "web-scrape".to_owned(),
            summary: outputs.join("\n\n"),
            blocks_executed,
            filter_stats: None,
            diff: None,
            streamed: false,
            terminal_id: None,
            locations: None,
        }))
    }

    /// Handles a structured `web_scrape` tool call; returns `Ok(None)` for
    /// any other tool id so other executors can claim the call.
    async fn execute_tool_call(&self, call: &ToolCall) -> Result<Option<ToolOutput>, ToolError> {
        if call.tool_id != "web_scrape" {
            return Ok(None);
        }

        let instruction: ScrapeInstruction = deserialize_params(&call.params)?;

        let result = self.scrape_instruction(&instruction).await?;

        Ok(Some(ToolOutput {
            tool_name: "web-scrape".to_owned(),
            summary: result,
            blocks_executed: 1,
            filter_stats: None,
            diff: None,
            streamed: false,
            terminal_id: None,
            locations: None,
        }))
    }
}
143
144impl WebScrapeExecutor {
145    async fn scrape_instruction(
146        &self,
147        instruction: &ScrapeInstruction,
148    ) -> Result<String, ToolError> {
149        let parsed = validate_url(&instruction.url)?;
150        let (host, addrs) = resolve_and_validate(&parsed).await?;
151        let html = self.fetch_html(&instruction.url, &host, &addrs).await?;
152        let selector = instruction.select.clone();
153        let extract = ExtractMode::parse(&instruction.extract);
154        let limit = instruction.limit.unwrap_or(10);
155        tokio::task::spawn_blocking(move || parse_and_extract(&html, &selector, &extract, limit))
156            .await
157            .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?
158    }
159
160    /// Fetches the HTML at `url`, manually following up to 3 redirects.
161    ///
162    /// Each redirect target is validated with `validate_url` and `resolve_and_validate`
163    /// before following, preventing SSRF via redirect chains.
164    ///
165    /// # Errors
166    ///
167    /// Returns `ToolError::Blocked` if any redirect target resolves to a private IP.
168    /// Returns `ToolError::Execution` on HTTP errors, too-large bodies, or too many redirects.
169    async fn fetch_html(
170        &self,
171        url: &str,
172        host: &str,
173        addrs: &[SocketAddr],
174    ) -> Result<String, ToolError> {
175        const MAX_REDIRECTS: usize = 3;
176
177        let mut current_url = url.to_owned();
178        let mut current_host = host.to_owned();
179        let mut current_addrs = addrs.to_vec();
180
181        for hop in 0..=MAX_REDIRECTS {
182            // Build a per-hop client pinned to the current hop's validated addresses.
183            let client = self.build_client(&current_host, &current_addrs);
184            let resp = client
185                .get(&current_url)
186                .send()
187                .await
188                .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
189
190            let status = resp.status();
191
192            if status.is_redirection() {
193                if hop == MAX_REDIRECTS {
194                    return Err(ToolError::Execution(std::io::Error::other(
195                        "too many redirects",
196                    )));
197                }
198
199                let location = resp
200                    .headers()
201                    .get(reqwest::header::LOCATION)
202                    .and_then(|v| v.to_str().ok())
203                    .ok_or_else(|| {
204                        ToolError::Execution(std::io::Error::other("redirect with no Location"))
205                    })?;
206
207                // Resolve relative redirect URLs against the current URL.
208                let base = Url::parse(&current_url)
209                    .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
210                let next_url = base
211                    .join(location)
212                    .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
213
214                let validated = validate_url(next_url.as_str())?;
215                let (next_host, next_addrs) = resolve_and_validate(&validated).await?;
216
217                current_url = next_url.to_string();
218                current_host = next_host;
219                current_addrs = next_addrs;
220                continue;
221            }
222
223            if !status.is_success() {
224                return Err(ToolError::Execution(std::io::Error::other(format!(
225                    "HTTP {status}",
226                ))));
227            }
228
229            let bytes = resp
230                .bytes()
231                .await
232                .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
233
234            if bytes.len() > self.max_body_bytes {
235                return Err(ToolError::Execution(std::io::Error::other(format!(
236                    "response too large: {} bytes (max: {})",
237                    bytes.len(),
238                    self.max_body_bytes,
239                ))));
240            }
241
242            return String::from_utf8(bytes.to_vec())
243                .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())));
244        }
245
246        Err(ToolError::Execution(std::io::Error::other(
247            "too many redirects",
248        )))
249    }
250}
251
/// Returns the raw contents of every ```scrape fenced block in `text`,
/// delegating to the shared fenced-block extractor.
fn extract_scrape_blocks(text: &str) -> Vec<&str> {
    crate::executor::extract_fenced_blocks(text, "scrape")
}
255
256fn validate_url(raw: &str) -> Result<Url, ToolError> {
257    let parsed = Url::parse(raw).map_err(|_| ToolError::Blocked {
258        command: format!("invalid URL: {raw}"),
259    })?;
260
261    if parsed.scheme() != "https" {
262        return Err(ToolError::Blocked {
263            command: format!("scheme not allowed: {}", parsed.scheme()),
264        });
265    }
266
267    if let Some(host) = parsed.host()
268        && is_private_host(&host)
269    {
270        return Err(ToolError::Blocked {
271            command: format!(
272                "private/local host blocked: {}",
273                parsed.host_str().unwrap_or("")
274            ),
275        });
276    }
277
278    Ok(parsed)
279}
280
/// Returns `true` for addresses that must never be scraped: loopback,
/// RFC 1918 private, link-local, unspecified, broadcast, IPv6 unique-local,
/// and IPv4-mapped IPv6 forms of any blocked IPv4 address.
pub(crate) fn is_private_ip(ip: IpAddr) -> bool {
    // Shared IPv4 policy, applied both to plain V4 and to IPv4-mapped V6.
    fn blocked_v4(addr: std::net::Ipv4Addr) -> bool {
        addr.is_loopback()
            || addr.is_private()
            || addr.is_link_local()
            || addr.is_unspecified()
            || addr.is_broadcast()
    }

    match ip {
        IpAddr::V4(addr) => blocked_v4(addr),
        IpAddr::V6(addr) => {
            if addr.is_loopback() || addr.is_unspecified() {
                return true;
            }
            let first_segment = addr.segments()[0];
            // fe80::/10 — link-local.
            if first_segment & 0xffc0 == 0xfe80 {
                return true;
            }
            // fc00::/7 — unique local.
            if first_segment & 0xfe00 == 0xfc00 {
                return true;
            }
            // ::ffff:a.b.c.d — IPv4-mapped; apply the IPv4 rules to the
            // embedded address. `to_ipv4_mapped` is `Some` exactly for this form.
            addr.to_ipv4_mapped().is_some_and(blocked_v4)
        }
    }
}
318
319fn is_private_host(host: &url::Host<&str>) -> bool {
320    match host {
321        url::Host::Domain(d) => {
322            // Exact match or subdomain of localhost (e.g. foo.localhost)
323            // and .internal/.local TLDs used in cloud/k8s environments.
324            #[allow(clippy::case_sensitive_file_extension_comparisons)]
325            {
326                *d == "localhost"
327                    || d.ends_with(".localhost")
328                    || d.ends_with(".internal")
329                    || d.ends_with(".local")
330            }
331        }
332        url::Host::Ipv4(v4) => is_private_ip(IpAddr::V4(*v4)),
333        url::Host::Ipv6(v6) => is_private_ip(IpAddr::V6(*v6)),
334    }
335}
336
337/// Resolves DNS for the URL host, validates all resolved IPs against private ranges,
338/// and returns the hostname and validated socket addresses.
339///
340/// Returning the addresses allows the caller to pin the HTTP client to these exact
341/// addresses, eliminating TOCTOU between DNS validation and the actual connection.
342async fn resolve_and_validate(url: &Url) -> Result<(String, Vec<SocketAddr>), ToolError> {
343    let Some(host) = url.host_str() else {
344        return Ok((String::new(), vec![]));
345    };
346    let port = url.port_or_known_default().unwrap_or(443);
347    let addrs: Vec<SocketAddr> = tokio::net::lookup_host(format!("{host}:{port}"))
348        .await
349        .map_err(|e| ToolError::Blocked {
350            command: format!("DNS resolution failed: {e}"),
351        })?
352        .collect();
353    for addr in &addrs {
354        if is_private_ip(addr.ip()) {
355            return Err(ToolError::Blocked {
356                command: format!("SSRF protection: private IP {} for host {host}", addr.ip()),
357            });
358        }
359    }
360    Ok((host.to_owned(), addrs))
361}
362
363fn parse_and_extract(
364    html: &str,
365    selector: &str,
366    extract: &ExtractMode,
367    limit: usize,
368) -> Result<String, ToolError> {
369    let soup = scrape_core::Soup::parse(html);
370
371    let tags = soup.find_all(selector).map_err(|e| {
372        ToolError::Execution(std::io::Error::new(
373            std::io::ErrorKind::InvalidData,
374            format!("invalid selector: {e}"),
375        ))
376    })?;
377
378    let mut results = Vec::new();
379
380    for tag in tags.into_iter().take(limit) {
381        let value = match extract {
382            ExtractMode::Text => tag.text(),
383            ExtractMode::Html => tag.inner_html(),
384            ExtractMode::Attr(name) => tag.get(name).unwrap_or_default().to_owned(),
385        };
386        if !value.trim().is_empty() {
387            results.push(value.trim().to_owned());
388        }
389    }
390
391    if results.is_empty() {
392        Ok(format!("No results for selector: {selector}"))
393    } else {
394        Ok(results.join("\n"))
395    }
396}
397
398#[cfg(test)]
399mod tests {
400    use super::*;
401
402    // --- extract_scrape_blocks ---
403
404    #[test]
405    fn extract_single_block() {
406        let text =
407            "Here:\n```scrape\n{\"url\":\"https://example.com\",\"select\":\"h1\"}\n```\nDone.";
408        let blocks = extract_scrape_blocks(text);
409        assert_eq!(blocks.len(), 1);
410        assert!(blocks[0].contains("example.com"));
411    }
412
413    #[test]
414    fn extract_multiple_blocks() {
415        let text = "```scrape\n{\"url\":\"https://a.com\",\"select\":\"h1\"}\n```\ntext\n```scrape\n{\"url\":\"https://b.com\",\"select\":\"p\"}\n```";
416        let blocks = extract_scrape_blocks(text);
417        assert_eq!(blocks.len(), 2);
418    }
419
420    #[test]
421    fn no_blocks_returns_empty() {
422        let blocks = extract_scrape_blocks("plain text, no code blocks");
423        assert!(blocks.is_empty());
424    }
425
426    #[test]
427    fn unclosed_block_ignored() {
428        let blocks = extract_scrape_blocks("```scrape\n{\"url\":\"https://x.com\"}");
429        assert!(blocks.is_empty());
430    }
431
432    #[test]
433    fn non_scrape_block_ignored() {
434        let text =
435            "```bash\necho hi\n```\n```scrape\n{\"url\":\"https://x.com\",\"select\":\"h1\"}\n```";
436        let blocks = extract_scrape_blocks(text);
437        assert_eq!(blocks.len(), 1);
438        assert!(blocks[0].contains("x.com"));
439    }
440
441    #[test]
442    fn multiline_json_block() {
443        let text =
444            "```scrape\n{\n  \"url\": \"https://example.com\",\n  \"select\": \"h1\"\n}\n```";
445        let blocks = extract_scrape_blocks(text);
446        assert_eq!(blocks.len(), 1);
447        let instr: ScrapeInstruction = serde_json::from_str(blocks[0]).unwrap();
448        assert_eq!(instr.url, "https://example.com");
449    }
450
451    // --- ScrapeInstruction parsing ---
452
453    #[test]
454    fn parse_valid_instruction() {
455        let json = r#"{"url":"https://example.com","select":"h1","extract":"text","limit":5}"#;
456        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
457        assert_eq!(instr.url, "https://example.com");
458        assert_eq!(instr.select, "h1");
459        assert_eq!(instr.extract, "text");
460        assert_eq!(instr.limit, Some(5));
461    }
462
463    #[test]
464    fn parse_minimal_instruction() {
465        let json = r#"{"url":"https://example.com","select":"p"}"#;
466        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
467        assert_eq!(instr.extract, "text");
468        assert!(instr.limit.is_none());
469    }
470
471    #[test]
472    fn parse_attr_extract() {
473        let json = r#"{"url":"https://example.com","select":"a","extract":"attr:href"}"#;
474        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
475        assert_eq!(instr.extract, "attr:href");
476    }
477
478    #[test]
479    fn parse_invalid_json_errors() {
480        let result = serde_json::from_str::<ScrapeInstruction>("not json");
481        assert!(result.is_err());
482    }
483
484    // --- ExtractMode ---
485
486    #[test]
487    fn extract_mode_text() {
488        assert!(matches!(ExtractMode::parse("text"), ExtractMode::Text));
489    }
490
491    #[test]
492    fn extract_mode_html() {
493        assert!(matches!(ExtractMode::parse("html"), ExtractMode::Html));
494    }
495
496    #[test]
497    fn extract_mode_attr() {
498        let mode = ExtractMode::parse("attr:href");
499        assert!(matches!(mode, ExtractMode::Attr(ref s) if s == "href"));
500    }
501
502    #[test]
503    fn extract_mode_unknown_defaults_to_text() {
504        assert!(matches!(ExtractMode::parse("unknown"), ExtractMode::Text));
505    }
506
507    // --- validate_url ---
508
509    #[test]
510    fn valid_https_url() {
511        assert!(validate_url("https://example.com").is_ok());
512    }
513
514    #[test]
515    fn http_rejected() {
516        let err = validate_url("http://example.com").unwrap_err();
517        assert!(matches!(err, ToolError::Blocked { .. }));
518    }
519
520    #[test]
521    fn ftp_rejected() {
522        let err = validate_url("ftp://files.example.com").unwrap_err();
523        assert!(matches!(err, ToolError::Blocked { .. }));
524    }
525
526    #[test]
527    fn file_rejected() {
528        let err = validate_url("file:///etc/passwd").unwrap_err();
529        assert!(matches!(err, ToolError::Blocked { .. }));
530    }
531
532    #[test]
533    fn invalid_url_rejected() {
534        let err = validate_url("not a url").unwrap_err();
535        assert!(matches!(err, ToolError::Blocked { .. }));
536    }
537
538    #[test]
539    fn localhost_blocked() {
540        let err = validate_url("https://localhost/path").unwrap_err();
541        assert!(matches!(err, ToolError::Blocked { .. }));
542    }
543
544    #[test]
545    fn loopback_ip_blocked() {
546        let err = validate_url("https://127.0.0.1/path").unwrap_err();
547        assert!(matches!(err, ToolError::Blocked { .. }));
548    }
549
550    #[test]
551    fn private_10_blocked() {
552        let err = validate_url("https://10.0.0.1/api").unwrap_err();
553        assert!(matches!(err, ToolError::Blocked { .. }));
554    }
555
556    #[test]
557    fn private_172_blocked() {
558        let err = validate_url("https://172.16.0.1/api").unwrap_err();
559        assert!(matches!(err, ToolError::Blocked { .. }));
560    }
561
562    #[test]
563    fn private_192_blocked() {
564        let err = validate_url("https://192.168.1.1/api").unwrap_err();
565        assert!(matches!(err, ToolError::Blocked { .. }));
566    }
567
568    #[test]
569    fn ipv6_loopback_blocked() {
570        let err = validate_url("https://[::1]/path").unwrap_err();
571        assert!(matches!(err, ToolError::Blocked { .. }));
572    }
573
574    #[test]
575    fn public_ip_allowed() {
576        assert!(validate_url("https://93.184.216.34/page").is_ok());
577    }
578
579    // --- parse_and_extract ---
580
581    #[test]
582    fn extract_text_from_html() {
583        let html = "<html><body><h1>Hello World</h1><p>Content</p></body></html>";
584        let result = parse_and_extract(html, "h1", &ExtractMode::Text, 10).unwrap();
585        assert_eq!(result, "Hello World");
586    }
587
588    #[test]
589    fn extract_multiple_elements() {
590        let html = "<ul><li>A</li><li>B</li><li>C</li></ul>";
591        let result = parse_and_extract(html, "li", &ExtractMode::Text, 10).unwrap();
592        assert_eq!(result, "A\nB\nC");
593    }
594
595    #[test]
596    fn extract_with_limit() {
597        let html = "<ul><li>A</li><li>B</li><li>C</li></ul>";
598        let result = parse_and_extract(html, "li", &ExtractMode::Text, 2).unwrap();
599        assert_eq!(result, "A\nB");
600    }
601
602    #[test]
603    fn extract_attr_href() {
604        let html = r#"<a href="https://example.com">Link</a>"#;
605        let result =
606            parse_and_extract(html, "a", &ExtractMode::Attr("href".to_owned()), 10).unwrap();
607        assert_eq!(result, "https://example.com");
608    }
609
610    #[test]
611    fn extract_inner_html() {
612        let html = "<div><span>inner</span></div>";
613        let result = parse_and_extract(html, "div", &ExtractMode::Html, 10).unwrap();
614        assert!(result.contains("<span>inner</span>"));
615    }
616
617    #[test]
618    fn no_matches_returns_message() {
619        let html = "<html><body><p>text</p></body></html>";
620        let result = parse_and_extract(html, "h1", &ExtractMode::Text, 10).unwrap();
621        assert!(result.starts_with("No results for selector:"));
622    }
623
624    #[test]
625    fn empty_text_skipped() {
626        let html = "<ul><li>  </li><li>A</li></ul>";
627        let result = parse_and_extract(html, "li", &ExtractMode::Text, 10).unwrap();
628        assert_eq!(result, "A");
629    }
630
631    #[test]
632    fn invalid_selector_errors() {
633        let html = "<html><body></body></html>";
634        let result = parse_and_extract(html, "[[[invalid", &ExtractMode::Text, 10);
635        assert!(result.is_err());
636    }
637
638    #[test]
639    fn empty_html_returns_no_results() {
640        let result = parse_and_extract("", "h1", &ExtractMode::Text, 10).unwrap();
641        assert!(result.starts_with("No results for selector:"));
642    }
643
644    #[test]
645    fn nested_selector() {
646        let html = "<div><span>inner</span></div><span>outer</span>";
647        let result = parse_and_extract(html, "div > span", &ExtractMode::Text, 10).unwrap();
648        assert_eq!(result, "inner");
649    }
650
651    #[test]
652    fn attr_missing_returns_empty() {
653        let html = r#"<a>No href</a>"#;
654        let result =
655            parse_and_extract(html, "a", &ExtractMode::Attr("href".to_owned()), 10).unwrap();
656        assert!(result.starts_with("No results for selector:"));
657    }
658
659    #[test]
660    fn extract_html_mode() {
661        let html = "<div><b>bold</b> text</div>";
662        let result = parse_and_extract(html, "div", &ExtractMode::Html, 10).unwrap();
663        assert!(result.contains("<b>bold</b>"));
664    }
665
666    #[test]
667    fn limit_zero_returns_no_results() {
668        let html = "<ul><li>A</li><li>B</li></ul>";
669        let result = parse_and_extract(html, "li", &ExtractMode::Text, 0).unwrap();
670        assert!(result.starts_with("No results for selector:"));
671    }
672
673    // --- validate_url edge cases ---
674
675    #[test]
676    fn url_with_port_allowed() {
677        assert!(validate_url("https://example.com:8443/path").is_ok());
678    }
679
680    #[test]
681    fn link_local_ip_blocked() {
682        let err = validate_url("https://169.254.1.1/path").unwrap_err();
683        assert!(matches!(err, ToolError::Blocked { .. }));
684    }
685
686    #[test]
687    fn url_no_scheme_rejected() {
688        let err = validate_url("example.com/path").unwrap_err();
689        assert!(matches!(err, ToolError::Blocked { .. }));
690    }
691
692    #[test]
693    fn unspecified_ipv4_blocked() {
694        let err = validate_url("https://0.0.0.0/path").unwrap_err();
695        assert!(matches!(err, ToolError::Blocked { .. }));
696    }
697
698    #[test]
699    fn broadcast_ipv4_blocked() {
700        let err = validate_url("https://255.255.255.255/path").unwrap_err();
701        assert!(matches!(err, ToolError::Blocked { .. }));
702    }
703
704    #[test]
705    fn ipv6_link_local_blocked() {
706        let err = validate_url("https://[fe80::1]/path").unwrap_err();
707        assert!(matches!(err, ToolError::Blocked { .. }));
708    }
709
710    #[test]
711    fn ipv6_unique_local_blocked() {
712        let err = validate_url("https://[fd12::1]/path").unwrap_err();
713        assert!(matches!(err, ToolError::Blocked { .. }));
714    }
715
716    #[test]
717    fn ipv4_mapped_ipv6_loopback_blocked() {
718        let err = validate_url("https://[::ffff:127.0.0.1]/path").unwrap_err();
719        assert!(matches!(err, ToolError::Blocked { .. }));
720    }
721
722    #[test]
723    fn ipv4_mapped_ipv6_private_blocked() {
724        let err = validate_url("https://[::ffff:10.0.0.1]/path").unwrap_err();
725        assert!(matches!(err, ToolError::Blocked { .. }));
726    }
727
728    // --- WebScrapeExecutor (no-network) ---
729
730    #[tokio::test]
731    async fn executor_no_blocks_returns_none() {
732        let config = ScrapeConfig::default();
733        let executor = WebScrapeExecutor::new(&config);
734        let result = executor.execute("plain text").await;
735        assert!(result.unwrap().is_none());
736    }
737
738    #[tokio::test]
739    async fn executor_invalid_json_errors() {
740        let config = ScrapeConfig::default();
741        let executor = WebScrapeExecutor::new(&config);
742        let response = "```scrape\nnot json\n```";
743        let result = executor.execute(response).await;
744        assert!(matches!(result, Err(ToolError::Execution(_))));
745    }
746
747    #[tokio::test]
748    async fn executor_blocked_url_errors() {
749        let config = ScrapeConfig::default();
750        let executor = WebScrapeExecutor::new(&config);
751        let response = "```scrape\n{\"url\":\"http://example.com\",\"select\":\"h1\"}\n```";
752        let result = executor.execute(response).await;
753        assert!(matches!(result, Err(ToolError::Blocked { .. })));
754    }
755
756    #[tokio::test]
757    async fn executor_private_ip_blocked() {
758        let config = ScrapeConfig::default();
759        let executor = WebScrapeExecutor::new(&config);
760        let response = "```scrape\n{\"url\":\"https://192.168.1.1/api\",\"select\":\"h1\"}\n```";
761        let result = executor.execute(response).await;
762        assert!(matches!(result, Err(ToolError::Blocked { .. })));
763    }
764
765    #[tokio::test]
766    async fn executor_unreachable_host_returns_error() {
767        let config = ScrapeConfig {
768            timeout: 1,
769            max_body_bytes: 1_048_576,
770        };
771        let executor = WebScrapeExecutor::new(&config);
772        let response = "```scrape\n{\"url\":\"https://192.0.2.1:1/page\",\"select\":\"h1\"}\n```";
773        let result = executor.execute(response).await;
774        assert!(matches!(result, Err(ToolError::Execution(_))));
775    }
776
777    #[tokio::test]
778    async fn executor_localhost_url_blocked() {
779        let config = ScrapeConfig::default();
780        let executor = WebScrapeExecutor::new(&config);
781        let response = "```scrape\n{\"url\":\"https://localhost:9999/api\",\"select\":\"h1\"}\n```";
782        let result = executor.execute(response).await;
783        assert!(matches!(result, Err(ToolError::Blocked { .. })));
784    }
785
786    #[tokio::test]
787    async fn executor_empty_text_returns_none() {
788        let config = ScrapeConfig::default();
789        let executor = WebScrapeExecutor::new(&config);
790        let result = executor.execute("").await;
791        assert!(result.unwrap().is_none());
792    }
793
794    #[tokio::test]
795    async fn executor_multiple_blocks_first_blocked() {
796        let config = ScrapeConfig::default();
797        let executor = WebScrapeExecutor::new(&config);
798        let response = "```scrape\n{\"url\":\"http://evil.com\",\"select\":\"h1\"}\n```\n\
799             ```scrape\n{\"url\":\"https://ok.com\",\"select\":\"h1\"}\n```";
800        let result = executor.execute(response).await;
801        assert!(result.is_err());
802    }
803
804    #[test]
805    fn validate_url_empty_string() {
806        let err = validate_url("").unwrap_err();
807        assert!(matches!(err, ToolError::Blocked { .. }));
808    }
809
810    #[test]
811    fn validate_url_javascript_scheme_blocked() {
812        let err = validate_url("javascript:alert(1)").unwrap_err();
813        assert!(matches!(err, ToolError::Blocked { .. }));
814    }
815
816    #[test]
817    fn validate_url_data_scheme_blocked() {
818        let err = validate_url("data:text/html,<h1>hi</h1>").unwrap_err();
819        assert!(matches!(err, ToolError::Blocked { .. }));
820    }
821
822    #[test]
823    fn is_private_host_public_domain_is_false() {
824        let host: url::Host<&str> = url::Host::Domain("example.com");
825        assert!(!is_private_host(&host));
826    }
827
828    #[test]
829    fn is_private_host_localhost_is_true() {
830        let host: url::Host<&str> = url::Host::Domain("localhost");
831        assert!(is_private_host(&host));
832    }
833
834    #[test]
835    fn is_private_host_ipv6_unspecified_is_true() {
836        let host = url::Host::Ipv6(std::net::Ipv6Addr::UNSPECIFIED);
837        assert!(is_private_host(&host));
838    }
839
840    #[test]
841    fn is_private_host_public_ipv6_is_false() {
842        let host = url::Host::Ipv6("2001:db8::1".parse().unwrap());
843        assert!(!is_private_host(&host));
844    }
845
846    // --- fetch_html redirect logic: wiremock HTTP server tests ---
847    //
848    // These tests use a local wiremock server to exercise the redirect-following logic
849    // in `fetch_html` without requiring an external HTTPS connection. The server binds to
850    // 127.0.0.1, and tests call `fetch_html` directly (bypassing `validate_url`) to avoid
851    // the SSRF guard that would otherwise block loopback connections.
852
853    /// Helper: returns executor + (server_url, server_addr) from a running wiremock mock server.
854    /// The server address is passed to `fetch_html` via `resolve_to_addrs` so the client
855    /// connects to the mock instead of doing a real DNS lookup.
856    async fn mock_server_executor() -> (WebScrapeExecutor, wiremock::MockServer) {
857        let server = wiremock::MockServer::start().await;
858        let executor = WebScrapeExecutor {
859            timeout: Duration::from_secs(5),
860            max_body_bytes: 1_048_576,
861        };
862        (executor, server)
863    }
864
865    /// Parses the mock server's URI into (host_str, socket_addr) for use with `build_client`.
866    fn server_host_and_addr(server: &wiremock::MockServer) -> (String, Vec<std::net::SocketAddr>) {
867        let uri = server.uri();
868        let url = Url::parse(&uri).unwrap();
869        let host = url.host_str().unwrap_or("127.0.0.1").to_owned();
870        let port = url.port().unwrap_or(80);
871        let addr: std::net::SocketAddr = format!("{host}:{port}").parse().unwrap();
872        (host, vec![addr])
873    }
874
875    /// Test-only redirect follower that mimics `fetch_html`'s loop but skips `validate_url` /
876    /// `resolve_and_validate`. This lets us exercise the redirect-counting and
877    /// missing-Location logic against a plain HTTP wiremock server.
878    async fn follow_redirects_raw(
879        executor: &WebScrapeExecutor,
880        start_url: &str,
881        host: &str,
882        addrs: &[std::net::SocketAddr],
883    ) -> Result<String, ToolError> {
884        const MAX_REDIRECTS: usize = 3;
885        let mut current_url = start_url.to_owned();
886        let mut current_host = host.to_owned();
887        let mut current_addrs = addrs.to_vec();
888
889        for hop in 0..=MAX_REDIRECTS {
890            let client = executor.build_client(&current_host, &current_addrs);
891            let resp = client
892                .get(&current_url)
893                .send()
894                .await
895                .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
896
897            let status = resp.status();
898
899            if status.is_redirection() {
900                if hop == MAX_REDIRECTS {
901                    return Err(ToolError::Execution(std::io::Error::other(
902                        "too many redirects",
903                    )));
904                }
905
906                let location = resp
907                    .headers()
908                    .get(reqwest::header::LOCATION)
909                    .and_then(|v| v.to_str().ok())
910                    .ok_or_else(|| {
911                        ToolError::Execution(std::io::Error::other("redirect with no Location"))
912                    })?;
913
914                let base = Url::parse(&current_url)
915                    .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
916                let next_url = base
917                    .join(location)
918                    .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
919
920                // Re-use same host/addrs (mock server is always the same endpoint).
921                current_url = next_url.to_string();
922                // Preserve host/addrs as-is since the mock server doesn't change.
923                let _ = &mut current_host;
924                let _ = &mut current_addrs;
925                continue;
926            }
927
928            if !status.is_success() {
929                return Err(ToolError::Execution(std::io::Error::other(format!(
930                    "HTTP {status}",
931                ))));
932            }
933
934            let bytes = resp
935                .bytes()
936                .await
937                .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
938
939            if bytes.len() > executor.max_body_bytes {
940                return Err(ToolError::Execution(std::io::Error::other(format!(
941                    "response too large: {} bytes (max: {})",
942                    bytes.len(),
943                    executor.max_body_bytes,
944                ))));
945            }
946
947            return String::from_utf8(bytes.to_vec())
948                .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())));
949        }
950
951        Err(ToolError::Execution(std::io::Error::other(
952            "too many redirects",
953        )))
954    }
955
956    #[tokio::test]
957    async fn fetch_html_success_returns_body() {
958        use wiremock::matchers::{method, path};
959        use wiremock::{Mock, ResponseTemplate};
960
961        let (executor, server) = mock_server_executor().await;
962        Mock::given(method("GET"))
963            .and(path("/page"))
964            .respond_with(ResponseTemplate::new(200).set_body_string("<h1>OK</h1>"))
965            .mount(&server)
966            .await;
967
968        let (host, addrs) = server_host_and_addr(&server);
969        let url = format!("{}/page", server.uri());
970        let result = executor.fetch_html(&url, &host, &addrs).await;
971        assert!(result.is_ok(), "expected Ok, got: {result:?}");
972        assert_eq!(result.unwrap(), "<h1>OK</h1>");
973    }
974
975    #[tokio::test]
976    async fn fetch_html_non_2xx_returns_error() {
977        use wiremock::matchers::{method, path};
978        use wiremock::{Mock, ResponseTemplate};
979
980        let (executor, server) = mock_server_executor().await;
981        Mock::given(method("GET"))
982            .and(path("/forbidden"))
983            .respond_with(ResponseTemplate::new(403))
984            .mount(&server)
985            .await;
986
987        let (host, addrs) = server_host_and_addr(&server);
988        let url = format!("{}/forbidden", server.uri());
989        let result = executor.fetch_html(&url, &host, &addrs).await;
990        assert!(result.is_err());
991        let msg = result.unwrap_err().to_string();
992        assert!(msg.contains("403"), "expected 403 in error: {msg}");
993    }
994
995    #[tokio::test]
996    async fn fetch_html_404_returns_error() {
997        use wiremock::matchers::{method, path};
998        use wiremock::{Mock, ResponseTemplate};
999
1000        let (executor, server) = mock_server_executor().await;
1001        Mock::given(method("GET"))
1002            .and(path("/missing"))
1003            .respond_with(ResponseTemplate::new(404))
1004            .mount(&server)
1005            .await;
1006
1007        let (host, addrs) = server_host_and_addr(&server);
1008        let url = format!("{}/missing", server.uri());
1009        let result = executor.fetch_html(&url, &host, &addrs).await;
1010        assert!(result.is_err());
1011        let msg = result.unwrap_err().to_string();
1012        assert!(msg.contains("404"), "expected 404 in error: {msg}");
1013    }
1014
1015    #[tokio::test]
1016    async fn fetch_html_redirect_no_location_returns_error() {
1017        use wiremock::matchers::{method, path};
1018        use wiremock::{Mock, ResponseTemplate};
1019
1020        let (executor, server) = mock_server_executor().await;
1021        // 302 with no Location header
1022        Mock::given(method("GET"))
1023            .and(path("/redirect-no-loc"))
1024            .respond_with(ResponseTemplate::new(302))
1025            .mount(&server)
1026            .await;
1027
1028        let (host, addrs) = server_host_and_addr(&server);
1029        let url = format!("{}/redirect-no-loc", server.uri());
1030        let result = executor.fetch_html(&url, &host, &addrs).await;
1031        assert!(result.is_err());
1032        let msg = result.unwrap_err().to_string();
1033        assert!(
1034            msg.contains("Location") || msg.contains("location"),
1035            "expected Location-related error: {msg}"
1036        );
1037    }
1038
1039    #[tokio::test]
1040    async fn fetch_html_single_redirect_followed() {
1041        use wiremock::matchers::{method, path};
1042        use wiremock::{Mock, ResponseTemplate};
1043
1044        let (executor, server) = mock_server_executor().await;
1045        let final_url = format!("{}/final", server.uri());
1046
1047        Mock::given(method("GET"))
1048            .and(path("/start"))
1049            .respond_with(ResponseTemplate::new(302).insert_header("location", final_url.as_str()))
1050            .mount(&server)
1051            .await;
1052
1053        Mock::given(method("GET"))
1054            .and(path("/final"))
1055            .respond_with(ResponseTemplate::new(200).set_body_string("<p>final</p>"))
1056            .mount(&server)
1057            .await;
1058
1059        let (host, addrs) = server_host_and_addr(&server);
1060        let url = format!("{}/start", server.uri());
1061        let result = follow_redirects_raw(&executor, &url, &host, &addrs).await;
1062        assert!(result.is_ok(), "single redirect should succeed: {result:?}");
1063        assert_eq!(result.unwrap(), "<p>final</p>");
1064    }
1065
1066    #[tokio::test]
1067    async fn fetch_html_three_redirects_allowed() {
1068        use wiremock::matchers::{method, path};
1069        use wiremock::{Mock, ResponseTemplate};
1070
1071        let (executor, server) = mock_server_executor().await;
1072        let hop2 = format!("{}/hop2", server.uri());
1073        let hop3 = format!("{}/hop3", server.uri());
1074        let final_dest = format!("{}/done", server.uri());
1075
1076        Mock::given(method("GET"))
1077            .and(path("/hop1"))
1078            .respond_with(ResponseTemplate::new(301).insert_header("location", hop2.as_str()))
1079            .mount(&server)
1080            .await;
1081        Mock::given(method("GET"))
1082            .and(path("/hop2"))
1083            .respond_with(ResponseTemplate::new(301).insert_header("location", hop3.as_str()))
1084            .mount(&server)
1085            .await;
1086        Mock::given(method("GET"))
1087            .and(path("/hop3"))
1088            .respond_with(ResponseTemplate::new(301).insert_header("location", final_dest.as_str()))
1089            .mount(&server)
1090            .await;
1091        Mock::given(method("GET"))
1092            .and(path("/done"))
1093            .respond_with(ResponseTemplate::new(200).set_body_string("<p>done</p>"))
1094            .mount(&server)
1095            .await;
1096
1097        let (host, addrs) = server_host_and_addr(&server);
1098        let url = format!("{}/hop1", server.uri());
1099        let result = follow_redirects_raw(&executor, &url, &host, &addrs).await;
1100        assert!(result.is_ok(), "3 redirects should succeed: {result:?}");
1101        assert_eq!(result.unwrap(), "<p>done</p>");
1102    }
1103
1104    #[tokio::test]
1105    async fn fetch_html_four_redirects_rejected() {
1106        use wiremock::matchers::{method, path};
1107        use wiremock::{Mock, ResponseTemplate};
1108
1109        let (executor, server) = mock_server_executor().await;
1110        let hop2 = format!("{}/r2", server.uri());
1111        let hop3 = format!("{}/r3", server.uri());
1112        let hop4 = format!("{}/r4", server.uri());
1113        let hop5 = format!("{}/r5", server.uri());
1114
1115        for (from, to) in [
1116            ("/r1", &hop2),
1117            ("/r2", &hop3),
1118            ("/r3", &hop4),
1119            ("/r4", &hop5),
1120        ] {
1121            Mock::given(method("GET"))
1122                .and(path(from))
1123                .respond_with(ResponseTemplate::new(301).insert_header("location", to.as_str()))
1124                .mount(&server)
1125                .await;
1126        }
1127
1128        let (host, addrs) = server_host_and_addr(&server);
1129        let url = format!("{}/r1", server.uri());
1130        let result = follow_redirects_raw(&executor, &url, &host, &addrs).await;
1131        assert!(result.is_err(), "4 redirects should be rejected");
1132        let msg = result.unwrap_err().to_string();
1133        assert!(
1134            msg.contains("redirect"),
1135            "expected redirect-related error: {msg}"
1136        );
1137    }
1138
1139    #[tokio::test]
1140    async fn fetch_html_body_too_large_returns_error() {
1141        use wiremock::matchers::{method, path};
1142        use wiremock::{Mock, ResponseTemplate};
1143
1144        let small_limit_executor = WebScrapeExecutor {
1145            timeout: Duration::from_secs(5),
1146            max_body_bytes: 10,
1147        };
1148        let server = wiremock::MockServer::start().await;
1149        Mock::given(method("GET"))
1150            .and(path("/big"))
1151            .respond_with(
1152                ResponseTemplate::new(200)
1153                    .set_body_string("this body is definitely longer than ten bytes"),
1154            )
1155            .mount(&server)
1156            .await;
1157
1158        let (host, addrs) = server_host_and_addr(&server);
1159        let url = format!("{}/big", server.uri());
1160        let result = small_limit_executor.fetch_html(&url, &host, &addrs).await;
1161        assert!(result.is_err());
1162        let msg = result.unwrap_err().to_string();
1163        assert!(msg.contains("too large"), "expected too-large error: {msg}");
1164    }
1165
1166    #[test]
1167    fn extract_scrape_blocks_empty_block_content() {
1168        let text = "```scrape\n\n```";
1169        let blocks = extract_scrape_blocks(text);
1170        assert_eq!(blocks.len(), 1);
1171        assert!(blocks[0].is_empty());
1172    }
1173
1174    #[test]
1175    fn extract_scrape_blocks_whitespace_only() {
1176        let text = "```scrape\n   \n```";
1177        let blocks = extract_scrape_blocks(text);
1178        assert_eq!(blocks.len(), 1);
1179    }
1180
1181    #[test]
1182    fn parse_and_extract_multiple_selectors() {
1183        let html = "<div><h1>Title</h1><p>Para</p></div>";
1184        let result = parse_and_extract(html, "h1, p", &ExtractMode::Text, 10).unwrap();
1185        assert!(result.contains("Title"));
1186        assert!(result.contains("Para"));
1187    }
1188
1189    #[test]
1190    fn webscrape_executor_new_with_custom_config() {
1191        let config = ScrapeConfig {
1192            timeout: 60,
1193            max_body_bytes: 512,
1194        };
1195        let executor = WebScrapeExecutor::new(&config);
1196        assert_eq!(executor.max_body_bytes, 512);
1197    }
1198
1199    #[test]
1200    fn webscrape_executor_debug() {
1201        let config = ScrapeConfig::default();
1202        let executor = WebScrapeExecutor::new(&config);
1203        let dbg = format!("{executor:?}");
1204        assert!(dbg.contains("WebScrapeExecutor"));
1205    }
1206
1207    #[test]
1208    fn extract_mode_attr_empty_name() {
1209        let mode = ExtractMode::parse("attr:");
1210        assert!(matches!(mode, ExtractMode::Attr(ref s) if s.is_empty()));
1211    }
1212
1213    #[test]
1214    fn default_extract_returns_text() {
1215        assert_eq!(default_extract(), "text");
1216    }
1217
1218    #[test]
1219    fn scrape_instruction_debug() {
1220        let json = r#"{"url":"https://example.com","select":"h1"}"#;
1221        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
1222        let dbg = format!("{instr:?}");
1223        assert!(dbg.contains("ScrapeInstruction"));
1224    }
1225
1226    #[test]
1227    fn extract_mode_debug() {
1228        let mode = ExtractMode::Text;
1229        let dbg = format!("{mode:?}");
1230        assert!(dbg.contains("Text"));
1231    }
1232
1233    // --- fetch_html redirect logic: constant and validation unit tests ---
1234
1235    /// MAX_REDIRECTS is 3; the 4th redirect attempt must be rejected.
1236    /// Verify the boundary is correct by inspecting the constant value.
1237    #[test]
1238    fn max_redirects_constant_is_three() {
1239        // fetch_html uses `for hop in 0..=MAX_REDIRECTS` and returns error when hop == MAX_REDIRECTS
1240        // while still in a redirect. That means hops 0,1,2 can redirect; hop 3 triggers the error.
1241        // This test documents the expected limit.
1242        const MAX_REDIRECTS: usize = 3;
1243        assert_eq!(MAX_REDIRECTS, 3, "fetch_html allows exactly 3 redirects");
1244    }
1245
1246    /// Verifies that a Location-less redirect would produce an error string containing the
1247    /// expected message, matching the error path in fetch_html.
1248    #[test]
1249    fn redirect_no_location_error_message() {
1250        let err = std::io::Error::other("redirect with no Location");
1251        assert!(err.to_string().contains("redirect with no Location"));
1252    }
1253
1254    /// Verifies that a too-many-redirects condition produces the expected error string.
1255    #[test]
1256    fn too_many_redirects_error_message() {
1257        let err = std::io::Error::other("too many redirects");
1258        assert!(err.to_string().contains("too many redirects"));
1259    }
1260
1261    /// Verifies that a non-2xx HTTP status produces an error message with the status code.
1262    #[test]
1263    fn non_2xx_status_error_format() {
1264        let status = reqwest::StatusCode::FORBIDDEN;
1265        let msg = format!("HTTP {status}");
1266        assert!(msg.contains("403"));
1267    }
1268
1269    /// Verifies that a 404 response status code formats into the expected error message.
1270    #[test]
1271    fn not_found_status_error_format() {
1272        let status = reqwest::StatusCode::NOT_FOUND;
1273        let msg = format!("HTTP {status}");
1274        assert!(msg.contains("404"));
1275    }
1276
1277    /// Verifies relative redirect resolution for same-host paths (simulates Location: /other).
1278    #[test]
1279    fn relative_redirect_same_host_path() {
1280        let base = Url::parse("https://example.com/current").unwrap();
1281        let resolved = base.join("/other").unwrap();
1282        assert_eq!(resolved.as_str(), "https://example.com/other");
1283    }
1284
1285    /// Verifies relative redirect resolution preserves scheme and host.
1286    #[test]
1287    fn relative_redirect_relative_path() {
1288        let base = Url::parse("https://example.com/a/b").unwrap();
1289        let resolved = base.join("c").unwrap();
1290        assert_eq!(resolved.as_str(), "https://example.com/a/c");
1291    }
1292
1293    /// Verifies that an absolute redirect URL overrides base URL completely.
1294    #[test]
1295    fn absolute_redirect_overrides_base() {
1296        let base = Url::parse("https://example.com/page").unwrap();
1297        let resolved = base.join("https://other.com/target").unwrap();
1298        assert_eq!(resolved.as_str(), "https://other.com/target");
1299    }
1300
1301    /// Verifies that a redirect Location of http:// (downgrade) is rejected.
1302    #[test]
1303    fn redirect_http_downgrade_rejected() {
1304        let location = "http://example.com/page";
1305        let base = Url::parse("https://example.com/start").unwrap();
1306        let next = base.join(location).unwrap();
1307        let err = validate_url(next.as_str()).unwrap_err();
1308        assert!(matches!(err, ToolError::Blocked { .. }));
1309    }
1310
1311    /// Verifies that a redirect to a private IP literal is blocked.
1312    #[test]
1313    fn redirect_location_private_ip_blocked() {
1314        let location = "https://192.168.100.1/admin";
1315        let base = Url::parse("https://example.com/start").unwrap();
1316        let next = base.join(location).unwrap();
1317        let err = validate_url(next.as_str()).unwrap_err();
1318        assert!(matches!(err, ToolError::Blocked { .. }));
1319        let cmd = match err {
1320            ToolError::Blocked { command } => command,
1321            _ => panic!("expected Blocked"),
1322        };
1323        assert!(
1324            cmd.contains("private") || cmd.contains("scheme"),
1325            "error message should describe the block reason: {cmd}"
1326        );
1327    }
1328
1329    /// Verifies that a redirect to a .internal domain is blocked.
1330    #[test]
1331    fn redirect_location_internal_domain_blocked() {
1332        let location = "https://metadata.internal/latest/meta-data/";
1333        let base = Url::parse("https://example.com/start").unwrap();
1334        let next = base.join(location).unwrap();
1335        let err = validate_url(next.as_str()).unwrap_err();
1336        assert!(matches!(err, ToolError::Blocked { .. }));
1337    }
1338
1339    /// Verifies that a chain of 3 valid public redirects passes validate_url at every hop.
1340    #[test]
1341    fn redirect_chain_three_hops_all_public() {
1342        let hops = [
1343            "https://redirect1.example.com/hop1",
1344            "https://redirect2.example.com/hop2",
1345            "https://destination.example.com/final",
1346        ];
1347        for hop in hops {
1348            assert!(validate_url(hop).is_ok(), "expected ok for {hop}");
1349        }
1350    }
1351
1352    // --- SSRF redirect chain defense ---
1353
1354    /// Verifies that a redirect Location pointing to a private IP is rejected by validate_url
1355    /// before any connection attempt — simulating the validation step inside fetch_html.
1356    #[test]
1357    fn redirect_to_private_ip_rejected_by_validate_url() {
1358        // These would appear as Location headers in a redirect response.
1359        let private_targets = [
1360            "https://127.0.0.1/secret",
1361            "https://10.0.0.1/internal",
1362            "https://192.168.1.1/admin",
1363            "https://172.16.0.1/data",
1364            "https://[::1]/path",
1365            "https://[fe80::1]/path",
1366            "https://localhost/path",
1367            "https://service.internal/api",
1368        ];
1369        for target in private_targets {
1370            let result = validate_url(target);
1371            assert!(result.is_err(), "expected error for {target}");
1372            assert!(
1373                matches!(result.unwrap_err(), ToolError::Blocked { .. }),
1374                "expected Blocked for {target}"
1375            );
1376        }
1377    }
1378
1379    /// Verifies that relative redirect URLs are resolved correctly before validation.
1380    #[test]
1381    fn redirect_relative_url_resolves_correctly() {
1382        let base = Url::parse("https://example.com/page").unwrap();
1383        let relative = "/other";
1384        let resolved = base.join(relative).unwrap();
1385        assert_eq!(resolved.as_str(), "https://example.com/other");
1386    }
1387
    /// Verifies that a plain-http redirect target is rejected by the scheme check:
    /// `validate_url` only accepts https URLs.
    #[test]
    fn redirect_to_http_rejected() {
        let err = validate_url("http://example.com/page").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }
1394
1395    #[test]
1396    fn ipv4_mapped_ipv6_link_local_blocked() {
1397        let err = validate_url("https://[::ffff:169.254.0.1]/path").unwrap_err();
1398        assert!(matches!(err, ToolError::Blocked { .. }));
1399    }
1400
1401    #[test]
1402    fn ipv4_mapped_ipv6_public_allowed() {
1403        assert!(validate_url("https://[::ffff:93.184.216.34]/path").is_ok());
1404    }
1405
1406    #[test]
1407    fn tool_definitions_returns_web_scrape() {
1408        let config = ScrapeConfig::default();
1409        let executor = WebScrapeExecutor::new(&config);
1410        let defs = executor.tool_definitions();
1411        assert_eq!(defs.len(), 1);
1412        assert_eq!(defs[0].id, "web_scrape");
1413        assert_eq!(
1414            defs[0].invocation,
1415            crate::registry::InvocationHint::FencedBlock("scrape")
1416        );
1417    }
1418
1419    #[test]
1420    fn tool_definitions_schema_has_all_params() {
1421        let config = ScrapeConfig::default();
1422        let executor = WebScrapeExecutor::new(&config);
1423        let defs = executor.tool_definitions();
1424        let obj = defs[0].schema.as_object().unwrap();
1425        let props = obj["properties"].as_object().unwrap();
1426        assert!(props.contains_key("url"));
1427        assert!(props.contains_key("select"));
1428        assert!(props.contains_key("extract"));
1429        assert!(props.contains_key("limit"));
1430        let req = obj["required"].as_array().unwrap();
1431        assert!(req.iter().any(|v| v.as_str() == Some("url")));
1432        assert!(req.iter().any(|v| v.as_str() == Some("select")));
1433        assert!(!req.iter().any(|v| v.as_str() == Some("extract")));
1434    }
1435
1436    // --- is_private_host: new domain checks (AUD-02) ---
1437
1438    #[test]
1439    fn subdomain_localhost_blocked() {
1440        let host: url::Host<&str> = url::Host::Domain("foo.localhost");
1441        assert!(is_private_host(&host));
1442    }
1443
1444    #[test]
1445    fn internal_tld_blocked() {
1446        let host: url::Host<&str> = url::Host::Domain("service.internal");
1447        assert!(is_private_host(&host));
1448    }
1449
1450    #[test]
1451    fn local_tld_blocked() {
1452        let host: url::Host<&str> = url::Host::Domain("printer.local");
1453        assert!(is_private_host(&host));
1454    }
1455
1456    #[test]
1457    fn public_domain_not_blocked() {
1458        let host: url::Host<&str> = url::Host::Domain("example.com");
1459        assert!(!is_private_host(&host));
1460    }
1461
1462    // --- resolve_and_validate: private IP rejection ---
1463
1464    #[tokio::test]
1465    async fn resolve_loopback_rejected() {
1466        // 127.0.0.1 resolves directly (literal IP in DNS query)
1467        let url = url::Url::parse("https://127.0.0.1/path").unwrap();
1468        // validate_url catches this before resolve_and_validate, but test directly
1469        let result = resolve_and_validate(&url).await;
1470        assert!(
1471            result.is_err(),
1472            "loopback IP must be rejected by resolve_and_validate"
1473        );
1474        let err = result.unwrap_err();
1475        assert!(matches!(err, crate::executor::ToolError::Blocked { .. }));
1476    }
1477
1478    #[tokio::test]
1479    async fn resolve_private_10_rejected() {
1480        let url = url::Url::parse("https://10.0.0.1/path").unwrap();
1481        let result = resolve_and_validate(&url).await;
1482        assert!(result.is_err());
1483        assert!(matches!(
1484            result.unwrap_err(),
1485            crate::executor::ToolError::Blocked { .. }
1486        ));
1487    }
1488
1489    #[tokio::test]
1490    async fn resolve_private_192_rejected() {
1491        let url = url::Url::parse("https://192.168.1.1/path").unwrap();
1492        let result = resolve_and_validate(&url).await;
1493        assert!(result.is_err());
1494        assert!(matches!(
1495            result.unwrap_err(),
1496            crate::executor::ToolError::Blocked { .. }
1497        ));
1498    }
1499
1500    #[tokio::test]
1501    async fn resolve_ipv6_loopback_rejected() {
1502        let url = url::Url::parse("https://[::1]/path").unwrap();
1503        let result = resolve_and_validate(&url).await;
1504        assert!(result.is_err());
1505        assert!(matches!(
1506            result.unwrap_err(),
1507            crate::executor::ToolError::Blocked { .. }
1508        ));
1509    }
1510
1511    #[tokio::test]
1512    async fn resolve_no_host_returns_ok() {
1513        // URL without a resolvable host — should pass through
1514        let url = url::Url::parse("https://example.com/path").unwrap();
1515        // We can't do a live DNS test, but we can verify a URL with no host
1516        let url_no_host = url::Url::parse("data:text/plain,hello").unwrap();
1517        // data: URLs have no host; resolve_and_validate should return Ok with empty addrs
1518        let result = resolve_and_validate(&url_no_host).await;
1519        assert!(result.is_ok());
1520        let (host, addrs) = result.unwrap();
1521        assert!(host.is_empty());
1522        assert!(addrs.is_empty());
1523        drop(url);
1524        drop(url_no_host);
1525    }
1526}