1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4use std::net::{IpAddr, SocketAddr};
5use std::time::Duration;
6
7use schemars::JsonSchema;
8use serde::Deserialize;
9use url::Url;
10
11use crate::config::ScrapeConfig;
12use crate::executor::{ToolCall, ToolError, ToolExecutor, ToolOutput, deserialize_params};
13
// One scrape request, deserialized from the JSON body of a ```scrape```
// fenced block or from structured tool-call params.
//
// Plain `//` comments are used at struct level deliberately: a `///` doc
// comment would be picked up by `schemars` and change the generated tool
// schema advertised in `tool_definitions`.
#[derive(Debug, Deserialize, JsonSchema)]
struct ScrapeInstruction {
    /// HTTPS URL to scrape
    url: String,
    /// CSS selector
    select: String,
    /// Extract mode: text, html, or attr:<name>
    #[serde(default = "default_extract")]
    extract: String,
    // Defaulted to 10 by `scrape_instruction` when absent.
    /// Max results to return
    limit: Option<usize>,
}
26
/// Serde default for `ScrapeInstruction::extract`: plain text extraction.
fn default_extract() -> String {
    String::from("text")
}
30
/// How a matched element is converted to an output string.
#[derive(Debug)]
enum ExtractMode {
    /// Concatenated text content of the element.
    Text,
    /// Inner HTML markup of the element.
    Html,
    /// Value of the named attribute, from an `attr:<name>` spec.
    Attr(String),
}

impl ExtractMode {
    /// Parses an extract-mode string: `"text"`, `"html"`, or `"attr:<name>"`.
    ///
    /// Unknown values fall back to [`Self::Text`], mirroring the serde
    /// default, so a malformed instruction degrades gracefully instead of
    /// erroring.
    fn parse(s: &str) -> Self {
        // `strip_prefix` both tests for and removes the prefix, replacing
        // the previous `starts_with` guard whose `unwrap_or(attr)` fallback
        // was unreachable dead code.
        if let Some(name) = s.strip_prefix("attr:") {
            return Self::Attr(name.to_owned());
        }
        match s {
            "html" => Self::Html,
            // "text" and anything unrecognized both mean text extraction.
            _ => Self::Text,
        }
    }
}
50
/// Extracts data from web pages via CSS selectors.
///
/// Detects ` ```scrape ` blocks in LLM responses containing JSON instructions,
/// fetches the URL, and parses HTML with `scrape-core`.
#[derive(Debug)]
pub struct WebScrapeExecutor {
    /// Per-request HTTP timeout, built from `ScrapeConfig::timeout` (seconds).
    timeout: Duration,
    /// Upper bound on accepted response body size, in bytes.
    max_body_bytes: usize,
}
60
impl WebScrapeExecutor {
    /// Builds an executor from the scrape section of the app config.
    #[must_use]
    pub fn new(config: &ScrapeConfig) -> Self {
        Self {
            timeout: Duration::from_secs(config.timeout),
            max_body_bytes: config.max_body_bytes,
        }
    }

    /// Builds a one-shot HTTP client pinned to `addrs` for `host`.
    ///
    /// Redirects are disabled so `fetch_html` can validate each hop itself,
    /// and `resolve_to_addrs` forces connections to the pre-validated
    /// addresses, closing the DNS-rebinding (TOCTOU) window.
    ///
    /// NOTE(review): `unwrap_or_default()` silently falls back to a stock
    /// `reqwest::Client` if the builder fails — that client follows redirects
    /// and does NOT pin addresses, dropping both SSRF safeguards. Builder
    /// failure is unlikely with these options, but consider propagating the
    /// error instead — TODO confirm.
    fn build_client(&self, host: &str, addrs: &[SocketAddr]) -> reqwest::Client {
        let mut builder = reqwest::Client::builder()
            .timeout(self.timeout)
            .redirect(reqwest::redirect::Policy::none());
        builder = builder.resolve_to_addrs(host, addrs);
        builder.build().unwrap_or_default()
    }
}
78
impl ToolExecutor for WebScrapeExecutor {
    /// Advertises the single `web_scrape` tool, invocable via ```scrape```
    /// fenced blocks whose body is a JSON `ScrapeInstruction`.
    fn tool_definitions(&self) -> Vec<crate::registry::ToolDef> {
        use crate::registry::{InvocationHint, ToolDef};
        vec![ToolDef {
            id: "web_scrape",
            description: "Scrape data from a web page via CSS selectors",
            schema: schemars::schema_for!(ScrapeInstruction),
            invocation: InvocationHint::FencedBlock("scrape"),
        }]
    }

    /// Executes every ```scrape``` block found in a free-form LLM response.
    ///
    /// Returns `Ok(None)` when the response contains no scrape blocks.
    /// Fails fast: the first malformed block or failed scrape aborts the rest.
    async fn execute(&self, response: &str) -> Result<Option<ToolOutput>, ToolError> {
        let blocks = extract_scrape_blocks(response);
        if blocks.is_empty() {
            return Ok(None);
        }

        let mut outputs = Vec::with_capacity(blocks.len());
        #[allow(clippy::cast_possible_truncation)]
        let blocks_executed = blocks.len() as u32;

        for block in &blocks {
            // Each block body must be a standalone JSON ScrapeInstruction.
            let instruction: ScrapeInstruction = serde_json::from_str(block).map_err(|e| {
                ToolError::Execution(std::io::Error::new(
                    std::io::ErrorKind::InvalidData,
                    e.to_string(),
                ))
            })?;
            outputs.push(self.scrape_instruction(&instruction).await?);
        }

        // NOTE(review): output `tool_name` is "web-scrape" while the tool id
        // is "web_scrape" — confirm the mismatch is intentional.
        Ok(Some(ToolOutput {
            tool_name: "web-scrape".to_owned(),
            summary: outputs.join("\n\n"),
            blocks_executed,
            filter_stats: None,
            diff: None,
            streamed: false,
        }))
    }

    /// Executes a structured tool call addressed to `web_scrape`.
    ///
    /// Returns `Ok(None)` for calls addressed to other tools so the
    /// dispatcher can try the next executor.
    async fn execute_tool_call(&self, call: &ToolCall) -> Result<Option<ToolOutput>, ToolError> {
        if call.tool_id != "web_scrape" {
            return Ok(None);
        }

        let instruction: ScrapeInstruction = deserialize_params(&call.params)?;

        let result = self.scrape_instruction(&instruction).await?;

        Ok(Some(ToolOutput {
            tool_name: "web-scrape".to_owned(),
            summary: result,
            blocks_executed: 1,
            filter_stats: None,
            diff: None,
            streamed: false,
        }))
    }
}
139
140impl WebScrapeExecutor {
141    async fn scrape_instruction(
142        &self,
143        instruction: &ScrapeInstruction,
144    ) -> Result<String, ToolError> {
145        let parsed = validate_url(&instruction.url)?;
146        let (host, addrs) = resolve_and_validate(&parsed).await?;
147        let html = self.fetch_html(&instruction.url, &host, &addrs).await?;
148        let selector = instruction.select.clone();
149        let extract = ExtractMode::parse(&instruction.extract);
150        let limit = instruction.limit.unwrap_or(10);
151        tokio::task::spawn_blocking(move || parse_and_extract(&html, &selector, &extract, limit))
152            .await
153            .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?
154    }
155
156    /// Fetches the HTML at `url`, manually following up to 3 redirects.
157    ///
158    /// Each redirect target is validated with `validate_url` and `resolve_and_validate`
159    /// before following, preventing SSRF via redirect chains.
160    ///
161    /// # Errors
162    ///
163    /// Returns `ToolError::Blocked` if any redirect target resolves to a private IP.
164    /// Returns `ToolError::Execution` on HTTP errors, too-large bodies, or too many redirects.
165    async fn fetch_html(
166        &self,
167        url: &str,
168        host: &str,
169        addrs: &[SocketAddr],
170    ) -> Result<String, ToolError> {
171        const MAX_REDIRECTS: usize = 3;
172
173        let mut current_url = url.to_owned();
174        let mut current_host = host.to_owned();
175        let mut current_addrs = addrs.to_vec();
176
177        for hop in 0..=MAX_REDIRECTS {
178            // Build a per-hop client pinned to the current hop's validated addresses.
179            let client = self.build_client(&current_host, &current_addrs);
180            let resp = client
181                .get(&current_url)
182                .send()
183                .await
184                .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
185
186            let status = resp.status();
187
188            if status.is_redirection() {
189                if hop == MAX_REDIRECTS {
190                    return Err(ToolError::Execution(std::io::Error::other(
191                        "too many redirects",
192                    )));
193                }
194
195                let location = resp
196                    .headers()
197                    .get(reqwest::header::LOCATION)
198                    .and_then(|v| v.to_str().ok())
199                    .ok_or_else(|| {
200                        ToolError::Execution(std::io::Error::other("redirect with no Location"))
201                    })?;
202
203                // Resolve relative redirect URLs against the current URL.
204                let base = Url::parse(&current_url)
205                    .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
206                let next_url = base
207                    .join(location)
208                    .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
209
210                let validated = validate_url(next_url.as_str())?;
211                let (next_host, next_addrs) = resolve_and_validate(&validated).await?;
212
213                current_url = next_url.to_string();
214                current_host = next_host;
215                current_addrs = next_addrs;
216                continue;
217            }
218
219            if !status.is_success() {
220                return Err(ToolError::Execution(std::io::Error::other(format!(
221                    "HTTP {status}",
222                ))));
223            }
224
225            let bytes = resp
226                .bytes()
227                .await
228                .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
229
230            if bytes.len() > self.max_body_bytes {
231                return Err(ToolError::Execution(std::io::Error::other(format!(
232                    "response too large: {} bytes (max: {})",
233                    bytes.len(),
234                    self.max_body_bytes,
235                ))));
236            }
237
238            return String::from_utf8(bytes.to_vec())
239                .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())));
240        }
241
242        Err(ToolError::Execution(std::io::Error::other(
243            "too many redirects",
244        )))
245    }
246}
247
/// Returns the contents of all ```scrape``` fenced blocks in `text`,
/// delegating to the shared fenced-block extractor.
fn extract_scrape_blocks(text: &str) -> Vec<&str> {
    crate::executor::extract_fenced_blocks(text, "scrape")
}
251
252fn validate_url(raw: &str) -> Result<Url, ToolError> {
253    let parsed = Url::parse(raw).map_err(|_| ToolError::Blocked {
254        command: format!("invalid URL: {raw}"),
255    })?;
256
257    if parsed.scheme() != "https" {
258        return Err(ToolError::Blocked {
259            command: format!("scheme not allowed: {}", parsed.scheme()),
260        });
261    }
262
263    if let Some(host) = parsed.host()
264        && is_private_host(&host)
265    {
266        return Err(ToolError::Blocked {
267            command: format!(
268                "private/local host blocked: {}",
269                parsed.host_str().unwrap_or("")
270            ),
271        });
272    }
273
274    Ok(parsed)
275}
276
/// Returns `true` for IP addresses that must never be scraped: loopback,
/// RFC 1918 private, link-local, broadcast, "this network" (0.0.0.0/8,
/// which includes unspecified), RFC 6598 CGNAT space (100.64.0.0/10),
/// IPv6 loopback/unspecified/link-local/unique-local, and IPv4-mapped
/// IPv6 forms of any of the above.
pub(crate) fn is_private_ip(ip: IpAddr) -> bool {
    match ip {
        IpAddr::V4(v4) => is_private_v4(v4),
        IpAddr::V6(v6) => {
            if v6.is_loopback() || v6.is_unspecified() {
                return true;
            }
            let seg = v6.segments();
            // fe80::/10 — link-local
            if seg[0] & 0xffc0 == 0xfe80 {
                return true;
            }
            // fc00::/7 — unique local
            if seg[0] & 0xfe00 == 0xfc00 {
                return true;
            }
            // ::ffff:x.x.x.x — IPv4-mapped: apply the IPv4 rules to the
            // embedded address (returns None outside the mapped range, so
            // the previous unreachable `unwrap_or` fallback is gone).
            if let Some(v4) = v6.to_ipv4_mapped() {
                return is_private_v4(v4);
            }
            false
        }
    }
}

/// IPv4 half of `is_private_ip`, shared with the IPv4-mapped IPv6 path.
fn is_private_v4(v4: std::net::Ipv4Addr) -> bool {
    let octets = v4.octets();
    v4.is_loopback()
        || v4.is_private()
        || v4.is_link_local()
        || v4.is_broadcast()
        // 0.0.0.0/8 "this network" (covers unspecified too): on Linux,
        // addresses like 0.0.0.1 can route to localhost — a known SSRF bypass.
        || octets[0] == 0
        // 100.64.0.0/10 — RFC 6598 shared (CGNAT) space, internal in many clouds.
        || (octets[0] == 100 && octets[1] & 0xc0 == 64)
}
314
315fn is_private_host(host: &url::Host<&str>) -> bool {
316    match host {
317        url::Host::Domain(d) => {
318            // Exact match or subdomain of localhost (e.g. foo.localhost)
319            // and .internal/.local TLDs used in cloud/k8s environments.
320            #[allow(clippy::case_sensitive_file_extension_comparisons)]
321            {
322                *d == "localhost"
323                    || d.ends_with(".localhost")
324                    || d.ends_with(".internal")
325                    || d.ends_with(".local")
326            }
327        }
328        url::Host::Ipv4(v4) => is_private_ip(IpAddr::V4(*v4)),
329        url::Host::Ipv6(v6) => is_private_ip(IpAddr::V6(*v6)),
330    }
331}
332
333/// Resolves DNS for the URL host, validates all resolved IPs against private ranges,
334/// and returns the hostname and validated socket addresses.
335///
336/// Returning the addresses allows the caller to pin the HTTP client to these exact
337/// addresses, eliminating TOCTOU between DNS validation and the actual connection.
338async fn resolve_and_validate(url: &Url) -> Result<(String, Vec<SocketAddr>), ToolError> {
339    let Some(host) = url.host_str() else {
340        return Ok((String::new(), vec![]));
341    };
342    let port = url.port_or_known_default().unwrap_or(443);
343    let addrs: Vec<SocketAddr> = tokio::net::lookup_host(format!("{host}:{port}"))
344        .await
345        .map_err(|e| ToolError::Blocked {
346            command: format!("DNS resolution failed: {e}"),
347        })?
348        .collect();
349    for addr in &addrs {
350        if is_private_ip(addr.ip()) {
351            return Err(ToolError::Blocked {
352                command: format!("SSRF protection: private IP {} for host {host}", addr.ip()),
353            });
354        }
355    }
356    Ok((host.to_owned(), addrs))
357}
358
359fn parse_and_extract(
360    html: &str,
361    selector: &str,
362    extract: &ExtractMode,
363    limit: usize,
364) -> Result<String, ToolError> {
365    let soup = scrape_core::Soup::parse(html);
366
367    let tags = soup.find_all(selector).map_err(|e| {
368        ToolError::Execution(std::io::Error::new(
369            std::io::ErrorKind::InvalidData,
370            format!("invalid selector: {e}"),
371        ))
372    })?;
373
374    let mut results = Vec::new();
375
376    for tag in tags.into_iter().take(limit) {
377        let value = match extract {
378            ExtractMode::Text => tag.text(),
379            ExtractMode::Html => tag.inner_html(),
380            ExtractMode::Attr(name) => tag.get(name).unwrap_or_default().to_owned(),
381        };
382        if !value.trim().is_empty() {
383            results.push(value.trim().to_owned());
384        }
385    }
386
387    if results.is_empty() {
388        Ok(format!("No results for selector: {selector}"))
389    } else {
390        Ok(results.join("\n"))
391    }
392}
393
394#[cfg(test)]
395mod tests {
396    use super::*;
397
398    // --- extract_scrape_blocks ---
399
    // A lone fenced block is found and its JSON payload returned verbatim.
    #[test]
    fn extract_single_block() {
        let text =
            "Here:\n```scrape\n{\"url\":\"https://example.com\",\"select\":\"h1\"}\n```\nDone.";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 1);
        assert!(blocks[0].contains("example.com"));
    }

    // Multiple blocks are returned in document order.
    #[test]
    fn extract_multiple_blocks() {
        let text = "```scrape\n{\"url\":\"https://a.com\",\"select\":\"h1\"}\n```\ntext\n```scrape\n{\"url\":\"https://b.com\",\"select\":\"p\"}\n```";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 2);
    }

    #[test]
    fn no_blocks_returns_empty() {
        let blocks = extract_scrape_blocks("plain text, no code blocks");
        assert!(blocks.is_empty());
    }

    // A block with no closing fence must not be treated as content.
    #[test]
    fn unclosed_block_ignored() {
        let blocks = extract_scrape_blocks("```scrape\n{\"url\":\"https://x.com\"}");
        assert!(blocks.is_empty());
    }

    // Only ```scrape``` fences count; other fence languages are skipped.
    #[test]
    fn non_scrape_block_ignored() {
        let text =
            "```bash\necho hi\n```\n```scrape\n{\"url\":\"https://x.com\",\"select\":\"h1\"}\n```";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 1);
        assert!(blocks[0].contains("x.com"));
    }

    // Pretty-printed (multi-line) JSON inside a block still deserializes.
    #[test]
    fn multiline_json_block() {
        let text =
            "```scrape\n{\n  \"url\": \"https://example.com\",\n  \"select\": \"h1\"\n}\n```";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 1);
        let instr: ScrapeInstruction = serde_json::from_str(blocks[0]).unwrap();
        assert_eq!(instr.url, "https://example.com");
    }
446
447    // --- ScrapeInstruction parsing ---
448
    #[test]
    fn parse_valid_instruction() {
        let json = r#"{"url":"https://example.com","select":"h1","extract":"text","limit":5}"#;
        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
        assert_eq!(instr.url, "https://example.com");
        assert_eq!(instr.select, "h1");
        assert_eq!(instr.extract, "text");
        assert_eq!(instr.limit, Some(5));
    }

    // Omitted fields: `extract` defaults to "text", `limit` stays None.
    #[test]
    fn parse_minimal_instruction() {
        let json = r#"{"url":"https://example.com","select":"p"}"#;
        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
        assert_eq!(instr.extract, "text");
        assert!(instr.limit.is_none());
    }

    // `extract` is kept raw here; ExtractMode::parse interprets it later.
    #[test]
    fn parse_attr_extract() {
        let json = r#"{"url":"https://example.com","select":"a","extract":"attr:href"}"#;
        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
        assert_eq!(instr.extract, "attr:href");
    }

    #[test]
    fn parse_invalid_json_errors() {
        let result = serde_json::from_str::<ScrapeInstruction>("not json");
        assert!(result.is_err());
    }
479
480    // --- ExtractMode ---
481
    #[test]
    fn extract_mode_text() {
        assert!(matches!(ExtractMode::parse("text"), ExtractMode::Text));
    }

    #[test]
    fn extract_mode_html() {
        assert!(matches!(ExtractMode::parse("html"), ExtractMode::Html));
    }

    // "attr:<name>" carries the attribute name through.
    #[test]
    fn extract_mode_attr() {
        let mode = ExtractMode::parse("attr:href");
        assert!(matches!(mode, ExtractMode::Attr(ref s) if s == "href"));
    }

    // Unrecognized modes degrade to text extraction rather than erroring.
    #[test]
    fn extract_mode_unknown_defaults_to_text() {
        assert!(matches!(ExtractMode::parse("unknown"), ExtractMode::Text));
    }
502
503    // --- validate_url ---
504
    #[test]
    fn valid_https_url() {
        assert!(validate_url("https://example.com").is_ok());
    }

    // Only https is allowed; every other scheme is blocked.
    #[test]
    fn http_rejected() {
        let err = validate_url("http://example.com").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ftp_rejected() {
        let err = validate_url("ftp://files.example.com").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn file_rejected() {
        let err = validate_url("file:///etc/passwd").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn invalid_url_rejected() {
        let err = validate_url("not a url").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    // SSRF guard: local and private destinations are rejected up front.
    #[test]
    fn localhost_blocked() {
        let err = validate_url("https://localhost/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn loopback_ip_blocked() {
        let err = validate_url("https://127.0.0.1/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn private_10_blocked() {
        let err = validate_url("https://10.0.0.1/api").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn private_172_blocked() {
        let err = validate_url("https://172.16.0.1/api").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn private_192_blocked() {
        let err = validate_url("https://192.168.1.1/api").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv6_loopback_blocked() {
        let err = validate_url("https://[::1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    // A routable public IP literal passes the syntactic check.
    #[test]
    fn public_ip_allowed() {
        assert!(validate_url("https://93.184.216.34/page").is_ok());
    }
574
575    // --- parse_and_extract ---
576
    #[test]
    fn extract_text_from_html() {
        let html = "<html><body><h1>Hello World</h1><p>Content</p></body></html>";
        let result = parse_and_extract(html, "h1", &ExtractMode::Text, 10).unwrap();
        assert_eq!(result, "Hello World");
    }

    // Multiple matches are newline-joined in document order.
    #[test]
    fn extract_multiple_elements() {
        let html = "<ul><li>A</li><li>B</li><li>C</li></ul>";
        let result = parse_and_extract(html, "li", &ExtractMode::Text, 10).unwrap();
        assert_eq!(result, "A\nB\nC");
    }

    #[test]
    fn extract_with_limit() {
        let html = "<ul><li>A</li><li>B</li><li>C</li></ul>";
        let result = parse_and_extract(html, "li", &ExtractMode::Text, 2).unwrap();
        assert_eq!(result, "A\nB");
    }

    #[test]
    fn extract_attr_href() {
        let html = r#"<a href="https://example.com">Link</a>"#;
        let result =
            parse_and_extract(html, "a", &ExtractMode::Attr("href".to_owned()), 10).unwrap();
        assert_eq!(result, "https://example.com");
    }

    #[test]
    fn extract_inner_html() {
        let html = "<div><span>inner</span></div>";
        let result = parse_and_extract(html, "div", &ExtractMode::Html, 10).unwrap();
        assert!(result.contains("<span>inner</span>"));
    }

    // No matches yields a human-readable message, not an error.
    #[test]
    fn no_matches_returns_message() {
        let html = "<html><body><p>text</p></body></html>";
        let result = parse_and_extract(html, "h1", &ExtractMode::Text, 10).unwrap();
        assert!(result.starts_with("No results for selector:"));
    }

    // Whitespace-only matches are filtered out of the results.
    #[test]
    fn empty_text_skipped() {
        let html = "<ul><li>  </li><li>A</li></ul>";
        let result = parse_and_extract(html, "li", &ExtractMode::Text, 10).unwrap();
        assert_eq!(result, "A");
    }

    #[test]
    fn invalid_selector_errors() {
        let html = "<html><body></body></html>";
        let result = parse_and_extract(html, "[[[invalid", &ExtractMode::Text, 10);
        assert!(result.is_err());
    }

    #[test]
    fn empty_html_returns_no_results() {
        let result = parse_and_extract("", "h1", &ExtractMode::Text, 10).unwrap();
        assert!(result.starts_with("No results for selector:"));
    }

    // Combinators such as child selectors are supported.
    #[test]
    fn nested_selector() {
        let html = "<div><span>inner</span></div><span>outer</span>";
        let result = parse_and_extract(html, "div > span", &ExtractMode::Text, 10).unwrap();
        assert_eq!(result, "inner");
    }

    // A missing attribute extracts as empty and is then filtered out.
    #[test]
    fn attr_missing_returns_empty() {
        let html = r#"<a>No href</a>"#;
        let result =
            parse_and_extract(html, "a", &ExtractMode::Attr("href".to_owned()), 10).unwrap();
        assert!(result.starts_with("No results for selector:"));
    }

    #[test]
    fn extract_html_mode() {
        let html = "<div><b>bold</b> text</div>";
        let result = parse_and_extract(html, "div", &ExtractMode::Html, 10).unwrap();
        assert!(result.contains("<b>bold</b>"));
    }

    // limit == 0 means "take nothing", producing the no-results message.
    #[test]
    fn limit_zero_returns_no_results() {
        let html = "<ul><li>A</li><li>B</li></ul>";
        let result = parse_and_extract(html, "li", &ExtractMode::Text, 0).unwrap();
        assert!(result.starts_with("No results for selector:"));
    }
668
669    // --- validate_url edge cases ---
670
    #[test]
    fn url_with_port_allowed() {
        assert!(validate_url("https://example.com:8443/path").is_ok());
    }

    // 169.254.0.0/16 covers cloud metadata endpoints (169.254.169.254).
    #[test]
    fn link_local_ip_blocked() {
        let err = validate_url("https://169.254.1.1/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn url_no_scheme_rejected() {
        let err = validate_url("example.com/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn unspecified_ipv4_blocked() {
        let err = validate_url("https://0.0.0.0/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn broadcast_ipv4_blocked() {
        let err = validate_url("https://255.255.255.255/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv6_link_local_blocked() {
        let err = validate_url("https://[fe80::1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv6_unique_local_blocked() {
        let err = validate_url("https://[fd12::1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    // IPv4-mapped IPv6 must not bypass the inner-IPv4 private checks.
    #[test]
    fn ipv4_mapped_ipv6_loopback_blocked() {
        let err = validate_url("https://[::ffff:127.0.0.1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv4_mapped_ipv6_private_blocked() {
        let err = validate_url("https://[::ffff:10.0.0.1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }
723
724    // --- WebScrapeExecutor (no-network) ---
725
    #[tokio::test]
    async fn executor_no_blocks_returns_none() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let result = executor.execute("plain text").await;
        assert!(result.unwrap().is_none());
    }

    #[tokio::test]
    async fn executor_invalid_json_errors() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\nnot json\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Execution(_))));
    }

    // Scheme violations surface as Blocked, before any network I/O.
    #[tokio::test]
    async fn executor_blocked_url_errors() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"http://example.com\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Blocked { .. })));
    }

    #[tokio::test]
    async fn executor_private_ip_blocked() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"https://192.168.1.1/api\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Blocked { .. })));
    }

    // 192.0.2.0/24 (TEST-NET-1) is public-but-unroutable: it passes
    // validation, then fails at connect time within the 1-second timeout.
    #[tokio::test]
    async fn executor_unreachable_host_returns_error() {
        let config = ScrapeConfig {
            timeout: 1,
            max_body_bytes: 1_048_576,
        };
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"https://192.0.2.1:1/page\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Execution(_))));
    }

    #[tokio::test]
    async fn executor_localhost_url_blocked() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"https://localhost:9999/api\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Blocked { .. })));
    }

    #[tokio::test]
    async fn executor_empty_text_returns_none() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let result = executor.execute("").await;
        assert!(result.unwrap().is_none());
    }

    // Fail-fast: a blocked first instruction aborts the whole batch.
    #[tokio::test]
    async fn executor_multiple_blocks_first_blocked() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"http://evil.com\",\"select\":\"h1\"}\n```\n\
             ```scrape\n{\"url\":\"https://ok.com\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(result.is_err());
    }
799
    #[test]
    fn validate_url_empty_string() {
        let err = validate_url("").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn validate_url_javascript_scheme_blocked() {
        let err = validate_url("javascript:alert(1)").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn validate_url_data_scheme_blocked() {
        let err = validate_url("data:text/html,<h1>hi</h1>").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn is_private_host_public_domain_is_false() {
        let host: url::Host<&str> = url::Host::Domain("example.com");
        assert!(!is_private_host(&host));
    }

    #[test]
    fn is_private_host_localhost_is_true() {
        let host: url::Host<&str> = url::Host::Domain("localhost");
        assert!(is_private_host(&host));
    }

    #[test]
    fn is_private_host_ipv6_unspecified_is_true() {
        let host = url::Host::Ipv6(std::net::Ipv6Addr::UNSPECIFIED);
        assert!(is_private_host(&host));
    }

    // 2001:db8::/32 is the IPv6 documentation range — treated as public here.
    #[test]
    fn is_private_host_public_ipv6_is_false() {
        let host = url::Host::Ipv6("2001:db8::1".parse().unwrap());
        assert!(!is_private_host(&host));
    }
841
842    // --- fetch_html redirect logic: wiremock HTTP server tests ---
843    //
844    // These tests use a local wiremock server to exercise the redirect-following logic
845    // in `fetch_html` without requiring an external HTTPS connection. The server binds to
846    // 127.0.0.1, and tests call `fetch_html` directly (bypassing `validate_url`) to avoid
847    // the SSRF guard that would otherwise block loopback connections.
848
    /// Helper: returns executor + (server_url, server_addr) from a running wiremock mock server.
    /// The server address is passed to `fetch_html` via `resolve_to_addrs` so the client
    /// connects to the mock instead of doing a real DNS lookup.
    async fn mock_server_executor() -> (WebScrapeExecutor, wiremock::MockServer) {
        let server = wiremock::MockServer::start().await;
        // Fields set directly (not via ScrapeConfig) to pin test-friendly values.
        let executor = WebScrapeExecutor {
            timeout: Duration::from_secs(5),
            max_body_bytes: 1_048_576,
        };
        (executor, server)
    }

    /// Parses the mock server's URI into (host_str, socket_addr) for use with `build_client`.
    fn server_host_and_addr(server: &wiremock::MockServer) -> (String, Vec<std::net::SocketAddr>) {
        let uri = server.uri();
        let url = Url::parse(&uri).unwrap();
        let host = url.host_str().unwrap_or("127.0.0.1").to_owned();
        // wiremock serves plain HTTP, so default to port 80 when unspecified.
        let port = url.port().unwrap_or(80);
        let addr: std::net::SocketAddr = format!("{host}:{port}").parse().unwrap();
        (host, vec![addr])
    }
870
871    /// Test-only redirect follower that mimics `fetch_html`'s loop but skips `validate_url` /
872    /// `resolve_and_validate`. This lets us exercise the redirect-counting and
873    /// missing-Location logic against a plain HTTP wiremock server.
874    async fn follow_redirects_raw(
875        executor: &WebScrapeExecutor,
876        start_url: &str,
877        host: &str,
878        addrs: &[std::net::SocketAddr],
879    ) -> Result<String, ToolError> {
880        const MAX_REDIRECTS: usize = 3;
881        let mut current_url = start_url.to_owned();
882        let mut current_host = host.to_owned();
883        let mut current_addrs = addrs.to_vec();
884
885        for hop in 0..=MAX_REDIRECTS {
886            let client = executor.build_client(&current_host, &current_addrs);
887            let resp = client
888                .get(&current_url)
889                .send()
890                .await
891                .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
892
893            let status = resp.status();
894
895            if status.is_redirection() {
896                if hop == MAX_REDIRECTS {
897                    return Err(ToolError::Execution(std::io::Error::other(
898                        "too many redirects",
899                    )));
900                }
901
902                let location = resp
903                    .headers()
904                    .get(reqwest::header::LOCATION)
905                    .and_then(|v| v.to_str().ok())
906                    .ok_or_else(|| {
907                        ToolError::Execution(std::io::Error::other("redirect with no Location"))
908                    })?;
909
910                let base = Url::parse(&current_url)
911                    .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
912                let next_url = base
913                    .join(location)
914                    .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
915
916                // Re-use same host/addrs (mock server is always the same endpoint).
917                current_url = next_url.to_string();
918                // Preserve host/addrs as-is since the mock server doesn't change.
919                let _ = &mut current_host;
920                let _ = &mut current_addrs;
921                continue;
922            }
923
924            if !status.is_success() {
925                return Err(ToolError::Execution(std::io::Error::other(format!(
926                    "HTTP {status}",
927                ))));
928            }
929
930            let bytes = resp
931                .bytes()
932                .await
933                .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
934
935            if bytes.len() > executor.max_body_bytes {
936                return Err(ToolError::Execution(std::io::Error::other(format!(
937                    "response too large: {} bytes (max: {})",
938                    bytes.len(),
939                    executor.max_body_bytes,
940                ))));
941            }
942
943            return String::from_utf8(bytes.to_vec())
944                .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())));
945        }
946
947        Err(ToolError::Execution(std::io::Error::other(
948            "too many redirects",
949        )))
950    }
951
952    #[tokio::test]
953    async fn fetch_html_success_returns_body() {
954        use wiremock::matchers::{method, path};
955        use wiremock::{Mock, ResponseTemplate};
956
957        let (executor, server) = mock_server_executor().await;
958        Mock::given(method("GET"))
959            .and(path("/page"))
960            .respond_with(ResponseTemplate::new(200).set_body_string("<h1>OK</h1>"))
961            .mount(&server)
962            .await;
963
964        let (host, addrs) = server_host_and_addr(&server);
965        let url = format!("{}/page", server.uri());
966        let result = executor.fetch_html(&url, &host, &addrs).await;
967        assert!(result.is_ok(), "expected Ok, got: {result:?}");
968        assert_eq!(result.unwrap(), "<h1>OK</h1>");
969    }
970
971    #[tokio::test]
972    async fn fetch_html_non_2xx_returns_error() {
973        use wiremock::matchers::{method, path};
974        use wiremock::{Mock, ResponseTemplate};
975
976        let (executor, server) = mock_server_executor().await;
977        Mock::given(method("GET"))
978            .and(path("/forbidden"))
979            .respond_with(ResponseTemplate::new(403))
980            .mount(&server)
981            .await;
982
983        let (host, addrs) = server_host_and_addr(&server);
984        let url = format!("{}/forbidden", server.uri());
985        let result = executor.fetch_html(&url, &host, &addrs).await;
986        assert!(result.is_err());
987        let msg = result.unwrap_err().to_string();
988        assert!(msg.contains("403"), "expected 403 in error: {msg}");
989    }
990
991    #[tokio::test]
992    async fn fetch_html_404_returns_error() {
993        use wiremock::matchers::{method, path};
994        use wiremock::{Mock, ResponseTemplate};
995
996        let (executor, server) = mock_server_executor().await;
997        Mock::given(method("GET"))
998            .and(path("/missing"))
999            .respond_with(ResponseTemplate::new(404))
1000            .mount(&server)
1001            .await;
1002
1003        let (host, addrs) = server_host_and_addr(&server);
1004        let url = format!("{}/missing", server.uri());
1005        let result = executor.fetch_html(&url, &host, &addrs).await;
1006        assert!(result.is_err());
1007        let msg = result.unwrap_err().to_string();
1008        assert!(msg.contains("404"), "expected 404 in error: {msg}");
1009    }
1010
1011    #[tokio::test]
1012    async fn fetch_html_redirect_no_location_returns_error() {
1013        use wiremock::matchers::{method, path};
1014        use wiremock::{Mock, ResponseTemplate};
1015
1016        let (executor, server) = mock_server_executor().await;
1017        // 302 with no Location header
1018        Mock::given(method("GET"))
1019            .and(path("/redirect-no-loc"))
1020            .respond_with(ResponseTemplate::new(302))
1021            .mount(&server)
1022            .await;
1023
1024        let (host, addrs) = server_host_and_addr(&server);
1025        let url = format!("{}/redirect-no-loc", server.uri());
1026        let result = executor.fetch_html(&url, &host, &addrs).await;
1027        assert!(result.is_err());
1028        let msg = result.unwrap_err().to_string();
1029        assert!(
1030            msg.contains("Location") || msg.contains("location"),
1031            "expected Location-related error: {msg}"
1032        );
1033    }
1034
1035    #[tokio::test]
1036    async fn fetch_html_single_redirect_followed() {
1037        use wiremock::matchers::{method, path};
1038        use wiremock::{Mock, ResponseTemplate};
1039
1040        let (executor, server) = mock_server_executor().await;
1041        let final_url = format!("{}/final", server.uri());
1042
1043        Mock::given(method("GET"))
1044            .and(path("/start"))
1045            .respond_with(ResponseTemplate::new(302).insert_header("location", final_url.as_str()))
1046            .mount(&server)
1047            .await;
1048
1049        Mock::given(method("GET"))
1050            .and(path("/final"))
1051            .respond_with(ResponseTemplate::new(200).set_body_string("<p>final</p>"))
1052            .mount(&server)
1053            .await;
1054
1055        let (host, addrs) = server_host_and_addr(&server);
1056        let url = format!("{}/start", server.uri());
1057        let result = follow_redirects_raw(&executor, &url, &host, &addrs).await;
1058        assert!(result.is_ok(), "single redirect should succeed: {result:?}");
1059        assert_eq!(result.unwrap(), "<p>final</p>");
1060    }
1061
1062    #[tokio::test]
1063    async fn fetch_html_three_redirects_allowed() {
1064        use wiremock::matchers::{method, path};
1065        use wiremock::{Mock, ResponseTemplate};
1066
1067        let (executor, server) = mock_server_executor().await;
1068        let hop2 = format!("{}/hop2", server.uri());
1069        let hop3 = format!("{}/hop3", server.uri());
1070        let final_dest = format!("{}/done", server.uri());
1071
1072        Mock::given(method("GET"))
1073            .and(path("/hop1"))
1074            .respond_with(ResponseTemplate::new(301).insert_header("location", hop2.as_str()))
1075            .mount(&server)
1076            .await;
1077        Mock::given(method("GET"))
1078            .and(path("/hop2"))
1079            .respond_with(ResponseTemplate::new(301).insert_header("location", hop3.as_str()))
1080            .mount(&server)
1081            .await;
1082        Mock::given(method("GET"))
1083            .and(path("/hop3"))
1084            .respond_with(ResponseTemplate::new(301).insert_header("location", final_dest.as_str()))
1085            .mount(&server)
1086            .await;
1087        Mock::given(method("GET"))
1088            .and(path("/done"))
1089            .respond_with(ResponseTemplate::new(200).set_body_string("<p>done</p>"))
1090            .mount(&server)
1091            .await;
1092
1093        let (host, addrs) = server_host_and_addr(&server);
1094        let url = format!("{}/hop1", server.uri());
1095        let result = follow_redirects_raw(&executor, &url, &host, &addrs).await;
1096        assert!(result.is_ok(), "3 redirects should succeed: {result:?}");
1097        assert_eq!(result.unwrap(), "<p>done</p>");
1098    }
1099
1100    #[tokio::test]
1101    async fn fetch_html_four_redirects_rejected() {
1102        use wiremock::matchers::{method, path};
1103        use wiremock::{Mock, ResponseTemplate};
1104
1105        let (executor, server) = mock_server_executor().await;
1106        let hop2 = format!("{}/r2", server.uri());
1107        let hop3 = format!("{}/r3", server.uri());
1108        let hop4 = format!("{}/r4", server.uri());
1109        let hop5 = format!("{}/r5", server.uri());
1110
1111        for (from, to) in [
1112            ("/r1", &hop2),
1113            ("/r2", &hop3),
1114            ("/r3", &hop4),
1115            ("/r4", &hop5),
1116        ] {
1117            Mock::given(method("GET"))
1118                .and(path(from))
1119                .respond_with(ResponseTemplate::new(301).insert_header("location", to.as_str()))
1120                .mount(&server)
1121                .await;
1122        }
1123
1124        let (host, addrs) = server_host_and_addr(&server);
1125        let url = format!("{}/r1", server.uri());
1126        let result = follow_redirects_raw(&executor, &url, &host, &addrs).await;
1127        assert!(result.is_err(), "4 redirects should be rejected");
1128        let msg = result.unwrap_err().to_string();
1129        assert!(
1130            msg.contains("redirect"),
1131            "expected redirect-related error: {msg}"
1132        );
1133    }
1134
1135    #[tokio::test]
1136    async fn fetch_html_body_too_large_returns_error() {
1137        use wiremock::matchers::{method, path};
1138        use wiremock::{Mock, ResponseTemplate};
1139
1140        let small_limit_executor = WebScrapeExecutor {
1141            timeout: Duration::from_secs(5),
1142            max_body_bytes: 10,
1143        };
1144        let server = wiremock::MockServer::start().await;
1145        Mock::given(method("GET"))
1146            .and(path("/big"))
1147            .respond_with(
1148                ResponseTemplate::new(200)
1149                    .set_body_string("this body is definitely longer than ten bytes"),
1150            )
1151            .mount(&server)
1152            .await;
1153
1154        let (host, addrs) = server_host_and_addr(&server);
1155        let url = format!("{}/big", server.uri());
1156        let result = small_limit_executor.fetch_html(&url, &host, &addrs).await;
1157        assert!(result.is_err());
1158        let msg = result.unwrap_err().to_string();
1159        assert!(msg.contains("too large"), "expected too-large error: {msg}");
1160    }
1161
1162    #[test]
1163    fn extract_scrape_blocks_empty_block_content() {
1164        let text = "```scrape\n\n```";
1165        let blocks = extract_scrape_blocks(text);
1166        assert_eq!(blocks.len(), 1);
1167        assert!(blocks[0].is_empty());
1168    }
1169
1170    #[test]
1171    fn extract_scrape_blocks_whitespace_only() {
1172        let text = "```scrape\n   \n```";
1173        let blocks = extract_scrape_blocks(text);
1174        assert_eq!(blocks.len(), 1);
1175    }
1176
1177    #[test]
1178    fn parse_and_extract_multiple_selectors() {
1179        let html = "<div><h1>Title</h1><p>Para</p></div>";
1180        let result = parse_and_extract(html, "h1, p", &ExtractMode::Text, 10).unwrap();
1181        assert!(result.contains("Title"));
1182        assert!(result.contains("Para"));
1183    }
1184
1185    #[test]
1186    fn webscrape_executor_new_with_custom_config() {
1187        let config = ScrapeConfig {
1188            timeout: 60,
1189            max_body_bytes: 512,
1190        };
1191        let executor = WebScrapeExecutor::new(&config);
1192        assert_eq!(executor.max_body_bytes, 512);
1193    }
1194
1195    #[test]
1196    fn webscrape_executor_debug() {
1197        let config = ScrapeConfig::default();
1198        let executor = WebScrapeExecutor::new(&config);
1199        let dbg = format!("{executor:?}");
1200        assert!(dbg.contains("WebScrapeExecutor"));
1201    }
1202
1203    #[test]
1204    fn extract_mode_attr_empty_name() {
1205        let mode = ExtractMode::parse("attr:");
1206        assert!(matches!(mode, ExtractMode::Attr(ref s) if s.is_empty()));
1207    }
1208
1209    #[test]
1210    fn default_extract_returns_text() {
1211        assert_eq!(default_extract(), "text");
1212    }
1213
1214    #[test]
1215    fn scrape_instruction_debug() {
1216        let json = r#"{"url":"https://example.com","select":"h1"}"#;
1217        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
1218        let dbg = format!("{instr:?}");
1219        assert!(dbg.contains("ScrapeInstruction"));
1220    }
1221
1222    #[test]
1223    fn extract_mode_debug() {
1224        let mode = ExtractMode::Text;
1225        let dbg = format!("{mode:?}");
1226        assert!(dbg.contains("Text"));
1227    }
1228
1229    // --- fetch_html redirect logic: constant and validation unit tests ---
1230
1231    /// MAX_REDIRECTS is 3; the 4th redirect attempt must be rejected.
1232    /// Verify the boundary is correct by inspecting the constant value.
1233    #[test]
1234    fn max_redirects_constant_is_three() {
1235        // fetch_html uses `for hop in 0..=MAX_REDIRECTS` and returns error when hop == MAX_REDIRECTS
1236        // while still in a redirect. That means hops 0,1,2 can redirect; hop 3 triggers the error.
1237        // This test documents the expected limit.
1238        const MAX_REDIRECTS: usize = 3;
1239        assert_eq!(MAX_REDIRECTS, 3, "fetch_html allows exactly 3 redirects");
1240    }
1241
1242    /// Verifies that a Location-less redirect would produce an error string containing the
1243    /// expected message, matching the error path in fetch_html.
1244    #[test]
1245    fn redirect_no_location_error_message() {
1246        let err = std::io::Error::other("redirect with no Location");
1247        assert!(err.to_string().contains("redirect with no Location"));
1248    }
1249
1250    /// Verifies that a too-many-redirects condition produces the expected error string.
1251    #[test]
1252    fn too_many_redirects_error_message() {
1253        let err = std::io::Error::other("too many redirects");
1254        assert!(err.to_string().contains("too many redirects"));
1255    }
1256
1257    /// Verifies that a non-2xx HTTP status produces an error message with the status code.
1258    #[test]
1259    fn non_2xx_status_error_format() {
1260        let status = reqwest::StatusCode::FORBIDDEN;
1261        let msg = format!("HTTP {status}");
1262        assert!(msg.contains("403"));
1263    }
1264
1265    /// Verifies that a 404 response status code formats into the expected error message.
1266    #[test]
1267    fn not_found_status_error_format() {
1268        let status = reqwest::StatusCode::NOT_FOUND;
1269        let msg = format!("HTTP {status}");
1270        assert!(msg.contains("404"));
1271    }
1272
1273    /// Verifies relative redirect resolution for same-host paths (simulates Location: /other).
1274    #[test]
1275    fn relative_redirect_same_host_path() {
1276        let base = Url::parse("https://example.com/current").unwrap();
1277        let resolved = base.join("/other").unwrap();
1278        assert_eq!(resolved.as_str(), "https://example.com/other");
1279    }
1280
1281    /// Verifies relative redirect resolution preserves scheme and host.
1282    #[test]
1283    fn relative_redirect_relative_path() {
1284        let base = Url::parse("https://example.com/a/b").unwrap();
1285        let resolved = base.join("c").unwrap();
1286        assert_eq!(resolved.as_str(), "https://example.com/a/c");
1287    }
1288
1289    /// Verifies that an absolute redirect URL overrides base URL completely.
1290    #[test]
1291    fn absolute_redirect_overrides_base() {
1292        let base = Url::parse("https://example.com/page").unwrap();
1293        let resolved = base.join("https://other.com/target").unwrap();
1294        assert_eq!(resolved.as_str(), "https://other.com/target");
1295    }
1296
1297    /// Verifies that a redirect Location of http:// (downgrade) is rejected.
1298    #[test]
1299    fn redirect_http_downgrade_rejected() {
1300        let location = "http://example.com/page";
1301        let base = Url::parse("https://example.com/start").unwrap();
1302        let next = base.join(location).unwrap();
1303        let err = validate_url(next.as_str()).unwrap_err();
1304        assert!(matches!(err, ToolError::Blocked { .. }));
1305    }
1306
1307    /// Verifies that a redirect to a private IP literal is blocked.
1308    #[test]
1309    fn redirect_location_private_ip_blocked() {
1310        let location = "https://192.168.100.1/admin";
1311        let base = Url::parse("https://example.com/start").unwrap();
1312        let next = base.join(location).unwrap();
1313        let err = validate_url(next.as_str()).unwrap_err();
1314        assert!(matches!(err, ToolError::Blocked { .. }));
1315        let cmd = match err {
1316            ToolError::Blocked { command } => command,
1317            _ => panic!("expected Blocked"),
1318        };
1319        assert!(
1320            cmd.contains("private") || cmd.contains("scheme"),
1321            "error message should describe the block reason: {cmd}"
1322        );
1323    }
1324
1325    /// Verifies that a redirect to a .internal domain is blocked.
1326    #[test]
1327    fn redirect_location_internal_domain_blocked() {
1328        let location = "https://metadata.internal/latest/meta-data/";
1329        let base = Url::parse("https://example.com/start").unwrap();
1330        let next = base.join(location).unwrap();
1331        let err = validate_url(next.as_str()).unwrap_err();
1332        assert!(matches!(err, ToolError::Blocked { .. }));
1333    }
1334
1335    /// Verifies that a chain of 3 valid public redirects passes validate_url at every hop.
1336    #[test]
1337    fn redirect_chain_three_hops_all_public() {
1338        let hops = [
1339            "https://redirect1.example.com/hop1",
1340            "https://redirect2.example.com/hop2",
1341            "https://destination.example.com/final",
1342        ];
1343        for hop in hops {
1344            assert!(validate_url(hop).is_ok(), "expected ok for {hop}");
1345        }
1346    }
1347
1348    // --- SSRF redirect chain defense ---
1349
1350    /// Verifies that a redirect Location pointing to a private IP is rejected by validate_url
1351    /// before any connection attempt — simulating the validation step inside fetch_html.
1352    #[test]
1353    fn redirect_to_private_ip_rejected_by_validate_url() {
1354        // These would appear as Location headers in a redirect response.
1355        let private_targets = [
1356            "https://127.0.0.1/secret",
1357            "https://10.0.0.1/internal",
1358            "https://192.168.1.1/admin",
1359            "https://172.16.0.1/data",
1360            "https://[::1]/path",
1361            "https://[fe80::1]/path",
1362            "https://localhost/path",
1363            "https://service.internal/api",
1364        ];
1365        for target in private_targets {
1366            let result = validate_url(target);
1367            assert!(result.is_err(), "expected error for {target}");
1368            assert!(
1369                matches!(result.unwrap_err(), ToolError::Blocked { .. }),
1370                "expected Blocked for {target}"
1371            );
1372        }
1373    }
1374
1375    /// Verifies that relative redirect URLs are resolved correctly before validation.
1376    #[test]
1377    fn redirect_relative_url_resolves_correctly() {
1378        let base = Url::parse("https://example.com/page").unwrap();
1379        let relative = "/other";
1380        let resolved = base.join(relative).unwrap();
1381        assert_eq!(resolved.as_str(), "https://example.com/other");
1382    }
1383
    /// Verifies that a plain-http redirect target (TLS downgrade) is rejected by
    /// the scheme check in validate_url.
    #[test]
    fn redirect_to_http_rejected() {
        let err = validate_url("http://example.com/page").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }
1390
1391    #[test]
1392    fn ipv4_mapped_ipv6_link_local_blocked() {
1393        let err = validate_url("https://[::ffff:169.254.0.1]/path").unwrap_err();
1394        assert!(matches!(err, ToolError::Blocked { .. }));
1395    }
1396
1397    #[test]
1398    fn ipv4_mapped_ipv6_public_allowed() {
1399        assert!(validate_url("https://[::ffff:93.184.216.34]/path").is_ok());
1400    }
1401
1402    #[test]
1403    fn tool_definitions_returns_web_scrape() {
1404        let config = ScrapeConfig::default();
1405        let executor = WebScrapeExecutor::new(&config);
1406        let defs = executor.tool_definitions();
1407        assert_eq!(defs.len(), 1);
1408        assert_eq!(defs[0].id, "web_scrape");
1409        assert_eq!(
1410            defs[0].invocation,
1411            crate::registry::InvocationHint::FencedBlock("scrape")
1412        );
1413    }
1414
1415    #[test]
1416    fn tool_definitions_schema_has_all_params() {
1417        let config = ScrapeConfig::default();
1418        let executor = WebScrapeExecutor::new(&config);
1419        let defs = executor.tool_definitions();
1420        let obj = defs[0].schema.as_object().unwrap();
1421        let props = obj["properties"].as_object().unwrap();
1422        assert!(props.contains_key("url"));
1423        assert!(props.contains_key("select"));
1424        assert!(props.contains_key("extract"));
1425        assert!(props.contains_key("limit"));
1426        let req = obj["required"].as_array().unwrap();
1427        assert!(req.iter().any(|v| v.as_str() == Some("url")));
1428        assert!(req.iter().any(|v| v.as_str() == Some("select")));
1429        assert!(!req.iter().any(|v| v.as_str() == Some("extract")));
1430    }
1431
1432    // --- is_private_host: new domain checks (AUD-02) ---
1433
1434    #[test]
1435    fn subdomain_localhost_blocked() {
1436        let host: url::Host<&str> = url::Host::Domain("foo.localhost");
1437        assert!(is_private_host(&host));
1438    }
1439
1440    #[test]
1441    fn internal_tld_blocked() {
1442        let host: url::Host<&str> = url::Host::Domain("service.internal");
1443        assert!(is_private_host(&host));
1444    }
1445
1446    #[test]
1447    fn local_tld_blocked() {
1448        let host: url::Host<&str> = url::Host::Domain("printer.local");
1449        assert!(is_private_host(&host));
1450    }
1451
1452    #[test]
1453    fn public_domain_not_blocked() {
1454        let host: url::Host<&str> = url::Host::Domain("example.com");
1455        assert!(!is_private_host(&host));
1456    }
1457
1458    // --- resolve_and_validate: private IP rejection ---
1459
1460    #[tokio::test]
1461    async fn resolve_loopback_rejected() {
1462        // 127.0.0.1 resolves directly (literal IP in DNS query)
1463        let url = url::Url::parse("https://127.0.0.1/path").unwrap();
1464        // validate_url catches this before resolve_and_validate, but test directly
1465        let result = resolve_and_validate(&url).await;
1466        assert!(
1467            result.is_err(),
1468            "loopback IP must be rejected by resolve_and_validate"
1469        );
1470        let err = result.unwrap_err();
1471        assert!(matches!(err, crate::executor::ToolError::Blocked { .. }));
1472    }
1473
1474    #[tokio::test]
1475    async fn resolve_private_10_rejected() {
1476        let url = url::Url::parse("https://10.0.0.1/path").unwrap();
1477        let result = resolve_and_validate(&url).await;
1478        assert!(result.is_err());
1479        assert!(matches!(
1480            result.unwrap_err(),
1481            crate::executor::ToolError::Blocked { .. }
1482        ));
1483    }
1484
1485    #[tokio::test]
1486    async fn resolve_private_192_rejected() {
1487        let url = url::Url::parse("https://192.168.1.1/path").unwrap();
1488        let result = resolve_and_validate(&url).await;
1489        assert!(result.is_err());
1490        assert!(matches!(
1491            result.unwrap_err(),
1492            crate::executor::ToolError::Blocked { .. }
1493        ));
1494    }
1495
1496    #[tokio::test]
1497    async fn resolve_ipv6_loopback_rejected() {
1498        let url = url::Url::parse("https://[::1]/path").unwrap();
1499        let result = resolve_and_validate(&url).await;
1500        assert!(result.is_err());
1501        assert!(matches!(
1502            result.unwrap_err(),
1503            crate::executor::ToolError::Blocked { .. }
1504        ));
1505    }
1506
1507    #[tokio::test]
1508    async fn resolve_no_host_returns_ok() {
1509        // URL without a resolvable host — should pass through
1510        let url = url::Url::parse("https://example.com/path").unwrap();
1511        // We can't do a live DNS test, but we can verify a URL with no host
1512        let url_no_host = url::Url::parse("data:text/plain,hello").unwrap();
1513        // data: URLs have no host; resolve_and_validate should return Ok with empty addrs
1514        let result = resolve_and_validate(&url_no_host).await;
1515        assert!(result.is_ok());
1516        let (host, addrs) = result.unwrap();
1517        assert!(host.is_empty());
1518        assert!(addrs.is_empty());
1519        drop(url);
1520        drop(url_no_host);
1521    }
1522}