// zeph_tools/scrape.rs
1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4use std::net::{IpAddr, SocketAddr};
5use std::time::Duration;
6
7use schemars::JsonSchema;
8use serde::Deserialize;
9use url::Url;
10
11use crate::config::ScrapeConfig;
12use crate::executor::{ToolCall, ToolError, ToolExecutor, ToolOutput, deserialize_params};
13
// JSON payload of a ```scrape fenced block or a `web_scrape` tool call.
// Field-level `///` comments below feed into the generated JSON schema via
// `schemars`, so they double as the parameter descriptions shown to the LLM.
#[derive(Debug, Deserialize, JsonSchema)]
struct ScrapeInstruction {
    /// HTTPS URL to scrape
    url: String,
    /// CSS selector
    select: String,
    /// Extract mode: text, html, or attr:<name>
    #[serde(default = "default_extract")]
    extract: String,
    /// Max results to return
    // `None` falls back to 10 in `scrape_instruction`.
    limit: Option<usize>,
}
26
// Serde default for `ScrapeInstruction::extract` when the field is omitted.
fn default_extract() -> String {
    String::from("text")
}
30
/// How a matched element is turned into output text.
#[derive(Debug)]
enum ExtractMode {
    /// Text content of the element.
    Text,
    /// Inner HTML of the element.
    Html,
    /// Value of the named attribute.
    Attr(String),
}

impl ExtractMode {
    /// Parses the `extract` field of a `ScrapeInstruction`.
    ///
    /// Recognizes `"text"`, `"html"`, and `"attr:<name>"`; anything else
    /// falls back to `Text` so a typo degrades gracefully instead of failing.
    fn parse(s: &str) -> Self {
        // `strip_prefix` tests and removes the prefix in one step; the old
        // `starts_with` guard + `strip_prefix(..).unwrap_or(..)` pair carried
        // an unreachable fallback branch.
        if let Some(name) = s.strip_prefix("attr:") {
            return Self::Attr(name.to_owned());
        }
        match s {
            "html" => Self::Html,
            _ => Self::Text,
        }
    }
}
50
/// Extracts data from web pages via CSS selectors.
///
/// Detects ` ```scrape ` blocks in LLM responses containing JSON instructions,
/// fetches the URL, and parses HTML with `scrape-core`.
#[derive(Debug)]
pub struct WebScrapeExecutor {
    // Per-request HTTP timeout (built from `ScrapeConfig::timeout`, seconds).
    timeout: Duration,
    // Hard cap on the response body size, in bytes.
    max_body_bytes: usize,
}
60
61impl WebScrapeExecutor {
62    #[must_use]
63    pub fn new(config: &ScrapeConfig) -> Self {
64        Self {
65            timeout: Duration::from_secs(config.timeout),
66            max_body_bytes: config.max_body_bytes,
67        }
68    }
69
70    fn build_client(&self, host: &str, addrs: &[SocketAddr]) -> reqwest::Client {
71        let mut builder = reqwest::Client::builder()
72            .timeout(self.timeout)
73            .redirect(reqwest::redirect::Policy::limited(3));
74        builder = builder.resolve_to_addrs(host, addrs);
75        builder.build().unwrap_or_default()
76    }
77}
78
79impl ToolExecutor for WebScrapeExecutor {
80    fn tool_definitions(&self) -> Vec<crate::registry::ToolDef> {
81        use crate::registry::{InvocationHint, ToolDef};
82        vec![ToolDef {
83            id: "web_scrape",
84            description: "Scrape data from a web page via CSS selectors",
85            schema: schemars::schema_for!(ScrapeInstruction),
86            invocation: InvocationHint::FencedBlock("scrape"),
87        }]
88    }
89
90    async fn execute(&self, response: &str) -> Result<Option<ToolOutput>, ToolError> {
91        let blocks = extract_scrape_blocks(response);
92        if blocks.is_empty() {
93            return Ok(None);
94        }
95
96        let mut outputs = Vec::with_capacity(blocks.len());
97        #[allow(clippy::cast_possible_truncation)]
98        let blocks_executed = blocks.len() as u32;
99
100        for block in &blocks {
101            let instruction: ScrapeInstruction = serde_json::from_str(block).map_err(|e| {
102                ToolError::Execution(std::io::Error::new(
103                    std::io::ErrorKind::InvalidData,
104                    e.to_string(),
105                ))
106            })?;
107            outputs.push(self.scrape_instruction(&instruction).await?);
108        }
109
110        Ok(Some(ToolOutput {
111            tool_name: "web-scrape".to_owned(),
112            summary: outputs.join("\n\n"),
113            blocks_executed,
114            filter_stats: None,
115            diff: None,
116            streamed: false,
117        }))
118    }
119
120    async fn execute_tool_call(&self, call: &ToolCall) -> Result<Option<ToolOutput>, ToolError> {
121        if call.tool_id != "web_scrape" {
122            return Ok(None);
123        }
124
125        let instruction: ScrapeInstruction = deserialize_params(&call.params)?;
126
127        let result = self.scrape_instruction(&instruction).await?;
128
129        Ok(Some(ToolOutput {
130            tool_name: "web-scrape".to_owned(),
131            summary: result,
132            blocks_executed: 1,
133            filter_stats: None,
134            diff: None,
135            streamed: false,
136        }))
137    }
138}
139
140impl WebScrapeExecutor {
141    async fn scrape_instruction(
142        &self,
143        instruction: &ScrapeInstruction,
144    ) -> Result<String, ToolError> {
145        let parsed = validate_url(&instruction.url)?;
146        let (host, addrs) = resolve_and_validate(&parsed).await?;
147        // Build a per-request client pinned to the validated addresses, eliminating
148        // TOCTOU between DNS validation and the actual HTTP connection.
149        let client = self.build_client(&host, &addrs);
150        let html = self.fetch_html(&client, &instruction.url).await?;
151        let selector = instruction.select.clone();
152        let extract = ExtractMode::parse(&instruction.extract);
153        let limit = instruction.limit.unwrap_or(10);
154        tokio::task::spawn_blocking(move || parse_and_extract(&html, &selector, &extract, limit))
155            .await
156            .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?
157    }
158
159    async fn fetch_html(&self, client: &reqwest::Client, url: &str) -> Result<String, ToolError> {
160        let resp = client
161            .get(url)
162            .send()
163            .await
164            .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
165
166        if !resp.status().is_success() {
167            return Err(ToolError::Execution(std::io::Error::other(format!(
168                "HTTP {}",
169                resp.status(),
170            ))));
171        }
172
173        let bytes = resp
174            .bytes()
175            .await
176            .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
177
178        if bytes.len() > self.max_body_bytes {
179            return Err(ToolError::Execution(std::io::Error::other(format!(
180                "response too large: {} bytes (max: {})",
181                bytes.len(),
182                self.max_body_bytes,
183            ))));
184        }
185
186        String::from_utf8(bytes.to_vec())
187            .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))
188    }
189}
190
/// Collects the raw contents of every ` ```scrape ` fenced block in `text`.
fn extract_scrape_blocks(text: &str) -> Vec<&str> {
    crate::executor::extract_fenced_blocks(text, "scrape")
}
194
195fn validate_url(raw: &str) -> Result<Url, ToolError> {
196    let parsed = Url::parse(raw).map_err(|_| ToolError::Blocked {
197        command: format!("invalid URL: {raw}"),
198    })?;
199
200    if parsed.scheme() != "https" {
201        return Err(ToolError::Blocked {
202            command: format!("scheme not allowed: {}", parsed.scheme()),
203        });
204    }
205
206    if let Some(host) = parsed.host()
207        && is_private_host(&host)
208    {
209        return Err(ToolError::Blocked {
210            command: format!(
211                "private/local host blocked: {}",
212                parsed.host_str().unwrap_or("")
213            ),
214        });
215    }
216
217    Ok(parsed)
218}
219
/// Returns `true` when `ip` belongs to a range that must never be scraped:
/// loopback, RFC 1918 private, link-local, shared address space (RFC 6598),
/// unspecified, broadcast, IPv6 unique-local/link-local, and the IPv4-mapped
/// forms of all the IPv4 ranges.
pub(crate) fn is_private_ip(ip: IpAddr) -> bool {
    match ip {
        IpAddr::V4(v4) => is_private_v4(v4),
        IpAddr::V6(v6) => {
            if v6.is_loopback() || v6.is_unspecified() {
                return true;
            }
            let seg = v6.segments();
            // fe80::/10 — link-local
            if seg[0] & 0xffc0 == 0xfe80 {
                return true;
            }
            // fc00::/7 — unique local
            if seg[0] & 0xfe00 == 0xfc00 {
                return true;
            }
            // ::ffff:x.x.x.x — IPv4-mapped, check inner IPv4
            if seg[0..6] == [0, 0, 0, 0, 0, 0xffff] {
                let v4 = v6
                    .to_ipv4_mapped()
                    .unwrap_or(std::net::Ipv4Addr::UNSPECIFIED);
                // Same rules as a plain IPv4 address (previously copy-pasted).
                return is_private_v4(v4);
            }
            false
        }
    }
}

/// IPv4 half of the check, shared between plain and IPv4-mapped addresses.
fn is_private_v4(v4: std::net::Ipv4Addr) -> bool {
    // 100.64.0.0/10 — shared address space for CGNAT (RFC 6598), a standard
    // SSRF-denylist range. `Ipv4Addr::is_shared` is not stable on all
    // toolchains, so test the prefix manually.
    let octets = v4.octets();
    let shared = octets[0] == 100 && (octets[1] & 0xc0) == 64;
    v4.is_loopback()
        || v4.is_private()
        || v4.is_link_local()
        || v4.is_unspecified()
        || v4.is_broadcast()
        || shared
}
257
258fn is_private_host(host: &url::Host<&str>) -> bool {
259    match host {
260        url::Host::Domain(d) => {
261            // Exact match or subdomain of localhost (e.g. foo.localhost)
262            // and .internal/.local TLDs used in cloud/k8s environments.
263            #[allow(clippy::case_sensitive_file_extension_comparisons)]
264            {
265                *d == "localhost"
266                    || d.ends_with(".localhost")
267                    || d.ends_with(".internal")
268                    || d.ends_with(".local")
269            }
270        }
271        url::Host::Ipv4(v4) => is_private_ip(IpAddr::V4(*v4)),
272        url::Host::Ipv6(v6) => is_private_ip(IpAddr::V6(*v6)),
273    }
274}
275
276/// Resolves DNS for the URL host, validates all resolved IPs against private ranges,
277/// and returns the hostname and validated socket addresses.
278///
279/// Returning the addresses allows the caller to pin the HTTP client to these exact
280/// addresses, eliminating TOCTOU between DNS validation and the actual connection.
281async fn resolve_and_validate(url: &Url) -> Result<(String, Vec<SocketAddr>), ToolError> {
282    let Some(host) = url.host_str() else {
283        return Ok((String::new(), vec![]));
284    };
285    let port = url.port_or_known_default().unwrap_or(443);
286    let addrs: Vec<SocketAddr> = tokio::net::lookup_host(format!("{host}:{port}"))
287        .await
288        .map_err(|e| ToolError::Blocked {
289            command: format!("DNS resolution failed: {e}"),
290        })?
291        .collect();
292    for addr in &addrs {
293        if is_private_ip(addr.ip()) {
294            return Err(ToolError::Blocked {
295                command: format!("SSRF protection: private IP {} for host {host}", addr.ip()),
296            });
297        }
298    }
299    Ok((host.to_owned(), addrs))
300}
301
302fn parse_and_extract(
303    html: &str,
304    selector: &str,
305    extract: &ExtractMode,
306    limit: usize,
307) -> Result<String, ToolError> {
308    let soup = scrape_core::Soup::parse(html);
309
310    let tags = soup.find_all(selector).map_err(|e| {
311        ToolError::Execution(std::io::Error::new(
312            std::io::ErrorKind::InvalidData,
313            format!("invalid selector: {e}"),
314        ))
315    })?;
316
317    let mut results = Vec::new();
318
319    for tag in tags.into_iter().take(limit) {
320        let value = match extract {
321            ExtractMode::Text => tag.text(),
322            ExtractMode::Html => tag.inner_html(),
323            ExtractMode::Attr(name) => tag.get(name).unwrap_or_default().to_owned(),
324        };
325        if !value.trim().is_empty() {
326            results.push(value.trim().to_owned());
327        }
328    }
329
330    if results.is_empty() {
331        Ok(format!("No results for selector: {selector}"))
332    } else {
333        Ok(results.join("\n"))
334    }
335}
336
#[cfg(test)]
mod tests {
    use super::*;

    // Pure-function tests use the plain harness; executor tests that exercise
    // async paths use #[tokio::test]. The executor tests avoid live servers:
    // URLs point at blocked hosts or unroutable (TEST-NET-1) targets.

    // --- extract_scrape_blocks ---

    #[test]
    fn extract_single_block() {
        let text =
            "Here:\n```scrape\n{\"url\":\"https://example.com\",\"select\":\"h1\"}\n```\nDone.";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 1);
        assert!(blocks[0].contains("example.com"));
    }

    #[test]
    fn extract_multiple_blocks() {
        let text = "```scrape\n{\"url\":\"https://a.com\",\"select\":\"h1\"}\n```\ntext\n```scrape\n{\"url\":\"https://b.com\",\"select\":\"p\"}\n```";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 2);
    }

    #[test]
    fn no_blocks_returns_empty() {
        let blocks = extract_scrape_blocks("plain text, no code blocks");
        assert!(blocks.is_empty());
    }

    #[test]
    fn unclosed_block_ignored() {
        let blocks = extract_scrape_blocks("```scrape\n{\"url\":\"https://x.com\"}");
        assert!(blocks.is_empty());
    }

    #[test]
    fn non_scrape_block_ignored() {
        let text =
            "```bash\necho hi\n```\n```scrape\n{\"url\":\"https://x.com\",\"select\":\"h1\"}\n```";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 1);
        assert!(blocks[0].contains("x.com"));
    }

    #[test]
    fn multiline_json_block() {
        let text =
            "```scrape\n{\n  \"url\": \"https://example.com\",\n  \"select\": \"h1\"\n}\n```";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 1);
        let instr: ScrapeInstruction = serde_json::from_str(blocks[0]).unwrap();
        assert_eq!(instr.url, "https://example.com");
    }

    // --- ScrapeInstruction parsing ---

    #[test]
    fn parse_valid_instruction() {
        let json = r#"{"url":"https://example.com","select":"h1","extract":"text","limit":5}"#;
        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
        assert_eq!(instr.url, "https://example.com");
        assert_eq!(instr.select, "h1");
        assert_eq!(instr.extract, "text");
        assert_eq!(instr.limit, Some(5));
    }

    #[test]
    fn parse_minimal_instruction() {
        let json = r#"{"url":"https://example.com","select":"p"}"#;
        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
        assert_eq!(instr.extract, "text");
        assert!(instr.limit.is_none());
    }

    #[test]
    fn parse_attr_extract() {
        let json = r#"{"url":"https://example.com","select":"a","extract":"attr:href"}"#;
        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
        assert_eq!(instr.extract, "attr:href");
    }

    #[test]
    fn parse_invalid_json_errors() {
        let result = serde_json::from_str::<ScrapeInstruction>("not json");
        assert!(result.is_err());
    }

    // --- ExtractMode ---

    #[test]
    fn extract_mode_text() {
        assert!(matches!(ExtractMode::parse("text"), ExtractMode::Text));
    }

    #[test]
    fn extract_mode_html() {
        assert!(matches!(ExtractMode::parse("html"), ExtractMode::Html));
    }

    #[test]
    fn extract_mode_attr() {
        let mode = ExtractMode::parse("attr:href");
        assert!(matches!(mode, ExtractMode::Attr(ref s) if s == "href"));
    }

    #[test]
    fn extract_mode_unknown_defaults_to_text() {
        assert!(matches!(ExtractMode::parse("unknown"), ExtractMode::Text));
    }

    // --- validate_url ---

    #[test]
    fn valid_https_url() {
        assert!(validate_url("https://example.com").is_ok());
    }

    #[test]
    fn http_rejected() {
        let err = validate_url("http://example.com").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ftp_rejected() {
        let err = validate_url("ftp://files.example.com").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn file_rejected() {
        let err = validate_url("file:///etc/passwd").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn invalid_url_rejected() {
        let err = validate_url("not a url").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn localhost_blocked() {
        let err = validate_url("https://localhost/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn loopback_ip_blocked() {
        let err = validate_url("https://127.0.0.1/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn private_10_blocked() {
        let err = validate_url("https://10.0.0.1/api").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn private_172_blocked() {
        let err = validate_url("https://172.16.0.1/api").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn private_192_blocked() {
        let err = validate_url("https://192.168.1.1/api").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv6_loopback_blocked() {
        let err = validate_url("https://[::1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn public_ip_allowed() {
        assert!(validate_url("https://93.184.216.34/page").is_ok());
    }

    // --- parse_and_extract ---

    #[test]
    fn extract_text_from_html() {
        let html = "<html><body><h1>Hello World</h1><p>Content</p></body></html>";
        let result = parse_and_extract(html, "h1", &ExtractMode::Text, 10).unwrap();
        assert_eq!(result, "Hello World");
    }

    #[test]
    fn extract_multiple_elements() {
        let html = "<ul><li>A</li><li>B</li><li>C</li></ul>";
        let result = parse_and_extract(html, "li", &ExtractMode::Text, 10).unwrap();
        assert_eq!(result, "A\nB\nC");
    }

    #[test]
    fn extract_with_limit() {
        let html = "<ul><li>A</li><li>B</li><li>C</li></ul>";
        let result = parse_and_extract(html, "li", &ExtractMode::Text, 2).unwrap();
        assert_eq!(result, "A\nB");
    }

    #[test]
    fn extract_attr_href() {
        let html = r#"<a href="https://example.com">Link</a>"#;
        let result =
            parse_and_extract(html, "a", &ExtractMode::Attr("href".to_owned()), 10).unwrap();
        assert_eq!(result, "https://example.com");
    }

    #[test]
    fn extract_inner_html() {
        let html = "<div><span>inner</span></div>";
        let result = parse_and_extract(html, "div", &ExtractMode::Html, 10).unwrap();
        assert!(result.contains("<span>inner</span>"));
    }

    #[test]
    fn no_matches_returns_message() {
        let html = "<html><body><p>text</p></body></html>";
        let result = parse_and_extract(html, "h1", &ExtractMode::Text, 10).unwrap();
        assert!(result.starts_with("No results for selector:"));
    }

    #[test]
    fn empty_text_skipped() {
        let html = "<ul><li>  </li><li>A</li></ul>";
        let result = parse_and_extract(html, "li", &ExtractMode::Text, 10).unwrap();
        assert_eq!(result, "A");
    }

    #[test]
    fn invalid_selector_errors() {
        let html = "<html><body></body></html>";
        let result = parse_and_extract(html, "[[[invalid", &ExtractMode::Text, 10);
        assert!(result.is_err());
    }

    #[test]
    fn empty_html_returns_no_results() {
        let result = parse_and_extract("", "h1", &ExtractMode::Text, 10).unwrap();
        assert!(result.starts_with("No results for selector:"));
    }

    #[test]
    fn nested_selector() {
        let html = "<div><span>inner</span></div><span>outer</span>";
        let result = parse_and_extract(html, "div > span", &ExtractMode::Text, 10).unwrap();
        assert_eq!(result, "inner");
    }

    #[test]
    fn attr_missing_returns_empty() {
        let html = r#"<a>No href</a>"#;
        let result =
            parse_and_extract(html, "a", &ExtractMode::Attr("href".to_owned()), 10).unwrap();
        assert!(result.starts_with("No results for selector:"));
    }

    #[test]
    fn extract_html_mode() {
        let html = "<div><b>bold</b> text</div>";
        let result = parse_and_extract(html, "div", &ExtractMode::Html, 10).unwrap();
        assert!(result.contains("<b>bold</b>"));
    }

    #[test]
    fn limit_zero_returns_no_results() {
        let html = "<ul><li>A</li><li>B</li></ul>";
        let result = parse_and_extract(html, "li", &ExtractMode::Text, 0).unwrap();
        assert!(result.starts_with("No results for selector:"));
    }

    // --- validate_url edge cases ---

    #[test]
    fn url_with_port_allowed() {
        assert!(validate_url("https://example.com:8443/path").is_ok());
    }

    #[test]
    fn link_local_ip_blocked() {
        let err = validate_url("https://169.254.1.1/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn url_no_scheme_rejected() {
        let err = validate_url("example.com/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn unspecified_ipv4_blocked() {
        let err = validate_url("https://0.0.0.0/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn broadcast_ipv4_blocked() {
        let err = validate_url("https://255.255.255.255/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv6_link_local_blocked() {
        let err = validate_url("https://[fe80::1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv6_unique_local_blocked() {
        let err = validate_url("https://[fd12::1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv4_mapped_ipv6_loopback_blocked() {
        let err = validate_url("https://[::ffff:127.0.0.1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv4_mapped_ipv6_private_blocked() {
        let err = validate_url("https://[::ffff:10.0.0.1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    // --- WebScrapeExecutor (no-network) ---

    #[tokio::test]
    async fn executor_no_blocks_returns_none() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let result = executor.execute("plain text").await;
        assert!(result.unwrap().is_none());
    }

    #[tokio::test]
    async fn executor_invalid_json_errors() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\nnot json\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Execution(_))));
    }

    #[tokio::test]
    async fn executor_blocked_url_errors() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"http://example.com\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Blocked { .. })));
    }

    #[tokio::test]
    async fn executor_private_ip_blocked() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"https://192.168.1.1/api\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Blocked { .. })));
    }

    // 192.0.2.1 is TEST-NET-1 (RFC 5737), reserved for documentation and
    // expected to be unroutable, so the 1-second timeout should surface as an
    // Execution error rather than a Blocked one.
    #[tokio::test]
    async fn executor_unreachable_host_returns_error() {
        let config = ScrapeConfig {
            timeout: 1,
            max_body_bytes: 1_048_576,
        };
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"https://192.0.2.1:1/page\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Execution(_))));
    }

    #[tokio::test]
    async fn executor_localhost_url_blocked() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"https://localhost:9999/api\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Blocked { .. })));
    }

    #[tokio::test]
    async fn executor_empty_text_returns_none() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let result = executor.execute("").await;
        assert!(result.unwrap().is_none());
    }

    // Blocks are processed in order and the first failure aborts the batch,
    // so the blocked first URL must error before the second is touched.
    #[tokio::test]
    async fn executor_multiple_blocks_first_blocked() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"http://evil.com\",\"select\":\"h1\"}\n```\n\
             ```scrape\n{\"url\":\"https://ok.com\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(result.is_err());
    }

    #[test]
    fn validate_url_empty_string() {
        let err = validate_url("").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn validate_url_javascript_scheme_blocked() {
        let err = validate_url("javascript:alert(1)").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn validate_url_data_scheme_blocked() {
        let err = validate_url("data:text/html,<h1>hi</h1>").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn is_private_host_public_domain_is_false() {
        let host: url::Host<&str> = url::Host::Domain("example.com");
        assert!(!is_private_host(&host));
    }

    #[test]
    fn is_private_host_localhost_is_true() {
        let host: url::Host<&str> = url::Host::Domain("localhost");
        assert!(is_private_host(&host));
    }

    #[test]
    fn is_private_host_ipv6_unspecified_is_true() {
        let host = url::Host::Ipv6(std::net::Ipv6Addr::UNSPECIFIED);
        assert!(is_private_host(&host));
    }

    #[test]
    fn is_private_host_public_ipv6_is_false() {
        let host = url::Host::Ipv6("2001:db8::1".parse().unwrap());
        assert!(!is_private_host(&host));
    }

    #[test]
    fn extract_scrape_blocks_empty_block_content() {
        let text = "```scrape\n\n```";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 1);
        assert!(blocks[0].is_empty());
    }

    #[test]
    fn extract_scrape_blocks_whitespace_only() {
        let text = "```scrape\n   \n```";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 1);
    }

    #[test]
    fn parse_and_extract_multiple_selectors() {
        let html = "<div><h1>Title</h1><p>Para</p></div>";
        let result = parse_and_extract(html, "h1, p", &ExtractMode::Text, 10).unwrap();
        assert!(result.contains("Title"));
        assert!(result.contains("Para"));
    }

    #[test]
    fn webscrape_executor_new_with_custom_config() {
        let config = ScrapeConfig {
            timeout: 60,
            max_body_bytes: 512,
        };
        let executor = WebScrapeExecutor::new(&config);
        assert_eq!(executor.max_body_bytes, 512);
    }

    #[test]
    fn webscrape_executor_debug() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let dbg = format!("{executor:?}");
        assert!(dbg.contains("WebScrapeExecutor"));
    }

    #[test]
    fn extract_mode_attr_empty_name() {
        let mode = ExtractMode::parse("attr:");
        assert!(matches!(mode, ExtractMode::Attr(ref s) if s.is_empty()));
    }

    #[test]
    fn default_extract_returns_text() {
        assert_eq!(default_extract(), "text");
    }

    #[test]
    fn scrape_instruction_debug() {
        let json = r#"{"url":"https://example.com","select":"h1"}"#;
        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
        let dbg = format!("{instr:?}");
        assert!(dbg.contains("ScrapeInstruction"));
    }

    #[test]
    fn extract_mode_debug() {
        let mode = ExtractMode::Text;
        let dbg = format!("{mode:?}");
        assert!(dbg.contains("Text"));
    }

    #[test]
    fn ipv4_mapped_ipv6_link_local_blocked() {
        let err = validate_url("https://[::ffff:169.254.0.1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv4_mapped_ipv6_public_allowed() {
        assert!(validate_url("https://[::ffff:93.184.216.34]/path").is_ok());
    }

    #[test]
    fn tool_definitions_returns_web_scrape() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let defs = executor.tool_definitions();
        assert_eq!(defs.len(), 1);
        assert_eq!(defs[0].id, "web_scrape");
        assert_eq!(
            defs[0].invocation,
            crate::registry::InvocationHint::FencedBlock("scrape")
        );
    }

    #[test]
    fn tool_definitions_schema_has_all_params() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let defs = executor.tool_definitions();
        let obj = defs[0].schema.as_object().unwrap();
        let props = obj["properties"].as_object().unwrap();
        assert!(props.contains_key("url"));
        assert!(props.contains_key("select"));
        assert!(props.contains_key("extract"));
        assert!(props.contains_key("limit"));
        let req = obj["required"].as_array().unwrap();
        assert!(req.iter().any(|v| v.as_str() == Some("url")));
        assert!(req.iter().any(|v| v.as_str() == Some("select")));
        assert!(!req.iter().any(|v| v.as_str() == Some("extract")));
    }

    // --- is_private_host: new domain checks (AUD-02) ---

    #[test]
    fn subdomain_localhost_blocked() {
        let host: url::Host<&str> = url::Host::Domain("foo.localhost");
        assert!(is_private_host(&host));
    }

    #[test]
    fn internal_tld_blocked() {
        let host: url::Host<&str> = url::Host::Domain("service.internal");
        assert!(is_private_host(&host));
    }

    #[test]
    fn local_tld_blocked() {
        let host: url::Host<&str> = url::Host::Domain("printer.local");
        assert!(is_private_host(&host));
    }

    #[test]
    fn public_domain_not_blocked() {
        let host: url::Host<&str> = url::Host::Domain("example.com");
        assert!(!is_private_host(&host));
    }

    // --- resolve_and_validate: private IP rejection ---
    // NOTE(review): these use literal IPs, which lookup_host should turn into
    // addresses without contacting a real DNS server — confirm on CI.

    #[tokio::test]
    async fn resolve_loopback_rejected() {
        // 127.0.0.1 resolves directly (literal IP in DNS query)
        let url = url::Url::parse("https://127.0.0.1/path").unwrap();
        // validate_url catches this before resolve_and_validate, but test directly
        let result = resolve_and_validate(&url).await;
        assert!(
            result.is_err(),
            "loopback IP must be rejected by resolve_and_validate"
        );
        let err = result.unwrap_err();
        assert!(matches!(err, crate::executor::ToolError::Blocked { .. }));
    }

    #[tokio::test]
    async fn resolve_private_10_rejected() {
        let url = url::Url::parse("https://10.0.0.1/path").unwrap();
        let result = resolve_and_validate(&url).await;
        assert!(result.is_err());
        assert!(matches!(
            result.unwrap_err(),
            crate::executor::ToolError::Blocked { .. }
        ));
    }

    #[tokio::test]
    async fn resolve_private_192_rejected() {
        let url = url::Url::parse("https://192.168.1.1/path").unwrap();
        let result = resolve_and_validate(&url).await;
        assert!(result.is_err());
        assert!(matches!(
            result.unwrap_err(),
            crate::executor::ToolError::Blocked { .. }
        ));
    }

    #[tokio::test]
    async fn resolve_ipv6_loopback_rejected() {
        let url = url::Url::parse("https://[::1]/path").unwrap();
        let result = resolve_and_validate(&url).await;
        assert!(result.is_err());
        assert!(matches!(
            result.unwrap_err(),
            crate::executor::ToolError::Blocked { .. }
        ));
    }

    #[tokio::test]
    async fn resolve_no_host_returns_ok() {
        // URL without a resolvable host — should pass through
        let url = url::Url::parse("https://example.com/path").unwrap();
        // We can't do a live DNS test, but we can verify a URL with no host
        let url_no_host = url::Url::parse("data:text/plain,hello").unwrap();
        // data: URLs have no host; resolve_and_validate should return Ok with empty addrs
        let result = resolve_and_validate(&url_no_host).await;
        assert!(result.is_ok());
        let (host, addrs) = result.unwrap();
        assert!(host.is_empty());
        assert!(addrs.is_empty());
        // NOTE(review): `url` is never exercised — it only documents the
        // live-DNS case this test deliberately avoids; the drops silence
        // unused-variable warnings.
        drop(url);
        drop(url_no_host);
    }
}
983}