// zeph_tools/scrape.rs
1use std::net::{IpAddr, SocketAddr};
2use std::time::Duration;
3
4use schemars::JsonSchema;
5use serde::Deserialize;
6use url::Url;
7
8use crate::config::ScrapeConfig;
9use crate::executor::{ToolCall, ToolError, ToolExecutor, ToolOutput, deserialize_params};
10
// A single scrape request, deserialized from JSON found either in a
// ```scrape fenced block (prompt-driven path) or in structured tool-call
// params. Field names are the JSON wire format — do not rename them.
// NOTE(review): the `///` field docs below presumably surface in the
// schemars-generated JSON schema shown to the LLM — confirm before editing.
#[derive(Debug, Deserialize, JsonSchema)]
struct ScrapeInstruction {
    /// HTTPS URL to scrape
    url: String,
    /// CSS selector
    select: String,
    /// Extract mode: text, html, or attr:<name>
    #[serde(default = "default_extract")] // absent => "text"
    extract: String,
    /// Max results to return
    limit: Option<usize>, // None => executor falls back to 10
}
23
/// Serde default for `ScrapeInstruction::extract`: plain text extraction.
fn default_extract() -> String {
    String::from("text")
}
27
/// How to pull data out of a matched HTML element.
#[derive(Debug)]
enum ExtractMode {
    /// Text content of the element.
    Text,
    /// Inner HTML markup of the element.
    Html,
    /// Value of a named attribute on the element.
    Attr(String),
}

impl ExtractMode {
    /// Parses an extract-mode string: `"text"`, `"html"`, or `"attr:<name>"`.
    /// Anything unrecognized falls back to `Text`.
    fn parse(s: &str) -> Self {
        if let Some(name) = s.strip_prefix("attr:") {
            Self::Attr(name.to_owned())
        } else if s == "html" {
            Self::Html
        } else {
            // "text" and every unknown value both mean plain text.
            Self::Text
        }
    }
}
47
/// Extracts data from web pages via CSS selectors.
///
/// Detects ` ```scrape ` blocks in LLM responses containing JSON instructions,
/// fetches the URL, and parses HTML with `scrape-core`.
#[derive(Debug)]
pub struct WebScrapeExecutor {
    // Per-request HTTP timeout, built from ScrapeConfig::timeout (seconds).
    timeout: Duration,
    // Hard cap on the fetched response body size, in bytes.
    max_body_bytes: usize,
}
57
58impl WebScrapeExecutor {
59    #[must_use]
60    pub fn new(config: &ScrapeConfig) -> Self {
61        Self {
62            timeout: Duration::from_secs(config.timeout),
63            max_body_bytes: config.max_body_bytes,
64        }
65    }
66
67    fn build_client(&self, host: &str, addrs: &[SocketAddr]) -> reqwest::Client {
68        let mut builder = reqwest::Client::builder()
69            .timeout(self.timeout)
70            .redirect(reqwest::redirect::Policy::limited(3));
71        builder = builder.resolve_to_addrs(host, addrs);
72        builder.build().unwrap_or_default()
73    }
74}
75
impl ToolExecutor for WebScrapeExecutor {
    // Advertises a single "web_scrape" tool, invocable either as a structured
    // tool call or via a ```scrape fenced block in free-form LLM output.
    fn tool_definitions(&self) -> Vec<crate::registry::ToolDef> {
        use crate::registry::{InvocationHint, ToolDef};
        vec![ToolDef {
            id: "web_scrape",
            description: "Scrape data from a web page via CSS selectors",
            schema: schemars::schema_for!(ScrapeInstruction),
            invocation: InvocationHint::FencedBlock("scrape"),
        }]
    }

    // Prompt-driven path: scans `response` for ```scrape blocks and executes
    // each in order. Returns Ok(None) when no blocks are present; the first
    // invalid instruction or failed scrape aborts the whole batch.
    async fn execute(&self, response: &str) -> Result<Option<ToolOutput>, ToolError> {
        let blocks = extract_scrape_blocks(response);
        if blocks.is_empty() {
            return Ok(None);
        }

        let mut outputs = Vec::with_capacity(blocks.len());
        #[allow(clippy::cast_possible_truncation)]
        let blocks_executed = blocks.len() as u32;

        for block in &blocks {
            // Malformed JSON is an execution error (bad model output), not a
            // policy block.
            let instruction: ScrapeInstruction = serde_json::from_str(block).map_err(|e| {
                ToolError::Execution(std::io::Error::new(
                    std::io::ErrorKind::InvalidData,
                    e.to_string(),
                ))
            })?;
            outputs.push(self.scrape_instruction(&instruction).await?);
        }

        // NOTE(review): `tool_name` is "web-scrape" (hyphen) while the tool id
        // is "web_scrape" (underscore) — confirm downstream consumers expect
        // this asymmetry.
        Ok(Some(ToolOutput {
            tool_name: "web-scrape".to_owned(),
            summary: outputs.join("\n\n"),
            blocks_executed,
            filter_stats: None,
            diff: None,
            streamed: false,
        }))
    }

    // Structured tool-call path: handles only the "web_scrape" id; any other
    // id yields Ok(None) so a different executor can claim the call.
    async fn execute_tool_call(&self, call: &ToolCall) -> Result<Option<ToolOutput>, ToolError> {
        if call.tool_id != "web_scrape" {
            return Ok(None);
        }

        let instruction: ScrapeInstruction = deserialize_params(&call.params)?;

        let result = self.scrape_instruction(&instruction).await?;

        Ok(Some(ToolOutput {
            tool_name: "web-scrape".to_owned(),
            summary: result,
            blocks_executed: 1,
            filter_stats: None,
            diff: None,
            streamed: false,
        }))
    }
}
136
137impl WebScrapeExecutor {
138    async fn scrape_instruction(
139        &self,
140        instruction: &ScrapeInstruction,
141    ) -> Result<String, ToolError> {
142        let parsed = validate_url(&instruction.url)?;
143        let (host, addrs) = resolve_and_validate(&parsed).await?;
144        // Build a per-request client pinned to the validated addresses, eliminating
145        // TOCTOU between DNS validation and the actual HTTP connection.
146        let client = self.build_client(&host, &addrs);
147        let html = self.fetch_html(&client, &instruction.url).await?;
148        let selector = instruction.select.clone();
149        let extract = ExtractMode::parse(&instruction.extract);
150        let limit = instruction.limit.unwrap_or(10);
151        tokio::task::spawn_blocking(move || parse_and_extract(&html, &selector, &extract, limit))
152            .await
153            .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?
154    }
155
156    async fn fetch_html(&self, client: &reqwest::Client, url: &str) -> Result<String, ToolError> {
157        let resp = client
158            .get(url)
159            .send()
160            .await
161            .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
162
163        if !resp.status().is_success() {
164            return Err(ToolError::Execution(std::io::Error::other(format!(
165                "HTTP {}",
166                resp.status(),
167            ))));
168        }
169
170        let bytes = resp
171            .bytes()
172            .await
173            .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
174
175        if bytes.len() > self.max_body_bytes {
176            return Err(ToolError::Execution(std::io::Error::other(format!(
177                "response too large: {} bytes (max: {})",
178                bytes.len(),
179                self.max_body_bytes,
180            ))));
181        }
182
183        String::from_utf8(bytes.to_vec())
184            .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))
185    }
186}
187
// Thin wrapper: returns the raw contents of every ```scrape fenced block in `text`.
fn extract_scrape_blocks(text: &str) -> Vec<&str> {
    crate::executor::extract_fenced_blocks(text, "scrape")
}
191
192fn validate_url(raw: &str) -> Result<Url, ToolError> {
193    let parsed = Url::parse(raw).map_err(|_| ToolError::Blocked {
194        command: format!("invalid URL: {raw}"),
195    })?;
196
197    if parsed.scheme() != "https" {
198        return Err(ToolError::Blocked {
199            command: format!("scheme not allowed: {}", parsed.scheme()),
200        });
201    }
202
203    if let Some(host) = parsed.host()
204        && is_private_host(&host)
205    {
206        return Err(ToolError::Blocked {
207            command: format!(
208                "private/local host blocked: {}",
209                parsed.host_str().unwrap_or("")
210            ),
211        });
212    }
213
214    Ok(parsed)
215}
216
/// Returns `true` for IP addresses that must never be scraped: loopback,
/// RFC 1918 private ranges, link-local, unspecified, broadcast, IPv6
/// unique-local, and IPv4-mapped IPv6 forms of any blocked IPv4 range.
pub(crate) fn is_private_ip(ip: IpAddr) -> bool {
    // Shared IPv4 policy, applied to plain and IPv4-mapped addresses alike.
    fn blocked_v4(v4: std::net::Ipv4Addr) -> bool {
        v4.is_loopback()
            || v4.is_private()
            || v4.is_link_local()
            || v4.is_unspecified()
            || v4.is_broadcast()
    }

    match ip {
        IpAddr::V4(v4) => blocked_v4(v4),
        IpAddr::V6(v6) => {
            if v6.is_loopback() || v6.is_unspecified() {
                return true;
            }
            // ::ffff:x.x.x.x — IPv4-mapped; vet the embedded IPv4 address.
            if let Some(mapped) = v6.to_ipv4_mapped() {
                return blocked_v4(mapped);
            }
            let top = v6.segments()[0];
            // fe80::/10 (link-local) and fc00::/7 (unique local).
            top & 0xffc0 == 0xfe80 || top & 0xfe00 == 0xfc00
        }
    }
}
254
255fn is_private_host(host: &url::Host<&str>) -> bool {
256    match host {
257        url::Host::Domain(d) => {
258            // Exact match or subdomain of localhost (e.g. foo.localhost)
259            // and .internal/.local TLDs used in cloud/k8s environments.
260            #[allow(clippy::case_sensitive_file_extension_comparisons)]
261            {
262                *d == "localhost"
263                    || d.ends_with(".localhost")
264                    || d.ends_with(".internal")
265                    || d.ends_with(".local")
266            }
267        }
268        url::Host::Ipv4(v4) => is_private_ip(IpAddr::V4(*v4)),
269        url::Host::Ipv6(v6) => is_private_ip(IpAddr::V6(*v6)),
270    }
271}
272
273/// Resolves DNS for the URL host, validates all resolved IPs against private ranges,
274/// and returns the hostname and validated socket addresses.
275///
276/// Returning the addresses allows the caller to pin the HTTP client to these exact
277/// addresses, eliminating TOCTOU between DNS validation and the actual connection.
278async fn resolve_and_validate(url: &Url) -> Result<(String, Vec<SocketAddr>), ToolError> {
279    let Some(host) = url.host_str() else {
280        return Ok((String::new(), vec![]));
281    };
282    let port = url.port_or_known_default().unwrap_or(443);
283    let addrs: Vec<SocketAddr> = tokio::net::lookup_host(format!("{host}:{port}"))
284        .await
285        .map_err(|e| ToolError::Blocked {
286            command: format!("DNS resolution failed: {e}"),
287        })?
288        .collect();
289    for addr in &addrs {
290        if is_private_ip(addr.ip()) {
291            return Err(ToolError::Blocked {
292                command: format!("SSRF protection: private IP {} for host {host}", addr.ip()),
293            });
294        }
295    }
296    Ok((host.to_owned(), addrs))
297}
298
299fn parse_and_extract(
300    html: &str,
301    selector: &str,
302    extract: &ExtractMode,
303    limit: usize,
304) -> Result<String, ToolError> {
305    let soup = scrape_core::Soup::parse(html);
306
307    let tags = soup.find_all(selector).map_err(|e| {
308        ToolError::Execution(std::io::Error::new(
309            std::io::ErrorKind::InvalidData,
310            format!("invalid selector: {e}"),
311        ))
312    })?;
313
314    let mut results = Vec::new();
315
316    for tag in tags.into_iter().take(limit) {
317        let value = match extract {
318            ExtractMode::Text => tag.text(),
319            ExtractMode::Html => tag.inner_html(),
320            ExtractMode::Attr(name) => tag.get(name).unwrap_or_default().to_owned(),
321        };
322        if !value.trim().is_empty() {
323            results.push(value.trim().to_owned());
324        }
325    }
326
327    if results.is_empty() {
328        Ok(format!("No results for selector: {selector}"))
329    } else {
330        Ok(results.join("\n"))
331    }
332}
333
#[cfg(test)]
mod tests {
    //! Unit tests: fenced-block extraction, instruction parsing, URL/SSRF
    //! validation, HTML extraction, and executor behavior. No live network:
    //! every async executor test either short-circuits on policy or targets
    //! an unroutable address.
    use super::*;

    // --- extract_scrape_blocks ---

    #[test]
    fn extract_single_block() {
        let text =
            "Here:\n```scrape\n{\"url\":\"https://example.com\",\"select\":\"h1\"}\n```\nDone.";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 1);
        assert!(blocks[0].contains("example.com"));
    }

    #[test]
    fn extract_multiple_blocks() {
        let text = "```scrape\n{\"url\":\"https://a.com\",\"select\":\"h1\"}\n```\ntext\n```scrape\n{\"url\":\"https://b.com\",\"select\":\"p\"}\n```";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 2);
    }

    #[test]
    fn no_blocks_returns_empty() {
        let blocks = extract_scrape_blocks("plain text, no code blocks");
        assert!(blocks.is_empty());
    }

    #[test]
    fn unclosed_block_ignored() {
        let blocks = extract_scrape_blocks("```scrape\n{\"url\":\"https://x.com\"}");
        assert!(blocks.is_empty());
    }

    #[test]
    fn non_scrape_block_ignored() {
        let text =
            "```bash\necho hi\n```\n```scrape\n{\"url\":\"https://x.com\",\"select\":\"h1\"}\n```";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 1);
        assert!(blocks[0].contains("x.com"));
    }

    #[test]
    fn multiline_json_block() {
        let text =
            "```scrape\n{\n  \"url\": \"https://example.com\",\n  \"select\": \"h1\"\n}\n```";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 1);
        let instr: ScrapeInstruction = serde_json::from_str(blocks[0]).unwrap();
        assert_eq!(instr.url, "https://example.com");
    }

    // --- ScrapeInstruction parsing ---

    #[test]
    fn parse_valid_instruction() {
        let json = r#"{"url":"https://example.com","select":"h1","extract":"text","limit":5}"#;
        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
        assert_eq!(instr.url, "https://example.com");
        assert_eq!(instr.select, "h1");
        assert_eq!(instr.extract, "text");
        assert_eq!(instr.limit, Some(5));
    }

    #[test]
    fn parse_minimal_instruction() {
        let json = r#"{"url":"https://example.com","select":"p"}"#;
        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
        assert_eq!(instr.extract, "text");
        assert!(instr.limit.is_none());
    }

    #[test]
    fn parse_attr_extract() {
        let json = r#"{"url":"https://example.com","select":"a","extract":"attr:href"}"#;
        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
        assert_eq!(instr.extract, "attr:href");
    }

    #[test]
    fn parse_invalid_json_errors() {
        let result = serde_json::from_str::<ScrapeInstruction>("not json");
        assert!(result.is_err());
    }

    // --- ExtractMode ---

    #[test]
    fn extract_mode_text() {
        assert!(matches!(ExtractMode::parse("text"), ExtractMode::Text));
    }

    #[test]
    fn extract_mode_html() {
        assert!(matches!(ExtractMode::parse("html"), ExtractMode::Html));
    }

    #[test]
    fn extract_mode_attr() {
        let mode = ExtractMode::parse("attr:href");
        assert!(matches!(mode, ExtractMode::Attr(ref s) if s == "href"));
    }

    #[test]
    fn extract_mode_unknown_defaults_to_text() {
        assert!(matches!(ExtractMode::parse("unknown"), ExtractMode::Text));
    }

    // --- validate_url ---

    #[test]
    fn valid_https_url() {
        assert!(validate_url("https://example.com").is_ok());
    }

    #[test]
    fn http_rejected() {
        let err = validate_url("http://example.com").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ftp_rejected() {
        let err = validate_url("ftp://files.example.com").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn file_rejected() {
        let err = validate_url("file:///etc/passwd").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn invalid_url_rejected() {
        let err = validate_url("not a url").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn localhost_blocked() {
        let err = validate_url("https://localhost/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn loopback_ip_blocked() {
        let err = validate_url("https://127.0.0.1/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn private_10_blocked() {
        let err = validate_url("https://10.0.0.1/api").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn private_172_blocked() {
        let err = validate_url("https://172.16.0.1/api").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn private_192_blocked() {
        let err = validate_url("https://192.168.1.1/api").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv6_loopback_blocked() {
        let err = validate_url("https://[::1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn public_ip_allowed() {
        assert!(validate_url("https://93.184.216.34/page").is_ok());
    }

    // --- parse_and_extract ---

    #[test]
    fn extract_text_from_html() {
        let html = "<html><body><h1>Hello World</h1><p>Content</p></body></html>";
        let result = parse_and_extract(html, "h1", &ExtractMode::Text, 10).unwrap();
        assert_eq!(result, "Hello World");
    }

    #[test]
    fn extract_multiple_elements() {
        let html = "<ul><li>A</li><li>B</li><li>C</li></ul>";
        let result = parse_and_extract(html, "li", &ExtractMode::Text, 10).unwrap();
        assert_eq!(result, "A\nB\nC");
    }

    #[test]
    fn extract_with_limit() {
        let html = "<ul><li>A</li><li>B</li><li>C</li></ul>";
        let result = parse_and_extract(html, "li", &ExtractMode::Text, 2).unwrap();
        assert_eq!(result, "A\nB");
    }

    #[test]
    fn extract_attr_href() {
        let html = r#"<a href="https://example.com">Link</a>"#;
        let result =
            parse_and_extract(html, "a", &ExtractMode::Attr("href".to_owned()), 10).unwrap();
        assert_eq!(result, "https://example.com");
    }

    #[test]
    fn extract_inner_html() {
        let html = "<div><span>inner</span></div>";
        let result = parse_and_extract(html, "div", &ExtractMode::Html, 10).unwrap();
        assert!(result.contains("<span>inner</span>"));
    }

    #[test]
    fn no_matches_returns_message() {
        let html = "<html><body><p>text</p></body></html>";
        let result = parse_and_extract(html, "h1", &ExtractMode::Text, 10).unwrap();
        assert!(result.starts_with("No results for selector:"));
    }

    #[test]
    fn empty_text_skipped() {
        let html = "<ul><li>  </li><li>A</li></ul>";
        let result = parse_and_extract(html, "li", &ExtractMode::Text, 10).unwrap();
        assert_eq!(result, "A");
    }

    #[test]
    fn invalid_selector_errors() {
        let html = "<html><body></body></html>";
        let result = parse_and_extract(html, "[[[invalid", &ExtractMode::Text, 10);
        assert!(result.is_err());
    }

    #[test]
    fn empty_html_returns_no_results() {
        let result = parse_and_extract("", "h1", &ExtractMode::Text, 10).unwrap();
        assert!(result.starts_with("No results for selector:"));
    }

    #[test]
    fn nested_selector() {
        let html = "<div><span>inner</span></div><span>outer</span>";
        let result = parse_and_extract(html, "div > span", &ExtractMode::Text, 10).unwrap();
        assert_eq!(result, "inner");
    }

    #[test]
    fn attr_missing_returns_empty() {
        let html = r#"<a>No href</a>"#;
        let result =
            parse_and_extract(html, "a", &ExtractMode::Attr("href".to_owned()), 10).unwrap();
        assert!(result.starts_with("No results for selector:"));
    }

    #[test]
    fn extract_html_mode() {
        let html = "<div><b>bold</b> text</div>";
        let result = parse_and_extract(html, "div", &ExtractMode::Html, 10).unwrap();
        assert!(result.contains("<b>bold</b>"));
    }

    #[test]
    fn limit_zero_returns_no_results() {
        let html = "<ul><li>A</li><li>B</li></ul>";
        let result = parse_and_extract(html, "li", &ExtractMode::Text, 0).unwrap();
        assert!(result.starts_with("No results for selector:"));
    }

    // --- validate_url edge cases ---

    #[test]
    fn url_with_port_allowed() {
        assert!(validate_url("https://example.com:8443/path").is_ok());
    }

    #[test]
    fn link_local_ip_blocked() {
        let err = validate_url("https://169.254.1.1/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn url_no_scheme_rejected() {
        let err = validate_url("example.com/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn unspecified_ipv4_blocked() {
        let err = validate_url("https://0.0.0.0/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn broadcast_ipv4_blocked() {
        let err = validate_url("https://255.255.255.255/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv6_link_local_blocked() {
        let err = validate_url("https://[fe80::1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv6_unique_local_blocked() {
        let err = validate_url("https://[fd12::1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv4_mapped_ipv6_loopback_blocked() {
        let err = validate_url("https://[::ffff:127.0.0.1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv4_mapped_ipv6_private_blocked() {
        let err = validate_url("https://[::ffff:10.0.0.1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    // --- WebScrapeExecutor (no-network) ---

    #[tokio::test]
    async fn executor_no_blocks_returns_none() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let result = executor.execute("plain text").await;
        assert!(result.unwrap().is_none());
    }

    #[tokio::test]
    async fn executor_invalid_json_errors() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\nnot json\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Execution(_))));
    }

    #[tokio::test]
    async fn executor_blocked_url_errors() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"http://example.com\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Blocked { .. })));
    }

    #[tokio::test]
    async fn executor_private_ip_blocked() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"https://192.168.1.1/api\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Blocked { .. })));
    }

    #[tokio::test]
    async fn executor_unreachable_host_returns_error() {
        let config = ScrapeConfig {
            timeout: 1,
            max_body_bytes: 1_048_576,
        };
        let executor = WebScrapeExecutor::new(&config);
        // 192.0.2.0/24 is TEST-NET-1: public-range but reserved/unroutable,
        // so the connect attempt fails without touching a real host.
        let response = "```scrape\n{\"url\":\"https://192.0.2.1:1/page\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Execution(_))));
    }

    #[tokio::test]
    async fn executor_localhost_url_blocked() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"https://localhost:9999/api\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(matches!(result, Err(ToolError::Blocked { .. })));
    }

    #[tokio::test]
    async fn executor_empty_text_returns_none() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let result = executor.execute("").await;
        assert!(result.unwrap().is_none());
    }

    #[tokio::test]
    async fn executor_multiple_blocks_first_blocked() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let response = "```scrape\n{\"url\":\"http://evil.com\",\"select\":\"h1\"}\n```\n\
             ```scrape\n{\"url\":\"https://ok.com\",\"select\":\"h1\"}\n```";
        let result = executor.execute(response).await;
        assert!(result.is_err());
    }

    #[test]
    fn validate_url_empty_string() {
        let err = validate_url("").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn validate_url_javascript_scheme_blocked() {
        let err = validate_url("javascript:alert(1)").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn validate_url_data_scheme_blocked() {
        let err = validate_url("data:text/html,<h1>hi</h1>").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn is_private_host_public_domain_is_false() {
        let host: url::Host<&str> = url::Host::Domain("example.com");
        assert!(!is_private_host(&host));
    }

    #[test]
    fn is_private_host_localhost_is_true() {
        let host: url::Host<&str> = url::Host::Domain("localhost");
        assert!(is_private_host(&host));
    }

    #[test]
    fn is_private_host_ipv6_unspecified_is_true() {
        let host = url::Host::Ipv6(std::net::Ipv6Addr::UNSPECIFIED);
        assert!(is_private_host(&host));
    }

    #[test]
    fn is_private_host_public_ipv6_is_false() {
        let host = url::Host::Ipv6("2001:db8::1".parse().unwrap());
        assert!(!is_private_host(&host));
    }

    #[test]
    fn extract_scrape_blocks_empty_block_content() {
        let text = "```scrape\n\n```";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 1);
        assert!(blocks[0].is_empty());
    }

    #[test]
    fn extract_scrape_blocks_whitespace_only() {
        let text = "```scrape\n   \n```";
        let blocks = extract_scrape_blocks(text);
        assert_eq!(blocks.len(), 1);
    }

    #[test]
    fn parse_and_extract_multiple_selectors() {
        let html = "<div><h1>Title</h1><p>Para</p></div>";
        let result = parse_and_extract(html, "h1, p", &ExtractMode::Text, 10).unwrap();
        assert!(result.contains("Title"));
        assert!(result.contains("Para"));
    }

    #[test]
    fn webscrape_executor_new_with_custom_config() {
        let config = ScrapeConfig {
            timeout: 60,
            max_body_bytes: 512,
        };
        let executor = WebScrapeExecutor::new(&config);
        assert_eq!(executor.max_body_bytes, 512);
    }

    #[test]
    fn webscrape_executor_debug() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let dbg = format!("{executor:?}");
        assert!(dbg.contains("WebScrapeExecutor"));
    }

    #[test]
    fn extract_mode_attr_empty_name() {
        let mode = ExtractMode::parse("attr:");
        assert!(matches!(mode, ExtractMode::Attr(ref s) if s.is_empty()));
    }

    #[test]
    fn default_extract_returns_text() {
        assert_eq!(default_extract(), "text");
    }

    #[test]
    fn scrape_instruction_debug() {
        let json = r#"{"url":"https://example.com","select":"h1"}"#;
        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
        let dbg = format!("{instr:?}");
        assert!(dbg.contains("ScrapeInstruction"));
    }

    #[test]
    fn extract_mode_debug() {
        let mode = ExtractMode::Text;
        let dbg = format!("{mode:?}");
        assert!(dbg.contains("Text"));
    }

    #[test]
    fn ipv4_mapped_ipv6_link_local_blocked() {
        let err = validate_url("https://[::ffff:169.254.0.1]/path").unwrap_err();
        assert!(matches!(err, ToolError::Blocked { .. }));
    }

    #[test]
    fn ipv4_mapped_ipv6_public_allowed() {
        assert!(validate_url("https://[::ffff:93.184.216.34]/path").is_ok());
    }

    #[test]
    fn tool_definitions_returns_web_scrape() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let defs = executor.tool_definitions();
        assert_eq!(defs.len(), 1);
        assert_eq!(defs[0].id, "web_scrape");
        assert_eq!(
            defs[0].invocation,
            crate::registry::InvocationHint::FencedBlock("scrape")
        );
    }

    #[test]
    fn tool_definitions_schema_has_all_params() {
        let config = ScrapeConfig::default();
        let executor = WebScrapeExecutor::new(&config);
        let defs = executor.tool_definitions();
        let obj = defs[0].schema.as_object().unwrap();
        let props = obj["properties"].as_object().unwrap();
        assert!(props.contains_key("url"));
        assert!(props.contains_key("select"));
        assert!(props.contains_key("extract"));
        assert!(props.contains_key("limit"));
        let req = obj["required"].as_array().unwrap();
        assert!(req.iter().any(|v| v.as_str() == Some("url")));
        assert!(req.iter().any(|v| v.as_str() == Some("select")));
        assert!(!req.iter().any(|v| v.as_str() == Some("extract")));
    }

    // --- is_private_host: new domain checks (AUD-02) ---

    #[test]
    fn subdomain_localhost_blocked() {
        let host: url::Host<&str> = url::Host::Domain("foo.localhost");
        assert!(is_private_host(&host));
    }

    #[test]
    fn internal_tld_blocked() {
        let host: url::Host<&str> = url::Host::Domain("service.internal");
        assert!(is_private_host(&host));
    }

    #[test]
    fn local_tld_blocked() {
        let host: url::Host<&str> = url::Host::Domain("printer.local");
        assert!(is_private_host(&host));
    }

    #[test]
    fn public_domain_not_blocked() {
        let host: url::Host<&str> = url::Host::Domain("example.com");
        assert!(!is_private_host(&host));
    }

    // --- resolve_and_validate: private IP rejection ---

    #[tokio::test]
    async fn resolve_loopback_rejected() {
        // 127.0.0.1 resolves directly (literal IP in DNS query)
        let url = url::Url::parse("https://127.0.0.1/path").unwrap();
        // validate_url catches this before resolve_and_validate, but test directly
        let result = resolve_and_validate(&url).await;
        assert!(
            result.is_err(),
            "loopback IP must be rejected by resolve_and_validate"
        );
        let err = result.unwrap_err();
        assert!(matches!(err, crate::executor::ToolError::Blocked { .. }));
    }

    #[tokio::test]
    async fn resolve_private_10_rejected() {
        let url = url::Url::parse("https://10.0.0.1/path").unwrap();
        let result = resolve_and_validate(&url).await;
        assert!(result.is_err());
        assert!(matches!(
            result.unwrap_err(),
            crate::executor::ToolError::Blocked { .. }
        ));
    }

    #[tokio::test]
    async fn resolve_private_192_rejected() {
        let url = url::Url::parse("https://192.168.1.1/path").unwrap();
        let result = resolve_and_validate(&url).await;
        assert!(result.is_err());
        assert!(matches!(
            result.unwrap_err(),
            crate::executor::ToolError::Blocked { .. }
        ));
    }

    #[tokio::test]
    async fn resolve_ipv6_loopback_rejected() {
        let url = url::Url::parse("https://[::1]/path").unwrap();
        let result = resolve_and_validate(&url).await;
        assert!(result.is_err());
        assert!(matches!(
            result.unwrap_err(),
            crate::executor::ToolError::Blocked { .. }
        ));
    }

    #[tokio::test]
    async fn resolve_no_host_returns_ok() {
        // URL without a resolvable host — should pass through
        let url = url::Url::parse("https://example.com/path").unwrap();
        // We can't do a live DNS test, but we can verify a URL with no host
        let url_no_host = url::Url::parse("data:text/plain,hello").unwrap();
        // data: URLs have no host; resolve_and_validate should return Ok with empty addrs
        let result = resolve_and_validate(&url_no_host).await;
        assert!(result.is_ok());
        let (host, addrs) = result.unwrap();
        assert!(host.is_empty());
        assert!(addrs.is_empty());
        // NOTE(review): `url` is never exercised and these explicit drops are
        // redundant — presumably left to silence an unused-variable lint.
        drop(url);
        drop(url_no_host);
    }
}