zeph_tools/
scrape.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Web scraping executor with SSRF protection and domain policy enforcement.
5//!
6//! Exposes two tools to the LLM:
7//!
8//! - **`web_scrape`** — fetches a URL and extracts elements matching a CSS selector.
9//! - **`fetch`** — fetches a URL and returns the raw response body as UTF-8 text.
10//!
11//! Both tools enforce:
12//!
13//! - HTTPS-only URLs (HTTP and other schemes are rejected).
14//! - DNS resolution followed by a private-IP check to prevent SSRF.
15//! - Optional domain allowlist and denylist from [`ScrapeConfig`].
16//! - Configurable timeout and maximum response body size.
//! - Automatic redirect following is disabled; redirects are followed manually (at most
//!   3 hops) and every target is re-validated, to prevent open-redirect SSRF bypasses.
18
19use std::net::{IpAddr, SocketAddr};
20use std::sync::Arc;
21use std::sync::atomic::{AtomicU64, Ordering};
22use std::time::{Duration, Instant};
23
24use schemars::JsonSchema;
25use serde::Deserialize;
26use url::Url;
27
28use zeph_common::ToolName;
29
30use zeph_sanitizer::IpiFilter;
31
32use crate::audit::{AuditEntry, AuditLogger, AuditResult, EgressEvent, chrono_now};
33use crate::config::{EgressConfig, ScrapeConfig};
34use crate::executor::{
35    ClaimSource, ToolCall, ToolError, ToolExecutor, ToolOutput, deserialize_params,
36};
37use crate::net::is_private_ip;
38
39/// Strips userinfo (`user:pass@`) and sensitive query params from a URL for safe logging.
40///
41/// Returns the sanitized URL string; falls back to the original if parsing fails.
42fn redact_url_for_log(url: &str) -> String {
43    let Ok(mut parsed) = Url::parse(url) else {
44        return url.to_owned();
45    };
46    // Remove userinfo.
47    let _ = parsed.set_username("");
48    let _ = parsed.set_password(None);
49    // Strip query params whose names suggest secrets (token, key, secret, password, auth, sig).
50    let sensitive = [
51        "token", "key", "secret", "password", "auth", "sig", "api_key", "apikey",
52    ];
53    let filtered: Vec<(String, String)> = parsed
54        .query_pairs()
55        .filter(|(k, _)| {
56            let lower = k.to_lowercase();
57            !sensitive.iter().any(|s| lower.contains(s))
58        })
59        .map(|(k, v)| (k.into_owned(), v.into_owned()))
60        .collect();
61    if filtered.is_empty() {
62        parsed.set_query(None);
63    } else {
64        let q: String = filtered
65            .iter()
66            .map(|(k, v)| format!("{k}={v}"))
67            .collect::<Vec<_>>()
68            .join("&");
69        parsed.set_query(Some(&q));
70    }
71    parsed.to_string()
72}
73
// Parameters of the `fetch` tool. Deserialized from the tool-call params in
// `execute_tool_call` and exported as that tool's schema via `schema_for!`.
// NOTE(review): `///` comments on this type presumably surface as JSON-schema
// descriptions through `JsonSchema` — confirm before rewording them; plain `//`
// comments are used here so the generated schema stays unchanged.
#[derive(Debug, Deserialize, JsonSchema)]
struct FetchParams {
    /// HTTPS URL to fetch
    url: String,
}
79
// Parameters of the `web_scrape` tool. Built either from the JSON body of a legacy
// fenced `scrape` block (`execute`) or from structured tool-call params
// (`execute_tool_call`), and exported as the tool's schema via `schema_for!`.
// NOTE(review): `///` comments on this type presumably surface as JSON-schema
// descriptions through `JsonSchema` — confirm before rewording them; plain `//`
// comments are used here so the generated schema stays unchanged.
#[derive(Debug, Deserialize, JsonSchema)]
struct ScrapeInstruction {
    /// HTTPS URL to scrape
    url: String,
    /// CSS selector
    select: String,
    // Interpreted by `ExtractMode::parse`; when omitted, serde fills in
    // `default_extract()`, i.e. "text".
    /// Extract mode: text, html, or attr:<name>
    #[serde(default = "default_extract")]
    extract: String,
    // `scrape_instruction` falls back to 10 when this is `None`.
    /// Max results to return
    limit: Option<usize>,
}
92
/// Serde default for the `extract` field of `ScrapeInstruction`: the `"text"` mode.
fn default_extract() -> String {
    String::from("text")
}
96
/// Extract mode selected by the `extract` parameter: text, HTML, or a named attribute.
#[derive(Debug)]
enum ExtractMode {
    /// Selected by `"text"`, and by any unrecognized mode string.
    Text,
    /// Selected by `"html"`.
    Html,
    /// Selected by `"attr:<name>"`; carries `<name>` (which may be empty).
    Attr(String),
}

impl ExtractMode {
    /// Parses an extract-mode string.
    ///
    /// `"attr:<name>"` yields [`ExtractMode::Attr`] with everything after the prefix,
    /// `"html"` yields [`ExtractMode::Html`], and every other input — including
    /// `"text"` and unknown values — yields [`ExtractMode::Text`].
    fn parse(s: &str) -> Self {
        if let Some(attribute) = s.strip_prefix("attr:") {
            return Self::Attr(attribute.to_owned());
        }
        if s == "html" {
            Self::Html
        } else {
            Self::Text
        }
    }
}
116
/// Extracts data from web pages via CSS selectors.
///
/// Handles two invocation paths:
///
/// 1. **Legacy fenced blocks** — detects ` ```scrape ` blocks in the LLM response, each
///    containing a JSON scrape instruction object. Dispatched via [`ToolExecutor::execute`].
/// 2. **Structured tool calls** — dispatched via [`ToolExecutor::execute_tool_call`] for
///    tool IDs `"web_scrape"` and `"fetch"`.
///
/// # Security
///
/// - Only HTTPS URLs are accepted. HTTP and other schemes return [`ToolError::InvalidParams`].
/// - DNS is resolved before any request is sent and each resolved address is checked
///   against [`is_private_ip`]. Private addresses are rejected to prevent SSRF. The HTTP
///   client is then pinned to the validated addresses.
/// - Automatic HTTP redirect following is disabled (`Policy::none()`). Redirects are
///   instead followed manually, at most 3 hops, with every target re-validated, to
///   prevent open-redirect bypasses.
/// - Domain allowlists and denylists from config are enforced before DNS resolution.
///
/// # Example
///
/// ```rust,no_run
/// use zeph_tools::{WebScrapeExecutor, ToolExecutor, ToolCall, ScrapeConfig};
/// use zeph_common::ToolName;
///
/// # async fn example() {
/// let executor = WebScrapeExecutor::new(&ScrapeConfig::default());
///
/// let call = ToolCall {
///     tool_id: ToolName::new("fetch"),
///     params: {
///         let mut m = serde_json::Map::new();
///         m.insert("url".to_owned(), serde_json::json!("https://example.com"));
///         m
///     },
///     caller_id: None,
///     context: None,
///     tool_call_id: String::new(),
///     skill_name: None,
/// };
/// let _ = executor.execute_tool_call(&call).await;
/// # }
/// ```
#[derive(Debug)]
pub struct WebScrapeExecutor {
    /// Timeout applied to every HTTP client built by this executor.
    timeout: Duration,
    /// Maximum response body size, copied from the scrape config.
    // NOTE(review): presumably enforced inside `fetch_html` — confirm there.
    max_body_bytes: usize,
    /// Allowlist handed to `check_domain_policy` before DNS resolution.
    allowed_domains: Vec<String>,
    /// Denylist handed to `check_domain_policy` before DNS resolution.
    denied_domains: Vec<String>,
    /// Optional audit sink; when `None`, audit and egress log writes are skipped.
    audit_logger: Option<Arc<AuditLogger>>,
    /// Controls whether egress events (and blocked-request events) are emitted.
    egress_config: EgressConfig,
    /// Optional telemetry channel; events are pushed with `try_send`, never awaited.
    egress_tx: Option<tokio::sync::mpsc::Sender<EgressEvent>>,
    /// Count of egress events dropped because the telemetry channel was full.
    egress_dropped: Arc<AtomicU64>,
    /// IPI filter applied to every fetched response body before returning to callers.
    ipi_filter: IpiFilter,
}
171
172impl WebScrapeExecutor {
173    /// Create a new `WebScrapeExecutor` from configuration.
174    ///
175    /// No network connections are made at construction time.
176    #[must_use]
177    pub fn new(config: &ScrapeConfig) -> Self {
178        Self {
179            timeout: Duration::from_secs(config.timeout),
180            max_body_bytes: config.max_body_bytes,
181            allowed_domains: config.allowed_domains.clone(),
182            denied_domains: config.denied_domains.clone(),
183            audit_logger: None,
184            egress_config: EgressConfig::default(),
185            egress_tx: None,
186            egress_dropped: Arc::new(AtomicU64::new(0)),
187            ipi_filter: IpiFilter::new(config.ipi_filter_threshold),
188        }
189    }
190
191    /// Attach an audit logger. Each tool invocation will emit an [`AuditEntry`].
192    #[must_use]
193    pub fn with_audit(mut self, logger: Arc<AuditLogger>) -> Self {
194        self.audit_logger = Some(logger);
195        self
196    }
197
198    /// Configure egress event logging.
199    #[must_use]
200    pub fn with_egress_config(mut self, config: EgressConfig) -> Self {
201        self.egress_config = config;
202        self
203    }
204
205    /// Attach the egress telemetry channel sender and drop counter.
206    ///
207    /// Events are sent via [`tokio::sync::mpsc::Sender::try_send`] — the executor
208    /// never blocks waiting for capacity.
209    #[must_use]
210    pub fn with_egress_tx(
211        mut self,
212        tx: tokio::sync::mpsc::Sender<EgressEvent>,
213        dropped: Arc<AtomicU64>,
214    ) -> Self {
215        self.egress_tx = Some(tx);
216        self.egress_dropped = dropped;
217        self
218    }
219
220    /// Returns a clone of the egress drop counter, for use in the drain task.
221    #[must_use]
222    pub fn egress_dropped(&self) -> Arc<AtomicU64> {
223        Arc::clone(&self.egress_dropped)
224    }
225
226    fn build_client(&self, host: &str, addrs: &[SocketAddr]) -> reqwest::Client {
227        let mut builder = reqwest::Client::builder()
228            .timeout(self.timeout)
229            .redirect(reqwest::redirect::Policy::none());
230        builder = builder.resolve_to_addrs(host, addrs);
231        builder.build().unwrap_or_default()
232    }
233}
234
235impl ToolExecutor for WebScrapeExecutor {
236    fn tool_definitions(&self) -> Vec<crate::registry::ToolDef> {
237        use crate::registry::{InvocationHint, ToolDef};
238        vec![
239            ToolDef {
240                id: "web_scrape".into(),
241                description: "Extract structured data from a web page using CSS selectors.\n\nONLY call this tool when the user has explicitly provided a URL in their message, or when a prior tool call returned a URL to retrieve. NEVER construct, guess, or infer a URL from entity names, brand knowledge, or domain patterns.\n\nParameters: url (string, required) - HTTPS URL; select (string, required) - CSS selector; extract (string, optional) - \"text\", \"html\", or \"attr:<name>\"; limit (integer, optional) - max results\nReturns: extracted text/HTML/attribute values, one per line\nErrors: InvalidParams if URL is not HTTPS or selector is empty; Timeout after configured seconds; connection/DNS failures".into(),
242                schema: schemars::schema_for!(ScrapeInstruction),
243                invocation: InvocationHint::FencedBlock("scrape"),
244                output_schema: None,
245            },
246            ToolDef {
247                id: "fetch".into(),
248                description: "Fetch a URL and return the response body as plain text.\n\nONLY call this tool when the user has explicitly provided a URL in their message, or when a prior tool call returned a URL to retrieve. NEVER construct, guess, or infer a URL from entity names, brand knowledge, or domain patterns. If no URL is present in the conversation, do not call this tool.\n\nParameters: url (string, required) - HTTPS URL to fetch\nReturns: response body as UTF-8 text, truncated if exceeding max body size\nErrors: InvalidParams if URL is not HTTPS; Timeout; SSRF-blocked private IPs; connection failures".into(),
249                schema: schemars::schema_for!(FetchParams),
250                invocation: InvocationHint::ToolCall,
251                output_schema: None,
252            },
253        ]
254    }
255
256    async fn execute(&self, response: &str) -> Result<Option<ToolOutput>, ToolError> {
257        let blocks = extract_scrape_blocks(response);
258        if blocks.is_empty() {
259            return Ok(None);
260        }
261
262        let mut outputs = Vec::with_capacity(blocks.len());
263        #[allow(clippy::cast_possible_truncation)]
264        let blocks_executed = blocks.len() as u32;
265
266        for block in &blocks {
267            let instruction: ScrapeInstruction = serde_json::from_str(block).map_err(|e| {
268                ToolError::Execution(std::io::Error::new(
269                    std::io::ErrorKind::InvalidData,
270                    e.to_string(),
271                ))
272            })?;
273            let correlation_id = EgressEvent::new_correlation_id();
274            let start = Instant::now();
275            let scrape_result = self
276                .scrape_instruction(&instruction, &correlation_id, None, None)
277                .await;
278            #[allow(clippy::cast_possible_truncation)]
279            let duration_ms = start.elapsed().as_millis() as u64;
280            match scrape_result {
281                Ok(output) => {
282                    self.log_audit(
283                        "web_scrape",
284                        &redact_url_for_log(&instruction.url),
285                        AuditResult::Success,
286                        duration_ms,
287                        None,
288                        None,
289                        None,
290                        Some(correlation_id),
291                    )
292                    .await;
293                    outputs.push(output);
294                }
295                Err(e) => {
296                    let audit_result = tool_error_to_audit_result(&e);
297                    self.log_audit(
298                        "web_scrape",
299                        &redact_url_for_log(&instruction.url),
300                        audit_result,
301                        duration_ms,
302                        Some(&e),
303                        None,
304                        None,
305                        Some(correlation_id),
306                    )
307                    .await;
308                    return Err(e);
309                }
310            }
311        }
312
313        Ok(Some(ToolOutput {
314            tool_name: ToolName::new("web-scrape"),
315            summary: outputs.join("\n\n"),
316            blocks_executed,
317            filter_stats: None,
318            diff: None,
319            streamed: false,
320            terminal_id: None,
321            locations: None,
322            raw_response: None,
323            claim_source: Some(ClaimSource::WebScrape),
324        }))
325    }
326
327    #[cfg_attr(
328        feature = "profiling",
329        tracing::instrument(name = "tools.scrape.fetch", skip_all)
330    )]
331    async fn execute_tool_call(&self, call: &ToolCall) -> Result<Option<ToolOutput>, ToolError> {
332        match call.tool_id.as_str() {
333            "web_scrape" => {
334                let instruction: ScrapeInstruction = deserialize_params(&call.params)?;
335                let correlation_id = EgressEvent::new_correlation_id();
336                let start = Instant::now();
337                let result = self
338                    .scrape_instruction(
339                        &instruction,
340                        &correlation_id,
341                        call.caller_id.clone(),
342                        call.skill_name.clone(),
343                    )
344                    .await;
345                #[allow(clippy::cast_possible_truncation)]
346                let duration_ms = start.elapsed().as_millis() as u64;
347                self.run_with_audit(
348                    "web_scrape",
349                    "web-scrape",
350                    &redact_url_for_log(&instruction.url),
351                    call.caller_id.clone(),
352                    call.skill_name.clone(),
353                    correlation_id,
354                    duration_ms,
355                    result,
356                )
357                .await
358            }
359            "fetch" => {
360                let p: FetchParams = deserialize_params(&call.params)?;
361                let correlation_id = EgressEvent::new_correlation_id();
362                let start = Instant::now();
363                let result = self
364                    .handle_fetch(
365                        &p,
366                        &correlation_id,
367                        call.caller_id.clone(),
368                        call.skill_name.clone(),
369                    )
370                    .await;
371                #[allow(clippy::cast_possible_truncation)]
372                let duration_ms = start.elapsed().as_millis() as u64;
373                self.run_with_audit(
374                    "fetch",
375                    "fetch",
376                    &redact_url_for_log(&p.url),
377                    call.caller_id.clone(),
378                    call.skill_name.clone(),
379                    correlation_id,
380                    duration_ms,
381                    result,
382                )
383                .await
384            }
385            _ => Ok(None),
386        }
387    }
388
389    fn is_tool_retryable(&self, tool_id: &str) -> bool {
390        matches!(tool_id, "web_scrape" | "fetch")
391    }
392}
393
394fn tool_error_to_audit_result(e: &ToolError) -> AuditResult {
395    match e {
396        ToolError::Blocked { command } => AuditResult::Blocked {
397            reason: command.clone(),
398        },
399        ToolError::Timeout { .. } => AuditResult::Timeout,
400        _ => AuditResult::Error {
401            message: e.to_string(),
402        },
403    }
404}
405
406impl WebScrapeExecutor {
407    #[allow(clippy::too_many_arguments)]
408    async fn run_with_audit(
409        &self,
410        audit_tool_name: &str,
411        public_tool_name: &str,
412        audit_command: &str,
413        caller_id: Option<String>,
414        skill_name: Option<Vec<String>>,
415        correlation_id: String,
416        duration_ms: u64,
417        result: Result<String, ToolError>,
418    ) -> Result<Option<ToolOutput>, ToolError> {
419        match result {
420            Ok(output) => {
421                self.log_audit(
422                    audit_tool_name,
423                    audit_command,
424                    AuditResult::Success,
425                    duration_ms,
426                    None,
427                    caller_id,
428                    skill_name,
429                    Some(correlation_id),
430                )
431                .await;
432                Ok(Some(ToolOutput {
433                    tool_name: ToolName::new(public_tool_name),
434                    summary: output,
435                    blocks_executed: 1,
436                    filter_stats: None,
437                    diff: None,
438                    streamed: false,
439                    terminal_id: None,
440                    locations: None,
441                    raw_response: None,
442                    claim_source: Some(ClaimSource::WebScrape),
443                }))
444            }
445            Err(e) => {
446                let audit_result = tool_error_to_audit_result(&e);
447                self.log_audit(
448                    audit_tool_name,
449                    audit_command,
450                    audit_result,
451                    duration_ms,
452                    Some(&e),
453                    caller_id,
454                    skill_name,
455                    Some(correlation_id),
456                )
457                .await;
458                Err(e)
459            }
460        }
461    }
462
463    #[allow(clippy::too_many_arguments)] // function with many required inputs; a *Params struct would be more verbose without simplifying the call site
464    async fn log_audit(
465        &self,
466        tool: &str,
467        command: &str,
468        result: AuditResult,
469        duration_ms: u64,
470        error: Option<&ToolError>,
471        caller_id: Option<String>,
472        skill_name: Option<Vec<String>>,
473        correlation_id: Option<String>,
474    ) {
475        if let Some(ref logger) = self.audit_logger {
476            let (error_category, error_domain, error_phase) =
477                error.map_or((None, None, None), |e| {
478                    let cat = e.category();
479                    (
480                        Some(cat.label().to_owned()),
481                        Some(cat.domain().label().to_owned()),
482                        Some(cat.phase().label().to_owned()),
483                    )
484                });
485            let entry = AuditEntry {
486                timestamp: chrono_now(),
487                tool: tool.into(),
488                command: command.into(),
489                result,
490                duration_ms,
491                error_category,
492                error_domain,
493                error_phase,
494                claim_source: Some(ClaimSource::WebScrape),
495                mcp_server_id: None,
496                injection_flagged: false,
497                embedding_anomalous: false,
498                cross_boundary_mcp_to_acp: false,
499                adversarial_policy_decision: None,
500                exit_code: None,
501                truncated: false,
502                caller_id,
503                skill_name,
504                policy_match: None,
505                correlation_id,
506                vigil_risk: None,
507                execution_env: None,
508                resolved_cwd: None,
509                scope_at_definition: None,
510                scope_at_dispatch: None,
511            };
512            logger.log(&entry).await;
513        }
514    }
515
516    fn send_egress_event(&self, event: EgressEvent) {
517        if let Some(ref tx) = self.egress_tx {
518            match tx.try_send(event) {
519                Ok(()) => {}
520                Err(tokio::sync::mpsc::error::TrySendError::Full(_)) => {
521                    self.egress_dropped.fetch_add(1, Ordering::Relaxed);
522                }
523                Err(tokio::sync::mpsc::error::TrySendError::Closed(_)) => {
524                    tracing::debug!("egress channel closed; executor continuing without telemetry");
525                }
526            }
527        }
528    }
529
530    async fn log_egress_event(&self, event: &EgressEvent) {
531        if let Some(ref logger) = self.audit_logger {
532            logger.log_egress(event).await;
533        }
534        self.send_egress_event(event.clone());
535    }
536
537    async fn handle_fetch(
538        &self,
539        params: &FetchParams,
540        correlation_id: &str,
541        caller_id: Option<String>,
542        skill_name: Option<Vec<String>>,
543    ) -> Result<String, ToolError> {
544        let parsed = validate_url(&params.url);
545        let host_str = parsed
546            .as_ref()
547            .map(|u| u.host_str().unwrap_or("").to_owned())
548            .unwrap_or_default();
549
550        if let Err(ref _e) = parsed {
551            if self.egress_config.enabled && self.egress_config.log_blocked {
552                let event = Self::make_blocked_event(
553                    "fetch",
554                    &params.url,
555                    &host_str,
556                    correlation_id,
557                    caller_id.clone(),
558                    skill_name.clone(),
559                    "scheme",
560                );
561                self.log_egress_event(&event).await;
562            }
563            return Err(parsed.unwrap_err());
564        }
565        let parsed = parsed.unwrap();
566
567        if let Err(e) = check_domain_policy(
568            parsed.host_str().unwrap_or(""),
569            &self.allowed_domains,
570            &self.denied_domains,
571        ) {
572            if self.egress_config.enabled && self.egress_config.log_blocked {
573                let event = Self::make_blocked_event(
574                    "fetch",
575                    &params.url,
576                    parsed.host_str().unwrap_or(""),
577                    correlation_id,
578                    caller_id.clone(),
579                    skill_name.clone(),
580                    "blocklist",
581                );
582                self.log_egress_event(&event).await;
583            }
584            return Err(e);
585        }
586
587        let (host, addrs) = match resolve_and_validate(&parsed).await {
588            Ok(v) => v,
589            Err(e) => {
590                if self.egress_config.enabled && self.egress_config.log_blocked {
591                    let event = Self::make_blocked_event(
592                        "fetch",
593                        &params.url,
594                        parsed.host_str().unwrap_or(""),
595                        correlation_id,
596                        caller_id.clone(),
597                        skill_name.clone(),
598                        "ssrf",
599                    );
600                    self.log_egress_event(&event).await;
601                }
602                return Err(e);
603            }
604        };
605
606        let body = self
607            .fetch_html(
608                &params.url,
609                &host,
610                &addrs,
611                "fetch",
612                correlation_id,
613                caller_id,
614                skill_name,
615            )
616            .await?;
617        self.apply_ipi_filter(&body, &params.url).await
618    }
619
620    async fn scrape_instruction(
621        &self,
622        instruction: &ScrapeInstruction,
623        correlation_id: &str,
624        caller_id: Option<String>,
625        skill_name: Option<Vec<String>>,
626    ) -> Result<String, ToolError> {
627        let parsed = validate_url(&instruction.url);
628        let host_str = parsed
629            .as_ref()
630            .map(|u| u.host_str().unwrap_or("").to_owned())
631            .unwrap_or_default();
632
633        if let Err(ref _e) = parsed {
634            if self.egress_config.enabled && self.egress_config.log_blocked {
635                let event = Self::make_blocked_event(
636                    "web_scrape",
637                    &instruction.url,
638                    &host_str,
639                    correlation_id,
640                    caller_id.clone(),
641                    skill_name.clone(),
642                    "scheme",
643                );
644                self.log_egress_event(&event).await;
645            }
646            return Err(parsed.unwrap_err());
647        }
648        let parsed = parsed.unwrap();
649
650        if let Err(e) = check_domain_policy(
651            parsed.host_str().unwrap_or(""),
652            &self.allowed_domains,
653            &self.denied_domains,
654        ) {
655            if self.egress_config.enabled && self.egress_config.log_blocked {
656                let event = Self::make_blocked_event(
657                    "web_scrape",
658                    &instruction.url,
659                    parsed.host_str().unwrap_or(""),
660                    correlation_id,
661                    caller_id.clone(),
662                    skill_name.clone(),
663                    "blocklist",
664                );
665                self.log_egress_event(&event).await;
666            }
667            return Err(e);
668        }
669
670        let (host, addrs) = match resolve_and_validate(&parsed).await {
671            Ok(v) => v,
672            Err(e) => {
673                if self.egress_config.enabled && self.egress_config.log_blocked {
674                    let event = Self::make_blocked_event(
675                        "web_scrape",
676                        &instruction.url,
677                        parsed.host_str().unwrap_or(""),
678                        correlation_id,
679                        caller_id.clone(),
680                        skill_name.clone(),
681                        "ssrf",
682                    );
683                    self.log_egress_event(&event).await;
684                }
685                return Err(e);
686            }
687        };
688
689        let html = self
690            .fetch_html(
691                &instruction.url,
692                &host,
693                &addrs,
694                "web_scrape",
695                correlation_id,
696                caller_id,
697                skill_name,
698            )
699            .await?;
700        let selector = instruction.select.clone();
701        let extract = ExtractMode::parse(&instruction.extract);
702        let limit = instruction.limit.unwrap_or(10);
703        let extracted = tokio::task::spawn_blocking(move || {
704            parse_and_extract(&html, &selector, &extract, limit)
705        })
706        .await
707        .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))??;
708        // apply_ipi_filter runs on plain extracted text, not raw HTML
709        self.apply_ipi_filter(&extracted, &instruction.url).await
710    }
711
712    fn make_blocked_event(
713        tool: &str,
714        url: &str,
715        host: &str,
716        correlation_id: &str,
717        caller_id: Option<String>,
718        skill_name: Option<Vec<String>>,
719        block_reason: &'static str,
720    ) -> EgressEvent {
721        EgressEvent {
722            timestamp: chrono_now(),
723            kind: "egress",
724            correlation_id: correlation_id.to_owned(),
725            tool: tool.into(),
726            url: redact_url_for_log(url),
727            host: host.to_owned(),
728            method: "GET".to_owned(),
729            status: None,
730            duration_ms: 0,
731            response_bytes: 0,
732            blocked: true,
733            block_reason: Some(block_reason),
734            caller_id,
735            skill_name,
736            hop: 0,
737        }
738    }
739
740    /// Fetches the HTML at `url`, manually following up to 3 redirects.
741    ///
742    /// Each redirect target is validated with `validate_url` and `resolve_and_validate`
743    /// before following, preventing SSRF via redirect chains. When egress logging is
744    /// enabled, one [`EgressEvent`] is emitted per hop.
745    ///
746    /// # Errors
747    ///
748    /// Returns `ToolError::Blocked` if any redirect target resolves to a private IP.
749    /// Returns `ToolError::Execution` on HTTP errors, too-large bodies, or too many redirects.
750    #[allow(clippy::too_many_lines, clippy::too_many_arguments)]
751    async fn fetch_html(
752        &self,
753        url: &str,
754        host: &str,
755        addrs: &[SocketAddr],
756        tool: &str,
757        correlation_id: &str,
758        caller_id: Option<String>,
759        skill_name: Option<Vec<String>>,
760    ) -> Result<String, ToolError> {
761        const MAX_REDIRECTS: usize = 3;
762
763        let mut current_url = url.to_owned();
764        let mut current_host = host.to_owned();
765        let mut current_addrs = addrs.to_vec();
766
767        for hop in 0..=MAX_REDIRECTS {
768            let hop_start = Instant::now();
769            // Build a per-hop client pinned to the current hop's validated addresses.
770            let client = self.build_client(&current_host, &current_addrs);
771            let resp = client
772                .get(&current_url)
773                .send()
774                .await
775                .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())));
776
777            let resp = match resp {
778                Ok(r) => r,
779                Err(e) => {
780                    if self.egress_config.enabled {
781                        #[allow(clippy::cast_possible_truncation)]
782                        let duration_ms = hop_start.elapsed().as_millis() as u64;
783                        let event = EgressEvent {
784                            timestamp: chrono_now(),
785                            kind: "egress",
786                            correlation_id: correlation_id.to_owned(),
787                            tool: tool.into(),
788                            url: redact_url_for_log(&current_url),
789                            host: current_host.clone(),
790                            method: "GET".to_owned(),
791                            status: None,
792                            duration_ms,
793                            response_bytes: 0,
794                            blocked: false,
795                            block_reason: None,
796                            caller_id: caller_id.clone(),
797                            skill_name: skill_name.clone(),
798                            #[allow(clippy::cast_possible_truncation)]
799                            hop: hop as u8,
800                        };
801                        self.log_egress_event(&event).await;
802                    }
803                    return Err(e);
804                }
805            };
806
807            let status = resp.status();
808
809            if status.is_redirection() {
810                if hop == MAX_REDIRECTS {
811                    return Err(ToolError::Execution(std::io::Error::other(
812                        "too many redirects",
813                    )));
814                }
815
816                let location = resp
817                    .headers()
818                    .get(reqwest::header::LOCATION)
819                    .and_then(|v| v.to_str().ok())
820                    .ok_or_else(|| {
821                        ToolError::Execution(std::io::Error::other("redirect with no Location"))
822                    })?;
823
824                // Resolve relative redirect URLs against the current URL.
825                let base = Url::parse(&current_url)
826                    .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
827                let next_url = base
828                    .join(location)
829                    .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
830
831                let validated = validate_url(next_url.as_str());
832                if let Err(ref _e) = validated {
833                    if self.egress_config.enabled && self.egress_config.log_blocked {
834                        #[allow(clippy::cast_possible_truncation)]
835                        let duration_ms = hop_start.elapsed().as_millis() as u64;
836                        let next_host = next_url.host_str().unwrap_or("").to_owned();
837                        let event = EgressEvent {
838                            timestamp: chrono_now(),
839                            kind: "egress",
840                            correlation_id: correlation_id.to_owned(),
841                            tool: tool.into(),
842                            url: redact_url_for_log(next_url.as_str()),
843                            host: next_host,
844                            method: "GET".to_owned(),
845                            status: None,
846                            duration_ms,
847                            response_bytes: 0,
848                            blocked: true,
849                            block_reason: Some("ssrf"),
850                            caller_id: caller_id.clone(),
851                            skill_name: skill_name.clone(),
852                            #[allow(clippy::cast_possible_truncation)]
853                            hop: (hop + 1) as u8,
854                        };
855                        self.log_egress_event(&event).await;
856                    }
857                    return Err(validated.unwrap_err());
858                }
859                let validated = validated.unwrap();
860                let resolve_result = resolve_and_validate(&validated).await;
861                if let Err(ref _e) = resolve_result {
862                    if self.egress_config.enabled && self.egress_config.log_blocked {
863                        #[allow(clippy::cast_possible_truncation)]
864                        let duration_ms = hop_start.elapsed().as_millis() as u64;
865                        let next_host = next_url.host_str().unwrap_or("").to_owned();
866                        let event = EgressEvent {
867                            timestamp: chrono_now(),
868                            kind: "egress",
869                            correlation_id: correlation_id.to_owned(),
870                            tool: tool.into(),
871                            url: redact_url_for_log(next_url.as_str()),
872                            host: next_host,
873                            method: "GET".to_owned(),
874                            status: None,
875                            duration_ms,
876                            response_bytes: 0,
877                            blocked: true,
878                            block_reason: Some("ssrf"),
879                            caller_id: caller_id.clone(),
880                            skill_name: skill_name.clone(),
881                            #[allow(clippy::cast_possible_truncation)]
882                            hop: (hop + 1) as u8,
883                        };
884                        self.log_egress_event(&event).await;
885                    }
886                    return Err(resolve_result.unwrap_err());
887                }
888                let (next_host, next_addrs) = resolve_result.unwrap();
889
890                current_url = next_url.to_string();
891                current_host = next_host;
892                current_addrs = next_addrs;
893                continue;
894            }
895
896            if !status.is_success() {
897                if self.egress_config.enabled {
898                    #[allow(clippy::cast_possible_truncation)]
899                    let duration_ms = hop_start.elapsed().as_millis() as u64;
900                    let event = EgressEvent {
901                        timestamp: chrono_now(),
902                        kind: "egress",
903                        correlation_id: correlation_id.to_owned(),
904                        tool: tool.into(),
905                        url: redact_url_for_log(&current_url),
906                        host: current_host.clone(),
907                        method: "GET".to_owned(),
908                        status: Some(status.as_u16()),
909                        duration_ms,
910                        response_bytes: 0,
911                        blocked: false,
912                        block_reason: None,
913                        caller_id: caller_id.clone(),
914                        skill_name: skill_name.clone(),
915                        #[allow(clippy::cast_possible_truncation)]
916                        hop: hop as u8,
917                    };
918                    self.log_egress_event(&event).await;
919                }
920                return Err(ToolError::Http {
921                    status: status.as_u16(),
922                    message: status.canonical_reason().unwrap_or("unknown").to_owned(),
923                });
924            }
925
926            let bytes = resp
927                .bytes()
928                .await
929                .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
930
931            if bytes.len() > self.max_body_bytes {
932                if self.egress_config.enabled {
933                    #[allow(clippy::cast_possible_truncation)]
934                    let duration_ms = hop_start.elapsed().as_millis() as u64;
935                    let event = EgressEvent {
936                        timestamp: chrono_now(),
937                        kind: "egress",
938                        correlation_id: correlation_id.to_owned(),
939                        tool: tool.into(),
940                        url: redact_url_for_log(&current_url),
941                        host: current_host.clone(),
942                        method: "GET".to_owned(),
943                        status: Some(status.as_u16()),
944                        duration_ms,
945                        response_bytes: bytes.len(),
946                        blocked: false,
947                        block_reason: None,
948                        caller_id: caller_id.clone(),
949                        skill_name: skill_name.clone(),
950                        #[allow(clippy::cast_possible_truncation)]
951                        hop: hop as u8,
952                    };
953                    self.log_egress_event(&event).await;
954                }
955                return Err(ToolError::Execution(std::io::Error::other(format!(
956                    "response too large: {} bytes (max: {})",
957                    bytes.len(),
958                    self.max_body_bytes,
959                ))));
960            }
961
962            // Success — emit egress event.
963            if self.egress_config.enabled {
964                #[allow(clippy::cast_possible_truncation)]
965                let duration_ms = hop_start.elapsed().as_millis() as u64;
966                let response_bytes = if self.egress_config.log_response_bytes {
967                    bytes.len()
968                } else {
969                    0
970                };
971                let event = EgressEvent {
972                    timestamp: chrono_now(),
973                    kind: "egress",
974                    correlation_id: correlation_id.to_owned(),
975                    tool: tool.into(),
976                    url: redact_url_for_log(&current_url),
977                    host: current_host.clone(),
978                    method: "GET".to_owned(),
979                    status: Some(status.as_u16()),
980                    duration_ms,
981                    response_bytes,
982                    blocked: false,
983                    block_reason: None,
984                    caller_id: caller_id.clone(),
985                    skill_name: skill_name.clone(),
986                    #[allow(clippy::cast_possible_truncation)]
987                    hop: hop as u8,
988                };
989                self.log_egress_event(&event).await;
990            }
991
992            return String::from_utf8(bytes.to_vec())
993                .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())));
994        }
995
996        Err(ToolError::Execution(std::io::Error::other(
997            "too many redirects",
998        )))
999    }
1000
1001    /// Apply IPI filter to a fetched response body.
1002    ///
1003    /// Runs regex scanning on a blocking thread via `spawn_blocking` to avoid
1004    /// stalling the tokio executor on large inputs. When score >= threshold,
1005    /// prepends a warning header and emits a `tracing::warn!` log.
1006    ///
1007    /// # Errors
1008    ///
1009    /// Returns a [`ToolError`] if the blocking scan task panics.
1010    #[tracing::instrument(name = "tools.scrape.apply_ipi_filter", skip(self, body), fields(body_len = body.len()))]
1011    async fn apply_ipi_filter(&self, body: &str, url: &str) -> Result<String, ToolError> {
1012        let verdict = self
1013            .ipi_filter
1014            .filter_async(body.to_owned())
1015            .await
1016            .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
1017        if !verdict.patterns_found.is_empty() {
1018            tracing::warn!(
1019                url = url,
1020                score = verdict.score,
1021                patterns = ?verdict.patterns_found,
1022                "IPI patterns detected in fetched content"
1023            );
1024        }
1025        // verdict.sanitized == body only when score < threshold (no redaction)
1026        if verdict.sanitized == body {
1027            Ok(verdict.sanitized)
1028        } else {
1029            Ok(format!(
1030                "[IPI WARNING: score={:.2}, patterns={}] {}",
1031                verdict.score,
1032                verdict.patterns_found.join(", "),
1033                verdict.sanitized,
1034            ))
1035        }
1036    }
1037}
1038
1039fn extract_scrape_blocks(text: &str) -> Vec<&str> {
1040    crate::executor::extract_fenced_blocks(text, "scrape")
1041}
1042
1043/// Check host against the domain allowlist/denylist from `ScrapeConfig`.
1044///
1045/// Logic:
1046/// 1. If `denied_domains` matches the host → block.
1047/// 2. If `allowed_domains` is non-empty:
1048///    a. IP address hosts are always rejected (no pattern can match a bare IP).
1049///    b. Hosts not matching any entry → block.
1050/// 3. Otherwise → allow.
1051///
1052/// Wildcard prefix matching: `*.example.com` matches `sub.example.com` but NOT `example.com`.
1053/// Multiple wildcards are not supported; patterns with more than one `*` are treated as exact.
1054fn check_domain_policy(
1055    host: &str,
1056    allowed_domains: &[String],
1057    denied_domains: &[String],
1058) -> Result<(), ToolError> {
1059    if denied_domains.iter().any(|p| domain_matches(p, host)) {
1060        return Err(ToolError::Blocked {
1061            command: format!("domain blocked by denylist: {host}"),
1062        });
1063    }
1064    if !allowed_domains.is_empty() {
1065        // Bare IP addresses cannot match any domain pattern — reject when allowlist is active.
1066        let is_ip = host.parse::<std::net::IpAddr>().is_ok()
1067            || (host.starts_with('[') && host.ends_with(']'));
1068        if is_ip {
1069            return Err(ToolError::Blocked {
1070                command: format!(
1071                    "bare IP address not allowed when domain allowlist is active: {host}"
1072                ),
1073            });
1074        }
1075        if !allowed_domains.iter().any(|p| domain_matches(p, host)) {
1076            return Err(ToolError::Blocked {
1077                command: format!("domain not in allowlist: {host}"),
1078            });
1079        }
1080    }
1081    Ok(())
1082}
1083
1084// Domain pattern matching is delegated to the shared `domain_match` module.
1085use crate::domain_match::domain_matches;
1086
1087fn validate_url(raw: &str) -> Result<Url, ToolError> {
1088    let parsed = Url::parse(raw).map_err(|_| ToolError::Blocked {
1089        command: format!("invalid URL: {raw}"),
1090    })?;
1091
1092    if parsed.scheme() != "https" {
1093        return Err(ToolError::Blocked {
1094            command: format!("scheme not allowed: {}", parsed.scheme()),
1095        });
1096    }
1097
1098    if let Some(host) = parsed.host()
1099        && is_private_host(&host)
1100    {
1101        return Err(ToolError::Blocked {
1102            command: format!(
1103                "private/local host blocked: {}",
1104                parsed.host_str().unwrap_or("")
1105            ),
1106        });
1107    }
1108
1109    Ok(parsed)
1110}
1111
1112fn is_private_host(host: &url::Host<&str>) -> bool {
1113    match host {
1114        url::Host::Domain(d) => {
1115            // Exact match or subdomain of localhost (e.g. foo.localhost)
1116            // and .internal/.local TLDs used in cloud/k8s environments.
1117            #[allow(clippy::case_sensitive_file_extension_comparisons)]
1118            {
1119                *d == "localhost"
1120                    || d.ends_with(".localhost")
1121                    || d.ends_with(".internal")
1122                    || d.ends_with(".local")
1123            }
1124        }
1125        url::Host::Ipv4(v4) => is_private_ip(IpAddr::V4(*v4)),
1126        url::Host::Ipv6(v6) => is_private_ip(IpAddr::V6(*v6)),
1127    }
1128}
1129
1130/// Resolves DNS for the URL host, validates all resolved IPs against private ranges,
1131/// and returns the hostname and validated socket addresses.
1132///
1133/// Returning the addresses allows the caller to pin the HTTP client to these exact
1134/// addresses, eliminating TOCTOU between DNS validation and the actual connection.
1135#[tracing::instrument(name = "tools.scrape.dns.resolve", skip(url), fields(host = url.host_str().unwrap_or("")))]
1136async fn resolve_and_validate(url: &Url) -> Result<(String, Vec<SocketAddr>), ToolError> {
1137    let Some(host) = url.host_str() else {
1138        return Ok((String::new(), vec![]));
1139    };
1140    let port = url.port_or_known_default().unwrap_or(443);
1141    let addrs: Vec<SocketAddr> = tokio::time::timeout(
1142        Duration::from_secs(10),
1143        tokio::net::lookup_host(format!("{host}:{port}")),
1144    )
1145    .await
1146    .map_err(|_| ToolError::Timeout { timeout_secs: 10 })?
1147    .map_err(|e| ToolError::Blocked {
1148        command: format!("DNS resolution failed: {e}"),
1149    })?
1150    .collect();
1151    for addr in &addrs {
1152        if is_private_ip(addr.ip()) {
1153            return Err(ToolError::Blocked {
1154                command: format!("SSRF protection: private IP {} for host {host}", addr.ip()),
1155            });
1156        }
1157    }
1158    Ok((host.to_owned(), addrs))
1159}
1160
1161fn parse_and_extract(
1162    html: &str,
1163    selector: &str,
1164    extract: &ExtractMode,
1165    limit: usize,
1166) -> Result<String, ToolError> {
1167    let soup = scrape_core::Soup::parse(html);
1168
1169    let tags = soup.find_all(selector).map_err(|e| {
1170        ToolError::Execution(std::io::Error::new(
1171            std::io::ErrorKind::InvalidData,
1172            format!("invalid selector: {e}"),
1173        ))
1174    })?;
1175
1176    let mut results = Vec::new();
1177
1178    for tag in tags.into_iter().take(limit) {
1179        let value = match extract {
1180            ExtractMode::Text => tag.text(),
1181            ExtractMode::Html => tag.inner_html(),
1182            ExtractMode::Attr(name) => tag.get(name).unwrap_or_default().to_owned(),
1183        };
1184        if !value.trim().is_empty() {
1185            results.push(value.trim().to_owned());
1186        }
1187    }
1188
1189    if results.is_empty() {
1190        Ok(format!("No results for selector: {selector}"))
1191    } else {
1192        Ok(results.join("\n"))
1193    }
1194}
1195
1196#[cfg(test)]
1197mod tests {
1198    use super::*;
1199
1200    // --- extract_scrape_blocks ---
1201
1202    #[test]
1203    fn extract_single_block() {
1204        let text =
1205            "Here:\n```scrape\n{\"url\":\"https://example.com\",\"select\":\"h1\"}\n```\nDone.";
1206        let blocks = extract_scrape_blocks(text);
1207        assert_eq!(blocks.len(), 1);
1208        assert!(blocks[0].contains("example.com"));
1209    }
1210
1211    #[test]
1212    fn extract_multiple_blocks() {
1213        let text = "```scrape\n{\"url\":\"https://a.com\",\"select\":\"h1\"}\n```\ntext\n```scrape\n{\"url\":\"https://b.com\",\"select\":\"p\"}\n```";
1214        let blocks = extract_scrape_blocks(text);
1215        assert_eq!(blocks.len(), 2);
1216    }
1217
1218    #[test]
1219    fn no_blocks_returns_empty() {
1220        let blocks = extract_scrape_blocks("plain text, no code blocks");
1221        assert!(blocks.is_empty());
1222    }
1223
1224    #[test]
1225    fn unclosed_block_ignored() {
1226        let blocks = extract_scrape_blocks("```scrape\n{\"url\":\"https://x.com\"}");
1227        assert!(blocks.is_empty());
1228    }
1229
1230    #[test]
1231    fn non_scrape_block_ignored() {
1232        let text =
1233            "```bash\necho hi\n```\n```scrape\n{\"url\":\"https://x.com\",\"select\":\"h1\"}\n```";
1234        let blocks = extract_scrape_blocks(text);
1235        assert_eq!(blocks.len(), 1);
1236        assert!(blocks[0].contains("x.com"));
1237    }
1238
1239    #[test]
1240    fn multiline_json_block() {
1241        let text =
1242            "```scrape\n{\n  \"url\": \"https://example.com\",\n  \"select\": \"h1\"\n}\n```";
1243        let blocks = extract_scrape_blocks(text);
1244        assert_eq!(blocks.len(), 1);
1245        let instr: ScrapeInstruction = serde_json::from_str(blocks[0]).unwrap();
1246        assert_eq!(instr.url, "https://example.com");
1247    }
1248
1249    // --- ScrapeInstruction parsing ---
1250
1251    #[test]
1252    fn parse_valid_instruction() {
1253        let json = r#"{"url":"https://example.com","select":"h1","extract":"text","limit":5}"#;
1254        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
1255        assert_eq!(instr.url, "https://example.com");
1256        assert_eq!(instr.select, "h1");
1257        assert_eq!(instr.extract, "text");
1258        assert_eq!(instr.limit, Some(5));
1259    }
1260
1261    #[test]
1262    fn parse_minimal_instruction() {
1263        let json = r#"{"url":"https://example.com","select":"p"}"#;
1264        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
1265        assert_eq!(instr.extract, "text");
1266        assert!(instr.limit.is_none());
1267    }
1268
1269    #[test]
1270    fn parse_attr_extract() {
1271        let json = r#"{"url":"https://example.com","select":"a","extract":"attr:href"}"#;
1272        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
1273        assert_eq!(instr.extract, "attr:href");
1274    }
1275
1276    #[test]
1277    fn parse_invalid_json_errors() {
1278        let result = serde_json::from_str::<ScrapeInstruction>("not json");
1279        assert!(result.is_err());
1280    }
1281
1282    // --- ExtractMode ---
1283
1284    #[test]
1285    fn extract_mode_text() {
1286        assert!(matches!(ExtractMode::parse("text"), ExtractMode::Text));
1287    }
1288
1289    #[test]
1290    fn extract_mode_html() {
1291        assert!(matches!(ExtractMode::parse("html"), ExtractMode::Html));
1292    }
1293
1294    #[test]
1295    fn extract_mode_attr() {
1296        let mode = ExtractMode::parse("attr:href");
1297        assert!(matches!(mode, ExtractMode::Attr(ref s) if s == "href"));
1298    }
1299
1300    #[test]
1301    fn extract_mode_unknown_defaults_to_text() {
1302        assert!(matches!(ExtractMode::parse("unknown"), ExtractMode::Text));
1303    }
1304
1305    // --- validate_url ---
1306
1307    #[test]
1308    fn valid_https_url() {
1309        assert!(validate_url("https://example.com").is_ok());
1310    }
1311
1312    #[test]
1313    fn http_rejected() {
1314        let err = validate_url("http://example.com").unwrap_err();
1315        assert!(matches!(err, ToolError::Blocked { .. }));
1316    }
1317
1318    #[test]
1319    fn ftp_rejected() {
1320        let err = validate_url("ftp://files.example.com").unwrap_err();
1321        assert!(matches!(err, ToolError::Blocked { .. }));
1322    }
1323
1324    #[test]
1325    fn file_rejected() {
1326        let err = validate_url("file:///etc/passwd").unwrap_err();
1327        assert!(matches!(err, ToolError::Blocked { .. }));
1328    }
1329
1330    #[test]
1331    fn invalid_url_rejected() {
1332        let err = validate_url("not a url").unwrap_err();
1333        assert!(matches!(err, ToolError::Blocked { .. }));
1334    }
1335
1336    #[test]
1337    fn localhost_blocked() {
1338        let err = validate_url("https://localhost/path").unwrap_err();
1339        assert!(matches!(err, ToolError::Blocked { .. }));
1340    }
1341
1342    #[test]
1343    fn loopback_ip_blocked() {
1344        let err = validate_url("https://127.0.0.1/path").unwrap_err();
1345        assert!(matches!(err, ToolError::Blocked { .. }));
1346    }
1347
1348    #[test]
1349    fn private_10_blocked() {
1350        let err = validate_url("https://10.0.0.1/api").unwrap_err();
1351        assert!(matches!(err, ToolError::Blocked { .. }));
1352    }
1353
1354    #[test]
1355    fn private_172_blocked() {
1356        let err = validate_url("https://172.16.0.1/api").unwrap_err();
1357        assert!(matches!(err, ToolError::Blocked { .. }));
1358    }
1359
1360    #[test]
1361    fn private_192_blocked() {
1362        let err = validate_url("https://192.168.1.1/api").unwrap_err();
1363        assert!(matches!(err, ToolError::Blocked { .. }));
1364    }
1365
1366    #[test]
1367    fn ipv6_loopback_blocked() {
1368        let err = validate_url("https://[::1]/path").unwrap_err();
1369        assert!(matches!(err, ToolError::Blocked { .. }));
1370    }
1371
1372    #[test]
1373    fn public_ip_allowed() {
1374        assert!(validate_url("https://93.184.216.34/page").is_ok());
1375    }
1376
1377    // --- parse_and_extract ---
1378
1379    #[test]
1380    fn extract_text_from_html() {
1381        let html = "<html><body><h1>Hello World</h1><p>Content</p></body></html>";
1382        let result = parse_and_extract(html, "h1", &ExtractMode::Text, 10).unwrap();
1383        assert_eq!(result, "Hello World");
1384    }
1385
1386    #[test]
1387    fn extract_multiple_elements() {
1388        let html = "<ul><li>A</li><li>B</li><li>C</li></ul>";
1389        let result = parse_and_extract(html, "li", &ExtractMode::Text, 10).unwrap();
1390        assert_eq!(result, "A\nB\nC");
1391    }
1392
1393    #[test]
1394    fn extract_with_limit() {
1395        let html = "<ul><li>A</li><li>B</li><li>C</li></ul>";
1396        let result = parse_and_extract(html, "li", &ExtractMode::Text, 2).unwrap();
1397        assert_eq!(result, "A\nB");
1398    }
1399
1400    #[test]
1401    fn extract_attr_href() {
1402        let html = r#"<a href="https://example.com">Link</a>"#;
1403        let result =
1404            parse_and_extract(html, "a", &ExtractMode::Attr("href".to_owned()), 10).unwrap();
1405        assert_eq!(result, "https://example.com");
1406    }
1407
1408    #[test]
1409    fn extract_inner_html() {
1410        let html = "<div><span>inner</span></div>";
1411        let result = parse_and_extract(html, "div", &ExtractMode::Html, 10).unwrap();
1412        assert!(result.contains("<span>inner</span>"));
1413    }
1414
1415    #[test]
1416    fn no_matches_returns_message() {
1417        let html = "<html><body><p>text</p></body></html>";
1418        let result = parse_and_extract(html, "h1", &ExtractMode::Text, 10).unwrap();
1419        assert!(result.starts_with("No results for selector:"));
1420    }
1421
1422    #[test]
1423    fn empty_text_skipped() {
1424        let html = "<ul><li>  </li><li>A</li></ul>";
1425        let result = parse_and_extract(html, "li", &ExtractMode::Text, 10).unwrap();
1426        assert_eq!(result, "A");
1427    }
1428
1429    #[test]
1430    fn invalid_selector_errors() {
1431        let html = "<html><body></body></html>";
1432        let result = parse_and_extract(html, "[[[invalid", &ExtractMode::Text, 10);
1433        assert!(result.is_err());
1434    }
1435
1436    #[test]
1437    fn empty_html_returns_no_results() {
1438        let result = parse_and_extract("", "h1", &ExtractMode::Text, 10).unwrap();
1439        assert!(result.starts_with("No results for selector:"));
1440    }
1441
1442    #[test]
1443    fn nested_selector() {
1444        let html = "<div><span>inner</span></div><span>outer</span>";
1445        let result = parse_and_extract(html, "div > span", &ExtractMode::Text, 10).unwrap();
1446        assert_eq!(result, "inner");
1447    }
1448
1449    #[test]
1450    fn attr_missing_returns_empty() {
1451        let html = r"<a>No href</a>";
1452        let result =
1453            parse_and_extract(html, "a", &ExtractMode::Attr("href".to_owned()), 10).unwrap();
1454        assert!(result.starts_with("No results for selector:"));
1455    }
1456
1457    #[test]
1458    fn extract_html_mode() {
1459        let html = "<div><b>bold</b> text</div>";
1460        let result = parse_and_extract(html, "div", &ExtractMode::Html, 10).unwrap();
1461        assert!(result.contains("<b>bold</b>"));
1462    }
1463
1464    #[test]
1465    fn limit_zero_returns_no_results() {
1466        let html = "<ul><li>A</li><li>B</li></ul>";
1467        let result = parse_and_extract(html, "li", &ExtractMode::Text, 0).unwrap();
1468        assert!(result.starts_with("No results for selector:"));
1469    }
1470
1471    // --- validate_url edge cases ---
1472
1473    #[test]
1474    fn url_with_port_allowed() {
1475        assert!(validate_url("https://example.com:8443/path").is_ok());
1476    }
1477
1478    #[test]
1479    fn link_local_ip_blocked() {
1480        let err = validate_url("https://169.254.1.1/path").unwrap_err();
1481        assert!(matches!(err, ToolError::Blocked { .. }));
1482    }
1483
1484    #[test]
1485    fn url_no_scheme_rejected() {
1486        let err = validate_url("example.com/path").unwrap_err();
1487        assert!(matches!(err, ToolError::Blocked { .. }));
1488    }
1489
1490    #[test]
1491    fn unspecified_ipv4_blocked() {
1492        let err = validate_url("https://0.0.0.0/path").unwrap_err();
1493        assert!(matches!(err, ToolError::Blocked { .. }));
1494    }
1495
1496    #[test]
1497    fn broadcast_ipv4_blocked() {
1498        let err = validate_url("https://255.255.255.255/path").unwrap_err();
1499        assert!(matches!(err, ToolError::Blocked { .. }));
1500    }
1501
1502    #[test]
1503    fn ipv6_link_local_blocked() {
1504        let err = validate_url("https://[fe80::1]/path").unwrap_err();
1505        assert!(matches!(err, ToolError::Blocked { .. }));
1506    }
1507
1508    #[test]
1509    fn ipv6_unique_local_blocked() {
1510        let err = validate_url("https://[fd12::1]/path").unwrap_err();
1511        assert!(matches!(err, ToolError::Blocked { .. }));
1512    }
1513
1514    #[test]
1515    fn ipv4_mapped_ipv6_loopback_blocked() {
1516        let err = validate_url("https://[::ffff:127.0.0.1]/path").unwrap_err();
1517        assert!(matches!(err, ToolError::Blocked { .. }));
1518    }
1519
1520    #[test]
1521    fn ipv4_mapped_ipv6_private_blocked() {
1522        let err = validate_url("https://[::ffff:10.0.0.1]/path").unwrap_err();
1523        assert!(matches!(err, ToolError::Blocked { .. }));
1524    }
1525
1526    // --- WebScrapeExecutor (no-network) ---
1527
1528    #[tokio::test]
1529    async fn executor_no_blocks_returns_none() {
1530        let config = ScrapeConfig::default();
1531        let executor = WebScrapeExecutor::new(&config);
1532        let result = executor.execute("plain text").await;
1533        assert!(result.unwrap().is_none());
1534    }
1535
1536    #[tokio::test]
1537    async fn executor_invalid_json_errors() {
1538        let config = ScrapeConfig::default();
1539        let executor = WebScrapeExecutor::new(&config);
1540        let response = "```scrape\nnot json\n```";
1541        let result = executor.execute(response).await;
1542        assert!(matches!(result, Err(ToolError::Execution(_))));
1543    }
1544
1545    #[tokio::test]
1546    async fn executor_blocked_url_errors() {
1547        let config = ScrapeConfig::default();
1548        let executor = WebScrapeExecutor::new(&config);
1549        let response = "```scrape\n{\"url\":\"http://example.com\",\"select\":\"h1\"}\n```";
1550        let result = executor.execute(response).await;
1551        assert!(matches!(result, Err(ToolError::Blocked { .. })));
1552    }
1553
1554    #[tokio::test]
1555    async fn executor_private_ip_blocked() {
1556        let config = ScrapeConfig::default();
1557        let executor = WebScrapeExecutor::new(&config);
1558        let response = "```scrape\n{\"url\":\"https://192.168.1.1/api\",\"select\":\"h1\"}\n```";
1559        let result = executor.execute(response).await;
1560        assert!(matches!(result, Err(ToolError::Blocked { .. })));
1561    }
1562
1563    #[tokio::test]
1564    async fn executor_unreachable_host_returns_error() {
1565        let config = ScrapeConfig {
1566            timeout: 1,
1567            max_body_bytes: 1_048_576,
1568            ..Default::default()
1569        };
1570        let executor = WebScrapeExecutor::new(&config);
1571        let response = "```scrape\n{\"url\":\"https://192.0.2.1:1/page\",\"select\":\"h1\"}\n```";
1572        let result = executor.execute(response).await;
1573        assert!(matches!(result, Err(ToolError::Execution(_))));
1574    }
1575
1576    #[tokio::test]
1577    async fn executor_localhost_url_blocked() {
1578        let config = ScrapeConfig::default();
1579        let executor = WebScrapeExecutor::new(&config);
1580        let response = "```scrape\n{\"url\":\"https://localhost:9999/api\",\"select\":\"h1\"}\n```";
1581        let result = executor.execute(response).await;
1582        assert!(matches!(result, Err(ToolError::Blocked { .. })));
1583    }
1584
1585    #[tokio::test]
1586    async fn executor_empty_text_returns_none() {
1587        let config = ScrapeConfig::default();
1588        let executor = WebScrapeExecutor::new(&config);
1589        let result = executor.execute("").await;
1590        assert!(result.unwrap().is_none());
1591    }
1592
1593    #[tokio::test]
1594    async fn executor_multiple_blocks_first_blocked() {
1595        let config = ScrapeConfig::default();
1596        let executor = WebScrapeExecutor::new(&config);
1597        let response = "```scrape\n{\"url\":\"http://evil.com\",\"select\":\"h1\"}\n```\n\
1598             ```scrape\n{\"url\":\"https://ok.com\",\"select\":\"h1\"}\n```";
1599        let result = executor.execute(response).await;
1600        assert!(result.is_err());
1601    }
1602
1603    #[test]
1604    fn validate_url_empty_string() {
1605        let err = validate_url("").unwrap_err();
1606        assert!(matches!(err, ToolError::Blocked { .. }));
1607    }
1608
1609    #[test]
1610    fn validate_url_javascript_scheme_blocked() {
1611        let err = validate_url("javascript:alert(1)").unwrap_err();
1612        assert!(matches!(err, ToolError::Blocked { .. }));
1613    }
1614
1615    #[test]
1616    fn validate_url_data_scheme_blocked() {
1617        let err = validate_url("data:text/html,<h1>hi</h1>").unwrap_err();
1618        assert!(matches!(err, ToolError::Blocked { .. }));
1619    }
1620
1621    #[test]
1622    fn is_private_host_public_domain_is_false() {
1623        let host: url::Host<&str> = url::Host::Domain("example.com");
1624        assert!(!is_private_host(&host));
1625    }
1626
1627    #[test]
1628    fn is_private_host_localhost_is_true() {
1629        let host: url::Host<&str> = url::Host::Domain("localhost");
1630        assert!(is_private_host(&host));
1631    }
1632
1633    #[test]
1634    fn is_private_host_ipv6_unspecified_is_true() {
1635        let host = url::Host::Ipv6(std::net::Ipv6Addr::UNSPECIFIED);
1636        assert!(is_private_host(&host));
1637    }
1638
1639    #[test]
1640    fn is_private_host_public_ipv6_is_false() {
1641        let host = url::Host::Ipv6("2001:db8::1".parse().unwrap());
1642        assert!(!is_private_host(&host));
1643    }
1644
1645    // --- fetch_html redirect logic: wiremock HTTP server tests ---
1646    //
1647    // These tests use a local wiremock server to exercise the redirect-following logic
1648    // in `fetch_html` without requiring an external HTTPS connection. The server binds to
1649    // 127.0.0.1, and tests call `fetch_html` directly (bypassing `validate_url`) to avoid
1650    // the SSRF guard that would otherwise block loopback connections.
1651
1652    /// Helper: returns executor + (`server_url`, `server_addr`) from a running wiremock mock server.
1653    /// The server address is passed to `fetch_html` via `resolve_to_addrs` so the client
1654    /// connects to the mock instead of doing a real DNS lookup.
1655    async fn mock_server_executor() -> (WebScrapeExecutor, wiremock::MockServer) {
1656        let server = wiremock::MockServer::start().await;
1657        let executor = WebScrapeExecutor {
1658            timeout: Duration::from_secs(5),
1659            max_body_bytes: 1_048_576,
1660            allowed_domains: vec![],
1661            denied_domains: vec![],
1662            audit_logger: None,
1663            egress_config: EgressConfig::default(),
1664            egress_tx: None,
1665            egress_dropped: Arc::new(AtomicU64::new(0)),
1666            ipi_filter: IpiFilter::new(0.6),
1667        };
1668        (executor, server)
1669    }
1670
1671    /// Parses the mock server's URI into (`host_str`, `socket_addr`) for use with `build_client`.
1672    fn server_host_and_addr(server: &wiremock::MockServer) -> (String, Vec<std::net::SocketAddr>) {
1673        let uri = server.uri();
1674        let url = Url::parse(&uri).unwrap();
1675        let host = url.host_str().unwrap_or("127.0.0.1").to_owned();
1676        let port = url.port().unwrap_or(80);
1677        let addr: std::net::SocketAddr = format!("{host}:{port}").parse().unwrap();
1678        (host, vec![addr])
1679    }
1680
1681    /// Test-only redirect follower that mimics `fetch_html`'s loop but skips `validate_url` /
1682    /// `resolve_and_validate`. This lets us exercise the redirect-counting and
1683    /// missing-Location logic against a plain HTTP wiremock server.
1684    async fn follow_redirects_raw(
1685        executor: &WebScrapeExecutor,
1686        start_url: &str,
1687        host: &str,
1688        addrs: &[std::net::SocketAddr],
1689    ) -> Result<String, ToolError> {
1690        const MAX_REDIRECTS: usize = 3;
1691        let mut current_url = start_url.to_owned();
1692        let mut current_host = host.to_owned();
1693        let mut current_addrs = addrs.to_vec();
1694
1695        for hop in 0..=MAX_REDIRECTS {
1696            let client = executor.build_client(&current_host, &current_addrs);
1697            let resp = client
1698                .get(&current_url)
1699                .send()
1700                .await
1701                .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
1702
1703            let status = resp.status();
1704
1705            if status.is_redirection() {
1706                if hop == MAX_REDIRECTS {
1707                    return Err(ToolError::Execution(std::io::Error::other(
1708                        "too many redirects",
1709                    )));
1710                }
1711
1712                let location = resp
1713                    .headers()
1714                    .get(reqwest::header::LOCATION)
1715                    .and_then(|v| v.to_str().ok())
1716                    .ok_or_else(|| {
1717                        ToolError::Execution(std::io::Error::other("redirect with no Location"))
1718                    })?;
1719
1720                let base = Url::parse(&current_url)
1721                    .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
1722                let next_url = base
1723                    .join(location)
1724                    .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
1725
1726                // Re-use same host/addrs (mock server is always the same endpoint).
1727                current_url = next_url.to_string();
1728                // Preserve host/addrs as-is since the mock server doesn't change.
1729                let _ = &mut current_host;
1730                let _ = &mut current_addrs;
1731                continue;
1732            }
1733
1734            if !status.is_success() {
1735                return Err(ToolError::Execution(std::io::Error::other(format!(
1736                    "HTTP {status}",
1737                ))));
1738            }
1739
1740            let bytes = resp
1741                .bytes()
1742                .await
1743                .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())))?;
1744
1745            if bytes.len() > executor.max_body_bytes {
1746                return Err(ToolError::Execution(std::io::Error::other(format!(
1747                    "response too large: {} bytes (max: {})",
1748                    bytes.len(),
1749                    executor.max_body_bytes,
1750                ))));
1751            }
1752
1753            return String::from_utf8(bytes.to_vec())
1754                .map_err(|e| ToolError::Execution(std::io::Error::other(e.to_string())));
1755        }
1756
1757        Err(ToolError::Execution(std::io::Error::other(
1758            "too many redirects",
1759        )))
1760    }
1761
1762    #[tokio::test]
1763    async fn fetch_html_success_returns_body() {
1764        use wiremock::matchers::{method, path};
1765        use wiremock::{Mock, ResponseTemplate};
1766
1767        let (executor, server) = mock_server_executor().await;
1768        Mock::given(method("GET"))
1769            .and(path("/page"))
1770            .respond_with(ResponseTemplate::new(200).set_body_string("<h1>OK</h1>"))
1771            .mount(&server)
1772            .await;
1773
1774        let (host, addrs) = server_host_and_addr(&server);
1775        let url = format!("{}/page", server.uri());
1776        let result = executor
1777            .fetch_html(&url, &host, &addrs, "fetch", "test-cid", None, None)
1778            .await;
1779        assert!(result.is_ok(), "expected Ok, got: {result:?}");
1780        assert_eq!(result.unwrap(), "<h1>OK</h1>");
1781    }
1782
1783    #[tokio::test]
1784    async fn fetch_html_non_2xx_returns_error() {
1785        use wiremock::matchers::{method, path};
1786        use wiremock::{Mock, ResponseTemplate};
1787
1788        let (executor, server) = mock_server_executor().await;
1789        Mock::given(method("GET"))
1790            .and(path("/forbidden"))
1791            .respond_with(ResponseTemplate::new(403))
1792            .mount(&server)
1793            .await;
1794
1795        let (host, addrs) = server_host_and_addr(&server);
1796        let url = format!("{}/forbidden", server.uri());
1797        let result = executor
1798            .fetch_html(&url, &host, &addrs, "fetch", "test-cid", None, None)
1799            .await;
1800        assert!(result.is_err());
1801        let msg = result.unwrap_err().to_string();
1802        assert!(msg.contains("403"), "expected 403 in error: {msg}");
1803    }
1804
1805    #[tokio::test]
1806    async fn fetch_html_404_returns_error() {
1807        use wiremock::matchers::{method, path};
1808        use wiremock::{Mock, ResponseTemplate};
1809
1810        let (executor, server) = mock_server_executor().await;
1811        Mock::given(method("GET"))
1812            .and(path("/missing"))
1813            .respond_with(ResponseTemplate::new(404))
1814            .mount(&server)
1815            .await;
1816
1817        let (host, addrs) = server_host_and_addr(&server);
1818        let url = format!("{}/missing", server.uri());
1819        let result = executor
1820            .fetch_html(&url, &host, &addrs, "fetch", "test-cid", None, None)
1821            .await;
1822        assert!(result.is_err());
1823        let msg = result.unwrap_err().to_string();
1824        assert!(msg.contains("404"), "expected 404 in error: {msg}");
1825    }
1826
1827    #[tokio::test]
1828    async fn fetch_html_redirect_no_location_returns_error() {
1829        use wiremock::matchers::{method, path};
1830        use wiremock::{Mock, ResponseTemplate};
1831
1832        let (executor, server) = mock_server_executor().await;
1833        // 302 with no Location header
1834        Mock::given(method("GET"))
1835            .and(path("/redirect-no-loc"))
1836            .respond_with(ResponseTemplate::new(302))
1837            .mount(&server)
1838            .await;
1839
1840        let (host, addrs) = server_host_and_addr(&server);
1841        let url = format!("{}/redirect-no-loc", server.uri());
1842        let result = executor
1843            .fetch_html(&url, &host, &addrs, "fetch", "test-cid", None, None)
1844            .await;
1845        assert!(result.is_err());
1846        let msg = result.unwrap_err().to_string();
1847        assert!(
1848            msg.contains("Location") || msg.contains("location"),
1849            "expected Location-related error: {msg}"
1850        );
1851    }
1852
1853    #[tokio::test]
1854    async fn fetch_html_single_redirect_followed() {
1855        use wiremock::matchers::{method, path};
1856        use wiremock::{Mock, ResponseTemplate};
1857
1858        let (executor, server) = mock_server_executor().await;
1859        let final_url = format!("{}/final", server.uri());
1860
1861        Mock::given(method("GET"))
1862            .and(path("/start"))
1863            .respond_with(ResponseTemplate::new(302).insert_header("location", final_url.as_str()))
1864            .mount(&server)
1865            .await;
1866
1867        Mock::given(method("GET"))
1868            .and(path("/final"))
1869            .respond_with(ResponseTemplate::new(200).set_body_string("<p>final</p>"))
1870            .mount(&server)
1871            .await;
1872
1873        let (host, addrs) = server_host_and_addr(&server);
1874        let url = format!("{}/start", server.uri());
1875        let result = follow_redirects_raw(&executor, &url, &host, &addrs).await;
1876        assert!(result.is_ok(), "single redirect should succeed: {result:?}");
1877        assert_eq!(result.unwrap(), "<p>final</p>");
1878    }
1879
1880    #[tokio::test]
1881    async fn fetch_html_three_redirects_allowed() {
1882        use wiremock::matchers::{method, path};
1883        use wiremock::{Mock, ResponseTemplate};
1884
1885        let (executor, server) = mock_server_executor().await;
1886        let hop2 = format!("{}/hop2", server.uri());
1887        let hop3 = format!("{}/hop3", server.uri());
1888        let final_dest = format!("{}/done", server.uri());
1889
1890        Mock::given(method("GET"))
1891            .and(path("/hop1"))
1892            .respond_with(ResponseTemplate::new(301).insert_header("location", hop2.as_str()))
1893            .mount(&server)
1894            .await;
1895        Mock::given(method("GET"))
1896            .and(path("/hop2"))
1897            .respond_with(ResponseTemplate::new(301).insert_header("location", hop3.as_str()))
1898            .mount(&server)
1899            .await;
1900        Mock::given(method("GET"))
1901            .and(path("/hop3"))
1902            .respond_with(ResponseTemplate::new(301).insert_header("location", final_dest.as_str()))
1903            .mount(&server)
1904            .await;
1905        Mock::given(method("GET"))
1906            .and(path("/done"))
1907            .respond_with(ResponseTemplate::new(200).set_body_string("<p>done</p>"))
1908            .mount(&server)
1909            .await;
1910
1911        let (host, addrs) = server_host_and_addr(&server);
1912        let url = format!("{}/hop1", server.uri());
1913        let result = follow_redirects_raw(&executor, &url, &host, &addrs).await;
1914        assert!(result.is_ok(), "3 redirects should succeed: {result:?}");
1915        assert_eq!(result.unwrap(), "<p>done</p>");
1916    }
1917
1918    #[tokio::test]
1919    async fn fetch_html_four_redirects_rejected() {
1920        use wiremock::matchers::{method, path};
1921        use wiremock::{Mock, ResponseTemplate};
1922
1923        let (executor, server) = mock_server_executor().await;
1924        let hop2 = format!("{}/r2", server.uri());
1925        let hop3 = format!("{}/r3", server.uri());
1926        let hop4 = format!("{}/r4", server.uri());
1927        let hop5 = format!("{}/r5", server.uri());
1928
1929        for (from, to) in [
1930            ("/r1", &hop2),
1931            ("/r2", &hop3),
1932            ("/r3", &hop4),
1933            ("/r4", &hop5),
1934        ] {
1935            Mock::given(method("GET"))
1936                .and(path(from))
1937                .respond_with(ResponseTemplate::new(301).insert_header("location", to.as_str()))
1938                .mount(&server)
1939                .await;
1940        }
1941
1942        let (host, addrs) = server_host_and_addr(&server);
1943        let url = format!("{}/r1", server.uri());
1944        let result = follow_redirects_raw(&executor, &url, &host, &addrs).await;
1945        assert!(result.is_err(), "4 redirects should be rejected");
1946        let msg = result.unwrap_err().to_string();
1947        assert!(
1948            msg.contains("redirect"),
1949            "expected redirect-related error: {msg}"
1950        );
1951    }
1952
1953    #[tokio::test]
1954    async fn fetch_html_body_too_large_returns_error() {
1955        use wiremock::matchers::{method, path};
1956        use wiremock::{Mock, ResponseTemplate};
1957
1958        let small_limit_executor = WebScrapeExecutor {
1959            timeout: Duration::from_secs(5),
1960            max_body_bytes: 10,
1961            allowed_domains: vec![],
1962            denied_domains: vec![],
1963            audit_logger: None,
1964            egress_config: EgressConfig::default(),
1965            egress_tx: None,
1966            egress_dropped: Arc::new(AtomicU64::new(0)),
1967            ipi_filter: IpiFilter::new(0.6),
1968        };
1969        let server = wiremock::MockServer::start().await;
1970        Mock::given(method("GET"))
1971            .and(path("/big"))
1972            .respond_with(
1973                ResponseTemplate::new(200)
1974                    .set_body_string("this body is definitely longer than ten bytes"),
1975            )
1976            .mount(&server)
1977            .await;
1978
1979        let (host, addrs) = server_host_and_addr(&server);
1980        let url = format!("{}/big", server.uri());
1981        let result = small_limit_executor
1982            .fetch_html(&url, &host, &addrs, "fetch", "test-cid", None, None)
1983            .await;
1984        assert!(result.is_err());
1985        let msg = result.unwrap_err().to_string();
1986        assert!(msg.contains("too large"), "expected too-large error: {msg}");
1987    }
1988
1989    #[test]
1990    fn extract_scrape_blocks_empty_block_content() {
1991        let text = "```scrape\n\n```";
1992        let blocks = extract_scrape_blocks(text);
1993        assert_eq!(blocks.len(), 1);
1994        assert!(blocks[0].is_empty());
1995    }
1996
1997    #[test]
1998    fn extract_scrape_blocks_whitespace_only() {
1999        let text = "```scrape\n   \n```";
2000        let blocks = extract_scrape_blocks(text);
2001        assert_eq!(blocks.len(), 1);
2002    }
2003
2004    #[test]
2005    fn parse_and_extract_multiple_selectors() {
2006        let html = "<div><h1>Title</h1><p>Para</p></div>";
2007        let result = parse_and_extract(html, "h1, p", &ExtractMode::Text, 10).unwrap();
2008        assert!(result.contains("Title"));
2009        assert!(result.contains("Para"));
2010    }
2011
2012    #[test]
2013    fn webscrape_executor_new_with_custom_config() {
2014        let config = ScrapeConfig {
2015            timeout: 60,
2016            max_body_bytes: 512,
2017            ..Default::default()
2018        };
2019        let executor = WebScrapeExecutor::new(&config);
2020        assert_eq!(executor.max_body_bytes, 512);
2021    }
2022
2023    #[test]
2024    fn webscrape_executor_debug() {
2025        let config = ScrapeConfig::default();
2026        let executor = WebScrapeExecutor::new(&config);
2027        let dbg = format!("{executor:?}");
2028        assert!(dbg.contains("WebScrapeExecutor"));
2029    }
2030
2031    #[test]
2032    fn extract_mode_attr_empty_name() {
2033        let mode = ExtractMode::parse("attr:");
2034        assert!(matches!(mode, ExtractMode::Attr(ref s) if s.is_empty()));
2035    }
2036
2037    #[test]
2038    fn default_extract_returns_text() {
2039        assert_eq!(default_extract(), "text");
2040    }
2041
2042    #[test]
2043    fn scrape_instruction_debug() {
2044        let json = r#"{"url":"https://example.com","select":"h1"}"#;
2045        let instr: ScrapeInstruction = serde_json::from_str(json).unwrap();
2046        let dbg = format!("{instr:?}");
2047        assert!(dbg.contains("ScrapeInstruction"));
2048    }
2049
2050    #[test]
2051    fn extract_mode_debug() {
2052        let mode = ExtractMode::Text;
2053        let dbg = format!("{mode:?}");
2054        assert!(dbg.contains("Text"));
2055    }
2056
2057    // --- fetch_html redirect logic: constant and validation unit tests ---
2058
2059    /// `MAX_REDIRECTS` is 3; the 4th redirect attempt must be rejected.
2060    /// Verify the boundary is correct by inspecting the constant value.
2061    #[test]
2062    fn max_redirects_constant_is_three() {
2063        // fetch_html uses `for hop in 0..=MAX_REDIRECTS` and returns error when hop == MAX_REDIRECTS
2064        // while still in a redirect. That means hops 0,1,2 can redirect; hop 3 triggers the error.
2065        // This test documents the expected limit.
2066        const MAX_REDIRECTS: usize = 3;
2067        assert_eq!(MAX_REDIRECTS, 3, "fetch_html allows exactly 3 redirects");
2068    }
2069
2070    /// Verifies that a Location-less redirect would produce an error string containing the
2071    /// expected message, matching the error path in `fetch_html`.
2072    #[test]
2073    fn redirect_no_location_error_message() {
2074        let err = std::io::Error::other("redirect with no Location");
2075        assert!(err.to_string().contains("redirect with no Location"));
2076    }
2077
2078    /// Verifies that a too-many-redirects condition produces the expected error string.
2079    #[test]
2080    fn too_many_redirects_error_message() {
2081        let err = std::io::Error::other("too many redirects");
2082        assert!(err.to_string().contains("too many redirects"));
2083    }
2084
2085    /// Verifies that a non-2xx HTTP status produces an error message with the status code.
2086    #[test]
2087    fn non_2xx_status_error_format() {
2088        let status = reqwest::StatusCode::FORBIDDEN;
2089        let msg = format!("HTTP {status}");
2090        assert!(msg.contains("403"));
2091    }
2092
2093    /// Verifies that a 404 response status code formats into the expected error message.
2094    #[test]
2095    fn not_found_status_error_format() {
2096        let status = reqwest::StatusCode::NOT_FOUND;
2097        let msg = format!("HTTP {status}");
2098        assert!(msg.contains("404"));
2099    }
2100
2101    /// Verifies relative redirect resolution for same-host paths (simulates Location: /other).
2102    #[test]
2103    fn relative_redirect_same_host_path() {
2104        let base = Url::parse("https://example.com/current").unwrap();
2105        let resolved = base.join("/other").unwrap();
2106        assert_eq!(resolved.as_str(), "https://example.com/other");
2107    }
2108
2109    /// Verifies relative redirect resolution preserves scheme and host.
2110    #[test]
2111    fn relative_redirect_relative_path() {
2112        let base = Url::parse("https://example.com/a/b").unwrap();
2113        let resolved = base.join("c").unwrap();
2114        assert_eq!(resolved.as_str(), "https://example.com/a/c");
2115    }
2116
2117    /// Verifies that an absolute redirect URL overrides base URL completely.
2118    #[test]
2119    fn absolute_redirect_overrides_base() {
2120        let base = Url::parse("https://example.com/page").unwrap();
2121        let resolved = base.join("https://other.com/target").unwrap();
2122        assert_eq!(resolved.as_str(), "https://other.com/target");
2123    }
2124
2125    /// Verifies that a redirect Location of http:// (downgrade) is rejected.
2126    #[test]
2127    fn redirect_http_downgrade_rejected() {
2128        let location = "http://example.com/page";
2129        let base = Url::parse("https://example.com/start").unwrap();
2130        let next = base.join(location).unwrap();
2131        let err = validate_url(next.as_str()).unwrap_err();
2132        assert!(matches!(err, ToolError::Blocked { .. }));
2133    }
2134
2135    /// Verifies that a redirect to a private IP literal is blocked.
2136    #[test]
2137    fn redirect_location_private_ip_blocked() {
2138        let location = "https://192.168.100.1/admin";
2139        let base = Url::parse("https://example.com/start").unwrap();
2140        let next = base.join(location).unwrap();
2141        let err = validate_url(next.as_str()).unwrap_err();
2142        assert!(matches!(err, ToolError::Blocked { .. }));
2143        let ToolError::Blocked { command: cmd } = err else {
2144            panic!("expected Blocked");
2145        };
2146        assert!(
2147            cmd.contains("private") || cmd.contains("scheme"),
2148            "error message should describe the block reason: {cmd}"
2149        );
2150    }
2151
2152    /// Verifies that a redirect to a .internal domain is blocked.
2153    #[test]
2154    fn redirect_location_internal_domain_blocked() {
2155        let location = "https://metadata.internal/latest/meta-data/";
2156        let base = Url::parse("https://example.com/start").unwrap();
2157        let next = base.join(location).unwrap();
2158        let err = validate_url(next.as_str()).unwrap_err();
2159        assert!(matches!(err, ToolError::Blocked { .. }));
2160    }
2161
2162    /// Verifies that a chain of 3 valid public redirects passes `validate_url` at every hop.
2163    #[test]
2164    fn redirect_chain_three_hops_all_public() {
2165        let hops = [
2166            "https://redirect1.example.com/hop1",
2167            "https://redirect2.example.com/hop2",
2168            "https://destination.example.com/final",
2169        ];
2170        for hop in hops {
2171            assert!(validate_url(hop).is_ok(), "expected ok for {hop}");
2172        }
2173    }
2174
2175    // --- SSRF redirect chain defense ---
2176
2177    /// Verifies that a redirect Location pointing to a private IP is rejected by `validate_url`
2178    /// before any connection attempt — simulating the validation step inside `fetch_html`.
2179    #[test]
2180    fn redirect_to_private_ip_rejected_by_validate_url() {
2181        // These would appear as Location headers in a redirect response.
2182        let private_targets = [
2183            "https://127.0.0.1/secret",
2184            "https://10.0.0.1/internal",
2185            "https://192.168.1.1/admin",
2186            "https://172.16.0.1/data",
2187            "https://[::1]/path",
2188            "https://[fe80::1]/path",
2189            "https://localhost/path",
2190            "https://service.internal/api",
2191        ];
2192        for target in private_targets {
2193            let result = validate_url(target);
2194            assert!(result.is_err(), "expected error for {target}");
2195            assert!(
2196                matches!(result.unwrap_err(), ToolError::Blocked { .. }),
2197                "expected Blocked for {target}"
2198            );
2199        }
2200    }
2201
2202    /// Verifies that relative redirect URLs are resolved correctly before validation.
2203    #[test]
2204    fn redirect_relative_url_resolves_correctly() {
2205        let base = Url::parse("https://example.com/page").unwrap();
2206        let relative = "/other";
2207        let resolved = base.join(relative).unwrap();
2208        assert_eq!(resolved.as_str(), "https://example.com/other");
2209    }
2210
2211    /// Verifies that a protocol-relative redirect to http:// is rejected (scheme check).
2212    #[test]
2213    fn redirect_to_http_rejected() {
2214        let err = validate_url("http://example.com/page").unwrap_err();
2215        assert!(matches!(err, ToolError::Blocked { .. }));
2216    }
2217
2218    #[test]
2219    fn ipv4_mapped_ipv6_link_local_blocked() {
2220        let err = validate_url("https://[::ffff:169.254.0.1]/path").unwrap_err();
2221        assert!(matches!(err, ToolError::Blocked { .. }));
2222    }
2223
2224    #[test]
2225    fn ipv4_mapped_ipv6_public_allowed() {
2226        assert!(validate_url("https://[::ffff:93.184.216.34]/path").is_ok());
2227    }
2228
2229    // --- fetch tool ---
2230
2231    #[tokio::test]
2232    async fn fetch_http_scheme_blocked() {
2233        let config = ScrapeConfig::default();
2234        let executor = WebScrapeExecutor::new(&config);
2235        let call = crate::executor::ToolCall {
2236            tool_id: ToolName::new("fetch"),
2237            params: {
2238                let mut m = serde_json::Map::new();
2239                m.insert("url".to_owned(), serde_json::json!("http://example.com"));
2240                m
2241            },
2242            caller_id: None,
2243            context: None,
2244
2245            tool_call_id: String::new(),
2246            skill_name: None,
2247        };
2248        let result = executor.execute_tool_call(&call).await;
2249        assert!(matches!(result, Err(ToolError::Blocked { .. })));
2250    }
2251
2252    #[tokio::test]
2253    async fn fetch_private_ip_blocked() {
2254        let config = ScrapeConfig::default();
2255        let executor = WebScrapeExecutor::new(&config);
2256        let call = crate::executor::ToolCall {
2257            tool_id: ToolName::new("fetch"),
2258            params: {
2259                let mut m = serde_json::Map::new();
2260                m.insert(
2261                    "url".to_owned(),
2262                    serde_json::json!("https://192.168.1.1/secret"),
2263                );
2264                m
2265            },
2266            caller_id: None,
2267            context: None,
2268
2269            tool_call_id: String::new(),
2270            skill_name: None,
2271        };
2272        let result = executor.execute_tool_call(&call).await;
2273        assert!(matches!(result, Err(ToolError::Blocked { .. })));
2274    }
2275
2276    #[tokio::test]
2277    async fn fetch_localhost_blocked() {
2278        let config = ScrapeConfig::default();
2279        let executor = WebScrapeExecutor::new(&config);
2280        let call = crate::executor::ToolCall {
2281            tool_id: ToolName::new("fetch"),
2282            params: {
2283                let mut m = serde_json::Map::new();
2284                m.insert(
2285                    "url".to_owned(),
2286                    serde_json::json!("https://localhost/page"),
2287                );
2288                m
2289            },
2290            caller_id: None,
2291            context: None,
2292
2293            tool_call_id: String::new(),
2294            skill_name: None,
2295        };
2296        let result = executor.execute_tool_call(&call).await;
2297        assert!(matches!(result, Err(ToolError::Blocked { .. })));
2298    }
2299
2300    #[tokio::test]
2301    async fn fetch_unknown_tool_returns_none() {
2302        let config = ScrapeConfig::default();
2303        let executor = WebScrapeExecutor::new(&config);
2304        let call = crate::executor::ToolCall {
2305            tool_id: ToolName::new("unknown_tool"),
2306            params: serde_json::Map::new(),
2307            caller_id: None,
2308            context: None,
2309
2310            tool_call_id: String::new(),
2311            skill_name: None,
2312        };
2313        let result = executor.execute_tool_call(&call).await;
2314        assert!(result.unwrap().is_none());
2315    }
2316
2317    #[tokio::test]
2318    async fn fetch_returns_body_via_mock() {
2319        use wiremock::matchers::{method, path};
2320        use wiremock::{Mock, ResponseTemplate};
2321
2322        let (executor, server) = mock_server_executor().await;
2323        Mock::given(method("GET"))
2324            .and(path("/content"))
2325            .respond_with(ResponseTemplate::new(200).set_body_string("plain text content"))
2326            .mount(&server)
2327            .await;
2328
2329        let (host, addrs) = server_host_and_addr(&server);
2330        let url = format!("{}/content", server.uri());
2331        let result = executor
2332            .fetch_html(&url, &host, &addrs, "fetch", "test-cid", None, None)
2333            .await;
2334        assert!(result.is_ok());
2335        assert_eq!(result.unwrap(), "plain text content");
2336    }
2337
2338    #[test]
2339    fn tool_definitions_returns_web_scrape_and_fetch() {
2340        let config = ScrapeConfig::default();
2341        let executor = WebScrapeExecutor::new(&config);
2342        let defs = executor.tool_definitions();
2343        assert_eq!(defs.len(), 2);
2344        assert_eq!(defs[0].id, "web_scrape");
2345        assert_eq!(
2346            defs[0].invocation,
2347            crate::registry::InvocationHint::FencedBlock("scrape")
2348        );
2349        assert_eq!(defs[1].id, "fetch");
2350        assert_eq!(
2351            defs[1].invocation,
2352            crate::registry::InvocationHint::ToolCall
2353        );
2354    }
2355
2356    #[test]
2357    fn tool_definitions_schema_has_all_params() {
2358        let config = ScrapeConfig::default();
2359        let executor = WebScrapeExecutor::new(&config);
2360        let defs = executor.tool_definitions();
2361        let obj = defs[0].schema.as_object().unwrap();
2362        let props = obj["properties"].as_object().unwrap();
2363        assert!(props.contains_key("url"));
2364        assert!(props.contains_key("select"));
2365        assert!(props.contains_key("extract"));
2366        assert!(props.contains_key("limit"));
2367        let req = obj["required"].as_array().unwrap();
2368        assert!(req.iter().any(|v| v.as_str() == Some("url")));
2369        assert!(req.iter().any(|v| v.as_str() == Some("select")));
2370        assert!(!req.iter().any(|v| v.as_str() == Some("extract")));
2371    }
2372
2373    // --- is_private_host: new domain checks (AUD-02) ---
2374
2375    #[test]
2376    fn subdomain_localhost_blocked() {
2377        let host: url::Host<&str> = url::Host::Domain("foo.localhost");
2378        assert!(is_private_host(&host));
2379    }
2380
2381    #[test]
2382    fn internal_tld_blocked() {
2383        let host: url::Host<&str> = url::Host::Domain("service.internal");
2384        assert!(is_private_host(&host));
2385    }
2386
2387    #[test]
2388    fn local_tld_blocked() {
2389        let host: url::Host<&str> = url::Host::Domain("printer.local");
2390        assert!(is_private_host(&host));
2391    }
2392
2393    #[test]
2394    fn public_domain_not_blocked() {
2395        let host: url::Host<&str> = url::Host::Domain("example.com");
2396        assert!(!is_private_host(&host));
2397    }
2398
2399    // --- resolve_and_validate: private IP rejection ---
2400
2401    #[tokio::test]
2402    async fn resolve_loopback_rejected() {
2403        // 127.0.0.1 resolves directly (literal IP in DNS query)
2404        let url = url::Url::parse("https://127.0.0.1/path").unwrap();
2405        // validate_url catches this before resolve_and_validate, but test directly
2406        let result = resolve_and_validate(&url).await;
2407        assert!(
2408            result.is_err(),
2409            "loopback IP must be rejected by resolve_and_validate"
2410        );
2411        let err = result.unwrap_err();
2412        assert!(matches!(err, crate::executor::ToolError::Blocked { .. }));
2413    }
2414
2415    #[tokio::test]
2416    async fn resolve_private_10_rejected() {
2417        let url = url::Url::parse("https://10.0.0.1/path").unwrap();
2418        let result = resolve_and_validate(&url).await;
2419        assert!(result.is_err());
2420        assert!(matches!(
2421            result.unwrap_err(),
2422            crate::executor::ToolError::Blocked { .. }
2423        ));
2424    }
2425
2426    #[tokio::test]
2427    async fn resolve_private_192_rejected() {
2428        let url = url::Url::parse("https://192.168.1.1/path").unwrap();
2429        let result = resolve_and_validate(&url).await;
2430        assert!(result.is_err());
2431        assert!(matches!(
2432            result.unwrap_err(),
2433            crate::executor::ToolError::Blocked { .. }
2434        ));
2435    }
2436
2437    #[tokio::test]
2438    async fn resolve_ipv6_loopback_rejected() {
2439        let url = url::Url::parse("https://[::1]/path").unwrap();
2440        let result = resolve_and_validate(&url).await;
2441        assert!(result.is_err());
2442        assert!(matches!(
2443            result.unwrap_err(),
2444            crate::executor::ToolError::Blocked { .. }
2445        ));
2446    }
2447
2448    #[tokio::test]
2449    async fn resolve_no_host_returns_ok() {
2450        // URL without a resolvable host — should pass through
2451        let url = url::Url::parse("https://example.com/path").unwrap();
2452        // We can't do a live DNS test, but we can verify a URL with no host
2453        let url_no_host = url::Url::parse("data:text/plain,hello").unwrap();
2454        // data: URLs have no host; resolve_and_validate should return Ok with empty addrs
2455        let result = resolve_and_validate(&url_no_host).await;
2456        assert!(result.is_ok());
2457        let (host, addrs) = result.unwrap();
2458        assert!(host.is_empty());
2459        assert!(addrs.is_empty());
2460        drop(url);
2461        drop(url_no_host);
2462    }
2463
2464    // --- audit logging ---
2465
2466    /// Helper: build an `AuditLogger` writing to a temp file, and return the logger + path.
2467    async fn make_file_audit_logger(
2468        dir: &tempfile::TempDir,
2469    ) -> (
2470        std::sync::Arc<crate::audit::AuditLogger>,
2471        std::path::PathBuf,
2472    ) {
2473        use crate::audit::AuditLogger;
2474        use crate::config::AuditConfig;
2475        let path = dir.path().join("audit.log");
2476        let config = AuditConfig {
2477            enabled: true,
2478            destination: crate::config::AuditDestination::File(path.clone()),
2479            ..Default::default()
2480        };
2481        let logger = std::sync::Arc::new(AuditLogger::from_config(&config, false).await.unwrap());
2482        (logger, path)
2483    }
2484
2485    #[tokio::test]
2486    async fn with_audit_sets_logger() {
2487        let config = ScrapeConfig::default();
2488        let executor = WebScrapeExecutor::new(&config);
2489        assert!(executor.audit_logger.is_none());
2490
2491        let dir = tempfile::tempdir().unwrap();
2492        let (logger, _path) = make_file_audit_logger(&dir).await;
2493        let executor = executor.with_audit(logger);
2494        assert!(executor.audit_logger.is_some());
2495    }
2496
2497    #[test]
2498    fn tool_error_to_audit_result_blocked_maps_correctly() {
2499        let err = ToolError::Blocked {
2500            command: "scheme not allowed: http".into(),
2501        };
2502        let result = tool_error_to_audit_result(&err);
2503        assert!(
2504            matches!(result, AuditResult::Blocked { reason } if reason == "scheme not allowed: http")
2505        );
2506    }
2507
2508    #[test]
2509    fn tool_error_to_audit_result_timeout_maps_correctly() {
2510        let err = ToolError::Timeout { timeout_secs: 15 };
2511        let result = tool_error_to_audit_result(&err);
2512        assert!(matches!(result, AuditResult::Timeout));
2513    }
2514
2515    #[test]
2516    fn tool_error_to_audit_result_execution_error_maps_correctly() {
2517        let err = ToolError::Execution(std::io::Error::other("connection refused"));
2518        let result = tool_error_to_audit_result(&err);
2519        assert!(
2520            matches!(result, AuditResult::Error { message } if message.contains("connection refused"))
2521        );
2522    }
2523
2524    #[tokio::test]
2525    async fn fetch_audit_blocked_url_logged() {
2526        let dir = tempfile::tempdir().unwrap();
2527        let (logger, log_path) = make_file_audit_logger(&dir).await;
2528
2529        let config = ScrapeConfig::default();
2530        let executor = WebScrapeExecutor::new(&config).with_audit(logger);
2531
2532        let call = crate::executor::ToolCall {
2533            tool_id: ToolName::new("fetch"),
2534            params: {
2535                let mut m = serde_json::Map::new();
2536                m.insert("url".to_owned(), serde_json::json!("http://example.com"));
2537                m
2538            },
2539            caller_id: None,
2540            context: None,
2541
2542            tool_call_id: String::new(),
2543            skill_name: None,
2544        };
2545        let result = executor.execute_tool_call(&call).await;
2546        assert!(matches!(result, Err(ToolError::Blocked { .. })));
2547
2548        let content = tokio::fs::read_to_string(&log_path).await.unwrap();
2549        assert!(
2550            content.contains("\"tool\":\"fetch\""),
2551            "expected tool=fetch in audit: {content}"
2552        );
2553        assert!(
2554            content.contains("\"type\":\"blocked\""),
2555            "expected type=blocked in audit: {content}"
2556        );
2557        assert!(
2558            content.contains("http://example.com"),
2559            "expected URL in audit command field: {content}"
2560        );
2561    }
2562
2563    #[tokio::test]
2564    async fn log_audit_success_writes_to_file() {
2565        let dir = tempfile::tempdir().unwrap();
2566        let (logger, log_path) = make_file_audit_logger(&dir).await;
2567
2568        let config = ScrapeConfig::default();
2569        let executor = WebScrapeExecutor::new(&config).with_audit(logger);
2570
2571        executor
2572            .log_audit(
2573                "fetch",
2574                "https://example.com/page",
2575                AuditResult::Success,
2576                42,
2577                None,
2578                None,
2579                None,
2580                None,
2581            )
2582            .await;
2583
2584        let content = tokio::fs::read_to_string(&log_path).await.unwrap();
2585        assert!(
2586            content.contains("\"tool\":\"fetch\""),
2587            "expected tool=fetch in audit: {content}"
2588        );
2589        assert!(
2590            content.contains("\"type\":\"success\""),
2591            "expected type=success in audit: {content}"
2592        );
2593        assert!(
2594            content.contains("\"command\":\"https://example.com/page\""),
2595            "expected command URL in audit: {content}"
2596        );
2597        assert!(
2598            content.contains("\"duration_ms\":42"),
2599            "expected duration_ms in audit: {content}"
2600        );
2601    }
2602
2603    #[tokio::test]
2604    async fn log_audit_blocked_writes_to_file() {
2605        let dir = tempfile::tempdir().unwrap();
2606        let (logger, log_path) = make_file_audit_logger(&dir).await;
2607
2608        let config = ScrapeConfig::default();
2609        let executor = WebScrapeExecutor::new(&config).with_audit(logger);
2610
2611        executor
2612            .log_audit(
2613                "web_scrape",
2614                "http://evil.com/page",
2615                AuditResult::Blocked {
2616                    reason: "scheme not allowed: http".into(),
2617                },
2618                0,
2619                None,
2620                None,
2621                None,
2622                None,
2623            )
2624            .await;
2625
2626        let content = tokio::fs::read_to_string(&log_path).await.unwrap();
2627        assert!(
2628            content.contains("\"tool\":\"web_scrape\""),
2629            "expected tool=web_scrape in audit: {content}"
2630        );
2631        assert!(
2632            content.contains("\"type\":\"blocked\""),
2633            "expected type=blocked in audit: {content}"
2634        );
2635        assert!(
2636            content.contains("scheme not allowed"),
2637            "expected block reason in audit: {content}"
2638        );
2639    }
2640
2641    #[tokio::test]
2642    async fn web_scrape_audit_blocked_url_logged() {
2643        let dir = tempfile::tempdir().unwrap();
2644        let (logger, log_path) = make_file_audit_logger(&dir).await;
2645
2646        let config = ScrapeConfig::default();
2647        let executor = WebScrapeExecutor::new(&config).with_audit(logger);
2648
2649        let call = crate::executor::ToolCall {
2650            tool_id: ToolName::new("web_scrape"),
2651            params: {
2652                let mut m = serde_json::Map::new();
2653                m.insert("url".to_owned(), serde_json::json!("http://example.com"));
2654                m.insert("select".to_owned(), serde_json::json!("h1"));
2655                m
2656            },
2657            caller_id: None,
2658            context: None,
2659
2660            tool_call_id: String::new(),
2661            skill_name: None,
2662        };
2663        let result = executor.execute_tool_call(&call).await;
2664        assert!(matches!(result, Err(ToolError::Blocked { .. })));
2665
2666        let content = tokio::fs::read_to_string(&log_path).await.unwrap();
2667        assert!(
2668            content.contains("\"tool\":\"web_scrape\""),
2669            "expected tool=web_scrape in audit: {content}"
2670        );
2671        assert!(
2672            content.contains("\"type\":\"blocked\""),
2673            "expected type=blocked in audit: {content}"
2674        );
2675    }
2676
2677    #[tokio::test]
2678    async fn no_audit_logger_does_not_panic_on_blocked_fetch() {
2679        let config = ScrapeConfig::default();
2680        let executor = WebScrapeExecutor::new(&config);
2681        assert!(executor.audit_logger.is_none());
2682
2683        let call = crate::executor::ToolCall {
2684            tool_id: ToolName::new("fetch"),
2685            params: {
2686                let mut m = serde_json::Map::new();
2687                m.insert("url".to_owned(), serde_json::json!("http://example.com"));
2688                m
2689            },
2690            caller_id: None,
2691            context: None,
2692
2693            tool_call_id: String::new(),
2694            skill_name: None,
2695        };
2696        // Must not panic even without an audit logger
2697        let result = executor.execute_tool_call(&call).await;
2698        assert!(matches!(result, Err(ToolError::Blocked { .. })));
2699    }
2700
2701    // CR-10: fetch end-to-end via execute_tool_call -> handle_fetch -> fetch_html
2702    #[tokio::test]
2703    async fn fetch_execute_tool_call_end_to_end() {
2704        use wiremock::matchers::{method, path};
2705        use wiremock::{Mock, ResponseTemplate};
2706
2707        let (executor, server) = mock_server_executor().await;
2708        Mock::given(method("GET"))
2709            .and(path("/e2e"))
2710            .respond_with(ResponseTemplate::new(200).set_body_string("<h1>end-to-end</h1>"))
2711            .mount(&server)
2712            .await;
2713
2714        let (host, addrs) = server_host_and_addr(&server);
2715        // Call fetch_html directly (bypassing SSRF guard for loopback mock server)
2716        let result = executor
2717            .fetch_html(
2718                &format!("{}/e2e", server.uri()),
2719                &host,
2720                &addrs,
2721                "fetch",
2722                "test-cid",
2723                None,
2724                None,
2725            )
2726            .await;
2727        assert!(result.is_ok());
2728        assert!(result.unwrap().contains("end-to-end"));
2729    }
2730
2731    // --- domain_matches ---
2732
2733    #[test]
2734    fn domain_matches_exact() {
2735        assert!(domain_matches("example.com", "example.com"));
2736        assert!(!domain_matches("example.com", "other.com"));
2737        assert!(!domain_matches("example.com", "sub.example.com"));
2738    }
2739
2740    #[test]
2741    fn domain_matches_wildcard_single_subdomain() {
2742        assert!(domain_matches("*.example.com", "sub.example.com"));
2743        assert!(!domain_matches("*.example.com", "example.com"));
2744        assert!(!domain_matches("*.example.com", "sub.sub.example.com"));
2745    }
2746
2747    #[test]
2748    fn domain_matches_wildcard_does_not_match_empty_label() {
2749        // Pattern "*.example.com" requires a non-empty label before ".example.com"
2750        assert!(!domain_matches("*.example.com", ".example.com"));
2751    }
2752
2753    #[test]
2754    fn domain_matches_multi_wildcard_treated_as_exact() {
2755        // Multiple wildcards are unsupported — treated as literal pattern
2756        assert!(!domain_matches("*.*.example.com", "a.b.example.com"));
2757    }
2758
2759    // --- check_domain_policy ---
2760
2761    #[test]
2762    fn check_domain_policy_empty_lists_allow_all() {
2763        assert!(check_domain_policy("example.com", &[], &[]).is_ok());
2764        assert!(check_domain_policy("evil.com", &[], &[]).is_ok());
2765    }
2766
2767    #[test]
2768    fn check_domain_policy_denylist_blocks() {
2769        let denied = vec!["evil.com".to_string()];
2770        let err = check_domain_policy("evil.com", &[], &denied).unwrap_err();
2771        assert!(matches!(err, ToolError::Blocked { .. }));
2772    }
2773
2774    #[test]
2775    fn check_domain_policy_denylist_does_not_block_other_domains() {
2776        let denied = vec!["evil.com".to_string()];
2777        assert!(check_domain_policy("good.com", &[], &denied).is_ok());
2778    }
2779
2780    #[test]
2781    fn check_domain_policy_allowlist_permits_matching() {
2782        let allowed = vec!["docs.rs".to_string(), "*.rust-lang.org".to_string()];
2783        assert!(check_domain_policy("docs.rs", &allowed, &[]).is_ok());
2784        assert!(check_domain_policy("blog.rust-lang.org", &allowed, &[]).is_ok());
2785    }
2786
2787    #[test]
2788    fn check_domain_policy_allowlist_blocks_unknown() {
2789        let allowed = vec!["docs.rs".to_string()];
2790        let err = check_domain_policy("other.com", &allowed, &[]).unwrap_err();
2791        assert!(matches!(err, ToolError::Blocked { .. }));
2792    }
2793
2794    #[test]
2795    fn check_domain_policy_deny_overrides_allow() {
2796        let allowed = vec!["example.com".to_string()];
2797        let denied = vec!["example.com".to_string()];
2798        let err = check_domain_policy("example.com", &allowed, &denied).unwrap_err();
2799        assert!(matches!(err, ToolError::Blocked { .. }));
2800    }
2801
2802    #[test]
2803    fn check_domain_policy_wildcard_in_denylist() {
2804        let denied = vec!["*.evil.com".to_string()];
2805        let err = check_domain_policy("sub.evil.com", &[], &denied).unwrap_err();
2806        assert!(matches!(err, ToolError::Blocked { .. }));
2807        // parent domain not blocked
2808        assert!(check_domain_policy("evil.com", &[], &denied).is_ok());
2809    }
2810}
zeph_tools/scrape.rs

zeph_tools/
scrape.rs