entelix_tools/
http_fetch.rs

1//! `HttpFetchTool` — `Tool` impl for outbound HTTP fetches.
2//!
3//! ## Threat model
4//!
5//! Naïve "let the model fetch any URL" tools are SSRF magnets:
6//! agents have been talked into hitting `http://169.254.169.254/...`
7//! (cloud metadata) and internal services. The defense lives in
8//! three orthogonal layers:
9//!
//! 1. **Host allowlist** — explicit allow-by-domain list. The
//!    builder requires at least one entry; an unconfigured tool
//!    refuses every URL. [`HostAllowlist`] supports exact matches,
//!    wildcard subdomains (`*.example.com`), and exact IP-literal
//!    permits.
15//! 2. **Scheme guard** — only `http` and `https`. `file://`,
16//!    `javascript:`, `data:`, `gopher://`, and IP-of-ftp tricks all
17//!    bounce here.
18//! 3. **Private-IP block** — by default literal IPs in
19//!    loopback / private / link-local / metadata ranges are
20//!    rejected even when the surface allowlist would otherwise
21//!    permit them. Override with [`HostRule::IpExact`] when an
22//!    on-prem deployment genuinely needs `127.0.0.1:8080`.
23//!
24//! Layered defense rather than a single check — any one layer
25//! could be misconfigured but all three together close the
26//! reasonable SSRF surface.
27//!
28//! ## Resource caps
29//!
30//! - **Method allowlist** — defaults to `[GET]`. POST / PATCH must
31//!   be opted in.
32//! - **Redirect cap** — defaults to 5; `0` disables redirects.
//! - **Body cap** — defaults to 1 MiB; once the cap is reached the
//!   response stream stops being read and the output is flagged
//!   `truncated: true` instead of buffering the whole tail.
36//! - **Per-call timeout** — defaults to 30 s; respects
37//!   [`entelix_core::context::ExecutionContext::cancellation`] as a
38//!   secondary kill switch.
39
40use std::collections::HashSet;
41use std::net::IpAddr;
42use std::sync::Arc;
43use std::time::Duration;
44
45use async_trait::async_trait;
46use bytes::BytesMut;
47use futures::StreamExt;
48use reqwest::Method;
49use reqwest::redirect::Policy;
50use serde::{Deserialize, Serialize};
51use serde_json::{Value, json};
52use url::Url;
53
54use entelix_core::AgentContext;
55use entelix_core::error::Result;
56use entelix_core::tools::{Tool, ToolEffect, ToolMetadata};
57
58use crate::error::{ToolError, ToolResult};
59
/// Redirect-chain cap a builder starts with when none is configured.
pub const DEFAULT_MAX_REDIRECTS: usize = 5;

/// Response-body cap a builder starts with when none is configured
/// (1 MiB, i.e. 2^20 bytes).
pub const DEFAULT_MAX_RESPONSE_BYTES: usize = 1 << 20;

/// Per-call timeout a builder starts with when none is configured
/// (30 seconds).
pub const DEFAULT_FETCH_TIMEOUT: Duration = Duration::from_secs(30);
68
/// A single entry of a [`HostAllowlist`].
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum HostRule {
    /// Admits exactly one hostname, compared case-insensitively
    /// (for instance `api.example.com`).
    Exact(String),
    /// Admits every subdomain of the given root: a `*.example.com`
    /// rule admits `a.example.com` and `b.c.example.com`, but the
    /// bare `example.com` is not covered.
    Wildcard(String),
    /// Admits exactly one IP literal (for instance `127.0.0.1`).
    /// Use sparingly — this rule bypasses the private-IP block.
    IpExact(IpAddr),
}
82
83/// Host allowlist. Fail-closed: empty allowlist rejects everything.
84#[derive(Clone, Debug, Default)]
85pub struct HostAllowlist {
86    rules: Vec<HostRule>,
87}
88
89impl HostAllowlist {
90    /// Empty (fail-closed) allowlist.
91    #[must_use]
92    pub fn new() -> Self {
93        Self::default()
94    }
95
96    /// Normalize a hostname to its ASCII-Punycode form (UTS-46 +
97    /// IDNA-2008 transitional rules) and lower-case it. IDN inputs
98    /// like `пример.рф` round-trip to `xn--e1afmkfd.xn--p1ai`, which
99    /// is the form `Url::host_str()` returns at check time. Falls
100    /// back to a plain lowercase when normalization fails (preserves
101    /// the previous behavior for hostnames the IDNA pass rejects).
102    fn normalize(host: &str) -> String {
103        idna::domain_to_ascii(host).map_or_else(|_| host.to_lowercase(), |s| s.to_lowercase())
104    }
105
106    /// Append an exact hostname rule (case-insensitive). IDN inputs
107    /// are normalized to Punycode so a Cyrillic-look-alike domain
108    /// cannot bypass an entry registered in Latin script (or vice
109    /// versa).
110    #[must_use]
111    pub fn add_exact_host(mut self, host: impl Into<String>) -> Self {
112        self.rules
113            .push(HostRule::Exact(Self::normalize(&host.into())));
114        self
115    }
116
117    /// Append a wildcard-subdomain rule. The leading `*.` is
118    /// optional in the supplied string; both `*.example.com` and
119    /// `example.com` are accepted as input and stored without the
120    /// `*.` prefix for matching. Inputs are normalized to Punycode
121    /// the same way as [`Self::add_exact_host`].
122    #[must_use]
123    pub fn add_subdomain_root(mut self, host: impl Into<String>) -> Self {
124        let raw = host.into();
125        let stripped = raw.strip_prefix("*.").unwrap_or(&raw);
126        self.rules
127            .push(HostRule::Wildcard(Self::normalize(stripped)));
128        self
129    }
130
131    /// Append an exact IP literal rule. Intended for narrow on-prem
132    /// allowances; prefer `allow_exact` over this for hostnames.
133    #[must_use]
134    pub fn add_exact_ip(mut self, ip: IpAddr) -> Self {
135        self.rules.push(HostRule::IpExact(ip));
136        self
137    }
138
139    /// Number of registered rules.
140    #[must_use]
141    pub fn len(&self) -> usize {
142        self.rules.len()
143    }
144
145    /// Whether the allowlist has zero rules (rejects everything).
146    #[must_use]
147    pub fn is_empty(&self) -> bool {
148        self.rules.is_empty()
149    }
150
151    /// Borrow every IP registered via [`Self::add_exact_ip`].
152    /// Used by [`HttpFetchToolBuilder`] to seed the SSRF-safe DNS
153    /// resolver's explicit-allow set so on-prem private-IP
154    /// allowances pass the connect-time block.
155    pub fn explicit_ips(&self) -> std::collections::HashSet<IpAddr> {
156        self.rules
157            .iter()
158            .filter_map(|r| match r {
159                HostRule::IpExact(ip) => Some(*ip),
160                _ => None,
161            })
162            .collect()
163    }
164
165    fn check(&self, url: &Url) -> ToolResult<()> {
166        let host = url.host_str().ok_or_else(|| ToolError::HostBlocked {
167            host: "<no host>".to_owned(),
168        })?;
169        // `Url::parse` already Punycode-encodes IDN hosts, but we run
170        // the same normalize pass so rule and check stay symmetric —
171        // any future change to the normalizer applies everywhere at
172        // once.
173        let host_lower = Self::normalize(host);
174
175        // 1. IP literal short-circuit — only allowed via explicit
176        //    IpExact rule (overrides the private-IP block).
177        if let Ok(ip) = host_lower.parse::<IpAddr>() {
178            for rule in &self.rules {
179                if let HostRule::IpExact(allowed) = rule
180                    && *allowed == ip
181                {
182                    return Ok(());
183                }
184            }
185            return Err(ToolError::HostBlocked { host: host_lower });
186        }
187
188        // 2. Hostname rules.
189        for rule in &self.rules {
190            match rule {
191                HostRule::Exact(h) if h == &host_lower => return Ok(()),
192                HostRule::Wildcard(suffix) => {
193                    if host_lower == *suffix {
194                        // `*.example.com` does NOT match the apex
195                        // `example.com` — that's the whole point of
196                        // wildcard-subdomain: subdomains, not the
197                        // bare host.
198                        continue;
199                    }
200                    if host_lower.ends_with(&format!(".{suffix}")) {
201                        return Ok(());
202                    }
203                }
204                _ => {}
205            }
206        }
207        Err(ToolError::HostBlocked { host: host_lower })
208    }
209}
210
211/// Builder for [`HttpFetchTool`].
212pub struct HttpFetchToolBuilder {
213    allowlist: HostAllowlist,
214    max_redirects: usize,
215    max_response_bytes: usize,
216    timeout: Duration,
217    allowed_methods: HashSet<Method>,
218    user_agent: String,
219    /// Lower-cased response header names the tool surfaces to the
220    /// model. Empty = no headers reach the LLM (default — most
221    /// vendor headers like `set-cookie`, `cf-ray`, `x-amz-*` are
222    /// noise that burns model attention without informing reasoning,
223    /// invariant #16).
224    exposed_response_headers: HashSet<String>,
225}
226
227impl HttpFetchToolBuilder {
228    /// Start a builder with no allowlist (fail-closed), the
229    /// `[GET]` default method allowlist, and no response headers
230    /// exposed to the model.
231    #[must_use]
232    pub fn new() -> Self {
233        let mut methods = HashSet::new();
234        methods.insert(Method::GET);
235        Self {
236            allowlist: HostAllowlist::new(),
237            max_redirects: DEFAULT_MAX_REDIRECTS,
238            max_response_bytes: DEFAULT_MAX_RESPONSE_BYTES,
239            timeout: DEFAULT_FETCH_TIMEOUT,
240            allowed_methods: methods,
241            user_agent: format!("entelix-http-fetch/{}", env!("CARGO_PKG_VERSION")),
242            exposed_response_headers: HashSet::new(),
243        }
244    }
245
246    /// Set the host allowlist outright.
247    #[must_use]
248    pub fn with_allowlist(mut self, allowlist: HostAllowlist) -> Self {
249        self.allowlist = allowlist;
250        self
251    }
252
253    /// Cap redirect chain length. `0` disables redirects entirely.
254    #[must_use]
255    pub const fn with_max_redirects(mut self, n: usize) -> Self {
256        self.max_redirects = n;
257        self
258    }
259
260    /// Cap the response body in bytes.
261    #[must_use]
262    pub const fn with_max_response_bytes(mut self, n: usize) -> Self {
263        self.max_response_bytes = n;
264        self
265    }
266
267    /// Per-call timeout.
268    #[must_use]
269    pub const fn with_timeout(mut self, t: Duration) -> Self {
270        self.timeout = t;
271        self
272    }
273
274    /// Set the method allowlist outright. Intersect with the actual
275    /// HTTP method on the input — anything not in this set is
276    /// rejected.
277    #[must_use]
278    pub fn with_allowed_methods<I: IntoIterator<Item = Method>>(mut self, methods: I) -> Self {
279        self.allowed_methods = methods.into_iter().collect();
280        self
281    }
282
283    /// Override the `User-Agent` header.
284    #[must_use]
285    pub fn with_user_agent(mut self, ua: impl Into<String>) -> Self {
286        self.user_agent = ua.into();
287        self
288    }
289
290    /// Allow the tool to surface specific response headers to the
291    /// model (LLM-facing). Header names are lower-cased and matched
292    /// case-insensitively. Default is the empty set — every response
293    /// header is dropped from the tool output, sparing the model
294    /// from `set-cookie` / `cf-ray` / `x-amz-request-id` /
295    /// `content-encoding` noise that costs tokens without informing
296    /// reasoning (invariant #16). Operators that need a header
297    /// (e.g. `content-type` for the model to branch on payload
298    /// shape) opt it in explicitly.
299    #[must_use]
300    pub fn with_exposed_response_headers<I, S>(mut self, headers: I) -> Self
301    where
302        I: IntoIterator<Item = S>,
303        S: AsRef<str>,
304    {
305        self.exposed_response_headers = headers
306            .into_iter()
307            .map(|h| h.as_ref().to_ascii_lowercase())
308            .collect();
309        self
310    }
311
312    /// Finalize. Returns [`ToolError::Config`] when the allowlist
313    /// is empty (fail-closed: an unconfigured tool would refuse
314    /// every URL anyway, but explicit early failure is friendlier
315    /// to operators).
316    pub fn build(self) -> ToolResult<HttpFetchTool> {
317        if self.allowlist.is_empty() {
318            return Err(ToolError::config_msg(
319                "HttpFetchTool requires at least one HostAllowlist rule",
320            ));
321        }
322        // Allowlist + scheme guard re-applied on every redirect hop.
323        // Without this, a 302 from an allowlisted host to an
324        // unlisted (but DNS-public) one would succeed: the
325        // host-allowlist check only ran on the first URL and the
326        // SSRF DNS resolver alone does not enforce host policy.
327        let allowlist_for_policy = Arc::new(self.allowlist.clone());
328        let max_redirects = self.max_redirects;
329        let policy = if max_redirects == 0 {
330            Policy::none()
331        } else {
332            Policy::custom(move |attempt| {
333                if attempt.previous().len() >= max_redirects {
334                    return attempt.error(redirect_error(format!(
335                        "redirect cap exceeded ({max_redirects})"
336                    )));
337                }
338                let scheme = attempt.url().scheme().to_owned();
339                if !matches!(scheme.as_str(), "http" | "https") {
340                    return attempt.error(redirect_error(format!(
341                        "redirect to disallowed scheme '{scheme}'"
342                    )));
343                }
344                if let Err(e) = allowlist_for_policy.check(attempt.url()) {
345                    return attempt.error(redirect_error(format!(
346                        "redirect to non-allowlisted host: {e}"
347                    )));
348                }
349                attempt.follow()
350            })
351        };
352        // SSRF-safe DNS resolver: filters every connect-time lookup
353        // against the private/loopback/metadata block. IP literals
354        // explicitly registered on the allowlist override the block
355        // (on-prem proxies bind 127.0.0.1, etc.).
356        let resolver = crate::dns::SsrfSafeDnsResolver::from_system()?
357            .with_explicit_allow(self.allowlist.explicit_ips());
358        let client = reqwest::Client::builder()
359            .timeout(self.timeout)
360            .redirect(policy)
361            .user_agent(self.user_agent)
362            .dns_resolver(Arc::new(resolver))
363            .build()
364            .map_err(|e| ToolError::Config {
365                message: format!("HTTP client: {e}"),
366                source: Some(Box::new(e)),
367            })?;
368        let metadata = ToolMetadata::function(
369            "http_fetch",
370            "Fetch a URL over HTTP/HTTPS. Returns status, final_url (post-redirect), \
371             headers, body. Restricted to the configured host allowlist.",
372            json!({
373                "type": "object",
374                "required": ["url"],
375                "properties": {
376                    "url": {
377                        "type": "string",
378                        "description": "Absolute http(s) URL to fetch."
379                    },
380                    "method": {
381                        "type": "string",
382                        "description": "HTTP method (default: GET).",
383                        "enum": ["GET", "HEAD", "POST", "PUT", "PATCH", "DELETE"]
384                    },
385                    "headers": {
386                        "type": "object",
387                        "description": "Extra request headers.",
388                        "additionalProperties": { "type": "string" }
389                    },
390                    "body": {
391                        "type": "string",
392                        "description": "Request body (for non-GET methods)."
393                    }
394                }
395            }),
396        )
397        .with_effect(ToolEffect::Mutating);
398        Ok(HttpFetchTool {
399            client,
400            allowlist: Arc::new(self.allowlist),
401            max_response_bytes: self.max_response_bytes,
402            allowed_methods: Arc::new(self.allowed_methods),
403            exposed_response_headers: Arc::new(self.exposed_response_headers),
404            metadata: Arc::new(metadata),
405        })
406    }
407}
408
409/// Wraps a redirect-rejection message into a `Box<dyn Error>` so
410/// `reqwest::redirect::Attempt::error` accepts it.
411fn redirect_error(message: String) -> Box<dyn std::error::Error + Send + Sync> {
412    Box::new(RedirectRejected(message))
413}
414
/// Error value the redirect policy hands back to reqwest when it
/// refuses a hop. Its text is surfaced through reqwest's
/// `Error::Display`, so callers see the rejection reason in their
/// `ToolError::Network` chain.
#[derive(Debug)]
struct RedirectRejected(String);

impl std::fmt::Display for RedirectRejected {
    /// Writes the stored rejection message verbatim.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(reason) = self;
        f.write_str(reason)
    }
}

impl std::error::Error for RedirectRejected {}
428
429impl Default for HttpFetchToolBuilder {
430    fn default() -> Self {
431        Self::new()
432    }
433}
434
435/// HTTP fetch [`Tool`] for agentic workflows.
436///
437/// Cloning is cheap (handles are `Arc`-backed). Share one tool
438/// instance across the process and across the hooks pipeline.
439#[derive(Clone)]
440pub struct HttpFetchTool {
441    client: reqwest::Client,
442    allowlist: Arc<HostAllowlist>,
443    max_response_bytes: usize,
444    allowed_methods: Arc<HashSet<Method>>,
445    exposed_response_headers: Arc<HashSet<String>>,
446    metadata: Arc<ToolMetadata>,
447}
448
449#[allow(
450    clippy::missing_fields_in_debug,
451    reason = "`reqwest::Client` is opaque; printed as configured-rule counts"
452)]
453impl std::fmt::Debug for HttpFetchTool {
454    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
455        f.debug_struct("HttpFetchTool")
456            .field("allowlist_rules", &self.allowlist.len())
457            .field("max_response_bytes", &self.max_response_bytes)
458            .field("allowed_methods", &self.allowed_methods.len())
459            .finish()
460    }
461}
462
463impl HttpFetchTool {
464    /// Start a builder.
465    #[must_use]
466    pub fn builder() -> HttpFetchToolBuilder {
467        HttpFetchToolBuilder::new()
468    }
469}
470
471#[derive(Debug, Deserialize)]
472struct FetchInput {
473    url: String,
474    #[serde(default)]
475    method: Option<String>,
476    #[serde(default)]
477    headers: Option<std::collections::HashMap<String, String>>,
478    #[serde(default)]
479    body: Option<String>,
480}
481
482#[derive(Debug, Serialize)]
483struct FetchOutput {
484    status: u16,
485    final_url: String,
486    headers: std::collections::HashMap<String, String>,
487    body: String,
488    truncated: bool,
489}
490
491#[async_trait]
492impl Tool for HttpFetchTool {
493    fn metadata(&self) -> &ToolMetadata {
494        &self.metadata
495    }
496
497    async fn execute(&self, input: Value, ctx: &AgentContext<()>) -> Result<Value> {
498        let parsed: FetchInput = serde_json::from_value(input).map_err(ToolError::from)?;
499        let url = Url::parse(&parsed.url)
500            .map_err(|e| ToolError::InvalidInput(format!("malformed URL: {e}")))?;
501        if !matches!(url.scheme(), "http" | "https") {
502            return Err(ToolError::UnsupportedScheme {
503                scheme: url.scheme().to_owned(),
504            }
505            .into());
506        }
507        self.allowlist.check(&url)?;
508
509        let method = match parsed.method.as_deref() {
510            Some(m) => Method::from_bytes(m.to_uppercase().as_bytes())
511                .map_err(|_| ToolError::InvalidInput(format!("unknown method '{m}'")))?,
512            None => Method::GET,
513        };
514        if !self.allowed_methods.contains(&method) {
515            return Err(ToolError::MethodBlocked {
516                method: method.to_string(),
517            }
518            .into());
519        }
520
521        let mut request = self.client.request(method, url.clone());
522        if let Some(headers) = &parsed.headers {
523            for (k, v) in headers {
524                request = request.header(k, v);
525            }
526        }
527        if let Some(body) = parsed.body {
528            request = request.body(body);
529        }
530
531        // Race the HTTP send against cancellation.
532        let cancel = ctx.cancellation().clone();
533        let response = tokio::select! {
534            biased;
535            () = cancel.cancelled() => {
536                return Err(ToolError::network_msg("cancelled").into());
537            }
538            r = request.send() => r.map_err(ToolError::network)?,
539        };
540
541        let status = response.status().as_u16();
542        let final_url = response.url().to_string();
543        // Default-deny: only headers the operator explicitly opted
544        // in via `with_exposed_response_headers` flow to the LLM. Vendor
545        // chrome (`set-cookie`, `cf-ray`, `x-amz-*`, `via`,
546        // `content-encoding`, …) costs the model tokens without
547        // informing reasoning (invariant #16).
548        let allow = &*self.exposed_response_headers;
549        let response_headers = if allow.is_empty() {
550            std::collections::HashMap::new()
551        } else {
552            response
553                .headers()
554                .iter()
555                .filter(|(k, _)| allow.contains(k.as_str()))
556                .filter_map(|(k, v)| v.to_str().ok().map(|s| (k.to_string(), s.to_owned())))
557                .collect::<std::collections::HashMap<_, _>>()
558        };
559
560        // Stream-and-cap body collection.
561        let mut buf = BytesMut::new();
562        let mut truncated = false;
563        let mut stream = response.bytes_stream();
564        let cancel = ctx.cancellation().clone();
565        loop {
566            let chunk = tokio::select! {
567                biased;
568                () = cancel.cancelled() => {
569                    return Err(ToolError::network_msg("cancelled").into());
570                }
571                next = stream.next() => match next {
572                    Some(Ok(c)) => c,
573                    Some(Err(e)) => {
574                        return Err(ToolError::network(e).into());
575                    }
576                    None => break,
577                },
578            };
579            if buf.len().saturating_add(chunk.len()) > self.max_response_bytes {
580                let take = self
581                    .max_response_bytes
582                    .saturating_sub(buf.len())
583                    .min(chunk.len());
584                buf.extend_from_slice(chunk.get(..take).unwrap_or(&[]));
585                truncated = true;
586                break;
587            }
588            buf.extend_from_slice(&chunk);
589        }
590
591        // Treat the body as UTF-8 text when it parses, otherwise
592        // hex-prefixed binary marker. Tools shouldn't surface
593        // arbitrary bytes inline; the agent loop expects strings.
594        let body = match std::str::from_utf8(&buf) {
595            Ok(s) => s.to_owned(),
596            Err(_) => format!("<binary {} bytes>", buf.len()),
597        };
598
599        let output = FetchOutput {
600            status,
601            final_url,
602            headers: response_headers,
603            body,
604            truncated,
605        };
606        Ok(serde_json::to_value(output).map_err(ToolError::from)?)
607    }
608}
609
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::indexing_slicing, clippy::ip_constant)]
mod tests {
    use std::net::Ipv4Addr;

    use super::*;

    /// Parse a URL that is known to be well-formed.
    fn url(s: &str) -> Url {
        Url::parse(s).unwrap()
    }

    /// `true` when `list` admits the already-parsed `target`.
    fn admits_url(list: &HostAllowlist, target: &Url) -> bool {
        list.check(target).is_ok()
    }

    /// `true` when `list` admits the URL written as `target`.
    fn admits(list: &HostAllowlist, target: &str) -> bool {
        admits_url(list, &url(target))
    }

    #[test]
    fn empty_allowlist_rejects_everything() {
        let list = HostAllowlist::new();
        assert!(!admits(&list, "https://example.com/x"));
    }

    #[test]
    fn exact_host_match() {
        let list = HostAllowlist::new().add_exact_host("api.example.com");
        assert!(admits(&list, "https://api.example.com/path"));
        assert!(!admits(&list, "https://other.example.com/"));
    }

    #[test]
    fn case_insensitive_hostname_match() {
        let list = HostAllowlist::new().add_exact_host("API.example.com");
        for target in ["https://api.example.com/", "https://API.EXAMPLE.COM/"] {
            assert!(admits(&list, target));
        }
    }

    #[test]
    fn wildcard_matches_subdomains_only_not_apex() {
        let list = HostAllowlist::new().add_subdomain_root("example.com");
        assert!(admits(&list, "https://a.example.com/"));
        assert!(admits(&list, "https://x.y.example.com/"));
        // The bare apex is not a subdomain, so the wildcard skips it.
        assert!(!admits(&list, "https://example.com/"));
    }

    #[test]
    fn wildcard_input_strips_leading_star_dot() {
        let list = HostAllowlist::new().add_subdomain_root("*.example.com");
        assert!(admits(&list, "https://a.example.com/"));
    }

    #[test]
    fn ip_literals_require_explicit_rule() {
        let list = HostAllowlist::new().add_exact_host("example.com");
        for target in ["http://127.0.0.1/x", "http://10.0.0.5/x"] {
            assert!(!admits(&list, target));
        }
    }

    #[test]
    fn explicit_ip_exact_admits() {
        let loopback = IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1));
        let list = HostAllowlist::new().add_exact_ip(loopback);
        assert!(admits(&list, "http://127.0.0.1/x"));
        assert!(!admits(&list, "http://127.0.0.2/x"));
    }

    #[test]
    fn builder_requires_non_empty_allowlist() {
        let outcome = HttpFetchToolBuilder::new().build();
        let err = outcome.unwrap_err();
        assert!(matches!(err, ToolError::Config { .. }));
    }

    #[test]
    fn idn_rule_matches_punycode_url() {
        // The rule is written in human-readable script while the URL
        // carries the Punycode spelling (`xn--e1afmkfd.xn--p1ai` is
        // the canonical Punycode of пример.рф); both must agree.
        let list = HostAllowlist::new().add_exact_host("пример.рф");
        let target = url("https://xn--e1afmkfd.xn--p1ai/");
        assert_eq!(target.host_str(), Some("xn--e1afmkfd.xn--p1ai"));
        assert!(admits_url(&list, &target));
    }

    #[test]
    fn punycode_rule_matches_idn_input_via_url_parse() {
        // Mirror image of the test above: Punycode rule, IDN URL that
        // `Url::parse` canonicalizes to ASCII.
        let list = HostAllowlist::new().add_exact_host("xn--e1afmkfd.xn--p1ai");
        let target = url("https://пример.рф/path");
        assert!(admits_url(&list, &target));
    }

    #[test]
    fn cyrillic_lookalike_blocked_when_only_latin_is_allowed() {
        // Cyrillic 'е' (U+0435) looks exactly like Latin 'e'
        // (U+0065). A rule for the Latin domain must not admit the
        // homograph: after IDNA the two hosts have different ASCII
        // (Punycode) spellings.
        let list = HostAllowlist::new().add_exact_host("example.com");
        // "еxample.com" with its first letter swapped for Cyrillic 'е'.
        let lookalike = "\u{0435}xample.com";
        // `Url::parse` applies IDNA, so `host_str` is the Punycode
        // form rather than "example.com".
        let target = Url::parse(&format!("https://{lookalike}/")).unwrap();
        assert_ne!(target.host_str(), Some("example.com"));
        assert!(!admits_url(&list, &target));
    }

    #[test]
    fn idn_wildcard_matches_subdomain() {
        let list = HostAllowlist::new().add_subdomain_root("пример.рф");
        let target = url("https://api.xn--e1afmkfd.xn--p1ai/");
        assert!(admits_url(&list, &target));
    }
}
entelix_tools/http_fetch.rs

entelix_tools/
http_fetch.rs