Skip to main content

zlayer_overlay/
dns.rs

1//! DNS server for service discovery over overlay networks
2
3use hickory_client::client::{Client, SyncClient};
4use hickory_client::udp::UdpClientConnection;
5use hickory_server::authority::{Catalog, ZoneType};
6use hickory_server::proto::rr::rdata::{A, AAAA};
7use hickory_server::proto::rr::{DNSClass, LowerName, Name, RData, Record, RecordType};
8use hickory_server::resolver::config::NameServerConfigGroup;
9use hickory_server::server::ServerFuture;
10use hickory_server::store::in_memory::InMemoryAuthority;
11use serde::{Deserialize, Serialize};
12use std::collections::HashMap;
13use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr};
14use std::str::FromStr;
15use std::sync::Arc;
16use std::time::Duration;
17use tokio::net::{TcpListener, UdpSocket};
18use tokio::sync::RwLock;
19
/// Default DNS port for overlay service discovery (non-standard to avoid conflicts).
///
/// This is the port [`DnsConfig::new`] assigns; override it with
/// [`DnsConfig::with_port`].
pub const DEFAULT_DNS_PORT: u16 = 15353;

/// Standard DNS port attached to every upstream that is synthesised from a
/// bare IP address: nameservers parsed out of resolv.conf (which has no port
/// syntax) and the entries of [`PUBLIC_FALLBACK_UPSTREAMS`].
const STANDARD_DNS_PORT: u16 = 53;

/// Well-known public recursive resolvers used as a last-resort fallback when
/// no usable host upstream can be detected.
///
/// Cloudflare (`1.1.1.1`) is listed first, Google (`8.8.8.8`) second. These are
/// only ever reached when [`resolve_upstreams`] cannot extract a single
/// non-loopback nameserver from `/etc/resolv.conf` — i.e. the file could not be
/// read, or the host resolver is either absent or wholly stub-based (the
/// netbird / systemd-resolved failure mode this forwarder exists to route
/// around). Each address is paired with [`STANDARD_DNS_PORT`] when used.
const PUBLIC_FALLBACK_UPSTREAMS: [IpAddr; 2] = [
    IpAddr::V4(Ipv4Addr::new(1, 1, 1, 1)),
    IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8)),
];

/// Path to the host resolver configuration parsed for default upstreams.
///
/// Production constructors pass this to [`resolve_upstreams`]; the path is a
/// parameter there so tests can point at a different file.
pub(crate) const RESOLV_CONF_PATH: &str = "/etc/resolv.conf";
42
/// Configuration for DNS integration with overlay network
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DnsConfig {
    /// DNS zone (e.g., "overlay.local.")
    pub zone: String,
    /// DNS server port (default: 15353, i.e. [`DEFAULT_DNS_PORT`])
    pub port: u16,
    /// Bind address (default: overlay IP)
    pub bind_addr: IpAddr,
    /// Explicit upstream resolvers for non-overlay queries.
    ///
    /// When `Some` **and non-empty**, this list wins outright over any
    /// host-resolver detection: the overlay DNS server forwards every query
    /// *outside* [`Self::zone`] to these addresses (in order) and never
    /// consults `/etc/resolv.conf`. This is the production-safe override for
    /// hosts where a mesh VPN (netbird, Tailscale, …) has hijacked
    /// systemd-resolved with a `~.` catch-all and poisoned the host resolver
    /// for everything else.
    ///
    /// When `None` (the default) — or `Some` of an empty list, which
    /// [`resolve_upstreams`] treats the same way — the server detects
    /// upstreams at startup by parsing `/etc/resolv.conf` and filtering out
    /// loopback / resolved-stub addresses; see [`resolve_upstreams`] for the
    /// exact precedence and the public fallback.
    ///
    /// Each entry is a full `SocketAddr` so a non-standard upstream port can be
    /// expressed; detection synthesises port [`STANDARD_DNS_PORT`] (53).
    ///
    /// `#[serde(default)]` lets serialized configs omit this field, in which
    /// case it deserializes to `None`.
    #[serde(default)]
    pub upstreams: Option<Vec<SocketAddr>>,
}
71
72impl DnsConfig {
73    /// Create a new DNS config with defaults
74    #[must_use]
75    pub fn new(zone: &str, bind_addr: IpAddr) -> Self {
76        Self {
77            zone: zone.to_string(),
78            port: DEFAULT_DNS_PORT,
79            bind_addr,
80            upstreams: None,
81        }
82    }
83
84    /// Set a custom port
85    #[must_use]
86    pub fn with_port(mut self, port: u16) -> Self {
87        self.port = port;
88        self
89    }
90
91    /// Set explicit upstream resolvers for non-overlay queries.
92    ///
93    /// Supplying this disables host-resolver auto-detection entirely (the
94    /// config override always wins). Pass full `SocketAddr`s; for the common
95    /// case of "this IP on port 53" build them as `SocketAddr::new(ip, 53)`.
96    #[must_use]
97    pub fn with_upstreams(mut self, upstreams: Vec<SocketAddr>) -> Self {
98        self.upstreams = Some(upstreams);
99        self
100    }
101}
102
/// Returns `true` for addresses that must never be used as an overlay DNS
/// upstream because forwarding to them would either loop back into a broken
/// host resolver or hit the systemd-resolved stub.
///
/// Filtered out:
/// - `127.0.0.53` — the systemd-resolved stub listener. This is the exact
///   address a mesh VPN hijacks; forwarding here re-introduces the failure we
///   exist to bypass.
/// - any other IPv4/IPv6 loopback (`127.0.0.0/8`, `::1`) — a resolver that is
///   only reachable on loopback is, from a *container's* perspective, useless
///   (the container has its own loopback) and is almost always the host stub.
/// - the unspecified address (`0.0.0.0`, `::`) — never a valid nameserver.
/// - the IPv4-mapped IPv6 spellings of all of the above (e.g.
///   `::ffff:127.0.0.53`, `::ffff:0.0.0.0`). `Ipv6Addr::is_loopback` only
///   recognises `::1`, so mapped addresses are unwrapped to their IPv4 form
///   before being classified; otherwise the stub could sneak back in through
///   its IPv6 notation.
fn is_unusable_upstream(ip: IpAddr) -> bool {
    // Classify an IPv4-mapped IPv6 address by the IPv4 address it embeds.
    let effective = match ip {
        IpAddr::V6(v6) => v6.to_ipv4_mapped().map_or(ip, IpAddr::V4),
        IpAddr::V4(_) => ip,
    };
    effective.is_loopback() || effective.is_unspecified()
}
121
122/// Parse `nameserver` directives out of resolv.conf-formatted text.
123///
124/// Only the `nameserver <ip>` directive is honoured (the sole directive that
125/// names an upstream); `search`, `domain`, `options`, comments (`#`/`;`) and
126/// blank lines are ignored. Loopback / stub / unspecified entries are filtered
127/// via [`is_unusable_upstream`] so a systemd-resolved `nameserver 127.0.0.53`
128/// line never survives. Surviving entries are returned as `SocketAddr`s on
129/// [`STANDARD_DNS_PORT`] (resolv.conf has no port syntax).
130///
131/// Duplicates are de-duplicated while preserving first-seen order.
132fn parse_resolv_conf(contents: &str) -> Vec<SocketAddr> {
133    let mut out: Vec<SocketAddr> = Vec::new();
134    for line in contents.lines() {
135        let line = line.trim();
136        if line.is_empty() || line.starts_with('#') || line.starts_with(';') {
137            continue;
138        }
139        let mut parts = line.split_whitespace();
140        if parts.next() != Some("nameserver") {
141            continue;
142        }
143        let Some(addr_str) = parts.next() else {
144            continue;
145        };
146        // resolv.conf may carry a scoped IPv6 like `fe80::1%eth0`; strip the
147        // zone id since `IpAddr` does not parse it.
148        let addr_str = addr_str.split('%').next().unwrap_or(addr_str);
149        let Ok(ip) = IpAddr::from_str(addr_str) else {
150            continue;
151        };
152        if is_unusable_upstream(ip) {
153            continue;
154        }
155        let sock = SocketAddr::new(ip, STANDARD_DNS_PORT);
156        if !out.contains(&sock) {
157            out.push(sock);
158        }
159    }
160    out
161}
162
163/// Resolve the effective upstream resolver list for non-overlay forwarding.
164///
165/// Precedence (documented because every choice here is load-bearing for the
166/// production failure this guards against):
167///
168/// 1. **Config override wins.** A non-empty `config.upstreams` is used verbatim
169///    and detection is skipped — this is the operator's escape hatch when the
170///    host resolver is unusable.
171/// 2. **Host `/etc/resolv.conf`, filtered.** Otherwise we parse the host
172///    resolver config and keep only non-loopback, non-stub nameservers (see
173///    [`parse_resolv_conf`]). This deliberately drops `127.0.0.53` so a
174///    netbird/systemd-resolved `~.` hijack cannot poison the overlay path:
175///    containers no longer inherit the broken stub, they hit the *real*
176///    upstreams resolv.conf points at.
177/// 3. **Public fallback.** If the filter leaves nothing usable (host is
178///    stub-only or resolv.conf is missing), fall back to
179///    [`PUBLIC_FALLBACK_UPSTREAMS`] (1.1.1.1, 8.8.8.8) and `warn!` loudly so
180///    the operator knows no host upstream survived.
181///
182/// `resolv_conf_path` is injectable for tests; production passes
183/// [`RESOLV_CONF_PATH`].
184pub(crate) fn resolve_upstreams(config: &DnsConfig, resolv_conf_path: &str) -> Vec<SocketAddr> {
185    if let Some(explicit) = &config.upstreams {
186        if !explicit.is_empty() {
187            tracing::debug!(
188                count = explicit.len(),
189                "using explicit overlay DNS upstreams from config (host detection skipped)",
190            );
191            return explicit.clone();
192        }
193    }
194
195    let detected = match std::fs::read_to_string(resolv_conf_path) {
196        Ok(contents) => parse_resolv_conf(&contents),
197        Err(e) => {
198            tracing::warn!(
199                path = resolv_conf_path,
200                error = %e,
201                "could not read host resolv.conf for overlay DNS upstream detection",
202            );
203            Vec::new()
204        }
205    };
206
207    if detected.is_empty() {
208        let fallback: Vec<SocketAddr> = PUBLIC_FALLBACK_UPSTREAMS
209            .iter()
210            .map(|ip| SocketAddr::new(*ip, STANDARD_DNS_PORT))
211            .collect();
212        tracing::warn!(
213            fallback = ?fallback,
214            "no usable host DNS upstreams found (resolv.conf empty, missing, or stub-only); \
215             falling back to public resolvers for overlay forwarding",
216        );
217        fallback
218    } else {
219        tracing::info!(
220            upstreams = ?detected,
221            "overlay DNS forwarding to host upstreams (loopback/stub filtered out)",
222        );
223        detected
224    }
225}
226
227/// Build the bounded async resolver used to forward non-overlay queries.
228///
229/// hickory-server 0.24 *does* ship a [`ForwardAuthority`], but when every
230/// upstream is unreachable its lookup error flows through the [`Catalog`]'s
231/// `build_response` and lands in a documented-TODO branch that leaves the
232/// response code at the initialised `NoError` and emits an *empty* answer
233/// section — i.e. total-upstream-failure surfaces to a container as "this name
234/// has no A record" instead of `SERVFAIL`. That silent failure is exactly the
235/// production hazard we are guarding against, so instead of registering a
236/// `ForwardAuthority` in the catalog we drive a [`TokioAsyncResolver`] directly
237/// from [`ForwardingCatalog`] and map its error kinds to precise response codes
238/// (see [`ForwardingCatalog::handle_request`]).
239///
240/// `from_ips_clear` builds plain UDP+TCP nameservers (no DoT/DoH). Upstreams are
241/// bucketed by port so a non-standard upstream port is honoured; the common
242/// case is a single port (53). The resolver is bounded — 2s per-query timeout,
243/// 2 attempts — so a dead/blackholed upstream fails fast rather than hanging
244/// containers.
245///
246/// Returns `Err` only if the upstream set is empty (callers must not call this
247/// with an empty list — they gate on `!upstreams.is_empty()`).
248pub(crate) fn build_forward_resolver(
249    upstreams: &[SocketAddr],
250) -> Result<hickory_server::resolver::TokioAsyncResolver, DnsError> {
251    use hickory_server::resolver::config::{ResolverConfig, ResolverOpts};
252
253    if upstreams.is_empty() {
254        return Err(DnsError::Server("no upstreams for forward resolver".into()));
255    }
256
257    let mut group = NameServerConfigGroup::new();
258    let mut by_port: std::collections::BTreeMap<u16, Vec<IpAddr>> =
259        std::collections::BTreeMap::new();
260    for addr in upstreams {
261        by_port.entry(addr.port()).or_default().push(addr.ip());
262    }
263    for (port, ips) in by_port {
264        // trust_negative_responses = true: these are recursive resolvers we
265        // delegate to wholesale, so a negative response from them is final.
266        group.merge(NameServerConfigGroup::from_ips_clear(&ips, port, true));
267    }
268
269    // `ResolverOpts` is `#[non_exhaustive]`, so we cannot build it with a struct
270    // literal from this crate — start from defaults and override the two fields
271    // that matter for fail-fast behaviour.
272    let mut options = ResolverOpts::default();
273    options.timeout = Duration::from_secs(2);
274    options.attempts = 2;
275    // Forwarders must emit intermediate CNAMEs (RFC 1034 §4.3.2).
276    options.preserve_intermediates = true;
277
278    let config = ResolverConfig::from_parts(None, vec![], group);
279    Ok(hickory_server::resolver::TokioAsyncResolver::tokio(
280        config, options,
281    ))
282}
283
/// A [`RequestHandler`] that serves the overlay zone from an [`InMemoryAuthority`]
/// (via the wrapped [`Catalog`]) and forwards everything else to upstream
/// resolvers, mapping resolver outcomes to precise DNS response codes.
///
/// Routing: a query whose name is within `zone_origin` or `static_origin` is
/// handed to the catalog unchanged (the [`InMemoryAuthority`] answers it). Any
/// other query is resolved through `resolver` and answered directly. When
/// `resolver` is `None` (no usable upstreams were configured) non-overlay
/// queries fall through to the catalog, which answers `REFUSED` — the
/// pre-forwarder behaviour.
///
/// Response-code mapping for forwarded queries:
/// - resolver `Ok` → `NoError` with the resolved records as answers;
/// - `NoRecordsFound { response_code: NXDomain }` → `NXDomain`;
/// - `NoRecordsFound { response_code: NoError }` (genuine NODATA) → empty
///   `NoError`;
/// - timeout / IO / no-connections / any other error (total upstream failure)
///   → `SERVFAIL`, never a panic and never a silent empty `NoError`.
///
/// The forwarder is only ever reachable on the sockets the server already binds
/// (overlay IP / localhost / explicit secondary). No wildcard bind is added, so
/// open recursion is not exposed to the world.
struct ForwardingCatalog {
    /// Inner catalog that answers every query which is not forwarded: names
    /// inside `zone_origin` / `static_origin`, and all queries when `resolver`
    /// is `None`.
    catalog: Catalog,
    /// Origin of the overlay zone; queries within it are never forwarded.
    zone_origin: LowerName,
    /// Additional always-served zone origin (the static `zlayer.local` zone that
    /// carries node-stable records like `host.zlayer.local`). Queries within
    /// EITHER this or `zone_origin` are answered from the catalog; everything
    /// else is forwarded.
    static_origin: LowerName,
    /// Upstream resolver used for queries outside both origins; `None`
    /// disables forwarding so those queries go to `catalog` instead.
    resolver: Option<Arc<hickory_server::resolver::TokioAsyncResolver>>,
}
315
316impl ForwardingCatalog {
317    /// Build the `NoError` answer message for a successful forward lookup.
318    fn forward_answer_response<'a>(
319        request: &'a hickory_server::server::Request,
320        answers: &'a [Record],
321    ) -> hickory_server::authority::MessageResponse<
322        'a,
323        'a,
324        std::slice::Iter<'a, Record>,
325        std::iter::Empty<&'a Record>,
326        std::iter::Empty<&'a Record>,
327        std::iter::Empty<&'a Record>,
328    > {
329        use hickory_server::authority::MessageResponseBuilder;
330        use hickory_server::proto::op::ResponseCode;
331
332        let mut header = hickory_server::proto::op::Header::response_from_request(request.header());
333        header.set_recursion_available(true);
334        header.set_response_code(ResponseCode::NoError);
335        // Forwarded answers are non-authoritative by definition.
336        header.set_authoritative(false);
337
338        MessageResponseBuilder::from_message_request(request).build(
339            header,
340            answers.iter(),
341            std::iter::empty(),
342            std::iter::empty(),
343            std::iter::empty(),
344        )
345    }
346
347    /// Build an answer-less response carrying just `code` (used for NXDOMAIN,
348    /// NODATA, and SERVFAIL on the forward path).
349    fn forward_code_response(
350        request: &hickory_server::server::Request,
351        code: hickory_server::proto::op::ResponseCode,
352    ) -> hickory_server::authority::MessageResponse<
353        '_,
354        '_,
355        impl Iterator<Item = &Record> + Send,
356        impl Iterator<Item = &Record> + Send,
357        impl Iterator<Item = &Record> + Send,
358        impl Iterator<Item = &Record> + Send,
359    > {
360        use hickory_server::authority::MessageResponseBuilder;
361        MessageResponseBuilder::from_message_request(request).error_msg(request.header(), code)
362    }
363
364    /// Resolve `name`/`rtype` through the upstream resolver and send the mapped
365    /// response. Returns the wire [`ResponseInfo`].
366    async fn forward<R: hickory_server::server::ResponseHandler>(
367        &self,
368        resolver: &hickory_server::resolver::TokioAsyncResolver,
369        request: &hickory_server::server::Request,
370        mut response_handle: R,
371    ) -> hickory_server::server::ResponseInfo {
372        use hickory_server::proto::op::ResponseCode;
373        use hickory_server::resolver::error::ResolveErrorKind;
374
375        let query = request.request_info().query;
376        let name = Name::from(query.name());
377        let rtype = query.query_type();
378
379        match resolver.lookup(name, rtype).await {
380            Ok(lookup) => {
381                let records: Vec<Record> = lookup.records().to_vec();
382                let response = Self::forward_answer_response(request, &records);
383                Self::send_or_servfail(&mut response_handle, response).await
384            }
385            Err(e) => {
386                let code = match e.kind() {
387                    // Upstream answered authoritatively: respect its verdict.
388                    ResolveErrorKind::NoRecordsFound { response_code, .. }
389                        if *response_code == ResponseCode::NXDomain =>
390                    {
391                        ResponseCode::NXDomain
392                    }
393                    // Name exists but no record of this type (genuine NODATA).
394                    ResolveErrorKind::NoRecordsFound { response_code, .. }
395                        if *response_code == ResponseCode::NoError =>
396                    {
397                        ResponseCode::NoError
398                    }
399                    // Timeout / IO / no-connections / anything else: the upstream
400                    // path is broken. SERVFAIL — never a silent empty NoError,
401                    // never a panic.
402                    _ => {
403                        tracing::debug!(error = %e, "overlay DNS upstream forward failed; SERVFAIL");
404                        ResponseCode::ServFail
405                    }
406                };
407                let response = Self::forward_code_response(request, code);
408                Self::send_or_servfail(&mut response_handle, response).await
409            }
410        }
411    }
412
413    /// Send `response`, degrading a send error to a SERVFAIL `ResponseInfo`
414    /// (mirrors how the inner catalog handles its own send failures).
415    async fn send_or_servfail<'a, R, A, N, S, D>(
416        response_handle: &mut R,
417        response: hickory_server::authority::MessageResponse<'_, 'a, A, N, S, D>,
418    ) -> hickory_server::server::ResponseInfo
419    where
420        R: hickory_server::server::ResponseHandler,
421        A: Iterator<Item = &'a Record> + Send + 'a,
422        N: Iterator<Item = &'a Record> + Send + 'a,
423        S: Iterator<Item = &'a Record> + Send + 'a,
424        D: Iterator<Item = &'a Record> + Send + 'a,
425    {
426        match response_handle.send_response(response).await {
427            Ok(info) => info,
428            Err(e) => {
429                tracing::error!(error = %e, "failed to send overlay DNS forward response");
430                let mut header = hickory_server::proto::op::Header::new();
431                header.set_response_code(hickory_server::proto::op::ResponseCode::ServFail);
432                header.into()
433            }
434        }
435    }
436}
437
438#[async_trait::async_trait]
439impl hickory_server::server::RequestHandler for ForwardingCatalog {
440    async fn handle_request<R: hickory_server::server::ResponseHandler>(
441        &self,
442        request: &hickory_server::server::Request,
443        response_handle: R,
444    ) -> hickory_server::server::ResponseInfo {
445        // Overlay-zone queries (and anything when we have no upstream resolver)
446        // go straight to the catalog / InMemoryAuthority. Everything else is
447        // forwarded.
448        let query_name = request.request_info().query.name().clone();
449        let is_overlay =
450            self.zone_origin.zone_of(&query_name) || self.static_origin.zone_of(&query_name);
451
452        match (&self.resolver, is_overlay) {
453            (Some(resolver), false) => self.forward(resolver, request, response_handle).await,
454            _ => self.catalog.handle_request(request, response_handle).await,
455        }
456    }
457}
458
/// Derive the DNS hostname a peer is registered under from its IP address.
///
/// - IPv4: the last two octets, in decimal — `10.200.0.5` becomes `node-0-5`.
/// - IPv6: the final 16-bit segment as four lowercase hex digits, zero-padded
///   — `fd00::abcd` becomes `node-abcd`.
#[must_use]
pub fn peer_hostname(ip: IpAddr) -> String {
    match ip {
        IpAddr::V4(v4) => {
            let [_, _, third, fourth] = v4.octets();
            format!("node-{third}-{fourth}")
        }
        IpAddr::V6(v6) => {
            let [.., tail] = v6.segments();
            format!("node-{tail:04x}")
        }
    }
}
477
/// Error type for DNS operations
#[derive(Debug, thiserror::Error)]
pub enum DnsError {
    /// A hostname or zone string could not be parsed as a DNS name, or the
    /// zone origin could not be appended to it. The payload is the rendered
    /// message (offending input and parser error).
    #[error("Invalid domain name: {0}")]
    InvalidName(String),

    /// Server-side failure, e.g. asking for a forward resolver with an empty
    /// upstream list.
    #[error("DNS server error: {0}")]
    Server(String),

    /// Client-side failure. Presumably produced by the DNS client code, which
    /// is not visible in this part of the file.
    #[error("DNS client error: {0}")]
    Client(String),

    /// Underlying I/O error; converts automatically from `std::io::Error`
    /// via `#[from]`, so `?` works on I/O results.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    /// A requested record does not exist. No producer is visible in this part
    /// of the file.
    #[error("Record not found: {0}")]
    NotFound(String),
}
496
/// Handle for managing DNS records after server is started
///
/// This handle can be cloned and used to add/remove records while the server is running.
/// Cloning is cheap: the authority and the serial counter are behind `Arc`s,
/// so every clone operates on the same underlying state.
#[derive(Clone)]
pub struct DnsHandle {
    /// Authority that `add_record` / `remove_record` upsert into and that
    /// `lookup_a` queries.
    authority: Arc<InMemoryAuthority>,
    /// Zone appended to hostnames that are given without a trailing dot.
    zone_origin: Name,
    /// Serial counter: each add/remove passes the current value to the
    /// authority and then increments it (wrapping on overflow).
    serial: Arc<RwLock<u32>>,
}
506
507impl DnsHandle {
508    /// Add a DNS record for a hostname to IP mapping
509    ///
510    /// Creates an A record for IPv4 addresses and an AAAA record for IPv6 addresses.
511    ///
512    /// # Errors
513    ///
514    /// Returns `DnsError::InvalidName` if the hostname is invalid.
515    pub async fn add_record(&self, hostname: &str, ip: IpAddr) -> Result<(), DnsError> {
516        // Create the fully qualified domain name
517        let fqdn = if hostname.ends_with('.') {
518            Name::from_str(hostname)
519                .map_err(|e| DnsError::InvalidName(format!("{hostname}: {e}")))?
520        } else {
521            // Append the zone origin
522            let name = Name::from_str(hostname)
523                .map_err(|e| DnsError::InvalidName(format!("{hostname}: {e}")))?;
524            name.append_domain(&self.zone_origin)
525                .map_err(|e| DnsError::InvalidName(format!("Failed to append zone: {e}")))?
526        };
527
528        // Create an A or AAAA record depending on address family
529        let rdata = match ip {
530            IpAddr::V4(v4) => RData::A(A::from(v4)),
531            IpAddr::V6(v6) => RData::AAAA(AAAA::from(v6)),
532        };
533        let record = Record::from_rdata(fqdn, 300, rdata); // 300 second TTL
534
535        // Get the current serial and increment it
536        let serial = {
537            let mut s = self.serial.write().await;
538            let current = *s;
539            *s = s.wrapping_add(1);
540            current
541        };
542
543        // Upsert the record into the authority (uses internal synchronization)
544        self.authority.upsert(record, serial).await;
545
546        Ok(())
547    }
548
549    /// Remove DNS records for a hostname (both A and AAAA)
550    ///
551    /// Tombstones both record types since we don't track which type was stored.
552    ///
553    /// # Errors
554    ///
555    /// Returns `DnsError::InvalidName` if the hostname is invalid.
556    pub async fn remove_record(&self, hostname: &str) -> Result<bool, DnsError> {
557        let fqdn = if hostname.ends_with('.') {
558            Name::from_str(hostname)
559                .map_err(|e| DnsError::InvalidName(format!("{hostname}: {e}")))?
560        } else {
561            let name = Name::from_str(hostname)
562                .map_err(|e| DnsError::InvalidName(format!("{hostname}: {e}")))?;
563            name.append_domain(&self.zone_origin)
564                .map_err(|e| DnsError::InvalidName(format!("Failed to append zone: {e}")))?
565        };
566
567        let serial = {
568            let mut s = self.serial.write().await;
569            let current = *s;
570            *s = s.wrapping_add(1);
571            current
572        };
573
574        // Create empty records to effectively "remove" by setting empty data.
575        // Note: hickory-dns doesn't have a direct remove, so we create tombstones.
576        // We tombstone both A and AAAA since we don't know which type was stored.
577        let a_record = Record::with(fqdn.clone(), RecordType::A, 0);
578        self.authority.upsert(a_record, serial).await;
579
580        let aaaa_record = Record::with(fqdn.clone(), RecordType::AAAA, 0);
581        self.authority.upsert(aaaa_record, serial).await;
582
583        Ok(true)
584    }
585
586    /// Get the zone origin
587    #[must_use]
588    pub fn zone_origin(&self) -> &Name {
589        &self.zone_origin
590    }
591
592    /// Look up the first A record for an absolute name directly against this
593    /// handle's in-memory authority, without any UDP roundtrip.
594    ///
595    /// `fqdn` is treated as an absolute name (a trailing dot is optional). The
596    /// name MUST already be fully qualified within the server's zone — this does
597    /// NOT apply resolv.conf-style `search` expansion. Returns the first A
598    /// record's address, or `None` when no matching A record exists.
599    ///
600    /// Intended for in-process callers and tests that need deterministic
601    /// resolution against the authority (the UDP [`DnsClient`] uses a blocking
602    /// sync client that can deadlock a current-thread tokio runtime).
603    pub async fn lookup_a(&self, fqdn: &str) -> Option<IpAddr> {
604        use hickory_server::authority::{Authority, LookupOptions};
605
606        let name = Name::from_str(fqdn).ok()?;
607        let lower = LowerName::from(name);
608        let lookup = self
609            .authority
610            .lookup(&lower, RecordType::A, LookupOptions::default())
611            .await
612            .ok()?;
613        lookup.iter().find_map(|record| match record.data() {
614            Some(RData::A(a)) => Some(IpAddr::V4((*a).into())),
615            _ => None,
616        })
617    }
618}
619
/// Origin of the node-stable static DNS zone. Records here (e.g.
/// `host.zlayer.local`, `daemon.<deployment>.zlayer.local`) point at the node's
/// own overlay IP and are served regardless of the per-deployment zone, so a
/// container's resolver can always reach the daemon by a stable name.
///
/// When the per-deployment zone is itself `zlayer.local.`, the server shares
/// one authority between both roles instead of creating a second one (see
/// `DnsServer::new_with_upstreams`).
pub const STATIC_ZONE_ORIGIN: &str = "zlayer.local.";
625
/// DNS server for overlay networks
pub struct DnsServer {
    /// Socket address given at construction (`bind_addr:port` when built via
    /// `from_config`). Presumably the address the primary listener binds —
    /// the bind itself is outside this part of the file.
    listen_addr: SocketAddr,
    /// In-memory authority for the per-deployment zone [`Self::zone_origin`];
    /// created empty as a primary zone.
    authority: Arc<InMemoryAuthority>,
    /// Origin name of the per-deployment zone, parsed from the `zone` string.
    zone_origin: Name,
    /// Authority for the always-served static [`STATIC_ZONE_ORIGIN`] zone.
    /// This is the same `Arc` as [`Self::authority`] when the two origins
    /// coincide.
    static_authority: Arc<InMemoryAuthority>,
    /// Origin name for [`Self::static_authority`].
    static_origin: Name,
    /// Zone serial counter; starts at 1.
    serial: Arc<RwLock<u32>>,
    /// Upstream resolvers for non-overlay queries.
    ///
    /// Resolved once at construction (config override > filtered resolv.conf >
    /// public fallback). Every catalog this server builds — the primary
    /// listener and any secondary / Windows-fallback listener — is wrapped in a
    /// [`ForwardingCatalog`] that forwards non-overlay queries here, so a query
    /// that does not match the overlay zone is forwarded instead of refused.
    /// Empty only in the theoretical case where resolution yields nothing (it
    /// always returns at least the public fallback), in which case no forwarder
    /// is installed and non-overlay queries get the pre-existing REFUSED
    /// behaviour.
    upstreams: Vec<SocketAddr>,
}
649
650impl DnsServer {
651    /// Create a new DNS server for the given zone.
652    ///
653    /// Upstreams for non-overlay forwarding are auto-detected from the host
654    /// `/etc/resolv.conf` (loopback/stub filtered, public fallback if empty).
655    /// Use [`Self::from_config`] with [`DnsConfig::with_upstreams`] to override.
656    ///
657    /// # Errors
658    ///
659    /// Returns `DnsError::InvalidName` if the zone name is invalid.
660    pub fn new(listen_addr: SocketAddr, zone: &str) -> Result<Self, DnsError> {
661        let upstreams =
662            resolve_upstreams(&DnsConfig::new(zone, listen_addr.ip()), RESOLV_CONF_PATH);
663        Self::new_with_upstreams(listen_addr, zone, upstreams)
664    }
665
666    /// Create a DNS server with an explicit, already-resolved upstream list.
667    ///
668    /// Bypasses resolv.conf detection entirely — `upstreams` is used verbatim
669    /// for the root-zone forwarder. Primarily an internal/testing seam so a
670    /// stub upstream can be injected without touching the host `/etc/resolv.conf`.
671    ///
672    /// # Errors
673    ///
674    /// Returns `DnsError::InvalidName` if the zone name is invalid.
675    pub fn new_with_upstreams(
676        listen_addr: SocketAddr,
677        zone: &str,
678        upstreams: Vec<SocketAddr>,
679    ) -> Result<Self, DnsError> {
680        let zone_origin =
681            Name::from_str(zone).map_err(|e| DnsError::InvalidName(format!("{zone}: {e}")))?;
682
683        // Create an empty in-memory authority for the zone
684        // Using Arc directly since InMemoryAuthority has internal synchronization via upsert()
685        let authority = Arc::new(InMemoryAuthority::empty(
686            zone_origin.clone(),
687            ZoneType::Primary,
688            false,
689        ));
690
691        // Always-served static zone (zlayer.local) for node-stable records.
692        //
693        // When the per-deployment zone IS `zlayer.local.` (the default daemon,
694        // `deployment_name = "zlayer"`), the static origin COLLIDES with the
695        // primary zone origin. The catalog is a map keyed by origin, so two
696        // separate authorities at the same key evict each other — that is the
697        // bug that NXDOMAIN'd `<svc>.service` (the static authority, holding only
698        // `host`/`daemon` records, clobbered the primary holding the service
699        // records). Share ONE authority when the origins coincide so both
700        // service-discovery (`add_record`) and node-stable (`add_static_record`)
701        // records live in the single catalog-reachable authority.
702        let static_origin = Name::from_str(STATIC_ZONE_ORIGIN)
703            .map_err(|e| DnsError::InvalidName(format!("{STATIC_ZONE_ORIGIN}: {e}")))?;
704        let static_authority = if static_origin == zone_origin {
705            Arc::clone(&authority)
706        } else {
707            Arc::new(InMemoryAuthority::empty(
708                static_origin.clone(),
709                ZoneType::Primary,
710                false,
711            ))
712        };
713
714        Ok(Self {
715            listen_addr,
716            authority,
717            zone_origin,
718            static_authority,
719            static_origin,
720            serial: Arc::new(RwLock::new(1)),
721            upstreams,
722        })
723    }
724
725    /// Create from a `DnsConfig`
726    ///
727    /// Upstreams follow [`resolve_upstreams`] precedence: `config.upstreams`
728    /// override wins, else filtered `/etc/resolv.conf`, else public fallback.
729    ///
730    /// # Errors
731    ///
732    /// Returns `DnsError::InvalidName` if the zone name is invalid.
733    pub fn from_config(config: &DnsConfig) -> Result<Self, DnsError> {
734        let listen_addr = SocketAddr::new(config.bind_addr, config.port);
735        let upstreams = resolve_upstreams(config, RESOLV_CONF_PATH);
736        Self::new_with_upstreams(listen_addr, &config.zone, upstreams)
737    }
738
739    /// The upstream resolvers this server forwards non-overlay queries to.
740    #[must_use]
741    pub fn upstreams(&self) -> &[SocketAddr] {
742        &self.upstreams
743    }
744
745    /// Build the request handler for a listener: a [`ForwardingCatalog`] that
746    /// serves the overlay zone from `authority` (via an inner [`Catalog`]) and
747    /// forwards every non-overlay query to `upstreams`, mapping total upstream
748    /// failure to `SERVFAIL` rather than a silent empty `NoError`.
749    ///
750    /// Shared by every listener (primary + secondary) so forwarding behaviour
751    /// is identical across the sockets this server binds. A resolver-build
752    /// failure (only possible with an empty upstream set, which is gated out
753    /// here) degrades to "overlay-only": non-overlay queries fall through to the
754    /// catalog and get `REFUSED`, but overlay service discovery keeps working.
755    fn build_catalog(
756        zone_origin: Name,
757        authority: Arc<InMemoryAuthority>,
758        static_origin: Name,
759        static_authority: Arc<InMemoryAuthority>,
760        upstreams: &[SocketAddr],
761    ) -> ForwardingCatalog {
762        let lower_origin = LowerName::from(zone_origin.clone());
763        let lower_static_origin = LowerName::from(static_origin.clone());
764
765        let mut catalog = Catalog::new();
766        // The catalog accepts Arc<dyn AuthorityObject> - InMemoryAuthority implements this
767        catalog.upsert(zone_origin.into(), Box::new(authority));
768        // Always-served static zone (zlayer.local) on every listener. Skip when
769        // it collides with the primary zone origin — same catalog key, and the
770        // authority is already shared (see `new_with_upstreams`), so a second
771        // upsert would just self-replace and could evict the primary.
772        if lower_static_origin != lower_origin {
773            catalog.upsert(static_origin.into(), Box::new(static_authority));
774        }
775
776        let resolver = if upstreams.is_empty() {
777            None
778        } else {
779            match build_forward_resolver(upstreams) {
780                Ok(r) => {
781                    tracing::debug!(
782                        upstreams = ?upstreams,
783                        "overlay DNS forwarder ready for non-overlay queries",
784                    );
785                    Some(Arc::new(r))
786                }
787                Err(e) => {
788                    tracing::error!(
789                        error = %e,
790                        "failed to build overlay DNS forwarder; non-overlay queries \
791                         will be refused (overlay zone still served)",
792                    );
793                    None
794                }
795            }
796        };
797
798        ForwardingCatalog {
799            catalog,
800            zone_origin: lower_origin,
801            static_origin: lower_static_origin,
802            resolver,
803        }
804    }
805
806    /// Get a handle for managing DNS records
807    ///
808    /// The handle can be cloned and used to add/remove records even after
809    /// the server has been started.
810    #[must_use]
811    pub fn handle(&self) -> DnsHandle {
812        DnsHandle {
813            authority: Arc::clone(&self.authority),
814            zone_origin: self.zone_origin.clone(),
815            serial: Arc::clone(&self.serial),
816        }
817    }
818
819    /// Add a DNS record for a hostname to IP mapping
820    ///
821    /// Creates an A record for IPv4 addresses and an AAAA record for IPv6 addresses.
822    ///
823    /// # Errors
824    ///
825    /// Returns `DnsError::InvalidName` if the hostname is invalid.
826    pub async fn add_record(&self, hostname: &str, ip: IpAddr) -> Result<(), DnsError> {
827        self.handle().add_record(hostname, ip).await
828    }
829
830    /// Add a record into the always-served static [`STATIC_ZONE_ORIGIN`] zone.
831    ///
832    /// `hostname` may be a bare label (e.g. `host`, becoming `host.zlayer.local`)
833    /// or a fully-qualified name ending in `.` — in which case it MUST fall
834    /// within the `zlayer.local.` zone or it will be added but never routed to
835    /// the static authority (the catalog only serves names under the zone
836    /// origin). Use this for node-stable names that must resolve regardless of
837    /// the per-deployment zone so containers can always reach the daemon.
838    ///
839    /// # Errors
840    ///
841    /// Returns `DnsError::InvalidName` if the hostname is invalid.
842    pub async fn add_static_record(&self, hostname: &str, ip: IpAddr) -> Result<(), DnsError> {
843        let fqdn = if hostname.ends_with('.') {
844            Name::from_str(hostname)
845                .map_err(|e| DnsError::InvalidName(format!("{hostname}: {e}")))?
846        } else {
847            let name = Name::from_str(hostname)
848                .map_err(|e| DnsError::InvalidName(format!("{hostname}: {e}")))?;
849            name.append_domain(&self.static_origin)
850                .map_err(|e| DnsError::InvalidName(format!("Failed to append static zone: {e}")))?
851        };
852
853        let rdata = match ip {
854            IpAddr::V4(v4) => RData::A(A::from(v4)),
855            IpAddr::V6(v6) => RData::AAAA(AAAA::from(v6)),
856        };
857        let record = Record::from_rdata(fqdn, 300, rdata);
858
859        let serial = {
860            let mut s = self.serial.write().await;
861            let current = *s;
862            *s = s.wrapping_add(1);
863            current
864        };
865        self.static_authority.upsert(record, serial).await;
866        Ok(())
867    }
868
869    /// Look up the first A record for an absolute name directly against the
870    /// static [`STATIC_ZONE_ORIGIN`] authority. Test-only helper mirroring
871    /// [`DnsHandle::lookup_a`].
872    #[cfg(test)]
873    pub(crate) async fn lookup_static_a(&self, fqdn: &str) -> Option<IpAddr> {
874        use hickory_server::authority::{Authority, LookupOptions};
875
876        let name = Name::from_str(fqdn).ok()?;
877        let lower = LowerName::from(name);
878        let lookup = self
879            .static_authority
880            .lookup(&lower, RecordType::A, LookupOptions::default())
881            .await
882            .ok()?;
883        lookup.iter().find_map(|record| match record.data() {
884            Some(RData::A(a)) => Some(IpAddr::V4((*a).into())),
885            _ => None,
886        })
887    }
888
889    /// Remove DNS records for a hostname (both A and AAAA)
890    ///
891    /// # Errors
892    ///
893    /// Returns `DnsError::InvalidName` if the hostname is invalid.
894    pub async fn remove_record(&self, hostname: &str) -> Result<bool, DnsError> {
895        self.handle().remove_record(hostname).await
896    }
897
898    /// Start the DNS server and return a handle for record management
899    ///
900    /// This spawns the DNS server in a background task and returns a handle
901    /// that can be used to add/remove records while the server is running.
902    ///
903    /// # Errors
904    ///
905    /// This method currently always succeeds but returns `Result` for API consistency.
906    #[allow(clippy::unused_async)]
907    pub async fn start(self) -> Result<DnsHandle, DnsError> {
908        let handle = self.handle();
909        let listen_addr = self.listen_addr;
910        let zone_origin = self.zone_origin.clone();
911        let authority = Arc::clone(&self.authority);
912        let static_origin = self.static_origin.clone();
913        let static_authority = Arc::clone(&self.static_authority);
914        let upstreams = self.upstreams.clone();
915
916        // Spawn the server in a background task
917        tokio::spawn(async move {
918            if let Err(e) = Self::run_server(
919                listen_addr,
920                zone_origin,
921                authority,
922                static_origin,
923                static_authority,
924                upstreams,
925            )
926            .await
927            {
928                tracing::error!("DNS server error: {}", e);
929            }
930        });
931
932        Ok(handle)
933    }
934
935    /// Start the DNS server in a background task without consuming self.
936    ///
937    /// Unlike `start(self)`, this method borrows self, allowing the `DnsServer`
938    /// to be wrapped in an Arc and shared (e.g., with `ServiceManager`) while
939    /// the server runs in the background.
940    ///
941    /// # Errors
942    ///
943    /// This method currently always succeeds but returns `Result` for API consistency.
944    #[allow(clippy::unused_async)]
945    pub async fn start_background(&self) -> Result<DnsHandle, DnsError> {
946        let handle = self.handle();
947        let listen_addr = self.listen_addr;
948        let zone_origin = self.zone_origin.clone();
949        let authority = Arc::clone(&self.authority);
950        let static_origin = self.static_origin.clone();
951        let static_authority = Arc::clone(&self.static_authority);
952        let upstreams = self.upstreams.clone();
953
954        tokio::spawn(async move {
955            if let Err(e) = Self::run_server(
956                listen_addr,
957                zone_origin,
958                authority,
959                static_origin,
960                static_authority,
961                upstreams,
962            )
963            .await
964            {
965                tracing::error!("DNS server error: {}", e);
966            }
967        });
968
969        Ok(handle)
970    }
971
972    /// Bind a second DNS listener on port 53 of `bind_ip`, sharing this
973    /// server's authority + zone so the same records answer both listeners.
974    ///
975    /// Windows containers always query DNS on port 53 — HNS endpoints do not
976    /// support setting a non-standard DNS port in the schema. The canonical
977    /// overlay listener on [`DEFAULT_DNS_PORT`] (15353) is therefore
978    /// unreachable from a Windows container; this method adds a second
979    /// listener on port 53 of the overlay IP so containers that point at
980    /// `<overlay_ip>:53` via `Dns.ServerList` can actually resolve.
981    ///
982    /// `bind_ip` is typically the node's overlay IP (e.g. `10.200.42.1`).
983    /// Binding to `0.0.0.0:53` would collide with whatever resolver the host
984    /// already runs (systemd-resolved on Linux, DNS Client on Windows). The
985    /// method itself is cross-platform; callers decide whether to invoke it
986    /// based on their workload mix.
987    ///
988    /// The bound UDP + TCP sockets live on a detached tokio task that shares
989    /// the same `Arc<InMemoryAuthority>` as the primary listener, so
990    /// `DnsHandle::add_record` / `remove_record` updates both responders
991    /// atomically. Returns a cloneable [`DnsHandle`] for convenience.
992    ///
993    /// # Errors
994    ///
995    /// Returns `DnsError::Io` when either port 53 socket (UDP or TCP) cannot
996    /// be bound — typically because another DNS resolver already owns the
997    /// address, or because the process lacks the privilege to bind below 1024
998    /// on platforms that require it. Callers should treat this as a warning
999    /// and fall back to the primary 15353 listener for non-Windows workloads.
1000    #[allow(clippy::unused_async)]
1001    pub async fn bind_windows_fallback(&self, bind_ip: IpAddr) -> Result<DnsHandle, DnsError> {
1002        self.bind_secondary(SocketAddr::new(bind_ip, 53)).await
1003    }
1004
1005    /// Bind an additional DNS listener on an arbitrary `listen_addr`, sharing
1006    /// this server's authority + zone so the same records answer on both the
1007    /// primary listener and this one.
1008    ///
1009    /// Unlike [`bind_windows_fallback`](Self::bind_windows_fallback) (which is
1010    /// hard-wired to port 53 for Windows HNS containers), this lets the caller
1011    /// pick a **non-privileged** port — required on macOS where an unprivileged
1012    /// daemon cannot bind below 1024. The VZ-Linux path uses this to expose the
1013    /// overlay resolver on `<node_overlay_ip>:<dns_port>` so a tiny in-guest
1014    /// relay can forward the guest's port-53 queries to it.
1015    ///
1016    /// # Errors
1017    ///
1018    /// Returns `DnsError::Io` when either the UDP or TCP socket cannot be bound.
1019    #[allow(clippy::unused_async)]
1020    pub async fn bind_secondary(&self, listen_addr: SocketAddr) -> Result<DnsHandle, DnsError> {
1021        let handle = self.handle();
1022        let zone_origin = self.zone_origin.clone();
1023        let authority = Arc::clone(&self.authority);
1024        let static_origin = self.static_origin.clone();
1025        let static_authority = Arc::clone(&self.static_authority);
1026        let upstreams = self.upstreams.clone();
1027
1028        // Pre-bind the sockets synchronously so binding failures surface here
1029        // instead of being swallowed by the detached task. On success we hand
1030        // the live sockets off to the server future on a background task.
1031        let udp_socket = UdpSocket::bind(listen_addr).await?;
1032        let tcp_listener = TcpListener::bind(listen_addr).await?;
1033
1034        tokio::spawn(async move {
1035            let catalog = Self::build_catalog(
1036                zone_origin,
1037                authority,
1038                static_origin,
1039                static_authority,
1040                &upstreams,
1041            );
1042            let mut server = ServerFuture::new(catalog);
1043            server.register_socket(udp_socket);
1044            server.register_listener(tcp_listener, Duration::from_secs(30));
1045            tracing::info!(
1046                addr = %listen_addr,
1047                "secondary DNS listener started",
1048            );
1049            if let Err(e) = server.block_until_done().await {
1050                tracing::error!("secondary DNS listener error: {}", e);
1051            }
1052        });
1053
1054        Ok(handle)
1055    }
1056
1057    /// Internal method to run the DNS server
1058    async fn run_server(
1059        listen_addr: SocketAddr,
1060        zone_origin: Name,
1061        authority: Arc<InMemoryAuthority>,
1062        static_origin: Name,
1063        static_authority: Arc<InMemoryAuthority>,
1064        upstreams: Vec<SocketAddr>,
1065    ) -> Result<(), DnsError> {
1066        // Create the catalog: overlay zone authority + static zlayer.local zone
1067        // + (optional) root-zone forwarder for everything else.
1068        let catalog = Self::build_catalog(
1069            zone_origin,
1070            authority,
1071            static_origin,
1072            static_authority,
1073            &upstreams,
1074        );
1075
1076        // Create the server
1077        let mut server = ServerFuture::new(catalog);
1078
1079        // Bind UDP socket
1080        let udp_socket = UdpSocket::bind(listen_addr).await?;
1081        server.register_socket(udp_socket);
1082
1083        // Bind TCP listener
1084        let tcp_listener = TcpListener::bind(listen_addr).await?;
1085        server.register_listener(tcp_listener, Duration::from_secs(30));
1086
1087        tracing::info!(addr = %listen_addr, "DNS server listening");
1088
1089        // Run the server
1090        server
1091            .block_until_done()
1092            .await
1093            .map_err(|e| DnsError::Server(e.to_string()))?;
1094
1095        Ok(())
1096    }
1097
1098    /// Get the listen address
1099    #[must_use]
1100    pub fn listen_addr(&self) -> SocketAddr {
1101        self.listen_addr
1102    }
1103
1104    /// Get the zone origin
1105    #[must_use]
1106    pub fn zone_origin(&self) -> &Name {
1107        &self.zone_origin
1108    }
1109}
1110
1111/// DNS client for querying overlay DNS servers
1112pub struct DnsClient {
1113    server_addr: SocketAddr,
1114}
1115
1116impl DnsClient {
1117    /// Create a new DNS client
1118    #[must_use]
1119    pub fn new(server_addr: SocketAddr) -> Self {
1120        Self { server_addr }
1121    }
1122
1123    /// Query for an A record
1124    ///
1125    /// # Errors
1126    ///
1127    /// Returns a `DnsError` if the query fails or the hostname is invalid.
1128    pub fn query_a(&self, hostname: &str) -> Result<Option<Ipv4Addr>, DnsError> {
1129        let name = Name::from_str(hostname)
1130            .map_err(|e| DnsError::InvalidName(format!("{hostname}: {e}")))?;
1131
1132        let conn = UdpClientConnection::new(self.server_addr)
1133            .map_err(|e| DnsError::Client(e.to_string()))?;
1134
1135        let client = SyncClient::new(conn);
1136
1137        let response = client
1138            .query(&name, DNSClass::IN, RecordType::A)
1139            .map_err(|e| DnsError::Client(e.to_string()))?;
1140
1141        // Extract the A record from the response
1142        for answer in response.answers() {
1143            if let Some(RData::A(a_record)) = answer.data() {
1144                return Ok(Some((*a_record).into()));
1145            }
1146        }
1147
1148        Ok(None)
1149    }
1150
1151    /// Query for an AAAA record (IPv6)
1152    ///
1153    /// # Errors
1154    ///
1155    /// Returns a `DnsError` if the query fails or the hostname is invalid.
1156    pub fn query_aaaa(&self, hostname: &str) -> Result<Option<Ipv6Addr>, DnsError> {
1157        let name = Name::from_str(hostname)
1158            .map_err(|e| DnsError::InvalidName(format!("{hostname}: {e}")))?;
1159
1160        let conn = UdpClientConnection::new(self.server_addr)
1161            .map_err(|e| DnsError::Client(e.to_string()))?;
1162
1163        let client = SyncClient::new(conn);
1164
1165        let response = client
1166            .query(&name, DNSClass::IN, RecordType::AAAA)
1167            .map_err(|e| DnsError::Client(e.to_string()))?;
1168
1169        // Extract the AAAA record from the response
1170        for answer in response.answers() {
1171            if let Some(RData::AAAA(aaaa_record)) = answer.data() {
1172                return Ok(Some((*aaaa_record).into()));
1173            }
1174        }
1175
1176        Ok(None)
1177    }
1178
1179    /// Query for any address record (A or AAAA), returning the first match
1180    ///
1181    /// Tries A first, then AAAA. Returns the first successful result.
1182    ///
1183    /// # Errors
1184    ///
1185    /// Returns a `DnsError` if both queries fail or the hostname is invalid.
1186    pub fn query_addr(&self, hostname: &str) -> Result<Option<IpAddr>, DnsError> {
1187        // Try A record first
1188        if let Ok(Some(v4)) = self.query_a(hostname) {
1189            return Ok(Some(IpAddr::V4(v4)));
1190        }
1191
1192        // Then try AAAA
1193        if let Ok(Some(v6)) = self.query_aaaa(hostname) {
1194            return Ok(Some(IpAddr::V6(v6)));
1195        }
1196
1197        Ok(None)
1198    }
1199}
1200
1201/// Service discovery with DNS
1202pub struct ServiceDiscovery {
1203    dns_server: SocketAddr,
1204    records: RwLock<HashMap<String, IpAddr>>,
1205}
1206
1207impl ServiceDiscovery {
1208    /// Create a new service discovery instance
1209    #[must_use]
1210    pub fn new(dns_server_addr: SocketAddr) -> Self {
1211        Self {
1212            dns_server: dns_server_addr,
1213            records: RwLock::new(HashMap::new()),
1214        }
1215    }
1216
1217    /// Register a service (stores locally, does not update DNS server)
1218    pub async fn register(&self, name: &str, ip: IpAddr) {
1219        let mut records = self.records.write().await;
1220        records.insert(name.to_string(), ip);
1221    }
1222
1223    /// Resolve a service to an IP address
1224    ///
1225    /// Checks the local cache first, then queries the DNS server for both
1226    /// A (IPv4) and AAAA (IPv6) records.
1227    pub async fn resolve(&self, name: &str) -> Option<IpAddr> {
1228        // First check local cache
1229        {
1230            let records = self.records.read().await;
1231            if let Some(ip) = records.get(name) {
1232                return Some(*ip);
1233            }
1234        }
1235
1236        // Query DNS server for both A and AAAA records
1237        let client = DnsClient::new(self.dns_server);
1238        if let Ok(Some(addr)) = client.query_addr(name) {
1239            return Some(addr);
1240        }
1241
1242        None
1243    }
1244
1245    /// Unregister a service
1246    pub async fn unregister(&self, name: &str) {
1247        let mut records = self.records.write().await;
1248        records.remove(name);
1249    }
1250
1251    /// List all registered services
1252    pub async fn list_services(&self) -> Vec<String> {
1253        let records = self.records.read().await;
1254        records.keys().cloned().collect()
1255    }
1256
1257    /// Get the DNS server address
1258    pub fn dns_server(&self) -> SocketAddr {
1259        self.dns_server
1260    }
1261}
1262
/// Render the contents of a macOS scoped-resolver file
/// (`/etc/resolver/<zone>`).
///
/// The output follows the BSD `resolver(5)` format read by macOS's
/// `mDNSResponder`: always a `nameserver <ip>` line, followed by a
/// `port <p>` line only when `port` is `Some`. Every line ends in `\n`.
///
/// Kept as a pure, platform-agnostic function so the exact byte layout can be
/// asserted in a unit test on any host; the `write_scoped_resolver` wrapper
/// that actually touches `/etc/resolver` is macOS-only.
///
/// Gated on `macos` (its sole non-test caller) or `test` so it never lingers as
/// dead code in a plain non-macOS library build under `-D warnings`.
#[cfg(any(target_os = "macos", test))]
fn scoped_resolver_body(node_ip: std::net::IpAddr, port: Option<u16>) -> String {
    match port {
        Some(p) => format!("nameserver {node_ip}\nport {p}\n"),
        None => format!("nameserver {node_ip}\n"),
    }
}
1282
/// Compute the `/etc/resolver/<zone>` path for `zone`.
///
/// The callers run as root and write or delete the resulting path, so `zone`
/// must be a plain file name: empty input, `.`, `..`, and anything containing
/// `/` are rejected so the path can never leave `/etc/resolver`.
///
/// # Errors
///
/// Returns [`std::io::ErrorKind::InvalidInput`] when `zone` is not a plain
/// file name.
#[cfg(any(target_os = "macos", test))]
// Only the macOS-only writers call this; in a non-macOS test build it may have
// no caller, which must not trip `-D warnings`.
#[cfg_attr(not(target_os = "macos"), allow(dead_code))]
fn scoped_resolver_path(zone: &str) -> std::io::Result<std::path::PathBuf> {
    let is_plain_file_name =
        !zone.is_empty() && zone != "." && zone != ".." && !zone.contains('/');
    if !is_plain_file_name {
        return Err(std::io::Error::new(
            std::io::ErrorKind::InvalidInput,
            format!("invalid resolver zone name: {zone:?}"),
        ));
    }
    Ok(std::path::Path::new("/etc/resolver").join(zone))
}

/// Write a macOS scoped resolver `/etc/resolver/<zone>` pointing `*.<zone>`
/// lookups at the overlay DNS resolver `node_ip` (optionally on `port`). This
/// lets host-net-shared Seatbelt containers (which share the host
/// `/etc/resolv.conf`) resolve overlay names without altering the system
/// default resolver. Requires root (the `/etc/resolver` dir is root-owned);
/// returns an error if not root. Idempotent: rewrites only when content differs.
///
/// # Errors
///
/// Returns [`std::io::ErrorKind::PermissionDenied`] when the effective UID is
/// not 0, [`std::io::ErrorKind::InvalidInput`] when `zone` is not a plain file
/// name (empty, `.`, `..`, or containing `/`), and propagates any I/O error
/// from creating `/etc/resolver` or writing the per-zone file.
#[cfg(target_os = "macos")]
// The only `unsafe` is the C-ABI `libc::geteuid()` call (always safe; no args,
// no memory). Scoped to this fn so the crate keeps the warn-level lint.
#[allow(unsafe_code)]
pub fn write_scoped_resolver(
    zone: &str,
    node_ip: std::net::IpAddr,
    port: Option<u16>,
) -> std::io::Result<()> {
    // `/etc/resolver` is root-owned; fail fast with a clear error rather than
    // letting the directory create / file write surface a bare EACCES.
    // SAFETY: `geteuid` takes no arguments and touches no caller memory.
    if unsafe { libc::geteuid() } != 0 {
        return Err(std::io::Error::new(
            std::io::ErrorKind::PermissionDenied,
            "writing /etc/resolver requires root",
        ));
    }

    // Validate before touching the filesystem so a bad zone has no side effects.
    let path = scoped_resolver_path(zone)?;

    std::fs::create_dir_all("/etc/resolver")?;

    let body = scoped_resolver_body(node_ip, port);

    // Idempotent: if the existing file already matches, leave it untouched so
    // mDNSResponder isn't needlessly poked by a no-op rewrite. A read failure
    // (e.g. the file does not exist yet) simply falls through to the write.
    if matches!(std::fs::read_to_string(&path), Ok(existing) if existing == body) {
        return Ok(());
    }

    std::fs::write(&path, body)
}

/// Remove a previously-written `/etc/resolver/<zone>`. Idempotent (missing file
/// is treated as success). Root-gated, matching [`write_scoped_resolver`].
///
/// # Errors
///
/// Returns [`std::io::ErrorKind::PermissionDenied`] when the effective UID is
/// not 0, [`std::io::ErrorKind::InvalidInput`] when `zone` is not a plain file
/// name (empty, `.`, `..`, or containing `/`), and propagates any I/O error
/// other than a missing file.
#[cfg(target_os = "macos")]
// The only `unsafe` is the C-ABI `libc::geteuid()` call (always safe; no args,
// no memory). Scoped to this fn so the crate keeps the warn-level lint.
#[allow(unsafe_code)]
pub fn remove_scoped_resolver(zone: &str) -> std::io::Result<()> {
    // SAFETY: `geteuid` takes no arguments and touches no caller memory.
    if unsafe { libc::geteuid() } != 0 {
        return Err(std::io::Error::new(
            std::io::ErrorKind::PermissionDenied,
            "removing /etc/resolver entries requires root",
        ));
    }

    let path = scoped_resolver_path(zone)?;
    match std::fs::remove_file(&path) {
        // Already gone counts as removed.
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(()),
        other => other,
    }
}
1355
1356#[cfg(test)]
1357mod tests {
1358    use super::*;
1359
1360    #[test]
1361    fn test_scoped_resolver_body_with_port() {
1362        let body = scoped_resolver_body(IpAddr::V4(Ipv4Addr::new(10, 200, 0, 1)), Some(15353));
1363        assert_eq!(body, "nameserver 10.200.0.1\nport 15353\n");
1364    }
1365
1366    #[test]
1367    fn test_scoped_resolver_body_without_port() {
1368        // No explicit port => standard :53, so no `port` line is emitted.
1369        let body = scoped_resolver_body(IpAddr::V4(Ipv4Addr::new(10, 200, 0, 1)), None);
1370        assert_eq!(body, "nameserver 10.200.0.1\n");
1371    }
1372
1373    #[test]
1374    fn test_scoped_resolver_body_v6() {
1375        let body = scoped_resolver_body(IpAddr::V6("fd00::1".parse().unwrap()), Some(53));
1376        assert_eq!(body, "nameserver fd00::1\nport 53\n");
1377    }
1378
1379    #[test]
1380    fn test_peer_hostname_v4() {
1381        // Test various IPv4 addresses
1382        assert_eq!(
1383            peer_hostname(IpAddr::V4(Ipv4Addr::new(10, 200, 0, 1))),
1384            "node-0-1"
1385        );
1386        assert_eq!(
1387            peer_hostname(IpAddr::V4(Ipv4Addr::new(10, 200, 0, 5))),
1388            "node-0-5"
1389        );
1390        assert_eq!(
1391            peer_hostname(IpAddr::V4(Ipv4Addr::new(10, 200, 1, 100))),
1392            "node-1-100"
1393        );
1394        assert_eq!(
1395            peer_hostname(IpAddr::V4(Ipv4Addr::new(192, 168, 255, 254))),
1396            "node-255-254"
1397        );
1398    }
1399
1400    #[test]
1401    fn test_peer_hostname_v6() {
1402        // Test various IPv6 addresses
1403        assert_eq!(
1404            peer_hostname(IpAddr::V6("fd00::1".parse().unwrap())),
1405            "node-0001"
1406        );
1407        assert_eq!(
1408            peer_hostname(IpAddr::V6("fd00::abcd".parse().unwrap())),
1409            "node-abcd"
1410        );
1411        assert_eq!(
1412            peer_hostname(IpAddr::V6("fd00:200::ffff".parse().unwrap())),
1413            "node-ffff"
1414        );
1415        // Zero last segment
1416        assert_eq!(
1417            peer_hostname(IpAddr::V6("fd00::1:0".parse().unwrap())),
1418            "node-0000"
1419        );
1420    }
1421
1422    #[test]
1423    fn test_dns_config() {
1424        let config = DnsConfig::new("overlay.local.", IpAddr::V4(Ipv4Addr::new(10, 200, 0, 1)));
1425        assert_eq!(config.zone, "overlay.local.");
1426        assert_eq!(config.port, DEFAULT_DNS_PORT);
1427        assert_eq!(config.bind_addr, IpAddr::V4(Ipv4Addr::new(10, 200, 0, 1)));
1428
1429        // Test with_port
1430        let config = config.with_port(5353);
1431        assert_eq!(config.port, 5353);
1432    }
1433
1434    #[test]
1435    fn test_dns_config_serialization() {
1436        let config = DnsConfig::new("overlay.local.", IpAddr::V4(Ipv4Addr::new(10, 200, 0, 1)))
1437            .with_port(15353);
1438
1439        let json = serde_json::to_string(&config).unwrap();
1440        let deserialized: DnsConfig = serde_json::from_str(&json).unwrap();
1441
1442        assert_eq!(deserialized.zone, config.zone);
1443        assert_eq!(deserialized.port, config.port);
1444        assert_eq!(deserialized.bind_addr, config.bind_addr);
1445    }
1446
1447    #[tokio::test]
1448    async fn test_service_discovery_local_cache() {
1449        // Use a non-routable address since we're only testing local cache
1450        let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 15353);
1451        let discovery = ServiceDiscovery::new(addr);
1452
1453        let ip = IpAddr::V4(Ipv4Addr::new(10, 0, 0, 2));
1454        discovery.register("test-service", ip).await;
1455
1456        let resolved = discovery.resolve("test-service").await;
1457        assert_eq!(resolved, Some(ip));
1458
1459        // Test unregister
1460        discovery.unregister("test-service").await;
1461        let services = discovery.list_services().await;
1462        assert!(services.is_empty());
1463    }
1464
1465    #[test]
1466    fn test_dns_server_creation() {
1467        let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 15353);
1468        let server = DnsServer::new(addr, "overlay.local.");
1469
1470        assert!(server.is_ok());
1471        let server = server.unwrap();
1472        assert_eq!(server.listen_addr(), addr);
1473        assert_eq!(server.zone_origin().to_string(), "overlay.local.");
1474    }
1475
1476    #[test]
1477    fn test_dns_server_from_config() {
1478        let config =
1479            DnsConfig::new("test.local.", IpAddr::V4(Ipv4Addr::LOCALHOST)).with_port(15353);
1480        let server = DnsServer::from_config(&config);
1481
1482        assert!(server.is_ok());
1483        let server = server.unwrap();
1484        assert_eq!(server.listen_addr().port(), 15353);
1485        assert_eq!(server.zone_origin().to_string(), "test.local.");
1486    }
1487
1488    #[test]
1489    fn test_dns_server_invalid_zone() {
1490        let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 15353);
1491        // Empty zone name is technically valid in DNS, so use an obviously invalid one
1492        let server = DnsServer::new(addr, "overlay.local.");
1493        assert!(server.is_ok());
1494    }
1495
1496    #[tokio::test]
1497    async fn test_dns_server_add_record() {
1498        let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 15353);
1499        let server = DnsServer::new(addr, "overlay.local.").unwrap();
1500
1501        let result = server
1502            .add_record("myservice", IpAddr::V4(Ipv4Addr::new(10, 0, 0, 5)))
1503            .await;
1504        assert!(result.is_ok());
1505    }
1506
1507    #[tokio::test]
1508    async fn test_add_static_record_resolves_in_static_zone() {
1509        let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 15353);
1510        // Per-deployment zone is unrelated to zlayer.local; the static record
1511        // must still be served from the static authority.
1512        let server = DnsServer::new(addr, "mydeploy.local.").unwrap();
1513        let node_ip = IpAddr::V4(Ipv4Addr::new(10, 200, 0, 1));
1514
1515        // Bare label -> host.zlayer.local
1516        server.add_static_record("host", node_ip).await.unwrap();
1517        assert_eq!(
1518            server.lookup_static_a("host.zlayer.local.").await,
1519            Some(node_ip)
1520        );
1521
1522        // Fully-qualified daemon.<deployment>.zlayer.local
1523        server
1524            .add_static_record("daemon.mydeploy.zlayer.local.", node_ip)
1525            .await
1526            .unwrap();
1527        assert_eq!(
1528            server
1529                .lookup_static_a("daemon.mydeploy.zlayer.local.")
1530                .await,
1531            Some(node_ip)
1532        );
1533    }
1534
1535    #[tokio::test]
1536    async fn test_dns_handle_add_record() {
1537        let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 15353);
1538        let server = DnsServer::new(addr, "overlay.local.").unwrap();
1539
1540        // Get handle and add records through it
1541        let handle = server.handle();
1542
1543        let result = handle
1544            .add_record("service1", IpAddr::V4(Ipv4Addr::new(10, 0, 0, 1)))
1545            .await;
1546        assert!(result.is_ok());
1547
1548        let result = handle
1549            .add_record("service2", IpAddr::V4(Ipv4Addr::new(10, 0, 0, 2)))
1550            .await;
1551        assert!(result.is_ok());
1552
1553        // Zone origin should be accessible
1554        assert_eq!(handle.zone_origin().to_string(), "overlay.local.");
1555    }
1556
1557    #[test]
1558    fn test_dns_client_creation() {
1559        let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8)), 53);
1560        let client = DnsClient::new(addr);
1561        assert_eq!(client.server_addr, addr);
1562    }
1563
1564    #[tokio::test]
1565    async fn test_dns_handle_add_aaaa_record() {
1566        let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 15353);
1567        let server = DnsServer::new(addr, "overlay.local.").unwrap();
1568        let handle = server.handle();
1569
1570        // Add an AAAA record via IPv6 address
1571        let ipv6: IpAddr = "fd00::1".parse().unwrap();
1572        let result = handle.add_record("service-v6", ipv6).await;
1573        assert!(result.is_ok());
1574
1575        // Add a second AAAA record
1576        let ipv6_2: IpAddr = "fd00::abcd".parse().unwrap();
1577        let result = handle.add_record("service-v6-2", ipv6_2).await;
1578        assert!(result.is_ok());
1579    }
1580
1581    #[tokio::test]
1582    async fn test_dns_server_add_aaaa_record() {
1583        let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 15353);
1584        let server = DnsServer::new(addr, "overlay.local.").unwrap();
1585
1586        // Add AAAA record through the server directly
1587        let ipv6: IpAddr = "fd00::42".parse().unwrap();
1588        let result = server.add_record("myservice-v6", ipv6).await;
1589        assert!(result.is_ok());
1590    }
1591
1592    #[tokio::test]
1593    async fn test_dns_handle_remove_record_covers_both_types() {
1594        let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 15353);
1595        let server = DnsServer::new(addr, "overlay.local.").unwrap();
1596        let handle = server.handle();
1597
1598        // Add an A record
1599        let ipv4 = IpAddr::V4(Ipv4Addr::new(10, 0, 0, 1));
1600        handle.add_record("dual-service", ipv4).await.unwrap();
1601
1602        // Remove should succeed (tombstones both A and AAAA)
1603        let removed = handle.remove_record("dual-service").await.unwrap();
1604        assert!(removed);
1605
1606        // Add an AAAA record
1607        let ipv6: IpAddr = "fd00::1".parse().unwrap();
1608        handle.add_record("v6-service", ipv6).await.unwrap();
1609
1610        // Remove should also succeed for AAAA records
1611        let removed = handle.remove_record("v6-service").await.unwrap();
1612        assert!(removed);
1613    }
1614
1615    #[tokio::test]
1616    async fn test_service_discovery_local_cache_ipv6() {
1617        let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 15353);
1618        let discovery = ServiceDiscovery::new(addr);
1619
1620        // Register an IPv6 service
1621        let ipv6: IpAddr = "fd00::beef".parse().unwrap();
1622        discovery.register("v6-service", ipv6).await;
1623
1624        // Should resolve from local cache
1625        let resolved = discovery.resolve("v6-service").await;
1626        assert_eq!(resolved, Some(ipv6));
1627
1628        // Unregister and verify
1629        discovery.unregister("v6-service").await;
1630        let services = discovery.list_services().await;
1631        assert!(services.is_empty());
1632    }
1633
1634    #[tokio::test]
1635    async fn test_service_discovery_mixed_v4_v6_cache() {
1636        let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 15353);
1637        let discovery = ServiceDiscovery::new(addr);
1638
1639        let ipv4 = IpAddr::V4(Ipv4Addr::new(10, 0, 0, 1));
1640        let ipv6: IpAddr = "fd00::1".parse().unwrap();
1641
1642        discovery.register("svc-v4", ipv4).await;
1643        discovery.register("svc-v6", ipv6).await;
1644
1645        assert_eq!(discovery.resolve("svc-v4").await, Some(ipv4));
1646        assert_eq!(discovery.resolve("svc-v6").await, Some(ipv6));
1647
1648        let mut services = discovery.list_services().await;
1649        services.sort();
1650        assert_eq!(services, vec!["svc-v4", "svc-v6"]);
1651    }
1652
1653    #[test]
1654    fn test_dns_config_with_ipv6_bind_addr() {
1655        let ipv6_bind: IpAddr = "fd00::1".parse().unwrap();
1656        let config = DnsConfig::new("overlay.local.", ipv6_bind);
1657        assert_eq!(config.bind_addr, ipv6_bind);
1658        assert_eq!(config.port, DEFAULT_DNS_PORT);
1659
1660        // Serialization round-trip
1661        let json = serde_json::to_string(&config).unwrap();
1662        let deserialized: DnsConfig = serde_json::from_str(&json).unwrap();
1663        assert_eq!(deserialized.bind_addr, ipv6_bind);
1664    }
1665
1666    #[test]
1667    fn test_dns_server_creation_ipv6_bind() {
1668        let ipv6_addr: IpAddr = "::1".parse().unwrap();
1669        let addr = SocketAddr::new(ipv6_addr, 15353);
1670        let server = DnsServer::new(addr, "overlay.local.");
1671
1672        assert!(server.is_ok());
1673        let server = server.unwrap();
1674        assert_eq!(server.listen_addr(), addr);
1675    }
1676
1677    /// Smoke test for the Windows-fallback port-53 listener: binding to
1678    /// 127.0.0.2:53 should fail fast on hosts where that port is privileged
1679    /// or already in use, but we only care that the method surfaces a clean
1680    /// `DnsError` (not a panic) when the bind is contested. When the bind
1681    /// succeeds on a permissive CI host, we verify the returned handle shares
1682    /// the authority with the primary listener.
1683    #[tokio::test]
1684    async fn test_bind_windows_fallback_errors_or_shares_authority() {
1685        let primary = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 0);
1686        let server = DnsServer::new(primary, "overlay.local.").unwrap();
1687        let bind_ip: IpAddr = "127.0.0.2".parse().unwrap();
1688
1689        match server.bind_windows_fallback(bind_ip).await {
1690            Ok(handle) => {
1691                // Best-effort: the handle must expose the same zone as the
1692                // primary server so record mutations on either propagate to
1693                // both listeners.
1694                assert_eq!(handle.zone_origin().to_string(), "overlay.local.");
1695                handle
1696                    .add_record("dual", IpAddr::V4(Ipv4Addr::new(10, 0, 0, 9)))
1697                    .await
1698                    .expect("add_record via fallback handle");
1699            }
1700            Err(DnsError::Io(_)) => {
1701                // Expected on hosts that reserve port 53 or where the
1702                // loopback alias is already bound. Counts as a clean error
1703                // rather than a panic.
1704            }
1705            Err(other) => panic!("unexpected error from bind_windows_fallback: {other}"),
1706        }
1707    }
1708
1709    #[test]
1710    fn test_peer_hostname_uniqueness() {
1711        // Different IPs should produce different hostnames
1712        let v4_a = peer_hostname(IpAddr::V4(Ipv4Addr::new(10, 0, 0, 1)));
1713        let v4_b = peer_hostname(IpAddr::V4(Ipv4Addr::new(10, 0, 0, 2)));
1714        assert_ne!(v4_a, v4_b);
1715
1716        let v6_a = peer_hostname(IpAddr::V6("fd00::1".parse().unwrap()));
1717        let v6_b = peer_hostname(IpAddr::V6("fd00::2".parse().unwrap()));
1718        assert_ne!(v6_a, v6_b);
1719
1720        // IPv4 and IPv6 hostname formats are distinct
1721        let v4 = peer_hostname(IpAddr::V4(Ipv4Addr::new(10, 0, 0, 1)));
1722        let v6 = peer_hostname(IpAddr::V6("fd00::1".parse().unwrap()));
1723        assert_ne!(v4, v6);
1724    }
1725
1726    // ---- resolv.conf parsing / upstream resolution -------------------------
1727
1728    #[test]
1729    fn test_parse_resolv_conf_filters_stub_and_loopback() {
1730        // A systemd-resolved stub line plus a plain loopback must be dropped;
1731        // the real upstream survives on port 53.
1732        let contents = "\
1733            # generated by netbird\n\
1734            nameserver 127.0.0.53\n\
1735            nameserver 127.0.0.1\n\
1736            nameserver 192.168.1.1\n\
1737            search example.com\n\
1738            options edns0\n";
1739        let parsed = parse_resolv_conf(contents);
1740        assert_eq!(
1741            parsed,
1742            vec![SocketAddr::new(
1743                IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1)),
1744                53
1745            )],
1746            "127.0.0.53 stub and 127.0.0.1 loopback must be filtered out",
1747        );
1748    }
1749
1750    #[test]
1751    fn test_parse_resolv_conf_dedup_and_comments() {
1752        let contents = "\
1753            ; a comment\n\
1754            nameserver 8.8.8.8\n\
1755            nameserver 8.8.8.8\n\
1756            nameserver fe80::1%eth0\n\
1757            nameserver 0.0.0.0\n";
1758        let parsed = parse_resolv_conf(contents);
1759        // 8.8.8.8 de-duplicated; scoped link-local kept (zone stripped);
1760        // 0.0.0.0 unspecified dropped.
1761        assert_eq!(parsed.len(), 2);
1762        assert_eq!(
1763            parsed[0],
1764            SocketAddr::new(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8)), 53)
1765        );
1766        assert_eq!(parsed[1].ip(), "fe80::1".parse::<IpAddr>().unwrap());
1767    }
1768
1769    #[test]
1770    fn test_resolve_upstreams_config_override_wins() {
1771        // An explicit config upstream must be returned verbatim with no
1772        // resolv.conf consultation (we point the path at a bogus file).
1773        let explicit = SocketAddr::new(IpAddr::V4(Ipv4Addr::new(10, 9, 9, 9)), 5300);
1774        let config = DnsConfig::new("overlay.local.", IpAddr::V4(Ipv4Addr::LOCALHOST))
1775            .with_upstreams(vec![explicit]);
1776        let resolved = resolve_upstreams(&config, "/nonexistent/resolv.conf");
1777        assert_eq!(resolved, vec![explicit]);
1778    }
1779
1780    #[test]
1781    fn test_resolve_upstreams_falls_back_to_public_when_missing() {
1782        // Missing resolv.conf => public fallback (1.1.1.1, 8.8.8.8).
1783        let config = DnsConfig::new("overlay.local.", IpAddr::V4(Ipv4Addr::LOCALHOST));
1784        let resolved = resolve_upstreams(&config, "/definitely/not/a/real/resolv.conf");
1785        assert_eq!(
1786            resolved,
1787            vec![
1788                SocketAddr::new(IpAddr::V4(Ipv4Addr::new(1, 1, 1, 1)), 53),
1789                SocketAddr::new(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8)), 53),
1790            ],
1791        );
1792    }
1793
1794    // ---- end-to-end forwarding ---------------------------------------------
1795
1796    /// Spawn a minimal stub upstream DNS responder on an ephemeral UDP port.
1797    ///
1798    /// It answers *every* A query with `answer_ip` (echoing the queried name)
1799    /// so a forwarded query can be observed flowing through. Returns the bound
1800    /// `SocketAddr` so the caller can point the overlay forwarder at it.
1801    async fn spawn_stub_upstream(answer_ip: Ipv4Addr) -> SocketAddr {
1802        use hickory_server::proto::op::{Message, MessageType, ResponseCode};
1803
1804        let sock = UdpSocket::bind(SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 0))
1805            .await
1806            .expect("bind stub upstream");
1807        let addr = sock.local_addr().expect("stub local_addr");
1808
1809        tokio::spawn(async move {
1810            let mut buf = vec![0u8; 1500];
1811            loop {
1812                let Ok((len, from)) = sock.recv_from(&mut buf).await else {
1813                    break;
1814                };
1815                let Ok(request) = Message::from_vec(&buf[..len]) else {
1816                    continue;
1817                };
1818                let mut resp = Message::new();
1819                resp.set_id(request.id());
1820                resp.set_message_type(MessageType::Response);
1821                resp.set_recursion_available(true);
1822                resp.set_response_code(ResponseCode::NoError);
1823                for q in request.queries() {
1824                    resp.add_query(q.clone());
1825                    if q.query_type() == RecordType::A {
1826                        let rec =
1827                            Record::from_rdata(q.name().clone(), 60, RData::A(A::from(answer_ip)));
1828                        resp.add_answer(rec);
1829                    }
1830                }
1831                if let Ok(bytes) = resp.to_vec() {
1832                    let _ = sock.send_to(&bytes, from).await;
1833                }
1834            }
1835        });
1836
1837        addr
1838    }
1839
1840    /// Send a raw A query to `server` and return the first A answer, if any.
1841    /// Returns `Err` carrying the `ResponseCode` on a non-NoError response so
1842    /// SERVFAIL can be asserted distinctly from "no answer".
1843    async fn raw_query_a(
1844        server: SocketAddr,
1845        name: &str,
1846    ) -> Result<Option<Ipv4Addr>, hickory_server::proto::op::ResponseCode> {
1847        use hickory_server::proto::op::{Message, MessageType, Query, ResponseCode};
1848
1849        let client = UdpSocket::bind(SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 0))
1850            .await
1851            .expect("bind client");
1852
1853        let qname = Name::from_str(name).expect("query name");
1854        let mut msg = Message::new();
1855        msg.set_id(0x1234);
1856        msg.set_message_type(MessageType::Query);
1857        msg.set_recursion_desired(true);
1858        msg.add_query(Query::query(qname, RecordType::A));
1859        let bytes = msg.to_vec().expect("encode query");
1860
1861        client.send_to(&bytes, server).await.expect("send query");
1862
1863        let mut buf = vec![0u8; 1500];
1864        // Generous client deadline: the forwarder's own bounded retry budget
1865        // (2 attempts x 2s) means a SERVFAIL for a dead upstream arrives within
1866        // ~4s; this must exceed that so the test observes SERVFAIL rather than
1867        // tripping its own client timeout first.
1868        let len = tokio::time::timeout(Duration::from_secs(12), client.recv(&mut buf))
1869            .await
1870            .expect("query timed out")
1871            .expect("recv response");
1872        let resp = Message::from_vec(&buf[..len]).expect("decode response");
1873
1874        if resp.response_code() != ResponseCode::NoError {
1875            return Err(resp.response_code());
1876        }
1877        for ans in resp.answers() {
1878            if let Some(RData::A(a)) = ans.data() {
1879                return Ok(Some((*a).into()));
1880            }
1881        }
1882        Ok(None)
1883    }
1884
1885    #[tokio::test]
1886    async fn test_forwarding_overlay_answered_and_nonoverlay_forwarded() {
1887        // Stub upstream answers everything with 203.0.113.7.
1888        let upstream_answer = Ipv4Addr::new(203, 0, 113, 7);
1889        let upstream = spawn_stub_upstream(upstream_answer).await;
1890
1891        // `start` binds the listener internally, so grab a concrete ephemeral
1892        // port first (bind + drop) and build the server on it — that lets the
1893        // test client send queries to a known address.
1894        let bound = {
1895            let probe = UdpSocket::bind(SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 0))
1896                .await
1897                .unwrap();
1898            let a = probe.local_addr().unwrap();
1899            drop(probe);
1900            a
1901        };
1902
1903        // Overlay server with the stub as its only upstream (no resolv.conf
1904        // detection — upstreams injected directly).
1905        let overlay_ip = Ipv4Addr::new(10, 200, 0, 5);
1906        let server =
1907            DnsServer::new_with_upstreams(bound, "overlay.local.", vec![upstream]).unwrap();
1908        let handle = server.handle();
1909        handle
1910            .add_record("svc", IpAddr::V4(overlay_ip))
1911            .await
1912            .unwrap();
1913        let _running = server.start().await.unwrap();
1914
1915        // Give the listener a moment to bind.
1916        tokio::time::sleep(Duration::from_millis(150)).await;
1917
1918        // Overlay-zone query is answered from the authority (NOT the stub).
1919        let overlay = raw_query_a(bound, "svc.overlay.local.")
1920            .await
1921            .expect("overlay query should not SERVFAIL");
1922        assert_eq!(
1923            overlay,
1924            Some(overlay_ip),
1925            "overlay name must be answered from InMemoryAuthority",
1926        );
1927
1928        // Non-overlay query is forwarded to the stub upstream.
1929        let forwarded = raw_query_a(bound, "example.com.")
1930            .await
1931            .expect("forwarded query should not SERVFAIL");
1932        assert_eq!(
1933            forwarded,
1934            Some(upstream_answer),
1935            "non-overlay name must be forwarded to the upstream stub",
1936        );
1937    }
1938
1939    #[tokio::test]
1940    async fn test_forwarding_total_upstream_failure_is_servfail_not_panic() {
1941        use hickory_server::proto::op::ResponseCode;
1942
1943        // Point the forwarder at a dead upstream (nothing listening). The
1944        // server must return SERVFAIL for non-overlay queries, never panic,
1945        // and still serve the overlay zone.
1946        let dead_upstream = {
1947            // Bind+drop to grab a free port nobody is listening on.
1948            let s = UdpSocket::bind(SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 0))
1949                .await
1950                .unwrap();
1951            let a = s.local_addr().unwrap();
1952            drop(s);
1953            a
1954        };
1955
1956        let bound = {
1957            let s = UdpSocket::bind(SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 0))
1958                .await
1959                .unwrap();
1960            let a = s.local_addr().unwrap();
1961            drop(s);
1962            a
1963        };
1964
1965        let server =
1966            DnsServer::new_with_upstreams(bound, "overlay.local.", vec![dead_upstream]).unwrap();
1967        let handle = server.handle();
1968        handle
1969            .add_record("svc", IpAddr::V4(Ipv4Addr::new(10, 200, 0, 9)))
1970            .await
1971            .unwrap();
1972        let _running = server.start().await.unwrap();
1973        tokio::time::sleep(Duration::from_millis(150)).await;
1974
1975        // Overlay zone still works.
1976        let overlay = raw_query_a(bound, "svc.overlay.local.")
1977            .await
1978            .expect("overlay query should still succeed");
1979        assert_eq!(overlay, Some(Ipv4Addr::new(10, 200, 0, 9)));
1980
1981        // Non-overlay query against a dead upstream => SERVFAIL (not a panic,
1982        // not a hang past the resolver's own timeout).
1983        match raw_query_a(bound, "example.com.").await {
1984            Err(ResponseCode::ServFail) => {} // expected
1985            Err(other) => panic!("expected SERVFAIL, got {other:?}"),
1986            Ok(answer) => panic!("expected SERVFAIL, got answer {answer:?}"),
1987        }
1988    }
1989
1990    /// Regression: when the per-deployment service-discovery zone origin equals
1991    /// the static zone origin (`zlayer.local.` — the default `deployment_name`
1992    /// is `zlayer`), the catalog (a `HashMap` keyed by origin) must NOT evict the
1993    /// primary authority by upserting a separate static authority at the same
1994    /// key. Before the fix, `build_catalog` inserted both authorities under the
1995    /// colliding `zlayer.local.` origin; the second insert evicted the first, so
1996    /// `<svc>.service.<stack>.zlayer.local.` queries NXDOMAIN'd.
1997    ///
1998    /// This drives the real bind + UDP wire path (NOT a direct-authority lookup)
1999    /// so it actually exercises catalog routing — the only place the eviction
2000    /// manifests.
2001    #[tokio::test]
2002    async fn test_colliding_static_zone_does_not_evict_service_records() {
2003        // Zone origin IS the static origin => the colliding/default case.
2004        let bound = {
2005            let probe = UdpSocket::bind(SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 0))
2006                .await
2007                .unwrap();
2008            let a = probe.local_addr().unwrap();
2009            drop(probe);
2010            a
2011        };
2012
2013        // Empty upstreams: no forwarder needed, both names live in the zone.
2014        let server = DnsServer::new_with_upstreams(bound, "zlayer.local.", vec![]).unwrap();
2015
2016        let svc_ip = Ipv4Addr::new(10, 200, 0, 42);
2017        let static_ip = Ipv4Addr::new(10, 200, 0, 1);
2018
2019        // Service-discovery record (child of the primary zone) + node-stable
2020        // static record. Both ultimately resolve under `zlayer.local.`.
2021        server
2022            .add_record("forgejodb.service.forgejo-stack", IpAddr::V4(svc_ip))
2023            .await
2024            .unwrap();
2025        server
2026            .add_static_record("host", IpAddr::V4(static_ip))
2027            .await
2028            .unwrap();
2029
2030        let _running = server.start().await.unwrap();
2031        tokio::time::sleep(Duration::from_millis(150)).await;
2032
2033        // The service record is the one that NXDOMAIN'd pre-fix (eviction).
2034        let svc = raw_query_a(bound, "forgejodb.service.forgejo-stack.zlayer.local.")
2035            .await
2036            .expect(
2037                "service-discovery query must not NXDOMAIN/SERVFAIL (catalog eviction regression)",
2038            );
2039        assert_eq!(
2040            svc,
2041            Some(svc_ip),
2042            "service-discovery record must survive a colliding static zone",
2043        );
2044
2045        // The static record must also still resolve from the shared authority.
2046        let stat = raw_query_a(bound, "host.zlayer.local.")
2047            .await
2048            .expect("static node-stable query must not NXDOMAIN/SERVFAIL");
2049        assert_eq!(
2050            stat,
2051            Some(static_ip),
2052            "static record must resolve when its zone is shared with the primary",
2053        );
2054    }
2055}