Skip to main content

zlayer_overlayd/
server.rs

1//! The overlayd server engine.
2//!
3//! [`OverlaydServer`] is a near 1:1 migration of the *mechanics* half of the
4//! agent's `OverlayManager`: it owns the single cluster `WireGuard`
5//! [`OverlayTransport`], the per-service Linux bridges (Linux) / HCN Internal
6//! network + endpoints (Windows), the per-node IP allocator, DNS config, and
7//! NAT traversal. The cluster-brain half (Raft, scheduler, service registry)
8//! stays in the main daemon, which drives this server over the IPC contract in
9//! [`zlayer_types::overlayd`].
10//!
11//! Every [`OverlaydRequest`] maps to a method here via [`OverlaydServer::handle`].
12
13use std::collections::HashMap;
14use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr};
15#[cfg(target_os = "linux")]
16use std::os::fd::AsFd;
17use std::path::{Path, PathBuf};
18use std::sync::atomic::{AtomicU64, Ordering};
19
20use ipnetwork::IpNetwork;
21use zlayer_overlay::{NatConfig, NatTraversal, OverlayConfig, OverlayTransport, PeerInfo};
22use zlayer_types::overlayd::{
23    AttachHandle, AttachResult, DedicatedServiceStatus, GuestOverlayConfig, OverlayMode,
24    OverlaydRequest, OverlaydResponse, PeerScope, PeerSpec, PeerStatus, ServiceOverlayInfo,
25    StatusSnapshot,
26};
27
28use crate::error::OverlaydError;
29use crate::network_state::{
30    owner_for_service, DedicatedPortAllocator, ManagedNetwork, NetworkState,
31};
32
/// Maximum length for Linux network interface names (IFNAMSIZ - 1 for null terminator).
const MAX_IFNAME_LEN: usize = 15;

/// Build an interface name that never exceeds [`MAX_IFNAME_LEN`] bytes.
///
/// The readable form is `zl-` followed by `parts` joined with `-`, plus
/// `-{suffix}` when `suffix` is non-empty. If that form fits it is returned
/// unchanged. Otherwise the parts and the suffix are fed through
/// [`std::collections::hash_map::DefaultHasher`] and the lowercase-hex digest
/// stands in for the readable parts:
///
/// * `zl-{digest}-{suffix}` when at least one digest character still fits in
///   front of the suffix;
/// * `zl-{digest}` (up to 12 hex characters) when there is no suffix or the
///   suffix alone would consume the whole budget.
#[must_use]
pub fn make_interface_name(parts: &[&str], suffix: &str) -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    // Readable form first; most names fit and need no hashing.
    let mut readable = String::from("zl-");
    readable.push_str(&parts.join("-"));
    if !suffix.is_empty() {
        readable.push('-');
        readable.push_str(suffix);
    }
    if readable.len() <= MAX_IFNAME_LEN {
        return readable;
    }

    // Too long: digest every part plus the suffix. The hex rendering is not
    // zero-padded, so it may be shorter than 16 characters.
    let digest = {
        let mut state = DefaultHasher::new();
        parts.iter().for_each(|part| part.hash(&mut state));
        suffix.hash(&mut state);
        format!("{:x}", state.finish())
    };

    // Bytes left once the "zl-" prefix is accounted for.
    let after_prefix = MAX_IFNAME_LEN - 3;
    // Digest characters that fit in front of "-{suffix}"; zero means the
    // suffix is absent or too long to keep.
    let room_before_suffix = if suffix.is_empty() {
        0
    } else {
        after_prefix.saturating_sub(1 + suffix.len())
    };

    match room_before_suffix {
        0 => {
            let take = after_prefix.min(digest.len());
            format!("zl-{}", &digest[..take])
        }
        room => {
            let take = room.min(digest.len());
            format!("zl-{}-{}", &digest[..take], suffix)
        }
    }
}
81
82/// First usable host address in `subnet`.
83///
84/// For IPv4 this is `network() + 1` (skipping the network address). For IPv6
85/// the same rule applies — the network address is conventionally reserved.
86fn first_usable_ip(subnet: ipnet::IpNet) -> IpAddr {
87    match subnet {
88        ipnet::IpNet::V4(v4) => {
89            let net = u32::from(v4.network());
90            IpAddr::V4(Ipv4Addr::from(net.wrapping_add(1)))
91        }
92        ipnet::IpNet::V6(v6) => {
93            let net = u128::from(v6.network());
94            IpAddr::V6(Ipv6Addr::from(net.wrapping_add(1)))
95        }
96    }
97}
98
/// Parameters threaded into [`OverlaydServer::attach_to_interface`] when a
/// container is being attached to a per-service Linux bridge.
///
/// Linux only. The bridge name is borrowed for `'a`, so a value of this type
/// cannot outlive the string it was built from.
#[cfg(target_os = "linux")]
#[derive(Debug)]
struct BridgeAttachParams<'a> {
    /// Linux bridge name on the host to enslave the host-side veth into.
    bridge_name: &'a str,
    /// Bridge's L3 gateway IP. The container's default route is set here.
    gateway: IpAddr,
    /// Prefix length of the bridge's subnet.
    subnet_prefix_len: u8,
}
111
/// Tracking info recorded by [`OverlaydServer::attach_container`] for every
/// container that successfully attaches on Linux. Used by `detach_container`.
///
/// Linux only. Values live in `OverlaydServer::attached`, keyed by PID.
#[cfg(target_os = "linux")]
#[derive(Debug, Clone)]
struct AttachInfo {
    /// IP allocated on the per-service overlay (eth0 inside the container).
    service_ip: IpAddr,
    /// Name of the service whose bridge owns `service_ip`.
    service_name: Option<String>,
    /// IP allocated on the global overlay (eth1), if the container joined it.
    global_ip: Option<IpAddr>,
    /// True iff the container also attached to the global overlay (eth1).
    joined_global: bool,
}
126
/// Tracking info recorded by [`OverlaydServer::attach_container_guest`] for a
/// guest-managed attach. Platform-agnostic (no netns/veth/HCN): the guest owns
/// its own `WireGuard` device; the host only allocated the address + registered
/// the guest's public key as a global peer.
///
/// Values live in `OverlaydServer::guest_attachments`, keyed by the opaque
/// container id carried in `AttachHandle::GuestManaged`.
#[derive(Debug, Clone)]
struct GuestAttachInfo {
    /// Overlay IP allocated for the guest (released on detach).
    overlay_ip: IpAddr,
    /// Base64 public key registered on the global transport for the guest
    /// (removed on detach).
    public_key: String,
    /// Service whose bridge pool owns `overlay_ip` (Linux service-bridge path);
    /// `None` when drawn from the node slice. Mirrors `AttachInfo::service_name`
    /// so detach returns the IP to the right pool.
    service_name: Option<String>,
}
143
/// Per-service Linux bridge state. One bridge per service per node; containers
/// attach to it via veth pairs and cross-node packets ride the single cluster
/// `OverlayTransport` with the service subnet plumbed into its `AllowedIPs`.
///
/// Linux only. Values live in `OverlaydServer::service_bridges`, keyed by
/// service name.
#[cfg(target_os = "linux")]
#[derive(Debug)]
struct ServiceBridge {
    /// Linux bridge name, kept under IFNAMSIZ-1 by [`make_interface_name`].
    name: String,
    /// CIDR of the service's subnet on this node.
    subnet: ipnet::IpNet,
    /// Gateway IP within the subnet (first usable address).
    gateway: IpAddr,
    /// Per-service IP allocator covering `subnet`.
    ip_allocator: zlayer_overlay::allocator::IpAllocator,
}
159
/// A dedicated per-service `WireGuard` transport (`OverlayMode::Dedicated`).
///
/// Unlike Shared mode — where every service subnet is plumbed onto the single
/// cluster [`OverlayTransport`] via multi-CIDR `AllowedIPs` — a Dedicated
/// service owns a *second* real `WireGuard` device with its own crypto context,
/// listen port, overlay IP, and subnet. The device is portable (boringtun
/// userspace `WireGuard` works on Linux/macOS/Windows), so this struct is
/// cross-platform; only the bridge/HCN *attachment* of containers onto it is
/// platform-gated.
///
/// Values live in `OverlaydServer::service_transports`, keyed by service name.
struct ServiceTransport {
    /// The live dedicated `WireGuard` device. Dropping it tears down the TUN.
    transport: OverlayTransport,
    /// Actual interface name (kernel-assigned `utunN` on macOS).
    interface: String,
    /// base64 public key of this dedicated device.
    public_key: String,
    /// UDP listen port handed out by [`DedicatedPortAllocator`].
    listen_port: u16,
    /// This node's overlay IP on the dedicated device.
    overlay_ip: std::net::IpAddr,
    /// The service's subnet carried by the dedicated device.
    subnet: ipnet::IpNet,
}
183
/// The overlay daemon engine.
///
/// Holds all per-node overlay state: the global `WireGuard` transport, the
/// per-service interfaces and dedicated transports, the IP allocators, the
/// peer/attachment bookkeeping, DNS records, and NAT traversal state. Requests
/// arrive through [`OverlaydServer::handle`], which takes `&mut self`.
pub struct OverlaydServer {
    /// Deployment name (used for network naming). Set by `SetupGlobalOverlay`.
    deployment: String,
    /// Per-daemon-process disambiguator included in overlay link names. Set by
    /// `SetupGlobalOverlay`.
    instance_id: String,
    /// Root data directory; HCN markers, IPAM state, etc. live under it.
    data_dir: PathBuf,
    /// Global overlay interface name. `None` until the global overlay is set
    /// up, and again after it is torn down.
    global_interface: Option<String>,
    /// Global overlay transport (kept alive for the TUN device lifetime). The
    /// SINGLE cluster-wide `WireGuard` transport; every service subnet is
    /// plumbed through its `AllowedIPs`.
    global_transport: Option<OverlayTransport>,
    /// Service-name -> per-service Linux bridge / placeholder name.
    service_interfaces: HashMap<String, String>,
    /// Service-name -> dedicated per-service `WireGuard` transport (Dedicated
    /// mode). Coexists with `global_transport`. Empty for Shared-only nodes.
    service_transports: HashMap<String, ServiceTransport>,
    /// Port allocator for dedicated devices (band above the global WG port).
    dedicated_ports: DedicatedPortAllocator,
    /// Per-service bridge state (Linux only).
    #[cfg(target_os = "linux")]
    service_bridges: HashMap<String, ServiceBridge>,
    /// Local fallback `ServiceSubnetRegistry`. Used by the Linux Shared bridge
    /// path and by the cross-platform Dedicated path (subnets stay globally
    /// unique regardless of mode/OS).
    service_subnet_registry: Option<zlayer_overlay::allocator::ServiceSubnetRegistry>,
    /// Local raft node id used as the partition key for service-subnet assign.
    /// Starts at `0` and is updated by `SetLocalNodeId`.
    local_node_id: u64,
    /// Base64 `WireGuard` public key of THIS node's cluster transport, as told
    /// by the main daemon via `SetLocalWgPubkey` (used for service-subnet
    /// `AllowedIPs` plumbing).
    local_wg_pubkey: Option<String>,
    /// Public key generated for the live global transport, recorded at
    /// `setup_global_overlay` time so `Status` can surface it (the transport
    /// itself exposes no public-key accessor).
    transport_public_key: Option<String>,
    /// IP allocator for the node's overlay slice.
    ip_allocator: IpAllocator,
    /// This node's IP on the global overlay network.
    node_ip: Option<IpAddr>,
    /// `WireGuard` listen port for the overlay network.
    overlay_port: u16,
    /// Full cluster CIDR (e.g. `10.200.0.0/16`).
    cluster_cidr: Option<IpNetwork>,
    /// Per-node slice CIDR.
    slice_cidr: Option<IpNetwork>,
    /// Map of HCN namespace GUID -> (`service_name`, `allocated_ip`) for autoclean.
    #[cfg(target_os = "windows")]
    hcn_cleanup: HashMap<windows::core::GUID, (String, std::net::IpAddr)>,
    /// Per-service container-IP allocators for Windows dedicated services. Each
    /// is bounded to that service's subnet (not the node slice) so dedicated
    /// containers draw addresses from their own isolated network. Keyed by
    /// service name; created lazily on the first dedicated attach.
    #[cfg(target_os = "windows")]
    service_ip_allocators: HashMap<String, IpAllocator>,
    /// Per-PID tracking of overlay attachments on Linux.
    #[cfg(target_os = "linux")]
    attached: HashMap<u32, AttachInfo>,
    /// Peers installed on the GLOBAL transport via `AddPeer { Global }`, keyed by
    /// base64 public key. Tracked here (in wire-safe [`PeerSpec`] form, with the
    /// keys kept base64 — the boringtun UAPI dump only exposes hex keys) so a
    /// guest-managed attach can hand the guest the exact peer set the host's own
    /// global device carries. Platform-agnostic: the guest path runs on macOS.
    global_peers: HashMap<String, PeerSpec>,
    /// Guest-managed overlay attachments, keyed by the opaque container `id` from
    /// [`AttachHandle::GuestManaged`]. Records the allocated overlay IP and the
    /// generated public key registered in the mesh so `DetachContainer` can
    /// release the IP and remove the peer.
    guest_attachments: HashMap<String, GuestAttachInfo>,
    /// Overlay DNS server listen address, if one was bootstrapped.
    dns_server_addr: Option<SocketAddr>,
    /// DNS domain for overlay service discovery.
    dns_domain: Option<String>,
    /// Overlay DNS A/AAAA records this node owns (name -> ip).
    dns_records: HashMap<String, IpAddr>,
    /// NAT traversal configuration threaded into every `OverlayConfig`.
    nat_config: Option<NatConfig>,
    /// Override for `OverlayConfig::uapi_sock_dir`.
    uapi_sock_dir: Option<PathBuf>,
    /// Live NAT traversal orchestrator.
    nat_traversal: Option<NatTraversal>,
    /// Unix-epoch seconds of the last successful candidate gather / STUN refresh.
    nat_last_refresh: AtomicU64,
    /// Set when a `Shutdown` request has been received.
    shutdown_requested: bool,
}
273
274impl OverlaydServer {
275    /// Create a fresh server bound to `data_dir`. The overlay itself is brought
276    /// up lazily by `SetupGlobalOverlay` (which carries the deployment, slice,
277    /// port, and NAT toggle from the main daemon).
278    ///
279    /// # Panics
280    /// Panics only if the compile-time-constant default CIDR `10.200.0.0/16`
281    /// fails to parse (impossible).
282    #[must_use]
283    pub fn new(data_dir: PathBuf) -> Self {
284        // Until SetupGlobalOverlay arrives, the allocator is bounded to the
285        // default cluster /16. SetupGlobalOverlay re-binds it to the node slice.
286        let default_cidr: IpNetwork = "10.200.0.0/16".parse().expect("compile-time constant CIDR");
287        let overlay_port = zlayer_core::DEFAULT_WG_PORT;
288
289        // Rehydrate the dedicated-port allocator from the on-disk marker so a
290        // service that already owns a dedicated overlay re-binds the exact UDP
291        // port it had before this process started.
292        let marker_path = zlayer_paths::ZLayerDirs::new(data_dir.clone()).agent_network_state();
293        let recorded_dedicated_ports: Vec<u16> = NetworkState::load(&marker_path)
294            .networks
295            .iter()
296            .filter(|n| n.owner.starts_with("service:"))
297            .filter_map(|n| n.wg_port)
298            .collect();
299
300        Self {
301            deployment: String::new(),
302            instance_id: String::new(),
303            data_dir,
304            global_interface: None,
305            global_transport: None,
306            service_interfaces: HashMap::new(),
307            service_transports: HashMap::new(),
308            dedicated_ports: DedicatedPortAllocator::new(overlay_port, recorded_dedicated_ports),
309            #[cfg(target_os = "linux")]
310            service_bridges: HashMap::new(),
311            service_subnet_registry: None,
312            local_node_id: 0,
313            local_wg_pubkey: None,
314            transport_public_key: None,
315            ip_allocator: IpAllocator::new(default_cidr),
316            node_ip: None,
317            overlay_port,
318            cluster_cidr: Some(default_cidr),
319            slice_cidr: None,
320            #[cfg(target_os = "windows")]
321            hcn_cleanup: HashMap::new(),
322            #[cfg(target_os = "windows")]
323            service_ip_allocators: HashMap::new(),
324            #[cfg(target_os = "linux")]
325            attached: HashMap::new(),
326            global_peers: HashMap::new(),
327            guest_attachments: HashMap::new(),
328            dns_server_addr: None,
329            dns_domain: None,
330            dns_records: HashMap::new(),
331            nat_config: None,
332            uapi_sock_dir: None,
333            nat_traversal: None,
334            nat_last_refresh: AtomicU64::new(0),
335            shutdown_requested: false,
336        }
337    }
338
    /// Override the `WireGuard` UAPI socket directory for every overlay
    /// transport built by this server.
    ///
    /// Builder-style: consumes `self`, stores `dir` in `uapi_sock_dir`, and
    /// returns the updated server.
    #[must_use]
    pub fn with_uapi_sock_dir(mut self, dir: impl Into<PathBuf>) -> Self {
        self.uapi_sock_dir = Some(dir.into());
        self
    }
346
    /// Whether a `Shutdown` request has been received.
    ///
    /// Starts out `false`; the `Shutdown` arm of the request dispatcher sets it
    /// to `true`.
    #[must_use]
    pub fn shutdown_requested(&self) -> bool {
        self.shutdown_requested
    }
352
    /// The root data directory this server was constructed with. Used by the
    /// uninstall path (`purge_managed_networks`) and for HCN marker resolution.
    ///
    /// Returns a borrow of the path passed to [`OverlaydServer::new`].
    #[must_use]
    pub fn data_dir(&self) -> &Path {
        &self.data_dir
    }
359
360    // -- request dispatch ----------------------------------------------------
361
362    /// Execute one [`OverlaydRequest`], producing the [`OverlaydResponse`] the
363    /// server sends back over IPC. Any internal error is folded into
364    /// [`OverlaydResponse::Err`].
365    pub async fn handle(&mut self, req: OverlaydRequest) -> OverlaydResponse {
366        match self.dispatch(req).await {
367            Ok(resp) => resp,
368            Err(e) => OverlaydResponse::Err {
369                message: e.to_string(),
370            },
371        }
372    }
373
374    #[allow(clippy::too_many_lines)]
375    async fn dispatch(&mut self, req: OverlaydRequest) -> Result<OverlaydResponse, OverlaydError> {
376        match req {
377            OverlaydRequest::SetLocalNodeId { node_id } => {
378                self.local_node_id = node_id;
379                Ok(OverlaydResponse::Ok)
380            }
381            OverlaydRequest::SetLocalWgPubkey { pubkey } => {
382                self.local_wg_pubkey = Some(pubkey);
383                Ok(OverlaydResponse::Ok)
384            }
385            OverlaydRequest::SetupGlobalOverlay {
386                deployment,
387                instance_id,
388                cluster_cidr,
389                slice_cidr,
390                wg_port,
391                nat_enabled,
392            } => {
393                let name = self
394                    .setup_global_overlay(
395                        deployment,
396                        instance_id,
397                        &cluster_cidr,
398                        slice_cidr.as_deref(),
399                        wg_port,
400                        nat_enabled,
401                    )
402                    .await?;
403                Ok(OverlaydResponse::BridgeName { name })
404            }
405            OverlaydRequest::TeardownGlobalOverlay => {
406                self.teardown_global_overlay();
407                Ok(OverlaydResponse::Ok)
408            }
409            OverlaydRequest::SetupServiceOverlay { service, mode } => {
410                let info = self.setup_service_overlay(&service, mode).await?;
411                Ok(OverlaydResponse::ServiceOverlay(info))
412            }
413            OverlaydRequest::TeardownServiceOverlay { service } => {
414                self.teardown_service_overlay(&service).await;
415                Ok(OverlaydResponse::Ok)
416            }
417            OverlaydRequest::AllocateIp {
418                service,
419                join_global,
420            } => {
421                let ip = self.allocate_ip(&service, join_global)?;
422                Ok(OverlaydResponse::Ip { ip })
423            }
424            OverlaydRequest::ReleaseIp { ip } => {
425                self.release_ip(ip);
426                Ok(OverlaydResponse::Ok)
427            }
428            OverlaydRequest::AttachContainer {
429                handle,
430                service,
431                join_global,
432                dns_server,
433                dns_domain,
434            } => {
435                // A guest-managed attach takes a wholly separate path: it cannot
436                // build a veth/HCN endpoint (the target is a VM, not a host
437                // process), so it allocates the overlay identity + peer set and
438                // returns it as `GuestConfig`. PID/HCN handles keep the existing
439                // veth/HCN attach and return `Attached`.
440                if let AttachHandle::GuestManaged { id } = handle {
441                    // Record the overlay DNS resolver/zone the daemon staged for
442                    // this node so the guest config can fall back to them (same
443                    // bookkeeping `attach_container` does for the other handles).
444                    if let Some(server) = dns_server {
445                        self.dns_server_addr = Some(SocketAddr::new(server, 53));
446                    }
447                    if dns_domain.is_some() {
448                        self.dns_domain.clone_from(&dns_domain);
449                    }
450                    let config = self
451                        .attach_container_guest(&id, &service, join_global, dns_server, dns_domain)
452                        .await?;
453                    Ok(OverlaydResponse::GuestConfig(config))
454                } else {
455                    let result = self
456                        .attach_container(handle, &service, join_global, dns_server, dns_domain)
457                        .await?;
458                    Ok(OverlaydResponse::Attached(result))
459                }
460            }
461            OverlaydRequest::DetachContainer { handle } => {
462                if let AttachHandle::GuestManaged { id } = handle {
463                    self.detach_container_guest(&id).await?;
464                } else {
465                    self.detach_container(handle).await?;
466                }
467                Ok(OverlaydResponse::Ok)
468            }
469            // `scope` selects the target device: `Global` (default) = the single
470            // cluster transport; `Service { service }` = that service's
471            // dedicated per-service transport.
472            OverlaydRequest::AddPeer { peer, scope } => {
473                let info = peer_spec_to_info(&peer)?;
474                let transport = self.transport_for_scope(&scope)?;
475                Self::add_peer_on(transport, &info).await?;
476                // Mirror Global peers into `global_peers` so a guest-managed
477                // attach can reproduce the host's global peer set for the guest.
478                if matches!(scope, PeerScope::Global) {
479                    self.global_peers.insert(peer.public_key.clone(), peer);
480                }
481                Ok(OverlaydResponse::Ok)
482            }
483            OverlaydRequest::RemovePeer { pubkey, scope } => {
484                let transport = self.transport_for_scope(&scope)?;
485                Self::remove_peer_on(transport, &pubkey).await?;
486                if matches!(scope, PeerScope::Global) {
487                    self.global_peers.remove(&pubkey);
488                }
489                Ok(OverlaydResponse::Ok)
490            }
491            OverlaydRequest::AddAllowedIp {
492                pubkey,
493                cidr,
494                scope,
495            } => {
496                let transport = self.transport_for_scope(&scope)?;
497                Self::add_allowed_ip_on(transport, &pubkey, &cidr).await?;
498                Ok(OverlaydResponse::Ok)
499            }
500            OverlaydRequest::RemoveAllowedIp {
501                pubkey,
502                cidr,
503                scope,
504            } => {
505                let transport = self.transport_for_scope(&scope)?;
506                Self::remove_allowed_ip_on(transport, &pubkey, &cidr).await?;
507                Ok(OverlaydResponse::Ok)
508            }
509            OverlaydRequest::RegisterDns { name, ip } => {
510                self.register_dns(name, ip);
511                Ok(OverlaydResponse::Ok)
512            }
513            OverlaydRequest::UnregisterDns { name } => {
514                self.unregister_dns(&name);
515                Ok(OverlaydResponse::Ok)
516            }
517            OverlaydRequest::Status => Ok(OverlaydResponse::Status(self.status_snapshot().await)),
518            OverlaydRequest::NatTick => {
519                self.nat_maintenance_tick().await?;
520                Ok(OverlaydResponse::Ok)
521            }
522            OverlaydRequest::Shutdown => {
523                self.shutdown_requested = true;
524                self.teardown_global_overlay();
525                Ok(OverlaydResponse::Ok)
526            }
527        }
528    }
529
530    // -- global overlay ------------------------------------------------------
531
532    /// Bring up (or reuse) this node's base/global overlay.
533    ///
534    /// Idempotent: if a global transport is already live, reuse it (recreating
535    /// without this guard could yank the kernel TUN out from under the running
536    /// boringtun worker). Re-binds the IP allocator to `slice_cidr` if one is
537    /// supplied so container IPs never collide across nodes.
538    ///
539    /// # Errors
540    /// Returns an error if key generation or interface creation fails.
541    async fn setup_global_overlay(
542        &mut self,
543        deployment: String,
544        instance_id: String,
545        cluster_cidr: &str,
546        slice_cidr: Option<&str>,
547        wg_port: u16,
548        nat_enabled: bool,
549    ) -> Result<String, OverlaydError> {
550        self.deployment = deployment;
551        self.instance_id = instance_id;
552        self.overlay_port = wg_port;
553
554        let cluster: IpNetwork = cluster_cidr.parse().map_err(|e| {
555            OverlaydError::Other(format!("invalid cluster CIDR {cluster_cidr}: {e}"))
556        })?;
557        self.cluster_cidr = Some(cluster);
558        if let Some(slice) = slice_cidr {
559            let slice_net: IpNetwork = slice
560                .parse()
561                .map_err(|e| OverlaydError::Other(format!("invalid slice CIDR {slice}: {e}")))?;
562            self.slice_cidr = Some(slice_net);
563            self.ip_allocator = IpAllocator::new(slice_net);
564        }
565        // NAT defaults to enabled (NatConfig::default()); honor an explicit
566        // disable from the main daemon by stamping a disabled config.
567        if !nat_enabled {
568            self.nat_config = Some(NatConfig {
569                enabled: false,
570                ..NatConfig::default()
571            });
572        }
573
574        if let Some(name) = self.global_interface.clone() {
575            if self.global_transport.is_some() {
576                tracing::debug!(
577                    deployment = %self.deployment,
578                    "Global overlay already active, reusing existing transport"
579                );
580                return Ok(name);
581            }
582        }
583
584        let interface_name = make_interface_name(&[&self.deployment, &self.instance_id], "g");
585
586        let (private_key, public_key) = OverlayTransport::generate_keys()
587            .await
588            .map_err(|e| OverlaydError::Overlay(format!("Failed to generate keys: {e}")))?;
589
590        let node_ip = self.ip_allocator.allocate()?;
591        self.transport_public_key = Some(public_key.clone());
592        let config = self.build_config(private_key, public_key, node_ip, 16, self.overlay_port);
593        let mut transport = OverlayTransport::new(config, interface_name);
594
595        transport
596            .create_interface()
597            .await
598            .map_err(|e| OverlaydError::Overlay(format!("Failed to create global overlay: {e}")))?;
599        transport.configure(&[]).await.map_err(|e| {
600            OverlaydError::Overlay(format!("Failed to configure global overlay: {e}"))
601        })?;
602
603        // Read back the actual interface name (on macOS, the kernel assigns utunN).
604        let actual_name = transport.interface_name().to_string();
605
606        self.node_ip = Some(node_ip);
607        self.global_interface = Some(actual_name.clone());
608        self.global_transport = Some(transport);
609        Ok(actual_name)
610    }
611
612    /// Tear down the node's base overlay (e.g. on full uninstall / shutdown).
613    fn teardown_global_overlay(&mut self) {
614        if let Some(mut transport) = self.global_transport.take() {
615            tracing::info!("Shutting down global overlay");
616            transport.shutdown();
617        }
618        self.global_interface = None;
619        self.transport_public_key = None;
620    }
621
622    // -- service overlay -----------------------------------------------------
623
624    /// Set up the per-service Linux bridge that backs `service` on this node.
625    ///
626    /// Returns the bridge name on success.
627    ///
628    /// # Errors
629    /// Returns an error if subnet assignment fails (exhaustion), if the bridge
630    /// cannot be created, or if the cluster transport rejects the `AllowedIPs`
631    /// update.
632    #[cfg(target_os = "linux")]
633    async fn setup_service_overlay(
634        &mut self,
635        service: &str,
636        mode: OverlayMode,
637    ) -> Result<ServiceOverlayInfo, OverlaydError> {
638        match mode.resolve() {
639            OverlayMode::Shared => self.setup_service_overlay_shared(service).await,
640            OverlayMode::Dedicated => self.setup_service_overlay_dedicated(service).await,
641            OverlayMode::Auto => unreachable!("OverlayMode::resolve never returns Auto"),
642        }
643    }
644
645    /// Shared-mode per-service overlay (Linux): the per-service bridge backed by
646    /// the single cluster transport. This is the original `setup_service_overlay`
647    /// body verbatim, now returning a [`ServiceOverlayInfo`] with the bridge name
648    /// and all identity fields `None` (Shared mode shares the cluster device).
649    ///
650    /// Returns the bridge name on success.
651    ///
652    /// # Errors
653    /// Returns an error if subnet assignment fails (exhaustion), if the bridge
654    /// cannot be created, or if the cluster transport rejects the `AllowedIPs`
655    /// update.
656    #[cfg(target_os = "linux")]
657    #[allow(clippy::too_many_lines)]
658    async fn setup_service_overlay_shared(
659        &mut self,
660        service: &str,
661    ) -> Result<ServiceOverlayInfo, OverlaydError> {
662        // 1. Idempotency check.
663        if let Some(existing) = self.service_bridges.get(service) {
664            let name = existing.name.clone();
665            tracing::debug!(service = %service, bridge = %name, "Service bridge already active, reusing");
666            return Ok(shared_overlay_info(name));
667        }
668
669        // 2. Assign subnet via the (currently local) ServiceSubnetRegistry.
670        self.ensure_service_subnet_registry()?;
671        let subnet: ipnet::IpNet = {
672            let registry = self
673                .service_subnet_registry
674                .as_mut()
675                .expect("ensure_service_subnet_registry leaves Some");
676            let node_key = self.local_node_id.to_string();
677            registry.assign(service, &node_key).map_err(|e| {
678                OverlaydError::Overlay(format!(
679                    "ServiceSubnetRegistry::assign({service}, {node_key}) failed: {e}"
680                ))
681            })?
682        };
683
684        // 3+4+6. Create the per-service Linux bridge, assign its gateway, bring
685        // it up, build the per-service IpAllocator, and record it.
686        let bridge_name = self.create_service_bridge(service, subnet).await?;
687
688        // 5. Plumb subnet into the cluster transport's local AllowedIPs so the
689        // single cluster device carries this service's cross-node traffic
690        // (Shared mode shares one crypto context for every service).
691        if let Some(ref cluster) = self.global_transport {
692            if let Some(ref pubkey) = self.local_wg_pubkey {
693                if let Err(e) = cluster.add_allowed_ip(pubkey, subnet).await {
694                    tracing::warn!(
695                        service = %service,
696                        subnet = %subnet,
697                        error = %e,
698                        "Failed to add service subnet to cluster transport AllowedIPs (non-fatal)"
699                    );
700                }
701            } else {
702                tracing::debug!(service = %service, "local_wg_pubkey not yet set; skipping cluster AllowedIPs update");
703            }
704        }
705
706        Ok(shared_overlay_info(bridge_name))
707    }
708
709    /// Create the per-service Linux bridge for `service` on `subnet`, assign its
710    /// gateway, bring it up, build the per-service [`IpAllocator`], and record it
711    /// in `service_bridges` + `service_interfaces`. Returns the bridge name.
712    ///
713    /// Shared and Dedicated mode share this bridge mechanic verbatim — the ONLY
714    /// difference between the two modes is which `WireGuard` device the service
715    /// subnet/peers are plumbed onto (the single cluster transport for Shared,
716    /// the dedicated per-service transport for Dedicated). This helper does NOT
717    /// touch any transport's `AllowedIPs`; the caller does that against the
718    /// device it owns.
719    ///
720    /// # Errors
721    /// Returns an error if the bridge cannot be created, addressed, or brought
722    /// up, or if the per-service `IpAllocator` cannot be built.
723    #[cfg(target_os = "linux")]
724    async fn create_service_bridge(
725        &mut self,
726        service: &str,
727        subnet: ipnet::IpNet,
728    ) -> Result<String, OverlaydError> {
729        use zlayer_overlay::allocator::IpAllocator as OverlayIpAllocator;
730
731        let bridge_name = make_interface_name(&[&self.deployment, &self.instance_id, service], "b");
732
733        if let Err(e) = crate::netlink::create_bridge(&bridge_name).await {
734            return Err(OverlaydError::Overlay(format!(
735                "create_bridge({bridge_name}) failed: {e}"
736            )));
737        }
738        if let Err(e) = crate::netlink::set_bridge_stp(&bridge_name, false) {
739            tracing::warn!(bridge = %bridge_name, error = %e, "set_bridge_stp(off) failed (non-fatal)");
740        }
741
742        // Gateway = first usable host in the subnet, assigned to the bridge.
743        let gateway = first_usable_ip(subnet);
744        if let Err(e) =
745            crate::netlink::add_address_to_link_by_name(&bridge_name, gateway, subnet.prefix_len())
746                .await
747        {
748            let _ = crate::netlink::delete_bridge(&bridge_name).await;
749            return Err(OverlaydError::Overlay(format!(
750                "add_address_to_link_by_name({bridge_name}, {gateway}/{}) failed: {e}",
751                subnet.prefix_len()
752            )));
753        }
754        if let Err(e) = crate::netlink::set_link_up_by_name(&bridge_name).await {
755            let _ = crate::netlink::delete_bridge(&bridge_name).await;
756            return Err(OverlaydError::Overlay(format!(
757                "set_link_up_by_name({bridge_name}) failed: {e}"
758            )));
759        }
760
761        // Build per-service IpAllocator, reserve the gateway.
762        let mut ip_allocator = OverlayIpAllocator::new(&subnet.to_string()).map_err(|e| {
763            OverlaydError::Overlay(format!("IpAllocator::new({subnet}) failed: {e}"))
764        })?;
765        let _ = ip_allocator.allocate_specific(gateway);
766
767        self.service_bridges.insert(
768            service.to_string(),
769            ServiceBridge {
770                name: bridge_name.clone(),
771                subnet,
772                gateway,
773                ip_allocator,
774            },
775        );
776        self.service_interfaces
777            .insert(service.to_string(), bridge_name.clone());
778
779        tracing::info!(service = %service, bridge = %bridge_name, subnet = %subnet, gateway = %gateway, "Service bridge created");
780        Ok(bridge_name)
781    }
782
783    /// Non-Linux variant of `setup_service_overlay`. On Windows the per-service
784    /// segment is the HCN Internal network created lazily at attach time, and on
785    /// macOS containers fall through to host networking. Registers the service
786    /// in `service_interfaces` with a placeholder name so presence checks work.
787    ///
788    /// # Errors
789    /// Infallible on non-Linux; the `Result` is preserved for ABI parity.
790    #[cfg(not(target_os = "linux"))]
791    async fn setup_service_overlay(
792        &mut self,
793        service: &str,
794        mode: OverlayMode,
795    ) -> Result<ServiceOverlayInfo, OverlaydError> {
796        match mode.resolve() {
797            OverlayMode::Shared => self.setup_service_overlay_shared(service).await,
798            OverlayMode::Dedicated => self.setup_service_overlay_dedicated(service).await,
799            OverlayMode::Auto => unreachable!("OverlayMode::resolve never returns Auto"),
800        }
801    }
802
803    /// Shared-mode per-service overlay (non-Linux): on Windows the per-service
804    /// segment is the HCN Internal network created lazily at attach time, and on
805    /// macOS containers fall through to host networking. Registers the service
806    /// in `service_interfaces` with a placeholder name so presence checks work.
807    ///
808    /// # Errors
809    /// Infallible on non-Linux; the `Result` is preserved for ABI parity.
810    #[cfg(not(target_os = "linux"))]
811    #[allow(clippy::unused_async)]
812    async fn setup_service_overlay_shared(
813        &mut self,
814        service: &str,
815    ) -> Result<ServiceOverlayInfo, OverlaydError> {
816        let placeholder = make_interface_name(&[&self.deployment, &self.instance_id, service], "b");
817        self.service_interfaces
818            .insert(service.to_string(), placeholder.clone());
819        tracing::debug!(service = %service, "Service overlay bridge setup is Linux-only; using direct networking placeholder");
820        Ok(shared_overlay_info(placeholder))
821    }
822
823    /// Dedicated-mode per-service overlay: stand up a *second* real `WireGuard`
824    /// device for `service` with its own crypto context, listen port, overlay
825    /// IP, and subnet — distinct from the single cluster transport.
826    ///
827    /// The cross-platform core (identity, subnet assign, transport bring-up,
828    /// marker persist, status) runs on every OS; only the *attachment* of
829    /// containers onto the device is platform-gated:
830    /// - Linux: a per-service bridge (same mechanic as Shared) routed over the
831    ///   dedicated device instead of the cluster device.
832    /// - Windows: a per-service HCN Internal network (a later task; a clearly
833    ///   marked seam returns an error here for now).
834    /// - macOS: nothing further — the utun device is the attachment.
835    ///
836    /// # Errors
837    /// Returns an error if port/key/subnet allocation, transport bring-up,
838    /// marker persistence, or the platform attachment fails.
839    #[allow(clippy::too_many_lines)]
840    async fn setup_service_overlay_dedicated(
841        &mut self,
842        service: &str,
843    ) -> Result<ServiceOverlayInfo, OverlaydError> {
844        // ----- cross-platform core (runs on every OS) -----
845
846        // 1. Idempotency: an existing dedicated transport returns its identity.
847        if let Some(st) = self.service_transports.get(service) {
848            return Ok(dedicated_overlay_info(
849                st.interface.clone(),
850                &st.public_key,
851                st.listen_port,
852                st.overlay_ip,
853                st.subnet,
854            ));
855        }
856
857        // 2. Identity: reuse a stable identity from the marker if one exists
858        //    (so the device re-binds the same key + port across restarts),
859        //    otherwise mint a fresh port + keypair + interface name.
860        let marker_path =
861            zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();
862        let recorded = NetworkState::load(&marker_path)
863            .get(&owner_for_service(service))
864            .cloned();
865
866        let (private_key, public_key, listen_port, iface_hint) = match recorded.as_ref() {
867            Some(entry)
868                if entry.wg_private_key.is_some()
869                    && entry.wg_public_key.is_some()
870                    && entry.wg_port.is_some()
871                    && entry.interface.is_some() =>
872            {
873                let port = entry.wg_port.expect("checked above");
874                self.dedicated_ports.reserve(port);
875                (
876                    entry.wg_private_key.clone().expect("checked above"),
877                    entry.wg_public_key.clone().expect("checked above"),
878                    port,
879                    entry.interface.clone().expect("checked above"),
880                )
881            }
882            _ => {
883                let port = self.dedicated_ports.allocate()?;
884                let (priv_key, pub_key) = OverlayTransport::generate_keys()
885                    .await
886                    .map_err(|e| OverlaydError::Overlay(format!("Failed to generate keys: {e}")))?;
887                let iface =
888                    make_interface_name(&[&self.deployment, &self.instance_id, service], "d");
889                (priv_key, pub_key, port, iface)
890            }
891        };
892
893        // 3. Subnet: assign from the same registry Shared uses, so per-service
894        //    subnets stay globally unique regardless of mode.
895        self.ensure_service_subnet_registry()?;
896        let subnet: ipnet::IpNet = {
897            let registry = self
898                .service_subnet_registry
899                .as_mut()
900                .expect("ensure_service_subnet_registry leaves Some");
901            let node_key = self.local_node_id.to_string();
902            registry.assign(service, &node_key).map_err(|e| {
903                OverlaydError::Overlay(format!(
904                    "ServiceSubnetRegistry::assign({service}, {node_key}) failed: {e}"
905                ))
906            })?
907        };
908        let overlay_ip = first_usable_ip(subnet);
909
910        // 4. Build + bring up the dedicated transport. The device's overlay CIDR
911        //    is the service subnet (so boringtun routes that subnet over THIS
912        //    device), and its listen port is the dedicated port.
913        let config = self.build_config(
914            private_key.clone(),
915            public_key.clone(),
916            overlay_ip,
917            subnet.prefix_len(),
918            listen_port,
919        );
920        let mut transport = OverlayTransport::new(config, iface_hint);
921        transport.create_interface().await.map_err(|e| {
922            OverlaydError::Overlay(format!(
923                "Failed to create dedicated overlay for {service}: {e}"
924            ))
925        })?;
926        transport.configure(&[]).await.map_err(|e| {
927            OverlaydError::Overlay(format!(
928                "Failed to configure dedicated overlay for {service}: {e}"
929            ))
930        })?;
931        let actual_iface = transport.interface_name().to_string();
932
933        // 5. Persist the marker so the identity survives restarts. Match the
934        //    base/Shared entry shape (owner/kind/name/id/subnet) plus the
935        //    dedicated WG fields.
936        let mut marker = NetworkState::load(&marker_path);
937        marker.upsert(ManagedNetwork {
938            owner: owner_for_service(service),
939            kind: "wg-dedicated".to_string(),
940            name: actual_iface.clone(),
941            id: public_key.clone(),
942            subnet: subnet.to_string(),
943            wg_port: Some(listen_port),
944            wg_private_key: Some(private_key),
945            wg_public_key: Some(public_key.clone()),
946            interface: Some(actual_iface.clone()),
947        });
948        if let Err(e) = marker.save(&marker_path) {
949            tracing::warn!(service = %service, error = %e, path = %marker_path.display(), "failed to persist dedicated-overlay marker (device still live)");
950        }
951
952        // 6. Record the live transport.
953        self.service_transports.insert(
954            service.to_string(),
955            ServiceTransport {
956                transport,
957                interface: actual_iface.clone(),
958                public_key: public_key.clone(),
959                listen_port,
960                overlay_ip,
961                subnet,
962            },
963        );
964
965        tracing::info!(
966            service = %service,
967            interface = %actual_iface,
968            listen_port,
969            subnet = %subnet,
970            overlay_ip = %overlay_ip,
971            "Dedicated per-service overlay device created"
972        );
973
974        // ----- platform-gated attachment -----
975        // `name` in the returned info is the container-attach handle: the bridge
976        // name on Linux, the dedicated interface elsewhere.
977        let name = self
978            .attach_dedicated_service(service, subnet, overlay_ip)
979            .await?;
980
981        Ok(dedicated_overlay_info(
982            name,
983            &public_key,
984            listen_port,
985            overlay_ip,
986            subnet,
987        ))
988    }
989
990    /// Linux attachment for a dedicated per-service overlay: create the same
991    /// per-service bridge Shared uses, but route the service subnet over the
992    /// DEDICATED device rather than the cluster device.
993    ///
994    /// Concretely, the dedicated transport's overlay CIDR already covers
995    /// `subnet` (set at `build_config` time in the core), so boringtun routes
996    /// `subnet` out the dedicated TUN; we additionally plumb `subnet` onto this
997    /// node's own `AllowedIPs` entry on the dedicated device so locally
998    /// originated packets to the subnet are accepted. Returns the bridge name.
999    ///
1000    /// # Errors
1001    /// Returns an error if the bridge cannot be created.
1002    #[cfg(target_os = "linux")]
1003    async fn attach_dedicated_service(
1004        &mut self,
1005        service: &str,
1006        subnet: ipnet::IpNet,
1007        overlay_ip: IpAddr,
1008    ) -> Result<String, OverlaydError> {
1009        let _ = overlay_ip;
1010        let bridge_name = self.create_service_bridge(service, subnet).await?;
1011
1012        // Plumb the service subnet onto the DEDICATED device (not the cluster
1013        // device). The dedicated transport's overlay CIDR already routes the
1014        // subnet out its TUN; adding it to our own pubkey's AllowedIPs keeps the
1015        // local-accept side consistent with the Shared path's cluster plumbing.
1016        if let Some(st) = self.service_transports.get(service) {
1017            if let Some(ref pubkey) = self.local_wg_pubkey {
1018                if let Err(e) = st.transport.add_allowed_ip(pubkey, subnet).await {
1019                    tracing::warn!(
1020                        service = %service,
1021                        subnet = %subnet,
1022                        error = %e,
1023                        "Failed to add service subnet to dedicated transport AllowedIPs (non-fatal)"
1024                    );
1025                }
1026            } else {
1027                tracing::debug!(service = %service, "local_wg_pubkey not yet set; skipping dedicated AllowedIPs update");
1028            }
1029        }
1030
1031        Ok(bridge_name)
1032    }
1033
1034    /// Windows attachment for a dedicated per-service overlay.
1035    ///
1036    /// The cross-platform core has already stood up the dedicated Wintun
1037    /// transport (the encrypted node-to-node path for the service subnet). This
1038    /// adds the *container-facing* side: a per-service HCN **Internal** network
1039    /// onto which the agent's containers attach (instead of the node's shared
1040    /// base overlay network), so dedicated-service traffic is isolated at the
1041    /// vSwitch layer. Returns the per-service network's name, which the caller
1042    /// records as the [`ServiceOverlayInfo::name`] attach handle.
1043    ///
1044    /// # Errors
1045    /// Propagates any error from [`Self::ensure_service_network`].
1046    #[cfg(target_os = "windows")]
1047    async fn attach_dedicated_service(
1048        &mut self,
1049        service: &str,
1050        subnet: ipnet::IpNet,
1051        _overlay_ip: IpAddr,
1052    ) -> Result<String, OverlaydError> {
1053        // Create (or reuse) the per-service Internal HCN network. The returned
1054        // GUID is recorded in the marker under `owner_for_service(service)`;
1055        // the `AttachContainer` handler reuses it via the same marker lookup.
1056        let _net_id = self.ensure_service_network(service, subnet).await?;
1057        // The attach handle reported back is the per-service network's name.
1058        let daemon_name = self.deployment_or_default();
1059        Ok(format!(
1060            "{}-svc-{service}",
1061            overlay_network_name(&daemon_name)
1062        ))
1063    }
1064
1065    /// macOS attachment for a dedicated per-service overlay: the cross-platform
1066    /// core already brought up a utun device; there is no bridge, so the
1067    /// interface name itself is the attach handle.
1068    #[cfg(all(not(target_os = "linux"), not(target_os = "windows")))]
1069    #[allow(clippy::unused_async)]
1070    async fn attach_dedicated_service(
1071        &mut self,
1072        service: &str,
1073        _subnet: ipnet::IpNet,
1074        _overlay_ip: IpAddr,
1075    ) -> Result<String, OverlaydError> {
1076        let iface = self
1077            .service_transports
1078            .get(service)
1079            .map(|st| st.interface.clone())
1080            .unwrap_or_default();
1081        Ok(iface)
1082    }
1083
1084    /// Tear down the per-service segment for `service`. Idempotent.
1085    // Only the Linux body awaits (netlink + cluster AllowedIPs); other targets
1086    // are synchronous (transport shutdown is sync) but must keep the async
1087    // signature for the dispatch call.
1088    #[cfg_attr(not(target_os = "linux"), allow(clippy::unused_async))]
1089    async fn teardown_service_overlay(&mut self, service: &str) {
1090        // Shared-mode segment teardown (bridge on Linux, placeholder elsewhere).
1091        #[cfg(target_os = "linux")]
1092        {
1093            let removed = self.service_bridges.remove(service);
1094            self.service_interfaces.remove(service);
1095            if let Some(bridge) = removed {
1096                if let Some(ref cluster) = self.global_transport {
1097                    if let Some(ref pubkey) = self.local_wg_pubkey {
1098                        if let Err(e) = cluster.remove_allowed_ip(pubkey, bridge.subnet).await {
1099                            tracing::warn!(
1100                                service = %service,
1101                                subnet = %bridge.subnet,
1102                                error = %e,
1103                                "Failed to remove service subnet from cluster AllowedIPs (non-fatal)"
1104                            );
1105                        }
1106                    }
1107                }
1108
1109                if let Err(e) = crate::netlink::delete_bridge(&bridge.name).await {
1110                    tracing::warn!(service = %service, bridge = %bridge.name, error = %e, "delete_bridge failed (non-fatal)");
1111                }
1112
1113                if let Some(registry) = self.service_subnet_registry.as_mut() {
1114                    let node_key = self.local_node_id.to_string();
1115                    let _ = registry.release(service, &node_key);
1116                }
1117
1118                tracing::info!(service = %service, bridge = %bridge.name, "Tore down service bridge");
1119            }
1120        }
1121        #[cfg(not(target_os = "linux"))]
1122        {
1123            if let Some(iface) = self.service_interfaces.remove(service) {
1124                tracing::info!(service = %service, interface = %iface, "Removed service overlay interface (placeholder, non-Linux)");
1125            }
1126        }
1127
1128        // Dedicated-mode teardown (cross-platform): tear down the per-service
1129        // transport, free its port, and drop its marker entry. No-op when the
1130        // service ran in Shared mode (nothing in `service_transports`).
1131        if let Some(mut st) = self.service_transports.remove(service) {
1132            st.transport.shutdown();
1133            self.dedicated_ports.release(st.listen_port);
1134
1135            // Release the subnet assignment (Shared releases it inside the
1136            // Linux block above; the dedicated subnet lives in the same
1137            // registry, so release it here for the dedicated case on every OS).
1138            if let Some(registry) = self.service_subnet_registry.as_mut() {
1139                let node_key = self.local_node_id.to_string();
1140                let _ = registry.release(service, &node_key);
1141            }
1142
1143            let marker_path =
1144                zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();
1145            let mut marker = NetworkState::load(&marker_path);
1146            let removed_entry = marker.remove(&owner_for_service(service));
1147            if removed_entry.is_some() {
1148                if let Err(e) = marker.save(&marker_path) {
1149                    tracing::warn!(service = %service, error = %e, path = %marker_path.display(), "failed to persist dedicated-overlay marker removal");
1150                }
1151            }
1152
1153            // Windows: delete the per-service HCN Internal network this service
1154            // owned. The marker entry's `id` is the bare HCN GUID (set by
1155            // `ensure_service_network`); delete the network so a dedicated
1156            // service tears down cleanly without waiting for a full uninstall.
1157            // Also drop the per-service container-IP allocator.
1158            #[cfg(target_os = "windows")]
1159            {
1160                self.service_ip_allocators.remove(service);
1161                if let Some(entry) = removed_entry.as_ref() {
1162                    if entry.kind == "hcn-internal" {
1163                        if let Ok(guid) = windows::core::GUID::try_from(entry.id.as_str()) {
1164                            match zlayer_hns::network::Network::delete(guid) {
1165                                Ok(()) => {
1166                                    tracing::info!(service = %service, id = %entry.id, "deleted per-service HCN network");
1167                                }
1168                                Err(e) => {
1169                                    tracing::warn!(service = %service, id = %entry.id, error = %e, "failed to delete per-service HCN network (may leak until uninstall)");
1170                                }
1171                            }
1172                        } else {
1173                            tracing::warn!(service = %service, id = %entry.id, "per-service marker has unparseable HCN GUID; skipping network delete");
1174                        }
1175                    }
1176                }
1177            }
1178            #[cfg(not(target_os = "windows"))]
1179            drop(removed_entry);
1180
1181            tracing::info!(
1182                service = %service,
1183                interface = %st.interface,
1184                listen_port = st.listen_port,
1185                "Tore down dedicated per-service overlay device"
1186            );
1187        }
1188    }
1189
1190    /// Initialize the local fallback `ServiceSubnetRegistry` from the configured
1191    /// cluster CIDR. Called on first `setup_service_overlay` use.
1192    ///
1193    /// # Errors
1194    /// Returns an error when no cluster CIDR is configured or the registry
1195    /// cannot be built.
1196    fn ensure_service_subnet_registry(&mut self) -> Result<(), OverlaydError> {
1197        use zlayer_overlay::allocator::ServiceSubnetRegistry;
1198
1199        if self.service_subnet_registry.is_some() {
1200            return Ok(());
1201        }
1202        let cluster_cidr = self.cluster_cidr.ok_or_else(|| {
1203            OverlaydError::Other(
1204                "service subnet registry needs a cluster CIDR (SetupGlobalOverlay first)"
1205                    .to_string(),
1206            )
1207        })?;
1208        let cluster_ipnet: ipnet::IpNet = cluster_cidr.to_string().parse().map_err(|e| {
1209            OverlaydError::Other(format!(
1210                "failed to convert cluster CIDR {cluster_cidr} to ipnet::IpNet: {e}"
1211            ))
1212        })?;
1213        let slice_prefix: u8 = match cluster_ipnet {
1214            ipnet::IpNet::V4(_) => 28,
1215            ipnet::IpNet::V6(_) => 120,
1216        };
1217        let registry = ServiceSubnetRegistry::new(cluster_ipnet, slice_prefix).map_err(|e| {
1218            OverlaydError::Other(format!("failed to build ServiceSubnetRegistry: {e}"))
1219        })?;
1220        self.service_subnet_registry = Some(registry);
1221        Ok(())
1222    }
1223
1224    // -- IP allocation -------------------------------------------------------
1225
1226    /// Allocate an overlay IP from the per-service bridge (Linux) or the node
1227    /// slice (otherwise). `join_global` reserves a second global-overlay IP too,
1228    /// matching the eth1 attach behavior.
1229    ///
1230    /// # Errors
1231    /// Returns an error if the relevant pool is exhausted.
1232    fn allocate_ip(&mut self, service: &str, join_global: bool) -> Result<IpAddr, OverlaydError> {
1233        // `join_global` does not allocate a second IP here: the companion
1234        // global-overlay IP (eth1) is reserved at attach time. `AllocateIp`
1235        // returns only the primary (service / slice) IP the caller asked for.
1236        let _ = join_global;
1237        #[cfg(target_os = "linux")]
1238        {
1239            if let Some(bridge) = self.service_bridges.get_mut(service) {
1240                return bridge.ip_allocator.allocate().ok_or_else(|| {
1241                    OverlaydError::Overlay(format!(
1242                        "service bridge {} subnet {} exhausted",
1243                        bridge.name, bridge.subnet
1244                    ))
1245                });
1246            }
1247        }
1248        let _ = service;
1249        self.ip_allocator.allocate()
1250    }
1251
1252    /// Return an overlay IP to the allocator (service-bridge pool when known,
1253    /// otherwise the node slice).
1254    fn release_ip(&mut self, ip: IpAddr) {
1255        #[cfg(target_os = "linux")]
1256        {
1257            for bridge in self.service_bridges.values_mut() {
1258                if bridge.subnet.contains(&ip) {
1259                    bridge.ip_allocator.release(ip);
1260                    return;
1261                }
1262            }
1263        }
1264        self.ip_allocator.release(ip);
1265    }
1266
1267    // -- container attach (Linux) -------------------------------------------
1268
1269    /// Wire a container into the overlay and return its [`AttachResult`].
1270    ///
1271    /// # Errors
1272    /// Returns an error if the container cannot be attached.
1273    async fn attach_container(
1274        &mut self,
1275        handle: AttachHandle,
1276        service: &str,
1277        join_global: bool,
1278        dns_server: Option<IpAddr>,
1279        dns_domain: Option<String>,
1280    ) -> Result<AttachResult, OverlaydError> {
1281        // Record the overlay DNS resolver/zone the main daemon staged for this
1282        // node so later attaches (and the Windows HCN endpoint `Dns` schema)
1283        // can fall back to them when a per-attach value isn't supplied.
1284        if let Some(server) = dns_server {
1285            self.dns_server_addr = Some(SocketAddr::new(server, 53));
1286        }
1287        if dns_domain.is_some() {
1288            self.dns_domain.clone_from(&dns_domain);
1289        }
1290        match handle {
1291            AttachHandle::LinuxPid { pid } => {
1292                let ip = self
1293                    .attach_container_linux(pid, service, join_global)
1294                    .await?;
1295                Ok(AttachResult {
1296                    ip,
1297                    namespace_guid: None,
1298                })
1299            }
1300            AttachHandle::WindowsContainer { container_id, ip } => {
1301                self.attach_container_windows(&container_id, service, ip, dns_server, dns_domain)
1302                    .await
1303            }
1304            AttachHandle::GuestManaged { .. } => Err(OverlaydError::Other(
1305                "guest-managed attach must go through attach_container_guest, not attach_container"
1306                    .to_string(),
1307            )),
1308        }
1309    }
1310
1311    /// Tear down a container's overlay attachment and release its IP.
1312    ///
1313    /// # Errors
1314    /// Returns an error only if a netlink delete fails for a reason other than
1315    /// "link not found".
1316    async fn detach_container(&mut self, handle: AttachHandle) -> Result<(), OverlaydError> {
1317        match handle {
1318            AttachHandle::LinuxPid { pid } => self.detach_container_linux(pid).await,
1319            AttachHandle::WindowsContainer { container_id, .. } => {
1320                self.detach_container_windows(&container_id).await
1321            }
1322            AttachHandle::GuestManaged { .. } => Err(OverlaydError::Other(
1323                "guest-managed detach must go through detach_container_guest, not detach_container"
1324                    .to_string(),
1325            )),
1326        }
1327    }
1328
1329    // -- container attach (guest-managed) -----------------------------------
1330
1331    /// Guest-managed overlay attach: allocate the overlay identity for a VM guest
1332    /// that brings up its own kernel `WireGuard` device.
1333    ///
1334    /// overlayd cannot enter the guest's network namespace (it is a VM, not a
1335    /// host process), so instead of a veth/HCN endpoint it:
1336    /// 1. allocates the overlay IP from the SAME pool the Linux attach uses (the
1337    ///    per-service bridge pool when one exists, otherwise the node slice) so
1338    ///    guest addresses never collide with container addresses;
1339    /// 2. generates a fresh `WireGuard` keypair for the guest;
1340    /// 3. builds the peer set the guest must configure — every GLOBAL peer the
1341    ///    host already knows, plus THIS node itself (so the guest can reach the
1342    ///    host node over the overlay; carries a keepalive so the guest keeps its
1343    ///    NAT mapping open from behind VZ NAT);
1344    /// 4. registers the generated public key as a GLOBAL peer (host route to the
1345    ///    guest, roaming endpoint learned from the guest's keepalive) so remote
1346    ///    nodes and this node route to it;
1347    /// 5. records the attachment keyed by `id` so `DetachContainer` can release
1348    ///    the IP and remove the peer.
1349    ///
1350    /// Platform-agnostic: pure IPAM + keygen + peer bookkeeping (no netns/veth/
1351    /// HCN), so it compiles and runs on macOS (where the overlayd serving a VZ
1352    /// host lives) as well as Linux.
1353    ///
1354    /// # Errors
1355    /// Returns an error if the global overlay is not set up, the IP pool is
1356    /// exhausted, key generation fails, or registering the guest peer fails.
1357    #[allow(clippy::cast_possible_truncation)]
1358    async fn attach_container_guest(
1359        &mut self,
1360        id: &str,
1361        service: &str,
1362        join_global: bool,
1363        dns_server: Option<IpAddr>,
1364        dns_domain: Option<String>,
1365    ) -> Result<GuestOverlayConfig, OverlaydError> {
1366        // The global transport must exist: we both register the guest as a peer
1367        // on it and advertise this node (its public key + listen port) to the
1368        // guest. Resolve both up front so we fail before allocating anything.
1369        let node_public_key = self.transport_public_key.clone().ok_or_else(|| {
1370            OverlaydError::Other(
1371                "guest-managed attach requires the global overlay to be set up first \
1372                 (no node WireGuard public key)"
1373                    .to_string(),
1374            )
1375        })?;
1376        if self.global_transport.is_none() {
1377            return Err(OverlaydError::Other(
1378                "guest-managed attach requires the global overlay to be set up first \
1379                 (no global transport)"
1380                    .to_string(),
1381            ));
1382        }
1383
1384        // 1. Allocate the overlay IP from the same pool the Linux attach uses and
1385        //    derive the prefix length from that pool's network. On Linux a
1386        //    per-service bridge (when present) supplies both the IP and its
1387        //    subnet's prefix; otherwise (and on every non-Linux host) the node
1388        //    slice / cluster CIDR does.
1389        let (overlay_ip, prefix_len, pool_service): (IpAddr, u8, Option<String>) = {
1390            #[cfg(target_os = "linux")]
1391            {
1392                if let Some(bridge) = self.service_bridges.get_mut(service) {
1393                    let ip = bridge.ip_allocator.allocate().ok_or_else(|| {
1394                        OverlaydError::Overlay(format!(
1395                            "service bridge {} subnet {} exhausted",
1396                            bridge.name, bridge.subnet
1397                        ))
1398                    })?;
1399                    let prefix = bridge.subnet.prefix_len();
1400                    (ip, prefix, Some(service.to_string()))
1401                } else {
1402                    let ip = self.ip_allocator.allocate()?;
1403                    (ip, self.slice_prefix_len(), None)
1404                }
1405            }
1406            #[cfg(not(target_os = "linux"))]
1407            {
1408                let _ = service;
1409                let ip = self.ip_allocator.allocate()?;
1410                (ip, self.slice_prefix_len(), None)
1411            }
1412        };
1413        // `join_global` is informational for a guest-managed attach: the guest's
1414        // single WireGuard device IS its global-overlay endpoint, so there is no
1415        // separate eth1 IP to reserve. Touch it so callers stay consistent with
1416        // the Linux/Windows handles.
1417        let _ = join_global;
1418
1419        // 2. Generate the guest's WireGuard keypair (reuse the transport's
1420        //    native x25519 keygen — never reimplement curve25519 here).
1421        let (private_key, public_key) = OverlayTransport::generate_keys().await.map_err(|e| {
1422            // Roll back the IP allocation so a keygen failure leaks nothing.
1423            self.release_guest_ip(overlay_ip, pool_service.as_deref());
1424            OverlaydError::Overlay(format!("failed to generate guest keys: {e}"))
1425        })?;
1426
1427        // 3. Build the peer set. A VZ guest is behind the host's NAT and can only
1428        //    reach the LOCAL node (via its NAT gateway) — it cannot dial other
1429        //    nodes' or sibling guests' endpoints directly. So it gets exactly ONE
1430        //    peer: this node, with AllowedIPs covering the whole cluster CIDR.
1431        //    ALL overlay traffic (including to sibling containers and remote
1432        //    nodes) routes through this node, which forwards/hairpins it (the
1433        //    node already holds a /32 peer for every container — step 4 — and the
1434        //    real inter-node peers). We deliberately do NOT add the per-guest /32
1435        //    peers here: a /32 with no reachable endpoint would win longest-prefix
1436        //    routing and black-hole sibling traffic. The endpoint returned here is
1437        //    the node's overlay IP as a placeholder; the VZ runtime rewrites it to
1438        //    the guest's NAT gateway (the only host address the guest can reach)
1439        //    before delivering the config. Keepalive holds the guest's NAT mapping
1440        //    open so the node can reach back.
1441        let node_allowed = self
1442            .cluster_cidr
1443            .or(self.slice_cidr)
1444            .map_or_else(|| String::from("0.0.0.0/0"), |c| c.to_string());
1445        let node_endpoint = self.node_endpoint_for_guest();
1446        let peers: Vec<PeerSpec> = vec![PeerSpec {
1447            public_key: node_public_key,
1448            endpoint: node_endpoint,
1449            allowed_ips: node_allowed,
1450            persistent_keepalive_secs: 25,
1451        }];
1452
1453        // 4. Register the guest's public key as a GLOBAL peer (host route to the
1454        //    guest at <overlay_ip>/32, roaming endpoint learned from keepalive).
1455        //    Go through the same internal path `AddPeer { Global }` uses.
1456        let host_route = format!(
1457            "{}/{}",
1458            overlay_ip,
1459            if overlay_ip.is_ipv6() { 128 } else { 32 }
1460        );
1461        let guest_peer = PeerSpec {
1462            public_key: public_key.clone(),
1463            // Empty/roaming: the guest is behind NAT; boringtun learns its source
1464            // endpoint from the guest's first keepalive. `0.0.0.0:0` is the
1465            // wire-safe "unset endpoint" sentinel that still parses as a
1466            // SocketAddr (peer_spec_to_info requires a parseable endpoint).
1467            endpoint: "0.0.0.0:0".to_string(),
1468            allowed_ips: host_route,
1469            persistent_keepalive_secs: 0,
1470        };
1471        let guest_peer_info = peer_spec_to_info(&guest_peer)?;
1472        {
1473            let transport = self.transport_for_scope(&PeerScope::Global)?;
1474            if let Err(e) = Self::add_peer_on(transport, &guest_peer_info).await {
1475                self.release_guest_ip(overlay_ip, pool_service.as_deref());
1476                return Err(e);
1477            }
1478        }
1479        // Track it among the global peers (so a *subsequent* guest attach also
1480        // learns about this guest) and record the attachment for detach.
1481        self.global_peers
1482            .insert(public_key.clone(), guest_peer.clone());
1483        self.guest_attachments.insert(
1484            id.to_string(),
1485            GuestAttachInfo {
1486                overlay_ip,
1487                public_key: public_key.clone(),
1488                service_name: pool_service,
1489            },
1490        );
1491
1492        // 5. Return the config the caller ships into the guest.
1493        Ok(GuestOverlayConfig {
1494            overlay_ip,
1495            prefix_len,
1496            private_key,
1497            public_key,
1498            // The guest's device listens on the node's overlay WG port (the
1499            // convention every overlay device on this node uses).
1500            listen_port: self.overlay_port,
1501            peers,
1502            dns_server: dns_server.or_else(|| self.dns_server_addr.map(|s| s.ip())),
1503            dns_domain: dns_domain.or_else(|| self.dns_domain.clone()),
1504        })
1505    }
1506
1507    /// Release a guest-managed attach by `id`: drop the host route + global peer
1508    /// and return the allocated IP to its pool. Idempotent.
1509    ///
1510    /// # Errors
1511    /// Returns an error only if removing the peer from the global transport fails
1512    /// for a reason other than "peer not found".
1513    async fn detach_container_guest(&mut self, id: &str) -> Result<(), OverlaydError> {
1514        let Some(info) = self.guest_attachments.remove(id) else {
1515            return Ok(());
1516        };
1517        // Remove the guest's global peer (mirror the RemovePeer { Global } path).
1518        self.global_peers.remove(&info.public_key);
1519        if let Ok(transport) = self.transport_for_scope(&PeerScope::Global) {
1520            if let Err(e) = Self::remove_peer_on(transport, &info.public_key).await {
1521                tracing::warn!(
1522                    guest = %id,
1523                    pubkey = %info.public_key,
1524                    error = %e,
1525                    "failed to remove guest peer from global transport"
1526                );
1527            }
1528        }
1529        // Return the IP to whichever pool it came from.
1530        self.release_guest_ip(info.overlay_ip, info.service_name.as_deref());
1531        Ok(())
1532    }
1533
1534    /// Release a guest overlay IP back to the pool it was drawn from: the named
1535    /// service bridge's allocator (Linux) when `service` is set and the bridge
1536    /// still exists, otherwise the node slice allocator.
1537    fn release_guest_ip(&mut self, ip: IpAddr, service: Option<&str>) {
1538        #[cfg(target_os = "linux")]
1539        {
1540            if let Some(svc) = service {
1541                if let Some(bridge) = self.service_bridges.get_mut(svc) {
1542                    bridge.ip_allocator.release(ip);
1543                    return;
1544                }
1545            }
1546        }
1547        let _ = service;
1548        self.ip_allocator.release(ip);
1549    }
1550
1551    /// Prefix length of the address pool guest IPs are drawn from when not using
1552    /// a per-service bridge: the node slice if assigned, else the cluster CIDR.
1553    fn slice_prefix_len(&self) -> u8 {
1554        self.slice_cidr.or(self.cluster_cidr).map_or(
1555            if self.node_ip.is_some_and(|ip| ip.is_ipv6()) {
1556                64
1557            } else {
1558                24
1559            },
1560            |c| c.prefix(),
1561        )
1562    }
1563
1564    /// Reachable `WireGuard` endpoint for THIS node, advertised to a guest as a
1565    /// peer. overlayd has no public reflexive address at this layer, so it uses
1566    /// the node's overlay-listen identity (`node_ip:overlay_port`); the caller
1567    /// (the VZ runtime that ships the config into the guest) rewrites it to the
1568    /// concrete VZ-NAT gateway endpoint the guest can dial. Falls back to the
1569    /// unspecified address when no node IP is assigned yet.
1570    fn node_endpoint_for_guest(&self) -> String {
1571        let ip = self.node_ip.unwrap_or(IpAddr::V4(Ipv4Addr::UNSPECIFIED));
1572        SocketAddr::new(ip, self.overlay_port).to_string()
1573    }
1574
1575    /// Linux veth/netns attach. On non-Linux this returns the node's overlay IP
1576    /// (host networking) and is never wired for a `LinuxPid` handle in practice.
1577    #[cfg(target_os = "linux")]
1578    async fn attach_container_linux(
1579        &mut self,
1580        container_pid: u32,
1581        service: &str,
1582        join_global: bool,
1583    ) -> Result<IpAddr, OverlaydError> {
1584        // Look up the per-service bridge.
1585        let (bridge_name, bridge_subnet, bridge_gateway, container_ip) = {
1586            let bridge = self.service_bridges.get_mut(service).ok_or_else(|| {
1587                OverlaydError::Other(format!(
1588                    "no service bridge for service {service}; call setup_service_overlay() first"
1589                ))
1590            })?;
1591            let ip = bridge.ip_allocator.allocate().ok_or_else(|| {
1592                OverlaydError::Overlay(format!(
1593                    "service bridge {} subnet {} exhausted",
1594                    bridge.name, bridge.subnet
1595                ))
1596            })?;
1597            (bridge.name.clone(), bridge.subnet, bridge.gateway, ip)
1598        };
1599
1600        let bridge_params = BridgeAttachParams {
1601            bridge_name: &bridge_name,
1602            gateway: bridge_gateway,
1603            subnet_prefix_len: bridge_subnet.prefix_len(),
1604        };
1605        if let Err(e) = self
1606            .attach_to_interface(
1607                container_pid,
1608                container_ip,
1609                "s",
1610                "eth0",
1611                Some(&bridge_params),
1612            )
1613            .await
1614        {
1615            if let Some(bridge) = self.service_bridges.get_mut(service) {
1616                bridge.ip_allocator.release(container_ip);
1617            }
1618            return Err(e);
1619        }
1620
1621        let mut global_ip: Option<IpAddr> = None;
1622        if join_global && self.global_interface.is_some() {
1623            let g_ip = self.ip_allocator.allocate()?;
1624            self.attach_to_interface(container_pid, g_ip, "g", "eth1", None)
1625                .await?;
1626            global_ip = Some(g_ip);
1627        }
1628
1629        self.attached.insert(
1630            container_pid,
1631            AttachInfo {
1632                service_ip: container_ip,
1633                service_name: Some(service.to_string()),
1634                global_ip,
1635                joined_global: global_ip.is_some(),
1636            },
1637        );
1638
1639        Ok(container_ip)
1640    }
1641
1642    /// Non-Linux fallback: containers share the host network, so return the
1643    /// node's overlay IP (or loopback).
1644    #[cfg(not(target_os = "linux"))]
1645    #[allow(clippy::unused_async)]
1646    async fn attach_container_linux(
1647        &mut self,
1648        _container_pid: u32,
1649        service: &str,
1650        _join_global: bool,
1651    ) -> Result<IpAddr, OverlaydError> {
1652        tracing::debug!(service = %service, "LinuxPid attach is a no-op off Linux; using node overlay IP");
1653        Ok(self.node_ip.unwrap_or(IpAddr::V4(Ipv4Addr::LOCALHOST)))
1654    }
1655
1656    /// Release the overlay resources held by a Linux container PID. Idempotent.
1657    #[cfg(target_os = "linux")]
1658    async fn detach_container_linux(&mut self, pid: u32) -> Result<(), OverlaydError> {
1659        let Some(info) = self.attached.remove(&pid) else {
1660            return Ok(());
1661        };
1662
1663        let veth_s = format!("veth-{pid}-s");
1664        if let Err(e) = crate::netlink::delete_link_by_name(&veth_s).await {
1665            tracing::warn!(link = %veth_s, pid, error = %e, "Failed to delete service veth");
1666        }
1667        if info.joined_global {
1668            let veth_g = format!("veth-{pid}-g");
1669            if let Err(e) = crate::netlink::delete_link_by_name(&veth_g).await {
1670                tracing::warn!(link = %veth_g, pid, error = %e, "Failed to delete global veth");
1671            }
1672        }
1673
1674        if let Some(svc) = info.service_name.as_deref() {
1675            if let Some(bridge) = self.service_bridges.get_mut(svc) {
1676                bridge.ip_allocator.release(info.service_ip);
1677            } else {
1678                tracing::debug!(service = %svc, ip = %info.service_ip, "detach: service bridge already torn down; dropping service IP release");
1679            }
1680        } else {
1681            self.ip_allocator.release(info.service_ip);
1682        }
1683        if let Some(g) = info.global_ip {
1684            self.ip_allocator.release(g);
1685        }
1686        Ok(())
1687    }
1688
1689    /// Non-Linux fallback: nothing to detach (host networking).
1690    #[cfg(not(target_os = "linux"))]
1691    #[allow(clippy::unused_async)]
1692    async fn detach_container_linux(&mut self, _pid: u32) -> Result<(), OverlaydError> {
1693        Ok(())
1694    }
1695
1696    /// Best-effort sweep of orphan veth endpoints whose owning container PID is
1697    /// no longer alive. Names matching `veth-<pid>-*` / `vc-<pid>-*` where
1698    /// `/proc/<pid>` does not exist are deleted.
1699    #[cfg(target_os = "linux")]
1700    async fn sweep_orphan_veths() {
1701        let links = match crate::netlink::list_all_links().await {
1702            Ok(links) => links,
1703            Err(e) => {
1704                tracing::warn!(error = %e, "Failed to list links for orphan sweep");
1705                return;
1706            }
1707        };
1708        for (_index, name) in links {
1709            let remainder = if let Some(r) = name.strip_prefix("veth-") {
1710                r
1711            } else if let Some(r) = name.strip_prefix("vc-") {
1712                r
1713            } else {
1714                continue;
1715            };
1716            let Some(pid_str) = remainder.split('-').next() else {
1717                continue;
1718            };
1719            let pid: u32 = match pid_str.parse() {
1720                Ok(p) => p,
1721                Err(_) => continue,
1722            };
1723            if Path::new(&format!("/proc/{pid}")).exists() {
1724                continue;
1725            }
1726            tracing::info!(link = %name, pid = pid, "Deleting orphan veth");
1727            if let Err(e) = crate::netlink::delete_link_by_name(&name).await {
1728                tracing::warn!(link = %name, error = %e, "Failed to delete orphan veth");
1729            }
1730        }
1731    }
1732
    /// Wire one veth leg between the host and the network namespace of
    /// `container_pid`.
    ///
    /// Creates the pair `veth-<pid>-<tag>` (host side) / `vc-<pid>-<tag>`
    /// (pending side), moves the pending side into the namespace opened from
    /// `/proc/<pid>/ns/net` under the name `container_iface`, assigns `ip` to it
    /// and brings it (and `lo`) up. On the host side the veth is brought up and
    /// then either enslaved to the bridge named in `bridge`, or — when `bridge`
    /// is `None` — given a host route for `ip` via the veth.
    ///
    /// * `tag` distinguishes multiple legs of the same container in the link
    ///   names (callers in this file pass `"s"` and `"g"`).
    /// * `bridge`, when present, also supplies the in-container prefix length
    ///   and the default-route gateway; without it the prefix defaults to `/64`
    ///   for IPv6 and `/24` for IPv4 and no default route is installed.
    ///
    /// # Errors
    /// Returns an error if the namespace cannot be opened, stale links cannot
    /// be removed, or any netlink step fails. On failure after the pre-cleanup,
    /// both veth names are deleted again on a best-effort basis.
    #[cfg(target_os = "linux")]
    #[allow(clippy::too_many_lines)]
    async fn attach_to_interface(
        &self,
        container_pid: u32,
        ip: IpAddr,
        tag: &str,
        container_iface: &str,
        bridge: Option<&BridgeAttachParams<'_>>,
    ) -> Result<(), OverlaydError> {
        // Best-effort cleanup of orphan veths left by a previous daemon crash.
        Self::sweep_orphan_veths().await;

        let is_v6 = ip.is_ipv6();
        // Prefix assigned inside the container: the bridge subnet's when
        // attaching to a bridge, otherwise a per-family default.
        let prefix_len: u8 = if let Some(b) = bridge {
            b.subnet_prefix_len
        } else if is_v6 {
            64
        } else {
            24
        };
        // Full-length prefix used for the per-container host route.
        let host_prefix: u8 = if is_v6 { 128 } else { 32 };

        // Host end, temporary name of the container end, and the final name the
        // container end gets once inside the namespace.
        let veth_host = format!("veth-{container_pid}-{tag}");
        let veth_pending = format!("vc-{container_pid}-{tag}");
        let veth_container = container_iface.to_string();

        // Open the namespace handle up front so a vanished PID fails before any
        // link is created.
        let container_ns_fd = std::os::fd::OwnedFd::from(
            std::fs::File::open(format!("/proc/{container_pid}/ns/net")).map_err(|e| {
                OverlaydError::Overlay(format!("Failed to open /proc/{container_pid}/ns/net: {e}"))
            })?,
        );

        // Pre-cleanup: remove links with our names left over from an earlier
        // attempt. NOTE(review): presumably `delete_link_by_name` returns Ok for
        // a missing link, otherwise every first attach would fail here — confirm.
        crate::netlink::delete_link_by_name(&veth_host)
            .await
            .map_err(|e| OverlaydError::Overlay(format!("pre-cleanup delete {veth_host}: {e}")))?;
        crate::netlink::delete_link_by_name(&veth_pending)
            .await
            .map_err(|e| {
                OverlaydError::Overlay(format!("pre-cleanup delete {veth_pending}: {e}"))
            })?;

        // Owned copies of what the async block below needs from `bridge`/`self`.
        let bridge_gateway: Option<IpAddr> = bridge.map(|b| b.gateway);
        let bridge_name: Option<String> = bridge.map(|b| b.bridge_name.to_string());
        let node_ip = self.node_ip;

        // All fallible wiring runs inside one block so a single cleanup path
        // below can undo a partial setup.
        let result: Result<(), OverlaydError> = async {
            crate::netlink::create_veth_pair(&veth_host, &veth_pending)
                .await
                .map_err(|e| OverlaydError::Overlay(format!("create veth pair: {e}")))?;

            // Move the pending end into the container namespace, renaming it to
            // the requested interface name.
            crate::netlink::move_link_into_netns_fd_and_rename(
                &veth_pending,
                AsFd::as_fd(&container_ns_fd),
                &veth_container,
            )
            .map_err(|e| OverlaydError::Overlay(format!("move veth into netns: {e}")))?;

            let vc = veth_container.clone();
            let bridge_gateway_for_netns = bridge_gateway;
            // Configure the container side on a blocking thread; the namespace
            // fd is moved into the task. NOTE(review): `with_netns_fd_async`
            // presumably runs the closure inside that namespace — confirm.
            tokio::task::spawn_blocking(move || {
                crate::netlink::with_netns_fd_async(container_ns_fd, move || async move {
                    crate::netlink::add_address_to_link_by_name(&vc, ip, prefix_len).await?;
                    crate::netlink::set_link_up_by_name(&vc).await?;
                    crate::netlink::set_link_up_by_name("lo").await?;
                    // Default route only when attaching to a bridge (which
                    // provides the gateway).
                    if let Some(gw) = bridge_gateway_for_netns {
                        crate::netlink::add_default_route_via_gateway(gw).await?;
                    }
                    Ok(())
                })
            })
            .await
            .map_err(|e| OverlaydError::Overlay(format!("container netns task panicked: {e}")))?
            .map_err(|e| OverlaydError::Overlay(format!("container netns ops: {e}")))?;

            crate::netlink::set_link_up_by_name(&veth_host)
                .await
                .map_err(|e| OverlaydError::Overlay(format!("set {veth_host} up: {e}")))?;

            if let Some(bname) = bridge_name.as_deref() {
                // Bridged leg: enslave the host end to the service bridge.
                crate::netlink::add_link_to_bridge(&veth_host, bname)
                    .await
                    .map_err(|e| {
                        OverlaydError::Overlay(format!(
                            "enslave {veth_host} to bridge {bname}: {e}"
                        ))
                    })?;
            } else {
                // Routed leg: install a host route for the container IP via the
                // host end of the veth.
                crate::netlink::replace_route_via_dev(ip, host_prefix, &veth_host, node_ip)
                    .await
                    .map_err(|e| {
                        OverlaydError::Overlay(format!("host route for {ip}/{host_prefix}: {e}"))
                    })?;
            }

            // Enable forwarding; failures are deliberately ignored.
            let _ = crate::netlink::set_sysctl("net.ipv4.ip_forward", "1");
            let _ = crate::netlink::set_sysctl("net.ipv6.conf.all.forwarding", "1");

            Ok(())
        }
        .await;

        // Best-effort rollback of whatever links were created before the error.
        if result.is_err() {
            let _ = crate::netlink::delete_link_by_name(&veth_host).await;
            let _ = crate::netlink::delete_link_by_name(&veth_pending).await;
        }
        result
    }
1841
1842    // -- container attach (Windows HCN) -------------------------------------
1843
1844    /// Windows attach: ensure the overlay HCN Internal network exists, allocate
1845    /// or validate the IP, create the per-container HCN endpoint + namespace,
1846    /// and return the bare-lowercase namespace GUID for the agent to embed in
1847    /// the compute-system document.
1848    ///
1849    /// # Errors
1850    /// Returns an error if the network/endpoint cannot be created or the slice
1851    /// is exhausted.
1852    #[cfg(target_os = "windows")]
1853    async fn attach_container_windows(
1854        &mut self,
1855        container_id: &str,
1856        service: &str,
1857        ip_override: Option<IpAddr>,
1858        dns_server: Option<IpAddr>,
1859        dns_domain: Option<String>,
1860    ) -> Result<AttachResult, OverlaydError> {
1861        // Resolve whether THIS service has a dedicated per-service overlay. It
1862        // does iff a live dedicated transport exists OR a `hcn-internal` marker
1863        // entry is recorded under `owner_for_service(service)` (the network
1864        // survives daemon restarts even if the transport map is empty mid-init).
1865        // Dedicated services attach onto their OWN per-service Internal network
1866        // and draw IPs from the service subnet; everything else uses the node's
1867        // shared base overlay network and the node slice.
1868        let dedicated_subnet = self.dedicated_service_subnet(service);
1869
1870        let (net_id, ip, prefix_length) = if let Some(svc_subnet) = dedicated_subnet {
1871            // ----- dedicated per-service network path -----
1872            let net_id = self.ensure_service_network(service, svc_subnet).await?;
1873
1874            // Allocate (or validate) the IP from the SERVICE subnet, not the
1875            // node slice. A per-service allocator is created lazily and bounded
1876            // to the service subnet so addresses stay inside the dedicated
1877            // network. An `ip_override` inside the service subnet is honored;
1878            // one outside it is rejected so a slice-allocated IP can't leak onto
1879            // the dedicated network.
1880            let svc_ipnetwork: IpNetwork = svc_subnet.to_string().parse().map_err(|e| {
1881                OverlaydError::Other(format!("failed to parse service subnet {svc_subnet}: {e}"))
1882            })?;
1883            let allocator = self
1884                .service_ip_allocators
1885                .entry(service.to_string())
1886                .or_insert_with(|| IpAllocator::new(svc_ipnetwork));
1887            let ip = match ip_override {
1888                Some(ip) if svc_subnet.contains(&ip) => ip,
1889                Some(ip) => {
1890                    return Err(OverlaydError::Other(format!(
1891                        "overridden IP {ip} is not inside dedicated service subnet {svc_subnet} for service {service}"
1892                    )));
1893                }
1894                None => allocator.allocate()?,
1895            };
1896            (net_id, ip, svc_subnet.prefix_len())
1897        } else {
1898            // ----- shared base overlay network path (unchanged) -----
1899            let slice = self.slice_cidr.ok_or_else(|| {
1900                OverlaydError::Other(
1901                    "no node slice assigned yet (SetupGlobalOverlay with slice_cidr first)"
1902                        .to_string(),
1903                )
1904            })?;
1905            let slice_ipnet: ipnet::IpNet = slice.to_string().parse().map_err(|e| {
1906                OverlaydError::Other(format!("failed to parse slice CIDR {slice}: {e}"))
1907            })?;
1908            let net_id = self.ensure_overlay_network(slice_ipnet).await?;
1909            let ip = match ip_override {
1910                Some(ip) => ip,
1911                None => self.ip_allocator.allocate()?,
1912            };
1913            (net_id, ip, slice_ipnet.prefix_len())
1914        };
1915
1916        // 3. Create the endpoint + per-container namespace on the network.
1917        let dns_server_eff = dns_server.or_else(|| self.dns_server_addr.map(|a| a.ip()));
1918        let dns_domain_for_attach = dns_domain.or_else(|| self.dns_domain.clone());
1919        let cluster_cidr = self.cluster_cidr.map(|c| c.to_string()).unwrap_or_default();
1920        let owner_tag = owner_tag(&self.deployment_or_default());
1921        let cid = container_id.to_string();
1922
1923        let attachment = tokio::task::spawn_blocking(move || {
1924            zlayer_hns::attach::EndpointAttachment::create_overlay(
1925                net_id,
1926                &owner_tag,
1927                cid.as_str(),
1928                ip,
1929                prefix_length,
1930                &cluster_cidr,
1931                dns_server_eff,
1932                dns_domain_for_attach.as_deref(),
1933            )
1934        })
1935        .await
1936        .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?
1937        .map_err(|e| OverlaydError::Overlay(format!("HCN overlay endpoint attach failed: {e}")))?;
1938
1939        let namespace_id = attachment.namespace_id();
1940        let bare_guid = format_guid_bare(namespace_id);
1941
1942        // Record for autoclean keyed by namespace GUID.
1943        self.hcn_cleanup
1944            .insert(namespace_id, (service.to_string(), ip));
1945
1946        tracing::info!(
1947            ns = %bare_guid,
1948            service = %service,
1949            ip = %ip,
1950            "Attached container to HCN overlay"
1951        );
1952
1953        Ok(AttachResult {
1954            ip,
1955            namespace_guid: Some(bare_guid),
1956        })
1957    }
1958
1959    /// Non-Windows path: a `WindowsContainer` handle has no meaning off Windows.
1960    #[cfg(not(target_os = "windows"))]
1961    #[allow(clippy::unused_async)]
1962    async fn attach_container_windows(
1963        &mut self,
1964        _container_id: &str,
1965        _service: &str,
1966        _ip_override: Option<IpAddr>,
1967        _dns_server: Option<IpAddr>,
1968        _dns_domain: Option<String>,
1969    ) -> Result<AttachResult, OverlaydError> {
1970        Err(OverlaydError::Other(
1971            "WindowsContainer attach is only supported on Windows".to_string(),
1972        ))
1973    }
1974
1975    /// Detach a Windows container by its bare namespace GUID and release its IP.
1976    /// Idempotent: unknown ids are a no-op.
1977    #[cfg(target_os = "windows")]
1978    #[allow(clippy::unused_async)]
1979    async fn detach_container_windows(
1980        &mut self,
1981        namespace_guid: &str,
1982    ) -> Result<(), OverlaydError> {
1983        use windows::core::GUID;
1984
1985        let Ok(guid) = GUID::try_from(namespace_guid) else {
1986            tracing::warn!(ns = %namespace_guid, "detach: unparseable namespace GUID");
1987            return Ok(());
1988        };
1989        if let Some((service, ip)) = self.hcn_cleanup.remove(&guid) {
1990            self.ip_allocator.release(ip);
1991            tracing::info!(ns = %namespace_guid, service = %service, ip = %ip, "Released HCN overlay attachment");
1992        }
1993        Ok(())
1994    }
1995
1996    /// Non-Windows path.
1997    #[cfg(not(target_os = "windows"))]
1998    #[allow(clippy::unused_async)]
1999    async fn detach_container_windows(
2000        &mut self,
2001        _namespace_guid: &str,
2002    ) -> Result<(), OverlaydError> {
2003        Ok(())
2004    }
2005
2006    /// Ensure the per-daemon HCN overlay (Internal vSwitch, no physical-NIC
2007    /// binding) exists on the host, reusing one recorded in the
2008    /// `{data_dir}/agent_network.json` marker or discoverable by name, and
2009    /// recording it in the marker on create.
2010    ///
2011    /// # Errors
2012    /// Propagates the underlying `zlayer_hns` error on create failure.
2013    #[cfg(target_os = "windows")]
2014    #[allow(clippy::too_many_lines)]
2015    async fn ensure_overlay_network(
2016        &mut self,
2017        slice_cidr: ipnet::IpNet,
2018    ) -> Result<windows::core::GUID, OverlaydError> {
2019        use windows::core::GUID;
2020
2021        let daemon_name = self.deployment_or_default();
2022        let net_name = overlay_network_name(&daemon_name);
2023        let marker_path =
2024            zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();
2025
2026        // Fast path: marker names a network GUID that still exists; reopen it.
2027        if let Some(recorded_id) = crate::network_state::NetworkState::load(&marker_path)
2028            .get(crate::network_state::OWNER_BASE)
2029            .and_then(|entry| GUID::try_from(entry.id.as_str()).ok())
2030        {
2031            let reopened = tokio::task::spawn_blocking(move || {
2032                zlayer_hns::network::Network::open(recorded_id).ok()
2033            })
2034            .await
2035            .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?;
2036            if reopened.is_some() {
2037                tracing::info!(name = %net_name, "reusing HCN overlay network from marker");
2038                return Ok(recorded_id);
2039            }
2040        }
2041
2042        // Idempotency: reuse a host network whose queried name matches ours.
2043        let target_name = net_name.clone();
2044        let existing = tokio::task::spawn_blocking(move || -> Option<GUID> {
2045            let guids = zlayer_hns::network::list("{}").ok()?;
2046            for guid in guids {
2047                let Ok(network) = zlayer_hns::network::Network::open(guid) else {
2048                    continue;
2049                };
2050                if matches!(network.query("{}"), Ok(props) if props.name == target_name) {
2051                    return Some(guid);
2052                }
2053            }
2054            None
2055        })
2056        .await
2057        .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?;
2058
2059        if let Some(existing_id) = existing {
2060            tracing::info!(name = %net_name, "reusing existing HCN overlay network");
2061            return Ok(existing_id);
2062        }
2063
2064        let net_id = GUID::new()
2065            .map_err(|e| OverlaydError::Other(format!("GUID::new for overlay network: {e}")))?;
2066        let subnet_str = slice_cidr.to_string();
2067
2068        // Default: an HCN Internal network — an internal vSwitch with NO
2069        // physical-NIC binding — so container traffic never touches the
2070        // operator's gateway adapter. Setting ZLAYER_HCN_UPLINK_ADAPTER opts
2071        // into the legacy Transparent model bound to that named uplink.
2072        let use_transparent = std::env::var(zlayer_hns::adapter::ZLAYER_UPLINK_ENV)
2073            .ok()
2074            .is_some_and(|v| !v.trim().is_empty());
2075
2076        let net_name_for_create = net_name.clone();
2077        let subnet_for_create = subnet_str.clone();
2078        if use_transparent {
2079            let uplink = zlayer_hns::adapter::find_primary_adapter()
2080                .map_err(|e| OverlaydError::Other(format!("find_primary_adapter: {e}")))?;
2081            tracing::warn!(uplink = %uplink, "ZLAYER_HCN_UPLINK_ADAPTER set: creating HCN *Transparent* overlay bound to a physical NIC");
2082            tokio::task::spawn_blocking(move || {
2083                zlayer_hns::network::Network::create_transparent(
2084                    net_id,
2085                    &net_name_for_create,
2086                    &subnet_for_create,
2087                    &uplink,
2088                )
2089            })
2090            .await
2091            .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?
2092            .map_err(|e| {
2093                OverlaydError::Overlay(format!("HcnCreateNetwork transparent ({net_name}): {e}"))
2094            })?;
2095        } else {
2096            tokio::task::spawn_blocking(move || {
2097                zlayer_hns::network::Network::create_internal(
2098                    net_id,
2099                    &net_name_for_create,
2100                    &subnet_for_create,
2101                )
2102            })
2103            .await
2104            .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?
2105            .map_err(|e| {
2106                OverlaydError::Overlay(format!("HcnCreateNetwork internal ({net_name}): {e}"))
2107            })?;
2108        }
2109
2110        // HCN's Static IPAM needs ~1-2s after network create to settle its
2111        // address pool; without this the first endpoint frequently fails with
2112        // HCN_E_ADDR_INVALID_OR_RESERVED.
2113        tokio::time::sleep(std::time::Duration::from_millis(2000)).await;
2114
2115        tracing::info!(
2116            subnet = %subnet_str,
2117            mode = if use_transparent { "Transparent" } else { "Internal" },
2118            "created HCN overlay network"
2119        );
2120
2121        // Persist the marker so subsequent runs reuse this network by GUID and a
2122        // full uninstall knows to delete it. Best-effort.
2123        let mut marker = crate::network_state::NetworkState::load(&marker_path);
2124        marker.upsert(crate::network_state::ManagedNetwork {
2125            owner: crate::network_state::OWNER_BASE.to_string(),
2126            kind: if use_transparent {
2127                "hcn-transparent"
2128            } else {
2129                "hcn-internal"
2130            }
2131            .to_string(),
2132            name: net_name.clone(),
2133            id: format_guid_bare(net_id),
2134            subnet: subnet_str.clone(),
2135            // Base/Shared HCN network: no dedicated WireGuard identity.
2136            wg_port: None,
2137            wg_private_key: None,
2138            wg_public_key: None,
2139            interface: None,
2140        });
2141        if let Err(e) = marker.save(&marker_path) {
2142            tracing::warn!(error = %e, path = %marker_path.display(), "failed to persist agent network marker (network still reusable by name)");
2143        }
2144
2145        Ok(net_id)
2146    }
2147
2148    /// Ensure the per-service HCN **Internal** network for `service` exists on
2149    /// the host, reusing one recorded under the `service:<name>` marker owner
2150    /// (or discoverable by its derived name) and recording it on create.
2151    ///
2152    /// This is the Windows analogue of the Linux per-service bridge: a
2153    /// dedicated (`OverlayMode::Dedicated`) service gets its OWN isolated HCN
2154    /// Internal network — an internal vSwitch with NO physical-NIC binding —
2155    /// distinct from the node's shared base overlay network. Containers attach
2156    /// to it (rather than the base network) so dedicated-service traffic is
2157    /// segregated at the vSwitch layer. Modeled on [`Self::ensure_overlay_network`]
2158    /// but keyed on [`owner_for_service`] and forced to the Internal type (never
2159    /// Transparent — the on-box test asserts zero external vSwitches for
2160    /// dedicated services).
2161    ///
2162    /// Returns the network GUID.
2163    ///
2164    /// # Errors
2165    /// Propagates the underlying `zlayer_hns` error on create failure.
2166    #[cfg(target_os = "windows")]
2167    #[allow(clippy::too_many_lines)]
2168    async fn ensure_service_network(
2169        &mut self,
2170        service: &str,
2171        subnet: ipnet::IpNet,
2172    ) -> Result<windows::core::GUID, OverlaydError> {
2173        use windows::core::GUID;
2174
2175        let daemon_name = self.deployment_or_default();
2176        // Per-service network name: `<base overlay name>-svc-<service>` so it is
2177        // unambiguously distinct from the base network and from other services.
2178        let net_name = format!("{}-svc-{service}", overlay_network_name(&daemon_name));
2179        let owner = owner_for_service(service);
2180        let marker_path =
2181            zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();
2182
2183        // Fast path: marker names a network GUID that still exists; reopen it.
2184        // Only honor the recorded id when it belongs to an HCN-internal entry —
2185        // a Dedicated WireGuard marker (`kind == "wg-dedicated"`) stores the
2186        // transport public key in `id`, NOT an HCN GUID, so it must be ignored
2187        // for HCN reuse.
2188        let recorded_hcn_id = crate::network_state::NetworkState::load(&marker_path)
2189            .get(&owner)
2190            .filter(|entry| entry.kind == "hcn-internal")
2191            .and_then(|entry| GUID::try_from(entry.id.as_str()).ok());
2192        if let Some(recorded_id) = recorded_hcn_id {
2193            let reopened = tokio::task::spawn_blocking(move || {
2194                zlayer_hns::network::Network::open(recorded_id).ok()
2195            })
2196            .await
2197            .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?;
2198            if reopened.is_some() {
2199                tracing::info!(name = %net_name, service = %service, "reusing per-service HCN network from marker");
2200                return Ok(recorded_id);
2201            }
2202        }
2203
2204        // Idempotency: reuse a host network whose queried name matches ours.
2205        let target_name = net_name.clone();
2206        let existing = tokio::task::spawn_blocking(move || -> Option<GUID> {
2207            let guids = zlayer_hns::network::list("{}").ok()?;
2208            for guid in guids {
2209                let Ok(network) = zlayer_hns::network::Network::open(guid) else {
2210                    continue;
2211                };
2212                if matches!(network.query("{}"), Ok(props) if props.name == target_name) {
2213                    return Some(guid);
2214                }
2215            }
2216            None
2217        })
2218        .await
2219        .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?;
2220
2221        if let Some(existing_id) = existing {
2222            tracing::info!(name = %net_name, service = %service, "reusing existing per-service HCN network");
2223            return Ok(existing_id);
2224        }
2225
2226        let net_id = GUID::new()
2227            .map_err(|e| OverlaydError::Other(format!("GUID::new for per-service network: {e}")))?;
2228        let subnet_str = subnet.to_string();
2229
2230        // ALWAYS Internal for a dedicated service — never Transparent. The
2231        // dedicated requirement is isolation; an Internal network binds NO
2232        // physical NIC (no external vSwitch), which is what the on-box test
2233        // asserts.
2234        let net_name_for_create = net_name.clone();
2235        let subnet_for_create = subnet_str.clone();
2236        tokio::task::spawn_blocking(move || {
2237            zlayer_hns::network::Network::create_internal(
2238                net_id,
2239                &net_name_for_create,
2240                &subnet_for_create,
2241            )
2242        })
2243        .await
2244        .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?
2245        .map_err(|e| {
2246            OverlaydError::Overlay(format!("HcnCreateNetwork internal ({net_name}): {e}"))
2247        })?;
2248
2249        // HCN's Static IPAM needs ~1-2s after network create to settle its
2250        // address pool; without this the first endpoint frequently fails with
2251        // HCN_E_ADDR_INVALID_OR_RESERVED (same wait as the base network).
2252        tokio::time::sleep(std::time::Duration::from_millis(2000)).await;
2253
2254        tracing::info!(
2255            service = %service,
2256            subnet = %subnet_str,
2257            "created per-service HCN Internal network"
2258        );
2259
2260        // Persist the marker (owner = `service:<name>`, kind = `hcn-internal`)
2261        // so subsequent runs reuse this network by GUID and a full uninstall
2262        // (`purge_managed_networks`, which sweeps every `kind` starting with
2263        // `hcn`) deletes it. Best-effort.
2264        //
2265        // A dedicated Windows service shares the SAME owner key for two facts:
2266        // the dedicated WireGuard identity (written by the cross-platform core
2267        // in `setup_service_overlay_dedicated`, kind `wg-dedicated`) and this
2268        // HCN network's GUID. The marker is keyed by owner, so carry the WG
2269        // identity fields over when we rewrite the entry to `hcn-internal` — the
2270        // single entry then holds both the HCN GUID (in `id`) and the WG
2271        // identity (in the `wg_*`/`interface` fields), and the WG private key
2272        // survives restarts. (The core re-asserts the `wg-dedicated` shape on
2273        // the next setup; this path re-asserts `hcn-internal` again right after
2274        // — both are self-healing because the network is also reusable by name.)
2275        let mut marker = crate::network_state::NetworkState::load(&marker_path);
2276        let carried = marker.get(&owner).cloned();
2277        marker.upsert(crate::network_state::ManagedNetwork {
2278            owner,
2279            kind: "hcn-internal".to_string(),
2280            name: net_name.clone(),
2281            id: format_guid_bare(net_id),
2282            subnet: subnet_str.clone(),
2283            wg_port: carried.as_ref().and_then(|c| c.wg_port),
2284            wg_private_key: carried.as_ref().and_then(|c| c.wg_private_key.clone()),
2285            wg_public_key: carried.as_ref().and_then(|c| c.wg_public_key.clone()),
2286            interface: carried.as_ref().and_then(|c| c.interface.clone()),
2287        });
2288        if let Err(e) = marker.save(&marker_path) {
2289            tracing::warn!(service = %service, error = %e, path = %marker_path.display(), "failed to persist per-service network marker (network still reusable by name)");
2290        }
2291
2292        Ok(net_id)
2293    }
2294
2295    /// Resolve the dedicated per-service subnet for `service`, if the service
2296    /// runs in `OverlayMode::Dedicated` on this node.
2297    ///
2298    /// Source of truth, in order:
2299    /// 1. The live [`ServiceTransport`] in `service_transports` (the normal
2300    ///    case once `SetupServiceOverlay` has run this process).
2301    /// 2. A persisted `hcn-internal` marker entry under
2302    ///    [`owner_for_service`]`(service)` — covers the window where the HCN
2303    ///    network exists from a prior run but the transport map is still empty.
2304    ///
2305    /// Returns `None` for Shared-mode services (attach onto the base network).
2306    #[cfg(target_os = "windows")]
2307    fn dedicated_service_subnet(&self, service: &str) -> Option<ipnet::IpNet> {
2308        if let Some(st) = self.service_transports.get(service) {
2309            return Some(st.subnet);
2310        }
2311        let marker_path =
2312            zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();
2313        crate::network_state::NetworkState::load(&marker_path)
2314            .get(&owner_for_service(service))
2315            .filter(|entry| entry.kind == "hcn-internal")
2316            .and_then(|entry| entry.subnet.parse::<ipnet::IpNet>().ok())
2317    }
2318
2319    /// The daemon name used for HCN network/owner naming, defaulting to
2320    /// `"zlayer"` when no deployment has been set yet.
2321    #[cfg(target_os = "windows")]
2322    fn deployment_or_default(&self) -> String {
2323        if self.deployment.is_empty() {
2324            "zlayer".to_string()
2325        } else {
2326            self.deployment.clone()
2327        }
2328    }
2329
2330    // -- peers ---------------------------------------------------------------
2331
2332    /// Resolve a [`PeerScope`] to the live [`OverlayTransport`] its ops target.
2333    ///
2334    /// `Global` -> the single cluster transport; `Service { service }` -> that
2335    /// service's dedicated per-service transport (Dedicated mode only).
2336    ///
2337    /// # Errors
2338    /// Returns an error if the global overlay is not up (for `Global`) or no
2339    /// dedicated overlay exists for the named service (for `Service`).
2340    fn transport_for_scope(&self, scope: &PeerScope) -> Result<&OverlayTransport, OverlaydError> {
2341        match scope {
2342            PeerScope::Global => self
2343                .global_transport
2344                .as_ref()
2345                .ok_or_else(|| OverlaydError::Other("global overlay not set up".into())),
2346            PeerScope::Service { service } => self
2347                .service_transports
2348                .get(service)
2349                .map(|s| &s.transport)
2350                .ok_or_else(|| {
2351                    OverlaydError::Other(format!("no dedicated overlay for service {service}"))
2352                }),
2353        }
2354    }
2355
2356    /// Add a peer to a resolved transport.
2357    ///
2358    /// # Errors
2359    /// Wraps the underlying transport error.
2360    async fn add_peer_on(
2361        transport: &OverlayTransport,
2362        peer: &PeerInfo,
2363    ) -> Result<(), OverlaydError> {
2364        transport
2365            .add_peer(peer)
2366            .await
2367            .map_err(|e| OverlaydError::Overlay(format!("add_peer failed: {e}")))
2368    }
2369
2370    /// Remove a peer (by base64 public key) from a resolved transport.
2371    ///
2372    /// # Errors
2373    /// Wraps the underlying transport error.
2374    async fn remove_peer_on(
2375        transport: &OverlayTransport,
2376        pubkey: &str,
2377    ) -> Result<(), OverlaydError> {
2378        transport
2379            .remove_peer(pubkey)
2380            .await
2381            .map_err(|e| OverlaydError::Overlay(format!("remove_peer failed: {e}")))
2382    }
2383
2384    /// Plumb a CIDR into a peer's `AllowedIPs` on a resolved transport.
2385    ///
2386    /// # Errors
2387    /// Returns an error when the CIDR is invalid or the UAPI write fails.
2388    async fn add_allowed_ip_on(
2389        transport: &OverlayTransport,
2390        pubkey: &str,
2391        cidr: &str,
2392    ) -> Result<(), OverlaydError> {
2393        let net: ipnet::IpNet = cidr
2394            .parse()
2395            .map_err(|e| OverlaydError::Other(format!("invalid CIDR {cidr}: {e}")))?;
2396        transport
2397            .add_allowed_ip(pubkey, net)
2398            .await
2399            .map_err(|e| OverlaydError::Overlay(format!("add_allowed_ip failed: {e}")))
2400    }
2401
2402    /// Remove a CIDR from a peer's `AllowedIPs` on a resolved transport.
2403    ///
2404    /// # Errors
2405    /// Returns an error when the CIDR is invalid or the UAPI write fails.
2406    async fn remove_allowed_ip_on(
2407        transport: &OverlayTransport,
2408        pubkey: &str,
2409        cidr: &str,
2410    ) -> Result<(), OverlaydError> {
2411        let net: ipnet::IpNet = cidr
2412            .parse()
2413            .map_err(|e| OverlaydError::Other(format!("invalid CIDR {cidr}: {e}")))?;
2414        transport
2415            .remove_allowed_ip(pubkey, net)
2416            .await
2417            .map_err(|e| OverlaydError::Overlay(format!("remove_allowed_ip failed: {e}")))
2418    }
2419
2420    // -- DNS -----------------------------------------------------------------
2421
2422    /// Register an overlay DNS A/AAAA record.
2423    fn register_dns(&mut self, name: String, ip: IpAddr) {
2424        self.dns_records.insert(name, ip);
2425    }
2426
2427    /// Remove an overlay DNS record.
2428    fn unregister_dns(&mut self, name: &str) {
2429        self.dns_records.remove(name);
2430    }
2431
2432    // -- NAT -----------------------------------------------------------------
2433
2434    /// Periodic NAT traversal maintenance: re-probe STUN, refresh relays.
2435    /// No-op when NAT traversal has not been started.
2436    ///
2437    /// # Errors
2438    /// Returns an error when the underlying STUN refresh fails.
2439    async fn nat_maintenance_tick(&mut self) -> Result<(), OverlaydError> {
2440        // Lazily start NAT traversal on the first tick if a config asks for it.
2441        if self.nat_traversal.is_none() {
2442            let config = self.nat_config.clone().unwrap_or_default();
2443            if config.enabled {
2444                let mut nat = NatTraversal::new(config, self.overlay_port);
2445                match nat.gather_candidates().await {
2446                    Ok(candidates) => {
2447                        tracing::info!(count = candidates.len(), "Gathered NAT candidates");
2448                        self.nat_last_refresh.store(now_unix(), Ordering::SeqCst);
2449                        self.nat_traversal = Some(nat);
2450                    }
2451                    Err(e) => {
2452                        tracing::warn!(error = %e, "NAT candidate gathering failed");
2453                        return Ok(());
2454                    }
2455                }
2456            } else {
2457                return Ok(());
2458            }
2459        }
2460
2461        let Some(nat) = self.nat_traversal.as_mut() else {
2462            return Ok(());
2463        };
2464        match nat.refresh().await {
2465            Ok(changed) => {
2466                if changed {
2467                    tracing::info!("NAT reflexive address changed during refresh");
2468                }
2469                self.nat_last_refresh.store(now_unix(), Ordering::SeqCst);
2470                Ok(())
2471            }
2472            Err(e) => Err(OverlaydError::Overlay(format!(
2473                "NAT maintenance tick failed: {e}"
2474            ))),
2475        }
2476    }
2477
2478    // -- status --------------------------------------------------------------
2479
2480    /// Build a [`StatusSnapshot`] from current overlay state.
2481    async fn status_snapshot(&self) -> StatusSnapshot {
2482        let mut peers: Vec<PeerStatus> = Vec::new();
2483        let public_key = self.transport_public_key.clone();
2484
2485        if let Some(transport) = self.global_transport.as_ref() {
2486            // Parse the UAPI dump for per-peer state. Best-effort: a parse
2487            // failure leaves the peer list empty rather than failing Status.
2488            if let Ok(dump) = transport.status().await {
2489                peers = parse_peer_status(&dump);
2490            }
2491        }
2492
2493        let service_count = u32::try_from(self.service_count()).unwrap_or(u32::MAX);
2494        let peer_count = u32::try_from(peers.len()).unwrap_or(u32::MAX);
2495
2496        // Per dedicated per-service overlay device: count its peers the same
2497        // way the global status does (parse the UAPI/status dump).
2498        let mut dedicated_services: Vec<DedicatedServiceStatus> = Vec::new();
2499        for (svc, st) in &self.service_transports {
2500            let peer_count = match st.transport.status().await {
2501                Ok(dump) => u32::try_from(parse_peer_status(&dump).len()).unwrap_or(u32::MAX),
2502                Err(_) => 0,
2503            };
2504            dedicated_services.push(DedicatedServiceStatus {
2505                service: svc.clone(),
2506                interface: st.interface.clone(),
2507                public_key: st.public_key.clone(),
2508                listen_port: st.listen_port,
2509                overlay_ip: st.overlay_ip,
2510                subnet: st.subnet.to_string(),
2511                peer_count,
2512            });
2513        }
2514
2515        StatusSnapshot {
2516            interface: self.global_interface.clone(),
2517            node_ip: self.node_ip,
2518            public_key,
2519            overlay_cidr: self.cluster_cidr.map(|c| c.to_string()),
2520            slice_cidr: self.slice_cidr.map(|c| c.to_string()),
2521            peer_count,
2522            service_count,
2523            peers,
2524            dedicated_services,
2525        }
2526    }
2527
2528    /// Number of per-service overlays set up on this node (Shared bridges /
2529    /// placeholders plus any Dedicated transports not already counted there).
2530    fn service_count(&self) -> usize {
2531        let extra_dedicated = self
2532            .service_transports
2533            .keys()
2534            .filter(|svc| !self.service_interfaces.contains_key(*svc))
2535            .count();
2536        self.service_interfaces.len() + extra_dedicated
2537    }
2538
2539    // -- config helper -------------------------------------------------------
2540
2541    fn build_config(
2542        &self,
2543        private_key: String,
2544        public_key: String,
2545        ip: IpAddr,
2546        mask: u8,
2547        listen_port: u16,
2548    ) -> OverlayConfig {
2549        let local_addr = match ip {
2550            IpAddr::V4(_) => IpAddr::V4(Ipv4Addr::UNSPECIFIED),
2551            IpAddr::V6(_) => IpAddr::V6(Ipv6Addr::UNSPECIFIED),
2552        };
2553        let mut config = OverlayConfig {
2554            local_endpoint: SocketAddr::new(local_addr, listen_port),
2555            private_key,
2556            public_key,
2557            overlay_cidr: format!("{ip}/{mask}"),
2558            ..OverlayConfig::default()
2559        };
2560        if let Some(nat) = self.nat_config.clone() {
2561            config.nat = nat;
2562        }
2563        if let Some(dir) = self.uapi_sock_dir.clone() {
2564            config.uapi_sock_dir = dir;
2565        }
2566        config
2567    }
2568}
2569
2570/// Build a Shared-mode [`ServiceOverlayInfo`]: the bridge/placeholder name with
2571/// every dedicated-device identity field left `None` (Shared mode shares the
2572/// single cluster device).
2573fn shared_overlay_info(name: String) -> ServiceOverlayInfo {
2574    ServiceOverlayInfo {
2575        name,
2576        mode: OverlayMode::Shared,
2577        wg_public_key: None,
2578        wg_port: None,
2579        overlay_ip: None,
2580        subnet: None,
2581    }
2582}
2583
2584/// Build a Dedicated-mode [`ServiceOverlayInfo`] from a dedicated device's
2585/// identity. `name` is the container-attach handle (bridge name on Linux, the
2586/// dedicated interface elsewhere).
2587fn dedicated_overlay_info(
2588    name: String,
2589    public_key: &str,
2590    listen_port: u16,
2591    overlay_ip: IpAddr,
2592    subnet: ipnet::IpNet,
2593) -> ServiceOverlayInfo {
2594    ServiceOverlayInfo {
2595        name,
2596        mode: OverlayMode::Dedicated,
2597        wg_public_key: Some(public_key.to_string()),
2598        wg_port: Some(listen_port),
2599        overlay_ip: Some(overlay_ip),
2600        subnet: Some(subnet.to_string()),
2601    }
2602}
2603
2604/// Convert a wire [`PeerSpec`] into a `zlayer_overlay::PeerInfo`.
2605///
2606/// # Errors
2607/// Returns an error if `endpoint` cannot be parsed as a `host:port`
2608/// [`SocketAddr`].
2609pub fn peer_spec_to_info(spec: &PeerSpec) -> Result<PeerInfo, OverlaydError> {
2610    let endpoint: SocketAddr = spec.endpoint.parse().map_err(|e| {
2611        OverlaydError::Other(format!("invalid peer endpoint {}: {e}", spec.endpoint))
2612    })?;
2613    Ok(PeerInfo::new(
2614        spec.public_key.clone(),
2615        endpoint,
2616        &spec.allowed_ips,
2617        std::time::Duration::from_secs(spec.persistent_keepalive_secs),
2618    ))
2619}
2620
2621/// Parse a `wg`-style UAPI/`status` dump into [`PeerStatus`] entries.
2622///
2623/// The dump is a series of `key=value` lines; each `public_key=` line starts a
2624/// new peer block, and subsequent `endpoint=` / `allowed_ip=` /
2625/// `latest_handshake=` lines belong to it.
2626fn parse_peer_status(dump: &str) -> Vec<PeerStatus> {
2627    let mut peers: Vec<PeerStatus> = Vec::new();
2628    let mut current: Option<PeerStatus> = None;
2629    let mut allowed: Vec<String> = Vec::new();
2630
2631    let flush = |peers: &mut Vec<PeerStatus>,
2632                 current: &mut Option<PeerStatus>,
2633                 allowed: &mut Vec<String>| {
2634        if let Some(mut p) = current.take() {
2635            p.allowed_ips = allowed.join(",");
2636            peers.push(p);
2637        }
2638        allowed.clear();
2639    };
2640
2641    for line in dump.lines() {
2642        let line = line.trim();
2643        let Some((key, value)) = line.split_once('=') else {
2644            continue;
2645        };
2646        match key.trim() {
2647            "public_key" | "peer" => {
2648                flush(&mut peers, &mut current, &mut allowed);
2649                current = Some(PeerStatus {
2650                    public_key: value.trim().to_string(),
2651                    endpoint: String::new(),
2652                    allowed_ips: String::new(),
2653                    last_handshake_unix_secs: 0,
2654                });
2655            }
2656            "endpoint" => {
2657                if let Some(p) = current.as_mut() {
2658                    p.endpoint = value.trim().to_string();
2659                }
2660            }
2661            "allowed_ip" | "allowed_ips" => {
2662                if current.is_some() {
2663                    allowed.push(value.trim().to_string());
2664                }
2665            }
2666            "latest_handshake" | "last_handshake_time_sec" => {
2667                if let Some(p) = current.as_mut() {
2668                    p.last_handshake_unix_secs = value.trim().parse().unwrap_or(0);
2669                }
2670            }
2671            _ => {}
2672        }
2673    }
2674    flush(&mut peers, &mut current, &mut allowed);
2675    peers
2676}
2677
/// Whole seconds elapsed since the Unix epoch according to the system clock.
///
/// Yields `0` if the clock reports a time before the epoch.
fn now_unix() -> u64 {
    use std::time::{SystemTime, UNIX_EPOCH};

    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs(),
        Err(_) => 0,
    }
}
2685
/// Simple IP address allocator supporting both IPv4 and IPv6, bounded to a
/// specific CIDR (typically a per-node `/28` slice). Allocations past the last
/// usable host return an exhaustion error.
///
/// Addresses are handed out sequentially from `base + next_offset`; addresses
/// given back through `release(...)` are reused before the counter advances.
struct IpAllocator {
    /// CIDR the allocator is bounded to.
    cidr: IpNetwork,
    /// Base (network) address of the CIDR; offsets are counted from here.
    base: IpAddr,
    /// Monotonic counter for the next allocation offset relative to `base`.
    /// Starts at 1, so the network address itself is never handed out.
    next_offset: AtomicU64,
    /// IPs returned by `release(...)`. `allocate()` drains this first before
    /// incrementing `next_offset`.
    released: parking_lot::Mutex<Vec<IpAddr>>,
}
2700
2701impl IpAllocator {
2702    fn new(cidr: IpNetwork) -> Self {
2703        Self {
2704            base: cidr.network(),
2705            cidr,
2706            next_offset: AtomicU64::new(1),
2707            released: parking_lot::Mutex::new(Vec::new()),
2708        }
2709    }
2710
2711    #[allow(clippy::cast_possible_truncation)]
2712    fn compute_addr(&self, offset: u64) -> IpAddr {
2713        match self.base {
2714            IpAddr::V4(base_v4) => {
2715                let base_u32 = u32::from_be_bytes(base_v4.octets());
2716                let addr = base_u32.wrapping_add(offset as u32);
2717                IpAddr::V4(Ipv4Addr::from(addr.to_be_bytes()))
2718            }
2719            IpAddr::V6(base_v6) => {
2720                let base_u128 = u128::from(base_v6);
2721                let addr = base_u128.wrapping_add(u128::from(offset));
2722                IpAddr::V6(Ipv6Addr::from(addr))
2723            }
2724        }
2725    }
2726
2727    /// Allocate the next IP in the slice, reusing released IPs first.
2728    ///
2729    /// # Errors
2730    /// Returns [`OverlaydError::Overlay`] when the CIDR is exhausted.
2731    fn allocate(&self) -> Result<IpAddr, OverlaydError> {
2732        if let Some(ip) = self.released.lock().pop() {
2733            return Ok(ip);
2734        }
2735        let offset = self.next_offset.fetch_add(1, Ordering::SeqCst);
2736        let addr = self.compute_addr(offset);
2737
2738        let in_cidr = self.cidr.contains(addr);
2739        let is_v4_broadcast = matches!(
2740            (&self.cidr, &addr),
2741            (IpNetwork::V4(v4), IpAddr::V4(a)) if *a == v4.broadcast()
2742        );
2743        if !in_cidr || is_v4_broadcast {
2744            return Err(OverlaydError::Overlay(format!(
2745                "IP allocator exhausted: next address {addr} is outside slice {}",
2746                self.cidr
2747            )));
2748        }
2749        Ok(addr)
2750    }
2751
2752    /// Return an IP to the free pool. Idempotent.
2753    fn release(&self, ip: IpAddr) {
2754        let mut released = self.released.lock();
2755        if !released.contains(&ip) {
2756            released.push(ip);
2757        }
2758    }
2759}
2760
2761// -- Windows HCN helpers (ported from the agent's hcs runtime) --------------
2762
/// Owner tag written onto each HCN endpoint created by this server.
///
/// The legacy single-instance daemon name `"zlayer"` maps to the tag
/// `"zlayer"`; every other daemon name is used verbatim, so daemons running
/// side by side never sweep each other's endpoints.
#[cfg(target_os = "windows")]
fn owner_tag(daemon_name: &str) -> String {
    match daemon_name {
        "zlayer" => String::from("zlayer"),
        other => other.to_owned(),
    }
}
2774
/// Host-level name of the HCN overlay network owned by `daemon_name`.
///
/// The legacy single-instance daemon `"zlayer"` uses `"zlayer-overlay"`; any
/// other daemon uses `"<daemon_name>-overlay"`.
#[cfg(target_os = "windows")]
fn overlay_network_name(daemon_name: &str) -> String {
    match daemon_name {
        "zlayer" => String::from("zlayer-overlay"),
        other => format!("{other}-overlay"),
    }
}
2786
/// Render a GUID as the bare, lowercase, un-braced string HCN/HCS use to
/// identify a namespace inside a compute-system document's
/// `Container.Networking.Namespace` field (e.g. `aabbccdd-eeff-...`).
///
/// Starts from the GUID's `Debug` rendering, strips any surrounding `{` / `}`
/// and lowercases the ASCII hex digits.
#[cfg(target_os = "windows")]
fn format_guid_bare(id: windows::core::GUID) -> String {
    let rendered = format!("{id:?}");
    let unbraced = rendered.trim_matches(&['{', '}'][..]);
    unbraced.to_ascii_lowercase()
}
2796
/// Delete every host-level HCN network this server created for `daemon_name` and
/// clear the persistent marker.
///
/// Called on a full uninstall — never on a routine stop/restart. Every step is
/// best-effort: failures are logged and the purge carries on. Synchronous (HCN
/// calls are blocking).
///
/// * `data_dir` — daemon data directory; the network marker file is resolved
///   beneath it through `ZLayerDirs::agent_network_state`.
/// * `daemon_name` — selects the overlay network name used by the name sweep.
#[cfg(target_os = "windows")]
pub fn purge_managed_networks(data_dir: &Path, daemon_name: &str) {
    use windows::core::GUID;

    let marker_path = zlayer_paths::ZLayerDirs::new(data_dir.to_path_buf()).agent_network_state();
    let state = crate::network_state::NetworkState::load(&marker_path);

    // Pass 1: delete recorded HCN networks by GUID.
    for entry in &state.networks {
        if !entry.kind.starts_with("hcn") {
            continue;
        }
        match GUID::try_from(entry.id.as_str()) {
            Ok(guid) => match zlayer_hns::network::Network::delete(guid) {
                Ok(()) => {
                    tracing::info!(name = %entry.name, id = %entry.id, "deleted managed HCN network");
                }
                Err(e) => {
                    tracing::warn!(name = %entry.name, id = %entry.id, error = %e, "failed to delete managed HCN network");
                }
            },
            Err(e) => {
                tracing::warn!(id = %entry.id, error = %e, "managed network marker has unparseable GUID");
            }
        }
    }

    // Pass 2: name-sweep fallback for an overlay network whose marker entry was
    // lost (crash between create and marker write).
    let overlay_name = overlay_network_name(daemon_name);
    match zlayer_hns::network::list("{}") {
        Ok(guids) => {
            for guid in guids {
                // A network we cannot open is skipped rather than treated as ours.
                let Ok(network) = zlayer_hns::network::Network::open(guid) else {
                    continue;
                };
                let is_ours =
                    matches!(network.query("{}"), Ok(props) if props.name == overlay_name);
                // Release the handle before attempting the delete.
                drop(network);
                if !is_ours {
                    continue;
                }
                match zlayer_hns::network::Network::delete(guid) {
                    Ok(()) => {
                        tracing::info!(name = %overlay_name, "deleted overlay HCN network (name sweep)");
                    }
                    Err(e) => {
                        tracing::warn!(name = %overlay_name, error = %e, "failed to delete overlay network (name sweep)");
                    }
                }
            }
        }
        Err(e) => {
            // Still best-effort, but make the skipped sweep visible instead of
            // silently doing nothing.
            tracing::warn!(name = %overlay_name, error = %e, "failed to enumerate HCN networks (name sweep skipped)");
        }
    }

    // Remove the marker directly instead of checking `exists()` first: a missing
    // file is the desired end state, and this avoids the check-then-act race.
    match std::fs::remove_file(&marker_path) {
        Ok(()) => {}
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
        Err(e) => {
            tracing::warn!(error = %e, path = %marker_path.display(), "failed to remove agent network marker");
        }
    }
}
2856
2857#[cfg(test)]
2858mod tests {
2859    use super::*;
2860
2861    #[test]
2862    fn peer_spec_to_info_parses_endpoint_and_keepalive() {
2863        let spec = PeerSpec {
2864            public_key: "base64key".to_string(),
2865            endpoint: "1.2.3.4:51820".to_string(),
2866            allowed_ips: "10.200.0.5/32,10.200.1.0/24".to_string(),
2867            persistent_keepalive_secs: 25,
2868        };
2869        let info = peer_spec_to_info(&spec).expect("valid spec");
2870        assert_eq!(info.public_key, "base64key");
2871        assert_eq!(info.endpoint, "1.2.3.4:51820".parse().unwrap());
2872        assert_eq!(info.allowed_ips, "10.200.0.5/32,10.200.1.0/24");
2873        assert_eq!(
2874            info.persistent_keepalive_interval,
2875            std::time::Duration::from_secs(25)
2876        );
2877    }
2878
2879    #[test]
2880    fn peer_spec_to_info_rejects_bad_endpoint() {
2881        let spec = PeerSpec {
2882            public_key: "k".to_string(),
2883            endpoint: "not-a-socket-addr".to_string(),
2884            allowed_ips: String::new(),
2885            persistent_keepalive_secs: 0,
2886        };
2887        assert!(peer_spec_to_info(&spec).is_err());
2888    }
2889
2890    #[test]
2891    fn interface_name_never_exceeds_limit() {
2892        let cases: Vec<(&[&str], &str)> = vec![
2893            (&["a"], "g"),
2894            (&["zlayer-manager"], "g"),
2895            (&["my-very-long-deployment-name-that-goes-on-and-on"], "g"),
2896            (&["zlayer", "manager"], "s"),
2897            (
2898                &["abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxyz"],
2899                "s",
2900            ),
2901            (&["x"], ""),
2902        ];
2903        for (parts, suffix) in &cases {
2904            let name = make_interface_name(parts, suffix);
2905            assert!(name.len() <= MAX_IFNAME_LEN, "Name '{name}' too long");
2906            assert!(name.starts_with("zl-"));
2907        }
2908    }
2909
2910    #[test]
2911    fn interface_name_is_deterministic() {
2912        assert_eq!(
2913            make_interface_name(&["zlayer-manager"], "g"),
2914            make_interface_name(&["zlayer-manager"], "g")
2915        );
2916    }
2917
2918    #[test]
2919    fn parse_peer_status_splits_blocks() {
2920        let dump = "\
2921public_key=AAA
2922endpoint=1.2.3.4:51820
2923allowed_ip=10.200.0.2/32
2924allowed_ip=10.200.1.0/24
2925latest_handshake=1700000000
2926public_key=BBB
2927endpoint=5.6.7.8:51820
2928allowed_ip=10.200.0.3/32
2929latest_handshake=0
2930";
2931        let peers = parse_peer_status(dump);
2932        assert_eq!(peers.len(), 2);
2933        assert_eq!(peers[0].public_key, "AAA");
2934        assert_eq!(peers[0].endpoint, "1.2.3.4:51820");
2935        assert_eq!(peers[0].allowed_ips, "10.200.0.2/32,10.200.1.0/24");
2936        assert_eq!(peers[0].last_handshake_unix_secs, 1_700_000_000);
2937        assert_eq!(peers[1].public_key, "BBB");
2938        assert_eq!(peers[1].last_handshake_unix_secs, 0);
2939    }
2940
2941    #[tokio::test]
2942    async fn status_snapshot_before_setup_is_empty() {
2943        let server = OverlaydServer::new(std::path::PathBuf::from("/tmp/zlayer-overlayd-test"));
2944        let snap = server.status_snapshot().await;
2945        assert!(snap.interface.is_none());
2946        assert!(snap.node_ip.is_none());
2947        assert!(snap.public_key.is_none());
2948        assert_eq!(snap.peer_count, 0);
2949        assert_eq!(snap.service_count, 0);
2950        assert!(snap.peers.is_empty());
2951    }
2952
2953    #[tokio::test]
2954    async fn allocate_and_release_ip_round_trip() {
2955        let mut server = OverlaydServer::new(std::path::PathBuf::from("/tmp/zlayer-overlayd-test"));
2956        let a = server.allocate_ip("svc", false).expect("alloc a");
2957        let b = server.allocate_ip("svc", false).expect("alloc b");
2958        assert_ne!(a, b);
2959        server.release_ip(a);
2960        // Released IP is handed back before the monotonic counter advances.
2961        let c = server.allocate_ip("svc", false).expect("alloc c");
2962        assert_eq!(c, a);
2963    }
2964
2965    /// Build a throwaway server bound to a unique temp data dir so the marker
2966    /// file (rehydrated in `new`) never collides between tests.
2967    fn test_server() -> OverlaydServer {
2968        let dir = std::env::temp_dir().join(format!(
2969            "zlayer-overlayd-scope-{}-{}",
2970            std::process::id(),
2971            now_unix()
2972        ));
2973        OverlaydServer::new(dir)
2974    }
2975
2976    #[tokio::test]
2977    async fn transport_for_scope_global_requires_setup() {
2978        let server = test_server();
2979        // No global overlay set up yet -> Global scope errors. (Can't use
2980        // `expect_err` because `&OverlayTransport` is not `Debug`.)
2981        match server.transport_for_scope(&PeerScope::Global) {
2982            Ok(_) => panic!("global overlay should not be set up"),
2983            Err(OverlaydError::Other(m)) => {
2984                assert!(m.contains("global overlay not set up"), "got: {m}");
2985            }
2986            Err(other) => panic!("unexpected error: {other:?}"),
2987        }
2988    }
2989
2990    #[tokio::test]
2991    async fn transport_for_scope_unset_service_errors() {
2992        let server = test_server();
2993        match server.transport_for_scope(&PeerScope::Service {
2994            service: "x".to_string(),
2995        }) {
2996            Ok(_) => panic!("no dedicated overlay should exist for x"),
2997            Err(OverlaydError::Other(m)) => {
2998                assert_eq!(m, "no dedicated overlay for service x");
2999            }
3000            Err(other) => panic!("unexpected error: {other:?}"),
3001        }
3002    }
3003
3004    #[tokio::test]
3005    async fn add_peer_service_scope_before_setup_errors_via_dispatch() {
3006        let mut server = test_server();
3007        let resp = server
3008            .handle(OverlaydRequest::AddPeer {
3009                peer: PeerSpec {
3010                    public_key: "k".to_string(),
3011                    endpoint: "1.2.3.4:51820".to_string(),
3012                    allowed_ips: "10.200.0.2/32".to_string(),
3013                    persistent_keepalive_secs: 0,
3014                },
3015                scope: PeerScope::Service {
3016                    service: "x".to_string(),
3017                },
3018            })
3019            .await;
3020        match resp {
3021            OverlaydResponse::Err { message } => {
3022                assert_eq!(message, "no dedicated overlay for service x");
3023            }
3024            other => panic!("expected Err response, got {other:?}"),
3025        }
3026    }
3027
3028    /// End-to-end Dedicated setup. Needs a real TUN device, so it is ignored by
3029    /// default and only runs on a privileged Linux host (mirrors the crate's
3030    /// other privileged overlay e2e tests).
3031    #[cfg(target_os = "linux")]
3032    #[tokio::test]
3033    #[ignore = "needs CAP_NET_ADMIN; run on a privileged Linux host"]
3034    async fn dedicated_setup_creates_distinct_device_and_routes_service_peer() {
3035        let mut server = test_server();
3036        // Bring up the global overlay first so the cluster CIDR + global device
3037        // exist (the dedicated device must get a distinct port and key).
3038        let global_name = server
3039            .setup_global_overlay(
3040                "dep".to_string(),
3041                "i0".to_string(),
3042                "10.200.0.0/16",
3043                Some("10.200.0.0/28"),
3044                zlayer_core::DEFAULT_WG_PORT,
3045                false,
3046            )
3047            .await
3048            .expect("global overlay up");
3049        assert!(!global_name.is_empty());
3050
3051        // Dedicated service setup.
3052        let info = server
3053            .setup_service_overlay("web", OverlayMode::Dedicated)
3054            .await
3055            .expect("dedicated service overlay up");
3056        assert_eq!(info.mode, OverlayMode::Dedicated);
3057        let port = info.wg_port.expect("dedicated port");
3058        assert_ne!(
3059            port, server.overlay_port,
3060            "dedicated device must not share the global port"
3061        );
3062
3063        let st = server
3064            .service_transports
3065            .get("web")
3066            .expect("service transport recorded");
3067        assert_eq!(st.listen_port, port);
3068        assert_ne!(
3069            st.interface, global_name,
3070            "dedicated interface must differ from global"
3071        );
3072        assert_eq!(
3073            Some(st.public_key.clone()),
3074            info.wg_public_key,
3075            "info pubkey matches recorded transport"
3076        );
3077        assert_ne!(
3078            Some(st.public_key.clone()),
3079            server.transport_public_key,
3080            "dedicated key must differ from global key"
3081        );
3082
3083        // A Service-scoped AddPeer must land on the dedicated device (succeeds),
3084        // proving scope routing targets the per-service transport.
3085        let resp = server
3086            .handle(OverlaydRequest::AddPeer {
3087                peer: PeerSpec {
3088                    public_key: {
3089                        let (_priv, pubk) = OverlayTransport::generate_keys().await.unwrap();
3090                        pubk
3091                    },
3092                    endpoint: "5.6.7.8:51999".to_string(),
3093                    allowed_ips: "10.201.0.2/32".to_string(),
3094                    persistent_keepalive_secs: 25,
3095                },
3096                scope: PeerScope::Service {
3097                    service: "web".to_string(),
3098                },
3099            })
3100            .await;
3101        assert!(
3102            matches!(resp, OverlaydResponse::Ok),
3103            "service-scoped add_peer should land on the dedicated device, got {resp:?}"
3104        );
3105    }
3106
3107    #[tokio::test]
3108    async fn guest_attach_requires_global_overlay() {
3109        // Without a global overlay (no node public key / transport) a
3110        // guest-managed attach must error rather than allocate anything.
3111        let mut server = test_server();
3112        let resp = server
3113            .handle(OverlaydRequest::AttachContainer {
3114                handle: AttachHandle::GuestManaged {
3115                    id: "vm-1".to_string(),
3116                },
3117                service: "web".to_string(),
3118                join_global: true,
3119                dns_server: None,
3120                dns_domain: None,
3121            })
3122            .await;
3123        match resp {
3124            OverlaydResponse::Err { message } => {
3125                assert!(
3126                    message.contains("global overlay to be set up"),
3127                    "got: {message}"
3128                );
3129            }
3130            other => panic!("expected Err response, got {other:?}"),
3131        }
3132        // Nothing was recorded.
3133        assert!(server.guest_attachments.is_empty());
3134    }
3135
3136    #[tokio::test]
3137    async fn detach_unknown_guest_is_idempotent() {
3138        let mut server = test_server();
3139        // No such guest -> Ok (idempotent), no panic.
3140        server
3141            .detach_container_guest("never-attached")
3142            .await
3143            .expect("detach of unknown guest is a no-op");
3144    }
3145
3146    /// Full guest-managed attach/detach round-trip. Needs a real TUN device (the
3147    /// global overlay must be live so the guest peer can be installed), so it is
3148    /// ignored by default and only runs on a privileged Linux host — mirrors the
3149    /// crate's other privileged overlay e2e tests.
3150    #[cfg(target_os = "linux")]
3151    #[tokio::test]
3152    #[ignore = "needs CAP_NET_ADMIN; run on a privileged Linux host"]
3153    async fn guest_attach_allocates_config_and_detach_releases() {
3154        let mut server = test_server();
3155        server
3156            .setup_global_overlay(
3157                "dep".to_string(),
3158                "i0".to_string(),
3159                "10.200.0.0/16",
3160                Some("10.200.0.0/28"),
3161                zlayer_core::DEFAULT_WG_PORT,
3162                false,
3163            )
3164            .await
3165            .expect("global overlay up");
3166
3167        // Seed a global peer so the guest config carries it through.
3168        let (_p, other_pub) = OverlayTransport::generate_keys().await.unwrap();
3169        let add = server
3170            .handle(OverlaydRequest::AddPeer {
3171                peer: PeerSpec {
3172                    public_key: other_pub.clone(),
3173                    endpoint: "9.9.9.9:51820".to_string(),
3174                    allowed_ips: "10.200.1.0/28".to_string(),
3175                    persistent_keepalive_secs: 25,
3176                },
3177                scope: PeerScope::Global,
3178            })
3179            .await;
3180        assert!(
3181            matches!(add, OverlaydResponse::Ok),
3182            "seed peer add: {add:?}"
3183        );
3184
3185        let resp = server
3186            .handle(OverlaydRequest::AttachContainer {
3187                handle: AttachHandle::GuestManaged {
3188                    id: "vm-1".to_string(),
3189                },
3190                service: "web".to_string(),
3191                join_global: true,
3192                dns_server: Some("10.200.0.1".parse().unwrap()),
3193                dns_domain: Some("overlay".to_string()),
3194            })
3195            .await;
3196        let config = match resp {
3197            OverlaydResponse::GuestConfig(c) => c,
3198            other => panic!("expected GuestConfig, got {other:?}"),
3199        };
3200        assert!(!config.private_key.is_empty());
3201        assert!(!config.public_key.is_empty());
3202        assert_ne!(config.private_key, config.public_key);
3203        assert_eq!(config.listen_port, server.overlay_port);
3204        assert_eq!(config.dns_server, Some("10.200.0.1".parse().unwrap()));
3205        // Peers = the seeded global peer + this node (self) + nothing else.
3206        assert!(
3207            config.peers.iter().any(|p| p.public_key == other_pub),
3208            "guest must learn the seeded global peer"
3209        );
3210        assert!(
3211            config
3212                .peers
3213                .iter()
3214                .any(|p| Some(&p.public_key) == server.transport_public_key.as_ref()),
3215            "guest must learn THIS node as a peer"
3216        );
3217        // The guest's own key is registered as a global peer (host route).
3218        assert!(server.global_peers.contains_key(&config.public_key));
3219        let info = server
3220            .guest_attachments
3221            .get("vm-1")
3222            .expect("attachment recorded");
3223        assert_eq!(info.overlay_ip, config.overlay_ip);
3224
3225        // Detach releases the peer + IP.
3226        let det = server
3227            .handle(OverlaydRequest::DetachContainer {
3228                handle: AttachHandle::GuestManaged {
3229                    id: "vm-1".to_string(),
3230                },
3231            })
3232            .await;
3233        assert!(matches!(det, OverlaydResponse::Ok), "detach: {det:?}");
3234        assert!(!server.guest_attachments.contains_key("vm-1"));
3235        assert!(!server.global_peers.contains_key(&config.public_key));
3236    }
3237}