Skip to main content

zlayer_overlayd/
server.rs

1//! The overlayd server engine.
2//!
3//! [`OverlaydServer`] is a near 1:1 migration of the *mechanics* half of the
4//! agent's `OverlayManager`: it owns the single cluster `WireGuard`
5//! [`OverlayTransport`], the per-service Linux bridges (Linux) / HCN Internal
6//! network + endpoints (Windows), the per-node IP allocator, DNS config, and
7//! NAT traversal. The cluster-brain half (Raft, scheduler, service registry)
8//! stays in the main daemon, which drives this server over the IPC contract in
9//! [`zlayer_types::overlayd`].
10//!
11//! Every [`OverlaydRequest`] maps to a method here via [`OverlaydServer::handle`].
12
13use std::collections::HashMap;
14use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr};
15#[cfg(target_os = "linux")]
16use std::os::fd::AsFd;
17use std::path::{Path, PathBuf};
18use std::sync::atomic::{AtomicU64, Ordering};
19
20use ipnetwork::IpNetwork;
21use zlayer_overlay::nat::{RelayServerConfig, StunServerConfig, TurnServerConfig};
22use zlayer_overlay::{
23    Candidate, CandidateType, ConnectionType, NatConfig, NatTraversal, OverlayConfig,
24    OverlayTransport, PeerInfo, RelayServer,
25};
26use zlayer_types::overlayd::{
27    AttachHandle, AttachResult, DedicatedServiceStatus, GuestOverlayConfig, NatCandidateWire,
28    NatConfigSpec, NatPeerWire, NatStatusWire, OverlayMode, OverlaydRequest, OverlaydResponse,
29    PeerScope, PeerSpec, PeerStatus, ServiceOverlayInfo, StatusSnapshot,
30};
31
32use crate::error::OverlaydError;
33use crate::network_state::{
34    owner_for_service, DedicatedPortAllocator, ManagedNetwork, NetworkState,
35};
36
/// Longest interface name Linux accepts: IFNAMSIZ minus the trailing NUL byte.
const MAX_IFNAME_LEN: usize = 15;

/// Key under which the single node-wide shared bridge (`OverlayMode::Shared`)
/// is registered in [`zlayer_overlay::allocator::ServiceSubnetRegistry`].
///
/// The underscore-wrapped sentinel cannot collide with a real service name
/// (service names come from deployment specs and are DNS-label-shaped), so the
/// shared bridge always receives exactly one stable subnet that is distinct
/// from every per-service subnet.
#[cfg(target_os = "linux")]
const SHARED_BRIDGE_REGISTRY_KEY: &str = "__zlayer_shared_bridge__";

/// Build a Linux-safe interface name of at most [`MAX_IFNAME_LEN`] bytes.
///
/// The readable form is `zl-<parts joined by '-'>`, followed by `-<suffix>`
/// when `suffix` is non-empty. If that form fits, it is returned unchanged.
/// Otherwise the name is rebuilt from a hex digest of every part plus the
/// suffix:
///
/// * no suffix: `zl-<up to 12 hex chars>`;
/// * suffix leaves room for at least one digest char: `zl-<digest>-<suffix>`;
/// * suffix too long to keep: `zl-<up to 12 hex chars>` (the suffix is still
///   mixed into the digest, so distinct suffixes yield distinct names).
///
/// NOTE(review): the digest comes from `DefaultHasher::new()`, whose algorithm
/// std does not promise to keep stable across Rust releases — confirm whether
/// names must survive a toolchain upgrade.
#[must_use]
pub fn make_interface_name(parts: &[&str], suffix: &str) -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    // Length of the fixed "zl-" prefix every generated name starts with.
    const PREFIX_LEN: usize = 3;

    // Fast path: the human-readable name already fits.
    let mut readable = String::from("zl-");
    readable.push_str(&parts.join("-"));
    if !suffix.is_empty() {
        readable.push('-');
        readable.push_str(suffix);
    }
    if readable.len() <= MAX_IFNAME_LEN {
        return readable;
    }

    // Slow path: derive a digest over each part (in order) and then the suffix.
    let mut state = DefaultHasher::new();
    parts.iter().for_each(|part| part.hash(&mut state));
    suffix.hash(&mut state);
    let digest = format!("{:x}", state.finish());

    // Digest characters that fit next to "-<suffix>"; zero means the suffix
    // cannot be kept (or there is none) and the whole budget goes to the digest.
    let room_beside_suffix = if suffix.is_empty() {
        0
    } else {
        MAX_IFNAME_LEN.saturating_sub(PREFIX_LEN + 1 + suffix.len())
    };

    if room_beside_suffix == 0 {
        let keep = (MAX_IFNAME_LEN - PREFIX_LEN).min(digest.len());
        format!("zl-{}", &digest[..keep])
    } else {
        let keep = room_beside_suffix.min(digest.len());
        format!("zl-{}-{}", &digest[..keep], suffix)
    }
}
93
/// Side-effect-free orphan test used by [`OverlaydServer::prune_orphan_bridges`].
///
/// `name` is an orphan only when all of the following hold:
///
/// * it carries our `zl-` prefix, which keeps the sweep off unrelated host
///   links;
/// * it ends in `-b` (per-service bridge) or `-d` (dedicated device), which
///   keeps the sweep off the global/shared interfaces and the `veth-…`/`vc-…`
///   container-veth namespace (those are reclaimed by the PID-keyed
///   `sweep_orphan_veths`, never here);
/// * it is absent from `live` (the names the daemon says SHOULD exist) and
///   from `protected` (the active global `-g` device, the node-wide `-sh`
///   shared bridge, and any live in-memory service bridge/device).
#[cfg(target_os = "linux")]
fn is_orphan_service_bridge(
    name: &str,
    live: &std::collections::HashSet<&str>,
    protected: &std::collections::HashSet<String>,
) -> bool {
    let is_ours = name.starts_with("zl-");
    let is_service_scoped = name.ends_with("-b") || name.ends_with("-d");
    let still_wanted = live.contains(name) || protected.contains(name);

    is_ours && is_service_scoped && !still_wanted
}
118
119/// First usable host address in `subnet`.
120///
121/// For IPv4 this is `network() + 1` (skipping the network address). For IPv6
122/// the same rule applies — the network address is conventionally reserved.
123fn first_usable_ip(subnet: ipnet::IpNet) -> IpAddr {
124    match subnet {
125        ipnet::IpNet::V4(v4) => {
126            let net = u32::from(v4.network());
127            IpAddr::V4(Ipv4Addr::from(net.wrapping_add(1)))
128        }
129        ipnet::IpNet::V6(v6) => {
130            let net = u128::from(v6.network());
131            IpAddr::V6(Ipv6Addr::from(net.wrapping_add(1)))
132        }
133    }
134}
135
/// Bridge-side inputs handed to [`OverlaydServer::attach_to_interface`] when a
/// container joins a per-service Linux bridge.
#[cfg(target_os = "linux")]
#[derive(Debug)]
struct BridgeAttachParams<'a> {
    /// Host-side Linux bridge that the host end of the veth is enslaved into.
    bridge_name: &'a str,
    /// L3 gateway address of the bridge; the container's default route points
    /// at it.
    gateway: IpAddr,
    /// Prefix length of the subnet the bridge serves.
    subnet_prefix_len: u8,
}
148
/// Bookkeeping written by [`OverlaydServer::attach_container`] and consumed by
/// `detach_container`.
///
/// One record exists per successfully attached Linux container (stored in the
/// per-PID `attached` map) and per macOS host-shared container (stored in the
/// `host_shared_attachments` map). The type is cross-platform so the
/// host-shared path — which runs on macOS — can reuse the same record.
#[derive(Debug, Clone)]
struct AttachInfo {
    /// Address allocated on the per-service overlay (eth0 inside the
    /// container).
    service_ip: IpAddr,
    /// Service whose bridge owns `service_ip`.
    service_name: Option<String>,
    /// Address allocated on the global overlay (eth1); `Some` iff the
    /// container also attached to the global overlay. The detach path now
    /// deletes `veth-<pid>-g` unconditionally (idempotent), so no separate
    /// `joined_global` flag is needed.
    ///
    /// Linux-only: this per-container global/eth1 IP is allocated and read
    /// solely by the Linux veth attach/detach paths. Host-shared containers
    /// (macOS/Windows) share the node's single cluster utun and reach the
    /// global overlay through their node `/32` alias, so they never allocate a
    /// separate eth1 IP — off Linux it is always `None` and never read.
    #[cfg_attr(not(target_os = "linux"), allow(dead_code))]
    global_ip: Option<IpAddr>,
    /// `true` when the attach asked overlayd to reap the per-service bridge as
    /// soon as the LAST container detaches (ephemeral/per-job networks);
    /// `false` for managed services, whose bridge persists across scale-to-0.
    ephemeral: bool,
    /// Name of the isolated network this container joined, if any; drives
    /// per-network L3 isolation membership cleanup on detach.
    isolation_network: Option<String>,
}
180
/// Bookkeeping written by [`OverlaydServer::attach_container_guest`] for a
/// guest-managed attach.
///
/// Nothing here is platform-specific (no netns/veth/HCN): the guest runs its
/// own `WireGuard` device, and the host merely allocated the address and
/// registered the guest's public key as a global peer.
#[derive(Debug, Clone)]
struct GuestAttachInfo {
    /// Overlay address handed to the guest; released on detach.
    overlay_ip: IpAddr,
    /// Base64 public key registered for the guest on the global transport;
    /// removed on detach.
    public_key: String,
    /// Service whose bridge pool owns `overlay_ip` (Linux service-bridge
    /// path), or `None` when the address was drawn from the node slice.
    /// Mirrors `AttachInfo::service_name` so detach returns the IP to the
    /// right pool.
    service_name: Option<String>,
    /// Name of the isolated network this guest joined, if any; drives
    /// per-network membership cleanup on detach. The guest's own enforcement
    /// (`WireGuard` `AllowedIPs`) is wired separately — overlayd only
    /// maintains the membership map here.
    isolation_network: Option<String>,
}
202
203/// Per-service Linux bridge state. One bridge per service per node; containers
204/// attach to it via veth pairs and cross-node packets ride the single cluster
205/// `OverlayTransport` with the service subnet plumbed into its `AllowedIPs`.
206#[cfg(target_os = "linux")]
207#[derive(Debug)]
208struct ServiceBridge {
209    /// Linux bridge name, kept under IFNAMSIZ-1 by [`make_interface_name`].
210    name: String,
211    /// CIDR of the service's subnet on this node.
212    subnet: ipnet::IpNet,
213    /// Gateway IP within the subnet (first usable address).
214    gateway: IpAddr,
215    /// Per-service IP allocator covering `subnet`.
216    ip_allocator: zlayer_overlay::allocator::IpAllocator,
217}
218
219/// A dedicated per-service `WireGuard` transport (`OverlayMode::Dedicated`).
220///
221/// Unlike Shared mode — where every service subnet is plumbed onto the single
222/// cluster [`OverlayTransport`] via multi-CIDR `AllowedIPs` — a Dedicated
223/// service owns a *second* real `WireGuard` device with its own crypto context,
224/// listen port, overlay IP, and subnet. The device is portable (boringtun
225/// userspace `WireGuard` works on Linux/macOS/Windows), so this struct is
226/// cross-platform; only the bridge/HCN *attachment* of containers onto it is
227/// platform-gated.
228struct ServiceTransport {
229    /// The live dedicated `WireGuard` device. Dropping it tears down the TUN.
230    transport: OverlayTransport,
231    /// Actual interface name (kernel-assigned `utunN` on macOS).
232    interface: String,
233    /// base64 public key of this dedicated device.
234    public_key: String,
235    /// UDP listen port handed out by [`DedicatedPortAllocator`].
236    listen_port: u16,
237    /// This node's overlay IP on the dedicated device.
238    overlay_ip: std::net::IpAddr,
239    /// The service's subnet carried by the dedicated device.
240    subnet: ipnet::IpNet,
241    /// Guest-attach IPAM bounded to `subnet`. VZ-Linux / WSL2 guests that join
242    /// this Dedicated service draw their overlay IP from here so they land on
243    /// the dedicated device's subnet (own crypto) rather than the node slice.
244    /// The node's own `overlay_ip` is reserved at setup so guests never collide
245    /// with it. Unused on Linux, where dedicated containers attach via a
246    /// per-service bridge that owns its own allocator.
247    #[cfg_attr(target_os = "linux", allow(dead_code))]
248    ip_allocator: zlayer_overlay::allocator::IpAllocator,
249}
250
251/// The overlay daemon engine.
252pub struct OverlaydServer {
253    /// Deployment name (used for network naming). Set by `SetupGlobalOverlay`.
254    deployment: String,
255    /// Per-daemon-process disambiguator included in overlay link names. Set by
256    /// `SetupGlobalOverlay`.
257    instance_id: String,
258    /// Root data directory; HCN markers, IPAM state, etc. live under it.
259    data_dir: PathBuf,
260    /// Global overlay interface name.
261    global_interface: Option<String>,
262    /// Global overlay transport (kept alive for the TUN device lifetime). The
263    /// SINGLE cluster-wide `WireGuard` transport; every service subnet is
264    /// plumbed through its `AllowedIPs`.
265    global_transport: Option<OverlayTransport>,
266    /// Service-name -> per-service Linux bridge / placeholder name.
267    service_interfaces: HashMap<String, String>,
268    /// Service-name -> dedicated per-service `WireGuard` transport (Dedicated
269    /// mode). Coexists with `global_transport`. Empty for Shared-only nodes.
270    service_transports: HashMap<String, ServiceTransport>,
271    /// Port allocator for dedicated devices (band above the global WG port).
272    dedicated_ports: DedicatedPortAllocator,
273    /// Per-service bridge state (Linux only).
274    #[cfg(target_os = "linux")]
275    service_bridges: HashMap<String, ServiceBridge>,
276    /// The SINGLE node-wide shared bridge backing every `OverlayMode::Shared`
277    /// service (Linux only). Created once on the first Shared-service setup and
278    /// reused for all subsequent ones; container ports are exposed via the
279    /// userspace free-port L4 proxy (`proxy_manager.rs`), not per-service
280    /// bridges. `None` until the first Shared service is set up.
281    #[cfg(target_os = "linux")]
282    shared_bridge: Option<ServiceBridge>,
283    /// Resolved per-service overlay mode, recorded at `setup_service_overlay_*`
284    /// time so the container ATTACH path knows which data-plane a service uses
285    /// (per-service bridge for `Auto`/`Dedicated` vs the single shared bridge
286    /// for `Shared`) without re-deriving it. Cross-platform.
287    service_modes: HashMap<String, OverlayMode>,
288    /// Local fallback `ServiceSubnetRegistry`. Used by the Linux Shared bridge
289    /// path and by the cross-platform Dedicated path (subnets stay globally
290    /// unique regardless of mode/OS).
291    service_subnet_registry: Option<zlayer_overlay::allocator::ServiceSubnetRegistry>,
292    /// Local raft node id used as the partition key for service-subnet assign.
293    local_node_id: u64,
294    /// Base64 `WireGuard` public key of THIS node's cluster transport, as told
295    /// by the main daemon via `SetLocalWgPubkey` (used for service-subnet
296    /// `AllowedIPs` plumbing).
297    local_wg_pubkey: Option<String>,
298    /// Public key generated for the live global transport, recorded at
299    /// `setup_global_overlay` time so `Status` can surface it (the transport
300    /// itself exposes no public-key accessor).
301    transport_public_key: Option<String>,
302    /// IP allocator for the node's overlay slice.
303    ip_allocator: IpAllocator,
304    /// This node's IP on the global overlay network.
305    node_ip: Option<IpAddr>,
306    /// `WireGuard` listen port for the overlay network.
307    overlay_port: u16,
308    /// Full cluster CIDR (e.g. `10.200.0.0/16`).
309    cluster_cidr: Option<IpNetwork>,
310    /// Per-node slice CIDR.
311    slice_cidr: Option<IpNetwork>,
312    /// Map of HCN namespace GUID -> (`service_name`, `allocated_ip`,
313    /// `isolation_network`) for autoclean. The trailing `isolation_network` lets
314    /// detach drain the per-network membership map for this container.
315    #[cfg(target_os = "windows")]
316    hcn_cleanup: HashMap<windows::core::GUID, (String, std::net::IpAddr, Option<String>)>,
317    /// Per-service container-IP allocators for Windows dedicated services. Each
318    /// is bounded to that service's subnet (not the node slice) so dedicated
319    /// containers draw addresses from their own isolated network. Keyed by
320    /// service name; created lazily on the first dedicated attach.
321    #[cfg(target_os = "windows")]
322    service_ip_allocators: HashMap<String, IpAllocator>,
323    /// Per-PID tracking of overlay attachments on Linux.
324    #[cfg(target_os = "linux")]
325    attached: HashMap<u32, AttachInfo>,
326    /// Per-isolated-network membership: network name -> the set of member
327    /// overlay (service) IPs currently attached to it. Drives per-network L3
328    /// isolation (a member reaches only its own network's members + node +
329    /// egress). Populated on attach, drained on detach, across all platforms.
330    network_members: std::collections::HashMap<String, std::collections::HashSet<IpAddr>>,
331    /// Peers installed on the GLOBAL transport via `AddPeer { Global }`, keyed by
332    /// base64 public key. Tracked here (in wire-safe [`PeerSpec`] form, with the
333    /// keys kept base64 — the boringtun UAPI dump only exposes hex keys) so a
334    /// guest-managed attach can hand the guest the exact peer set the host's own
335    /// global device carries. Platform-agnostic: the guest path runs on macOS.
336    global_peers: HashMap<String, PeerSpec>,
337    /// Guest-managed overlay attachments, keyed by the opaque container `id` from
338    /// [`AttachHandle::GuestManaged`]. Records the allocated overlay IP and the
339    /// generated public key registered in the mesh so `DetachContainer` can
340    /// release the IP and remove the peer.
341    guest_attachments: HashMap<String, GuestAttachInfo>,
342    /// Host-shared overlay attachments, keyed by the opaque container `id` from
343    /// [`AttachHandle::HostShared`] (macOS Seatbelt / native-VZ / libkrun
344    /// containers that share the node's host network namespace and its single
345    /// cluster `utun`). Records the distinct overlay `/32` allocated for the
346    /// container so `DetachContainer` can remove the utun alias, drain the
347    /// per-network L3 isolation membership, and release the IP. Cross-platform
348    /// (the host-shared path compiles everywhere; it is exercised on macOS).
349    host_shared_attachments: HashMap<String, AttachInfo>,
350    /// Overlay DNS server listen address, if one was bootstrapped.
351    dns_server_addr: Option<SocketAddr>,
352    /// DNS domain for overlay service discovery.
353    dns_domain: Option<String>,
354    /// Overlay DNS A/AAAA records this node owns (name -> ip).
355    dns_records: HashMap<String, IpAddr>,
356    /// NAT traversal configuration threaded into every `OverlayConfig`.
357    nat_config: Option<NatConfig>,
358    /// Override for `OverlayConfig::uapi_sock_dir`.
359    uapi_sock_dir: Option<PathBuf>,
360    /// Live NAT traversal orchestrator.
361    nat_traversal: Option<NatTraversal>,
362    /// Unix-epoch seconds of the last successful candidate gather / STUN refresh.
363    nat_last_refresh: AtomicU64,
364    /// NAT-traversal candidates each peer advertised, keyed by base64 public
365    /// key. Populated from `AddPeer { Global }` (the join-time candidate
366    /// exchange); the NAT maintenance tick feeds these into
367    /// `NatTraversal::connect_to_peer` to hole-punch / relay toward a peer whose
368    /// direct endpoint has not produced a recent `WireGuard` handshake.
369    peer_candidates: HashMap<String, Vec<Candidate>>,
370    /// The [`ConnectionType`] last negotiated to each peer (keyed by base64
371    /// public key), recorded by the connect loop so `NatStatus` can report
372    /// direct / hole-punched / relayed per peer.
373    peer_connection_type: HashMap<String, ConnectionType>,
374    /// Built-in relay server, started lazily on the first NAT tick when the
375    /// resolved [`NatConfig::relay_server`] is `Some`. Kept alive for the
376    /// daemon's lifetime so its background accept loop keeps running.
377    relay_server: Option<RelayServer>,
378    /// The address the built-in [`Self::relay_server`] actually bound (the real
379    /// port when `listen_port == 0`).
380    relay_bound_addr: Option<SocketAddr>,
381    /// Cluster-shared credential used to derive the built-in relay server's
382    /// `BLAKE2b` auth key. Carried in `NatConfigSpec.relay_server.auth_credential`
383    /// (the main daemon sets it from the cluster HS256 secret) so every node's
384    /// relay client derives the *same* key. `None` when no credential was
385    /// supplied (the relay then derives a key from the empty string — only nodes
386    /// that likewise have no credential can use it).
387    cluster_relay_credential: Option<String>,
388    /// Set when a `Shutdown` request has been received.
389    shutdown_requested: bool,
390    /// IPv4 `net.ipv4.ip_forward` value observed BEFORE the daemon first
391    /// enabled forwarding for an overlay container attach. `Some(prev)` is
392    /// recorded exactly once (the first time we flip it to `1`); teardown
393    /// restores `prev` so a clean shutdown reverts host routing state the
394    /// daemon turned on without clobbering an operator who set it. `None`
395    /// means the daemon never enabled IPv4 forwarding (nothing to revert).
396    #[cfg(target_os = "linux")]
397    prev_ipv4_forward: Option<String>,
398    /// Per-interface IPv6 `net.ipv6.conf.<dev>.forwarding` was enabled on
399    /// these device names for overlay routing. We enable forwarding
400    /// PER-INTERFACE (never `net.ipv6.conf.all.forwarding`, which has the
401    /// documented side effect of forcing `accept_ra=0` + `autoconf=0` on
402    /// every IPv6 interface — including the public NIC — and silently
403    /// dropping the RA-learned default route / path-MTU, which blackholes
404    /// the host's own larger reply packets). Teardown clears forwarding on
405    /// exactly these devices.
406    #[cfg(target_os = "linux")]
407    ipv6_forward_ifaces: std::collections::HashSet<String>,
408    /// Host-side veth device names THIS daemon created (`veth-<pid>-<tag>`),
409    /// recorded right after a successful `create_veth_pair`. A clean global
410    /// teardown deletes each so no host veth half is left dangling once the
411    /// overlay stops. Per-container detach may delete some of these first;
412    /// deletion is idempotent (a missing device is ignored). Only names this
413    /// daemon created are tracked — never a blanket prefix sweep that could
414    /// catch a concurrent overlay's interfaces.
415    #[cfg(target_os = "linux")]
416    created_veths: std::collections::HashSet<String>,
417    /// `zl-*` bridge device names THIS daemon created (per-service and the
418    /// node-wide shared bridge), recorded right after a successful
419    /// `create_bridge` + address + up. Deleting the bridge link on teardown
420    /// also drops its gateway address and up state, so the name alone is enough
421    /// to fully revert it.
422    #[cfg(target_os = "linux")]
423    created_bridges: std::collections::HashSet<String>,
424    /// Host `/32` (`/128`) routes to a container IP via a host-side veth that
425    /// THIS daemon installed via `replace_route_via_dev` (the bridgeless attach
426    /// path). Each entry is `(dest, prefix_len, dev)` — enough to delete the
427    /// exact route on teardown via `delete_route_via_dev`. Deletion is
428    /// idempotent (a route a prior detach already removed is ignored).
429    #[cfg(target_os = "linux")]
430    created_host_routes: Vec<(IpAddr, u8, String)>,
431}
432
/// Decide whether the `WireGuard` `local_endpoint` must be forced to
/// UNSPECIFIED because the daemon runs rootless.
///
/// Under rootless mode `detect_physical_egress()` executes inside the daemon
/// netns and resolves pasta's in-netns tap IP — an address that means nothing
/// to remote peers as a WG source/advertised endpoint. The decision lives in a
/// pure function so it can be tested without touching the process-global
/// `ZLAYER_ROOTLESS` env var (env writes race across parallel tests).
///
/// Returns `rootless` unchanged.
fn rootless_forces_unspecified(rootless: bool) -> bool {
    rootless
}
443
/// Decide whether failing to create the HOST overlay adapter must abort
/// overlay setup on this node.
///
/// * Linux: the host adapter (a kernel TUN brought up via netlink, with the
///   rootless userns+netns path as a fallback) IS the container data path, so
///   a creation failure is always fatal.
/// * macOS/Windows: Linux containers live in a VZ VM / WSL2 distro that
///   creates its OWN overlay device and meshes VM-to-VM over UDP. The host
///   adapter (utun/Wintun, which needs root/Administrator) is only the host's
///   own membership in the overlay and is NOT on the container data path, so a
///   failure DEGRADES to a VM-only overlay (warn + continue) unless the caller
///   set `host_adapter_mandatory`.
///
/// Kept as a `cfg!`-driven pure function so the degrade decision can be
/// unit-tested on Linux without provoking a real utun/Wintun syscall failure.
fn host_adapter_failure_is_fatal(host_adapter_mandatory: bool) -> bool {
    if cfg!(target_os = "linux") {
        // The host adapter carries container traffic here: never degrade.
        return true;
    }
    host_adapter_mandatory
}
460
461impl OverlaydServer {
462    /// Create a fresh server bound to `data_dir`. The overlay itself is brought
463    /// up lazily by `SetupGlobalOverlay` (which carries the deployment, slice,
464    /// port, and NAT toggle from the main daemon).
465    ///
466    /// # Panics
467    /// Panics only if the compile-time-constant default CIDR `10.200.0.0/16`
468    /// fails to parse (impossible).
469    #[must_use]
470    pub fn new(data_dir: PathBuf) -> Self {
471        // Until SetupGlobalOverlay arrives, the allocator is bounded to the
472        // default cluster /16. SetupGlobalOverlay re-binds it to the node slice.
473        let default_cidr: IpNetwork = "10.200.0.0/16".parse().expect("compile-time constant CIDR");
474        let overlay_port = zlayer_core::DEFAULT_WG_PORT;
475
476        // Rehydrate the dedicated-port allocator from the on-disk marker so a
477        // service that already owns a dedicated overlay re-binds the exact UDP
478        // port it had before this process started.
479        let marker_path = zlayer_paths::ZLayerDirs::new(data_dir.clone()).agent_network_state();
480        let recorded_dedicated_ports: Vec<u16> = NetworkState::load(&marker_path)
481            .networks
482            .iter()
483            .filter(|n| n.owner.starts_with("service:"))
484            .filter_map(|n| n.wg_port)
485            .collect();
486
487        Self {
488            deployment: String::new(),
489            instance_id: String::new(),
490            data_dir,
491            global_interface: None,
492            global_transport: None,
493            service_interfaces: HashMap::new(),
494            service_transports: HashMap::new(),
495            dedicated_ports: DedicatedPortAllocator::new(overlay_port, recorded_dedicated_ports),
496            #[cfg(target_os = "linux")]
497            service_bridges: HashMap::new(),
498            #[cfg(target_os = "linux")]
499            shared_bridge: None,
500            service_modes: HashMap::new(),
501            service_subnet_registry: None,
502            local_node_id: 0,
503            local_wg_pubkey: None,
504            transport_public_key: None,
505            ip_allocator: IpAllocator::new(default_cidr),
506            node_ip: None,
507            overlay_port,
508            cluster_cidr: Some(default_cidr),
509            slice_cidr: None,
510            #[cfg(target_os = "windows")]
511            hcn_cleanup: HashMap::new(),
512            #[cfg(target_os = "windows")]
513            service_ip_allocators: HashMap::new(),
514            #[cfg(target_os = "linux")]
515            attached: HashMap::new(),
516            network_members: std::collections::HashMap::new(),
517            global_peers: HashMap::new(),
518            guest_attachments: HashMap::new(),
519            host_shared_attachments: HashMap::new(),
520            dns_server_addr: None,
521            dns_domain: None,
522            dns_records: HashMap::new(),
523            nat_config: None,
524            uapi_sock_dir: None,
525            nat_traversal: None,
526            nat_last_refresh: AtomicU64::new(0),
527            peer_candidates: HashMap::new(),
528            peer_connection_type: HashMap::new(),
529            relay_server: None,
530            relay_bound_addr: None,
531            cluster_relay_credential: None,
532            shutdown_requested: false,
533            #[cfg(target_os = "linux")]
534            prev_ipv4_forward: None,
535            #[cfg(target_os = "linux")]
536            ipv6_forward_ifaces: std::collections::HashSet::new(),
537            #[cfg(target_os = "linux")]
538            created_veths: std::collections::HashSet::new(),
539            #[cfg(target_os = "linux")]
540            created_bridges: std::collections::HashSet::new(),
541            #[cfg(target_os = "linux")]
542            created_host_routes: Vec::new(),
543        }
544    }
545
546    /// Override the `WireGuard` UAPI socket directory for every overlay
547    /// transport built by this server.
548    #[must_use]
549    pub fn with_uapi_sock_dir(mut self, dir: impl Into<PathBuf>) -> Self {
550        self.uapi_sock_dir = Some(dir.into());
551        self
552    }
553
554    /// Whether a `Shutdown` request has been received.
555    #[must_use]
556    pub fn shutdown_requested(&self) -> bool {
557        self.shutdown_requested
558    }
559
560    /// The root data directory this server was constructed with. Used by the
561    /// uninstall path (`purge_managed_networks`) and for HCN marker resolution.
562    #[must_use]
563    pub fn data_dir(&self) -> &Path {
564        &self.data_dir
565    }
566
567    // -- request dispatch ----------------------------------------------------
568
569    /// Execute one [`OverlaydRequest`], producing the [`OverlaydResponse`] the
570    /// server sends back over IPC. Any internal error is folded into
571    /// [`OverlaydResponse::Err`].
572    pub async fn handle(&mut self, req: OverlaydRequest) -> OverlaydResponse {
573        match self.dispatch(req).await {
574            Ok(resp) => resp,
575            Err(e) => OverlaydResponse::Err {
576                message: e.to_string(),
577            },
578        }
579    }
580
581    #[allow(clippy::too_many_lines)]
582    async fn dispatch(&mut self, req: OverlaydRequest) -> Result<OverlaydResponse, OverlaydError> {
583        match req {
584            OverlaydRequest::SetLocalNodeId { node_id } => {
585                self.local_node_id = node_id;
586                Ok(OverlaydResponse::Ok)
587            }
588            OverlaydRequest::SetLocalWgPubkey { pubkey } => {
589                self.local_wg_pubkey = Some(pubkey);
590                Ok(OverlaydResponse::Ok)
591            }
592            OverlaydRequest::SetupGlobalOverlay {
593                deployment,
594                instance_id,
595                cluster_cidr,
596                slice_cidr,
597                wg_port,
598                nat,
599                host_adapter_mandatory,
600            } => {
601                let name = self
602                    .setup_global_overlay(
603                        deployment,
604                        instance_id,
605                        &cluster_cidr,
606                        slice_cidr.as_deref(),
607                        wg_port,
608                        nat,
609                        host_adapter_mandatory,
610                    )
611                    .await?;
612                Ok(OverlaydResponse::BridgeName { name })
613            }
614            OverlaydRequest::TeardownGlobalOverlay => {
615                self.teardown_global_overlay();
616                Ok(OverlaydResponse::Ok)
617            }
618            OverlaydRequest::SetupServiceOverlay { service, mode } => {
619                let info = self.setup_service_overlay(&service, mode).await?;
620                Ok(OverlaydResponse::ServiceOverlay(info))
621            }
622            OverlaydRequest::TeardownServiceOverlay { service } => {
623                self.teardown_service_overlay(&service).await;
624                Ok(OverlaydResponse::Ok)
625            }
626            OverlaydRequest::AllocateIp {
627                service,
628                join_global,
629            } => {
630                let ip = self.allocate_ip(&service, join_global)?;
631                Ok(OverlaydResponse::Ip { ip })
632            }
633            OverlaydRequest::ReleaseIp { ip } => {
634                self.release_ip(ip);
635                Ok(OverlaydResponse::Ok)
636            }
637            OverlaydRequest::AttachContainer {
638                handle,
639                service,
640                join_global,
641                dns_server,
642                dns_domain,
643                ephemeral,
644                isolation_network,
645            } => {
646                // A guest-managed attach takes a wholly separate path: it cannot
647                // build a veth/HCN endpoint (the target is a VM, not a host
648                // process), so it allocates the overlay identity + peer set and
649                // returns it as `GuestConfig`. PID/HCN handles keep the existing
650                // veth/HCN attach and return `Attached`.
651                if let AttachHandle::GuestManaged { id } = handle {
652                    // Record the overlay DNS resolver/zone the daemon staged for
653                    // this node so the guest config can fall back to them (same
654                    // bookkeeping `attach_container` does for the other handles).
655                    if let Some(server) = dns_server {
656                        self.dns_server_addr = Some(SocketAddr::new(server, 53));
657                    }
658                    if dns_domain.is_some() {
659                        self.dns_domain.clone_from(&dns_domain);
660                    }
661                    let config = self
662                        .attach_container_guest(
663                            &id,
664                            &service,
665                            join_global,
666                            dns_server,
667                            dns_domain,
668                            isolation_network,
669                        )
670                        .await?;
671                    Ok(OverlaydResponse::GuestConfig(config))
672                } else {
673                    let result = self
674                        .attach_container(
675                            handle,
676                            &service,
677                            join_global,
678                            ephemeral,
679                            dns_server,
680                            dns_domain,
681                            isolation_network,
682                        )
683                        .await?;
684                    Ok(OverlaydResponse::Attached(result))
685                }
686            }
687            OverlaydRequest::DetachContainer { handle } => {
688                if let AttachHandle::GuestManaged { id } = handle {
689                    self.detach_container_guest(&id).await?;
690                } else {
691                    self.detach_container(handle).await?;
692                }
693                Ok(OverlaydResponse::Ok)
694            }
695            // `scope` selects the target device: `Global` (default) = the single
696            // cluster transport; `Service { service }` = that service's
697            // dedicated per-service transport.
698            OverlaydRequest::AddPeer { peer, scope } => {
699                let info = peer_spec_to_info(&peer)?;
700                // VM-only overlay (macOS/Windows host adapter unavailable):
701                // there is no host transport to program for the Global scope, so
702                // WARN-AND-SKIP the on-device install instead of erroring. The
703                // peer is still mirrored into `global_peers` below so guests can
704                // reproduce the global peer set via the separate guest-config
705                // push — the host simply doesn't join. `Some` transports are
706                // unaffected.
707                if matches!(scope, PeerScope::Global) && self.global_transport.is_none() {
708                    tracing::warn!(
709                        peer = %peer.public_key,
710                        "global overlay has no host adapter (VM-only overlay); \
711                         skipping host peer install — guests receive this peer via \
712                         guest-config push"
713                    );
714                } else {
715                    let transport = self.transport_for_scope(&scope)?;
716                    Self::add_peer_on(transport, &info).await?;
717                }
718                // Record the peer's advertised NAT candidates (if any) so the
719                // NAT maintenance tick can hole-punch / relay toward it. Stored
720                // for both scopes keyed by public key (the cluster transport is
721                // the one carrying packets either way). Empty candidate lists
722                // are dropped from the map so the tick's borrow loop stays cheap.
723                if peer.candidates.is_empty() {
724                    self.peer_candidates.remove(&peer.public_key);
725                } else {
726                    let parsed: Vec<Candidate> = peer
727                        .candidates
728                        .iter()
729                        .filter_map(wire_to_candidate)
730                        .collect();
731                    if parsed.is_empty() {
732                        self.peer_candidates.remove(&peer.public_key);
733                    } else {
734                        self.peer_candidates.insert(peer.public_key.clone(), parsed);
735                    }
736                }
737                // Mirror Global peers into `global_peers` so a guest-managed
738                // attach can reproduce the host's global peer set for the guest.
739                if matches!(scope, PeerScope::Global) {
740                    self.global_peers.insert(peer.public_key.clone(), peer);
741                }
742                Ok(OverlaydResponse::Ok)
743            }
744            OverlaydRequest::RemovePeer { pubkey, scope } => {
745                // VM-only overlay: no host transport for the Global scope, so the
746                // on-device removal is a no-op — just drop it from `global_peers`
747                // below. `Some` transports are unaffected.
748                if matches!(scope, PeerScope::Global) && self.global_transport.is_none() {
749                    tracing::warn!(
750                        peer = %pubkey,
751                        "global overlay has no host adapter (VM-only overlay); \
752                         skipping host peer removal"
753                    );
754                } else {
755                    let transport = self.transport_for_scope(&scope)?;
756                    Self::remove_peer_on(transport, &pubkey).await?;
757                }
758                if matches!(scope, PeerScope::Global) {
759                    self.global_peers.remove(&pubkey);
760                }
761                self.peer_candidates.remove(&pubkey);
762                self.peer_connection_type.remove(&pubkey);
763                Ok(OverlaydResponse::Ok)
764            }
765            OverlaydRequest::AddAllowedIp {
766                pubkey,
767                cidr,
768                scope,
769            } => {
770                // VM-only overlay: no host device to plumb AllowedIPs into for the
771                // Global scope — warn-and-skip. `Some` transports are unaffected.
772                if matches!(scope, PeerScope::Global) && self.global_transport.is_none() {
773                    tracing::warn!(
774                        peer = %pubkey,
775                        cidr = %cidr,
776                        "global overlay has no host adapter (VM-only overlay); \
777                         skipping host AllowedIP add"
778                    );
779                } else {
780                    let transport = self.transport_for_scope(&scope)?;
781                    Self::add_allowed_ip_on(transport, &pubkey, &cidr).await?;
782                }
783                Ok(OverlaydResponse::Ok)
784            }
785            OverlaydRequest::RemoveAllowedIp {
786                pubkey,
787                cidr,
788                scope,
789            } => {
790                // VM-only overlay: no host device for the Global scope — the
791                // removal is a no-op. `Some` transports are unaffected.
792                if matches!(scope, PeerScope::Global) && self.global_transport.is_none() {
793                    tracing::warn!(
794                        peer = %pubkey,
795                        cidr = %cidr,
796                        "global overlay has no host adapter (VM-only overlay); \
797                         skipping host AllowedIP removal"
798                    );
799                } else {
800                    let transport = self.transport_for_scope(&scope)?;
801                    Self::remove_allowed_ip_on(transport, &pubkey, &cidr).await?;
802                }
803                Ok(OverlaydResponse::Ok)
804            }
805            OverlaydRequest::RegisterDns { name, ip } => {
806                self.register_dns(name, ip);
807                Ok(OverlaydResponse::Ok)
808            }
809            OverlaydRequest::UnregisterDns { name } => {
810                self.unregister_dns(&name);
811                Ok(OverlaydResponse::Ok)
812            }
813            OverlaydRequest::WriteScopedResolver {
814                zone,
815                node_ip,
816                port,
817            } => {
818                #[cfg(target_os = "macos")]
819                {
820                    zlayer_overlay::dns::write_scoped_resolver(&zone, node_ip, port).map_err(
821                        |e| OverlaydError::Overlay(format!("write_scoped_resolver({zone}): {e}")),
822                    )?;
823                    Ok(OverlaydResponse::Ok)
824                }
825                #[cfg(not(target_os = "macos"))]
826                {
827                    let _ = (zone, node_ip, port);
828                    Err(OverlaydError::Overlay(
829                        "scoped resolver is macOS-only".into(),
830                    ))
831                }
832            }
833            OverlaydRequest::RemoveScopedResolver { zone } => {
834                #[cfg(target_os = "macos")]
835                {
836                    zlayer_overlay::dns::remove_scoped_resolver(&zone).map_err(|e| {
837                        OverlaydError::Overlay(format!("remove_scoped_resolver({zone}): {e}"))
838                    })?;
839                    Ok(OverlaydResponse::Ok)
840                }
841                #[cfg(not(target_os = "macos"))]
842                {
843                    let _ = zone;
844                    Err(OverlaydError::Overlay(
845                        "scoped resolver is macOS-only".into(),
846                    ))
847                }
848            }
849            OverlaydRequest::PruneOrphanBridges { live_bridge_names } => {
850                let reclaimed = self.prune_orphan_bridges(&live_bridge_names).await;
851                Ok(OverlaydResponse::PrunedBridges { reclaimed })
852            }
853            OverlaydRequest::Status => Ok(OverlaydResponse::Status(self.status_snapshot().await)),
854            OverlaydRequest::NatTick => {
855                self.nat_maintenance_tick().await?;
856                Ok(OverlaydResponse::Ok)
857            }
858            OverlaydRequest::NatStatus => Ok(OverlaydResponse::NatStatus(
859                self.nat_status_snapshot().await,
860            )),
861            OverlaydRequest::Shutdown => {
862                self.shutdown_requested = true;
863                self.teardown_global_overlay();
864                Ok(OverlaydResponse::Ok)
865            }
866        }
867    }
868
869    // -- global overlay ------------------------------------------------------
870
871    /// Bring up (or reuse) this node's base/global overlay.
872    ///
873    /// Idempotent: if a global transport is already live, reuse it (recreating
874    /// without this guard could yank the kernel TUN out from under the running
875    /// boringtun worker). Re-binds the IP allocator to `slice_cidr` if one is
876    /// supplied so container IPs never collide across nodes.
877    ///
878    /// # Errors
879    /// Returns an error if key generation or interface creation fails.
880    #[allow(clippy::too_many_lines)]
881    #[allow(clippy::too_many_arguments)]
882    async fn setup_global_overlay(
883        &mut self,
884        deployment: String,
885        instance_id: String,
886        cluster_cidr: &str,
887        slice_cidr: Option<&str>,
888        wg_port: u16,
889        nat: Option<NatConfigSpec>,
890        host_adapter_mandatory: bool,
891    ) -> Result<String, OverlaydError> {
892        self.deployment = deployment;
893        self.instance_id = instance_id;
894        self.overlay_port = wg_port;
895
896        let cluster: IpNetwork = cluster_cidr.parse().map_err(|e| {
897            OverlaydError::Other(format!("invalid cluster CIDR {cluster_cidr}: {e}"))
898        })?;
899        self.cluster_cidr = Some(cluster);
900        if let Some(slice) = slice_cidr {
901            let slice_net: IpNetwork = slice
902                .parse()
903                .map_err(|e| OverlaydError::Other(format!("invalid slice CIDR {slice}: {e}")))?;
904            self.slice_cidr = Some(slice_net);
905            self.ip_allocator = IpAllocator::new(slice_net);
906        }
907        // Thread the full operator-supplied NAT config (STUN/TURN servers,
908        // timeouts, relay-server bind + credential) into overlayd. `None` means
909        // the main daemon supplied no explicit config, so overlayd keeps its
910        // built-in `NatConfig::default()` (NAT enabled, Google STUN). A `Some`
911        // spec is converted verbatim — including the relay credential, stashed
912        // separately so the relay server can be stood up with a cluster-shared
913        // auth key on the first NAT tick.
914        if let Some(spec) = nat {
915            self.cluster_relay_credential = spec
916                .relay_server
917                .as_ref()
918                .and_then(|r| r.auth_credential.clone());
919            self.nat_config = Some(nat_config_spec_to_config(spec));
920        }
921
922        if let Some(name) = self.global_interface.clone() {
923            if self.global_transport.is_some() {
924                tracing::debug!(
925                    deployment = %self.deployment,
926                    "Global overlay already active, reusing existing transport"
927                );
928                return Ok(name);
929            }
930        }
931
932        let interface_name = make_interface_name(&[&self.deployment, &self.instance_id], "g");
933
934        let (private_key, public_key) = OverlayTransport::generate_keys()
935            .await
936            .map_err(|e| OverlaydError::Overlay(format!("Failed to generate keys: {e}")))?;
937
938        // The node's own overlay IP is the deterministic first-usable host of
939        // its slice (reserved offset 1), NOT a racy `allocate()` that drifts by
940        // allocation order. Containers draw from offset 2 onward, so the node
941        // IP is stable across restarts and never collides with a container.
942        let node_ip = self.ip_allocator.node_ip();
943        self.transport_public_key = Some(public_key.clone());
944        let physical_egress_ip = match zlayer_overlay::detect_physical_egress().await {
945            Ok(egress) => Some(egress.ip),
946            Err(e) => {
947                tracing::warn!(
948                    error = %e,
949                    "failed to detect physical egress; WireGuard local_endpoint \
950                     will bind UNSPECIFIED for the global overlay"
951                );
952                None
953            }
954        };
955        let config = self.build_config(
956            private_key,
957            public_key,
958            node_ip,
959            16,
960            self.overlay_port,
961            physical_egress_ip,
962        );
963        // Remove any stale `-g` interface with this (now deterministic) name
964        // left by a previous daemon instance, so the create below cleanly
965        // REPLACES it instead of failing "File exists" or orphaning the old
966        // one. With a stable per-host instance id the name is constant across
967        // restarts, so exactly one global interface ever exists.
968        #[cfg(target_os = "linux")]
969        let _ = crate::netlink::delete_link_by_name(&interface_name).await;
970        let mut transport = OverlayTransport::new(config, interface_name);
971
972        // Creating the host overlay adapter is fatal on Linux (the kernel TUN IS
973        // the container data path) but only DEGRADES on macOS/Windows: there,
974        // Linux containers run in a VZ VM / WSL2 distro that creates its own
975        // overlay device and meshes VM-to-VM over UDP, so the host adapter
976        // (utun/Wintun, needs root/Administrator) is just the host's own overlay
977        // membership and is NOT on the container data path. The allocator and
978        // `node_ip` are already bound above, so guest-config push + IP allocation
979        // keep working even when the host adapter is unavailable.
980        // Map the (non-`Send`) `Box<dyn Error>` to an owned `String` BEFORE the
981        // match so no non-`Send` value is held across the `configure().await`
982        // below — the daemon's request handler future must stay `Send`.
983        let create_result = transport
984            .create_interface()
985            .await
986            .map_err(|e| e.to_string());
987        let actual_name = match create_result {
988            Ok(()) => {
989                transport.configure(&[]).await.map_err(|e| {
990                    OverlaydError::Overlay(format!("Failed to configure global overlay: {e}"))
991                })?;
992                // Read back the actual interface name (on macOS, the kernel
993                // assigns utunN).
994                let actual_name = transport.interface_name().to_string();
995                self.node_ip = Some(node_ip);
996                self.global_interface = Some(actual_name.clone());
997                self.global_transport = Some(transport);
998                actual_name
999            }
1000            Err(e) if !host_adapter_failure_is_fatal(host_adapter_mandatory) => {
1001                // macOS / Windows: continue with a VM-only overlay. Leave
1002                // `global_transport == None` (the natural "no host adapter"
1003                // signal), keep `node_ip` so allocation/guest config are
1004                // unaffected, and SKIP `configure` (no device to program).
1005                tracing::warn!(
1006                    error = %e,
1007                    "host overlay adapter unavailable (needs root/Administrator); \
1008                     continuing with VM-only overlay — the host will not join the \
1009                     overlay, but containers running in the VM mesh VM-to-VM and IP \
1010                     allocation/guest config are unaffected"
1011                );
1012                self.node_ip = Some(node_ip);
1013                self.global_interface = None;
1014                self.global_transport = None;
1015                // No real device exists; return an honest marker so the IPC
1016                // response is a success without implying a live adapter.
1017                "(host-adapter-disabled)".to_string()
1018            }
1019            Err(e) => {
1020                // Linux (and any future fatal-on-failure target): unchanged —
1021                // a host-adapter creation failure aborts overlay setup.
1022                return Err(OverlaydError::Overlay(format!(
1023                    "Failed to create global overlay: {e}"
1024                )));
1025            }
1026        };
1027
1028        // In rootless mode the daemon runs in its own network namespace and
1029        // `pasta` provides egress NAT + inbound port forwarding; the host-table
1030        // iptables setup below is at best a no-op inside the netns and at worst
1031        // spurious, so skip it entirely. Otherwise install the host firewall
1032        // rules as usual.
1033        if std::env::var_os("ZLAYER_ROOTLESS").is_none() {
1034            // Stop systemd-networkd / NetworkManager from managing the overlay
1035            // links overlayd just created. With a permissive default match they
1036            // try to bring `zl-*` up / run DHCP and (seen on a CI runner)
1037            // SIGABRT on the networkd watchdog while processing a `zl-*` Link
1038            // UP. Best-effort; reverted in `teardown_global_overlay`.
1039            zlayer_overlay::networkd::mark_overlay_interfaces_unmanaged();
1040
1041            // Allow overlay traffic through the host firewall (UFW / firewalld /
1042            // a bare `iptables -P FORWARD DROP`). Without this, a container's DNS
1043            // query to the node overlay IP — and inter-service overlay traffic —
1044            // is dropped by the host's INPUT/FORWARD policy before it reaches
1045            // ZLayer's resolver. Best-effort: a host without `iptables` logs a
1046            // warning rather than aborting overlay setup.
1047            if let Err(e) =
1048                zlayer_overlay::firewall::ensure_overlay_subnet_rules(&cluster.to_string())
1049            {
1050                tracing::warn!(
1051                    error = %e,
1052                    cidr = %cluster,
1053                    "failed to install overlay firewall allow-rules; service DNS / \
1054                     cross-service traffic may be blocked by the host firewall"
1055                );
1056            }
1057
1058            // SNAT overlay-sourced egress so containers can reach the LAN/internet.
1059            // The allow-rules above + `ip_forward` only get the packet *forwarded*
1060            // out the WAN NIC; without masquerade it leaves with a private
1061            // `10.200.0.0/16` source and replies never route back (ENETUNREACH /
1062            // hangs for `wget http://<public-ip>`). Best-effort, same as above.
1063            if let Err(e) =
1064                zlayer_overlay::firewall::ensure_overlay_masquerade(&cluster.to_string())
1065            {
1066                tracing::warn!(
1067                    error = %e,
1068                    cidr = %cluster,
1069                    "failed to install overlay egress masquerade; overlay containers \
1070                     may be unable to reach the LAN / internet"
1071                );
1072            }
1073        } else {
1074            tracing::info!(
1075                "rootless mode: skipping host iptables (pasta provides egress + port forwarding)"
1076            );
1077        }
1078
1079        Ok(actual_name)
1080    }
1081
1082    /// Tear down the node's base overlay (e.g. on full uninstall / shutdown).
1083    fn teardown_global_overlay(&mut self) {
1084        if let Some(mut transport) = self.global_transport.take() {
1085            tracing::info!("Shutting down global overlay");
1086            transport.shutdown();
1087        }
1088        self.global_interface = None;
1089        self.transport_public_key = None;
1090
1091        // Revert host network state this daemon mutated so a clean stop
1092        // recovers connectivity WITHOUT requiring a reboot. Forwarding
1093        // sysctls and the overlay iptables chains are otherwise sticky:
1094        // they survive both the daemon stop and an `iptables -F`, so prior
1095        // to this the only way to undo them was a reboot.
1096        #[cfg(target_os = "linux")]
1097        self.revert_forwarding();
1098        zlayer_overlay::firewall::remove_overlay_masquerade();
1099        zlayer_overlay::firewall::remove_overlay_subnet_rules();
1100        // `remove_member_isolation` deliberately leaves the ZLAYER-OVERLAY-ISO
1101        // chain + its FORWARD jump resident (other members may still use them);
1102        // on a full overlay teardown remove the whole chain so nothing leaks.
1103        zlayer_overlay::firewall::remove_overlay_isolation();
1104        // macOS: strip the pf overlay anchor + the two marked `/etc/pf.conf`
1105        // lines this node installs for the cluster/DNS ports. Without this they
1106        // leak past daemon stop (the anchor file and `/etc/pf.conf` refs are
1107        // sticky on disk). Idempotent: a missing anchor / not-root / disabled-pf
1108        // case is treated as a successful no-op by the backend. cfg-gated so
1109        // Linux/Windows teardown behaviour is unchanged.
1110        #[cfg(target_os = "macos")]
1111        if let Err(e) = zlayer_overlay::firewall::remove_overlay_rules() {
1112            tracing::warn!(error = %e, "failed to remove macOS pf overlay rules during teardown");
1113        }
1114        // Remove the systemd-networkd / NetworkManager "unmanaged" drop-ins we
1115        // installed at setup so a clean stop fully reverts host network state.
1116        zlayer_overlay::networkd::unmark_overlay_interfaces_unmanaged();
1117
1118        // Revert the host-side netlink resources this daemon created (veths,
1119        // host /32 routes, bridges). The netlink helpers are async; this fn must
1120        // keep its sync signature, so bridge to the surrounding multi-thread
1121        // tokio runtime via block_in_place + Handle::block_on. Order matters:
1122        // delete routes first (they reference the veth as their oif), then the
1123        // host-side veths, then the bridges (deleting a bridge link drops its
1124        // address + up state). Every delete is best-effort + idempotent: a
1125        // resource a prior per-container detach already removed surfaces as
1126        // NotFound/ESRCH which the helpers treat as success, and a genuine
1127        // failure is logged and skipped so a partial teardown never aborts the
1128        // rest.
1129        #[cfg(target_os = "linux")]
1130        {
1131            let routes: Vec<(IpAddr, u8, String)> = std::mem::take(&mut self.created_host_routes);
1132            let veths: Vec<String> = self.created_veths.drain().collect();
1133            let bridges: Vec<String> = self.created_bridges.drain().collect();
1134
1135            let delete_all = || async {
1136                for (dest, prefix, dev) in &routes {
1137                    if let Err(e) = crate::netlink::delete_route_via_dev(*dest, *prefix, dev).await
1138                    {
1139                        tracing::warn!(
1140                            dest = %dest, prefix, dev = %dev, error = %e,
1141                            "teardown: failed to delete host route (continuing)"
1142                        );
1143                    }
1144                }
1145                for veth in &veths {
1146                    if let Err(e) = crate::netlink::delete_link_by_name(veth).await {
1147                        tracing::warn!(
1148                            veth = %veth, error = %e,
1149                            "teardown: failed to delete host-side veth (continuing)"
1150                        );
1151                    }
1152                }
1153                for bridge in &bridges {
1154                    if let Err(e) = crate::netlink::delete_link_by_name(bridge).await {
1155                        tracing::warn!(
1156                            bridge = %bridge, error = %e,
1157                            "teardown: failed to delete bridge (continuing)"
1158                        );
1159                    }
1160                }
1161            };
1162
1163            match tokio::runtime::Handle::try_current() {
1164                Ok(handle) => {
1165                    tokio::task::block_in_place(|| handle.block_on(delete_all()));
1166                }
1167                Err(_) => {
1168                    // No ambient runtime (e.g. a non-async shutdown path): spin
1169                    // up a throwaway current-thread runtime to drive the deletes.
1170                    match tokio::runtime::Builder::new_current_thread()
1171                        .enable_all()
1172                        .build()
1173                    {
1174                        Ok(rt) => rt.block_on(delete_all()),
1175                        Err(e) => tracing::warn!(
1176                            error = %e,
1177                            "teardown: could not build a runtime to revert netlink \
1178                             resources; veths/routes/bridges left in place"
1179                        ),
1180                    }
1181                }
1182            }
1183        }
1184    }
1185
1186    /// Enable IP forwarding for an overlay container attach, scoped to the
1187    /// address family in use and (for IPv6) to the specific overlay devices.
1188    ///
1189    /// IPv4 has no per-interface forwarding knob that affects routing the way
1190    /// we need, so `net.ipv4.ip_forward` is global — but that is harmless for
1191    /// the host's own INPUT / reply path (it only permits the box to route
1192    /// transit traffic). We snapshot its prior value once so teardown can
1193    /// restore it.
1194    ///
1195    /// IPv6 is the dangerous case: `net.ipv6.conf.all.forwarding=1` forces
1196    /// `accept_ra=0` + `autoconf=0` on EVERY IPv6 interface, which drops the
1197    /// RA-learned default route and path-MTU on the public NIC and blackholes
1198    /// the host's own larger reply packets. We therefore enable forwarding
1199    /// only on the specific overlay device(s) via
1200    /// `net.ipv6.conf.<dev>.forwarding`, which routes overlay traffic without
1201    /// touching the physical NIC's RA / PMTU state.
1202    #[cfg(target_os = "linux")]
1203    fn enable_forwarding_for_attach(
1204        &mut self,
1205        is_v6: bool,
1206        veth_host: &str,
1207        bridge_name: Option<&str>,
1208    ) {
1209        // IPv4 forwarding (global) — required for v4 overlay egress, benign
1210        // for INPUT. Snapshot the prior value exactly once.
1211        if self.prev_ipv4_forward.is_none() {
1212            let prev = crate::netlink::read_sysctl("net.ipv4.ip_forward")
1213                .unwrap_or_else(|_| "0".to_string());
1214            self.prev_ipv4_forward = Some(prev);
1215        }
1216        let _ = crate::netlink::set_sysctl("net.ipv4.ip_forward", "1");
1217
1218        // IPv6 forwarding — PER-INTERFACE only. Enable on the host-side veth
1219        // and (when bridged) the bridge so the overlay routes, without the
1220        // `all.forwarding` RA/PMTU side effect on the physical NIC. The Linux
1221        // sysctl name uses '/' for the interface segment escaped to '.' by
1222        // set_sysctl's dot-translation — so pass the device name with any
1223        // literal dots intact (overlay device names never contain dots).
1224        if is_v6 {
1225            for dev in std::iter::once(veth_host).chain(bridge_name) {
1226                let key = format!("net.ipv6.conf.{dev}.forwarding");
1227                if crate::netlink::set_sysctl(&key, "1").is_ok() {
1228                    self.ipv6_forward_ifaces.insert(dev.to_string());
1229                }
1230            }
1231        }
1232    }
1233
1234    /// Revert the forwarding sysctls this daemon enabled (counterpart of
1235    /// [`Self::enable_forwarding_for_attach`]). Restores the snapshotted IPv4
1236    /// value and clears per-interface IPv6 forwarding on exactly the devices
1237    /// we touched. Best-effort: a failed write (device already gone, `/proc`
1238    /// not writable) is ignored — the worst case is the pre-existing sticky
1239    /// state, never a crash on shutdown.
1240    #[cfg(target_os = "linux")]
1241    fn revert_forwarding(&mut self) {
1242        if let Some(prev) = self.prev_ipv4_forward.take() {
1243            let _ = crate::netlink::set_sysctl("net.ipv4.ip_forward", &prev);
1244        }
1245        for dev in self.ipv6_forward_ifaces.drain() {
1246            let key = format!("net.ipv6.conf.{dev}.forwarding");
1247            let _ = crate::netlink::set_sysctl(&key, "0");
1248        }
1249    }
1250
1251    // -- service overlay -----------------------------------------------------
1252
1253    /// Set up the per-service Linux bridge that backs `service` on this node.
1254    ///
1255    /// Returns the bridge name on success.
1256    ///
1257    /// # Errors
1258    /// Returns an error if subnet assignment fails (exhaustion), if the bridge
1259    /// cannot be created, or if the cluster transport rejects the `AllowedIPs`
1260    /// update.
1261    #[cfg(target_os = "linux")]
1262    async fn setup_service_overlay(
1263        &mut self,
1264        service: &str,
1265        mode: OverlayMode,
1266    ) -> Result<ServiceOverlayInfo, OverlaydError> {
1267        // Decision surface is the two predicates on `OverlayMode` (see
1268        // `zlayer_types::overlay`), not an ad-hoc variant match:
1269        //   - uses_shared_bridge() -> the single node-wide shared bridge (+ the
1270        //     userspace free-port L4 proxy wired in `proxy_manager.rs`).
1271        //   - uses_per_service_wg() -> a dedicated per-service WireGuard device.
1272        //   - uses_isolation_scope() -> Isolated: Auto topology here; the L3
1273        //     fence is applied at ATTACH time via `isolation_network`.
1274        //   - otherwise (Auto)      -> per-service Linux bridge carried on the
1275        //     single cluster-wide WireGuard interface (today's default).
1276        // Record the resolved mode so the container ATTACH path can branch.
1277        let resolved = mode.resolve();
1278        self.service_modes.insert(service.to_string(), resolved);
1279        if resolved.uses_shared_bridge() {
1280            self.setup_service_overlay_shared_bridge(service).await
1281        } else if resolved.uses_per_service_wg() {
1282            self.setup_service_overlay_dedicated(service).await
1283        } else if resolved.uses_isolation_scope() {
1284            // Isolated == Auto topology (per-service bridge on the cluster-wide
1285            // WireGuard); the L3 fence is applied at ATTACH time via
1286            // `isolation_network`, not in segment setup. Same target as the
1287            // default, made explicit so a new mode can't silently fall through.
1288            self.setup_service_overlay_cluster_wg(service).await
1289        } else {
1290            self.setup_service_overlay_cluster_wg(service).await
1291        }
1292    }
1293
1294    /// `Auto`-mode per-service overlay (Linux): a per-service Linux bridge backed
1295    /// by the SINGLE cluster-wide `WireGuard` transport (the service subnet is
1296    /// plumbed onto the cluster device's `AllowedIPs`). This is the original
1297    /// default `setup_service_overlay` body, returning a [`ServiceOverlayInfo`]
1298    /// with the bridge name and all dedicated-device identity fields `None`
1299    /// (`Auto` shares the cluster device).
1300    ///
1301    /// Returns the bridge name on success.
1302    ///
1303    /// # Errors
1304    /// Returns an error if subnet assignment fails (exhaustion), if the bridge
1305    /// cannot be created, or if the cluster transport rejects the `AllowedIPs`
1306    /// update.
1307    #[cfg(target_os = "linux")]
1308    #[allow(clippy::too_many_lines)]
1309    async fn setup_service_overlay_cluster_wg(
1310        &mut self,
1311        service: &str,
1312    ) -> Result<ServiceOverlayInfo, OverlaydError> {
1313        // 1. Idempotency check.
1314        if let Some(existing) = self.service_bridges.get(service) {
1315            let name = existing.name.clone();
1316            tracing::debug!(service = %service, bridge = %name, "Service bridge already active, reusing");
1317            return Ok(cluster_wg_overlay_info(name));
1318        }
1319
1320        // 2. Assign subnet via the (currently local) ServiceSubnetRegistry.
1321        self.ensure_service_subnet_registry()?;
1322        let subnet: ipnet::IpNet = {
1323            let registry = self
1324                .service_subnet_registry
1325                .as_mut()
1326                .expect("ensure_service_subnet_registry leaves Some");
1327            let node_key = self.local_node_id.to_string();
1328            registry.assign(service, &node_key).map_err(|e| {
1329                OverlaydError::Overlay(format!(
1330                    "ServiceSubnetRegistry::assign({service}, {node_key}) failed: {e}"
1331                ))
1332            })?
1333        };
1334
1335        // 3+4+6. Create the per-service Linux bridge, assign its gateway, bring
1336        // it up, build the per-service IpAllocator, and record it.
1337        let bridge_name = self.create_service_bridge(service, subnet).await?;
1338
1339        // 5. Plumb subnet into the cluster transport's local AllowedIPs so the
1340        // single cluster device carries this service's cross-node traffic
1341        // (Shared mode shares one crypto context for every service).
1342        if let Some(ref cluster) = self.global_transport {
1343            if let Some(ref pubkey) = self.local_wg_pubkey {
1344                if let Err(e) = cluster.add_allowed_ip(pubkey, subnet).await {
1345                    tracing::warn!(
1346                        service = %service,
1347                        subnet = %subnet,
1348                        error = %e,
1349                        "Failed to add service subnet to cluster transport AllowedIPs (non-fatal)"
1350                    );
1351                }
1352            } else {
1353                tracing::debug!(service = %service, "local_wg_pubkey not yet set; skipping cluster AllowedIPs update");
1354            }
1355        }
1356
1357        Ok(cluster_wg_overlay_info(bridge_name))
1358    }
1359
1360    /// `Shared`-mode per-service overlay (Linux): attach `service` onto the
1361    /// SINGLE node-wide shared Linux bridge (created once, reused by every
1362    /// Shared service on this node), carried on the cluster-wide `WireGuard`
1363    /// interface. There is NO per-service bridge and NO per-service `WireGuard`;
1364    /// container ports are exposed via the userspace free-port L4 proxy
1365    /// (`proxy_manager.rs`). Returns the shared bridge name.
1366    ///
1367    /// Idempotent: the shared bridge is allocated a single subnet and brought up
1368    /// exactly once; subsequent Shared services reuse it. The service is recorded
1369    /// in `service_interfaces` (pointing at the shared bridge) so presence checks
1370    /// and the attach path resolve it.
1371    ///
1372    /// # Errors
1373    /// Returns an error if the one-time shared-subnet assignment fails
1374    /// (exhaustion), if the shared bridge cannot be created, or if the cluster
1375    /// transport rejects the `AllowedIPs` update.
1376    #[cfg(target_os = "linux")]
1377    async fn setup_service_overlay_shared_bridge(
1378        &mut self,
1379        service: &str,
1380    ) -> Result<ServiceOverlayInfo, OverlaydError> {
1381        let bridge_name = self.ensure_shared_bridge().await?;
1382        // Point this service at the shared bridge so presence checks succeed and
1383        // the attach path resolves it to the shared bridge.
1384        self.service_interfaces
1385            .insert(service.to_string(), bridge_name.clone());
1386        tracing::info!(service = %service, bridge = %bridge_name, "Service attached to shared node-wide bridge");
1387        Ok(shared_overlay_info(bridge_name))
1388    }
1389
1390    /// Ensure the single node-wide shared Linux bridge exists, returning its
1391    /// name. Created once with its own subnet (drawn from the same
1392    /// `ServiceSubnetRegistry` every service subnet comes from, under a fixed
1393    /// reserved key so it never collides with a real service) and plumbed onto
1394    /// the cluster transport's `AllowedIPs` so shared containers are
1395    /// mesh-reachable across nodes. Subsequent calls return the existing name.
1396    ///
1397    /// # Errors
1398    /// Returns an error if subnet assignment fails or the bridge cannot be
1399    /// created/addressed/brought up.
1400    #[cfg(target_os = "linux")]
1401    async fn ensure_shared_bridge(&mut self) -> Result<String, OverlaydError> {
1402        use zlayer_overlay::allocator::IpAllocator as OverlayIpAllocator;
1403
1404        if let Some(existing) = self.shared_bridge.as_ref() {
1405            return Ok(existing.name.clone());
1406        }
1407
1408        // One subnet for the whole shared bridge. Use a fixed reserved key in the
1409        // registry (never a real service name) so the shared bridge gets exactly
1410        // one stable subnet, distinct from every per-service subnet.
1411        self.ensure_service_subnet_registry()?;
1412        let subnet: ipnet::IpNet = {
1413            let registry = self
1414                .service_subnet_registry
1415                .as_mut()
1416                .expect("ensure_service_subnet_registry leaves Some");
1417            let node_key = self.local_node_id.to_string();
1418            registry.assign(SHARED_BRIDGE_REGISTRY_KEY, &node_key).map_err(|e| {
1419                OverlaydError::Overlay(format!(
1420                    "ServiceSubnetRegistry::assign({SHARED_BRIDGE_REGISTRY_KEY}, {node_key}) failed: {e}"
1421                ))
1422            })?
1423        };
1424
1425        // Deterministic, IFNAMSIZ-safe shared-bridge name (one per node). Use the
1426        // same naming helper as per-service bridges with a fixed key so it stays
1427        // <= 15 chars and is unambiguous (`zl-...-sh`).
1428        let bridge_name =
1429            make_interface_name(&[&self.deployment, &self.instance_id, "shared"], "sh");
1430
1431        if let Err(e) = crate::netlink::create_bridge(&bridge_name).await {
1432            return Err(OverlaydError::Overlay(format!(
1433                "create_bridge({bridge_name}) failed: {e}"
1434            )));
1435        }
1436        if let Err(e) = crate::netlink::set_bridge_stp(&bridge_name, false) {
1437            tracing::warn!(bridge = %bridge_name, error = %e, "set_bridge_stp(off) failed (non-fatal)");
1438        }
1439
1440        // Flush stale addresses first: `create_bridge` is idempotent on EEXIST, so
1441        // a shared bridge that survived a restart would otherwise accumulate a
1442        // second gateway (the same dual-address bug fixed for per-service bridges).
1443        let gateway = first_usable_ip(subnet);
1444        if let Err(e) = crate::netlink::flush_addresses_on_link_by_name(&bridge_name).await {
1445            tracing::warn!(bridge = %bridge_name, error = %e, "flush_addresses_on_link_by_name failed (non-fatal)");
1446        }
1447        if let Err(e) =
1448            crate::netlink::add_address_to_link_by_name(&bridge_name, gateway, subnet.prefix_len())
1449                .await
1450        {
1451            let _ = crate::netlink::delete_bridge(&bridge_name).await;
1452            return Err(OverlaydError::Overlay(format!(
1453                "add_address_to_link_by_name({bridge_name}, {gateway}/{}) failed: {e}",
1454                subnet.prefix_len()
1455            )));
1456        }
1457        if let Err(e) = crate::netlink::set_link_up_by_name(&bridge_name).await {
1458            let _ = crate::netlink::delete_bridge(&bridge_name).await;
1459            return Err(OverlaydError::Overlay(format!(
1460                "set_link_up_by_name({bridge_name}) failed: {e}"
1461            )));
1462        }
1463
1464        // Track the shared bridge for global teardown (deleting the link drops
1465        // its gateway address + up state).
1466        self.created_bridges.insert(bridge_name.clone());
1467
1468        let mut ip_allocator = OverlayIpAllocator::new(&subnet.to_string()).map_err(|e| {
1469            OverlaydError::Overlay(format!("IpAllocator::new({subnet}) failed: {e}"))
1470        })?;
1471        let _ = ip_allocator.allocate_specific(gateway);
1472
1473        // Plumb the shared subnet onto the cluster transport's AllowedIPs so the
1474        // single cluster device carries shared-bridge cross-node traffic (same
1475        // mechanism the cluster-WG per-service path uses).
1476        if let Some(ref cluster) = self.global_transport {
1477            if let Some(ref pubkey) = self.local_wg_pubkey {
1478                if let Err(e) = cluster.add_allowed_ip(pubkey, subnet).await {
1479                    tracing::warn!(
1480                        subnet = %subnet,
1481                        error = %e,
1482                        "Failed to add shared-bridge subnet to cluster transport AllowedIPs (non-fatal)"
1483                    );
1484                }
1485            } else {
1486                tracing::debug!(
1487                    "local_wg_pubkey not yet set; skipping shared-bridge cluster AllowedIPs update"
1488                );
1489            }
1490        }
1491
1492        self.shared_bridge = Some(ServiceBridge {
1493            name: bridge_name.clone(),
1494            subnet,
1495            gateway,
1496            ip_allocator,
1497        });
1498
1499        tracing::info!(bridge = %bridge_name, subnet = %subnet, gateway = %gateway, "Shared node-wide bridge created");
1500        Ok(bridge_name)
1501    }
1502
1503    /// Create the per-service Linux bridge for `service` on `subnet`, assign its
1504    /// gateway, bring it up, build the per-service [`IpAllocator`], and record it
1505    /// in `service_bridges` + `service_interfaces`. Returns the bridge name.
1506    ///
1507    /// Shared and Dedicated mode share this bridge mechanic verbatim — the ONLY
1508    /// difference between the two modes is which `WireGuard` device the service
1509    /// subnet/peers are plumbed onto (the single cluster transport for Shared,
1510    /// the dedicated per-service transport for Dedicated). This helper does NOT
1511    /// touch any transport's `AllowedIPs`; the caller does that against the
1512    /// device it owns.
1513    ///
1514    /// # Errors
1515    /// Returns an error if the bridge cannot be created, addressed, or brought
1516    /// up, or if the per-service `IpAllocator` cannot be built.
1517    #[cfg(target_os = "linux")]
1518    async fn create_service_bridge(
1519        &mut self,
1520        service: &str,
1521        subnet: ipnet::IpNet,
1522    ) -> Result<String, OverlaydError> {
1523        use zlayer_overlay::allocator::IpAllocator as OverlayIpAllocator;
1524
1525        let bridge_name = make_interface_name(&[&self.deployment, &self.instance_id, service], "b");
1526
1527        if let Err(e) = crate::netlink::create_bridge(&bridge_name).await {
1528            return Err(OverlaydError::Overlay(format!(
1529                "create_bridge({bridge_name}) failed: {e}"
1530            )));
1531        }
1532        if let Err(e) = crate::netlink::set_bridge_stp(&bridge_name, false) {
1533            tracing::warn!(bridge = %bridge_name, error = %e, "set_bridge_stp(off) failed (non-fatal)");
1534        }
1535
1536        // Gateway = first usable host in the subnet, assigned to the bridge.
1537        // Flush any pre-existing addresses FIRST: `create_bridge` is idempotent
1538        // on EEXIST, so a bridge that survived a restart would otherwise keep its
1539        // old gateway and we'd stack the new one on top (the observed dual
1540        // /28 + /26 bug). Flushing makes the assignment idempotent and self-heals
1541        // such bridges. Non-fatal: on a brand-new bridge there is nothing to flush.
1542        let gateway = first_usable_ip(subnet);
1543        if let Err(e) = crate::netlink::flush_addresses_on_link_by_name(&bridge_name).await {
1544            tracing::warn!(bridge = %bridge_name, error = %e, "flush_addresses_on_link_by_name failed (non-fatal)");
1545        }
1546        if let Err(e) =
1547            crate::netlink::add_address_to_link_by_name(&bridge_name, gateway, subnet.prefix_len())
1548                .await
1549        {
1550            let _ = crate::netlink::delete_bridge(&bridge_name).await;
1551            return Err(OverlaydError::Overlay(format!(
1552                "add_address_to_link_by_name({bridge_name}, {gateway}/{}) failed: {e}",
1553                subnet.prefix_len()
1554            )));
1555        }
1556        if let Err(e) = crate::netlink::set_link_up_by_name(&bridge_name).await {
1557            let _ = crate::netlink::delete_bridge(&bridge_name).await;
1558            return Err(OverlaydError::Overlay(format!(
1559                "set_link_up_by_name({bridge_name}) failed: {e}"
1560            )));
1561        }
1562
1563        // Track the per-service bridge for global teardown (deleting the link
1564        // drops its gateway address + up state).
1565        self.created_bridges.insert(bridge_name.clone());
1566
1567        // Build per-service IpAllocator, reserve the gateway.
1568        let mut ip_allocator = OverlayIpAllocator::new(&subnet.to_string()).map_err(|e| {
1569            OverlaydError::Overlay(format!("IpAllocator::new({subnet}) failed: {e}"))
1570        })?;
1571        let _ = ip_allocator.allocate_specific(gateway);
1572
1573        self.service_bridges.insert(
1574            service.to_string(),
1575            ServiceBridge {
1576                name: bridge_name.clone(),
1577                subnet,
1578                gateway,
1579                ip_allocator,
1580            },
1581        );
1582        self.service_interfaces
1583            .insert(service.to_string(), bridge_name.clone());
1584
1585        tracing::info!(service = %service, bridge = %bridge_name, subnet = %subnet, gateway = %gateway, "Service bridge created");
1586        Ok(bridge_name)
1587    }
1588
1589    /// Non-Linux variant of `setup_service_overlay`. On Windows the per-service
1590    /// segment is the HCN Internal network created lazily at attach time, and on
1591    /// macOS containers fall through to host networking. Registers the service
1592    /// in `service_interfaces` with a placeholder name so presence checks work.
1593    ///
1594    /// # Errors
1595    /// Infallible on non-Linux; the `Result` is preserved for ABI parity.
1596    #[cfg(not(target_os = "linux"))]
1597    async fn setup_service_overlay(
1598        &mut self,
1599        service: &str,
1600        mode: OverlayMode,
1601    ) -> Result<ServiceOverlayInfo, OverlaydError> {
1602        // Same predicate-driven decision surface as Linux (see
1603        // `zlayer_types::overlay`). The container ATTACH path differentiates the
1604        // modes per-OS; here we only record the resolved mode and register the
1605        // appropriate placeholder/info so presence checks and `Status` work.
1606        //
1607        //   - uses_per_service_wg() -> the cross-platform dedicated path (a real
1608        //     per-service WireGuard device; on Windows it also stands up a
1609        //     per-service HCN Internal network at attach time).
1610        //   - otherwise (`Auto` and `Shared`) -> no per-service WireGuard device.
1611        //     On macOS both rely on VZ NAT + host-port forwarding (the free-port
1612        //     L4 proxy), so they route to the SAME real path — the only honest
1613        //     mapping a VZ guest can express (it has no per-service bridge or WG
1614        //     to differentiate). On Windows the attach path reads the recorded
1615        //     mode to send `Shared` containers onto a shared HCN NAT network and
1616        //     `Auto` containers onto the node's base overlay network.
1617        //   - uses_isolation_scope() -> Isolated: Auto topology here; the L3
1618        //     fence is applied at ATTACH time via `isolation_network`.
1619        let resolved = mode.resolve();
1620        self.service_modes.insert(service.to_string(), resolved);
1621        if resolved.uses_per_service_wg() {
1622            self.setup_service_overlay_dedicated(service).await
1623        } else if resolved.uses_shared_bridge() {
1624            self.setup_service_overlay_shared_bridge(service).await
1625        } else if resolved.uses_isolation_scope() {
1626            // Isolated == Auto topology (per-service bridge on the cluster-wide
1627            // WireGuard); the L3 fence is applied at ATTACH time via
1628            // `isolation_network`, not in segment setup. Same target as the
1629            // default, made explicit so a new mode can't silently fall through.
1630            self.setup_service_overlay_cluster_wg(service).await
1631        } else {
1632            self.setup_service_overlay_cluster_wg(service).await
1633        }
1634    }
1635
1636    /// `Auto`-mode per-service overlay (non-Linux): on Windows the per-service
1637    /// segment is the node's base overlay HCN network used at attach time, and on
1638    /// macOS containers ride VZ NAT. Registers the service in `service_interfaces`
1639    /// with a placeholder name so presence checks work.
1640    ///
1641    /// # Errors
1642    /// Infallible on non-Linux; the `Result` is preserved for ABI parity.
1643    #[cfg(not(target_os = "linux"))]
1644    #[allow(clippy::unused_async)]
1645    async fn setup_service_overlay_cluster_wg(
1646        &mut self,
1647        service: &str,
1648    ) -> Result<ServiceOverlayInfo, OverlaydError> {
1649        let placeholder = make_interface_name(&[&self.deployment, &self.instance_id, service], "b");
1650        self.service_interfaces
1651            .insert(service.to_string(), placeholder.clone());
1652        tracing::debug!(service = %service, "Service overlay bridge setup is Linux-only; using direct networking placeholder");
1653        Ok(cluster_wg_overlay_info(placeholder))
1654    }
1655
1656    /// `Shared`-mode per-service overlay (non-Linux). There is no per-service
1657    /// `WireGuard` device and no per-service bridge:
1658    /// - macOS: the container is a VZ VM behind VZ NAT (a single shared host
1659    ///   adapter with host-port forwarding); its ports are exposed by the
1660    ///   userspace free-port L4 proxy. Nothing to provision here beyond a
1661    ///   placeholder so presence checks succeed.
1662    /// - Windows: containers attach to a SINGLE shared HCN NAT network reused
1663    ///   across all Shared services (created lazily at attach time); a placeholder
1664    ///   interface is registered here.
1665    ///
1666    /// Registers the service in `service_interfaces` with a placeholder name.
1667    ///
1668    /// # Errors
1669    /// Infallible on non-Linux; the `Result` is preserved for ABI parity.
1670    #[cfg(not(target_os = "linux"))]
1671    #[allow(clippy::unused_async)]
1672    async fn setup_service_overlay_shared_bridge(
1673        &mut self,
1674        service: &str,
1675    ) -> Result<ServiceOverlayInfo, OverlaydError> {
1676        // A single placeholder shared by every Shared service on this node (it
1677        // names the shared data-plane, not a per-service interface).
1678        let placeholder =
1679            make_interface_name(&[&self.deployment, &self.instance_id, "shared"], "sh");
1680        self.service_interfaces
1681            .insert(service.to_string(), placeholder.clone());
1682        tracing::debug!(service = %service, "Shared-mode service uses the node-wide shared data-plane (VZ NAT on macOS / shared HCN NAT on Windows)");
1683        Ok(shared_overlay_info(placeholder))
1684    }
1685
1686    /// Dedicated-mode per-service overlay: stand up a *second* real `WireGuard`
1687    /// device for `service` with its own crypto context, listen port, overlay
1688    /// IP, and subnet — distinct from the single cluster transport.
1689    ///
1690    /// The cross-platform core (identity, subnet assign, transport bring-up,
1691    /// marker persist, status) runs on every OS; only the *attachment* of
1692    /// containers onto the device is platform-gated:
1693    /// - Linux: a per-service bridge (same mechanic as Shared) routed over the
1694    ///   dedicated device instead of the cluster device.
1695    /// - Windows: a per-service HCN Internal network (a later task; a clearly
1696    ///   marked seam returns an error here for now).
1697    /// - macOS: nothing further — the utun device is the attachment.
1698    ///
1699    /// # Errors
1700    /// Returns an error if port/key/subnet allocation, transport bring-up,
1701    /// marker persistence, or the platform attachment fails.
1702    #[allow(clippy::too_many_lines)]
1703    async fn setup_service_overlay_dedicated(
1704        &mut self,
1705        service: &str,
1706    ) -> Result<ServiceOverlayInfo, OverlaydError> {
1707        // ----- cross-platform core (runs on every OS) -----
1708
1709        // 1. Idempotency: an existing dedicated transport returns its identity.
1710        if let Some(st) = self.service_transports.get(service) {
1711            return Ok(dedicated_overlay_info(
1712                st.interface.clone(),
1713                &st.public_key,
1714                st.listen_port,
1715                st.overlay_ip,
1716                st.subnet,
1717            ));
1718        }
1719
1720        // 2. Identity: reuse a stable identity from the marker if one exists
1721        //    (so the device re-binds the same key + port across restarts),
1722        //    otherwise mint a fresh port + keypair + interface name.
1723        let marker_path =
1724            zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();
1725        let recorded = NetworkState::load(&marker_path)
1726            .get(&owner_for_service(service))
1727            .cloned();
1728
1729        let (private_key, public_key, listen_port, iface_hint) = match recorded.as_ref() {
1730            Some(entry)
1731                if entry.wg_private_key.is_some()
1732                    && entry.wg_public_key.is_some()
1733                    && entry.wg_port.is_some()
1734                    && entry.interface.is_some() =>
1735            {
1736                let port = entry.wg_port.expect("checked above");
1737                self.dedicated_ports.reserve(port);
1738                (
1739                    entry.wg_private_key.clone().expect("checked above"),
1740                    entry.wg_public_key.clone().expect("checked above"),
1741                    port,
1742                    entry.interface.clone().expect("checked above"),
1743                )
1744            }
1745            _ => {
1746                let port = self.dedicated_ports.allocate()?;
1747                let (priv_key, pub_key) = OverlayTransport::generate_keys()
1748                    .await
1749                    .map_err(|e| OverlaydError::Overlay(format!("Failed to generate keys: {e}")))?;
1750                let iface =
1751                    make_interface_name(&[&self.deployment, &self.instance_id, service], "d");
1752                (priv_key, pub_key, port, iface)
1753            }
1754        };
1755
1756        // 3. Subnet: assign from the same registry Shared uses, so per-service
1757        //    subnets stay globally unique regardless of mode.
1758        self.ensure_service_subnet_registry()?;
1759        let subnet: ipnet::IpNet = {
1760            let registry = self
1761                .service_subnet_registry
1762                .as_mut()
1763                .expect("ensure_service_subnet_registry leaves Some");
1764            let node_key = self.local_node_id.to_string();
1765            registry.assign(service, &node_key).map_err(|e| {
1766                OverlaydError::Overlay(format!(
1767                    "ServiceSubnetRegistry::assign({service}, {node_key}) failed: {e}"
1768                ))
1769            })?
1770        };
1771        let overlay_ip = first_usable_ip(subnet);
1772
1773        // 4. Build + bring up the dedicated transport. The device's overlay CIDR
1774        //    is the service subnet (so boringtun routes that subnet over THIS
1775        //    device), and its listen port is the dedicated port.
1776        let physical_egress_ip = match zlayer_overlay::detect_physical_egress().await {
1777            Ok(egress) => Some(egress.ip),
1778            Err(e) => {
1779                tracing::warn!(
1780                    error = %e,
1781                    service = %service,
1782                    "failed to detect physical egress; WireGuard local_endpoint \
1783                     will bind UNSPECIFIED for the dedicated overlay"
1784                );
1785                None
1786            }
1787        };
1788        let config = self.build_config(
1789            private_key.clone(),
1790            public_key.clone(),
1791            overlay_ip,
1792            subnet.prefix_len(),
1793            listen_port,
1794            physical_egress_ip,
1795        );
1796        let mut transport = OverlayTransport::new(config, iface_hint);
1797        transport.create_interface().await.map_err(|e| {
1798            OverlaydError::Overlay(format!(
1799                "Failed to create dedicated overlay for {service}: {e}"
1800            ))
1801        })?;
1802        transport.configure(&[]).await.map_err(|e| {
1803            OverlaydError::Overlay(format!(
1804                "Failed to configure dedicated overlay for {service}: {e}"
1805            ))
1806        })?;
1807        let actual_iface = transport.interface_name().to_string();
1808
1809        // 5. Persist the marker so the identity survives restarts. Match the
1810        //    base/Shared entry shape (owner/kind/name/id/subnet) plus the
1811        //    dedicated WG fields.
1812        let mut marker = NetworkState::load(&marker_path);
1813        marker.upsert(ManagedNetwork {
1814            owner: owner_for_service(service),
1815            kind: "wg-dedicated".to_string(),
1816            name: actual_iface.clone(),
1817            id: public_key.clone(),
1818            subnet: subnet.to_string(),
1819            wg_port: Some(listen_port),
1820            wg_private_key: Some(private_key),
1821            wg_public_key: Some(public_key.clone()),
1822            interface: Some(actual_iface.clone()),
1823        });
1824        if let Err(e) = marker.save(&marker_path) {
1825            tracing::warn!(service = %service, error = %e, path = %marker_path.display(), "failed to persist dedicated-overlay marker (device still live)");
1826        }
1827
1828        // 6. Record the live transport. Build the guest-attach IPAM bounded to
1829        //    the service subnet, reserving the node's own dedicated-device IP so
1830        //    a joining guest never draws it.
1831        let mut ip_allocator = zlayer_overlay::allocator::IpAllocator::new(&subnet.to_string())
1832            .map_err(|e| {
1833                OverlaydError::Overlay(format!("IpAllocator::new({subnet}) failed: {e}"))
1834            })?;
1835        let _ = ip_allocator.allocate_specific(overlay_ip);
1836        self.service_transports.insert(
1837            service.to_string(),
1838            ServiceTransport {
1839                transport,
1840                interface: actual_iface.clone(),
1841                public_key: public_key.clone(),
1842                listen_port,
1843                overlay_ip,
1844                subnet,
1845                ip_allocator,
1846            },
1847        );
1848
1849        tracing::info!(
1850            service = %service,
1851            interface = %actual_iface,
1852            listen_port,
1853            subnet = %subnet,
1854            overlay_ip = %overlay_ip,
1855            "Dedicated per-service overlay device created"
1856        );
1857
1858        // ----- platform-gated attachment -----
1859        // `name` in the returned info is the container-attach handle: the bridge
1860        // name on Linux, the dedicated interface elsewhere.
1861        let name = self
1862            .attach_dedicated_service(service, subnet, overlay_ip)
1863            .await?;
1864
1865        Ok(dedicated_overlay_info(
1866            name,
1867            &public_key,
1868            listen_port,
1869            overlay_ip,
1870            subnet,
1871        ))
1872    }
1873
1874    /// Linux attachment for a dedicated per-service overlay: create the same
1875    /// per-service bridge Shared uses, but route the service subnet over the
1876    /// DEDICATED device rather than the cluster device.
1877    ///
1878    /// Concretely, the dedicated transport's overlay CIDR already covers
1879    /// `subnet` (set at `build_config` time in the core), so boringtun routes
1880    /// `subnet` out the dedicated TUN; we additionally plumb `subnet` onto this
1881    /// node's own `AllowedIPs` entry on the dedicated device so locally
1882    /// originated packets to the subnet are accepted. Returns the bridge name.
1883    ///
1884    /// # Errors
1885    /// Returns an error if the bridge cannot be created.
1886    #[cfg(target_os = "linux")]
1887    async fn attach_dedicated_service(
1888        &mut self,
1889        service: &str,
1890        subnet: ipnet::IpNet,
1891        overlay_ip: IpAddr,
1892    ) -> Result<String, OverlaydError> {
1893        let _ = overlay_ip;
1894        let bridge_name = self.create_service_bridge(service, subnet).await?;
1895
1896        // Plumb the service subnet onto the DEDICATED device (not the cluster
1897        // device). The dedicated transport's overlay CIDR already routes the
1898        // subnet out its TUN; adding it to our own pubkey's AllowedIPs keeps the
1899        // local-accept side consistent with the Shared path's cluster plumbing.
1900        if let Some(st) = self.service_transports.get(service) {
1901            if let Some(ref pubkey) = self.local_wg_pubkey {
1902                if let Err(e) = st.transport.add_allowed_ip(pubkey, subnet).await {
1903                    tracing::warn!(
1904                        service = %service,
1905                        subnet = %subnet,
1906                        error = %e,
1907                        "Failed to add service subnet to dedicated transport AllowedIPs (non-fatal)"
1908                    );
1909                }
1910            } else {
1911                tracing::debug!(service = %service, "local_wg_pubkey not yet set; skipping dedicated AllowedIPs update");
1912            }
1913        }
1914
1915        Ok(bridge_name)
1916    }
1917
1918    /// Windows attachment for a dedicated per-service overlay.
1919    ///
1920    /// The cross-platform core has already stood up the dedicated Wintun
1921    /// transport (the encrypted node-to-node path for the service subnet). This
1922    /// adds the *container-facing* side: a per-service HCN **Internal** network
1923    /// onto which the agent's containers attach (instead of the node's shared
1924    /// base overlay network), so dedicated-service traffic is isolated at the
1925    /// vSwitch layer. Returns the per-service network's name, which the caller
1926    /// records as the [`ServiceOverlayInfo::name`] attach handle.
1927    ///
1928    /// # Errors
1929    /// Propagates any error from [`Self::ensure_service_network`].
1930    #[cfg(target_os = "windows")]
1931    async fn attach_dedicated_service(
1932        &mut self,
1933        service: &str,
1934        subnet: ipnet::IpNet,
1935        _overlay_ip: IpAddr,
1936    ) -> Result<String, OverlaydError> {
1937        // Create (or reuse) the per-service Internal HCN network. The returned
1938        // GUID is recorded in the marker under `owner_for_service(service)`;
1939        // the `AttachContainer` handler reuses it via the same marker lookup.
1940        let _net_id = self.ensure_service_network(service, subnet).await?;
1941        // The attach handle reported back is the per-service network's name.
1942        let daemon_name = self.deployment_or_default();
1943        Ok(format!(
1944            "{}-svc-{service}",
1945            overlay_network_name(&daemon_name)
1946        ))
1947    }
1948
1949    /// macOS attachment for a dedicated per-service overlay: the cross-platform
1950    /// core already brought up a utun device; there is no bridge, so the
1951    /// interface name itself is the attach handle.
1952    #[cfg(all(not(target_os = "linux"), not(target_os = "windows")))]
1953    #[allow(clippy::unused_async)]
1954    async fn attach_dedicated_service(
1955        &mut self,
1956        service: &str,
1957        _subnet: ipnet::IpNet,
1958        _overlay_ip: IpAddr,
1959    ) -> Result<String, OverlaydError> {
1960        let iface = self
1961            .service_transports
1962            .get(service)
1963            .map(|st| st.interface.clone())
1964            .unwrap_or_default();
1965        Ok(iface)
1966    }
1967
1968    /// Tear down the per-service segment for `service`. Idempotent.
1969    // Only the Linux body awaits (netlink + cluster AllowedIPs); other targets
1970    // are synchronous (transport shutdown is sync) but must keep the async
1971    // signature for the dispatch call.
1972    #[cfg_attr(not(target_os = "linux"), allow(clippy::unused_async))]
1973    async fn teardown_service_overlay(&mut self, service: &str) {
1974        // Drop the recorded mode; a `Shared` service's containers no longer route
1975        // to the shared bridge once it is gone. The node-wide shared bridge
1976        // itself is deliberately NOT torn down here — other Shared services reuse
1977        // it (it is reclaimed only on full overlay teardown / uninstall).
1978        self.service_modes.remove(service);
1979
1980        // Auto-mode segment teardown (per-service bridge on Linux, placeholder
1981        // elsewhere). A Shared-mode service has no per-service bridge, so
1982        // `service_bridges.remove` is a no-op for it (its `service_interfaces`
1983        // placeholder pointing at the shared bridge is removed below).
1984        #[cfg(target_os = "linux")]
1985        {
1986            let removed = self.service_bridges.remove(service);
1987            self.service_interfaces.remove(service);
1988
1989            // Remove the subnet from the cluster AllowedIPs only when we still
1990            // know it (the in-memory entry survived).
1991            if let Some(ref bridge) = removed {
1992                if let Some(ref cluster) = self.global_transport {
1993                    if let Some(ref pubkey) = self.local_wg_pubkey {
1994                        if let Err(e) = cluster.remove_allowed_ip(pubkey, bridge.subnet).await {
1995                            tracing::warn!(
1996                                service = %service,
1997                                subnet = %bridge.subnet,
1998                                error = %e,
1999                                "Failed to remove service subnet from cluster AllowedIPs (non-fatal)"
2000                            );
2001                        }
2002                    }
2003                }
2004            }
2005
2006            // Delete the physical bridge by its DETERMINISTIC name, regardless of
2007            // whether the in-memory entry survived. After an overlayd restart the
2008            // `service_bridges` map is empty, so a delete gated on `Some(..)` would
2009            // silently leak the `zl-…-b` link forever (the observed orphan/linkdown
2010            // bridges). `delete_bridge` no-ops on ENODEV, so deleting an absent link
2011            // is safe — and the `-b` suffix never collides with a Shared service's
2012            // shared `-sh` bridge, so this can't tear down the wrong thing.
2013            let bridge_name = removed.as_ref().map_or_else(
2014                || make_interface_name(&[&self.deployment, &self.instance_id, service], "b"),
2015                |b| b.name.clone(),
2016            );
2017            if let Err(e) = crate::netlink::delete_bridge(&bridge_name).await {
2018                tracing::warn!(service = %service, bridge = %bridge_name, error = %e, "delete_bridge failed (non-fatal)");
2019            }
2020
2021            // Release the subnet-registry slot by service name (works whether or
2022            // not the in-memory entry survived).
2023            if let Some(registry) = self.service_subnet_registry.as_mut() {
2024                let node_key = self.local_node_id.to_string();
2025                let _ = registry.release(service, &node_key);
2026            }
2027
2028            if removed.is_some() {
2029                tracing::info!(service = %service, bridge = %bridge_name, "Tore down service bridge");
2030            } else {
2031                tracing::debug!(service = %service, bridge = %bridge_name, "best-effort delete of (possibly absent) service bridge by name");
2032            }
2033        }
2034        #[cfg(not(target_os = "linux"))]
2035        {
2036            if let Some(iface) = self.service_interfaces.remove(service) {
2037                tracing::info!(service = %service, interface = %iface, "Removed service overlay interface (placeholder, non-Linux)");
2038            }
2039        }
2040
2041        // Dedicated-mode teardown (cross-platform): tear down the per-service
2042        // transport, free its port, and drop its marker entry. No-op when the
2043        // service ran in Shared mode (nothing in `service_transports`).
2044        if let Some(mut st) = self.service_transports.remove(service) {
2045            st.transport.shutdown();
2046            self.dedicated_ports.release(st.listen_port);
2047
2048            // Release the subnet assignment (Shared releases it inside the
2049            // Linux block above; the dedicated subnet lives in the same
2050            // registry, so release it here for the dedicated case on every OS).
2051            if let Some(registry) = self.service_subnet_registry.as_mut() {
2052                let node_key = self.local_node_id.to_string();
2053                let _ = registry.release(service, &node_key);
2054            }
2055
2056            let marker_path =
2057                zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();
2058            let mut marker = NetworkState::load(&marker_path);
2059            let removed_entry = marker.remove(&owner_for_service(service));
2060            if removed_entry.is_some() {
2061                if let Err(e) = marker.save(&marker_path) {
2062                    tracing::warn!(service = %service, error = %e, path = %marker_path.display(), "failed to persist dedicated-overlay marker removal");
2063                }
2064            }
2065
2066            // Windows: delete the per-service HCN Internal network this service
2067            // owned. The marker entry's `id` is the bare HCN GUID (set by
2068            // `ensure_service_network`); delete the network so a dedicated
2069            // service tears down cleanly without waiting for a full uninstall.
2070            // Also drop the per-service container-IP allocator.
2071            #[cfg(target_os = "windows")]
2072            {
2073                self.service_ip_allocators.remove(service);
2074                if let Some(entry) = removed_entry.as_ref() {
2075                    if entry.kind == "hcn-internal" {
2076                        if let Ok(guid) = windows::core::GUID::try_from(entry.id.as_str()) {
2077                            match zlayer_hns::network::Network::delete(guid) {
2078                                Ok(()) => {
2079                                    tracing::info!(service = %service, id = %entry.id, "deleted per-service HCN network");
2080                                }
2081                                Err(e) => {
2082                                    tracing::warn!(service = %service, id = %entry.id, error = %e, "failed to delete per-service HCN network (may leak until uninstall)");
2083                                }
2084                            }
2085                        } else {
2086                            tracing::warn!(service = %service, id = %entry.id, "per-service marker has unparseable HCN GUID; skipping network delete");
2087                        }
2088                    }
2089                }
2090            }
2091            #[cfg(not(target_os = "windows"))]
2092            drop(removed_entry);
2093
2094            tracing::info!(
2095                service = %service,
2096                interface = %st.interface,
2097                listen_port = st.listen_port,
2098                "Tore down dedicated per-service overlay device"
2099            );
2100        }
2101    }
2102
2103    /// Reclaim orphaned per-service host bridges (and their stale device veths)
2104    /// that no live deployment still owns. `live_bridge_names` is the full set of
2105    /// `zl-…-b` bridge names every currently-restored service SHOULD own,
2106    /// computed by the main daemon from storage.
2107    ///
2108    /// For every host link whose name looks like one of OUR per-service bridge
2109    /// (`…-b`) or dedicated device (`…-d`) interfaces but is NOT in `live` and is
2110    /// NOT the active global (`-g`) or shared (`-sh`) interface, we:
2111    ///   1. delete the link (idempotent — ENODEV is success),
2112    ///   2. release its service-subnet registry assignment + cluster `AllowedIPs`
2113    ///      when the `(service, node)` key can be recovered from the registry
2114    ///      snapshot by reproducing the deterministic bridge name, and
2115    ///   3. drop any stale in-memory `service_bridges`/`service_interfaces`
2116    ///      entries pointing at it.
2117    ///
2118    /// Best-effort + idempotent: a failure on one link is logged and the sweep
2119    /// continues. Returns the names actually reclaimed.
2120    #[cfg(target_os = "linux")]
2121    async fn prune_orphan_bridges(&mut self, live_bridge_names: &[String]) -> Vec<String> {
2122        use std::collections::HashSet;
2123
2124        let links = match crate::netlink::list_all_links().await {
2125            Ok(links) => links,
2126            Err(e) => {
2127                tracing::warn!(error = %e, "prune_orphan_bridges: failed to list host links");
2128                return Vec::new();
2129            }
2130        };
2131
2132        let live: HashSet<&str> = live_bridge_names.iter().map(String::as_str).collect();
2133
2134        // The interfaces we must NEVER reclaim even though they carry the `zl-`
2135        // prefix: the active global transport device and the node-wide shared
2136        // bridge. (Container veths `veth-…`/`vc-…` are handled by the separate
2137        // PID-keyed `sweep_orphan_veths`; here we only target service bridges +
2138        // dedicated device interfaces, which `sweep_orphan_veths` never touches.)
2139        let mut protected: HashSet<String> = HashSet::new();
2140        if let Some(g) = self.global_interface.clone() {
2141            protected.insert(g);
2142        }
2143        if let Some(ref sh) = self.shared_bridge {
2144            protected.insert(sh.name.clone());
2145        }
2146        // Protect every dedicated-service WireGuard transport (`…-d`) by name. A
2147        // `-d` is a WG device, not a bridge — it has no `brif`, so the zero-member
2148        // guard below treats it as 0 members, and the daemon's `live` set only
2149        // carries `…-b` names; without this it would be reaped as a live device.
2150        //
2151        // We deliberately do NOT blanket-protect `service_bridges` (`…-b`) here.
2152        // That map holds BOTH managed-service bridges AND standalone/per-job
2153        // bridges (e.g. a Runner's per-job network), and overlayd cannot tell
2154        // them apart — a standalone container's `DetachContainer` releases the
2155        // veth/IP but never removes the bridge or its `service_bridges` entry, so
2156        // a blanket protect shielded those orphans forever (only a restart, which
2157        // wipes the map, ever cleared them). Managed bridges stay protected by
2158        // being in the daemon's authoritative `live` set; standalone bridges are
2159        // not in storage, so they fall through to the zero-member guard and are
2160        // reclaimed once idle.
2161        for st in self.service_transports.values() {
2162            protected.insert(st.interface.clone());
2163        }
2164
2165        // Snapshot the subnet registry once so we can recover the `(service,
2166        // node)` key for an orphan by reproducing its deterministic bridge/device
2167        // name. The registry has no release-by-subnet API, so we map name ->
2168        // (service, node) here.
2169        let mut name_to_key: HashMap<String, (String, String, ipnet::IpNet)> = HashMap::new();
2170        if let Some(registry) = self.service_subnet_registry.as_ref() {
2171            for ((service, node), subnet) in registry.snapshot().assignments {
2172                let bridge =
2173                    make_interface_name(&[&self.deployment, &self.instance_id, &service], "b");
2174                let device =
2175                    make_interface_name(&[&self.deployment, &self.instance_id, &service], "d");
2176                name_to_key.insert(bridge, (service.clone(), node.clone(), subnet));
2177                name_to_key.insert(device, (service, node, subnet));
2178            }
2179        }
2180
2181        let mut reclaimed = Vec::new();
2182        for (_index, name) in links {
2183            // Only consider OUR per-service bridge (`-b`) or dedicated device
2184            // (`-d`) interfaces that are neither live nor protected. The pure
2185            // predicate (unit-tested in `orphan_bridge_selection`) keeps us off
2186            // unrelated host links, the global/shared interfaces, and the veth
2187            // namespaces.
2188            if !is_orphan_service_bridge(&name, &live, &protected) {
2189                continue;
2190            }
2191
2192            // Zero-member guard: only reclaim a non-live candidate once it is
2193            // IDLE — no member links. A `-b` bridge with a running container has
2194            // ≥1 veth in its `brif`, so an in-use (or a sub-ms mid-creation,
2195            // pre-attach is the only 0-member window) standalone bridge is left
2196            // alone; an orphan `-d` has no `brif` (0) and is correctly reaped.
2197            // This is what makes dropping the `service_bridges` blanket-protect
2198            // safe — a live managed bridge is already excluded by `live`, and any
2199            // other in-use bridge is excluded here.
2200            if crate::netlink::bridge_member_count(&name).await > 0 {
2201                continue;
2202            }
2203
2204            tracing::info!(link = %name, "prune_orphan_bridges: reclaiming orphan service bridge/device");
2205
2206            // 1. Release the subnet + cluster AllowedIPs when we can recover the
2207            //    owning service key from the registry.
2208            if let Some((service, node, subnet)) = name_to_key.get(&name).cloned() {
2209                if let Some(ref cluster) = self.global_transport {
2210                    if let Some(ref pubkey) = self.local_wg_pubkey {
2211                        if let Err(e) = cluster.remove_allowed_ip(pubkey, subnet).await {
2212                            tracing::warn!(
2213                                link = %name,
2214                                subnet = %subnet,
2215                                error = %e,
2216                                "prune_orphan_bridges: remove_allowed_ip failed (non-fatal)"
2217                            );
2218                        }
2219                    }
2220                }
2221                if let Some(registry) = self.service_subnet_registry.as_mut() {
2222                    let _ = registry.release(&service, &node);
2223                }
2224            }
2225
2226            // 2. Delete the link itself (idempotent).
2227            if let Err(e) = crate::netlink::delete_bridge(&name).await {
2228                tracing::warn!(link = %name, error = %e, "prune_orphan_bridges: delete_bridge failed (non-fatal)");
2229                continue;
2230            }
2231
2232            // 3. Drop any stale in-memory bookkeeping pointing at this link.
2233            self.service_bridges.retain(|_, b| b.name != name);
2234            self.service_interfaces.retain(|_, iface| *iface != name);
2235
2236            reclaimed.push(name);
2237        }
2238
2239        if !reclaimed.is_empty() {
2240            tracing::info!(count = reclaimed.len(), bridges = ?reclaimed, "prune_orphan_bridges: reclaimed orphaned service bridges/devices");
2241        }
2242        reclaimed
2243    }
2244
2245    /// Non-Linux variant: per-service bridges are a Linux-only mechanic (Windows
2246    /// uses HCN networks torn down in `teardown_service_overlay`; macOS rides VZ
2247    /// NAT), so there are no host bridge links to sweep.
2248    #[cfg(not(target_os = "linux"))]
2249    #[allow(clippy::unused_async, clippy::unused_self)]
2250    async fn prune_orphan_bridges(&mut self, _live_bridge_names: &[String]) -> Vec<String> {
2251        Vec::new()
2252    }
2253
2254    /// Initialize the local fallback `ServiceSubnetRegistry` from the configured
2255    /// cluster CIDR. Called on first `setup_service_overlay` use.
2256    ///
2257    /// # Errors
2258    /// Returns an error when no cluster CIDR is configured or the registry
2259    /// cannot be built.
2260    fn ensure_service_subnet_registry(&mut self) -> Result<(), OverlaydError> {
2261        use zlayer_overlay::allocator::ServiceSubnetRegistry;
2262
2263        if self.service_subnet_registry.is_some() {
2264            return Ok(());
2265        }
2266        let cluster_cidr = self.cluster_cidr.ok_or_else(|| {
2267            OverlaydError::Other(
2268                "service subnet registry needs a cluster CIDR (SetupGlobalOverlay first)"
2269                    .to_string(),
2270            )
2271        })?;
2272        let cluster_ipnet: ipnet::IpNet = cluster_cidr.to_string().parse().map_err(|e| {
2273            OverlaydError::Other(format!(
2274                "failed to convert cluster CIDR {cluster_cidr} to ipnet::IpNet: {e}"
2275            ))
2276        })?;
2277        // Per-service bridge slice prefix. `/26` (V4) = ~61 usable container
2278        // IPs per service per node — keep in sync with
2279        // `zlayer_scheduler::raft::DEFAULT_SERVICE_SUBNET_SLICE_PREFIX` (the
2280        // canonical default; not imported here to avoid a dependency cycle).
2281        // The older `/28` (13 usable) exhausted under CI churn.
2282        let slice_prefix: u8 = match cluster_ipnet {
2283            ipnet::IpNet::V4(_) => 26,
2284            ipnet::IpNet::V6(_) => 120,
2285        };
2286        let mut registry =
2287            ServiceSubnetRegistry::new(cluster_ipnet, slice_prefix).map_err(|e| {
2288                OverlaydError::Other(format!("failed to build ServiceSubnetRegistry: {e}"))
2289            })?;
2290        // Reserve the node's own overlay IP so no per-service bridge subnet
2291        // overlaps it — the overlay DNS server listens on `<node_ip>:53`, and a
2292        // bridge subnet containing that IP would black-hole its containers' DNS
2293        // (they'd ARP for the node IP on their bridge, where nothing answers).
2294        if let Some(node_ip) = self.node_ip {
2295            registry.reserve_ip(node_ip);
2296        }
2297        self.service_subnet_registry = Some(registry);
2298        Ok(())
2299    }
2300
2301    // -- IP allocation -------------------------------------------------------
2302
2303    /// Allocate an overlay IP from the per-service bridge (Linux) or the node
2304    /// slice (otherwise). `join_global` reserves a second global-overlay IP too,
2305    /// matching the eth1 attach behavior.
2306    ///
2307    /// # Errors
2308    /// Returns an error if the relevant pool is exhausted.
2309    fn allocate_ip(&mut self, service: &str, join_global: bool) -> Result<IpAddr, OverlaydError> {
2310        // `join_global` does not allocate a second IP here: the companion
2311        // global-overlay IP (eth1) is reserved at attach time. `AllocateIp`
2312        // returns only the primary (service / slice) IP the caller asked for.
2313        let _ = join_global;
2314        #[cfg(target_os = "linux")]
2315        {
2316            // A Shared-mode service draws from the single node-wide shared bridge;
2317            // every other mode draws from its own per-service bridge.
2318            let use_shared = self
2319                .service_modes
2320                .get(service)
2321                .copied()
2322                .unwrap_or_default()
2323                .uses_shared_bridge();
2324            if use_shared {
2325                if let Some(bridge) = self.shared_bridge.as_mut() {
2326                    return bridge.ip_allocator.allocate().ok_or_else(|| {
2327                        OverlaydError::Overlay(format!(
2328                            "shared bridge {} subnet {} exhausted",
2329                            bridge.name, bridge.subnet
2330                        ))
2331                    });
2332                }
2333            } else if let Some(bridge) = self.service_bridges.get_mut(service) {
2334                return bridge.ip_allocator.allocate().ok_or_else(|| {
2335                    OverlaydError::Overlay(format!(
2336                        "service bridge {} subnet {} exhausted",
2337                        bridge.name, bridge.subnet
2338                    ))
2339                });
2340            }
2341        }
2342        let _ = service;
2343        self.ip_allocator.allocate()
2344    }
2345
2346    /// Return an overlay IP to the allocator (service-bridge pool when known,
2347    /// otherwise the node slice).
2348    fn release_ip(&mut self, ip: IpAddr) {
2349        #[cfg(target_os = "linux")]
2350        {
2351            if let Some(bridge) = self.shared_bridge.as_mut() {
2352                if bridge.subnet.contains(&ip) {
2353                    bridge.ip_allocator.release(ip);
2354                    return;
2355                }
2356            }
2357            for bridge in self.service_bridges.values_mut() {
2358                if bridge.subnet.contains(&ip) {
2359                    bridge.ip_allocator.release(ip);
2360                    return;
2361                }
2362            }
2363        }
2364        self.ip_allocator.release(ip);
2365    }
2366
2367    // -- container attach (Linux) -------------------------------------------
2368
2369    /// Wire a container into the overlay and return its [`AttachResult`].
2370    ///
2371    /// # Errors
2372    /// Returns an error if the container cannot be attached.
2373    #[allow(clippy::too_many_arguments)]
2374    async fn attach_container(
2375        &mut self,
2376        handle: AttachHandle,
2377        service: &str,
2378        join_global: bool,
2379        ephemeral: bool,
2380        dns_server: Option<IpAddr>,
2381        dns_domain: Option<String>,
2382        isolation_network: Option<String>,
2383    ) -> Result<AttachResult, OverlaydError> {
2384        // Record the overlay DNS resolver/zone the main daemon staged for this
2385        // node so later attaches (and the Windows HCN endpoint `Dns` schema)
2386        // can fall back to them when a per-attach value isn't supplied.
2387        if let Some(server) = dns_server {
2388            self.dns_server_addr = Some(SocketAddr::new(server, 53));
2389        }
2390        if dns_domain.is_some() {
2391            self.dns_domain.clone_from(&dns_domain);
2392        }
2393        match handle {
2394            AttachHandle::LinuxPid { pid } => {
2395                let ip = self
2396                    .attach_container_linux(pid, service, join_global, ephemeral, isolation_network)
2397                    .await?;
2398                Ok(AttachResult {
2399                    ip,
2400                    namespace_guid: None,
2401                })
2402            }
2403            AttachHandle::WindowsContainer { container_id, ip } => {
2404                self.attach_container_windows(
2405                    &container_id,
2406                    service,
2407                    ip,
2408                    dns_server,
2409                    dns_domain,
2410                    isolation_network,
2411                )
2412                .await
2413            }
2414            AttachHandle::HostShared { id } => {
2415                let ip = self
2416                    .attach_container_host_shared(&id, service, ephemeral, isolation_network)
2417                    .await?;
2418                Ok(AttachResult {
2419                    ip,
2420                    namespace_guid: None,
2421                })
2422            }
2423            AttachHandle::GuestManaged { .. } => Err(OverlaydError::Other(
2424                "guest-managed attach must go through attach_container_guest, not attach_container"
2425                    .to_string(),
2426            )),
2427        }
2428    }
2429
2430    /// Tear down a container's overlay attachment and release its IP.
2431    ///
2432    /// # Errors
2433    /// Returns an error only if a netlink delete fails for a reason other than
2434    /// "link not found".
2435    async fn detach_container(&mut self, handle: AttachHandle) -> Result<(), OverlaydError> {
2436        match handle {
2437            AttachHandle::LinuxPid { pid } => self.detach_container_linux(pid).await,
2438            AttachHandle::WindowsContainer { container_id, .. } => {
2439                self.detach_container_windows(&container_id).await
2440            }
2441            AttachHandle::HostShared { id } => self.detach_container_host_shared(&id).await,
2442            AttachHandle::GuestManaged { .. } => Err(OverlaydError::Other(
2443                "guest-managed detach must go through detach_container_guest, not detach_container"
2444                    .to_string(),
2445            )),
2446        }
2447    }
2448
2449    // -- container attach (guest-managed) -----------------------------------
2450
2451    /// Guest-managed overlay attach: allocate the overlay identity for a VM guest
2452    /// that brings up its own kernel `WireGuard` device.
2453    ///
2454    /// overlayd cannot enter the guest's network namespace (it is a VM, not a
2455    /// host process), so instead of a veth/HCN endpoint it:
2456    /// 1. allocates the overlay IP from the SAME pool the Linux attach uses (the
2457    ///    per-service bridge pool when one exists, otherwise the node slice) so
2458    ///    guest addresses never collide with container addresses;
2459    /// 2. generates a fresh `WireGuard` keypair for the guest;
2460    /// 3. builds the peer set the guest must configure — every GLOBAL peer the
2461    ///    host already knows, plus THIS node itself (so the guest can reach the
2462    ///    host node over the overlay; carries a keepalive so the guest keeps its
2463    ///    NAT mapping open from behind VZ NAT);
2464    /// 4. registers the generated public key as a GLOBAL peer (host route to the
2465    ///    guest, roaming endpoint learned from the guest's keepalive) so remote
2466    ///    nodes and this node route to it;
2467    /// 5. records the attachment keyed by `id` so `DetachContainer` can release
2468    ///    the IP and remove the peer.
2469    ///
2470    /// Platform-agnostic: pure IPAM + keygen + peer bookkeeping (no netns/veth/
2471    /// HCN), so it compiles and runs on macOS (where the overlayd serving a VZ
2472    /// host lives) as well as Linux.
2473    ///
2474    /// # Errors
2475    /// Returns an error if the global overlay is not set up, the IP pool is
2476    /// exhausted, key generation fails, or registering the guest peer fails.
2477    #[allow(clippy::cast_possible_truncation, clippy::too_many_lines)]
2478    async fn attach_container_guest(
2479        &mut self,
2480        id: &str,
2481        service: &str,
2482        join_global: bool,
2483        dns_server: Option<IpAddr>,
2484        dns_domain: Option<String>,
2485        isolation_network: Option<String>,
2486    ) -> Result<GuestOverlayConfig, OverlaydError> {
2487        // The global transport must exist: we both register the guest as a peer
2488        // on it and advertise this node (its public key + listen port) to the
2489        // guest. Resolve both up front so we fail before allocating anything.
2490        let node_public_key = self.transport_public_key.clone().ok_or_else(|| {
2491            OverlaydError::Other(
2492                "guest-managed attach requires the global overlay to be set up first \
2493                 (no node WireGuard public key)"
2494                    .to_string(),
2495            )
2496        })?;
2497        if self.global_transport.is_none() {
2498            return Err(OverlaydError::Other(
2499                "guest-managed attach requires the global overlay to be set up first \
2500                 (no global transport)"
2501                    .to_string(),
2502            ));
2503        }
2504
2505        // 1. Allocate the overlay IP from the same pool the Linux attach uses and
2506        //    derive the prefix length from that pool's network. On Linux a
2507        //    per-service bridge (when present) supplies both the IP and its
2508        //    subnet's prefix; otherwise (and on every non-Linux host) the node
2509        //    slice / cluster CIDR does.
2510        let (overlay_ip, prefix_len, pool_service, dedicated): (IpAddr, u8, Option<String>, bool) = {
2511            #[cfg(target_os = "linux")]
2512            {
2513                let use_shared = self
2514                    .service_modes
2515                    .get(service)
2516                    .copied()
2517                    .unwrap_or_default()
2518                    .uses_shared_bridge();
2519                let bridge = if use_shared {
2520                    self.shared_bridge.as_mut()
2521                } else {
2522                    self.service_bridges.get_mut(service)
2523                };
2524                if let Some(bridge) = bridge {
2525                    let ip = bridge.ip_allocator.allocate().ok_or_else(|| {
2526                        OverlaydError::Overlay(format!(
2527                            "service bridge {} subnet {} exhausted",
2528                            bridge.name, bridge.subnet
2529                        ))
2530                    })?;
2531                    let prefix = bridge.subnet.prefix_len();
2532                    (ip, prefix, Some(service.to_string()), false)
2533                } else {
2534                    let ip = self.ip_allocator.allocate()?;
2535                    (ip, self.slice_prefix_len(), None, false)
2536                }
2537            }
2538            #[cfg(not(target_os = "linux"))]
2539            {
2540                // A Dedicated service owns a second WireGuard device (own crypto +
2541                // subnet); its guest draws from that device's allocator and lands
2542                // on the dedicated subnet, not the global cluster mesh. Every other
2543                // mode hairpins through the node slice on the global transport.
2544                let dedicated = self
2545                    .service_modes
2546                    .get(service)
2547                    .copied()
2548                    .unwrap_or_default()
2549                    .uses_per_service_wg();
2550                if dedicated {
2551                    let st = self.service_transports.get_mut(service).ok_or_else(|| {
2552                        OverlaydError::Other(format!(
2553                            "Dedicated service {service} has no dedicated overlay; \
2554                             call setup_service_overlay first"
2555                        ))
2556                    })?;
2557                    let ip = st.ip_allocator.allocate().ok_or_else(|| {
2558                        OverlaydError::Overlay(format!(
2559                            "dedicated service {service} subnet {} exhausted",
2560                            st.subnet
2561                        ))
2562                    })?;
2563                    (ip, st.subnet.prefix_len(), Some(service.to_string()), true)
2564                } else {
2565                    let ip = self.ip_allocator.allocate()?;
2566                    (ip, self.slice_prefix_len(), None, false)
2567                }
2568            }
2569        };
2570        // `join_global` is informational for a guest-managed attach: the guest's
2571        // single WireGuard device IS its global-overlay endpoint, so there is no
2572        // separate eth1 IP to reserve. Touch it so callers stay consistent with
2573        // the Linux/Windows handles.
2574        let _ = join_global;
2575
2576        // 2. Generate the guest's WireGuard keypair (reuse the transport's
2577        //    native x25519 keygen — never reimplement curve25519 here).
2578        let (private_key, public_key) = OverlayTransport::generate_keys().await.map_err(|e| {
2579            // Roll back the IP allocation so a keygen failure leaks nothing.
2580            self.release_guest_ip(overlay_ip, pool_service.as_deref());
2581            OverlaydError::Overlay(format!("failed to generate guest keys: {e}"))
2582        })?;
2583
2584        // 3. Build the peer set. A VZ guest is behind the host's NAT and can only
2585        //    reach the LOCAL node (via its NAT gateway) — it cannot dial other
2586        //    nodes' or sibling guests' endpoints directly. So it gets exactly ONE
2587        //    peer: this node. ALL overlay traffic (including to sibling containers
2588        //    and remote nodes) routes through this node, which forwards/hairpins it
2589        //    (the node already holds a /32 peer for every container — step 4 — and
2590        //    the real inter-node peers). We deliberately do NOT add the per-guest
2591        //    /32 peers here: a /32 with no reachable endpoint would win
2592        //    longest-prefix routing and black-hole sibling traffic. The endpoint
2593        //    returned here is the node's overlay IP as a placeholder; the VZ
2594        //    runtime rewrites it to the guest's NAT gateway (the only host address
2595        //    the guest can reach) before delivering the config. Keepalive holds the
2596        //    guest's NAT mapping open so the node can reach back.
2597        //
2598        //    Dedicated mode: the single peer is this node's DEDICATED per-service
2599        //    device (its own pubkey + listen port + subnet as AllowedIPs), so the
2600        //    guest joins that service's isolated mesh. Every other mode peers with
2601        //    the global cluster device, AllowedIPs = the whole cluster CIDR.
2602        let (peer_pubkey, peer_listen_port, peer_allowed) = if dedicated {
2603            let st = self
2604                .service_transports
2605                .get(service)
2606                .expect("dedicated transport allocated above");
2607            (st.public_key.clone(), st.listen_port, st.subnet.to_string())
2608        } else {
2609            let node_allowed = self
2610                .cluster_cidr
2611                .or(self.slice_cidr)
2612                .map_or_else(|| String::from("0.0.0.0/0"), |c| c.to_string());
2613            (node_public_key, self.overlay_port, node_allowed)
2614        };
2615        let node_endpoint = self.node_endpoint_for_guest(peer_listen_port);
2616        let peers: Vec<PeerSpec> = vec![PeerSpec {
2617            public_key: peer_pubkey,
2618            endpoint: node_endpoint,
2619            allowed_ips: peer_allowed,
2620            persistent_keepalive_secs: 25,
2621            // The guest reaches the node via its NAT gateway (the only host
2622            // address it can route to); it does not run the host's ICE-lite
2623            // candidate exchange, so no candidates are advertised here.
2624            candidates: Vec::new(),
2625        }];
2626
2627        // 4. Register the guest's public key as a GLOBAL peer (host route to the
2628        //    guest at <overlay_ip>/32, roaming endpoint learned from keepalive).
2629        //    Go through the same internal path `AddPeer { Global }` uses.
2630        let host_route = format!(
2631            "{}/{}",
2632            overlay_ip,
2633            if overlay_ip.is_ipv6() { 128 } else { 32 }
2634        );
2635        let guest_peer = PeerSpec {
2636            public_key: public_key.clone(),
2637            // Empty/roaming: the guest is behind NAT; boringtun learns its source
2638            // endpoint from the guest's first keepalive. `0.0.0.0:0` is the
2639            // wire-safe "unset endpoint" sentinel that still parses as a
2640            // SocketAddr (peer_spec_to_info requires a parseable endpoint).
2641            endpoint: "0.0.0.0:0".to_string(),
2642            allowed_ips: host_route,
2643            persistent_keepalive_secs: 0,
2644            // The guest's roaming endpoint is learned from its first keepalive;
2645            // it advertises no NAT candidates (the host learns the source).
2646            candidates: Vec::new(),
2647        };
2648        let guest_peer_info = peer_spec_to_info(&guest_peer)?;
2649        let scope = if dedicated {
2650            PeerScope::Service {
2651                service: service.to_string(),
2652            }
2653        } else {
2654            PeerScope::Global
2655        };
2656        {
2657            let transport = self.transport_for_scope(&scope)?;
2658            if let Err(e) = Self::add_peer_on(transport, &guest_peer_info).await {
2659                self.release_guest_ip(overlay_ip, pool_service.as_deref());
2660                return Err(e);
2661            }
2662        }
2663        // Track it among the global peers (so a *subsequent* guest attach also
2664        // learns about this guest) and record the attachment for detach.
2665        self.global_peers
2666            .insert(public_key.clone(), guest_peer.clone());
2667        // Per-network membership + node-side L3 isolation: record the guest's
2668        // overlay IP in its isolated network's member set, and enforce the
2669        // cross-platform isolation policy on THIS node. A VZ guest hairpins ALL
2670        // its overlay traffic through this node's WireGuard device, so the node
2671        // is the enforcement point: on macOS this dispatches to pf (a per-network
2672        // table + sub-anchor); on Linux it dispatches to iptables (harmless here
2673        // — guests do not run on Linux). The guest's own WireGuard AllowedIPs are
2674        // the in-guest belt; this is the node-side suspenders.
2675        if let Some(ref net) = isolation_network {
2676            let node_ip = self
2677                .node_ip
2678                .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::new(10, 200, 0, 1)));
2679            let cidr = self
2680                .cluster_cidr
2681                .map_or_else(|| "10.200.0.0/16".to_string(), |c| c.to_string());
2682            // Peers = current members of the network BEFORE inserting this guest.
2683            let peers: Vec<IpAddr> = self
2684                .network_members
2685                .get(net)
2686                .map(|m| m.iter().copied().collect())
2687                .unwrap_or_default();
2688            if let Err(e) = zlayer_overlay::firewall::ensure_member_isolation(
2689                net, overlay_ip, &peers, node_ip, &cidr,
2690            ) {
2691                tracing::warn!(network = %net, member = %overlay_ip, error = %e, "failed to install per-network L3 isolation for guest (non-fatal)");
2692            }
2693            self.network_members
2694                .entry(net.clone())
2695                .or_default()
2696                .insert(overlay_ip);
2697        }
2698        self.guest_attachments.insert(
2699            id.to_string(),
2700            GuestAttachInfo {
2701                overlay_ip,
2702                public_key: public_key.clone(),
2703                service_name: pool_service,
2704                isolation_network,
2705            },
2706        );
2707
2708        // 5. Return the config the caller ships into the guest.
2709        Ok(GuestOverlayConfig {
2710            overlay_ip,
2711            prefix_len,
2712            private_key,
2713            public_key,
2714            // The guest's device listens on the same port as its single in-guest
2715            // peer (the node device it joins): the node's overlay WG port for the
2716            // global mesh, or the dedicated device's listen port in Dedicated mode.
2717            listen_port: peer_listen_port,
2718            peers,
2719            dns_server: dns_server.or_else(|| self.dns_server_addr.map(|s| s.ip())),
2720            dns_domain: dns_domain.or_else(|| self.dns_domain.clone()),
2721        })
2722    }
2723
2724    /// Release a guest-managed attach by `id`: drop the host route + global peer
2725    /// and return the allocated IP to its pool. Idempotent.
2726    ///
2727    /// # Errors
2728    /// Returns an error only if removing the peer from the global transport fails
2729    /// for a reason other than "peer not found".
2730    async fn detach_container_guest(&mut self, id: &str) -> Result<(), OverlaydError> {
2731        let Some(info) = self.guest_attachments.remove(id) else {
2732            return Ok(());
2733        };
2734        // Remove the guest's peer from the same scope it was registered on: a
2735        // Dedicated guest sits on its service's dedicated device, every other
2736        // guest on the global cluster device. Mirror the attach-time scope choice
2737        // so a dedicated guest peer does not leak on teardown.
2738        let scope = match info.service_name.as_deref() {
2739            Some(svc)
2740                if self
2741                    .service_modes
2742                    .get(svc)
2743                    .copied()
2744                    .unwrap_or_default()
2745                    .uses_per_service_wg() =>
2746            {
2747                PeerScope::Service {
2748                    service: svc.to_string(),
2749                }
2750            }
2751            _ => PeerScope::Global,
2752        };
2753        self.global_peers.remove(&info.public_key);
2754        if let Ok(transport) = self.transport_for_scope(&scope) {
2755            if let Err(e) = Self::remove_peer_on(transport, &info.public_key).await {
2756                tracing::warn!(
2757                    guest = %id,
2758                    pubkey = %info.public_key,
2759                    scope = ?scope,
2760                    error = %e,
2761                    "failed to remove guest peer from its overlay transport"
2762                );
2763            }
2764        }
2765        // Drain the per-network membership set for this guest and tear down the
2766        // node-side L3 isolation rule for it (pf on macOS, iptables on Linux —
2767        // the latter is a no-op for guests, which never run on Linux). Drop the
2768        // network entry once empty.
2769        if let Some(net) = info.isolation_network.as_deref() {
2770            if let Some(set) = self.network_members.get_mut(net) {
2771                set.remove(&info.overlay_ip);
2772            }
2773            let remaining_peers: Vec<IpAddr> = self
2774                .network_members
2775                .get(net)
2776                .map(|m| m.iter().copied().collect())
2777                .unwrap_or_default();
2778            let node_ip = self
2779                .node_ip
2780                .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::new(10, 200, 0, 1)));
2781            let cidr = self
2782                .cluster_cidr
2783                .map_or_else(|| "10.200.0.0/16".to_string(), |c| c.to_string());
2784            zlayer_overlay::firewall::remove_member_isolation(
2785                net,
2786                info.overlay_ip,
2787                &remaining_peers,
2788                node_ip,
2789                &cidr,
2790            );
2791            if self
2792                .network_members
2793                .get(net)
2794                .is_some_and(std::collections::HashSet::is_empty)
2795            {
2796                self.network_members.remove(net);
2797            }
2798        }
2799        // Return the IP to whichever pool it came from.
2800        self.release_guest_ip(info.overlay_ip, info.service_name.as_deref());
2801        Ok(())
2802    }
2803
2804    // -- container attach (macOS host-shared) -------------------------------
2805
2806    /// Host-shared overlay attach: give a macOS host-shared container
2807    /// ([`AttachHandle::HostShared`] — Seatbelt / native-VZ / libkrun) its own
2808    /// first-class L3 overlay membership.
2809    ///
2810    /// A host-shared container shares the node's host network namespace and its
2811    /// single cluster `utun`; it cannot get its own netns/veth (Seatbelt) or its
2812    /// own kernel `WireGuard` device (no guest VM to run one). So instead of a
2813    /// veth/HCN endpoint or a per-guest WG keypair, this:
2814    /// 1. allocates a DISTINCT overlay `/32` from the node slice (never the node
2815    ///    IP — `IpAllocator` reserves offset 1 — and never `None`). The node
2816    ///    slice is already advertised cluster-wide as this node's `AllowedIPs`,
2817    ///    so the `/32` auto-routes to this node with no peer reconfiguration;
2818    /// 2. adds that `/32` as an alias on the node's overlay `utun` so the kernel
2819    ///    delivers inbound overlay packets for it locally (boringtun decrypts
2820    ///    and writes the plaintext packet to the utun, which only delivers to a
2821    ///    configured local address);
2822    /// 3. records per-network membership + installs node-side L3 isolation when
2823    ///    `isolation_network` is set (pf on macOS), exactly like the guest path;
2824    /// 4. records the attachment keyed by `id` so `DetachContainer` can remove
2825    ///    the alias, drain the membership, and release the IP.
2826    ///
2827    /// HONEST CONSTRAINT: host-shared containers share the node's single cluster
2828    /// `utun`, so `OverlayMode::Dedicated`'s per-service `WireGuard` CRYPTO
2829    /// isolation cannot apply to them — there is no per-container WG device
2830    /// without a netns or a guest VM to host one. They still get a distinct
2831    /// overlay IP + L3 isolation (per-network membership / pf) + overlay DNS,
2832    /// which is full first-class L3 overlay membership. This is a real OS
2833    /// constraint of host-shared execution, not a stub.
2834    ///
2835    /// # Errors
2836    /// Returns an error if the node slice is exhausted, or if the global overlay
2837    /// interface is not set up (so there is no `utun` to alias the `/32` on).
2838    async fn attach_container_host_shared(
2839        &mut self,
2840        id: &str,
2841        service: &str,
2842        ephemeral: bool,
2843        isolation_network: Option<String>,
2844    ) -> Result<IpAddr, OverlaydError> {
2845        // 1. Allocate a distinct /32 from the node slice. Never the node IP
2846        //    (reserved at offset 1), never None — exhaustion maps to the same
2847        //    `OverlaydError::Overlay` the other attach paths surface.
2848        let ip = self.ip_allocator.allocate()?;
2849        let prefix_len: u8 = if ip.is_ipv6() { 128 } else { 32 };
2850
2851        // 2. Make the /32 locally deliverable on the node's overlay utun via an
2852        //    alias on the single cluster transport's interface. Roll the IP
2853        //    allocation back on any failure so nothing leaks.
2854        let alias_res = if let Some(transport) = self.global_transport.as_ref() {
2855            transport
2856                .add_alias(ip, prefix_len)
2857                .await
2858                .map_err(|e| OverlaydError::Overlay(e.to_string()))
2859        } else {
2860            Err(OverlaydError::Other(
2861                "host-shared attach requires the global overlay to be set up first \
2862                 (no utun to alias the container /32 on)"
2863                    .to_string(),
2864            ))
2865        };
2866        if let Err(e) = alias_res {
2867            self.ip_allocator.release(ip);
2868            return Err(e);
2869        }
2870
2871        // 3. Per-network membership + node-side L3 isolation (mirror the guest
2872        //    path). The host-shared container hairpins all overlay traffic
2873        //    through this node's WireGuard device, so the node is the
2874        //    enforcement point (pf on macOS).
2875        if let Some(ref net) = isolation_network {
2876            let node_ip = self
2877                .node_ip
2878                .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::new(10, 200, 0, 1)));
2879            let cidr = self
2880                .cluster_cidr
2881                .map_or_else(|| "10.200.0.0/16".to_string(), |c| c.to_string());
2882            // Peers = current members of the network BEFORE inserting this one.
2883            let peers: Vec<IpAddr> = self
2884                .network_members
2885                .get(net)
2886                .map(|m| m.iter().copied().collect())
2887                .unwrap_or_default();
2888            if let Err(e) =
2889                zlayer_overlay::firewall::ensure_member_isolation(net, ip, &peers, node_ip, &cidr)
2890            {
2891                tracing::warn!(network = %net, member = %ip, error = %e, "failed to install per-network L3 isolation for host-shared container (non-fatal)");
2892            }
2893            self.network_members
2894                .entry(net.clone())
2895                .or_default()
2896                .insert(ip);
2897        }
2898
2899        // 4. Record the attachment so detach can reverse all of the above.
2900        self.host_shared_attachments.insert(
2901            id.to_string(),
2902            AttachInfo {
2903                service_ip: ip,
2904                service_name: Some(service.to_string()),
2905                // No separate global/eth1 IP: a host-shared container reaches the
2906                // global overlay through the SAME /32 aliased on the node utun.
2907                global_ip: None,
2908                ephemeral,
2909                isolation_network,
2910            },
2911        );
2912
2913        Ok(ip)
2914    }
2915
2916    /// Release a host-shared attach by `id`: remove the utun `/32` alias, drain
2917    /// its per-network L3 isolation membership, and return the IP to the node
2918    /// slice. Idempotent. Mirrors [`Self::detach_container_guest`].
2919    ///
2920    /// # Errors
2921    /// Returns `Ok` even when removing the alias fails (best-effort, logged) —
2922    /// the IP is always returned to the pool so it can never leak.
2923    async fn detach_container_host_shared(&mut self, id: &str) -> Result<(), OverlaydError> {
2924        let Some(info) = self.host_shared_attachments.remove(id) else {
2925            return Ok(());
2926        };
2927        // Drain the per-network membership set and tear down the node-side L3
2928        // isolation rule for this container; drop the network entry once empty.
2929        if let Some(net) = info.isolation_network.as_deref() {
2930            if let Some(set) = self.network_members.get_mut(net) {
2931                set.remove(&info.service_ip);
2932            }
2933            let remaining_peers: Vec<IpAddr> = self
2934                .network_members
2935                .get(net)
2936                .map(|m| m.iter().copied().collect())
2937                .unwrap_or_default();
2938            let node_ip = self
2939                .node_ip
2940                .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::new(10, 200, 0, 1)));
2941            let cidr = self
2942                .cluster_cidr
2943                .map_or_else(|| "10.200.0.0/16".to_string(), |c| c.to_string());
2944            zlayer_overlay::firewall::remove_member_isolation(
2945                net,
2946                info.service_ip,
2947                &remaining_peers,
2948                node_ip,
2949                &cidr,
2950            );
2951            if self
2952                .network_members
2953                .get(net)
2954                .is_some_and(std::collections::HashSet::is_empty)
2955            {
2956                self.network_members.remove(net);
2957            }
2958        }
2959        // Remove the utun /32 alias (best-effort: a failed removal must not
2960        // strand the IP, so we log and still release below).
2961        let prefix_len: u8 = if info.service_ip.is_ipv6() { 128 } else { 32 };
2962        if let Some(transport) = self.global_transport.as_ref() {
2963            if let Err(e) = transport.remove_alias(info.service_ip, prefix_len).await {
2964                tracing::warn!(
2965                    container = %id,
2966                    ip = %info.service_ip,
2967                    error = %e,
2968                    "failed to remove host-shared overlay /32 alias from utun (non-fatal)"
2969                );
2970            }
2971        }
2972        // Return the IP to the node slice.
2973        self.ip_allocator.release(info.service_ip);
2974
2975        // Per-job segment lifecycle observability. Unlike the Linux veth path —
2976        // which reaps a per-service BRIDGE on the last ephemeral detach — a
2977        // host-shared container shares the node's single cluster utun and owns
2978        // no per-service bridge or dedicated WG device to tear down (see the
2979        // HONEST CONSTRAINT note on `attach_container_host_shared`). The only
2980        // per-segment state is its overlay `/32` + per-network membership, both
2981        // already reversed above. So `ephemeral` and `service_name` drive the
2982        // last-leaver TRACE here (mirroring the Linux ephemeral-teardown log)
2983        // rather than a bridge teardown: an ephemeral (per-job) segment's IP
2984        // return is logged at info level for reclamation traceability, a
2985        // managed service's at debug.
2986        let service = info.service_name.as_deref().unwrap_or("<none>");
2987        if info.ephemeral {
2988            tracing::info!(
2989                container = %id,
2990                service = %service,
2991                ip = %info.service_ip,
2992                "ephemeral host-shared overlay member detached — per-job segment /32 returned to node slice"
2993            );
2994        } else {
2995            tracing::debug!(
2996                container = %id,
2997                service = %service,
2998                ip = %info.service_ip,
2999                "host-shared overlay member detached — /32 returned to node slice"
3000            );
3001        }
3002        Ok(())
3003    }
3004
3005    /// Release a guest overlay IP back to the pool it was drawn from: the named
3006    /// service bridge's allocator (Linux) when `service` is set and the bridge
3007    /// still exists, otherwise the node slice allocator.
3008    fn release_guest_ip(&mut self, ip: IpAddr, service: Option<&str>) {
3009        #[cfg(target_os = "linux")]
3010        {
3011            // A Shared-mode service drew from the single node-wide shared bridge,
3012            // which is keyed by subnet, not by service name. Try it first.
3013            if let Some(bridge) = self.shared_bridge.as_mut() {
3014                if bridge.subnet.contains(&ip) {
3015                    bridge.ip_allocator.release(ip);
3016                    return;
3017                }
3018            }
3019            if let Some(svc) = service {
3020                if let Some(bridge) = self.service_bridges.get_mut(svc) {
3021                    bridge.ip_allocator.release(ip);
3022                    return;
3023                }
3024            }
3025        }
3026        #[cfg(not(target_os = "linux"))]
3027        {
3028            // A Dedicated-mode guest drew its IP from the per-service transport's
3029            // allocator (keyed by service name); return it there so the dedicated
3030            // subnet does not leak addresses across guest churn.
3031            if let Some(svc) = service {
3032                if let Some(st) = self.service_transports.get_mut(svc) {
3033                    st.ip_allocator.release(ip);
3034                    return;
3035                }
3036            }
3037        }
3038        let _ = service;
3039        self.ip_allocator.release(ip);
3040    }
3041
3042    /// Prefix length of the address pool guest IPs are drawn from when not using
3043    /// a per-service bridge: the node slice if assigned, else the cluster CIDR.
3044    fn slice_prefix_len(&self) -> u8 {
3045        self.slice_cidr.or(self.cluster_cidr).map_or(
3046            if self.node_ip.is_some_and(|ip| ip.is_ipv6()) {
3047                64
3048            } else {
3049                24
3050            },
3051            |c| c.prefix(),
3052        )
3053    }
3054
3055    /// Reachable `WireGuard` endpoint for THIS node, advertised to a guest as a
3056    /// peer on `listen_port` (the node's global overlay port, or a Dedicated
3057    /// service's per-service device port). overlayd has no public reflexive
3058    /// address at this layer, so it uses the node's overlay-listen identity
3059    /// (`node_ip:listen_port`); the caller (the VZ runtime that ships the config
3060    /// into the guest) rewrites it to the concrete VZ-NAT gateway endpoint the
3061    /// guest can dial. Falls back to the unspecified address when no node IP is
3062    /// assigned yet.
3063    fn node_endpoint_for_guest(&self, listen_port: u16) -> String {
3064        let ip = self.node_ip.unwrap_or(IpAddr::V4(Ipv4Addr::UNSPECIFIED));
3065        SocketAddr::new(ip, listen_port).to_string()
3066    }
3067
3068    /// Linux veth/netns attach. On non-Linux this returns the node's overlay IP
3069    /// (host networking) and is never wired for a `LinuxPid` handle in practice.
3070    #[cfg(target_os = "linux")]
3071    #[allow(clippy::too_many_lines)]
3072    async fn attach_container_linux(
3073        &mut self,
3074        container_pid: u32,
3075        service: &str,
3076        join_global: bool,
3077        ephemeral: bool,
3078        isolation_network: Option<String>,
3079    ) -> Result<IpAddr, OverlaydError> {
3080        // Resolve which bridge backs this service. A `Shared`-mode service
3081        // attaches onto the SINGLE node-wide shared bridge; every other mode
3082        // (`Auto`, `Dedicated`) attaches onto its own per-service bridge. The
3083        // mode was recorded at `setup_service_overlay` time.
3084        let use_shared = self
3085            .service_modes
3086            .get(service)
3087            .copied()
3088            .unwrap_or_default()
3089            .uses_shared_bridge();
3090
3091        let (bridge_name, bridge_subnet, bridge_gateway, container_ip) = if use_shared {
3092            let bridge = self.shared_bridge.as_mut().ok_or_else(|| {
3093                OverlaydError::Other(format!(
3094                    "no shared bridge for Shared-mode service {service}; call setup_service_overlay() first"
3095                ))
3096            })?;
3097            let ip = bridge.ip_allocator.allocate().ok_or_else(|| {
3098                OverlaydError::Overlay(format!(
3099                    "shared bridge {} subnet {} exhausted",
3100                    bridge.name, bridge.subnet
3101                ))
3102            })?;
3103            (bridge.name.clone(), bridge.subnet, bridge.gateway, ip)
3104        } else {
3105            let bridge = self.service_bridges.get_mut(service).ok_or_else(|| {
3106                OverlaydError::Other(format!(
3107                    "no service bridge for service {service}; call setup_service_overlay() first"
3108                ))
3109            })?;
3110            let ip = bridge.ip_allocator.allocate().ok_or_else(|| {
3111                OverlaydError::Overlay(format!(
3112                    "service bridge {} subnet {} exhausted",
3113                    bridge.name, bridge.subnet
3114                ))
3115            })?;
3116            (bridge.name.clone(), bridge.subnet, bridge.gateway, ip)
3117        };
3118
3119        let bridge_params = BridgeAttachParams {
3120            bridge_name: &bridge_name,
3121            gateway: bridge_gateway,
3122            subnet_prefix_len: bridge_subnet.prefix_len(),
3123        };
3124        if let Err(e) = self
3125            .attach_to_interface(
3126                container_pid,
3127                container_ip,
3128                "s",
3129                "eth0",
3130                Some(&bridge_params),
3131            )
3132            .await
3133        {
3134            if use_shared {
3135                if let Some(bridge) = self.shared_bridge.as_mut() {
3136                    bridge.ip_allocator.release(container_ip);
3137                }
3138            } else if let Some(bridge) = self.service_bridges.get_mut(service) {
3139                bridge.ip_allocator.release(container_ip);
3140            }
3141            return Err(e);
3142        }
3143
3144        let mut global_ip: Option<IpAddr> = None;
3145        if join_global && self.global_interface.is_some() {
3146            let g_ip = self.ip_allocator.allocate()?;
3147            self.attach_to_interface(container_pid, g_ip, "g", "eth1", None)
3148                .await?;
3149            global_ip = Some(g_ip);
3150        }
3151
3152        // Per-network L3 isolation: when this attach joins a named isolated
3153        // network, install the Docker-style iptables rules pinning this member
3154        // to its own network's members + node + egress, then record it in the
3155        // membership map. Non-fatal: a host without iptables logs and proceeds.
3156        if let Some(ref net) = isolation_network {
3157            let node_ip = self
3158                .node_ip
3159                .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::new(10, 200, 0, 1)));
3160            let cidr = self
3161                .cluster_cidr
3162                .map_or_else(|| "10.200.0.0/16".to_string(), |c| c.to_string());
3163            let peers: Vec<IpAddr> = self
3164                .network_members
3165                .get(net)
3166                .map(|m| m.iter().copied().collect())
3167                .unwrap_or_default();
3168            if let Err(e) = zlayer_overlay::firewall::ensure_member_isolation(
3169                net,
3170                container_ip,
3171                &peers,
3172                node_ip,
3173                &cidr,
3174            ) {
3175                tracing::warn!(network = %net, member = %container_ip, error = %e, "failed to install per-network L3 isolation (non-fatal)");
3176            }
3177            self.network_members
3178                .entry(net.clone())
3179                .or_default()
3180                .insert(container_ip);
3181        }
3182
3183        self.attached.insert(
3184            container_pid,
3185            AttachInfo {
3186                service_ip: container_ip,
3187                service_name: Some(service.to_string()),
3188                global_ip,
3189                ephemeral,
3190                isolation_network,
3191            },
3192        );
3193
3194        Ok(container_ip)
3195    }
3196
3197    /// Non-Linux fallback: containers share the host network, so return the
3198    /// node's overlay IP (or loopback).
3199    #[cfg(not(target_os = "linux"))]
3200    #[allow(clippy::unused_async)]
3201    async fn attach_container_linux(
3202        &mut self,
3203        _container_pid: u32,
3204        service: &str,
3205        _join_global: bool,
3206        _ephemeral: bool,
3207        _isolation_network: Option<String>,
3208    ) -> Result<IpAddr, OverlaydError> {
3209        tracing::debug!(service = %service, "LinuxPid attach is a no-op off Linux; using node overlay IP");
3210        Ok(self.node_ip.unwrap_or(IpAddr::V4(Ipv4Addr::LOCALHOST)))
3211    }
3212
3213    /// Release the overlay resources held by a Linux container PID. Idempotent.
3214    #[cfg(target_os = "linux")]
3215    async fn detach_container_linux(&mut self, pid: u32) -> Result<(), OverlaydError> {
3216        // "Process id or not, kill the adapter": the host-side veth name is
3217        // deterministic (`veth-<pid>-{s,g}`), so delete it UNCONDITIONALLY by
3218        // name — even when no attach record survives (a previous daemon crashed
3219        // before recording it, or it was already reaped). Without this, a missing
3220        // record left the host veth orphaned until the PID-keyed periodic sweep
3221        // (which only fires once the PID is dead). The deletes are idempotent
3222        // (ENODEV = success), so the always-on `-g` delete is harmless when the
3223        // container never joined the global overlay.
3224        let info = self.attached.remove(&pid);
3225
3226        let veth_s = format!("veth-{pid}-s");
3227        if let Err(e) = crate::netlink::delete_link_by_name(&veth_s).await {
3228            tracing::warn!(link = %veth_s, pid, error = %e, "Failed to delete service veth");
3229        }
3230        let veth_g = format!("veth-{pid}-g");
3231        if let Err(e) = crate::netlink::delete_link_by_name(&veth_g).await {
3232            tracing::warn!(link = %veth_g, pid, error = %e, "Failed to delete global veth");
3233        }
3234
3235        // No attach record -> nothing more to release (IP/registry bookkeeping
3236        // is keyed off the record). The veths above are already gone.
3237        let Some(info) = info else {
3238            return Ok(());
3239        };
3240
3241        // Return the service IP to whichever pool owns it. A Shared-mode service
3242        // drew its IP from the single node-wide shared bridge (no per-service
3243        // bridge exists for it), so try the shared bridge by subnet containment
3244        // before the named per-service bridge.
3245        if self.shared_bridge.as_mut().is_some_and(|b| {
3246            b.subnet.contains(&info.service_ip) && b.ip_allocator.release(info.service_ip)
3247        }) {
3248            // released into the shared bridge
3249        } else if let Some(svc) = info.service_name.as_deref() {
3250            if let Some(bridge) = self.service_bridges.get_mut(svc) {
3251                bridge.ip_allocator.release(info.service_ip);
3252            } else {
3253                tracing::debug!(service = %svc, ip = %info.service_ip, "detach: service bridge already torn down; dropping service IP release");
3254            }
3255        } else {
3256            self.ip_allocator.release(info.service_ip);
3257        }
3258        if let Some(g) = info.global_ip {
3259            self.ip_allocator.release(g);
3260        }
3261
3262        // Per-network L3 isolation drain: remove this member from its isolated
3263        // network's membership set and tear down its iptables rules against the
3264        // remaining members. Drop the network entry once empty.
3265        if let Some(net) = info.isolation_network.as_deref() {
3266            if let Some(set) = self.network_members.get_mut(net) {
3267                set.remove(&info.service_ip);
3268            }
3269            let still: Vec<IpAddr> = self
3270                .network_members
3271                .get(net)
3272                .map(|m| m.iter().copied().collect())
3273                .unwrap_or_default();
3274            let node_ip = self
3275                .node_ip
3276                .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::new(10, 200, 0, 1)));
3277            let cidr = self
3278                .cluster_cidr
3279                .map_or_else(|| "10.200.0.0/16".to_string(), |c| c.to_string());
3280            zlayer_overlay::firewall::remove_member_isolation(
3281                net,
3282                info.service_ip,
3283                &still,
3284                node_ip,
3285                &cidr,
3286            );
3287            if self
3288                .network_members
3289                .get(net)
3290                .is_some_and(std::collections::HashSet::is_empty)
3291            {
3292                self.network_members.remove(net);
3293            }
3294        }
3295
3296        // Ephemeral last-leaver teardown: a standalone/per-job bridge is reclaimed
3297        // the moment its LAST container leaves (the periodic prune is only the
3298        // ~300s backstop). Managed attaches use ephemeral=false so their bridge
3299        // persists across scale-to-0. Route through teardown_service_overlay so
3300        // overlayd's in-memory state stays synced — never a hand `ip link del`.
3301        // This container's veth is already removed above, so a 0 member count
3302        // means no containers remain on the bridge.
3303        if info.ephemeral {
3304            if let Some(svc) = info.service_name.clone() {
3305                if let Some(bridge_name) = self.service_bridges.get(&svc).map(|b| b.name.clone()) {
3306                    if crate::netlink::bridge_member_count(&bridge_name).await == 0 {
3307                        tracing::info!(service = %svc, bridge = %bridge_name, "ephemeral overlay bridge idle after last detach — tearing down");
3308                        self.teardown_service_overlay(&svc).await;
3309                    }
3310                }
3311            }
3312        }
3313        Ok(())
3314    }
3315
    /// Non-Linux fallback for the Linux container detach: the matching
    /// non-Linux attach creates no veth and allocates no IP (host networking),
    /// so there is nothing to release and this always succeeds.
    ///
    /// Kept `async` with the same signature as the Linux variant so callers are
    /// platform-agnostic; the `unused_async` lint is silenced for that reason.
    #[cfg(not(target_os = "linux"))]
    #[allow(clippy::unused_async)]
    async fn detach_container_linux(&mut self, _pid: u32) -> Result<(), OverlaydError> {
        Ok(())
    }
3322
3323    /// Best-effort sweep of orphan veth endpoints whose owning container PID is
3324    /// no longer alive. Names matching `veth-<pid>-*` / `vc-<pid>-*` where
3325    /// `/proc/<pid>` does not exist are deleted.
3326    #[cfg(target_os = "linux")]
3327    async fn sweep_orphan_veths() {
3328        let links = match crate::netlink::list_all_links().await {
3329            Ok(links) => links,
3330            Err(e) => {
3331                tracing::warn!(error = %e, "Failed to list links for orphan sweep");
3332                return;
3333            }
3334        };
3335        for (_index, name) in links {
3336            let remainder = if let Some(r) = name.strip_prefix("veth-") {
3337                r
3338            } else if let Some(r) = name.strip_prefix("vc-") {
3339                r
3340            } else {
3341                continue;
3342            };
3343            let Some(pid_str) = remainder.split('-').next() else {
3344                continue;
3345            };
3346            let pid: u32 = match pid_str.parse() {
3347                Ok(p) => p,
3348                Err(_) => continue,
3349            };
3350            if Path::new(&format!("/proc/{pid}")).exists() {
3351                continue;
3352            }
3353            tracing::info!(link = %name, pid = pid, "Deleting orphan veth");
3354            if let Err(e) = crate::netlink::delete_link_by_name(&name).await {
3355                tracing::warn!(link = %name, error = %e, "Failed to delete orphan veth");
3356            }
3357        }
3358    }
3359
3360    #[cfg(target_os = "linux")]
3361    #[allow(clippy::too_many_lines)]
3362    async fn attach_to_interface(
3363        &mut self,
3364        container_pid: u32,
3365        ip: IpAddr,
3366        tag: &str,
3367        container_iface: &str,
3368        bridge: Option<&BridgeAttachParams<'_>>,
3369    ) -> Result<(), OverlaydError> {
3370        // Best-effort cleanup of orphan veths left by a previous daemon crash.
3371        Self::sweep_orphan_veths().await;
3372
3373        let is_v6 = ip.is_ipv6();
3374        let prefix_len: u8 = if let Some(b) = bridge {
3375            b.subnet_prefix_len
3376        } else if is_v6 {
3377            64
3378        } else {
3379            24
3380        };
3381        let host_prefix: u8 = if is_v6 { 128 } else { 32 };
3382
3383        let veth_host = format!("veth-{container_pid}-{tag}");
3384        let veth_pending = format!("vc-{container_pid}-{tag}");
3385        let veth_container = container_iface.to_string();
3386
3387        let container_ns_fd = std::os::fd::OwnedFd::from(
3388            std::fs::File::open(format!("/proc/{container_pid}/ns/net")).map_err(|e| {
3389                OverlaydError::Overlay(format!("Failed to open /proc/{container_pid}/ns/net: {e}"))
3390            })?,
3391        );
3392
3393        crate::netlink::delete_link_by_name(&veth_host)
3394            .await
3395            .map_err(|e| OverlaydError::Overlay(format!("pre-cleanup delete {veth_host}: {e}")))?;
3396        crate::netlink::delete_link_by_name(&veth_pending)
3397            .await
3398            .map_err(|e| {
3399                OverlaydError::Overlay(format!("pre-cleanup delete {veth_pending}: {e}"))
3400            })?;
3401
3402        let bridge_gateway: Option<IpAddr> = bridge.map(|b| b.gateway);
3403        let bridge_name: Option<String> = bridge.map(|b| b.bridge_name.to_string());
3404        let node_ip = self.node_ip;
3405
3406        let result: Result<(), OverlaydError> = async {
3407            crate::netlink::create_veth_pair(&veth_host, &veth_pending)
3408                .await
3409                .map_err(|e| OverlaydError::Overlay(format!("create veth pair: {e}")))?;
3410
3411            crate::netlink::move_link_into_netns_fd_and_rename(
3412                &veth_pending,
3413                AsFd::as_fd(&container_ns_fd),
3414                &veth_container,
3415            )
3416            .map_err(|e| OverlaydError::Overlay(format!("move veth into netns: {e}")))?;
3417
3418            let vc = veth_container.clone();
3419            let bridge_gateway_for_netns = bridge_gateway;
3420            tokio::task::spawn_blocking(move || {
3421                crate::netlink::with_netns_fd_async(container_ns_fd, move || async move {
3422                    crate::netlink::add_address_to_link_by_name(&vc, ip, prefix_len).await?;
3423                    crate::netlink::set_link_up_by_name(&vc).await?;
3424                    crate::netlink::set_link_up_by_name("lo").await?;
3425                    if let Some(gw) = bridge_gateway_for_netns {
3426                        crate::netlink::add_default_route_via_gateway(gw).await?;
3427                    }
3428                    Ok(())
3429                })
3430            })
3431            .await
3432            .map_err(|e| OverlaydError::Overlay(format!("container netns task panicked: {e}")))?
3433            .map_err(|e| OverlaydError::Overlay(format!("container netns ops: {e}")))?;
3434
3435            crate::netlink::set_link_up_by_name(&veth_host)
3436                .await
3437                .map_err(|e| OverlaydError::Overlay(format!("set {veth_host} up: {e}")))?;
3438
3439            if let Some(bname) = bridge_name.as_deref() {
3440                crate::netlink::add_link_to_bridge(&veth_host, bname)
3441                    .await
3442                    .map_err(|e| {
3443                        OverlaydError::Overlay(format!(
3444                            "enslave {veth_host} to bridge {bname}: {e}"
3445                        ))
3446                    })?;
3447            } else {
3448                crate::netlink::replace_route_via_dev(ip, host_prefix, &veth_host, node_ip)
3449                    .await
3450                    .map_err(|e| {
3451                        OverlaydError::Overlay(format!("host route for {ip}/{host_prefix}: {e}"))
3452                    })?;
3453            }
3454
3455            Ok(())
3456        }
3457        .await;
3458
3459        // Enable IP forwarding so the host routes between the overlay device(s)
3460        // and the egress NIC. CRITICAL: this is scoped to the address family
3461        // actually in use and (for IPv6) to the specific overlay devices —
3462        // NEVER `net.ipv6.conf.all.forwarding`, whose documented kernel side
3463        // effect is to force `accept_ra=0` + `autoconf=0` on every IPv6
3464        // interface (including the public NIC), dropping the RA-learned default
3465        // route / path-MTU and blackholing the host's own larger reply packets
3466        // (e.g. inbound SSH stalls after key exchange). Done outside the
3467        // attach `result` block so a forwarding-sysctl failure can never roll
3468        // back a successful veth attach. Tracked so teardown reverts it.
3469        if result.is_ok() {
3470            self.enable_forwarding_for_attach(is_v6, &veth_host, bridge_name.as_deref());
3471
3472            // Track the host-side resources this attach created so a clean
3473            // global teardown reverts every host mutation. The host-side veth
3474            // half exists in both the bridged and bridgeless paths; the host
3475            // `/32`(`/128`) route is installed ONLY on the bridgeless path
3476            // (`replace_route_via_dev` above), so record it only when there was
3477            // no bridge to enslave into. All deletions are idempotent, so a
3478            // resource a later per-container detach removes first is harmless.
3479            self.created_veths.insert(veth_host.clone());
3480            if bridge_name.is_none() {
3481                self.created_host_routes
3482                    .push((ip, host_prefix, veth_host.clone()));
3483            }
3484        }
3485
3486        if result.is_err() {
3487            let _ = crate::netlink::delete_link_by_name(&veth_host).await;
3488            let _ = crate::netlink::delete_link_by_name(&veth_pending).await;
3489        }
3490        result
3491    }
3492
3493    // -- container attach (Windows HCN) -------------------------------------
3494
3495    /// Windows attach: ensure the overlay HCN Internal network exists, allocate
3496    /// or validate the IP, create the per-container HCN endpoint + namespace,
3497    /// and return the bare-lowercase namespace GUID for the agent to embed in
3498    /// the compute-system document.
3499    ///
3500    /// # Errors
3501    /// Returns an error if the network/endpoint cannot be created or the slice
3502    /// is exhausted.
3503    #[cfg(target_os = "windows")]
3504    #[allow(clippy::too_many_lines)]
3505    async fn attach_container_windows(
3506        &mut self,
3507        container_id: &str,
3508        service: &str,
3509        ip_override: Option<IpAddr>,
3510        dns_server: Option<IpAddr>,
3511        dns_domain: Option<String>,
3512        isolation_network: Option<String>,
3513    ) -> Result<AttachResult, OverlaydError> {
3514        // Resolve whether THIS service has a dedicated per-service overlay. It
3515        // does iff a live dedicated transport exists OR a `hcn-internal` marker
3516        // entry is recorded under `owner_for_service(service)` (the network
3517        // survives daemon restarts even if the transport map is empty mid-init).
3518        // Dedicated services attach onto their OWN per-service Internal network
3519        // and draw IPs from the service subnet; everything else uses the node's
3520        // base/shared overlay network and the node slice.
3521        let dedicated_subnet = self.dedicated_service_subnet(service);
3522        // A `Shared`-mode service attaches onto the SINGLE shared HCN NAT network
3523        // reused across all Shared services (container ports are exposed via the
3524        // userspace free-port L4 proxy). The mode was recorded at setup time.
3525        let use_shared_nat = self
3526            .service_modes
3527            .get(service)
3528            .copied()
3529            .unwrap_or_default()
3530            .uses_shared_bridge();
3531
3532        let (net_id, ip, prefix_length) = if let Some(net) = isolation_network.as_deref() {
3533            // ----- per-isolation-network Internal HCN network path -----
3534            //
3535            // An "isolated" ZLayer network routes its members onto a dedicated
3536            // HCN Internal vSwitch keyed by the isolation-network NAME (not the
3537            // service). HCN Internal vSwitches are mutually isolated by default,
3538            // so same-network members share one vSwitch (reach each other +
3539            // egress via the network gateway + the node), while different
3540            // isolation networks land on separate vSwitches and cannot reach
3541            // each other — L3 isolation with NO ACLs and NO per-member churn.
3542            // This mirrors the Dedicated per-service branch below, but keyed by
3543            // the isolation-network name and drawing IPs from a per-network
3544            // subnet carved deterministically from the node slice.
3545            let iso_subnet = self.isolation_network_subnet(net)?;
3546            let net_id = self.ensure_isolation_network(net, iso_subnet).await?;
3547
3548            // Per-network container IPs come from the isolation network's own
3549            // subnet (never the node slice), via a lazily-created allocator
3550            // bounded to that subnet. The allocator is keyed by the isolation
3551            // network's owner key so it never collides with a same-named
3552            // dedicated service's allocator. An `ip_override` is honored only
3553            // when it falls inside the isolation subnet.
3554            let iso_ipnetwork: IpNetwork = iso_subnet.to_string().parse().map_err(|e| {
3555                OverlaydError::Other(format!(
3556                    "failed to parse isolation subnet {iso_subnet}: {e}"
3557                ))
3558            })?;
3559            let alloc_key = crate::network_state::owner_for_isolation_network(net);
3560            let allocator = self
3561                .service_ip_allocators
3562                .entry(alloc_key)
3563                .or_insert_with(|| IpAllocator::new(iso_ipnetwork));
3564            let ip = match ip_override {
3565                Some(ip) if iso_subnet.contains(&ip) => ip,
3566                Some(ip) => {
3567                    return Err(OverlaydError::Other(format!(
3568                        "overridden IP {ip} is not inside isolation network subnet {iso_subnet} for network {net}"
3569                    )));
3570                }
3571                None => allocator.allocate()?,
3572            };
3573            (net_id, ip, iso_subnet.prefix_len())
3574        } else if use_shared_nat {
3575            // ----- shared HCN NAT network path -----
3576            let slice = self.slice_cidr.ok_or_else(|| {
3577                OverlaydError::Other(
3578                    "no node slice assigned yet (SetupGlobalOverlay with slice_cidr first)"
3579                        .to_string(),
3580                )
3581            })?;
3582            let slice_ipnet: ipnet::IpNet = slice.to_string().parse().map_err(|e| {
3583                OverlaydError::Other(format!("failed to parse slice CIDR {slice}: {e}"))
3584            })?;
3585            let net_id = self.ensure_shared_nat_network(slice_ipnet).await?;
3586            let ip = match ip_override {
3587                Some(ip) => ip,
3588                None => self.ip_allocator.allocate()?,
3589            };
3590            (net_id, ip, slice_ipnet.prefix_len())
3591        } else if let Some(svc_subnet) = dedicated_subnet {
3592            // ----- dedicated per-service network path -----
3593            let net_id = self.ensure_service_network(service, svc_subnet).await?;
3594
3595            // Allocate (or validate) the IP from the SERVICE subnet, not the
3596            // node slice. A per-service allocator is created lazily and bounded
3597            // to the service subnet so addresses stay inside the dedicated
3598            // network. An `ip_override` inside the service subnet is honored;
3599            // one outside it is rejected so a slice-allocated IP can't leak onto
3600            // the dedicated network.
3601            let svc_ipnetwork: IpNetwork = svc_subnet.to_string().parse().map_err(|e| {
3602                OverlaydError::Other(format!("failed to parse service subnet {svc_subnet}: {e}"))
3603            })?;
3604            let allocator = self
3605                .service_ip_allocators
3606                .entry(service.to_string())
3607                .or_insert_with(|| IpAllocator::new(svc_ipnetwork));
3608            let ip = match ip_override {
3609                Some(ip) if svc_subnet.contains(&ip) => ip,
3610                Some(ip) => {
3611                    return Err(OverlaydError::Other(format!(
3612                        "overridden IP {ip} is not inside dedicated service subnet {svc_subnet} for service {service}"
3613                    )));
3614                }
3615                None => allocator.allocate()?,
3616            };
3617            (net_id, ip, svc_subnet.prefix_len())
3618        } else {
3619            // ----- shared base overlay network path (unchanged) -----
3620            let slice = self.slice_cidr.ok_or_else(|| {
3621                OverlaydError::Other(
3622                    "no node slice assigned yet (SetupGlobalOverlay with slice_cidr first)"
3623                        .to_string(),
3624                )
3625            })?;
3626            let slice_ipnet: ipnet::IpNet = slice.to_string().parse().map_err(|e| {
3627                OverlaydError::Other(format!("failed to parse slice CIDR {slice}: {e}"))
3628            })?;
3629            let net_id = self.ensure_overlay_network(slice_ipnet).await?;
3630            let ip = match ip_override {
3631                Some(ip) => ip,
3632                None => self.ip_allocator.allocate()?,
3633            };
3634            (net_id, ip, slice_ipnet.prefix_len())
3635        };
3636
3637        // 3. Create the endpoint + per-container namespace on the network.
3638        let dns_server_eff = dns_server.or_else(|| self.dns_server_addr.map(|a| a.ip()));
3639        let dns_domain_for_attach = dns_domain.or_else(|| self.dns_domain.clone());
3640        let cluster_cidr = self.cluster_cidr.map(|c| c.to_string()).unwrap_or_default();
3641        let owner_tag = owner_tag(&self.deployment_or_default());
3642        let cid = container_id.to_string();
3643
3644        let attachment = tokio::task::spawn_blocking(move || {
3645            zlayer_hns::attach::EndpointAttachment::create_overlay(
3646                net_id,
3647                &owner_tag,
3648                cid.as_str(),
3649                ip,
3650                prefix_length,
3651                &cluster_cidr,
3652                dns_server_eff,
3653                dns_domain_for_attach.as_deref(),
3654            )
3655        })
3656        .await
3657        .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?
3658        .map_err(|e| OverlaydError::Overlay(format!("HCN overlay endpoint attach failed: {e}")))?;
3659
3660        let namespace_id = attachment.namespace_id();
3661        let bare_guid = format_guid_bare(namespace_id);
3662
3663        // Per-network membership: record the container's IP in its isolated
3664        // network's member set. Windows enforcement is an HCN ACL — a
3665        // Linux-incompatible mechanism wired separately; overlayd only maintains
3666        // the membership map here and does NOT call the iptables firewall helper.
3667        if let Some(ref net) = isolation_network {
3668            self.network_members
3669                .entry(net.clone())
3670                .or_default()
3671                .insert(ip);
3672        }
3673
3674        // Record for autoclean keyed by namespace GUID.
3675        self.hcn_cleanup
3676            .insert(namespace_id, (service.to_string(), ip, isolation_network));
3677
3678        tracing::info!(
3679            ns = %bare_guid,
3680            service = %service,
3681            ip = %ip,
3682            "Attached container to HCN overlay"
3683        );
3684
3685        Ok(AttachResult {
3686            ip,
3687            namespace_guid: Some(bare_guid),
3688        })
3689    }
3690
3691    /// Non-Windows path: a `WindowsContainer` handle has no meaning off Windows.
3692    #[cfg(not(target_os = "windows"))]
3693    #[allow(clippy::unused_async)]
3694    async fn attach_container_windows(
3695        &mut self,
3696        _container_id: &str,
3697        _service: &str,
3698        _ip_override: Option<IpAddr>,
3699        _dns_server: Option<IpAddr>,
3700        _dns_domain: Option<String>,
3701        _isolation_network: Option<String>,
3702    ) -> Result<AttachResult, OverlaydError> {
3703        Err(OverlaydError::Other(
3704            "WindowsContainer attach is only supported on Windows".to_string(),
3705        ))
3706    }
3707
3708    /// Detach a Windows container by its bare namespace GUID and release its IP.
3709    /// Idempotent: unknown ids are a no-op.
3710    #[cfg(target_os = "windows")]
3711    async fn detach_container_windows(
3712        &mut self,
3713        namespace_guid: &str,
3714    ) -> Result<(), OverlaydError> {
3715        use windows::core::GUID;
3716
3717        let Ok(guid) = GUID::try_from(namespace_guid) else {
3718            tracing::warn!(ns = %namespace_guid, "detach: unparseable namespace GUID");
3719            return Ok(());
3720        };
3721        if let Some((service, ip, isolation_network)) = self.hcn_cleanup.remove(&guid) {
3722            // Release the IP into the pool it was drawn from. An isolation-network
3723            // member drew from the per-network allocator (keyed by the isolation
3724            // owner key), NOT the node slice; release it there so the isolation
3725            // subnet doesn't leak addresses. Everything else came from the node
3726            // slice.
3727            if let Some(net) = isolation_network.as_deref() {
3728                let alloc_key = crate::network_state::owner_for_isolation_network(net);
3729                if let Some(allocator) = self.service_ip_allocators.get_mut(&alloc_key) {
3730                    allocator.release(ip);
3731                } else {
3732                    self.ip_allocator.release(ip);
3733                }
3734            } else {
3735                self.ip_allocator.release(ip);
3736            }
3737            // Drain the per-network membership set.
3738            let mut net_now_empty: Option<String> = None;
3739            if let Some(net) = isolation_network.as_deref() {
3740                if let Some(set) = self.network_members.get_mut(net) {
3741                    set.remove(&ip);
3742                }
3743                if self
3744                    .network_members
3745                    .get(net)
3746                    .is_some_and(std::collections::HashSet::is_empty)
3747                {
3748                    self.network_members.remove(net);
3749                    net_now_empty = Some(net.to_string());
3750                }
3751            }
3752            tracing::info!(ns = %namespace_guid, service = %service, ip = %ip, "Released HCN overlay attachment");
3753
3754            // Last-member teardown: when the final member of an isolation network
3755            // leaves, reclaim its per-network HCN Internal network (mirroring the
3756            // per-service network teardown in `teardown_service_overlay`) so we
3757            // don't leak an HCN vSwitch until the next full uninstall. Drop the
3758            // per-network IP allocator and the marker entry too.
3759            if let Some(net) = net_now_empty {
3760                self.teardown_isolation_network(&net).await;
3761            }
3762        }
3763        Ok(())
3764    }
3765
3766    /// Reclaim the per-isolation-network HCN Internal network for `net`: delete
3767    /// the HCN network by the GUID recorded in the marker, drop its marker entry,
3768    /// and discard the per-network IP allocator. Best-effort and idempotent —
3769    /// called once the last member of the isolation network detaches. Mirrors the
3770    /// per-service network teardown in [`Self::teardown_service_overlay`].
3771    #[cfg(target_os = "windows")]
3772    async fn teardown_isolation_network(&mut self, net: &str) {
3773        let owner = crate::network_state::owner_for_isolation_network(net);
3774
3775        // Drop the per-network container-IP allocator.
3776        self.service_ip_allocators.remove(&owner);
3777
3778        let marker_path =
3779            zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();
3780        let mut marker = crate::network_state::NetworkState::load(&marker_path);
3781        let removed_entry = marker.remove(&owner);
3782        if removed_entry.is_some() {
3783            if let Err(e) = marker.save(&marker_path) {
3784                tracing::warn!(network = %net, error = %e, path = %marker_path.display(), "failed to persist isolation-network marker removal");
3785            }
3786        }
3787
3788        if let Some(entry) = removed_entry {
3789            if entry.kind == "hcn-internal" {
3790                match windows::core::GUID::try_from(entry.id.as_str()) {
3791                    Ok(guid) => {
3792                        let id_str = entry.id.clone();
3793                        let net_owned = net.to_string();
3794                        let delete = tokio::task::spawn_blocking(move || {
3795                            zlayer_hns::network::Network::delete(guid)
3796                        })
3797                        .await;
3798                        match delete {
3799                            Ok(Ok(())) => {
3800                                tracing::info!(network = %net_owned, id = %id_str, "deleted per-isolation-network HCN network on last detach");
3801                            }
3802                            Ok(Err(e)) => {
3803                                tracing::warn!(network = %net_owned, id = %id_str, error = %e, "failed to delete isolation-network HCN network (may leak until uninstall)");
3804                            }
3805                            Err(e) => {
3806                                tracing::warn!(network = %net_owned, id = %id_str, error = %e, "spawn_blocking join failed deleting isolation-network HCN network");
3807                            }
3808                        }
3809                    }
3810                    Err(_) => {
3811                        tracing::warn!(network = %net, id = %entry.id, "isolation-network marker has unparseable HCN GUID; skipping network delete");
3812                    }
3813                }
3814            }
3815        }
3816    }
3817
3818    /// Non-Windows path.
3819    #[cfg(not(target_os = "windows"))]
3820    #[allow(clippy::unused_async)]
3821    async fn detach_container_windows(
3822        &mut self,
3823        _namespace_guid: &str,
3824    ) -> Result<(), OverlaydError> {
3825        Ok(())
3826    }
3827
3828    /// Ensure the per-daemon HCN overlay (Internal vSwitch, no physical-NIC
3829    /// binding) exists on the host, reusing one recorded in the
3830    /// `{data_dir}/agent_network.json` marker or discoverable by name, and
3831    /// recording it in the marker on create.
3832    ///
3833    /// # Errors
3834    /// Propagates the underlying `zlayer_hns` error on create failure.
3835    #[cfg(target_os = "windows")]
3836    #[allow(clippy::too_many_lines)]
3837    async fn ensure_overlay_network(
3838        &mut self,
3839        slice_cidr: ipnet::IpNet,
3840    ) -> Result<windows::core::GUID, OverlaydError> {
3841        use windows::core::GUID;
3842
3843        let daemon_name = self.deployment_or_default();
3844        let net_name = overlay_network_name(&daemon_name);
3845        let marker_path =
3846            zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();
3847
3848        // Fast path: marker names a network GUID that still exists; reopen it.
3849        if let Some(recorded_id) = crate::network_state::NetworkState::load(&marker_path)
3850            .get(crate::network_state::OWNER_BASE)
3851            .and_then(|entry| GUID::try_from(entry.id.as_str()).ok())
3852        {
3853            let reopened = tokio::task::spawn_blocking(move || {
3854                zlayer_hns::network::Network::open(recorded_id).ok()
3855            })
3856            .await
3857            .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?;
3858            if reopened.is_some() {
3859                tracing::info!(name = %net_name, "reusing HCN overlay network from marker");
3860                return Ok(recorded_id);
3861            }
3862        }
3863
3864        // Idempotency: reuse a host network whose queried name matches ours.
3865        let target_name = net_name.clone();
3866        let existing = tokio::task::spawn_blocking(move || -> Option<GUID> {
3867            let guids = zlayer_hns::network::list("{}").ok()?;
3868            for guid in guids {
3869                let Ok(network) = zlayer_hns::network::Network::open(guid) else {
3870                    continue;
3871                };
3872                if matches!(network.query("{}"), Ok(props) if props.name == target_name) {
3873                    return Some(guid);
3874                }
3875            }
3876            None
3877        })
3878        .await
3879        .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?;
3880
3881        if let Some(existing_id) = existing {
3882            tracing::info!(name = %net_name, "reusing existing HCN overlay network");
3883            return Ok(existing_id);
3884        }
3885
3886        let net_id = GUID::new()
3887            .map_err(|e| OverlaydError::Other(format!("GUID::new for overlay network: {e}")))?;
3888        let subnet_str = slice_cidr.to_string();
3889
3890        // Default: an HCN Internal network — an internal vSwitch with NO
3891        // physical-NIC binding — so container traffic never touches the
3892        // operator's gateway adapter. Setting ZLAYER_HCN_UPLINK_ADAPTER opts
3893        // into the legacy Transparent model bound to that named uplink.
3894        let use_transparent = std::env::var(zlayer_hns::adapter::ZLAYER_UPLINK_ENV)
3895            .ok()
3896            .is_some_and(|v| !v.trim().is_empty());
3897
3898        let net_name_for_create = net_name.clone();
3899        let subnet_for_create = subnet_str.clone();
3900        if use_transparent {
3901            let uplink = zlayer_hns::adapter::find_primary_adapter()
3902                .map_err(|e| OverlaydError::Other(format!("find_primary_adapter: {e}")))?;
3903            tracing::warn!(uplink = %uplink, "ZLAYER_HCN_UPLINK_ADAPTER set: creating HCN *Transparent* overlay bound to a physical NIC");
3904            tokio::task::spawn_blocking(move || {
3905                zlayer_hns::network::Network::create_transparent(
3906                    net_id,
3907                    &net_name_for_create,
3908                    &subnet_for_create,
3909                    &uplink,
3910                )
3911            })
3912            .await
3913            .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?
3914            .map_err(|e| {
3915                OverlaydError::Overlay(format!("HcnCreateNetwork transparent ({net_name}): {e}"))
3916            })?;
3917        } else {
3918            tokio::task::spawn_blocking(move || {
3919                zlayer_hns::network::Network::create_internal(
3920                    net_id,
3921                    &net_name_for_create,
3922                    &subnet_for_create,
3923                )
3924            })
3925            .await
3926            .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?
3927            .map_err(|e| {
3928                OverlaydError::Overlay(format!("HcnCreateNetwork internal ({net_name}): {e}"))
3929            })?;
3930        }
3931
3932        // HCN's Static IPAM needs ~1-2s after network create to settle its
3933        // address pool; without this the first endpoint frequently fails with
3934        // HCN_E_ADDR_INVALID_OR_RESERVED.
3935        tokio::time::sleep(std::time::Duration::from_secs(2)).await;
3936
3937        tracing::info!(
3938            subnet = %subnet_str,
3939            mode = if use_transparent { "Transparent" } else { "Internal" },
3940            "created HCN overlay network"
3941        );
3942
3943        // Persist the marker so subsequent runs reuse this network by GUID and a
3944        // full uninstall knows to delete it. Best-effort.
3945        let mut marker = crate::network_state::NetworkState::load(&marker_path);
3946        marker.upsert(crate::network_state::ManagedNetwork {
3947            owner: crate::network_state::OWNER_BASE.to_string(),
3948            kind: if use_transparent {
3949                "hcn-transparent"
3950            } else {
3951                "hcn-internal"
3952            }
3953            .to_string(),
3954            name: net_name.clone(),
3955            id: format_guid_bare(net_id),
3956            subnet: subnet_str.clone(),
3957            // Base/Shared HCN network: no dedicated WireGuard identity.
3958            wg_port: None,
3959            wg_private_key: None,
3960            wg_public_key: None,
3961            interface: None,
3962        });
3963        if let Err(e) = marker.save(&marker_path) {
3964            tracing::warn!(error = %e, path = %marker_path.display(), "failed to persist agent network marker (network still reusable by name)");
3965        }
3966
3967        Ok(net_id)
3968    }
3969
3970    /// Ensure the SINGLE shared HCN **NAT** network exists on the host, reusing
3971    /// one recorded under the [`OWNER_SHARED_NAT`] marker owner (or discoverable
3972    /// by its derived name) and recording it on create. Reused across every
3973    /// `OverlayMode::Shared` service on this node.
3974    ///
3975    /// NAT gives Shared containers outbound connectivity and lets the userspace
3976    /// free-port L4 proxy (`proxy_manager.rs`) forward `host:FREEPORT` ->
3977    /// `container_ip:port` without a per-service vSwitch — the Windows analogue
3978    /// of the Linux node-wide shared bridge. Modeled on
3979    /// [`Self::ensure_overlay_network`] but keyed on [`OWNER_SHARED_NAT`] and
3980    /// forced to the NAT network type.
3981    ///
3982    /// Returns the network GUID.
3983    ///
3984    /// # Errors
3985    /// Propagates the underlying `zlayer_hns` error on create failure, or an
3986    /// error if the slice CIDR has no usable gateway host.
3987    #[cfg(target_os = "windows")]
3988    #[allow(clippy::too_many_lines)]
3989    async fn ensure_shared_nat_network(
3990        &mut self,
3991        slice_cidr: ipnet::IpNet,
3992    ) -> Result<windows::core::GUID, OverlaydError> {
3993        use windows::core::GUID;
3994
3995        let daemon_name = self.deployment_or_default();
3996        // Shared NAT network name: `<base overlay name>-shared` so it is
3997        // unambiguously distinct from the base network and per-service networks.
3998        let net_name = format!("{}-shared", overlay_network_name(&daemon_name));
3999        let owner = crate::network_state::OWNER_SHARED_NAT.to_string();
4000        let marker_path =
4001            zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();
4002
4003        // Fast path: marker names a network GUID that still exists; reopen it.
4004        let recorded_id = crate::network_state::NetworkState::load(&marker_path)
4005            .get(&owner)
4006            .filter(|entry| entry.kind == "hcn-nat")
4007            .and_then(|entry| GUID::try_from(entry.id.as_str()).ok());
4008        if let Some(recorded_id) = recorded_id {
4009            let reopened = tokio::task::spawn_blocking(move || {
4010                zlayer_hns::network::Network::open(recorded_id).ok()
4011            })
4012            .await
4013            .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?;
4014            if reopened.is_some() {
4015                tracing::info!(name = %net_name, "reusing shared HCN NAT network from marker");
4016                return Ok(recorded_id);
4017            }
4018        }
4019
4020        // Idempotency: reuse a host network whose queried name matches ours.
4021        let target_name = net_name.clone();
4022        let existing = tokio::task::spawn_blocking(move || -> Option<GUID> {
4023            let guids = zlayer_hns::network::list("{}").ok()?;
4024            for guid in guids {
4025                let Ok(network) = zlayer_hns::network::Network::open(guid) else {
4026                    continue;
4027                };
4028                if matches!(network.query("{}"), Ok(props) if props.name == target_name) {
4029                    return Some(guid);
4030                }
4031            }
4032            None
4033        })
4034        .await
4035        .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?;
4036
4037        if let Some(existing_id) = existing {
4038            tracing::info!(name = %net_name, "reusing existing shared HCN NAT network");
4039            return Ok(existing_id);
4040        }
4041
4042        let net_id = GUID::new()
4043            .map_err(|e| OverlaydError::Other(format!("GUID::new for shared NAT network: {e}")))?;
4044        let subnet_str = slice_cidr.to_string();
4045        let settings = shared_nat_settings(&net_name, &subnet_str).ok_or_else(|| {
4046            OverlaydError::Other(format!(
4047                "shared NAT network: slice CIDR '{subnet_str}' has no usable gateway host"
4048            ))
4049        })?;
4050
4051        let net_name_for_create = net_name.clone();
4052        tokio::task::spawn_blocking(move || {
4053            zlayer_hns::network::Network::create(net_id, &settings)
4054        })
4055        .await
4056        .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?
4057        .map_err(|e| OverlaydError::Overlay(format!("HcnCreateNetwork NAT ({net_name}): {e}")))?;
4058        let _ = net_name_for_create;
4059
4060        // HCN's IPAM needs ~1-2s after network create to settle its address pool
4061        // (same wait as the base/Internal networks).
4062        tokio::time::sleep(std::time::Duration::from_secs(2)).await;
4063
4064        tracing::info!(subnet = %subnet_str, "created shared HCN NAT network");
4065
4066        let mut marker = crate::network_state::NetworkState::load(&marker_path);
4067        marker.upsert(crate::network_state::ManagedNetwork {
4068            owner,
4069            kind: "hcn-nat".to_string(),
4070            name: net_name.clone(),
4071            id: format_guid_bare(net_id),
4072            subnet: subnet_str.clone(),
4073            wg_port: None,
4074            wg_private_key: None,
4075            wg_public_key: None,
4076            interface: None,
4077        });
4078        if let Err(e) = marker.save(&marker_path) {
4079            tracing::warn!(error = %e, path = %marker_path.display(), "failed to persist shared NAT network marker (network still reusable by name)");
4080        }
4081
4082        Ok(net_id)
4083    }
4084
4085    /// Ensure the per-service HCN **Internal** network for `service` exists on
4086    /// the host, reusing one recorded under the `service:<name>` marker owner
4087    /// (or discoverable by its derived name) and recording it on create.
4088    ///
4089    /// This is the Windows analogue of the Linux per-service bridge: a
4090    /// dedicated (`OverlayMode::Dedicated`) service gets its OWN isolated HCN
4091    /// Internal network — an internal vSwitch with NO physical-NIC binding —
4092    /// distinct from the node's shared base overlay network. Containers attach
4093    /// to it (rather than the base network) so dedicated-service traffic is
4094    /// segregated at the vSwitch layer. Modeled on [`Self::ensure_overlay_network`]
4095    /// but keyed on [`owner_for_service`] and forced to the Internal type (never
4096    /// Transparent — the on-box test asserts zero external vSwitches for
4097    /// dedicated services).
4098    ///
4099    /// Returns the network GUID.
4100    ///
4101    /// # Errors
4102    /// Propagates the underlying `zlayer_hns` error on create failure.
4103    #[cfg(target_os = "windows")]
4104    #[allow(clippy::too_many_lines)]
4105    async fn ensure_service_network(
4106        &mut self,
4107        service: &str,
4108        subnet: ipnet::IpNet,
4109    ) -> Result<windows::core::GUID, OverlaydError> {
4110        use windows::core::GUID;
4111
4112        let daemon_name = self.deployment_or_default();
4113        // Per-service network name: `<base overlay name>-svc-<service>` so it is
4114        // unambiguously distinct from the base network and from other services.
4115        let net_name = format!("{}-svc-{service}", overlay_network_name(&daemon_name));
4116        let owner = owner_for_service(service);
4117        let marker_path =
4118            zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();
4119
4120        // Fast path: marker names a network GUID that still exists; reopen it.
4121        // Only honor the recorded id when it belongs to an HCN-internal entry —
4122        // a Dedicated WireGuard marker (`kind == "wg-dedicated"`) stores the
4123        // transport public key in `id`, NOT an HCN GUID, so it must be ignored
4124        // for HCN reuse.
4125        let recorded_hcn_id = crate::network_state::NetworkState::load(&marker_path)
4126            .get(&owner)
4127            .filter(|entry| entry.kind == "hcn-internal")
4128            .and_then(|entry| GUID::try_from(entry.id.as_str()).ok());
4129        if let Some(recorded_id) = recorded_hcn_id {
4130            let reopened = tokio::task::spawn_blocking(move || {
4131                zlayer_hns::network::Network::open(recorded_id).ok()
4132            })
4133            .await
4134            .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?;
4135            if reopened.is_some() {
4136                tracing::info!(name = %net_name, service = %service, "reusing per-service HCN network from marker");
4137                return Ok(recorded_id);
4138            }
4139        }
4140
4141        // Idempotency: reuse a host network whose queried name matches ours.
4142        let target_name = net_name.clone();
4143        let existing = tokio::task::spawn_blocking(move || -> Option<GUID> {
4144            let guids = zlayer_hns::network::list("{}").ok()?;
4145            for guid in guids {
4146                let Ok(network) = zlayer_hns::network::Network::open(guid) else {
4147                    continue;
4148                };
4149                if matches!(network.query("{}"), Ok(props) if props.name == target_name) {
4150                    return Some(guid);
4151                }
4152            }
4153            None
4154        })
4155        .await
4156        .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?;
4157
4158        if let Some(existing_id) = existing {
4159            tracing::info!(name = %net_name, service = %service, "reusing existing per-service HCN network");
4160            return Ok(existing_id);
4161        }
4162
4163        let net_id = GUID::new()
4164            .map_err(|e| OverlaydError::Other(format!("GUID::new for per-service network: {e}")))?;
4165        let subnet_str = subnet.to_string();
4166
4167        // ALWAYS Internal for a dedicated service — never Transparent. The
4168        // dedicated requirement is isolation; an Internal network binds NO
4169        // physical NIC (no external vSwitch), which is what the on-box test
4170        // asserts.
4171        let net_name_for_create = net_name.clone();
4172        let subnet_for_create = subnet_str.clone();
4173        tokio::task::spawn_blocking(move || {
4174            zlayer_hns::network::Network::create_internal(
4175                net_id,
4176                &net_name_for_create,
4177                &subnet_for_create,
4178            )
4179        })
4180        .await
4181        .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?
4182        .map_err(|e| {
4183            OverlaydError::Overlay(format!("HcnCreateNetwork internal ({net_name}): {e}"))
4184        })?;
4185
4186        // HCN's Static IPAM needs ~1-2s after network create to settle its
4187        // address pool; without this the first endpoint frequently fails with
4188        // HCN_E_ADDR_INVALID_OR_RESERVED (same wait as the base network).
4189        tokio::time::sleep(std::time::Duration::from_secs(2)).await;
4190
4191        tracing::info!(
4192            service = %service,
4193            subnet = %subnet_str,
4194            "created per-service HCN Internal network"
4195        );
4196
4197        // Persist the marker (owner = `service:<name>`, kind = `hcn-internal`)
4198        // so subsequent runs reuse this network by GUID and a full uninstall
4199        // (`purge_managed_networks`, which sweeps every `kind` starting with
4200        // `hcn`) deletes it. Best-effort.
4201        //
4202        // A dedicated Windows service shares the SAME owner key for two facts:
4203        // the dedicated WireGuard identity (written by the cross-platform core
4204        // in `setup_service_overlay_dedicated`, kind `wg-dedicated`) and this
4205        // HCN network's GUID. The marker is keyed by owner, so carry the WG
4206        // identity fields over when we rewrite the entry to `hcn-internal` — the
4207        // single entry then holds both the HCN GUID (in `id`) and the WG
4208        // identity (in the `wg_*`/`interface` fields), and the WG private key
4209        // survives restarts. (The core re-asserts the `wg-dedicated` shape on
4210        // the next setup; this path re-asserts `hcn-internal` again right after
4211        // — both are self-healing because the network is also reusable by name.)
4212        let mut marker = crate::network_state::NetworkState::load(&marker_path);
4213        let carried = marker.get(&owner).cloned();
4214        marker.upsert(crate::network_state::ManagedNetwork {
4215            owner,
4216            kind: "hcn-internal".to_string(),
4217            name: net_name.clone(),
4218            id: format_guid_bare(net_id),
4219            subnet: subnet_str.clone(),
4220            wg_port: carried.as_ref().and_then(|c| c.wg_port),
4221            wg_private_key: carried.as_ref().and_then(|c| c.wg_private_key.clone()),
4222            wg_public_key: carried.as_ref().and_then(|c| c.wg_public_key.clone()),
4223            interface: carried.as_ref().and_then(|c| c.interface.clone()),
4224        });
4225        if let Err(e) = marker.save(&marker_path) {
4226            tracing::warn!(service = %service, error = %e, path = %marker_path.display(), "failed to persist per-service network marker (network still reusable by name)");
4227        }
4228
4229        Ok(net_id)
4230    }
4231
4232    /// Resolve the per-isolation-network subnet for `net`, carving a fixed-size
4233    /// sub-block out of the node slice deterministically by name hash.
4234    ///
4235    /// Isolation networks attach onto a dedicated HCN Internal vSwitch and need
4236    /// their OWN address pool (never the node slice's shared pool) so a member's
4237    /// IP is on-link with its network's gateway. Unlike dedicated services,
4238    /// isolation networks aren't registered in the cluster's
4239    /// [`ServiceSubnetRegistry`] (a standalone isolated container may use the
4240    /// base overlay, where no `SetupServiceOverlay` ran), so the subnet is
4241    /// derived locally and deterministically: the node slice is split into
4242    /// `/<sub_prefix>` blocks and the network name selects one by hash. The
4243    /// derivation is stable across restarts (same name -> same block) so a
4244    /// reused HCN network keeps the same subnet.
4245    ///
4246    /// # Errors
4247    /// Returns an error if no node slice is assigned yet, the slice CIDR is
4248    /// unparseable, or the slice cannot be subnetted (e.g. already at the host
4249    /// prefix).
4250    #[cfg(target_os = "windows")]
4251    fn isolation_network_subnet(&self, net: &str) -> Result<ipnet::IpNet, OverlaydError> {
4252        use std::hash::{Hash, Hasher};
4253
4254        let slice = self.slice_cidr.ok_or_else(|| {
4255            OverlaydError::Other(
4256                "no node slice assigned yet (SetupGlobalOverlay with slice_cidr first)".to_string(),
4257            )
4258        })?;
4259        let slice_ipnet: ipnet::IpNet = slice.to_string().parse().map_err(|e| {
4260            OverlaydError::Other(format!("failed to parse slice CIDR {slice}: {e}"))
4261        })?;
4262
4263        // Carve the slice into /<sub_prefix> blocks. A `/28` (V4) gives ~13
4264        // usable container IPs per isolation network per node — enough for the
4265        // isolated-container use case — while leaving room for several distinct
4266        // isolation networks inside one node slice. Clamp to the slice prefix so
4267        // a slice already more specific than the target just yields itself.
4268        let sub_prefix: u8 = match slice_ipnet {
4269            ipnet::IpNet::V4(_) => 28u8.max(slice_ipnet.prefix_len()),
4270            ipnet::IpNet::V6(_) => 124u8.max(slice_ipnet.prefix_len()),
4271        };
4272
4273        let blocks: Vec<ipnet::IpNet> = slice_ipnet
4274            .subnets(sub_prefix)
4275            .map_err(|e| {
4276                OverlaydError::Other(format!(
4277                    "failed to subnet slice {slice_ipnet} into /{sub_prefix} blocks: {e}"
4278                ))
4279            })?
4280            .collect();
4281        if blocks.is_empty() {
4282            return Err(OverlaydError::Other(format!(
4283                "slice {slice_ipnet} yielded no /{sub_prefix} blocks for isolation network {net}"
4284            )));
4285        }
4286
4287        let mut hasher = std::collections::hash_map::DefaultHasher::new();
4288        net.hash(&mut hasher);
4289        // `% blocks.len()` is always < blocks.len() <= usize::MAX, so this never
4290        // truncates; `try_from` keeps clippy happy without an unchecked cast.
4291        let idx = usize::try_from(hasher.finish() % blocks.len() as u64).unwrap_or(0);
4292        Ok(blocks[idx])
4293    }
4294
4295    /// Ensure the per-isolation-network HCN **Internal** network for `net` exists
4296    /// on the host, reusing one recorded under the
4297    /// [`owner_for_isolation_network`] marker owner (or discoverable by its
4298    /// derived name) and recording it on create.
4299    ///
4300    /// This is the Windows mechanism for per-network L3 isolation: every
4301    /// `ZLayer` "isolated" network gets its OWN HCN Internal vSwitch — an
4302    /// internal vSwitch with NO physical-NIC binding. HCN Internal vSwitches are
4303    /// mutually isolated by default, so same-network members (sharing this
4304    /// vSwitch) reach each other + egress + the node, while members of a
4305    /// different isolation network land on a different vSwitch and cannot reach
4306    /// them. No ACLs, no per-member churn. Modeled on
4307    /// [`Self::ensure_service_network`] but keyed on
4308    /// [`owner_for_isolation_network`] and named `<overlay>-iso-<net>`.
4309    ///
4310    /// Returns the network GUID.
4311    ///
4312    /// # Errors
4313    /// Propagates the underlying `zlayer_hns` error on create failure.
4314    #[cfg(target_os = "windows")]
4315    async fn ensure_isolation_network(
4316        &mut self,
4317        net: &str,
4318        subnet: ipnet::IpNet,
4319    ) -> Result<windows::core::GUID, OverlaydError> {
4320        use windows::core::GUID;
4321
4322        let daemon_name = self.deployment_or_default();
4323        // Per-isolation-network name: `<base overlay name>-iso-<net>` so it is
4324        // unambiguously distinct from the base network and per-service networks.
4325        let net_name = format!("{}-iso-{net}", overlay_network_name(&daemon_name));
4326        let owner = crate::network_state::owner_for_isolation_network(net);
4327        let marker_path =
4328            zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();
4329
4330        // Fast path: marker names a network GUID that still exists; reopen it.
4331        let recorded_hcn_id = crate::network_state::NetworkState::load(&marker_path)
4332            .get(&owner)
4333            .filter(|entry| entry.kind == "hcn-internal")
4334            .and_then(|entry| GUID::try_from(entry.id.as_str()).ok());
4335        if let Some(recorded_id) = recorded_hcn_id {
4336            let reopened = tokio::task::spawn_blocking(move || {
4337                zlayer_hns::network::Network::open(recorded_id).ok()
4338            })
4339            .await
4340            .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?;
4341            if reopened.is_some() {
4342                tracing::info!(name = %net_name, network = %net, "reusing per-isolation-network HCN network from marker");
4343                return Ok(recorded_id);
4344            }
4345        }
4346
4347        // Idempotency: reuse a host network whose queried name matches ours.
4348        let target_name = net_name.clone();
4349        let existing = tokio::task::spawn_blocking(move || -> Option<GUID> {
4350            let guids = zlayer_hns::network::list("{}").ok()?;
4351            for guid in guids {
4352                let Ok(network) = zlayer_hns::network::Network::open(guid) else {
4353                    continue;
4354                };
4355                if matches!(network.query("{}"), Ok(props) if props.name == target_name) {
4356                    return Some(guid);
4357                }
4358            }
4359            None
4360        })
4361        .await
4362        .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?;
4363
4364        if let Some(existing_id) = existing {
4365            tracing::info!(name = %net_name, network = %net, "reusing existing per-isolation-network HCN network");
4366            return Ok(existing_id);
4367        }
4368
4369        let net_id = GUID::new().map_err(|e| {
4370            OverlaydError::Other(format!("GUID::new for per-isolation-network network: {e}"))
4371        })?;
4372        let subnet_str = subnet.to_string();
4373
4374        // ALWAYS Internal for an isolation network — never Transparent. The
4375        // isolation requirement is exactly the Internal-vSwitch property: no
4376        // physical-NIC binding, mutually isolated from other Internal vSwitches.
4377        let net_name_for_create = net_name.clone();
4378        let subnet_for_create = subnet_str.clone();
4379        tokio::task::spawn_blocking(move || {
4380            zlayer_hns::network::Network::create_internal(
4381                net_id,
4382                &net_name_for_create,
4383                &subnet_for_create,
4384            )
4385        })
4386        .await
4387        .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?
4388        .map_err(|e| {
4389            OverlaydError::Overlay(format!("HcnCreateNetwork internal ({net_name}): {e}"))
4390        })?;
4391
4392        // HCN's Static IPAM needs ~1-2s after network create to settle its
4393        // address pool; without this the first endpoint frequently fails with
4394        // HCN_E_ADDR_INVALID_OR_RESERVED (same wait as the per-service network).
4395        tokio::time::sleep(std::time::Duration::from_secs(2)).await;
4396
4397        tracing::info!(
4398            network = %net,
4399            subnet = %subnet_str,
4400            "created per-isolation-network HCN Internal network"
4401        );
4402
4403        // Persist the marker (owner = `iso:<net>`, kind = `hcn-internal`) so
4404        // subsequent runs reuse this network by GUID and a full uninstall
4405        // (`purge_managed_networks`, which sweeps every `kind` starting with
4406        // `hcn`) deletes it. Best-effort.
4407        let mut marker = crate::network_state::NetworkState::load(&marker_path);
4408        marker.upsert(crate::network_state::ManagedNetwork {
4409            owner,
4410            kind: "hcn-internal".to_string(),
4411            name: net_name.clone(),
4412            id: format_guid_bare(net_id),
4413            subnet: subnet_str.clone(),
4414            // Isolation HCN network: no dedicated WireGuard identity.
4415            wg_port: None,
4416            wg_private_key: None,
4417            wg_public_key: None,
4418            interface: None,
4419        });
4420        if let Err(e) = marker.save(&marker_path) {
4421            tracing::warn!(network = %net, error = %e, path = %marker_path.display(), "failed to persist per-isolation-network marker (network still reusable by name)");
4422        }
4423
4424        Ok(net_id)
4425    }
4426
4427    /// Resolve the dedicated per-service subnet for `service`, if the service
4428    /// runs in `OverlayMode::Dedicated` on this node.
4429    ///
4430    /// Source of truth, in order:
4431    /// 1. The live [`ServiceTransport`] in `service_transports` (the normal
4432    ///    case once `SetupServiceOverlay` has run this process).
4433    /// 2. A persisted `hcn-internal` marker entry under
4434    ///    [`owner_for_service`]`(service)` — covers the window where the HCN
4435    ///    network exists from a prior run but the transport map is still empty.
4436    ///
4437    /// Returns `None` for Shared-mode services (attach onto the base network).
4438    #[cfg(target_os = "windows")]
4439    fn dedicated_service_subnet(&self, service: &str) -> Option<ipnet::IpNet> {
4440        if let Some(st) = self.service_transports.get(service) {
4441            return Some(st.subnet);
4442        }
4443        let marker_path =
4444            zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();
4445        crate::network_state::NetworkState::load(&marker_path)
4446            .get(&owner_for_service(service))
4447            .filter(|entry| entry.kind == "hcn-internal")
4448            .and_then(|entry| entry.subnet.parse::<ipnet::IpNet>().ok())
4449    }
4450
4451    /// The daemon name used for HCN network/owner naming, defaulting to
4452    /// `"zlayer"` when no deployment has been set yet.
4453    #[cfg(target_os = "windows")]
4454    fn deployment_or_default(&self) -> String {
4455        if self.deployment.is_empty() {
4456            "zlayer".to_string()
4457        } else {
4458            self.deployment.clone()
4459        }
4460    }
4461
4462    // -- peers ---------------------------------------------------------------
4463
4464    /// Resolve a [`PeerScope`] to the live [`OverlayTransport`] its ops target.
4465    ///
4466    /// `Global` -> the single cluster transport; `Service { service }` -> that
4467    /// service's dedicated per-service transport (Dedicated mode only).
4468    ///
4469    /// # Errors
4470    /// Returns an error if the global overlay is not up (for `Global`) or no
4471    /// dedicated overlay exists for the named service (for `Service`).
4472    fn transport_for_scope(&self, scope: &PeerScope) -> Result<&OverlayTransport, OverlaydError> {
4473        match scope {
4474            PeerScope::Global => self
4475                .global_transport
4476                .as_ref()
4477                .ok_or_else(|| OverlaydError::Other("global overlay not set up".into())),
4478            PeerScope::Service { service } => self
4479                .service_transports
4480                .get(service)
4481                .map(|s| &s.transport)
4482                .ok_or_else(|| {
4483                    OverlaydError::Other(format!("no dedicated overlay for service {service}"))
4484                }),
4485        }
4486    }
4487
4488    /// Add a peer to a resolved transport.
4489    ///
4490    /// # Errors
4491    /// Wraps the underlying transport error.
4492    async fn add_peer_on(
4493        transport: &OverlayTransport,
4494        peer: &PeerInfo,
4495    ) -> Result<(), OverlaydError> {
4496        transport
4497            .add_peer(peer)
4498            .await
4499            .map_err(|e| OverlaydError::Overlay(format!("add_peer failed: {e}")))
4500    }
4501
4502    /// Remove a peer (by base64 public key) from a resolved transport.
4503    ///
4504    /// # Errors
4505    /// Wraps the underlying transport error.
4506    async fn remove_peer_on(
4507        transport: &OverlayTransport,
4508        pubkey: &str,
4509    ) -> Result<(), OverlaydError> {
4510        transport
4511            .remove_peer(pubkey)
4512            .await
4513            .map_err(|e| OverlaydError::Overlay(format!("remove_peer failed: {e}")))
4514    }
4515
4516    /// Plumb a CIDR into a peer's `AllowedIPs` on a resolved transport.
4517    ///
4518    /// # Errors
4519    /// Returns an error when the CIDR is invalid or the UAPI write fails.
4520    async fn add_allowed_ip_on(
4521        transport: &OverlayTransport,
4522        pubkey: &str,
4523        cidr: &str,
4524    ) -> Result<(), OverlaydError> {
4525        let net: ipnet::IpNet = cidr
4526            .parse()
4527            .map_err(|e| OverlaydError::Other(format!("invalid CIDR {cidr}: {e}")))?;
4528        transport
4529            .add_allowed_ip(pubkey, net)
4530            .await
4531            .map_err(|e| OverlaydError::Overlay(format!("add_allowed_ip failed: {e}")))
4532    }
4533
4534    /// Remove a CIDR from a peer's `AllowedIPs` on a resolved transport.
4535    ///
4536    /// # Errors
4537    /// Returns an error when the CIDR is invalid or the UAPI write fails.
4538    async fn remove_allowed_ip_on(
4539        transport: &OverlayTransport,
4540        pubkey: &str,
4541        cidr: &str,
4542    ) -> Result<(), OverlaydError> {
4543        let net: ipnet::IpNet = cidr
4544            .parse()
4545            .map_err(|e| OverlaydError::Other(format!("invalid CIDR {cidr}: {e}")))?;
4546        transport
4547            .remove_allowed_ip(pubkey, net)
4548            .await
4549            .map_err(|e| OverlaydError::Overlay(format!("remove_allowed_ip failed: {e}")))
4550    }
4551
4552    // -- DNS -----------------------------------------------------------------
4553
4554    /// Register an overlay DNS A/AAAA record.
4555    fn register_dns(&mut self, name: String, ip: IpAddr) {
4556        self.dns_records.insert(name, ip);
4557    }
4558
4559    /// Remove an overlay DNS record.
4560    fn unregister_dns(&mut self, name: &str) {
4561        self.dns_records.remove(name);
4562    }
4563
4564    // -- NAT -----------------------------------------------------------------
4565
    /// Periodic NAT traversal maintenance: lazily start NAT (and the built-in
    /// relay server), re-probe STUN, refresh relays, and run the connect-half —
    /// hole-punching / relaying toward every peer whose direct endpoint has not
    /// produced a recent `WireGuard` handshake.
    ///
    /// Control flow, in order:
    /// 1. If no [`NatTraversal`] exists yet, resolve the config
    ///    (`self.nat_config` or [`NatConfig::default`]). A disabled config
    ///    returns `Ok(())` immediately.
    /// 2. Still on that first pass: start the relay server if one is configured
    ///    and not already running (failure is logged, not returned), gather
    ///    candidates (failure is logged and the tick returns `Ok(())` so the
    ///    next tick retries), then run the connect-half once.
    /// 3. On every pass that reaches this point: refresh the orchestrator,
    ///    stamp `nat_last_refresh`, and run the connect-half.
    ///
    /// No-op when NAT traversal is disabled in the resolved [`NatConfig`].
    ///
    /// # Errors
    /// Returns [`OverlaydError::Overlay`] when [`NatTraversal::refresh`] fails.
    /// Relay-start and candidate-gathering failures are only logged.
    async fn nat_maintenance_tick(&mut self) -> Result<(), OverlaydError> {
        // Lazily start NAT traversal on the first tick if a config asks for it.
        if self.nat_traversal.is_none() {
            let config = self.nat_config.clone().unwrap_or_default();
            if config.enabled {
                // Stand up the built-in relay server here (once) when the
                // resolved config carries a `relay_server`. The auth credential
                // MUST be cluster-wide-shared (every node's relay *client*
                // derives the same BLAKE2b key via `derive_auth_key`), so it
                // comes from `cluster_relay_credential` — the cluster HS256
                // secret the main daemon stamped into
                // `NatConfigSpec.relay_server.auth_credential`, NOT the node's
                // per-node WireGuard key. When no credential was supplied the
                // relay derives a key from the empty string (only same-config
                // nodes can use it).
                if let Some(relay_cfg) = config.relay_server.clone() {
                    // Guard against a second start: this branch is re-entered on
                    // every tick for as long as `nat_traversal` is still `None`
                    // (e.g. after a failed candidate gathering).
                    if self.relay_server.is_none() {
                        let credential = self.cluster_relay_credential.clone().unwrap_or_default();
                        let relay = RelayServer::new(&relay_cfg, &credential);
                        match relay.start().await {
                            Ok(bound) => {
                                tracing::info!(
                                    bound = %bound,
                                    external = %relay_cfg.external_addr,
                                    "Built-in relay server started"
                                );
                                self.relay_bound_addr = Some(bound);
                                self.relay_server = Some(relay);
                            }
                            Err(e) => {
                                // Best-effort: NAT traversal proceeds without a
                                // local relay server.
                                tracing::warn!(error = %e, "Built-in relay server failed to start");
                            }
                        }
                    }
                }

                let mut nat = NatTraversal::new(config, self.overlay_port);
                match nat.gather_candidates().await {
                    Ok(candidates) => {
                        tracing::info!(count = candidates.len(), "Gathered NAT candidates");
                        self.nat_last_refresh.store(now_unix(), Ordering::SeqCst);
                        // Only a successfully-gathered orchestrator is kept.
                        self.nat_traversal = Some(nat);
                    }
                    Err(e) => {
                        // `nat_traversal` stays `None`, so the next tick retries
                        // the whole lazy-start path.
                        tracing::warn!(error = %e, "NAT candidate gathering failed");
                        return Ok(());
                    }
                }
                // First-tick connect: try to establish toward every already-known
                // peer (peers added before NAT came up).
                // NOTE(review): execution falls through to the refresh block
                // below, which ends with a second connect pass in the same
                // tick. This first pass is the only one that runs if the
                // refresh fails — confirm the double attempt is intended.
                self.nat_connect_known_peers().await;
            } else {
                return Ok(());
            }
        }

        // Refresh STUN/relay state, then run the connect-half for peers that
        // still lack a recent handshake.
        if let Some(nat) = self.nat_traversal.as_mut() {
            match nat.refresh().await {
                Ok(changed) => {
                    if changed {
                        tracing::info!("NAT reflexive address changed during refresh");
                    }
                    self.nat_last_refresh.store(now_unix(), Ordering::SeqCst);
                }
                Err(e) => {
                    // A failed refresh aborts the tick before the connect pass.
                    return Err(OverlaydError::Overlay(format!(
                        "NAT maintenance tick failed: {e}"
                    )));
                }
            }
        }
        self.nat_connect_known_peers().await;
        Ok(())
    }
4651
4652    /// The NAT connect-half: for every peer with advertised candidates that has
4653    /// no recent `WireGuard` handshake, call [`NatTraversal::connect_to_peer`]
4654    /// (which itself updates the live device's peer endpoint) and record the
4655    /// resulting [`ConnectionType`].
4656    ///
4657    /// Best-effort: a peer with no live global transport, no candidates, or a
4658    /// failed traversal is left untouched (its persistent direct endpoint keeps
4659    /// retrying). Candidate sets are collected into a local `Vec` first so the
4660    /// borrow of `self.nat_traversal` / `self.global_transport` does not overlap
4661    /// the mutable borrow of `self.peer_connection_type`.
4662    async fn nat_connect_known_peers(&mut self) {
4663        // No host transport (VM-only overlay) or no NAT orchestrator → nothing
4664        // to connect on this node.
4665        let (Some(_), Some(_)) = (self.global_transport.as_ref(), self.nat_traversal.as_ref())
4666        else {
4667            return;
4668        };
4669        if self.peer_candidates.is_empty() {
4670            return;
4671        }
4672
4673        // Peers whose handshake is older than this cutoff (or never seen) are
4674        // candidates for a (re)connect attempt. WireGuard's default keepalive is
4675        // 25s; 3× that is a generous "the direct endpoint is clearly not
4676        // establishing" threshold that avoids churning healthy peers.
4677        let cutoff = now_unix().saturating_sub(75);
4678
4679        // Snapshot the (pubkey, candidates) work set up front to satisfy the
4680        // borrow checker (we borrow self.transport + self.nat below).
4681        let work: Vec<(String, Vec<Candidate>)> = self
4682            .peer_candidates
4683            .iter()
4684            .map(|(k, v)| (k.clone(), v.clone()))
4685            .collect();
4686
4687        let transport = self.global_transport.as_ref().expect("checked above");
4688        let nat = self.nat_traversal.as_ref().expect("checked above");
4689        let mut results: Vec<(String, ConnectionType)> = Vec::new();
4690
4691        for (pubkey, candidates) in &work {
4692            // Skip peers that already have a fresh handshake on the live device.
4693            match transport.check_peer_handshake(pubkey, cutoff).await {
4694                Ok(true) => continue,
4695                Ok(false) => {}
4696                Err(e) => {
4697                    tracing::debug!(peer = %pubkey, error = %e, "handshake check failed; attempting connect anyway");
4698                }
4699            }
4700            match nat.connect_to_peer(transport, pubkey, candidates).await {
4701                Ok(connection_type) => {
4702                    tracing::info!(
4703                        peer = %pubkey,
4704                        connection = %connection_type,
4705                        "NAT traversal established connection to peer"
4706                    );
4707                    results.push((pubkey.clone(), connection_type));
4708                }
4709                Err(e) => {
4710                    tracing::debug!(peer = %pubkey, error = %e, "NAT traversal could not connect to peer this tick");
4711                }
4712            }
4713        }
4714
4715        for (pubkey, ct) in results {
4716            self.peer_connection_type.insert(pubkey, ct);
4717        }
4718    }
4719
4720    /// Build a [`NatStatusWire`] from the live NAT orchestrator: this node's
4721    /// local candidates, the per-peer connection types recorded by the connect
4722    /// loop (with each peer's current remote endpoint parsed from the UAPI
4723    /// status dump), and the last STUN-refresh timestamp.
4724    async fn nat_status_snapshot(&self) -> NatStatusWire {
4725        let candidates = self
4726            .nat_traversal
4727            .as_ref()
4728            .map(|n| n.local_candidates().iter().map(candidate_to_wire).collect())
4729            .unwrap_or_default();
4730
4731        // Map hex-pubkey -> current remote endpoint from the live device's UAPI
4732        // dump. The dump keys peers by hex; `peer_connection_type` keys by
4733        // base64, so the join below converts each base64 key to hex.
4734        let mut endpoints: HashMap<String, String> = HashMap::new();
4735        if let Some(transport) = self.global_transport.as_ref() {
4736            if let Ok(dump) = transport.status().await {
4737                for p in parse_peer_status(&dump) {
4738                    if !p.endpoint.is_empty() {
4739                        endpoints.insert(p.public_key, p.endpoint);
4740                    }
4741                }
4742            }
4743        }
4744
4745        let peers = self
4746            .peer_connection_type
4747            .iter()
4748            .map(|(pubkey, ct)| {
4749                let remote_endpoint = zlayer_overlay::nat::pubkey_b64_to_hex(pubkey)
4750                    .and_then(|hex| endpoints.get(&hex).cloned());
4751                NatPeerWire {
4752                    node_id: pubkey.clone(),
4753                    connection_type: ct.to_string(),
4754                    remote_endpoint,
4755                }
4756            })
4757            .collect();
4758
4759        NatStatusWire {
4760            candidates,
4761            peers,
4762            last_refresh: self.nat_last_refresh.load(Ordering::SeqCst),
4763        }
4764    }
4765
4766    // -- status --------------------------------------------------------------
4767
4768    /// Build a [`StatusSnapshot`] from current overlay state.
4769    async fn status_snapshot(&self) -> StatusSnapshot {
4770        let mut peers: Vec<PeerStatus> = Vec::new();
4771        let public_key = self.transport_public_key.clone();
4772
4773        if let Some(transport) = self.global_transport.as_ref() {
4774            // Parse the UAPI dump for per-peer state. Best-effort: a parse
4775            // failure leaves the peer list empty rather than failing Status.
4776            if let Ok(dump) = transport.status().await {
4777                peers = parse_peer_status(&dump);
4778            }
4779        }
4780
4781        let service_count = u32::try_from(self.service_count()).unwrap_or(u32::MAX);
4782        let peer_count = u32::try_from(peers.len()).unwrap_or(u32::MAX);
4783
4784        // Per dedicated per-service overlay device: count its peers the same
4785        // way the global status does (parse the UAPI/status dump).
4786        let mut dedicated_services: Vec<DedicatedServiceStatus> = Vec::new();
4787        for (svc, st) in &self.service_transports {
4788            let peer_count = match st.transport.status().await {
4789                Ok(dump) => u32::try_from(parse_peer_status(&dump).len()).unwrap_or(u32::MAX),
4790                Err(_) => 0,
4791            };
4792            dedicated_services.push(DedicatedServiceStatus {
4793                service: svc.clone(),
4794                interface: st.interface.clone(),
4795                public_key: st.public_key.clone(),
4796                listen_port: st.listen_port,
4797                overlay_ip: st.overlay_ip,
4798                subnet: st.subnet.to_string(),
4799                peer_count,
4800            });
4801        }
4802
4803        StatusSnapshot {
4804            interface: self.global_interface.clone(),
4805            node_ip: self.node_ip,
4806            public_key,
4807            overlay_cidr: self.cluster_cidr.map(|c| c.to_string()),
4808            slice_cidr: self.slice_cidr.map(|c| c.to_string()),
4809            peer_count,
4810            service_count,
4811            peers,
4812            dedicated_services,
4813        }
4814    }
4815
4816    /// Number of per-service overlays set up on this node (Shared bridges /
4817    /// placeholders plus any Dedicated transports not already counted there).
4818    fn service_count(&self) -> usize {
4819        let extra_dedicated = self
4820            .service_transports
4821            .keys()
4822            .filter(|svc| !self.service_interfaces.contains_key(*svc))
4823            .count();
4824        self.service_interfaces.len() + extra_dedicated
4825    }
4826
4827    // -- config helper -------------------------------------------------------
4828
4829    fn build_config(
4830        &self,
4831        private_key: String,
4832        public_key: String,
4833        ip: IpAddr,
4834        mask: u8,
4835        listen_port: u16,
4836        physical_egress_ip: Option<IpAddr>,
4837    ) -> OverlayConfig {
4838        // Pick the source/advertised address for the WireGuard endpoint.
4839        //
4840        // Default is the family-matched UNSPECIFIED (`0.0.0.0` / `::`), which lets
4841        // the kernel pick a source per outgoing packet. When the caller resolved a
4842        // physical-egress IP (see `detect_physical_egress`) *and* its family
4843        // matches the overlay IP's family, we pin `local_endpoint` to that IP so
4844        // boringtun's data socket sources from — and advertises — the real NIC
4845        // rather than whatever the default route (possibly a VPN mesh) would pick.
4846        //
4847        // Family mismatch (e.g. physical egress is v4 but this overlay is v6) is
4848        // unusable for source selection, so we warn and fall back to UNSPECIFIED.
4849        //
4850        // boringtun limitation: boringtun 0.7's `DeviceConfig` exposes no way to
4851        // inject or pin the WireGuard DATA socket (its `uapi_fd` is the UAPI
4852        // CONTROL socket only), so `SO_BINDTODEVICE` on the data socket is
4853        // impossible today. Setting `local_endpoint` to the physical IP governs
4854        // source-address selection and the advertised endpoint, which is the
4855        // realistic scope of control we have.
4856        let unspecified = match ip {
4857            IpAddr::V4(_) => IpAddr::V4(Ipv4Addr::UNSPECIFIED),
4858            IpAddr::V6(_) => IpAddr::V6(Ipv6Addr::UNSPECIFIED),
4859        };
4860        let local_addr =
4861            if rootless_forces_unspecified(std::env::var_os("ZLAYER_ROOTLESS").is_some()) {
4862                // Rootless: detect_physical_egress() resolves pasta's in-netns tap IP
4863                // (e.g. 192.168.68.x), which is useless as a WG source/advertised
4864                // endpoint to remote peers. Force UNSPECIFIED; the kernel picks the
4865                // source per packet and the real reachable endpoint comes from the
4866                // advertise_addr path + pasta forwarding.
4867                unspecified
4868            } else {
4869                match physical_egress_ip {
4870                    Some(egress) if egress.is_ipv4() == ip.is_ipv4() => egress,
4871                    Some(egress) => {
4872                        tracing::warn!(
4873                            physical_egress_ip = %egress,
4874                            overlay_ip = %ip,
4875                            "physical egress IP family does not match overlay IP family; \
4876                             falling back to UNSPECIFIED for WireGuard local_endpoint"
4877                        );
4878                        unspecified
4879                    }
4880                    None => unspecified,
4881                }
4882            };
4883        let mut config = OverlayConfig {
4884            local_endpoint: SocketAddr::new(local_addr, listen_port),
4885            private_key,
4886            public_key,
4887            overlay_cidr: format!("{ip}/{mask}"),
4888            ..OverlayConfig::default()
4889        };
4890        if let Some(nat) = self.nat_config.clone() {
4891            config.nat = nat;
4892        }
4893        if let Some(dir) = self.uapi_sock_dir.clone() {
4894            config.uapi_sock_dir = dir;
4895        }
4896        config
4897    }
4898}
4899
4900/// Build an `Auto`-mode [`ServiceOverlayInfo`]: the per-service bridge/placeholder
4901/// name with every dedicated-device identity field left `None` (`Auto` carries
4902/// the service subnet on the single cluster-wide `WireGuard` device).
4903fn cluster_wg_overlay_info(name: String) -> ServiceOverlayInfo {
4904    ServiceOverlayInfo {
4905        name,
4906        mode: OverlayMode::Auto,
4907        wg_public_key: None,
4908        wg_port: None,
4909        overlay_ip: None,
4910        subnet: None,
4911    }
4912}
4913
4914/// Build a `Shared`-mode [`ServiceOverlayInfo`]: the shared node-wide
4915/// bridge/placeholder name with every dedicated-device identity field left
4916/// `None` (Shared mode shares the single cluster device and the node-wide
4917/// bridge; ports are exposed by the userspace free-port L4 proxy).
4918fn shared_overlay_info(name: String) -> ServiceOverlayInfo {
4919    ServiceOverlayInfo {
4920        name,
4921        mode: OverlayMode::Shared,
4922        wg_public_key: None,
4923        wg_port: None,
4924        overlay_ip: None,
4925        subnet: None,
4926    }
4927}
4928
4929/// Build a Dedicated-mode [`ServiceOverlayInfo`] from a dedicated device's
4930/// identity. `name` is the container-attach handle (bridge name on Linux, the
4931/// dedicated interface elsewhere).
4932fn dedicated_overlay_info(
4933    name: String,
4934    public_key: &str,
4935    listen_port: u16,
4936    overlay_ip: IpAddr,
4937    subnet: ipnet::IpNet,
4938) -> ServiceOverlayInfo {
4939    ServiceOverlayInfo {
4940        name,
4941        mode: OverlayMode::Dedicated,
4942        wg_public_key: Some(public_key.to_string()),
4943        wg_port: Some(listen_port),
4944        overlay_ip: Some(overlay_ip),
4945        subnet: Some(subnet.to_string()),
4946    }
4947}
4948
4949/// Convert a wire [`PeerSpec`] into a `zlayer_overlay::PeerInfo`.
4950///
4951/// # Errors
4952/// Returns an error if `endpoint` cannot be parsed as a `host:port`
4953/// [`SocketAddr`].
4954pub fn peer_spec_to_info(spec: &PeerSpec) -> Result<PeerInfo, OverlaydError> {
4955    let endpoint: SocketAddr = spec.endpoint.parse().map_err(|e| {
4956        OverlaydError::Other(format!("invalid peer endpoint {}: {e}", spec.endpoint))
4957    })?;
4958    Ok(PeerInfo::new(
4959        spec.public_key.clone(),
4960        endpoint,
4961        &spec.allowed_ips,
4962        std::time::Duration::from_secs(spec.persistent_keepalive_secs),
4963    ))
4964}
4965
4966/// Parse a `wg`-style UAPI/`status` dump into [`PeerStatus`] entries.
4967///
4968/// The dump is a series of `key=value` lines; each `public_key=` line starts a
4969/// new peer block, and subsequent `endpoint=` / `allowed_ip=` /
4970/// `latest_handshake=` lines belong to it.
4971fn parse_peer_status(dump: &str) -> Vec<PeerStatus> {
4972    let mut peers: Vec<PeerStatus> = Vec::new();
4973    let mut current: Option<PeerStatus> = None;
4974    let mut allowed: Vec<String> = Vec::new();
4975
4976    let flush = |peers: &mut Vec<PeerStatus>,
4977                 current: &mut Option<PeerStatus>,
4978                 allowed: &mut Vec<String>| {
4979        if let Some(mut p) = current.take() {
4980            p.allowed_ips = allowed.join(",");
4981            peers.push(p);
4982        }
4983        allowed.clear();
4984    };
4985
4986    for line in dump.lines() {
4987        let line = line.trim();
4988        let Some((key, value)) = line.split_once('=') else {
4989            continue;
4990        };
4991        match key.trim() {
4992            "public_key" | "peer" => {
4993                flush(&mut peers, &mut current, &mut allowed);
4994                current = Some(PeerStatus {
4995                    public_key: value.trim().to_string(),
4996                    endpoint: String::new(),
4997                    allowed_ips: String::new(),
4998                    last_handshake_unix_secs: 0,
4999                });
5000            }
5001            "endpoint" => {
5002                if let Some(p) = current.as_mut() {
5003                    p.endpoint = value.trim().to_string();
5004                }
5005            }
5006            "allowed_ip" | "allowed_ips" if current.is_some() => {
5007                allowed.push(value.trim().to_string());
5008            }
5009            "latest_handshake" | "last_handshake_time_sec" => {
5010                if let Some(p) = current.as_mut() {
5011                    p.last_handshake_unix_secs = value.trim().parse().unwrap_or(0);
5012                }
5013            }
5014            _ => {}
5015        }
5016    }
5017    flush(&mut peers, &mut current, &mut allowed);
5018    peers
5019}
5020
5021/// Convert a wire [`NatConfigSpec`] into the live [`NatConfig`] overlayd drives.
5022///
5023/// Sub-fields left at their zero value in the spec fall back to
5024/// [`NatConfig::default`]'s value (so a sparsely-populated spec still gets sane
5025/// STUN servers / timeouts). The `relay_server`'s `auth_credential` is stripped
5026/// here — it is carried separately on the server (`cluster_relay_credential`)
5027/// because `RelayServerConfig` has no credential field; this conversion only
5028/// produces the bind/external/max-sessions triple it does carry.
5029fn nat_config_spec_to_config(spec: NatConfigSpec) -> NatConfig {
5030    let defaults = NatConfig::default();
5031    NatConfig {
5032        enabled: spec.enabled,
5033        stun_servers: if spec.stun_servers.is_empty() {
5034            defaults.stun_servers
5035        } else {
5036            spec.stun_servers
5037                .into_iter()
5038                .map(|address| StunServerConfig {
5039                    address,
5040                    label: None,
5041                })
5042                .collect()
5043        },
5044        turn_servers: spec
5045            .turn_servers
5046            .into_iter()
5047            .map(|t| TurnServerConfig {
5048                address: t.addr,
5049                username: t.username,
5050                credential: t.credential,
5051                region: None,
5052            })
5053            .collect(),
5054        hole_punch_timeout_secs: if spec.hole_punch_timeout_secs == 0 {
5055            defaults.hole_punch_timeout_secs
5056        } else {
5057            spec.hole_punch_timeout_secs
5058        },
5059        stun_refresh_interval_secs: if spec.stun_refresh_interval_secs == 0 {
5060            defaults.stun_refresh_interval_secs
5061        } else {
5062            spec.stun_refresh_interval_secs
5063        },
5064        max_candidate_pairs: if spec.max_candidate_pairs == 0 {
5065            defaults.max_candidate_pairs
5066        } else {
5067            spec.max_candidate_pairs
5068        },
5069        relay_server: spec.relay_server.map(|r| RelayServerConfig {
5070            listen_port: r.listen_port,
5071            external_addr: r.external_addr,
5072            max_sessions: if r.max_sessions == 0 {
5073                default_max_relay_sessions()
5074            } else {
5075                r.max_sessions
5076            },
5077        }),
5078    }
5079}
5080
/// Relay `max_sessions` applied when a spec leaves the field at `0`. Kept in
/// step with `zlayer_overlay::nat::config`'s private
/// `default_max_relay_sessions` (100).
const fn default_max_relay_sessions() -> usize {
    const FALLBACK_MAX_SESSIONS: usize = 100;
    FALLBACK_MAX_SESSIONS
}
5086
5087/// Parse a wire [`NatCandidateWire`] into a live [`Candidate`].
5088///
5089/// Returns `None` when the address does not parse as a `host:port` socket
5090/// address or the type string is unrecognized. Priority is taken verbatim from
5091/// the wire (the advertiser already computed it) so the receiver honors the
5092/// peer's own preference ordering.
5093fn wire_to_candidate(w: &NatCandidateWire) -> Option<Candidate> {
5094    let address: SocketAddr = w.address.parse().ok()?;
5095    let candidate_type = match w.candidate_type.as_str() {
5096        "host" => CandidateType::Host,
5097        "server-reflexive" => CandidateType::ServerReflexive,
5098        "relay" => CandidateType::Relay,
5099        _ => return None,
5100    };
5101    let mut c = Candidate::new(candidate_type, address);
5102    c.priority = w.priority;
5103    Some(c)
5104}
5105
5106/// Convert a live [`Candidate`] into its wire [`NatCandidateWire`] form for a
5107/// `NatStatus` response.
5108fn candidate_to_wire(c: &Candidate) -> NatCandidateWire {
5109    let candidate_type = match c.candidate_type {
5110        CandidateType::Host => "host",
5111        CandidateType::ServerReflexive => "server-reflexive",
5112        CandidateType::Relay => "relay",
5113    };
5114    NatCandidateWire {
5115        candidate_type: candidate_type.to_string(),
5116        address: c.address.to_string(),
5117        priority: c.priority,
5118    }
5119}
5120
/// Whole seconds elapsed since the Unix epoch, or `0` if the system clock
/// reports a time before the epoch.
fn now_unix() -> u64 {
    let since_epoch = std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH);
    since_epoch.map_or(0, |elapsed| elapsed.as_secs())
}
5128
/// Offset (relative to the slice's network address) reserved for the node's
/// own overlay IP. Offset 1 is always the first usable host of the slice, so
/// the node IP is deterministic (`base + 1`) regardless of allocation order.
///
/// Typed `u64` to match the allocator's `next_offset` counter, which starts
/// one past this value so the reserved address is never issued.
const NODE_RESERVED_OFFSET: u64 = 1;
5133
5134/// Simple IP address allocator supporting both IPv4 and IPv6, bounded to a
5135/// specific CIDR (typically a per-node `/28` slice). Allocations past the last
5136/// usable host return an exhaustion error.
5137///
5138/// Offset [`NODE_RESERVED_OFFSET`] (the first usable host) is reserved for the
5139/// node's own overlay IP and is never handed out by [`IpAllocator::allocate`],
5140/// so the node IP stays deterministic across restarts and immune to container
5141/// allocation order. Use [`IpAllocator::node_ip`] to read it.
5142struct IpAllocator {
5143    /// CIDR the allocator is bounded to.
5144    cidr: IpNetwork,
5145    /// Base (network) address of the CIDR.
5146    base: IpAddr,
5147    /// Monotonic counter for the next allocation offset relative to `base`.
5148    /// Starts at [`NODE_RESERVED_OFFSET`] + 1 so the node's reserved IP is
5149    /// never returned to a container.
5150    next_offset: AtomicU64,
5151    /// IPs returned by `release(...)`. `allocate()` drains this first before
5152    /// incrementing `next_offset`.
5153    released: parking_lot::Mutex<Vec<IpAddr>>,
5154}
5155
5156impl IpAllocator {
5157    fn new(cidr: IpNetwork) -> Self {
5158        Self {
5159            base: cidr.network(),
5160            cidr,
5161            // Reserve offset 1 for the node's own overlay IP; container
5162            // allocation starts at offset 2.
5163            next_offset: AtomicU64::new(NODE_RESERVED_OFFSET + 1),
5164            released: parking_lot::Mutex::new(Vec::new()),
5165        }
5166    }
5167
5168    /// The node's own overlay IP for this slice: the first usable host
5169    /// (`base + 1`), reserved so no container ever receives it. Deterministic
5170    /// for a given slice CIDR, independent of allocation order or restarts.
5171    fn node_ip(&self) -> IpAddr {
5172        self.compute_addr(NODE_RESERVED_OFFSET)
5173    }
5174
5175    #[allow(clippy::cast_possible_truncation)]
5176    fn compute_addr(&self, offset: u64) -> IpAddr {
5177        match self.base {
5178            IpAddr::V4(base_v4) => {
5179                let base_u32 = u32::from_be_bytes(base_v4.octets());
5180                let addr = base_u32.wrapping_add(offset as u32);
5181                IpAddr::V4(Ipv4Addr::from(addr.to_be_bytes()))
5182            }
5183            IpAddr::V6(base_v6) => {
5184                let base_u128 = u128::from(base_v6);
5185                let addr = base_u128.wrapping_add(u128::from(offset));
5186                IpAddr::V6(Ipv6Addr::from(addr))
5187            }
5188        }
5189    }
5190
5191    /// Allocate the next IP in the slice, reusing released IPs first.
5192    ///
5193    /// # Errors
5194    /// Returns [`OverlaydError::Overlay`] when the CIDR is exhausted.
5195    fn allocate(&self) -> Result<IpAddr, OverlaydError> {
5196        if let Some(ip) = self.released.lock().pop() {
5197            return Ok(ip);
5198        }
5199        let offset = self.next_offset.fetch_add(1, Ordering::SeqCst);
5200        let addr = self.compute_addr(offset);
5201
5202        let in_cidr = self.cidr.contains(addr);
5203        let is_v4_broadcast = matches!(
5204            (&self.cidr, &addr),
5205            (IpNetwork::V4(v4), IpAddr::V4(a)) if *a == v4.broadcast()
5206        );
5207        if !in_cidr || is_v4_broadcast {
5208            return Err(OverlaydError::Overlay(format!(
5209                "IP allocator exhausted: next address {addr} is outside slice {}",
5210                self.cidr
5211            )));
5212        }
5213        Ok(addr)
5214    }
5215
5216    /// Return an IP to the free pool. Idempotent. The node's reserved IP is
5217    /// never accepted back into the pool so it can never be handed to a
5218    /// container by a later `allocate()`.
5219    fn release(&self, ip: IpAddr) {
5220        if ip == self.node_ip() {
5221            return;
5222        }
5223        let mut released = self.released.lock();
5224        if !released.contains(&ip) {
5225            released.push(ip);
5226        }
5227    }
5228}
5229
5230// -- Windows HCN helpers (ported from the agent's hcs runtime) --------------
5231
5232/// Owner tag stamped onto every HCN endpoint this server creates. The legacy
5233/// single-instance value is `"zlayer"`; any other name is used verbatim so two
5234/// daemons running side-by-side never sweep each other's endpoints.
5235#[cfg(target_os = "windows")]
fn owner_tag(daemon_name: &str) -> String {
    // The legacy single-instance name "zlayer" maps to the tag "zlayer" and
    // every other name is used verbatim, so the tag is always the daemon name
    // itself; no special case is needed.
    daemon_name.to_string()
}
5243
5244/// Name of the per-daemon HCN overlay network on the host. Legacy
5245/// single-instance value is `"zlayer-overlay"`; any other name becomes
5246/// `"<daemon_name>-overlay"`.
5247#[cfg(target_os = "windows")]
fn overlay_network_name(daemon_name: &str) -> String {
    // "<daemon_name>-overlay" already yields the legacy "zlayer-overlay" for
    // the single-instance name "zlayer", so one format covers both cases.
    format!("{daemon_name}-overlay")
}
5255
/// Assemble the [`zlayer_hns::schema::HostComputeNetwork`] document describing
/// the single shared HCN **NAT** network named `name` over `subnet`.
///
/// The document carries one `Static` IPAM with one subnet whose only route is
/// a `0.0.0.0/0` default route via the subnet gateway (network address + 1).
/// Declaring that route makes HCN reserve only the gateway, the same
/// `HCN_E_ADDR_INVALID_OR_RESERVED` avoidance the Internal/Transparent paths
/// use. A NAT network is the Windows analogue of the Linux node-wide shared
/// bridge: outbound connectivity and host-port forwarding without a
/// per-service vSwitch.
///
/// Returns `None` when `subnet` does not parse, is not IPv4, has a prefix of
/// `/31` or longer (no usable gateway host), or the gateway address would
/// overflow.
#[cfg(target_os = "windows")]
fn shared_nat_settings(name: &str, subnet: &str) -> Option<zlayer_hns::schema::HostComputeNetwork> {
    use zlayer_hns::schema::{HostComputeNetwork, Ipam, NetworkType, Route, SchemaVersion, Subnet};

    // HCN's NAT IPAM is IPv4 in the current schema, so anything else is rejected.
    let v4 = match subnet.parse::<ipnet::IpNet>().ok()? {
        ipnet::IpNet::V4(v4) => v4,
        _ => return None,
    };
    if v4.prefix_len() >= 31 {
        return None;
    }
    let gateway_bits = u32::from(v4.network()).checked_add(1)?;

    let default_route = Route {
        next_hop: std::net::Ipv4Addr::from(gateway_bits).to_string(),
        destination_prefix: "0.0.0.0/0".to_string(),
        metric: None,
    };
    let static_ipam = Ipam {
        ty: "Static".to_string(),
        subnets: vec![Subnet {
            ip_address_prefix: subnet.to_string(),
            routes: vec![default_route],
            policies: Vec::new(),
        }],
    };

    Some(HostComputeNetwork {
        id: None,
        name: name.to_string(),
        ty: NetworkType::Nat,
        policies: Vec::new(),
        mac_pool: None,
        dns: None,
        ipams: vec![static_ipam],
        flags: 0,
        schema_version: SchemaVersion::default(),
    })
}
5301
/// Render a GUID as the bare, lowercase, un-braced string HCN/HCS use to
/// identify a namespace inside a compute-system document's
/// `Container.Networking.Namespace` field (e.g. `aabbccdd-eeff-...`).
///
/// Starts from the GUID's `Debug` rendering, strips any surrounding braces,
/// and lowercases the ASCII letters.
#[cfg(target_os = "windows")]
fn format_guid_bare(id: windows::core::GUID) -> String {
    let rendered = format!("{id:?}");
    let unbraced = rendered.trim_matches(|ch: char| matches!(ch, '{' | '}'));
    unbraced.to_ascii_lowercase()
}
5311
/// Remove every host-level HCN network this server created for `daemon_name`,
/// then delete the persistent marker file. Intended for a full uninstall, never
/// a routine stop/restart. Every step is best-effort: failures are logged and
/// the purge continues. Synchronous, because the HCN calls block.
///
/// Works in three stages:
/// 1. delete each marker entry whose `kind` starts with `"hcn"` by its GUID;
/// 2. sweep all HCN networks and delete any whose name matches this daemon's
///    overlay network (covers a marker entry lost to a crash between create
///    and marker write);
/// 3. remove the marker file if it exists.
#[cfg(target_os = "windows")]
pub fn purge_managed_networks(data_dir: &Path, daemon_name: &str) {
    use windows::core::GUID;

    let marker_path = zlayer_paths::ZLayerDirs::new(data_dir.to_path_buf()).agent_network_state();
    let state = crate::network_state::NetworkState::load(&marker_path);

    // Stage 1: recorded HCN networks, addressed by GUID.
    for entry in &state.networks {
        if !entry.kind.starts_with("hcn") {
            continue;
        }
        let guid = match GUID::try_from(entry.id.as_str()) {
            Ok(guid) => guid,
            Err(e) => {
                tracing::warn!(id = %entry.id, error = %e, "managed network marker has unparseable GUID");
                continue;
            }
        };
        if let Err(e) = zlayer_hns::network::Network::delete(guid) {
            tracing::warn!(name = %entry.name, id = %entry.id, error = %e, "failed to delete managed HCN network");
        } else {
            tracing::info!(name = %entry.name, id = %entry.id, "deleted managed HCN network");
        }
    }

    // Stage 2: name sweep for an overlay network missing from the marker.
    let overlay_name = overlay_network_name(daemon_name);
    if let Ok(guids) = zlayer_hns::network::list("{}") {
        for guid in guids {
            let Ok(network) = zlayer_hns::network::Network::open(guid) else {
                continue;
            };
            let is_ours = matches!(network.query("{}"), Ok(props) if props.name == overlay_name);
            // Close the handle before attempting the delete.
            drop(network);
            if !is_ours {
                continue;
            }
            if let Err(e) = zlayer_hns::network::Network::delete(guid) {
                tracing::warn!(name = %overlay_name, error = %e, "failed to delete overlay network (name sweep)");
            } else {
                tracing::info!(name = %overlay_name, "deleted overlay HCN network (name sweep)");
            }
        }
    }

    // Stage 3: drop the marker file itself.
    if !marker_path.exists() {
        return;
    }
    if let Err(e) = std::fs::remove_file(&marker_path) {
        tracing::warn!(error = %e, path = %marker_path.display(), "failed to remove agent network marker");
    }
}
5371
5372#[cfg(test)]
5373mod tests {
5374    use super::*;
5375
    /// `is_orphan_service_bridge` must pick out exactly the `-b`/`-d` links
    /// that are neither in the live set nor in the protected set, and leave
    /// every other host link alone. The expected list is compared in fixture
    /// order.
    #[cfg(target_os = "linux")]
    #[test]
    fn orphan_bridge_selection() {
        use std::collections::HashSet;

        // Two live per-service bridges the daemon says SHOULD exist.
        let live: HashSet<&str> = ["zl-prod-0-web-b", "zl-prod-0-api-b"].into_iter().collect();
        // The active global device and node-wide shared bridge are protected,
        // plus a live in-memory dedicated device.
        let protected: HashSet<String> = ["zl-prod-0-g", "zl-prod-0-shared-sh", "zl-prod-0-db-d"]
            .into_iter()
            .map(String::from)
            .collect();

        // The full set of host links the kernel would report.
        let host_links = [
            // Live -> keep.
            "zl-prod-0-web-b",
            "zl-prod-0-api-b",
            // Protected global / shared / live dedicated device -> keep.
            "zl-prod-0-g",
            "zl-prod-0-shared-sh",
            "zl-prod-0-db-d",
            // Orphan bridges (the user's observed leaks) -> reclaim.
            "zl-1ca4568944-b",
            "zl-81c6bc17c7-b",
            // Orphan dedicated device -> reclaim.
            "zl-prod-0-gone-d",
            // Container veths owned by the PID-keyed sweep, never here -> skip.
            "veth-4242-s",
            "vc-4242-g",
            // Unrelated host links -> skip.
            "eth0",
            "lo",
            "docker0",
            "zl-not-a-bridge",
        ];

        // Filtering keeps the fixture order, so the expectation below is
        // order-sensitive.
        let orphans: Vec<&str> = host_links
            .into_iter()
            .filter(|n| is_orphan_service_bridge(n, &live, &protected))
            .collect();

        assert_eq!(
            orphans,
            vec!["zl-1ca4568944-b", "zl-81c6bc17c7-b", "zl-prod-0-gone-d"],
            "only orphaned -b/-d service bridges/devices are selected; \
             live, protected (-g/-sh/live -d), veth, and unrelated links are excluded"
        );
    }
5426
5427    #[test]
5428    fn peer_spec_to_info_parses_endpoint_and_keepalive() {
5429        let spec = PeerSpec {
5430            public_key: "base64key".to_string(),
5431            endpoint: "1.2.3.4:51820".to_string(),
5432            allowed_ips: "10.200.0.5/32,10.200.1.0/24".to_string(),
5433            persistent_keepalive_secs: 25,
5434            candidates: Vec::new(),
5435        };
5436        let info = peer_spec_to_info(&spec).expect("valid spec");
5437        assert_eq!(info.public_key, "base64key");
5438        assert_eq!(info.endpoint, "1.2.3.4:51820".parse().unwrap());
5439        assert_eq!(info.allowed_ips, "10.200.0.5/32,10.200.1.0/24");
5440        assert_eq!(
5441            info.persistent_keepalive_interval,
5442            std::time::Duration::from_secs(25)
5443        );
5444    }
5445
5446    #[test]
5447    fn peer_spec_to_info_rejects_bad_endpoint() {
5448        let spec = PeerSpec {
5449            public_key: "k".to_string(),
5450            endpoint: "not-a-socket-addr".to_string(),
5451            allowed_ips: String::new(),
5452            persistent_keepalive_secs: 0,
5453            candidates: Vec::new(),
5454        };
5455        assert!(peer_spec_to_info(&spec).is_err());
5456    }
5457
5458    #[test]
5459    fn interface_name_never_exceeds_limit() {
5460        let cases: Vec<(&[&str], &str)> = vec![
5461            (&["a"], "g"),
5462            (&["zlayer-manager"], "g"),
5463            (&["my-very-long-deployment-name-that-goes-on-and-on"], "g"),
5464            (&["zlayer", "manager"], "s"),
5465            (
5466                &["abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxyz"],
5467                "s",
5468            ),
5469            (&["x"], ""),
5470        ];
5471        for (parts, suffix) in &cases {
5472            let name = make_interface_name(parts, suffix);
5473            assert!(name.len() <= MAX_IFNAME_LEN, "Name '{name}' too long");
5474            assert!(name.starts_with("zl-"));
5475        }
5476    }
5477
    /// The node IP of an IPv4 slice is `base + 1`, is never returned by
    /// `allocate()`, and cannot be smuggled back into the free pool through
    /// `release()`.
    #[test]
    fn node_ip_is_first_usable_and_reserved() {
        let cidr: IpNetwork = "10.200.0.0/26".parse().unwrap();
        let alloc = IpAllocator::new(cidr);

        // The node IP is the deterministic first-usable host of the slice.
        let expected_node_ip: IpAddr = "10.200.0.1".parse().unwrap();
        assert_eq!(alloc.node_ip(), expected_node_ip);

        // Several container allocations must NEVER hand out the node IP, and
        // the node IP stays put regardless of allocation order.
        let mut handed_out = Vec::new();
        for _ in 0..10 {
            let ip = alloc.allocate().expect("slice not exhausted");
            assert_ne!(
                ip, expected_node_ip,
                "allocate() returned the reserved node IP"
            );
            handed_out.push(ip);
        }
        // Reservation holds after the allocations.
        assert_eq!(alloc.node_ip(), expected_node_ip);

        // First container allocation is offset 2 (base + 2), proving offset 1
        // (the node) was reserved and skipped.
        assert_eq!(handed_out[0], "10.200.0.2".parse::<IpAddr>().unwrap());

        // Releasing the node IP must not pollute the free pool with it.
        // `allocate()` drains the free pool first, so a leaked node IP would
        // be the very next address returned.
        alloc.release(expected_node_ip);
        let next = alloc.allocate().expect("slice not exhausted");
        assert_ne!(
            next, expected_node_ip,
            "node IP leaked back into the pool via release()"
        );
    }
5513
5514    #[test]
5515    fn node_ip_ipv6_is_first_usable() {
5516        let cidr: IpNetwork = "fd00:200::/64".parse().unwrap();
5517        let alloc = IpAllocator::new(cidr);
5518        let expected: IpAddr = "fd00:200::1".parse().unwrap();
5519        assert_eq!(alloc.node_ip(), expected);
5520        for _ in 0..5 {
5521            assert_ne!(alloc.allocate().unwrap(), expected);
5522        }
5523        assert_eq!(alloc.node_ip(), expected);
5524    }
5525
5526    #[test]
5527    fn interface_name_is_deterministic() {
5528        assert_eq!(
5529            make_interface_name(&["zlayer-manager"], "g"),
5530            make_interface_name(&["zlayer-manager"], "g")
5531        );
5532    }
5533
    /// `parse_peer_status` starts a new peer at every `public_key=` line,
    /// joins repeated `allowed_ip=` lines with commas, and carries
    /// `latest_handshake` through as Unix seconds.
    #[test]
    fn parse_peer_status_splits_blocks() {
        // Two peer blocks; the fixture lines must stay at column 0 because
        // only the first line break is escaped.
        let dump = "\
public_key=AAA
endpoint=1.2.3.4:51820
allowed_ip=10.200.0.2/32
allowed_ip=10.200.1.0/24
latest_handshake=1700000000
public_key=BBB
endpoint=5.6.7.8:51820
allowed_ip=10.200.0.3/32
latest_handshake=0
";
        let peers = parse_peer_status(dump);
        assert_eq!(peers.len(), 2);
        // First block: both allowed IPs merged into one comma-separated string.
        assert_eq!(peers[0].public_key, "AAA");
        assert_eq!(peers[0].endpoint, "1.2.3.4:51820");
        assert_eq!(peers[0].allowed_ips, "10.200.0.2/32,10.200.1.0/24");
        assert_eq!(peers[0].last_handshake_unix_secs, 1_700_000_000);
        // Second block: a zero handshake is reported as 0.
        assert_eq!(peers[1].public_key, "BBB");
        assert_eq!(peers[1].last_handshake_unix_secs, 0);
    }
5556
5557    #[tokio::test]
5558    async fn status_snapshot_before_setup_is_empty() {
5559        let server = OverlaydServer::new(std::path::PathBuf::from("/tmp/zlayer-overlayd-test"));
5560        let snap = server.status_snapshot().await;
5561        assert!(snap.interface.is_none());
5562        assert!(snap.node_ip.is_none());
5563        assert!(snap.public_key.is_none());
5564        assert_eq!(snap.peer_count, 0);
5565        assert_eq!(snap.service_count, 0);
5566        assert!(snap.peers.is_empty());
5567    }
5568
5569    #[tokio::test]
5570    async fn allocate_and_release_ip_round_trip() {
5571        let mut server = OverlaydServer::new(std::path::PathBuf::from("/tmp/zlayer-overlayd-test"));
5572        let a = server.allocate_ip("svc", false).expect("alloc a");
5573        let b = server.allocate_ip("svc", false).expect("alloc b");
5574        assert_ne!(a, b);
5575        server.release_ip(a);
5576        // Released IP is handed back before the monotonic counter advances.
5577        let c = server.allocate_ip("svc", false).expect("alloc c");
5578        assert_eq!(c, a);
5579    }
5580
5581    /// Build a throwaway server bound to a unique temp data dir so the marker
5582    /// file (rehydrated in `new`) never collides between tests.
5583    fn test_server() -> OverlaydServer {
5584        let dir = std::env::temp_dir().join(format!(
5585            "zlayer-overlayd-scope-{}-{}",
5586            std::process::id(),
5587            now_unix()
5588        ));
5589        OverlaydServer::new(dir)
5590    }
5591
5592    /// `nat_config_spec_to_config` fills sparse fields from `NatConfig::default`
5593    /// and copies populated ones verbatim (the Step-0 wire-config threading).
5594    #[test]
5595    fn nat_config_spec_to_config_fills_defaults_and_copies() {
5596        // Empty spec → defaults (default STUN servers, default timeouts).
5597        let cfg = nat_config_spec_to_config(NatConfigSpec::default());
5598        let d = NatConfig::default();
5599        assert_eq!(cfg.stun_servers.len(), d.stun_servers.len());
5600        assert_eq!(cfg.hole_punch_timeout_secs, d.hole_punch_timeout_secs);
5601        assert_eq!(cfg.max_candidate_pairs, d.max_candidate_pairs);
5602        assert!(cfg.relay_server.is_none());
5603
5604        // Populated spec → copied verbatim; relay credential is NOT on the
5605        // produced RelayServerConfig (it is carried separately on the server).
5606        let spec = NatConfigSpec {
5607            enabled: true,
5608            stun_servers: vec!["stun.example:3478".to_string()],
5609            turn_servers: vec![zlayer_types::nat_wire::TurnServerSpec {
5610                addr: "turn.example:3478".to_string(),
5611                username: "u".to_string(),
5612                credential: "p".to_string(),
5613            }],
5614            hole_punch_timeout_secs: 9,
5615            stun_refresh_interval_secs: 40,
5616            max_candidate_pairs: 3,
5617            relay_server: Some(zlayer_types::nat_wire::RelayServerSpec {
5618                listen_port: 3478,
5619                external_addr: "1.2.3.4:3478".to_string(),
5620                max_sessions: 7,
5621                auth_credential: Some("cluster-secret".to_string()),
5622            }),
5623        };
5624        let cfg = nat_config_spec_to_config(spec);
5625        assert_eq!(cfg.stun_servers.len(), 1);
5626        assert_eq!(cfg.stun_servers[0].address, "stun.example:3478");
5627        assert_eq!(cfg.turn_servers.len(), 1);
5628        assert_eq!(cfg.hole_punch_timeout_secs, 9);
5629        assert_eq!(cfg.max_candidate_pairs, 3);
5630        let relay = cfg.relay_server.expect("relay present");
5631        assert_eq!(relay.listen_port, 3478);
5632        assert_eq!(relay.max_sessions, 7);
5633    }
5634
    /// `wire_to_candidate` parses valid candidates and rejects bad ones;
    /// `candidate_to_wire` is its inverse for the type/address/priority triple.
    #[test]
    fn candidate_wire_conversions_round_trip() {
        let w = NatCandidateWire {
            candidate_type: "server-reflexive".to_string(),
            address: "203.0.113.5:51820".to_string(),
            priority: 50,
        };
        // Wire → live: the type string maps to the enum and the wire priority
        // is kept as-is.
        let c = wire_to_candidate(&w).expect("valid candidate");
        assert_eq!(c.candidate_type, CandidateType::ServerReflexive);
        assert_eq!(c.priority, 50);
        // Live → wire reproduces the original wire value exactly.
        let back = candidate_to_wire(&c);
        assert_eq!(back, w);

        // Bad address / type → None.
        assert!(wire_to_candidate(&NatCandidateWire {
            candidate_type: "host".to_string(),
            address: "not-an-addr".to_string(),
            priority: 1,
        })
        .is_none());
        assert!(wire_to_candidate(&NatCandidateWire {
            candidate_type: "bogus".to_string(),
            address: "1.2.3.4:5".to_string(),
            priority: 1,
        })
        .is_none());
    }
5664
    /// `AddPeer` carrying candidates records them in `peer_candidates`; a
    /// candidate-free add (or one with only-invalid candidates) leaves no entry,
    /// and `RemovePeer` clears them.
    ///
    /// This test exercises the add-with-one-valid-candidate path followed by
    /// the remove path, both through `handle`.
    #[tokio::test]
    async fn add_peer_records_candidates_and_remove_clears_them() {
        let mut server = test_server();
        let pubkey = "base64key".to_string();
        // Add a global-scope peer advertising a single valid host candidate.
        let resp = server
            .handle(OverlaydRequest::AddPeer {
                peer: PeerSpec {
                    public_key: pubkey.clone(),
                    endpoint: "1.2.3.4:51820".to_string(),
                    allowed_ips: "10.200.0.2/32".to_string(),
                    persistent_keepalive_secs: 25,
                    candidates: vec![NatCandidateWire {
                        candidate_type: "host".to_string(),
                        address: "192.168.1.5:51820".to_string(),
                        priority: 100,
                    }],
                },
                scope: PeerScope::Global,
            })
            .await;
        assert!(matches!(resp, OverlaydResponse::Ok));
        // Exactly one candidate is stored under the peer's public key.
        assert_eq!(
            server.peer_candidates.get(&pubkey).map(Vec::len),
            Some(1),
            "candidates must be recorded"
        );

        // Remove clears the candidate + connection-type bookkeeping.
        let resp = server
            .handle(OverlaydRequest::RemovePeer {
                pubkey: pubkey.clone(),
                scope: PeerScope::Global,
            })
            .await;
        assert!(matches!(resp, OverlaydResponse::Ok));
        assert!(!server.peer_candidates.contains_key(&pubkey));
    }
5705
5706    /// `NatStatus` returns a `NatStatusWire` (empty before any tick) — proving
5707    /// the new IPC pair is wired through `dispatch`.
5708    #[tokio::test]
5709    async fn nat_status_request_returns_wire_snapshot() {
5710        let mut server = test_server();
5711        let resp = server.handle(OverlaydRequest::NatStatus).await;
5712        match resp {
5713            OverlaydResponse::NatStatus(wire) => {
5714                assert!(wire.candidates.is_empty());
5715                assert!(wire.peers.is_empty());
5716            }
5717            other => panic!("expected NatStatus response, got {other:?}"),
5718        }
5719    }
5720
5721    /// True when the process can mutate netlink + `/proc/sys` (root). The
5722    /// teardown-completeness test below is `#[ignore]`d and additionally skips
5723    /// (not fails) when run via `--ignored` without privileges, matching the
5724    /// crate's "skip gracefully when not root" convention.
5725    #[cfg(target_os = "linux")]
5726    fn is_root() -> bool {
5727        // SAFETY: `geteuid` is a pure read of the caller's effective uid.
5728        #[allow(unsafe_code)]
5729        let euid = unsafe { libc::geteuid() };
5730        euid == 0
5731    }
5732
    /// End-to-end teardown completeness: populate the server's
    /// `created_veths` / `created_bridges` / `created_host_routes` tracking sets
    /// with REAL host resources created via netlink, snapshot
    /// `net.ipv4.ip_forward`, force it to `1` (recording the prior value in
    /// `prev_ipv4_forward` exactly as `enable_forwarding_for_attach` does), then
    /// drive the same teardown the `Shutdown` request triggers
    /// (`handle(OverlaydRequest::Shutdown)`), and assert: every tracked veth /
    /// bridge / route is gone at the kernel level AND `ip_forward` is restored to
    /// the snapshot.
    ///
    /// This is the regression for the full teardown fix (revert routes + veths +
    /// bridges + forwarding sysctl on shutdown). Names are unique and <=15 chars;
    /// a belt-and-braces cleanup runs before the asserts so a failed assertion
    /// still leaves the host clean. Skips (returns) when not root.
    #[cfg(target_os = "linux")]
    #[tokio::test(flavor = "multi_thread")]
    #[ignore = "needs CAP_NET_ADMIN + /proc/sys write; run on a privileged Linux host"]
    async fn shutdown_teardown_reverts_resources_and_ip_forward() {
        if !is_root() {
            eprintln!("skipping shutdown_teardown_reverts_resources_and_ip_forward: requires root");
            return;
        }

        // Interface names share a suffix taken from the low 24 bits of the
        // current Unix time (at most six hex digits), which keeps every name
        // within the 15-character limit asserted below.
        let suffix = format!("{:x}", now_unix() & 0xff_ffff);
        let veth_host = format!("vh-{suffix}");
        let veth_peer = format!("vp-{suffix}");
        let bridge = format!("zlb-{suffix}");
        assert!(veth_host.len() <= 15, "veth host name exceeds IFNAMSIZ");
        assert!(veth_peer.len() <= 15, "veth peer name exceeds IFNAMSIZ");
        assert!(bridge.len() <= 15, "bridge name exceeds IFNAMSIZ");

        // Destination and prefix of the /32 host route installed via the veth.
        let dest = IpAddr::V4(Ipv4Addr::new(10, 233, 0, 9));
        let prefix: u8 = 32;

        // --- create real host resources and register them with the server's
        // teardown-tracking sets, exactly as the attach paths do. ---
        crate::netlink::create_veth_pair(&veth_host, &veth_peer)
            .await
            .expect("create_veth_pair");
        crate::netlink::create_bridge(&bridge)
            .await
            .expect("create_bridge");
        crate::netlink::replace_route_via_dev(dest, prefix, &veth_host, None)
            .await
            .expect("replace_route_via_dev");

        let mut server = test_server();
        server.created_veths.insert(veth_host.clone());
        server.created_bridges.insert(bridge.clone());
        server
            .created_host_routes
            .push((dest, prefix, veth_host.clone()));

        // Snapshot ip_forward, then flip it to 1 and record the prior value the
        // way enable_forwarding_for_attach does so revert_forwarding restores it.
        // An unreadable sysctl is treated as "0".
        let snapshot =
            crate::netlink::read_sysctl("net.ipv4.ip_forward").unwrap_or_else(|_| "0".to_string());
        server.prev_ipv4_forward = Some(snapshot.clone());
        crate::netlink::set_sysctl("net.ipv4.ip_forward", "1").expect("set ip_forward=1");

        // --- drive teardown via the real Shutdown dispatch path ---
        let resp = server.handle(OverlaydRequest::Shutdown).await;
        assert!(
            matches!(resp, OverlaydResponse::Ok),
            "Shutdown should return Ok, got {resp:?}"
        );

        // Snapshot kernel state AFTER teardown.
        let veth_gone = !std::path::Path::new(&format!("/sys/class/net/{veth_host}")).exists();
        let bridge_gone = !std::path::Path::new(&format!("/sys/class/net/{bridge}")).exists();
        // The route counts as gone when `ip route show` cannot be run, exits
        // unsuccessfully, or prints nothing.
        let route_gone = {
            let target = format!("10.233.0.9/{prefix}");
            std::process::Command::new("ip")
                .args(["route", "show", &target, "dev", &veth_host])
                .output()
                .map_or(true, |o| !o.status.success() || o.stdout.is_empty())
        };
        let ip_forward_after = crate::netlink::read_sysctl("net.ipv4.ip_forward")
            .unwrap_or_else(|_| "unknown".to_string());

        // Belt-and-braces cleanup before asserting so the host stays clean even
        // if an assertion fails (teardown should have done all of this already).
        // Errors are ignored: the resources are expected to be gone.
        let _ = crate::netlink::delete_route_via_dev(dest, prefix, &veth_host).await;
        let _ = crate::netlink::delete_link_by_name(&veth_host).await;
        let _ = crate::netlink::delete_link_by_name(&veth_peer).await;
        let _ = crate::netlink::delete_link_by_name(&bridge).await;
        // Restore ip_forward to the snapshot regardless of teardown outcome.
        let _ = crate::netlink::set_sysctl("net.ipv4.ip_forward", &snapshot);

        // --- assertions ---
        assert!(veth_gone, "teardown should delete the tracked host veth");
        assert!(bridge_gone, "teardown should delete the tracked bridge");
        assert!(
            route_gone,
            "teardown should delete the tracked /32 host route"
        );
        // Compared after trimming so trailing whitespace in the sysctl value
        // does not matter.
        assert_eq!(
            ip_forward_after.trim(),
            snapshot.trim(),
            "teardown should restore net.ipv4.ip_forward to its pre-overlay value"
        );

        // Tracking sets must be drained by teardown so a re-run starts clean.
        assert!(
            server.created_veths.is_empty(),
            "created_veths should be drained by teardown"
        );
        assert!(
            server.created_bridges.is_empty(),
            "created_bridges should be drained by teardown"
        );
        assert!(
            server.created_host_routes.is_empty(),
            "created_host_routes should be drained by teardown"
        );
    }
5849
5850    #[test]
5851    fn build_config_uses_matching_physical_egress_ipv4() {
5852        let server = test_server();
5853        let overlay_ip: IpAddr = "10.200.0.1".parse().unwrap();
5854        let egress: IpAddr = "192.0.2.10".parse().unwrap();
5855        let config = server.build_config(
5856            "priv".to_string(),
5857            "pub".to_string(),
5858            overlay_ip,
5859            16,
5860            51820,
5861            Some(egress),
5862        );
5863        assert_eq!(config.local_endpoint, SocketAddr::new(egress, 51820));
5864    }
5865
5866    #[test]
5867    fn build_config_falls_back_to_unspecified_when_none() {
5868        let server = test_server();
5869        let overlay_ip: IpAddr = "10.200.0.1".parse().unwrap();
5870        let config = server.build_config(
5871            "priv".to_string(),
5872            "pub".to_string(),
5873            overlay_ip,
5874            16,
5875            51820,
5876            None,
5877        );
5878        assert_eq!(
5879            config.local_endpoint,
5880            SocketAddr::new(IpAddr::V4(Ipv4Addr::UNSPECIFIED), 51820)
5881        );
5882    }
5883
5884    #[test]
5885    fn build_config_falls_back_to_unspecified_on_family_mismatch() {
5886        let server = test_server();
5887        // Overlay is v6 but the resolved physical egress is v4: unusable for
5888        // source selection, so we must fall back to the v6 UNSPECIFIED address.
5889        let overlay_ip: IpAddr = "fd00::1".parse().unwrap();
5890        let egress: IpAddr = "192.0.2.10".parse().unwrap();
5891        let config = server.build_config(
5892            "priv".to_string(),
5893            "pub".to_string(),
5894            overlay_ip,
5895            64,
5896            51820,
5897            Some(egress),
5898        );
5899        assert_eq!(
5900            config.local_endpoint,
5901            SocketAddr::new(IpAddr::V6(Ipv6Addr::UNSPECIFIED), 51820)
5902        );
5903    }
5904
5905    #[test]
5906    fn rootless_forces_unspecified_decision() {
5907        // Rootless mode must force the WG local_endpoint to UNSPECIFIED because
5908        // detect_physical_egress() resolves pasta's in-netns tap IP there.
5909        assert!(rootless_forces_unspecified(true));
5910        // Non-rootless preserves the existing physical-egress selection path.
5911        assert!(!rootless_forces_unspecified(false));
5912    }
5913
5914    #[tokio::test]
5915    async fn transport_for_scope_global_requires_setup() {
5916        let server = test_server();
5917        // No global overlay set up yet -> Global scope errors. (Can't use
5918        // `expect_err` because `&OverlayTransport` is not `Debug`.)
5919        match server.transport_for_scope(&PeerScope::Global) {
5920            Ok(_) => panic!("global overlay should not be set up"),
5921            Err(OverlaydError::Other(m)) => {
5922                assert!(m.contains("global overlay not set up"), "got: {m}");
5923            }
5924            Err(other) => panic!("unexpected error: {other:?}"),
5925        }
5926    }
5927
5928    #[tokio::test]
5929    async fn transport_for_scope_unset_service_errors() {
5930        let server = test_server();
5931        match server.transport_for_scope(&PeerScope::Service {
5932            service: "x".to_string(),
5933        }) {
5934            Ok(_) => panic!("no dedicated overlay should exist for x"),
5935            Err(OverlaydError::Other(m)) => {
5936                assert_eq!(m, "no dedicated overlay for service x");
5937            }
5938            Err(other) => panic!("unexpected error: {other:?}"),
5939        }
5940    }
5941
5942    #[tokio::test]
5943    async fn add_peer_service_scope_before_setup_errors_via_dispatch() {
5944        let mut server = test_server();
5945        let resp = server
5946            .handle(OverlaydRequest::AddPeer {
5947                peer: PeerSpec {
5948                    public_key: "k".to_string(),
5949                    endpoint: "1.2.3.4:51820".to_string(),
5950                    allowed_ips: "10.200.0.2/32".to_string(),
5951                    persistent_keepalive_secs: 0,
5952                    candidates: Vec::new(),
5953                },
5954                scope: PeerScope::Service {
5955                    service: "x".to_string(),
5956                },
5957            })
5958            .await;
5959        match resp {
5960            OverlaydResponse::Err { message } => {
5961                assert_eq!(message, "no dedicated overlay for service x");
5962            }
5963            other => panic!("expected Err response, got {other:?}"),
5964        }
5965    }
5966
5967    /// The host-adapter degrade decision. A `create_interface()` failure is fatal
5968    /// on Linux (the kernel TUN IS the container data path) and degrades to a
5969    /// VM-only overlay on macOS/Windows (containers mesh VM-to-VM, the host
5970    /// utun/Wintun is off the data path). We can't provoke a real utun/Wintun
5971    /// syscall failure from a Linux test box, so we assert the pure `cfg!`-driven
5972    /// classifier instead: on this Linux test runner it must report fatal.
5973    /// (On macOS/Windows the same fn returns `false` — that arm is covered by the
5974    /// cfg, exercised natively, and cannot be asserted here.)
5975    #[test]
5976    fn host_adapter_failure_fatal_decision() {
5977        // Non-mandatory: platform-driven — fatal on Linux, degrade on macOS/Windows.
5978        assert_eq!(
5979            host_adapter_failure_is_fatal(false),
5980            cfg!(target_os = "linux"),
5981            "non-mandatory host-adapter failure is fatal only on Linux (kernel TUN is the data path)"
5982        );
5983        // Mandatory (host-shared macOS nodes where the utun IS the container data
5984        // path): fatal on every platform.
5985        assert!(
5986            host_adapter_failure_is_fatal(true),
5987            "a mandatory host adapter must make failure fatal on every platform"
5988        );
5989    }
5990
5991    /// A VM-only overlay leaves `global_transport == None`. The Global-scope peer
5992    /// dispatch must then WARN-AND-SKIP the on-device install (guests get the
5993    /// peer via guest-config push) rather than erroring — assert the dispatch
5994    /// returns `Ok` and still mirrors the peer into `global_peers`. This is the
5995    /// Linux-runnable proxy for the degraded host-adapter path: it exercises the
5996    /// exact `None`-tolerant branch without needing a real utun/Wintun failure.
5997    #[tokio::test]
5998    async fn add_global_peer_with_no_host_adapter_skips_and_records() {
5999        let mut server = test_server();
6000        assert!(
6001            server.global_transport.is_none(),
6002            "fresh server has no host adapter (VM-only precondition)"
6003        );
6004        let pubkey = "k".to_string();
6005        let resp = server
6006            .handle(OverlaydRequest::AddPeer {
6007                peer: PeerSpec {
6008                    public_key: pubkey.clone(),
6009                    endpoint: "1.2.3.4:51820".to_string(),
6010                    allowed_ips: "10.200.0.2/32".to_string(),
6011                    persistent_keepalive_secs: 0,
6012                    candidates: Vec::new(),
6013                },
6014                scope: PeerScope::Global,
6015            })
6016            .await;
6017        match resp {
6018            OverlaydResponse::Ok => {}
6019            other => panic!("expected Ok (warn-and-skip), got {other:?}"),
6020        }
6021        assert!(
6022            server.global_peers.contains_key(&pubkey),
6023            "Global peer must still be mirrored for guest-config push"
6024        );
6025    }
6026
6027    /// End-to-end Dedicated setup. Needs a real TUN device, so it is ignored by
6028    /// default and only runs on a privileged Linux host (mirrors the crate's
6029    /// other privileged overlay e2e tests).
6030    #[cfg(target_os = "linux")]
6031    #[tokio::test]
6032    #[ignore = "needs CAP_NET_ADMIN; run on a privileged Linux host"]
6033    async fn dedicated_setup_creates_distinct_device_and_routes_service_peer() {
6034        let mut server = test_server();
6035        // Bring up the global overlay first so the cluster CIDR + global device
6036        // exist (the dedicated device must get a distinct port and key).
6037        let global_name = server
6038            .setup_global_overlay(
6039                "dep".to_string(),
6040                "i0".to_string(),
6041                "10.200.0.0/16",
6042                Some("10.200.0.0/28"),
6043                zlayer_core::DEFAULT_WG_PORT,
6044                None,
6045                false,
6046            )
6047            .await
6048            .expect("global overlay up");
6049        assert!(!global_name.is_empty());
6050
6051        // Dedicated service setup.
6052        let info = server
6053            .setup_service_overlay("web", OverlayMode::Dedicated)
6054            .await
6055            .expect("dedicated service overlay up");
6056        assert_eq!(info.mode, OverlayMode::Dedicated);
6057        let port = info.wg_port.expect("dedicated port");
6058        assert_ne!(
6059            port, server.overlay_port,
6060            "dedicated device must not share the global port"
6061        );
6062
6063        let st = server
6064            .service_transports
6065            .get("web")
6066            .expect("service transport recorded");
6067        assert_eq!(st.listen_port, port);
6068        assert_ne!(
6069            st.interface, global_name,
6070            "dedicated interface must differ from global"
6071        );
6072        assert_eq!(
6073            Some(st.public_key.clone()),
6074            info.wg_public_key,
6075            "info pubkey matches recorded transport"
6076        );
6077        assert_ne!(
6078            Some(st.public_key.clone()),
6079            server.transport_public_key,
6080            "dedicated key must differ from global key"
6081        );
6082
6083        // A Service-scoped AddPeer must land on the dedicated device (succeeds),
6084        // proving scope routing targets the per-service transport.
6085        let resp = server
6086            .handle(OverlaydRequest::AddPeer {
6087                peer: PeerSpec {
6088                    public_key: {
6089                        let (_priv, pubk) = OverlayTransport::generate_keys().await.unwrap();
6090                        pubk
6091                    },
6092                    endpoint: "5.6.7.8:51999".to_string(),
6093                    allowed_ips: "10.201.0.2/32".to_string(),
6094                    persistent_keepalive_secs: 25,
6095                    candidates: Vec::new(),
6096                },
6097                scope: PeerScope::Service {
6098                    service: "web".to_string(),
6099                },
6100            })
6101            .await;
6102        assert!(
6103            matches!(resp, OverlaydResponse::Ok),
6104            "service-scoped add_peer should land on the dedicated device, got {resp:?}"
6105        );
6106    }
6107
6108    #[tokio::test]
6109    async fn guest_attach_requires_global_overlay() {
6110        // Without a global overlay (no node public key / transport) a
6111        // guest-managed attach must error rather than allocate anything.
6112        let mut server = test_server();
6113        let resp = server
6114            .handle(OverlaydRequest::AttachContainer {
6115                handle: AttachHandle::GuestManaged {
6116                    id: "vm-1".to_string(),
6117                },
6118                service: "web".to_string(),
6119                join_global: true,
6120                dns_server: None,
6121                dns_domain: None,
6122                ephemeral: false,
6123                isolation_network: None,
6124            })
6125            .await;
6126        match resp {
6127            OverlaydResponse::Err { message } => {
6128                assert!(
6129                    message.contains("global overlay to be set up"),
6130                    "got: {message}"
6131                );
6132            }
6133            other => panic!("expected Err response, got {other:?}"),
6134        }
6135        // Nothing was recorded.
6136        assert!(server.guest_attachments.is_empty());
6137    }
6138
6139    #[tokio::test]
6140    async fn detach_unknown_guest_is_idempotent() {
6141        let mut server = test_server();
6142        // No such guest -> Ok (idempotent), no panic.
6143        server
6144            .detach_container_guest("never-attached")
6145            .await
6146            .expect("detach of unknown guest is a no-op");
6147    }
6148
6149    /// Full guest-managed attach/detach round-trip. Needs a real TUN device (the
6150    /// global overlay must be live so the guest peer can be installed), so it is
6151    /// ignored by default and only runs on a privileged Linux host — mirrors the
6152    /// crate's other privileged overlay e2e tests.
6153    #[cfg(target_os = "linux")]
6154    #[tokio::test]
6155    #[ignore = "needs CAP_NET_ADMIN; run on a privileged Linux host"]
6156    async fn guest_attach_allocates_config_and_detach_releases() {
6157        let mut server = test_server();
6158        server
6159            .setup_global_overlay(
6160                "dep".to_string(),
6161                "i0".to_string(),
6162                "10.200.0.0/16",
6163                Some("10.200.0.0/28"),
6164                zlayer_core::DEFAULT_WG_PORT,
6165                None,
6166                false,
6167            )
6168            .await
6169            .expect("global overlay up");
6170
6171        // Seed a global peer so the guest config carries it through.
6172        let (_p, other_pub) = OverlayTransport::generate_keys().await.unwrap();
6173        let add = server
6174            .handle(OverlaydRequest::AddPeer {
6175                peer: PeerSpec {
6176                    public_key: other_pub.clone(),
6177                    endpoint: "9.9.9.9:51820".to_string(),
6178                    allowed_ips: "10.200.1.0/28".to_string(),
6179                    persistent_keepalive_secs: 25,
6180                    candidates: Vec::new(),
6181                },
6182                scope: PeerScope::Global,
6183            })
6184            .await;
6185        assert!(
6186            matches!(add, OverlaydResponse::Ok),
6187            "seed peer add: {add:?}"
6188        );
6189
6190        let resp = server
6191            .handle(OverlaydRequest::AttachContainer {
6192                handle: AttachHandle::GuestManaged {
6193                    id: "vm-1".to_string(),
6194                },
6195                service: "web".to_string(),
6196                join_global: true,
6197                dns_server: Some("10.200.0.1".parse().unwrap()),
6198                dns_domain: Some("overlay".to_string()),
6199                ephemeral: false,
6200                isolation_network: None,
6201            })
6202            .await;
6203        let config = match resp {
6204            OverlaydResponse::GuestConfig(c) => c,
6205            other => panic!("expected GuestConfig, got {other:?}"),
6206        };
6207        assert!(!config.private_key.is_empty());
6208        assert!(!config.public_key.is_empty());
6209        assert_ne!(config.private_key, config.public_key);
6210        assert_eq!(config.listen_port, server.overlay_port);
6211        assert_eq!(config.dns_server, Some("10.200.0.1".parse().unwrap()));
6212        // Peers = the seeded global peer + this node (self) + nothing else.
6213        assert!(
6214            config.peers.iter().any(|p| p.public_key == other_pub),
6215            "guest must learn the seeded global peer"
6216        );
6217        assert!(
6218            config
6219                .peers
6220                .iter()
6221                .any(|p| Some(&p.public_key) == server.transport_public_key.as_ref()),
6222            "guest must learn THIS node as a peer"
6223        );
6224        // The guest's own key is registered as a global peer (host route).
6225        assert!(server.global_peers.contains_key(&config.public_key));
6226        let info = server
6227            .guest_attachments
6228            .get("vm-1")
6229            .expect("attachment recorded");
6230        assert_eq!(info.overlay_ip, config.overlay_ip);
6231
6232        // Detach releases the peer + IP.
6233        let det = server
6234            .handle(OverlaydRequest::DetachContainer {
6235                handle: AttachHandle::GuestManaged {
6236                    id: "vm-1".to_string(),
6237                },
6238            })
6239            .await;
6240        assert!(matches!(det, OverlaydResponse::Ok), "detach: {det:?}");
6241        assert!(!server.guest_attachments.contains_key("vm-1"));
6242        assert!(!server.global_peers.contains_key(&config.public_key));
6243    }
6244
6245    /// The `setup_service_overlay` dispatch must handle ALL THREE modes —
6246    /// including the default `Auto` — without panicking. `resolve()` is now the
6247    /// identity, so the old `unreachable!("resolve never returns Auto")` arm
6248    /// would panic on the default mode; this proves the arm is gone. Each mode
6249    /// is recorded in `service_modes` BEFORE any netlink/transport work, so we
6250    /// assert on that deterministically regardless of host privilege (the
6251    /// downstream bridge/transport bring-up may succeed or fail depending on
6252    /// `CAP_NET_ADMIN`, but it must never panic).
6253    #[cfg(target_os = "linux")]
6254    #[tokio::test]
6255    async fn dispatch_handles_all_three_modes_without_panic() {
6256        for mode in [
6257            OverlayMode::Auto,
6258            OverlayMode::Shared,
6259            OverlayMode::Dedicated,
6260        ] {
6261            let mut server = test_server();
6262            let service = format!("svc-{mode:?}");
6263            // Must return a Result (Ok or Err) — never panic via `unreachable!`.
6264            let _ = server.setup_service_overlay(&service, mode).await;
6265            // The resolved mode is recorded up front for the attach path.
6266            assert_eq!(
6267                server.service_modes.get(&service).copied(),
6268                Some(mode.resolve()),
6269                "mode {mode:?} must be recorded for the attach path"
6270            );
6271        }
6272    }
6273
6274    /// Two distinct `Shared` services must reuse the SAME node-wide shared
6275    /// bridge (one bridge, not two), while an `Auto` service gets its OWN
6276    /// per-service bridge. Needs `CAP_NET_ADMIN` to create the bridges, so it is
6277    /// ignored by default like the crate's other privileged overlay e2e tests.
6278    #[cfg(target_os = "linux")]
6279    #[tokio::test]
6280    #[ignore = "needs CAP_NET_ADMIN; run on a privileged Linux host"]
6281    async fn shared_services_reuse_one_bridge_auto_gets_its_own() {
6282        let mut server = test_server();
6283        server
6284            .setup_global_overlay(
6285                "dep".to_string(),
6286                "i0".to_string(),
6287                "10.200.0.0/16",
6288                Some("10.200.0.0/26"),
6289                zlayer_core::DEFAULT_WG_PORT,
6290                None,
6291                false,
6292            )
6293            .await
6294            .expect("global overlay up");
6295
6296        // First Shared service creates the shared bridge.
6297        let info_a = server
6298            .setup_service_overlay("web", OverlayMode::Shared)
6299            .await
6300            .expect("shared service web up");
6301        assert_eq!(info_a.mode, OverlayMode::Shared);
6302        let shared_name = server
6303            .shared_bridge
6304            .as_ref()
6305            .expect("shared bridge created")
6306            .name
6307            .clone();
6308        assert_eq!(info_a.name, shared_name);
6309        // Shared services are NOT per-service bridges.
6310        assert!(
6311            !server.service_bridges.contains_key("web"),
6312            "Shared service must not create a per-service bridge"
6313        );
6314
6315        // Second Shared service REUSES the same shared bridge — no new bridge.
6316        let info_b = server
6317            .setup_service_overlay("api", OverlayMode::Shared)
6318            .await
6319            .expect("shared service api up");
6320        assert_eq!(
6321            info_b.name, shared_name,
6322            "a second Shared service must reuse the SAME node-wide bridge"
6323        );
6324        assert!(!server.service_bridges.contains_key("api"));
6325        // Still exactly one shared bridge object.
6326        assert_eq!(
6327            server.shared_bridge.as_ref().map(|b| b.name.clone()),
6328            Some(shared_name.clone())
6329        );
6330
6331        // An Auto service gets its OWN per-service bridge, distinct from the
6332        // shared bridge.
6333        let info_c = server
6334            .setup_service_overlay("batch", OverlayMode::Auto)
6335            .await
6336            .expect("auto service batch up");
6337        assert_eq!(info_c.mode, OverlayMode::Auto);
6338        assert!(
6339            server.service_bridges.contains_key("batch"),
6340            "Auto service must get its own per-service bridge"
6341        );
6342        assert_ne!(
6343            info_c.name, shared_name,
6344            "Auto per-service bridge must differ from the shared bridge"
6345        );
6346
6347        // Both Shared services point their service_interfaces entry at the one
6348        // shared bridge; the Auto service points at its own.
6349        assert_eq!(server.service_interfaces.get("web"), Some(&shared_name));
6350        assert_eq!(server.service_interfaces.get("api"), Some(&shared_name));
6351        assert_ne!(server.service_interfaces.get("batch"), Some(&shared_name));
6352    }
6353
6354    /// A `Shared` service's container attach must draw its IP from the shared
6355    /// bridge pool and must fail cleanly (no panic, clear error) when the shared
6356    /// bridge has not been set up yet. Unprivileged: exercises only the
6357    /// pre-netlink resolution branch.
6358    #[cfg(target_os = "linux")]
6359    #[tokio::test]
6360    async fn attach_shared_without_setup_errors_cleanly() {
6361        let mut server = test_server();
6362        // Mark the service Shared but never set up the shared bridge.
6363        server
6364            .service_modes
6365            .insert("web".to_string(), OverlayMode::Shared);
6366        let err = server
6367            .attach_container_linux(424_242, "web", false, false, None)
6368            .await
6369            .expect_err("attach must fail without a shared bridge");
6370        match err {
6371            OverlaydError::Other(m) => {
6372                assert!(
6373                    m.contains("no shared bridge"),
6374                    "expected shared-bridge error, got: {m}"
6375                );
6376            }
6377            other => panic!("unexpected error variant: {other:?}"),
6378        }
6379    }
6380
6381    /// A container attached on a NAMED isolated network must be recorded in the
6382    /// per-network membership map (`network_members["net-a"]` gains the member's
6383    /// service IP). Needs `CAP_NET_ADMIN` to bring up the bridge + veth, so it is
6384    /// ignored by default like the crate's other privileged overlay e2e tests.
6385    #[cfg(target_os = "linux")]
6386    #[tokio::test]
6387    #[ignore = "needs CAP_NET_ADMIN; run on a privileged Linux host"]
6388    async fn attach_linux_isolated_network_records_membership() {
6389        let mut server = test_server();
6390        server
6391            .setup_global_overlay(
6392                "dep".to_string(),
6393                "i0".to_string(),
6394                "10.200.0.0/16",
6395                Some("10.200.0.0/26"),
6396                zlayer_core::DEFAULT_WG_PORT,
6397                None,
6398                false,
6399            )
6400            .await
6401            .expect("global overlay up");
6402
6403        // An Auto service gives us a real per-service bridge to attach onto.
6404        server
6405            .setup_service_overlay("web", OverlayMode::Auto)
6406            .await
6407            .expect("auto service web up");
6408
6409        // Attach this very process (a live PID with a real netns) onto the named
6410        // isolated network "net-a".
6411        let pid = std::process::id();
6412        let ip = server
6413            .attach_container_linux(pid, "web", false, true, Some("net-a".to_string()))
6414            .await
6415            .expect("attach onto isolated network");
6416
6417        // Membership map gained exactly this member under "net-a".
6418        let members = server
6419            .network_members
6420            .get("net-a")
6421            .expect("net-a membership recorded");
6422        assert!(
6423            members.contains(&ip),
6424            "network_members[net-a] must contain the attached member IP {ip}"
6425        );
6426
6427        // Detach drains the membership and drops the now-empty network entry.
6428        server
6429            .detach_container_linux(pid)
6430            .await
6431            .expect("detach succeeds");
6432        assert!(
6433            !server.network_members.contains_key("net-a"),
6434            "empty isolated network must be dropped from network_members on last detach"
6435        );
6436    }
6437
6438    /// The isolation-network owner key namespace is distinct from the dedicated
6439    /// per-service namespace, so an isolation network and a service of the same
6440    /// name never collide on the same marker/allocator key. Platform-agnostic.
6441    #[test]
6442    fn isolation_owner_key_distinct_from_service_owner_key() {
6443        let iso = crate::network_state::owner_for_isolation_network("alpha");
6444        let svc = crate::network_state::owner_for_service("alpha");
6445        assert_ne!(
6446            iso, svc,
6447            "isolation and service owner keys must not collide for the same name"
6448        );
6449        assert_eq!(iso, "iso:alpha");
6450        assert_eq!(svc, "service:alpha");
6451    }
6452
6453    /// `isolation_network_subnet` is deterministic (same name -> same block so a
6454    /// reused HCN network keeps its subnet across restarts), stays INSIDE the
6455    /// node slice, and lands DIFFERENT isolation networks on DISJOINT sub-blocks
6456    /// (the whole point of L3 isolation — distinct networks must not share an
6457    /// address range). Windows-only (the method is `cfg(windows)`); exercised by
6458    /// `cargo xwin test`.
6459    #[cfg(target_os = "windows")]
6460    #[test]
6461    fn isolation_network_subnet_is_deterministic_disjoint_and_inside_slice() {
6462        let mut server = test_server();
6463        let slice: IpNetwork = "10.200.5.0/26".parse().unwrap();
6464        server.slice_cidr = Some(slice);
6465        let slice_net: ipnet::IpNet = "10.200.5.0/26".parse().unwrap();
6466
6467        // Deterministic: same name -> same block on repeated calls.
6468        let a1 = server.isolation_network_subnet("alpha").unwrap();
6469        let a2 = server.isolation_network_subnet("alpha").unwrap();
6470        assert_eq!(a1, a2, "same isolation network must map to the same subnet");
6471
6472        // Inside the node slice and at the /28 sub-prefix.
6473        assert!(
6474            slice_net.contains(&a1.network()) && slice_net.contains(&a1.broadcast()),
6475            "isolation subnet {a1} must be wholly inside the node slice {slice_net}"
6476        );
6477        assert_eq!(a1.prefix_len(), 28, "expected a /28 isolation sub-block");
6478
6479        // A different network name carving a different /28 block must be disjoint.
6480        // (`beta` and `gamma` hash to different indices than `alpha`; pick whichever
6481        //  of several names lands on a distinct block to assert disjointness.)
6482        let other = ["beta", "gamma", "delta", "omega", "zeta"]
6483            .iter()
6484            .map(|n| server.isolation_network_subnet(n).unwrap())
6485            .find(|s| *s != a1)
6486            .expect("at least one other name must land on a different /28 block");
6487        let overlaps = a1.contains(&other.network()) || other.contains(&a1.network());
6488        assert!(
6489            !overlaps,
6490            "distinct isolation networks must occupy disjoint subnets ({a1} vs {other})"
6491        );
6492    }
6493}