zlayer-overlayd 0.13.0

//! The overlayd server engine.
//!
//! [`OverlaydServer`] is a near 1:1 migration of the *mechanics* half of the
//! agent's `OverlayManager`: it owns the single cluster `WireGuard`
//! [`OverlayTransport`], the per-service Linux bridges (Linux) / HCN Internal
//! network + endpoints (Windows), the per-node IP allocator, DNS config, and
//! NAT traversal. The cluster-brain half (Raft, scheduler, service registry)
//! stays in the main daemon, which drives this server over the IPC contract in
//! [`zlayer_types::overlayd`].
//!
//! Every [`OverlaydRequest`] maps to a method here via [`OverlaydServer::handle`].

use std::collections::HashMap;
use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr};
#[cfg(target_os = "linux")]
use std::os::fd::AsFd;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicU64, Ordering};

use ipnetwork::IpNetwork;
use zlayer_overlay::nat::{RelayServerConfig, StunServerConfig, TurnServerConfig};
use zlayer_overlay::{
    Candidate, CandidateType, ConnectionType, NatConfig, NatTraversal, OverlayConfig,
    OverlayTransport, PeerInfo, RelayServer,
};
use zlayer_types::overlayd::{
    AttachHandle, AttachResult, DedicatedServiceStatus, GuestOverlayConfig, NatCandidateWire,
    NatConfigSpec, NatPeerWire, NatStatusWire, OverlayMode, OverlaydRequest, OverlaydResponse,
    PeerScope, PeerSpec, PeerStatus, ServiceOverlayInfo, StatusSnapshot,
};

use crate::error::OverlaydError;
use crate::network_state::{
    owner_for_service, DedicatedPortAllocator, ManagedNetwork, NetworkState,
};

/// Maximum length, in bytes, of a Linux network interface name.
///
/// The kernel stores names in an `IFNAMSIZ`-byte buffer (16 on Linux) that
/// includes the trailing NUL terminator, leaving `IFNAMSIZ - 1` usable bytes.
const MAX_IFNAME_LEN: usize = 15;

/// Reserved [`zlayer_overlay::allocator::ServiceSubnetRegistry`] key for the
/// single node-wide shared bridge (`OverlayMode::Shared`). The
/// double-underscore-wrapped sentinel can never collide with a real service
/// name (service names come from deployment specs and are DNS-label-shaped,
/// and `_` is not a valid character in a DNS host label), so the shared bridge
/// always gets exactly one stable subnet distinct from every per-service
/// subnet.
#[cfg(target_os = "linux")]
const SHARED_BRIDGE_REGISTRY_KEY: &str = "__zlayer_shared_bridge__";

/// Build a Linux-safe interface name of at most [`MAX_IFNAME_LEN`] bytes.
///
/// The readable form is `zl-<parts joined by '-'>`, followed by `-<suffix>`
/// when `suffix` is non-empty. If that form fits the kernel limit it is
/// returned unchanged. Otherwise the name is derived from a hash over every
/// part plus the suffix:
///
/// * `zl-<hash>-<suffix>` when the suffix still leaves room for at least one
///   hash character, or
/// * `zl-<hash>` (suffix dropped) when the suffix is empty or too long.
///
/// The hash is rendered as lowercase hex without zero padding and truncated to
/// whatever budget remains.
///
/// NOTE(review): the hash comes from `DefaultHasher`, whose algorithm std does
/// not promise to keep stable across Rust releases — hashed names are only
/// guaranteed repeatable for a given toolchain.
#[must_use]
pub fn make_interface_name(parts: &[&str], suffix: &str) -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    // Length of the fixed "zl-" prefix every generated name carries.
    const PREFIX_LEN: usize = 3;

    // Fast path: the human-readable name already fits.
    let mut readable = format!("zl-{}", parts.join("-"));
    if !suffix.is_empty() {
        readable.push('-');
        readable.push_str(suffix);
    }
    if readable.len() <= MAX_IFNAME_LEN {
        return readable;
    }

    // Slow path: fold every part and the suffix into one digest.
    let mut state = DefaultHasher::new();
    parts.iter().for_each(|part| part.hash(&mut state));
    suffix.hash(&mut state);
    let digest = format!("{:x}", state.finish());

    // Hex characters that fit between "zl-" and "-<suffix>"; zero means the
    // suffix alone exhausts the budget and has to be dropped.
    let room_beside_suffix = MAX_IFNAME_LEN.saturating_sub(PREFIX_LEN + 1 + suffix.len());

    if suffix.is_empty() || room_beside_suffix == 0 {
        let keep = (MAX_IFNAME_LEN - PREFIX_LEN).min(digest.len());
        format!("zl-{}", &digest[..keep])
    } else {
        let keep = room_beside_suffix.min(digest.len());
        format!("zl-{}-{}", &digest[..keep], suffix)
    }
}

/// Side-effect-free orphan test used by [`OverlaydServer::prune_orphan_bridges`].
///
/// `name` is an orphan only when all three conditions hold:
///
/// 1. it carries our `zl-` prefix, which keeps the sweep away from unrelated
///    host links;
/// 2. it ends in `-b` (per-service bridge) or `-d` (dedicated device), which
///    keeps it away from the global `-g` device, the node-wide `-sh` shared
///    bridge, and the `veth-…`/`vc-…` container-veth names (those are reclaimed
///    by the PID-keyed `sweep_orphan_veths`, never here);
/// 3. it appears in neither `live` (names the daemon says SHOULD exist) nor
///    `protected` (interfaces that must survive regardless, such as the active
///    global device, the shared bridge, and live in-memory service
///    bridges/devices).
#[cfg(target_os = "linux")]
fn is_orphan_service_bridge(
    name: &str,
    live: &std::collections::HashSet<&str>,
    protected: &std::collections::HashSet<String>,
) -> bool {
    let is_ours = name.starts_with("zl-");
    let is_service_scoped = name.ends_with("-b") || name.ends_with("-d");
    let must_keep = live.contains(name) || protected.contains(name);

    is_ours && is_service_scoped && !must_keep
}

/// The address immediately after `subnet`'s network address.
///
/// Both families use the same rule, `network() + 1`: the network address
/// itself is skipped (conventionally reserved for IPv6 as well). The addition
/// wraps on overflow rather than panicking.
fn first_usable_ip(subnet: ipnet::IpNet) -> IpAddr {
    match subnet {
        ipnet::IpNet::V4(net4) => {
            let base: u32 = net4.network().into();
            Ipv4Addr::from(base.wrapping_add(1)).into()
        }
        ipnet::IpNet::V6(net6) => {
            let base: u128 = net6.network().into();
            Ipv6Addr::from(base.wrapping_add(1)).into()
        }
    }
}

/// Parameters threaded into [`OverlaydServer::attach_to_interface`] when a
/// container is being attached to a per-service Linux bridge.
///
/// The bridge name is borrowed for `'a`, so a value of this type cannot
/// outlive the string it borrows.
#[cfg(target_os = "linux")]
#[derive(Debug)]
struct BridgeAttachParams<'a> {
    /// Linux bridge name on the host to enslave the host-side veth into.
    bridge_name: &'a str,
    /// Bridge's L3 gateway IP. The container's default route is set here.
    gateway: IpAddr,
    /// Prefix length of the bridge's subnet.
    subnet_prefix_len: u8,
}

/// Tracking info recorded by [`OverlaydServer::attach_container`] for every
/// container that successfully attaches on Linux (via the per-PID `attached`
/// map) and for every macOS host-shared container (via the
/// `host_shared_attachments` map). Used by `detach_container`. Cross-platform
/// so the host-shared path — which runs on macOS — can reuse the same record.
#[derive(Debug, Clone)]
struct AttachInfo {
    /// IP allocated on the per-service overlay (eth0 inside the container).
    service_ip: IpAddr,
    /// Name of the service whose bridge owns `service_ip`.
    service_name: Option<String>,
    /// IP allocated on the global overlay (eth1), if the container joined it.
    /// `Some` iff the container also attached to the global overlay. The
    /// detach path deletes `veth-<pid>-g` unconditionally (the delete is
    /// idempotent), so no separate `joined_global` flag is needed.
    ///
    /// Linux-only: this is the per-container global/eth1 IP, allocated and read
    /// solely by the Linux veth attach/detach paths. Host-shared containers
    /// (macOS/Windows) share the node's single cluster utun and reach the
    /// global overlay through their node `/32` alias, so they never allocate a
    /// separate eth1 IP — it is always `None` off Linux and never read there,
    /// hence the conditional `dead_code` allowance below.
    #[cfg_attr(not(target_os = "linux"), allow(dead_code))]
    global_ip: Option<IpAddr>,
    /// True when this attach asked overlayd to reap the per-service bridge
    /// once the LAST container detaches (ephemeral/per-job networks). False
    /// for managed services (bridge persists across scale-to-0).
    ephemeral: bool,
    /// `Some(network)` when this container joined the named isolated network;
    /// drives per-network L3 isolation membership cleanup on detach.
    isolation_network: Option<String>,
}

/// Tracking info recorded by [`OverlaydServer::attach_container_guest`] for a
/// guest-managed attach. Platform-agnostic (no netns/veth/HCN): the guest owns
/// its own `WireGuard` device; the host only allocated the address + registered
/// the guest's public key as a global peer.
///
/// Stored in `OverlaydServer::guest_attachments`, keyed by the opaque
/// container `id` carried in `AttachHandle::GuestManaged`.
#[derive(Debug, Clone)]
struct GuestAttachInfo {
    /// Overlay IP allocated for the guest (released on detach).
    overlay_ip: IpAddr,
    /// Base64 public key registered on the global transport for the guest
    /// (removed on detach).
    public_key: String,
    /// Service whose bridge pool owns `overlay_ip` (Linux service-bridge path);
    /// `None` when drawn from the node slice. Mirrors `AttachInfo::service_name`
    /// so detach returns the IP to the right pool.
    service_name: Option<String>,
    /// `Some(network)` when this guest joined the named isolated network;
    /// drives per-network membership cleanup on detach. The guest's own
    /// enforcement (`WireGuard` `AllowedIPs`) is wired separately — overlayd only
    /// maintains the membership map here.
    isolation_network: Option<String>,
}

/// Per-service Linux bridge state. One bridge per service per node; containers
/// attach to it via veth pairs and cross-node packets ride the single cluster
/// `OverlayTransport` with the service subnet plumbed into its `AllowedIPs`.
///
/// The same record type also backs the node-wide shared bridge
/// (`OverlaydServer::shared_bridge`).
#[cfg(target_os = "linux")]
#[derive(Debug)]
struct ServiceBridge {
    /// Linux bridge name, kept under IFNAMSIZ-1 by [`make_interface_name`].
    name: String,
    /// CIDR of the service's subnet on this node.
    subnet: ipnet::IpNet,
    /// Gateway IP within the subnet (first usable address).
    gateway: IpAddr,
    /// Per-service IP allocator covering `subnet`.
    ip_allocator: zlayer_overlay::allocator::IpAllocator,
}

/// A dedicated per-service `WireGuard` transport (`OverlayMode::Dedicated`).
///
/// Unlike Shared mode — where every service subnet is plumbed onto the single
/// cluster [`OverlayTransport`] via multi-CIDR `AllowedIPs` — a Dedicated
/// service owns a *second* real `WireGuard` device with its own crypto context,
/// listen port, overlay IP, and subnet. The device is portable (boringtun
/// userspace `WireGuard` works on Linux/macOS/Windows), so this struct is
/// cross-platform; only the bridge/HCN *attachment* of containers onto it is
/// platform-gated.
///
/// Stored in `OverlaydServer::service_transports`, keyed by service name.
struct ServiceTransport {
    /// The live dedicated `WireGuard` device. Dropping it tears down the TUN.
    transport: OverlayTransport,
    /// Actual interface name (kernel-assigned `utunN` on macOS).
    interface: String,
    /// base64 public key of this dedicated device.
    public_key: String,
    /// UDP listen port handed out by [`DedicatedPortAllocator`].
    listen_port: u16,
    /// This node's overlay IP on the dedicated device.
    overlay_ip: std::net::IpAddr,
    /// The service's subnet carried by the dedicated device.
    subnet: ipnet::IpNet,
    /// Guest-attach IPAM bounded to `subnet`. VZ-Linux / WSL2 guests that join
    /// this Dedicated service draw their overlay IP from here so they land on
    /// the dedicated device's subnet (own crypto) rather than the node slice.
    /// The node's own `overlay_ip` is reserved at setup so guests never collide
    /// with it. Unused on Linux, where dedicated containers attach via a
    /// per-service bridge that owns its own allocator — hence the conditional
    /// `dead_code` allowance.
    #[cfg_attr(target_os = "linux", allow(dead_code))]
    ip_allocator: zlayer_overlay::allocator::IpAllocator,
}

/// The overlay daemon engine.
///
/// Owns the network *mechanics* for one node: the single cluster `WireGuard`
/// transport, per-service bridges / dedicated devices, IP allocation, DNS
/// config, and NAT traversal state. It is driven one [`OverlaydRequest`] at a
/// time through [`OverlaydServer::handle`]. A freshly constructed server holds
/// no live devices; see [`OverlaydServer::new`] for the initial values.
pub struct OverlaydServer {
    /// Deployment name (used for network naming). Set by `SetupGlobalOverlay`;
    /// empty until then.
    deployment: String,
    /// Per-daemon-process disambiguator included in overlay link names. Set by
    /// `SetupGlobalOverlay`; empty until then.
    instance_id: String,
    /// Root data directory; HCN markers, IPAM state, etc. live under it.
    /// Exposed read-only through `data_dir()`.
    data_dir: PathBuf,
    /// Global overlay interface name.
    global_interface: Option<String>,
    /// Global overlay transport (kept alive for the TUN device lifetime). The
    /// SINGLE cluster-wide `WireGuard` transport; every service subnet is
    /// plumbed through its `AllowedIPs`.
    global_transport: Option<OverlayTransport>,
    /// Service-name -> per-service Linux bridge / placeholder name.
    service_interfaces: HashMap<String, String>,
    /// Service-name -> dedicated per-service `WireGuard` transport (Dedicated
    /// mode). Coexists with `global_transport`. Empty for Shared-only nodes.
    service_transports: HashMap<String, ServiceTransport>,
    /// Port allocator for dedicated devices (band above the global WG port).
    /// Seeded at construction from the `wg_port`s recorded in the on-disk
    /// network-state marker for `service:`-owned networks.
    dedicated_ports: DedicatedPortAllocator,
    /// Per-service bridge state (Linux only).
    #[cfg(target_os = "linux")]
    service_bridges: HashMap<String, ServiceBridge>,
    /// The SINGLE node-wide shared bridge backing every `OverlayMode::Shared`
    /// service (Linux only). Created once on the first Shared-service setup and
    /// reused for all subsequent ones; container ports are exposed via the
    /// userspace free-port L4 proxy (`proxy_manager.rs`), not per-service
    /// bridges. `None` until the first Shared service is set up.
    #[cfg(target_os = "linux")]
    shared_bridge: Option<ServiceBridge>,
    /// Resolved per-service overlay mode, recorded at `setup_service_overlay_*`
    /// time so the container ATTACH path knows which data-plane a service uses
    /// (per-service bridge for `Auto`/`Dedicated` vs the single shared bridge
    /// for `Shared`) without re-deriving it. Cross-platform.
    service_modes: HashMap<String, OverlayMode>,
    /// Local fallback `ServiceSubnetRegistry`. Used by the Linux Shared bridge
    /// path and by the cross-platform Dedicated path (subnets stay globally
    /// unique regardless of mode/OS).
    service_subnet_registry: Option<zlayer_overlay::allocator::ServiceSubnetRegistry>,
    /// Local raft node id used as the partition key for service-subnet assign.
    /// Set by `SetLocalNodeId`; `0` until then.
    local_node_id: u64,
    /// Base64 `WireGuard` public key of THIS node's cluster transport, as told
    /// by the main daemon via `SetLocalWgPubkey` (used for service-subnet
    /// `AllowedIPs` plumbing).
    local_wg_pubkey: Option<String>,
    /// Public key generated for the live global transport, recorded at
    /// `setup_global_overlay` time so `Status` can surface it (the transport
    /// itself exposes no public-key accessor).
    transport_public_key: Option<String>,
    /// IP allocator for the node's overlay slice. Bounded to the default
    /// cluster `/16` until `SetupGlobalOverlay` re-binds it to the node slice.
    ip_allocator: IpAllocator,
    /// This node's IP on the global overlay network.
    node_ip: Option<IpAddr>,
    /// `WireGuard` listen port for the overlay network. Starts out as
    /// `zlayer_core::DEFAULT_WG_PORT`.
    overlay_port: u16,
    /// Full cluster CIDR (e.g. `10.200.0.0/16`, which is also the value a
    /// fresh server starts with).
    cluster_cidr: Option<IpNetwork>,
    /// Per-node slice CIDR.
    slice_cidr: Option<IpNetwork>,
    /// Map of HCN namespace GUID -> (`service_name`, `allocated_ip`,
    /// `isolation_network`) for autoclean. The trailing `isolation_network` lets
    /// detach drain the per-network membership map for this container.
    #[cfg(target_os = "windows")]
    hcn_cleanup: HashMap<windows::core::GUID, (String, std::net::IpAddr, Option<String>)>,
    /// Per-service container-IP allocators for Windows dedicated services. Each
    /// is bounded to that service's subnet (not the node slice) so dedicated
    /// containers draw addresses from their own isolated network. Keyed by
    /// service name; created lazily on the first dedicated attach.
    #[cfg(target_os = "windows")]
    service_ip_allocators: HashMap<String, IpAllocator>,
    /// Per-PID tracking of overlay attachments on Linux.
    #[cfg(target_os = "linux")]
    attached: HashMap<u32, AttachInfo>,
    /// Per-isolated-network membership: network name -> the set of member
    /// overlay (service) IPs currently attached to it. Drives per-network L3
    /// isolation (a member reaches only its own network's members + node +
    /// egress). Populated on attach, drained on detach, across all platforms.
    network_members: std::collections::HashMap<String, std::collections::HashSet<IpAddr>>,
    /// Peers installed on the GLOBAL transport via `AddPeer { Global }`, keyed by
    /// base64 public key. Tracked here (in wire-safe [`PeerSpec`] form, with the
    /// keys kept base64 — the boringtun UAPI dump only exposes hex keys) so a
    /// guest-managed attach can hand the guest the exact peer set the host's own
    /// global device carries. Platform-agnostic: the guest path runs on macOS.
    global_peers: HashMap<String, PeerSpec>,
    /// Guest-managed overlay attachments, keyed by the opaque container `id` from
    /// [`AttachHandle::GuestManaged`]. Records the allocated overlay IP and the
    /// generated public key registered in the mesh so `DetachContainer` can
    /// release the IP and remove the peer.
    guest_attachments: HashMap<String, GuestAttachInfo>,
    /// Host-shared overlay attachments, keyed by the opaque container `id` from
    /// [`AttachHandle::HostShared`] (macOS Seatbelt / native-VZ / libkrun
    /// containers that share the node's host network namespace and its single
    /// cluster `utun`). Records the distinct overlay `/32` allocated for the
    /// container so `DetachContainer` can remove the utun alias, drain the
    /// per-network L3 isolation membership, and release the IP. Cross-platform
    /// (the host-shared path compiles everywhere; it is exercised on macOS).
    host_shared_attachments: HashMap<String, AttachInfo>,
    /// Overlay DNS server listen address, if one was bootstrapped. A
    /// guest-managed `AttachContainer` carrying a `dns_server` records that
    /// address here on port 53.
    dns_server_addr: Option<SocketAddr>,
    /// DNS domain for overlay service discovery.
    dns_domain: Option<String>,
    /// Overlay DNS A/AAAA records this node owns (name -> ip).
    dns_records: HashMap<String, IpAddr>,
    /// NAT traversal configuration threaded into every `OverlayConfig`.
    nat_config: Option<NatConfig>,
    /// Override for `OverlayConfig::uapi_sock_dir`. Set through the
    /// `with_uapi_sock_dir` builder; `None` leaves the default in place.
    uapi_sock_dir: Option<PathBuf>,
    /// Live NAT traversal orchestrator.
    nat_traversal: Option<NatTraversal>,
    /// Unix-epoch seconds of the last successful candidate gather / STUN refresh.
    /// `0` until the first refresh.
    nat_last_refresh: AtomicU64,
    /// NAT-traversal candidates each peer advertised, keyed by base64 public
    /// key. Populated from `AddPeer { Global }` (the join-time candidate
    /// exchange); the NAT maintenance tick feeds these into
    /// `NatTraversal::connect_to_peer` to hole-punch / relay toward a peer whose
    /// direct endpoint has not produced a recent `WireGuard` handshake.
    peer_candidates: HashMap<String, Vec<Candidate>>,
    /// The [`ConnectionType`] last negotiated to each peer (keyed by base64
    /// public key), recorded by the connect loop so `NatStatus` can report
    /// direct / hole-punched / relayed per peer.
    peer_connection_type: HashMap<String, ConnectionType>,
    /// Built-in relay server, started lazily on the first NAT tick when the
    /// resolved [`NatConfig::relay_server`] is `Some`. Kept alive for the
    /// daemon's lifetime so its background accept loop keeps running.
    relay_server: Option<RelayServer>,
    /// The address the built-in [`Self::relay_server`] actually bound (the real
    /// port when `listen_port == 0`).
    relay_bound_addr: Option<SocketAddr>,
    /// Cluster-shared credential used to derive the built-in relay server's
    /// `BLAKE2b` auth key. Carried in `NatConfigSpec.relay_server.auth_credential`
    /// (the main daemon sets it from the cluster HS256 secret) so every node's
    /// relay client derives the *same* key. `None` when no credential was
    /// supplied (the relay then derives a key from the empty string — only nodes
    /// that likewise have no credential can use it).
    cluster_relay_credential: Option<String>,
    /// Set when a `Shutdown` request has been received. Exposed read-only
    /// through `shutdown_requested()`.
    shutdown_requested: bool,
    /// IPv4 `net.ipv4.ip_forward` value observed BEFORE the daemon first
    /// enabled forwarding for an overlay container attach. `Some(prev)` is
    /// recorded exactly once (the first time we flip it to `1`); teardown
    /// restores `prev` so a clean shutdown reverts host routing state the
    /// daemon turned on without clobbering an operator who set it. `None`
    /// means the daemon never enabled IPv4 forwarding (nothing to revert).
    #[cfg(target_os = "linux")]
    prev_ipv4_forward: Option<String>,
    /// Per-interface IPv6 `net.ipv6.conf.<dev>.forwarding` was enabled on
    /// these device names for overlay routing. We enable forwarding
    /// PER-INTERFACE (never `net.ipv6.conf.all.forwarding`, which has the
    /// documented side effect of forcing `accept_ra=0` + `autoconf=0` on
    /// every IPv6 interface — including the public NIC — and silently
    /// dropping the RA-learned default route / path-MTU, which blackholes
    /// the host's own larger reply packets). Teardown clears forwarding on
    /// exactly these devices.
    #[cfg(target_os = "linux")]
    ipv6_forward_ifaces: std::collections::HashSet<String>,
    /// Host-side veth device names THIS daemon created (`veth-<pid>-<tag>`),
    /// recorded right after a successful `create_veth_pair`. A clean global
    /// teardown deletes each so no host veth half is left dangling once the
    /// overlay stops. Per-container detach may delete some of these first;
    /// deletion is idempotent (a missing device is ignored). Only names this
    /// daemon created are tracked — never a blanket prefix sweep that could
    /// catch a concurrent overlay's interfaces.
    #[cfg(target_os = "linux")]
    created_veths: std::collections::HashSet<String>,
    /// `zl-*` bridge device names THIS daemon created (per-service and the
    /// node-wide shared bridge), recorded right after a successful
    /// `create_bridge` + address + up. Deleting the bridge link on teardown
    /// also drops its gateway address and up state, so the name alone is enough
    /// to fully revert it.
    #[cfg(target_os = "linux")]
    created_bridges: std::collections::HashSet<String>,
    /// Host `/32` (`/128`) routes to a container IP via a host-side veth that
    /// THIS daemon installed via `replace_route_via_dev` (the bridgeless attach
    /// path). Each entry is `(dest, prefix_len, dev)` — enough to delete the
    /// exact route on teardown via `delete_route_via_dev`. Deletion is
    /// idempotent (a route a prior detach already removed is ignored).
    #[cfg(target_os = "linux")]
    created_host_routes: Vec<(IpAddr, u8, String)>,
}

/// Decide whether the `WireGuard` `local_endpoint` must be pinned to
/// UNSPECIFIED because the daemon runs rootless.
///
/// Rootless is the only trigger, so the answer is simply `rootless`. The
/// reason: in rootless mode `detect_physical_egress()` executes inside the
/// daemon netns and resolves pasta's in-netns tap IP, which means nothing to
/// remote peers as a WG source / advertised endpoint.
///
/// Kept as a standalone pure fn so tests can exercise the decision without
/// touching the process-global `ZLAYER_ROOTLESS` env var (env writes race
/// across parallel tests).
fn rootless_forces_unspecified(rootless: bool) -> bool {
    rootless
}

/// Decide whether failing to create the HOST overlay adapter must abort
/// overlay setup for this node.
///
/// * **Linux** — always fatal. The host adapter (a kernel TUN brought up via
///   netlink, with the rootless userns+netns path as a fallback) IS the
///   container data path.
/// * **macOS / Windows** — fatal only when `host_adapter_mandatory` is set.
///   Linux containers there live in a VZ VM / WSL2 distro that creates its OWN
///   overlay device and meshes VM-to-VM over UDP; the host adapter
///   (utun/Wintun, which needs root/Administrator) is merely the host's own
///   overlay membership and is NOT on the container data path, so by default a
///   failure DEGRADES to a VM-only overlay (warn + continue).
///
/// A `cfg!`-driven pure fn so the degrade decision is unit-testable on Linux
/// without provoking a real utun/Wintun syscall failure.
fn host_adapter_failure_is_fatal(host_adapter_mandatory: bool) -> bool {
    if cfg!(target_os = "linux") {
        return true;
    }
    host_adapter_mandatory
}

impl OverlaydServer {
    /// Build an idle server rooted at `data_dir`.
    ///
    /// No overlay device is created here. `SetupGlobalOverlay` brings the
    /// overlay up later, carrying the deployment, slice, port, and NAT toggle
    /// from the main daemon. Until then the node allocator covers the default
    /// cluster `/16` and the listen port is `zlayer_core::DEFAULT_WG_PORT`.
    ///
    /// The dedicated-port allocator is the one piece of state restored from
    /// disk: ports recorded in the network-state marker for `service:`-owned
    /// networks are handed to it so those services re-bind the UDP port they
    /// held before this process started.
    ///
    /// # Panics
    /// Only if the hard-coded default CIDR `10.200.0.0/16` fails to parse,
    /// which cannot happen.
    #[must_use]
    pub fn new(data_dir: PathBuf) -> Self {
        // Placeholder range for the allocator; SetupGlobalOverlay re-binds it
        // to the node slice.
        let bootstrap_cidr: IpNetwork = "10.200.0.0/16"
            .parse()
            .expect("compile-time constant CIDR");
        let overlay_port = zlayer_core::DEFAULT_WG_PORT;

        // Recover previously assigned dedicated WG ports from the on-disk
        // marker so existing dedicated overlays keep their exact UDP port.
        let state_marker = zlayer_paths::ZLayerDirs::new(data_dir.clone()).agent_network_state();
        let persisted = NetworkState::load(&state_marker);
        let persisted_ports: Vec<u16> = persisted
            .networks
            .iter()
            .filter_map(|net| {
                if net.owner.starts_with("service:") {
                    net.wg_port
                } else {
                    None
                }
            })
            .collect();
        let dedicated_ports = DedicatedPortAllocator::new(overlay_port, persisted_ports);

        Self {
            deployment: String::new(),
            instance_id: String::new(),
            data_dir,
            global_interface: None,
            global_transport: None,
            service_interfaces: HashMap::new(),
            service_transports: HashMap::new(),
            dedicated_ports,
            #[cfg(target_os = "linux")]
            service_bridges: HashMap::new(),
            #[cfg(target_os = "linux")]
            shared_bridge: None,
            service_modes: HashMap::new(),
            service_subnet_registry: None,
            local_node_id: 0,
            local_wg_pubkey: None,
            transport_public_key: None,
            ip_allocator: IpAllocator::new(bootstrap_cidr),
            node_ip: None,
            overlay_port,
            cluster_cidr: Some(bootstrap_cidr),
            slice_cidr: None,
            #[cfg(target_os = "windows")]
            hcn_cleanup: HashMap::new(),
            #[cfg(target_os = "windows")]
            service_ip_allocators: HashMap::new(),
            #[cfg(target_os = "linux")]
            attached: HashMap::new(),
            network_members: HashMap::new(),
            global_peers: HashMap::new(),
            guest_attachments: HashMap::new(),
            host_shared_attachments: HashMap::new(),
            dns_server_addr: None,
            dns_domain: None,
            dns_records: HashMap::new(),
            nat_config: None,
            uapi_sock_dir: None,
            nat_traversal: None,
            nat_last_refresh: AtomicU64::new(0),
            peer_candidates: HashMap::new(),
            peer_connection_type: HashMap::new(),
            relay_server: None,
            relay_bound_addr: None,
            cluster_relay_credential: None,
            shutdown_requested: false,
            #[cfg(target_os = "linux")]
            prev_ipv4_forward: None,
            #[cfg(target_os = "linux")]
            ipv6_forward_ifaces: std::collections::HashSet::new(),
            #[cfg(target_os = "linux")]
            created_veths: std::collections::HashSet::new(),
            #[cfg(target_os = "linux")]
            created_bridges: std::collections::HashSet::new(),
            #[cfg(target_os = "linux")]
            created_host_routes: Vec::new(),
        }
    }

    /// Builder-style setter: use `dir` as the `WireGuard` UAPI socket directory
    /// for every overlay transport this server builds, replacing any earlier
    /// override. Consumes and returns the server so calls can be chained.
    #[must_use]
    pub fn with_uapi_sock_dir(mut self, dir: impl Into<PathBuf>) -> Self {
        let sock_dir: PathBuf = dir.into();
        self.uapi_sock_dir = Some(sock_dir);
        self
    }

    /// Returns `true` once a `Shutdown` request has been received; `false` on
    /// a freshly constructed server.
    #[must_use]
    pub fn shutdown_requested(&self) -> bool {
        self.shutdown_requested
    }

    /// Borrow the root data directory handed to [`OverlaydServer::new`].
    ///
    /// Consumers include the uninstall path (`purge_managed_networks`) and HCN
    /// marker resolution.
    #[must_use]
    pub fn data_dir(&self) -> &Path {
        self.data_dir.as_path()
    }

    // -- request dispatch ----------------------------------------------------

    /// Run a single [`OverlaydRequest`] and return the [`OverlaydResponse`] to
    /// send back over IPC.
    ///
    /// This entry point never surfaces an error to the caller: a failure from
    /// the internal dispatcher is converted into [`OverlaydResponse::Err`]
    /// carrying the error's display text.
    pub async fn handle(&mut self, req: OverlaydRequest) -> OverlaydResponse {
        self.dispatch(req)
            .await
            .unwrap_or_else(|err| OverlaydResponse::Err {
                message: err.to_string(),
            })
    }

    #[allow(clippy::too_many_lines)]
    async fn dispatch(&mut self, req: OverlaydRequest) -> Result<OverlaydResponse, OverlaydError> {
        match req {
            OverlaydRequest::SetLocalNodeId { node_id } => {
                self.local_node_id = node_id;
                Ok(OverlaydResponse::Ok)
            }
            OverlaydRequest::SetLocalWgPubkey { pubkey } => {
                self.local_wg_pubkey = Some(pubkey);
                Ok(OverlaydResponse::Ok)
            }
            OverlaydRequest::SetupGlobalOverlay {
                deployment,
                instance_id,
                cluster_cidr,
                slice_cidr,
                wg_port,
                nat,
                host_adapter_mandatory,
            } => {
                let name = self
                    .setup_global_overlay(
                        deployment,
                        instance_id,
                        &cluster_cidr,
                        slice_cidr.as_deref(),
                        wg_port,
                        nat,
                        host_adapter_mandatory,
                    )
                    .await?;
                Ok(OverlaydResponse::BridgeName { name })
            }
            OverlaydRequest::TeardownGlobalOverlay => {
                self.teardown_global_overlay();
                Ok(OverlaydResponse::Ok)
            }
            OverlaydRequest::SetupServiceOverlay { service, mode } => {
                let info = self.setup_service_overlay(&service, mode).await?;
                Ok(OverlaydResponse::ServiceOverlay(info))
            }
            OverlaydRequest::TeardownServiceOverlay { service } => {
                self.teardown_service_overlay(&service).await;
                Ok(OverlaydResponse::Ok)
            }
            OverlaydRequest::AllocateIp {
                service,
                join_global,
            } => {
                let ip = self.allocate_ip(&service, join_global)?;
                Ok(OverlaydResponse::Ip { ip })
            }
            OverlaydRequest::ReleaseIp { ip } => {
                self.release_ip(ip);
                Ok(OverlaydResponse::Ok)
            }
            OverlaydRequest::AttachContainer {
                handle,
                service,
                join_global,
                dns_server,
                dns_domain,
                ephemeral,
                isolation_network,
            } => {
                // A guest-managed attach takes a wholly separate path: it cannot
                // build a veth/HCN endpoint (the target is a VM, not a host
                // process), so it allocates the overlay identity + peer set and
                // returns it as `GuestConfig`. PID/HCN handles keep the existing
                // veth/HCN attach and return `Attached`.
                if let AttachHandle::GuestManaged { id } = handle {
                    // Record the overlay DNS resolver/zone the daemon staged for
                    // this node so the guest config can fall back to them (same
                    // bookkeeping `attach_container` does for the other handles).
                    if let Some(server) = dns_server {
                        self.dns_server_addr = Some(SocketAddr::new(server, 53));
                    }
                    if dns_domain.is_some() {
                        self.dns_domain.clone_from(&dns_domain);
                    }
                    let config = self
                        .attach_container_guest(
                            &id,
                            &service,
                            join_global,
                            dns_server,
                            dns_domain,
                            isolation_network,
                        )
                        .await?;
                    Ok(OverlaydResponse::GuestConfig(config))
                } else {
                    let result = self
                        .attach_container(
                            handle,
                            &service,
                            join_global,
                            ephemeral,
                            dns_server,
                            dns_domain,
                            isolation_network,
                        )
                        .await?;
                    Ok(OverlaydResponse::Attached(result))
                }
            }
            OverlaydRequest::DetachContainer { handle } => {
                if let AttachHandle::GuestManaged { id } = handle {
                    self.detach_container_guest(&id).await?;
                } else {
                    self.detach_container(handle).await?;
                }
                Ok(OverlaydResponse::Ok)
            }
            // `scope` selects the target device: `Global` (default) = the single
            // cluster transport; `Service { service }` = that service's
            // dedicated per-service transport.
            OverlaydRequest::AddPeer { peer, scope } => {
                let info = peer_spec_to_info(&peer)?;
                // VM-only overlay (macOS/Windows host adapter unavailable):
                // there is no host transport to program for the Global scope, so
                // WARN-AND-SKIP the on-device install instead of erroring. The
                // peer is still mirrored into `global_peers` below so guests can
                // reproduce the global peer set via the separate guest-config
                // push — the host simply doesn't join. `Some` transports are
                // unaffected.
                if matches!(scope, PeerScope::Global) && self.global_transport.is_none() {
                    tracing::warn!(
                        peer = %peer.public_key,
                        "global overlay has no host adapter (VM-only overlay); \
                         skipping host peer install — guests receive this peer via \
                         guest-config push"
                    );
                } else {
                    let transport = self.transport_for_scope(&scope)?;
                    Self::add_peer_on(transport, &info).await?;
                }
                // Record the peer's advertised NAT candidates (if any) so the
                // NAT maintenance tick can hole-punch / relay toward it. Stored
                // for both scopes keyed by public key (the cluster transport is
                // the one carrying packets either way). Empty candidate lists
                // are dropped from the map so the tick's borrow loop stays cheap.
                if peer.candidates.is_empty() {
                    self.peer_candidates.remove(&peer.public_key);
                } else {
                    let parsed: Vec<Candidate> = peer
                        .candidates
                        .iter()
                        .filter_map(wire_to_candidate)
                        .collect();
                    if parsed.is_empty() {
                        self.peer_candidates.remove(&peer.public_key);
                    } else {
                        self.peer_candidates.insert(peer.public_key.clone(), parsed);
                    }
                }
                // Mirror Global peers into `global_peers` so a guest-managed
                // attach can reproduce the host's global peer set for the guest.
                if matches!(scope, PeerScope::Global) {
                    self.global_peers.insert(peer.public_key.clone(), peer);
                }
                Ok(OverlaydResponse::Ok)
            }
            OverlaydRequest::RemovePeer { pubkey, scope } => {
                // VM-only overlay: no host transport for the Global scope, so the
                // on-device removal is a no-op — just drop it from `global_peers`
                // below. `Some` transports are unaffected.
                if matches!(scope, PeerScope::Global) && self.global_transport.is_none() {
                    tracing::warn!(
                        peer = %pubkey,
                        "global overlay has no host adapter (VM-only overlay); \
                         skipping host peer removal"
                    );
                } else {
                    let transport = self.transport_for_scope(&scope)?;
                    Self::remove_peer_on(transport, &pubkey).await?;
                }
                if matches!(scope, PeerScope::Global) {
                    self.global_peers.remove(&pubkey);
                }
                self.peer_candidates.remove(&pubkey);
                self.peer_connection_type.remove(&pubkey);
                Ok(OverlaydResponse::Ok)
            }
            OverlaydRequest::AddAllowedIp {
                pubkey,
                cidr,
                scope,
            } => {
                // VM-only overlay: no host device to plumb AllowedIPs into for the
                // Global scope — warn-and-skip. `Some` transports are unaffected.
                if matches!(scope, PeerScope::Global) && self.global_transport.is_none() {
                    tracing::warn!(
                        peer = %pubkey,
                        cidr = %cidr,
                        "global overlay has no host adapter (VM-only overlay); \
                         skipping host AllowedIP add"
                    );
                } else {
                    let transport = self.transport_for_scope(&scope)?;
                    Self::add_allowed_ip_on(transport, &pubkey, &cidr).await?;
                }
                Ok(OverlaydResponse::Ok)
            }
            OverlaydRequest::RemoveAllowedIp {
                pubkey,
                cidr,
                scope,
            } => {
                // VM-only overlay: no host device for the Global scope — the
                // removal is a no-op. `Some` transports are unaffected.
                if matches!(scope, PeerScope::Global) && self.global_transport.is_none() {
                    tracing::warn!(
                        peer = %pubkey,
                        cidr = %cidr,
                        "global overlay has no host adapter (VM-only overlay); \
                         skipping host AllowedIP removal"
                    );
                } else {
                    let transport = self.transport_for_scope(&scope)?;
                    Self::remove_allowed_ip_on(transport, &pubkey, &cidr).await?;
                }
                Ok(OverlaydResponse::Ok)
            }
            OverlaydRequest::RegisterDns { name, ip } => {
                self.register_dns(name, ip);
                Ok(OverlaydResponse::Ok)
            }
            OverlaydRequest::UnregisterDns { name } => {
                self.unregister_dns(&name);
                Ok(OverlaydResponse::Ok)
            }
            OverlaydRequest::WriteScopedResolver {
                zone,
                node_ip,
                port,
            } => {
                #[cfg(target_os = "macos")]
                {
                    zlayer_overlay::dns::write_scoped_resolver(&zone, node_ip, port).map_err(
                        |e| OverlaydError::Overlay(format!("write_scoped_resolver({zone}): {e}")),
                    )?;
                    Ok(OverlaydResponse::Ok)
                }
                #[cfg(not(target_os = "macos"))]
                {
                    let _ = (zone, node_ip, port);
                    Err(OverlaydError::Overlay(
                        "scoped resolver is macOS-only".into(),
                    ))
                }
            }
            OverlaydRequest::RemoveScopedResolver { zone } => {
                #[cfg(target_os = "macos")]
                {
                    zlayer_overlay::dns::remove_scoped_resolver(&zone).map_err(|e| {
                        OverlaydError::Overlay(format!("remove_scoped_resolver({zone}): {e}"))
                    })?;
                    Ok(OverlaydResponse::Ok)
                }
                #[cfg(not(target_os = "macos"))]
                {
                    let _ = zone;
                    Err(OverlaydError::Overlay(
                        "scoped resolver is macOS-only".into(),
                    ))
                }
            }
            OverlaydRequest::PruneOrphanBridges { live_bridge_names } => {
                let reclaimed = self.prune_orphan_bridges(&live_bridge_names).await;
                Ok(OverlaydResponse::PrunedBridges { reclaimed })
            }
            OverlaydRequest::Status => Ok(OverlaydResponse::Status(self.status_snapshot().await)),
            OverlaydRequest::NatTick => {
                self.nat_maintenance_tick().await?;
                Ok(OverlaydResponse::Ok)
            }
            OverlaydRequest::NatStatus => Ok(OverlaydResponse::NatStatus(
                self.nat_status_snapshot().await,
            )),
            OverlaydRequest::Shutdown => {
                self.shutdown_requested = true;
                self.teardown_global_overlay();
                Ok(OverlaydResponse::Ok)
            }
        }
    }

    // -- global overlay ------------------------------------------------------

    /// Bring up (or reuse) this node's base/global overlay.
    ///
    /// Idempotent: if a global transport is already live, reuse it (recreating
    /// without this guard could yank the kernel TUN out from under the running
    /// boringtun worker). Re-binds the IP allocator to `slice_cidr` if one is
    /// supplied so container IPs never collide across nodes.
    ///
    /// Both CIDR strings are validated before any field of `self` is written,
    /// so a malformed request leaves the server state exactly as it was.
    ///
    /// # Parameters
    /// - `deployment` / `instance_id`: stored on the server and combined (with
    ///   the `"g"` suffix) into the global interface name.
    /// - `cluster_cidr`: the cluster-wide overlay network; stored and used for
    ///   the host firewall allow/masquerade rules.
    /// - `slice_cidr`: this node's slice of the cluster network; when present
    ///   the IP allocator is rebuilt on it.
    /// - `wg_port`: stored as the overlay port and passed to the transport
    ///   config.
    /// - `nat`: optional operator-supplied NAT configuration; `None` keeps the
    ///   currently stored NAT config untouched.
    /// - `host_adapter_mandatory`: forwarded to `host_adapter_failure_is_fatal`
    ///   to decide whether a host-adapter creation failure aborts setup or
    ///   degrades to a VM-only overlay.
    ///
    /// # Returns
    /// The actual interface name of the global overlay device, or the marker
    /// string `"(host-adapter-disabled)"` when setup continued without a host
    /// adapter.
    ///
    /// # Errors
    /// Returns an error if `cluster_cidr` or `slice_cidr` does not parse, if
    /// key generation fails, if interface creation fails and the failure is
    /// fatal for this target, or if configuring the created interface fails.
    #[allow(clippy::too_many_lines)]
    #[allow(clippy::too_many_arguments)]
    async fn setup_global_overlay(
        &mut self,
        deployment: String,
        instance_id: String,
        cluster_cidr: &str,
        slice_cidr: Option<&str>,
        wg_port: u16,
        nat: Option<NatConfigSpec>,
        host_adapter_mandatory: bool,
    ) -> Result<String, OverlaydError> {
        // Validate every caller-supplied CIDR up front. Committing identity
        // fields first and failing afterwards would leave `self` describing a
        // deployment/port that was never actually set up (possibly while an
        // earlier transport is still live).
        let cluster: IpNetwork = cluster_cidr.parse().map_err(|e| {
            OverlaydError::Other(format!("invalid cluster CIDR {cluster_cidr}: {e}"))
        })?;
        let slice_net: Option<IpNetwork> = slice_cidr
            .map(|slice| {
                slice.parse::<IpNetwork>().map_err(|e| {
                    OverlaydError::Other(format!("invalid slice CIDR {slice}: {e}"))
                })
            })
            .transpose()?;

        // Inputs are valid: commit them.
        self.deployment = deployment;
        self.instance_id = instance_id;
        self.overlay_port = wg_port;
        self.cluster_cidr = Some(cluster);
        if let Some(slice_net) = slice_net {
            self.slice_cidr = Some(slice_net);
            self.ip_allocator = IpAllocator::new(slice_net);
        }
        // Thread the full operator-supplied NAT config (STUN/TURN servers,
        // timeouts, relay-server bind + credential) into overlayd. `None` means
        // the main daemon supplied no explicit config, so overlayd keeps its
        // built-in `NatConfig::default()` (NAT enabled, Google STUN). A `Some`
        // spec is converted verbatim — including the relay credential, stashed
        // separately so the relay server can be stood up with a cluster-shared
        // auth key on the first NAT tick.
        if let Some(spec) = nat {
            self.cluster_relay_credential = spec
                .relay_server
                .as_ref()
                .and_then(|r| r.auth_credential.clone());
            self.nat_config = Some(nat_config_spec_to_config(spec));
        }

        // Reuse path: both the recorded interface name and a live transport
        // must be present for the overlay to count as already active.
        if let Some(name) = self.global_interface.clone() {
            if self.global_transport.is_some() {
                tracing::debug!(
                    deployment = %self.deployment,
                    "Global overlay already active, reusing existing transport"
                );
                return Ok(name);
            }
        }

        let interface_name = make_interface_name(&[&self.deployment, &self.instance_id], "g");

        let (private_key, public_key) = OverlayTransport::generate_keys()
            .await
            .map_err(|e| OverlaydError::Overlay(format!("Failed to generate keys: {e}")))?;

        // The node's own overlay IP is the deterministic first-usable host of
        // its slice (reserved offset 1), NOT a racy `allocate()` that drifts by
        // allocation order. Containers draw from offset 2 onward, so the node
        // IP is stable across restarts and never collides with a container.
        let node_ip = self.ip_allocator.node_ip();
        self.transport_public_key = Some(public_key.clone());
        let physical_egress_ip = match zlayer_overlay::detect_physical_egress().await {
            Ok(egress) => Some(egress.ip),
            Err(e) => {
                tracing::warn!(
                    error = %e,
                    "failed to detect physical egress; WireGuard local_endpoint \
                     will bind UNSPECIFIED for the global overlay"
                );
                None
            }
        };
        let config = self.build_config(
            private_key,
            public_key,
            node_ip,
            16,
            self.overlay_port,
            physical_egress_ip,
        );
        // Remove any stale `-g` interface with this (now deterministic) name
        // left by a previous daemon instance, so the create below cleanly
        // REPLACES it instead of failing "File exists" or orphaning the old
        // one. With a stable per-host instance id the name is constant across
        // restarts, so exactly one global interface ever exists.
        #[cfg(target_os = "linux")]
        let _ = crate::netlink::delete_link_by_name(&interface_name).await;
        let mut transport = OverlayTransport::new(config, interface_name);

        // Creating the host overlay adapter is fatal on Linux (the kernel TUN IS
        // the container data path) but only DEGRADES on macOS/Windows: there,
        // Linux containers run in a VZ VM / WSL2 distro that creates its own
        // overlay device and meshes VM-to-VM over UDP, so the host adapter
        // (utun/Wintun, needs root/Administrator) is just the host's own overlay
        // membership and is NOT on the container data path. The allocator and
        // `node_ip` are already bound above, so guest-config push + IP allocation
        // keep working even when the host adapter is unavailable.
        // Map the (non-`Send`) `Box<dyn Error>` to an owned `String` BEFORE the
        // match so no non-`Send` value is held across the `configure().await`
        // below — the daemon's request handler future must stay `Send`.
        let create_result = transport
            .create_interface()
            .await
            .map_err(|e| e.to_string());
        let actual_name = match create_result {
            Ok(()) => {
                transport.configure(&[]).await.map_err(|e| {
                    OverlaydError::Overlay(format!("Failed to configure global overlay: {e}"))
                })?;
                // Read back the actual interface name (on macOS, the kernel
                // assigns utunN).
                let actual_name = transport.interface_name().to_string();
                self.node_ip = Some(node_ip);
                self.global_interface = Some(actual_name.clone());
                self.global_transport = Some(transport);
                actual_name
            }
            Err(e) if !host_adapter_failure_is_fatal(host_adapter_mandatory) => {
                // macOS / Windows: continue with a VM-only overlay. Leave
                // `global_transport == None` (the natural "no host adapter"
                // signal), keep `node_ip` so allocation/guest config are
                // unaffected, and SKIP `configure` (no device to program).
                tracing::warn!(
                    error = %e,
                    "host overlay adapter unavailable (needs root/Administrator); \
                     continuing with VM-only overlay — the host will not join the \
                     overlay, but containers running in the VM mesh VM-to-VM and IP \
                     allocation/guest config are unaffected"
                );
                self.node_ip = Some(node_ip);
                self.global_interface = None;
                self.global_transport = None;
                // No real device exists; return an honest marker so the IPC
                // response is a success without implying a live adapter.
                "(host-adapter-disabled)".to_string()
            }
            Err(e) => {
                // Linux (and any future fatal-on-failure target): unchanged —
                // a host-adapter creation failure aborts overlay setup.
                return Err(OverlaydError::Overlay(format!(
                    "Failed to create global overlay: {e}"
                )));
            }
        };

        // In rootless mode the daemon runs in its own network namespace and
        // `pasta` provides egress NAT + inbound port forwarding; the host-table
        // iptables setup below is at best a no-op inside the netns and at worst
        // spurious, so skip it entirely. Otherwise install the host firewall
        // rules as usual.
        if std::env::var_os("ZLAYER_ROOTLESS").is_none() {
            // Stop systemd-networkd / NetworkManager from managing the overlay
            // links overlayd just created. With a permissive default match they
            // try to bring `zl-*` up / run DHCP and (seen on a CI runner)
            // SIGABRT on the networkd watchdog while processing a `zl-*` Link
            // UP. Best-effort; reverted in `teardown_global_overlay`.
            zlayer_overlay::networkd::mark_overlay_interfaces_unmanaged();

            // Allow overlay traffic through the host firewall (UFW / firewalld /
            // a bare `iptables -P FORWARD DROP`). Without this, a container's DNS
            // query to the node overlay IP — and inter-service overlay traffic —
            // is dropped by the host's INPUT/FORWARD policy before it reaches
            // ZLayer's resolver. Best-effort: a host without `iptables` logs a
            // warning rather than aborting overlay setup.
            if let Err(e) =
                zlayer_overlay::firewall::ensure_overlay_subnet_rules(&cluster.to_string())
            {
                tracing::warn!(
                    error = %e,
                    cidr = %cluster,
                    "failed to install overlay firewall allow-rules; service DNS / \
                     cross-service traffic may be blocked by the host firewall"
                );
            }

            // SNAT overlay-sourced egress so containers can reach the LAN/internet.
            // The allow-rules above + `ip_forward` only get the packet *forwarded*
            // out the WAN NIC; without masquerade it leaves with a private
            // `10.200.0.0/16` source and replies never route back (ENETUNREACH /
            // hangs for `wget http://<public-ip>`). Best-effort, same as above.
            if let Err(e) =
                zlayer_overlay::firewall::ensure_overlay_masquerade(&cluster.to_string())
            {
                tracing::warn!(
                    error = %e,
                    cidr = %cluster,
                    "failed to install overlay egress masquerade; overlay containers \
                     may be unable to reach the LAN / internet"
                );
            }
        } else {
            tracing::info!(
                "rootless mode: skipping host iptables (pasta provides egress + port forwarding)"
            );
        }

        Ok(actual_name)
    }

    /// Tear down the node's base overlay (e.g. on full uninstall / shutdown).
    ///
    /// Shuts down the global transport if one is live, clears the recorded
    /// interface name and transport public key, then reverts the host network
    /// state this daemon changed: forwarding sysctls (Linux), the overlay
    /// firewall rules, the macOS pf anchor, the networkd/NetworkManager
    /// "unmanaged" markers, and (Linux) every host route, host-side veth and
    /// bridge recorded as created. Every step is best-effort; a failure is
    /// logged and the remaining steps still run.
    fn teardown_global_overlay(&mut self) {
        if let Some(mut live) = self.global_transport.take() {
            tracing::info!("Shutting down global overlay");
            live.shutdown();
        }
        self.transport_public_key = None;
        self.global_interface = None;

        // Undo sticky host state so a clean stop restores connectivity without
        // a reboot: forwarding sysctls and the overlay iptables chains outlive
        // both the daemon and an `iptables -F`.
        #[cfg(target_os = "linux")]
        self.revert_forwarding();
        zlayer_overlay::firewall::remove_overlay_masquerade();
        zlayer_overlay::firewall::remove_overlay_subnet_rules();
        // Per-member isolation removal intentionally keeps the
        // ZLAYER-OVERLAY-ISO chain and its FORWARD jump around for other
        // members; a full teardown drops the whole chain so nothing leaks.
        zlayer_overlay::firewall::remove_overlay_isolation();
        // macOS only: remove the pf overlay anchor and the marked
        // `/etc/pf.conf` lines installed for the cluster/DNS ports, which would
        // otherwise persist on disk after the daemon stops. The backend treats
        // a missing anchor / not-root / disabled-pf situation as a successful
        // no-op. Gated so Linux/Windows teardown is unaffected.
        #[cfg(target_os = "macos")]
        if let Err(e) = zlayer_overlay::firewall::remove_overlay_rules() {
            tracing::warn!(error = %e, "failed to remove macOS pf overlay rules during teardown");
        }
        // Drop the systemd-networkd / NetworkManager "unmanaged" drop-ins that
        // setup installed.
        zlayer_overlay::networkd::unmark_overlay_interfaces_unmanaged();

        // Linux: remove the netlink resources this daemon created. The netlink
        // helpers are async while this function stays sync, so the work is
        // driven to completion on a runtime from here. Deletion order is
        // routes (they name the veth as their output device), then host-side
        // veths, then bridges. Each delete is best-effort and idempotent:
        // already-removed resources are treated as success by the helpers, and
        // a real failure is logged without aborting the rest.
        #[cfg(target_os = "linux")]
        {
            let host_routes: Vec<(IpAddr, u8, String)> =
                std::mem::take(&mut self.created_host_routes);
            let host_veths: Vec<String> = self.created_veths.drain().collect();
            let bridge_links: Vec<String> = self.created_bridges.drain().collect();

            // One future performing all three passes; it is consumed by
            // whichever runtime path below ends up driving it.
            let revert_links = async {
                for (dest, prefix, dev) in &host_routes {
                    let outcome = crate::netlink::delete_route_via_dev(*dest, *prefix, dev).await;
                    if let Err(e) = outcome {
                        tracing::warn!(
                            dest = %dest, prefix, dev = %dev, error = %e,
                            "teardown: failed to delete host route (continuing)"
                        );
                    }
                }
                for veth in &host_veths {
                    let outcome = crate::netlink::delete_link_by_name(veth).await;
                    if let Err(e) = outcome {
                        tracing::warn!(
                            veth = %veth, error = %e,
                            "teardown: failed to delete host-side veth (continuing)"
                        );
                    }
                }
                for bridge in &bridge_links {
                    let outcome = crate::netlink::delete_link_by_name(bridge).await;
                    if let Err(e) = outcome {
                        tracing::warn!(
                            bridge = %bridge, error = %e,
                            "teardown: failed to delete bridge (continuing)"
                        );
                    }
                }
            };

            if let Ok(handle) = tokio::runtime::Handle::try_current() {
                // Already inside a runtime: block this worker in place.
                // NOTE(review): `block_in_place` requires the multi-thread
                // runtime flavor — presumably what the daemon runs on; confirm
                // for any current-thread test harness.
                tokio::task::block_in_place(|| handle.block_on(revert_links));
            } else {
                // No ambient runtime (e.g. a non-async shutdown path): build a
                // throwaway current-thread runtime just for these deletes.
                let throwaway = tokio::runtime::Builder::new_current_thread()
                    .enable_all()
                    .build();
                match throwaway {
                    Ok(rt) => rt.block_on(revert_links),
                    Err(e) => tracing::warn!(
                        error = %e,
                        "teardown: could not build a runtime to revert netlink \
                             resources; veths/routes/bridges left in place"
                    ),
                }
            }
        }
    }

    /// Turn on IP forwarding for an overlay container attach, limited to the
    /// address family in use and, for IPv6, to the overlay devices themselves.
    ///
    /// IPv4: `net.ipv4.ip_forward` is a single global switch, so it is enabled
    /// globally. That only lets the host route transit traffic and does not
    /// disturb its own INPUT / reply path. The value found before the first
    /// change is remembered once so teardown can put it back; if it cannot be
    /// read, `"0"` is recorded as the prior value.
    ///
    /// IPv6: enabling `net.ipv6.conf.all.forwarding` would force
    /// `accept_ra=0` + `autoconf=0` on every IPv6 interface, dropping the
    /// RA-learned default route and path-MTU on the public NIC and
    /// blackholing the host's own larger replies. Forwarding is therefore set
    /// only through `net.ipv6.conf.<dev>.forwarding` on the host-side veth and,
    /// when given, the bridge. Each device whose sysctl write succeeds is
    /// recorded so it can be reverted later.
    ///
    /// `is_v6` selects whether the IPv6 step runs at all; `veth_host` is the
    /// host-side veth name; `bridge_name` is the bridge the veth is attached
    /// to, if any.
    #[cfg(target_os = "linux")]
    fn enable_forwarding_for_attach(
        &mut self,
        is_v6: bool,
        veth_host: &str,
        bridge_name: Option<&str>,
    ) {
        // Snapshot the pre-existing IPv4 setting exactly once, then enable it.
        // The write is best-effort; its result is deliberately ignored.
        if self.prev_ipv4_forward.is_none() {
            let snapshot = crate::netlink::read_sysctl("net.ipv4.ip_forward");
            self.prev_ipv4_forward = Some(snapshot.unwrap_or_else(|_| "0".to_string()));
        }
        let _ = crate::netlink::set_sysctl("net.ipv4.ip_forward", "1");

        if !is_v6 {
            return;
        }

        // Per-interface IPv6 forwarding only, never `all.forwarding`. The
        // device name is spliced into the dotted sysctl key verbatim; overlay
        // device names never contain dots, so the key stays unambiguous.
        let touched = &mut self.ipv6_forward_ifaces;
        let mut enable_on = |dev: &str| {
            let key = format!("net.ipv6.conf.{dev}.forwarding");
            if crate::netlink::set_sysctl(&key, "1").is_ok() {
                touched.insert(dev.to_string());
            }
        };
        enable_on(veth_host);
        if let Some(bridge) = bridge_name {
            enable_on(bridge);
        }
    }

    /// Undo the forwarding sysctls turned on by
    /// [`Self::enable_forwarding_for_attach`].
    ///
    /// Writes the remembered IPv4 value back (and forgets it, so a later
    /// attach snapshots afresh), then sets per-interface IPv6 forwarding to
    /// `0` on every device that was recorded, emptying that record. All writes
    /// are best-effort: a failure (device already gone, `/proc` not writable)
    /// is ignored, so shutdown never fails here — at worst the earlier sticky
    /// state stays behind.
    #[cfg(target_os = "linux")]
    fn revert_forwarding(&mut self) {
        let remembered = self.prev_ipv4_forward.take();
        if let Some(prev) = remembered {
            let _ = crate::netlink::set_sysctl("net.ipv4.ip_forward", &prev);
        }

        self.ipv6_forward_ifaces.drain().for_each(|dev| {
            let key = format!("net.ipv6.conf.{dev}.forwarding");
            let _ = crate::netlink::set_sysctl(&key, "0");
        });
    }

    // -- service overlay -----------------------------------------------------

    /// Set up the overlay segment that backs `service` on this node.
    ///
    /// Resolves `mode`, records the resolved mode for `service` (so the
    /// container ATTACH path can branch on it later), and dispatches to the
    /// matching topology-specific setup routine.
    ///
    /// Returns the [`ServiceOverlayInfo`] produced by that routine.
    ///
    /// # Errors
    /// Propagates whatever error the selected setup routine returns (for
    /// example subnet exhaustion or a bridge that cannot be created).
    #[cfg(target_os = "linux")]
    async fn setup_service_overlay(
        &mut self,
        service: &str,
        mode: OverlayMode,
    ) -> Result<ServiceOverlayInfo, OverlaydError> {
        // Decision surface is the predicates on the resolved `OverlayMode`, not
        // an ad-hoc variant match:
        //   - uses_shared_bridge()  -> the single node-wide shared bridge.
        //   - uses_per_service_wg() -> a dedicated per-service WireGuard device.
        //   - anything else         -> per-service Linux bridge carried on the
        //     single cluster-wide WireGuard interface. This covers both the
        //     default (Auto) and isolation-scoped modes: isolation uses the
        //     same topology, and its L3 fence is applied at ATTACH time via
        //     `isolation_network`, not during segment setup.
        let resolved = mode.resolve();
        self.service_modes.insert(service.to_string(), resolved);

        if resolved.uses_shared_bridge() {
            self.setup_service_overlay_shared_bridge(service).await
        } else if resolved.uses_per_service_wg() {
            self.setup_service_overlay_dedicated(service).await
        } else {
            // Previously split into two byte-identical branches (isolation
            // scope vs. default); merged because both take this same path.
            self.setup_service_overlay_cluster_wg(service).await
        }
    }

    /// `Auto`-mode per-service overlay (Linux): a per-service Linux bridge backed
    /// by the SINGLE cluster-wide `WireGuard` transport (the service subnet is
    /// plumbed onto the cluster device's `AllowedIPs`). This is the original
    /// default `setup_service_overlay` body, returning a [`ServiceOverlayInfo`]
    /// with the bridge name and all dedicated-device identity fields `None`
    /// (`Auto` shares the cluster device).
    ///
    /// Returns the bridge name on success.
    ///
    /// # Errors
    /// Returns an error if subnet assignment fails (exhaustion), if the bridge
    /// cannot be created, or if the cluster transport rejects the `AllowedIPs`
    /// update.
    #[cfg(target_os = "linux")]
    #[allow(clippy::too_many_lines)]
    async fn setup_service_overlay_cluster_wg(
        &mut self,
        service: &str,
    ) -> Result<ServiceOverlayInfo, OverlaydError> {
        // 1. Idempotency check.
        if let Some(existing) = self.service_bridges.get(service) {
            let name = existing.name.clone();
            tracing::debug!(service = %service, bridge = %name, "Service bridge already active, reusing");
            return Ok(cluster_wg_overlay_info(name));
        }

        // 2. Assign subnet via the (currently local) ServiceSubnetRegistry.
        self.ensure_service_subnet_registry()?;
        let subnet: ipnet::IpNet = {
            let registry = self
                .service_subnet_registry
                .as_mut()
                .expect("ensure_service_subnet_registry leaves Some");
            let node_key = self.local_node_id.to_string();
            registry.assign(service, &node_key).map_err(|e| {
                OverlaydError::Overlay(format!(
                    "ServiceSubnetRegistry::assign({service}, {node_key}) failed: {e}"
                ))
            })?
        };

        // 3+4+6. Create the per-service Linux bridge, assign its gateway, bring
        // it up, build the per-service IpAllocator, and record it.
        let bridge_name = self.create_service_bridge(service, subnet).await?;

        // 5. Plumb subnet into the cluster transport's local AllowedIPs so the
        // single cluster device carries this service's cross-node traffic
        // (Shared mode shares one crypto context for every service).
        if let Some(ref cluster) = self.global_transport {
            if let Some(ref pubkey) = self.local_wg_pubkey {
                if let Err(e) = cluster.add_allowed_ip(pubkey, subnet).await {
                    tracing::warn!(
                        service = %service,
                        subnet = %subnet,
                        error = %e,
                        "Failed to add service subnet to cluster transport AllowedIPs (non-fatal)"
                    );
                }
            } else {
                tracing::debug!(service = %service, "local_wg_pubkey not yet set; skipping cluster AllowedIPs update");
            }
        }

        Ok(cluster_wg_overlay_info(bridge_name))
    }

    /// `Shared`-mode per-service overlay (Linux).
    ///
    /// Attaches `service` to the one node-wide shared bridge obtained from
    /// `ensure_shared_bridge` (created on first use, reused afterwards). No
    /// per-service bridge and no per-service `WireGuard` device is created
    /// here. The service is recorded in `service_interfaces`, pointing at the
    /// shared bridge, so presence checks and the attach path can resolve it.
    ///
    /// Returns a [`ServiceOverlayInfo`] built by `shared_overlay_info` from the
    /// shared bridge name.
    ///
    /// # Errors
    /// Propagates any error from `ensure_shared_bridge` (shared-subnet
    /// assignment or bridge creation/addressing/bring-up).
    #[cfg(target_os = "linux")]
    async fn setup_service_overlay_shared_bridge(
        &mut self,
        service: &str,
    ) -> Result<ServiceOverlayInfo, OverlaydError> {
        let shared_name = self.ensure_shared_bridge().await?;

        // Map the service onto the shared bridge for later lookups.
        let service_key = service.to_string();
        self.service_interfaces
            .insert(service_key, shared_name.clone());

        tracing::info!(service = %service, bridge = %shared_name, "Service attached to shared node-wide bridge");
        Ok(shared_overlay_info(shared_name))
    }

    /// Ensure the single node-wide shared Linux bridge exists, returning its
    /// name. Created once with its own subnet (drawn from the same
    /// `ServiceSubnetRegistry` every service subnet comes from, under a fixed
    /// reserved key so it never collides with a real service) and plumbed onto
    /// the cluster transport's `AllowedIPs` so shared containers are
    /// mesh-reachable across nodes. Subsequent calls return the existing name.
    ///
    /// # Errors
    /// Returns an error if subnet assignment fails or the bridge cannot be
    /// created/addressed/brought up.
    #[cfg(target_os = "linux")]
    async fn ensure_shared_bridge(&mut self) -> Result<String, OverlaydError> {
        use zlayer_overlay::allocator::IpAllocator as OverlayIpAllocator;

        if let Some(existing) = self.shared_bridge.as_ref() {
            return Ok(existing.name.clone());
        }

        // One subnet for the whole shared bridge. Use a fixed reserved key in the
        // registry (never a real service name) so the shared bridge gets exactly
        // one stable subnet, distinct from every per-service subnet.
        self.ensure_service_subnet_registry()?;
        let subnet: ipnet::IpNet = {
            let registry = self
                .service_subnet_registry
                .as_mut()
                .expect("ensure_service_subnet_registry leaves Some");
            let node_key = self.local_node_id.to_string();
            registry.assign(SHARED_BRIDGE_REGISTRY_KEY, &node_key).map_err(|e| {
                OverlaydError::Overlay(format!(
                    "ServiceSubnetRegistry::assign({SHARED_BRIDGE_REGISTRY_KEY}, {node_key}) failed: {e}"
                ))
            })?
        };

        // Deterministic, IFNAMSIZ-safe shared-bridge name (one per node). Use the
        // same naming helper as per-service bridges with a fixed key so it stays
        // <= 15 chars and is unambiguous (`zl-...-sh`).
        let bridge_name =
            make_interface_name(&[&self.deployment, &self.instance_id, "shared"], "sh");

        if let Err(e) = crate::netlink::create_bridge(&bridge_name).await {
            return Err(OverlaydError::Overlay(format!(
                "create_bridge({bridge_name}) failed: {e}"
            )));
        }
        if let Err(e) = crate::netlink::set_bridge_stp(&bridge_name, false) {
            tracing::warn!(bridge = %bridge_name, error = %e, "set_bridge_stp(off) failed (non-fatal)");
        }

        // Flush stale addresses first: `create_bridge` is idempotent on EEXIST, so
        // a shared bridge that survived a restart would otherwise accumulate a
        // second gateway (the same dual-address bug fixed for per-service bridges).
        let gateway = first_usable_ip(subnet);
        if let Err(e) = crate::netlink::flush_addresses_on_link_by_name(&bridge_name).await {
            tracing::warn!(bridge = %bridge_name, error = %e, "flush_addresses_on_link_by_name failed (non-fatal)");
        }
        if let Err(e) =
            crate::netlink::add_address_to_link_by_name(&bridge_name, gateway, subnet.prefix_len())
                .await
        {
            let _ = crate::netlink::delete_bridge(&bridge_name).await;
            return Err(OverlaydError::Overlay(format!(
                "add_address_to_link_by_name({bridge_name}, {gateway}/{}) failed: {e}",
                subnet.prefix_len()
            )));
        }
        if let Err(e) = crate::netlink::set_link_up_by_name(&bridge_name).await {
            let _ = crate::netlink::delete_bridge(&bridge_name).await;
            return Err(OverlaydError::Overlay(format!(
                "set_link_up_by_name({bridge_name}) failed: {e}"
            )));
        }

        // Track the shared bridge for global teardown (deleting the link drops
        // its gateway address + up state).
        self.created_bridges.insert(bridge_name.clone());

        let mut ip_allocator = OverlayIpAllocator::new(&subnet.to_string()).map_err(|e| {
            OverlaydError::Overlay(format!("IpAllocator::new({subnet}) failed: {e}"))
        })?;
        let _ = ip_allocator.allocate_specific(gateway);

        // Plumb the shared subnet onto the cluster transport's AllowedIPs so the
        // single cluster device carries shared-bridge cross-node traffic (same
        // mechanism the cluster-WG per-service path uses).
        if let Some(ref cluster) = self.global_transport {
            if let Some(ref pubkey) = self.local_wg_pubkey {
                if let Err(e) = cluster.add_allowed_ip(pubkey, subnet).await {
                    tracing::warn!(
                        subnet = %subnet,
                        error = %e,
                        "Failed to add shared-bridge subnet to cluster transport AllowedIPs (non-fatal)"
                    );
                }
            } else {
                tracing::debug!(
                    "local_wg_pubkey not yet set; skipping shared-bridge cluster AllowedIPs update"
                );
            }
        }

        self.shared_bridge = Some(ServiceBridge {
            name: bridge_name.clone(),
            subnet,
            gateway,
            ip_allocator,
        });

        tracing::info!(bridge = %bridge_name, subnet = %subnet, gateway = %gateway, "Shared node-wide bridge created");
        Ok(bridge_name)
    }

    /// Create the per-service Linux bridge for `service` on `subnet`, assign its
    /// gateway, bring it up, build the per-service [`IpAllocator`], and record it
    /// in `service_bridges` + `service_interfaces`. Returns the bridge name.
    ///
    /// Shared and Dedicated mode share this bridge mechanic verbatim — the ONLY
    /// difference between the two modes is which `WireGuard` device the service
    /// subnet/peers are plumbed onto (the single cluster transport for Shared,
    /// the dedicated per-service transport for Dedicated). This helper does NOT
    /// touch any transport's `AllowedIPs`; the caller does that against the
    /// device it owns.
    ///
    /// # Errors
    /// Returns an error if the bridge cannot be created, addressed, or brought
    /// up, or if the per-service `IpAllocator` cannot be built.
    #[cfg(target_os = "linux")]
    async fn create_service_bridge(
        &mut self,
        service: &str,
        subnet: ipnet::IpNet,
    ) -> Result<String, OverlaydError> {
        use zlayer_overlay::allocator::IpAllocator as OverlayIpAllocator;

        let bridge_name = make_interface_name(&[&self.deployment, &self.instance_id, service], "b");

        if let Err(e) = crate::netlink::create_bridge(&bridge_name).await {
            return Err(OverlaydError::Overlay(format!(
                "create_bridge({bridge_name}) failed: {e}"
            )));
        }
        if let Err(e) = crate::netlink::set_bridge_stp(&bridge_name, false) {
            tracing::warn!(bridge = %bridge_name, error = %e, "set_bridge_stp(off) failed (non-fatal)");
        }

        // Gateway = first usable host in the subnet, assigned to the bridge.
        // Flush any pre-existing addresses FIRST: `create_bridge` is idempotent
        // on EEXIST, so a bridge that survived a restart would otherwise keep its
        // old gateway and we'd stack the new one on top (the observed dual
        // /28 + /26 bug). Flushing makes the assignment idempotent and self-heals
        // such bridges. Non-fatal: on a brand-new bridge there is nothing to flush.
        let gateway = first_usable_ip(subnet);
        if let Err(e) = crate::netlink::flush_addresses_on_link_by_name(&bridge_name).await {
            tracing::warn!(bridge = %bridge_name, error = %e, "flush_addresses_on_link_by_name failed (non-fatal)");
        }
        if let Err(e) =
            crate::netlink::add_address_to_link_by_name(&bridge_name, gateway, subnet.prefix_len())
                .await
        {
            let _ = crate::netlink::delete_bridge(&bridge_name).await;
            return Err(OverlaydError::Overlay(format!(
                "add_address_to_link_by_name({bridge_name}, {gateway}/{}) failed: {e}",
                subnet.prefix_len()
            )));
        }
        if let Err(e) = crate::netlink::set_link_up_by_name(&bridge_name).await {
            let _ = crate::netlink::delete_bridge(&bridge_name).await;
            return Err(OverlaydError::Overlay(format!(
                "set_link_up_by_name({bridge_name}) failed: {e}"
            )));
        }

        // Track the per-service bridge for global teardown (deleting the link
        // drops its gateway address + up state).
        self.created_bridges.insert(bridge_name.clone());

        // Build per-service IpAllocator, reserve the gateway.
        let mut ip_allocator = OverlayIpAllocator::new(&subnet.to_string()).map_err(|e| {
            OverlaydError::Overlay(format!("IpAllocator::new({subnet}) failed: {e}"))
        })?;
        let _ = ip_allocator.allocate_specific(gateway);

        self.service_bridges.insert(
            service.to_string(),
            ServiceBridge {
                name: bridge_name.clone(),
                subnet,
                gateway,
                ip_allocator,
            },
        );
        self.service_interfaces
            .insert(service.to_string(), bridge_name.clone());

        tracing::info!(service = %service, bridge = %bridge_name, subnet = %subnet, gateway = %gateway, "Service bridge created");
        Ok(bridge_name)
    }

    /// Non-Linux variant of `setup_service_overlay`.
    ///
    /// Resolves `mode`, records the resolved mode for `service` in
    /// `service_modes` (the container ATTACH path reads it to differentiate the
    /// modes per-OS), and hands off to the matching mode-specific setup:
    ///
    /// - `uses_per_service_wg()` -> `setup_service_overlay_dedicated`, the
    ///   cross-platform dedicated path (a per-service `WireGuard` device).
    /// - `uses_shared_bridge()` -> `setup_service_overlay_shared_bridge`, which
    ///   registers the node-wide shared placeholder.
    /// - `uses_isolation_scope()` and everything else (`Auto`) ->
    ///   `setup_service_overlay_cluster_wg`, which registers a per-service
    ///   placeholder. For Isolated the L3 fence is applied at ATTACH time via
    ///   `isolation_network`, not here.
    ///
    /// # Errors
    /// Propagates whatever the selected mode-specific setup returns. The
    /// placeholder-only paths in this chunk always return `Ok`; the dedicated
    /// path can fail.
    #[cfg(not(target_os = "linux"))]
    async fn setup_service_overlay(
        &mut self,
        service: &str,
        mode: OverlayMode,
    ) -> Result<ServiceOverlayInfo, OverlaydError> {
        // Same predicate-driven decision surface as Linux (see
        // `zlayer_types::overlay`); only the resolved mode is recorded here.
        let resolved = mode.resolve();
        self.service_modes.insert(service.to_string(), resolved);

        if resolved.uses_per_service_wg() {
            return self.setup_service_overlay_dedicated(service).await;
        }
        if resolved.uses_shared_bridge() {
            return self.setup_service_overlay_shared_bridge(service).await;
        }
        if resolved.uses_isolation_scope() {
            // Isolated == Auto topology; the branch is spelled out on purpose so
            // a newly added mode cannot silently fall through to the default.
            return self.setup_service_overlay_cluster_wg(service).await;
        }
        self.setup_service_overlay_cluster_wg(service).await
    }

    /// `Auto`-mode per-service overlay (non-Linux).
    ///
    /// No bridge is provisioned on these targets; the method only derives the
    /// deterministic per-service name (`-b` suffix) and registers it in
    /// `service_interfaces` as a placeholder so presence checks work. The same
    /// name is reported back through `cluster_wg_overlay_info`.
    ///
    /// # Errors
    /// Always returns `Ok`; the `Result` keeps the signature aligned with the
    /// Linux variant.
    #[cfg(not(target_os = "linux"))]
    #[allow(clippy::unused_async)]
    async fn setup_service_overlay_cluster_wg(
        &mut self,
        service: &str,
    ) -> Result<ServiceOverlayInfo, OverlaydError> {
        let stand_in = make_interface_name(&[&self.deployment, &self.instance_id, service], "b");
        let info = cluster_wg_overlay_info(stand_in.clone());
        self.service_interfaces.insert(service.to_string(), stand_in);
        tracing::debug!(service = %service, "Service overlay bridge setup is Linux-only; using direct networking placeholder");
        Ok(info)
    }

    /// `Shared`-mode per-service overlay (non-Linux).
    ///
    /// Nothing is provisioned here: no per-service `WireGuard` device and no
    /// per-service bridge. The method registers one fixed, node-wide
    /// placeholder name (`"shared"` key, `-sh` suffix) for `service` in
    /// `service_interfaces` so presence checks succeed, and reports that name
    /// through `shared_overlay_info`. Every Shared service on the node maps to
    /// the same placeholder because it names the shared data-plane (VZ NAT on
    /// macOS / shared HCN NAT on Windows), not a per-service interface.
    ///
    /// # Errors
    /// Always returns `Ok`; the `Result` keeps the signature aligned with the
    /// Linux variant.
    #[cfg(not(target_os = "linux"))]
    #[allow(clippy::unused_async)]
    async fn setup_service_overlay_shared_bridge(
        &mut self,
        service: &str,
    ) -> Result<ServiceOverlayInfo, OverlaydError> {
        let shared_stand_in =
            make_interface_name(&[&self.deployment, &self.instance_id, "shared"], "sh");
        let info = shared_overlay_info(shared_stand_in.clone());
        self.service_interfaces
            .insert(service.to_string(), shared_stand_in);
        tracing::debug!(service = %service, "Shared-mode service uses the node-wide shared data-plane (VZ NAT on macOS / shared HCN NAT on Windows)");
        Ok(info)
    }

    /// Dedicated-mode per-service overlay: stand up a *second* real `WireGuard`
    /// device for `service` with its own crypto context, listen port, overlay
    /// IP, and subnet — distinct from the single cluster transport.
    ///
    /// The cross-platform core (identity, subnet assign, transport bring-up,
    /// marker persist, status) runs on every OS; only the *attachment* of
    /// containers onto the device is platform-gated (see the per-OS
    /// `attach_dedicated_service` variants).
    ///
    /// The `name` in the returned [`ServiceOverlayInfo`] is the container-attach
    /// handle: the per-service bridge name on Linux, the per-service network
    /// name on Windows, and the dedicated interface name elsewhere. Repeat calls
    /// for an already-live service report the SAME handle as the first call.
    ///
    /// # Errors
    /// Returns an error if port/key/subnet allocation, guest IPAM construction,
    /// transport bring-up, or the platform attachment fails. A failure to
    /// persist the marker is only logged (the device stays live).
    #[allow(clippy::too_many_lines)]
    async fn setup_service_overlay_dedicated(
        &mut self,
        service: &str,
    ) -> Result<ServiceOverlayInfo, OverlaydError> {
        // ----- cross-platform core (runs on every OS) -----

        // 1. Idempotency: an existing dedicated transport returns its identity.
        //    Report the same attach handle the first call returned rather than
        //    the raw WireGuard interface name, so callers see a stable `name`.
        if let Some(st) = self.service_transports.get(service) {
            // Linux: the per-service bridge recorded by `create_service_bridge`
            // (falls back to the device name if no bridge entry exists).
            #[cfg(target_os = "linux")]
            let name = self
                .service_bridges
                .get(service)
                .map_or_else(|| st.interface.clone(), |bridge| bridge.name.clone());
            // Windows: the per-service network name; must stay in sync with the
            // Windows `attach_dedicated_service` variant.
            #[cfg(target_os = "windows")]
            let name = format!(
                "{}-svc-{service}",
                overlay_network_name(&self.deployment_or_default())
            );
            // Elsewhere the device itself is the attach handle.
            #[cfg(all(not(target_os = "linux"), not(target_os = "windows")))]
            let name = st.interface.clone();

            return Ok(dedicated_overlay_info(
                name,
                &st.public_key,
                st.listen_port,
                st.overlay_ip,
                st.subnet,
            ));
        }

        // 2. Identity: reuse a stable identity from the marker if one exists
        //    (so the device re-binds the same key + port across restarts),
        //    otherwise mint a fresh port + keypair + interface name. A recorded
        //    entry only counts when ALL four identity fields are present.
        let marker_path =
            zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();
        let recorded = NetworkState::load(&marker_path)
            .get(&owner_for_service(service))
            .cloned();
        let recorded_identity = recorded.as_ref().and_then(|entry| {
            Some((
                entry.wg_private_key.clone()?,
                entry.wg_public_key.clone()?,
                entry.wg_port?,
                entry.interface.clone()?,
            ))
        });

        let (private_key, public_key, listen_port, iface_hint) = match recorded_identity {
            Some((priv_key, pub_key, port, iface)) => {
                // Keep the allocator from handing the recorded port out again.
                self.dedicated_ports.reserve(port);
                (priv_key, pub_key, port, iface)
            }
            None => {
                let port = self.dedicated_ports.allocate()?;
                let (priv_key, pub_key) = OverlayTransport::generate_keys()
                    .await
                    .map_err(|e| OverlaydError::Overlay(format!("Failed to generate keys: {e}")))?;
                let iface =
                    make_interface_name(&[&self.deployment, &self.instance_id, service], "d");
                (priv_key, pub_key, port, iface)
            }
        };

        // 3. Subnet: assign from the same registry the other modes use, so
        //    per-service subnets stay globally unique regardless of mode.
        self.ensure_service_subnet_registry()?;
        let subnet: ipnet::IpNet = {
            let registry = self
                .service_subnet_registry
                .as_mut()
                .expect("ensure_service_subnet_registry leaves Some");
            let node_key = self.local_node_id.to_string();
            registry.assign(service, &node_key).map_err(|e| {
                OverlaydError::Overlay(format!(
                    "ServiceSubnetRegistry::assign({service}, {node_key}) failed: {e}"
                ))
            })?
        };
        let overlay_ip = first_usable_ip(subnet);

        // Build the guest-attach IPAM bounded to the service subnet, reserving
        // the node's own dedicated-device IP so a joining guest never draws it.
        // Done BEFORE the device exists: it is pure in-memory work, so failing
        // here leaves no live device or persisted marker behind.
        let mut ip_allocator = zlayer_overlay::allocator::IpAllocator::new(&subnet.to_string())
            .map_err(|e| {
                OverlaydError::Overlay(format!("IpAllocator::new({subnet}) failed: {e}"))
            })?;
        let _ = ip_allocator.allocate_specific(overlay_ip);

        // 4. Build + bring up the dedicated transport. The device's overlay CIDR
        //    is the service subnet (so boringtun routes that subnet over THIS
        //    device), and its listen port is the dedicated port.
        let physical_egress_ip = match zlayer_overlay::detect_physical_egress().await {
            Ok(egress) => Some(egress.ip),
            Err(e) => {
                tracing::warn!(
                    error = %e,
                    service = %service,
                    "failed to detect physical egress; WireGuard local_endpoint \
                     will bind UNSPECIFIED for the dedicated overlay"
                );
                None
            }
        };
        let config = self.build_config(
            private_key.clone(),
            public_key.clone(),
            overlay_ip,
            subnet.prefix_len(),
            listen_port,
            physical_egress_ip,
        );
        let mut transport = OverlayTransport::new(config, iface_hint);
        transport.create_interface().await.map_err(|e| {
            OverlaydError::Overlay(format!(
                "Failed to create dedicated overlay for {service}: {e}"
            ))
        })?;
        transport.configure(&[]).await.map_err(|e| {
            OverlaydError::Overlay(format!(
                "Failed to configure dedicated overlay for {service}: {e}"
            ))
        })?;
        let actual_iface = transport.interface_name().to_string();

        // 5. Persist the marker so the identity survives restarts. Match the
        //    base entry shape (owner/kind/name/id/subnet) plus the dedicated WG
        //    fields. Persistence failure is non-fatal: the device is already up.
        let mut marker = NetworkState::load(&marker_path);
        marker.upsert(ManagedNetwork {
            owner: owner_for_service(service),
            kind: "wg-dedicated".to_string(),
            name: actual_iface.clone(),
            id: public_key.clone(),
            subnet: subnet.to_string(),
            wg_port: Some(listen_port),
            wg_private_key: Some(private_key),
            wg_public_key: Some(public_key.clone()),
            interface: Some(actual_iface.clone()),
        });
        if let Err(e) = marker.save(&marker_path) {
            tracing::warn!(service = %service, error = %e, path = %marker_path.display(), "failed to persist dedicated-overlay marker (device still live)");
        }

        // 6. Record the live transport together with its guest IPAM.
        self.service_transports.insert(
            service.to_string(),
            ServiceTransport {
                transport,
                interface: actual_iface.clone(),
                public_key: public_key.clone(),
                listen_port,
                overlay_ip,
                subnet,
                ip_allocator,
            },
        );

        tracing::info!(
            service = %service,
            interface = %actual_iface,
            listen_port,
            subnet = %subnet,
            overlay_ip = %overlay_ip,
            "Dedicated per-service overlay device created"
        );

        // ----- platform-gated attachment -----
        // `name` in the returned info is the container-attach handle: the bridge
        // name on Linux, the per-service network name on Windows, the dedicated
        // interface elsewhere.
        let name = self
            .attach_dedicated_service(service, subnet, overlay_ip)
            .await?;

        Ok(dedicated_overlay_info(
            name,
            &public_key,
            listen_port,
            overlay_ip,
            subnet,
        ))
    }

    /// Linux attachment for a dedicated per-service overlay.
    ///
    /// Creates the per-service bridge through `create_service_bridge` (the same
    /// helper the cluster-WG `Auto` path uses) and then, best-effort, adds
    /// `subnet` to an `AllowedIPs` entry on the service's DEDICATED transport
    /// instead of the cluster transport. Returns the bridge name, which the
    /// caller reports as the container-attach handle.
    ///
    /// `overlay_ip` is accepted for signature parity with the other platform
    /// variants and is not used here.
    ///
    /// # Errors
    /// Returns an error if the bridge cannot be created. A rejected
    /// `AllowedIPs` update is logged as a warning and does not fail the call.
    #[cfg(target_os = "linux")]
    async fn attach_dedicated_service(
        &mut self,
        service: &str,
        subnet: ipnet::IpNet,
        overlay_ip: IpAddr,
    ) -> Result<String, OverlaydError> {
        let _ = overlay_ip;
        let bridge_name = self.create_service_bridge(service, subnet).await?;

        // Plumb the subnet onto the dedicated device (never the cluster one).
        // NOTE(review): the entry is keyed by `local_wg_pubkey`; the dedicated
        // device was brought up with its own keypair (`ServiceTransport::public_key`),
        // so confirm that the cluster key is really the intended peer key here.
        match (
            self.service_transports.get(service),
            self.local_wg_pubkey.as_ref(),
        ) {
            (Some(st), Some(pubkey)) => {
                if let Err(e) = st.transport.add_allowed_ip(pubkey, subnet).await {
                    tracing::warn!(
                        service = %service,
                        subnet = %subnet,
                        error = %e,
                        "Failed to add service subnet to dedicated transport AllowedIPs (non-fatal)"
                    );
                }
            }
            (Some(_), None) => {
                tracing::debug!(service = %service, "local_wg_pubkey not yet set; skipping dedicated AllowedIPs update");
            }
            // No dedicated transport recorded for this service: nothing to plumb.
            (None, _) => {}
        }

        Ok(bridge_name)
    }

    /// Windows attachment for a dedicated per-service overlay.
    ///
    /// By the time this runs the cross-platform core has already brought up the
    /// dedicated transport. This step adds the container-facing side by calling
    /// [`Self::ensure_service_network`] for `service` on `subnet` (per the
    /// existing design notes: a per-service HCN **Internal** network that the
    /// service's containers attach to instead of the node's base overlay
    /// network). The returned string is the per-service network name,
    /// `<overlay network name>-svc-<service>`, which the caller reports as the
    /// [`ServiceOverlayInfo::name`] attach handle.
    ///
    /// # Errors
    /// Propagates any error from [`Self::ensure_service_network`].
    #[cfg(target_os = "windows")]
    async fn attach_dedicated_service(
        &mut self,
        service: &str,
        subnet: ipnet::IpNet,
        _overlay_ip: IpAddr,
    ) -> Result<String, OverlaydError> {
        // Create (or reuse) the per-service network. Its id is not needed here;
        // only success matters for building the attach handle.
        let _network_id = self.ensure_service_network(service, subnet).await?;

        let daemon = self.deployment_or_default();
        let base_network = overlay_network_name(&daemon);
        Ok(format!("{base_network}-svc-{service}"))
    }

    /// Attachment for a dedicated per-service overlay on targets that are
    /// neither Linux nor Windows (macOS): there is no bridge or per-service
    /// network to create, so the dedicated device's interface name, as recorded
    /// in `service_transports`, is itself the attach handle.
    ///
    /// Returns an empty string when no transport is recorded for `service`.
    #[cfg(all(not(target_os = "linux"), not(target_os = "windows")))]
    #[allow(clippy::unused_async)]
    async fn attach_dedicated_service(
        &mut self,
        service: &str,
        _subnet: ipnet::IpNet,
        _overlay_ip: IpAddr,
    ) -> Result<String, OverlaydError> {
        let handle = match self.service_transports.get(service) {
            Some(st) => st.interface.clone(),
            None => String::new(),
        };
        Ok(handle)
    }

    /// Tear down the per-service segment for `service`. Idempotent.
    ///
    /// Steps, in order:
    /// 1. Forget the service's recorded overlay mode.
    /// 2. Linux: drop the in-memory bridge/interface entries, remove the bridge
    ///    subnet from the cluster transport's `AllowedIPs` (only when the bridge
    ///    entry was still in memory), delete the bridge link by name, and
    ///    release the service's subnet-registry slot. Other targets: drop the
    ///    `service_interfaces` placeholder.
    /// 3. When a dedicated transport is recorded for the service: shut it down,
    ///    release its listen port and subnet-registry slot, remove its entry
    ///    from the persisted network-state marker file, and (Windows only)
    ///    delete the HCN network that entry names.
    ///
    /// Every failure along the way is logged and swallowed; nothing is returned
    /// to the caller.
    // Only the Linux body awaits (netlink + cluster AllowedIPs); other targets
    // are synchronous (transport shutdown is sync) but must keep the async
    // signature for the dispatch call.
    #[cfg_attr(not(target_os = "linux"), allow(clippy::unused_async))]
    async fn teardown_service_overlay(&mut self, service: &str) {
        // Drop the recorded mode; a `Shared` service's containers no longer route
        // to the shared bridge once it is gone. The node-wide shared bridge
        // itself is deliberately NOT torn down here — other Shared services reuse
        // it (it is reclaimed only on full overlay teardown / uninstall).
        self.service_modes.remove(service);

        // Auto-mode segment teardown (per-service bridge on Linux, placeholder
        // elsewhere). A Shared-mode service has no per-service bridge, so
        // `service_bridges.remove` is a no-op for it (its `service_interfaces`
        // placeholder pointing at the shared bridge is removed below).
        #[cfg(target_os = "linux")]
        {
            let removed = self.service_bridges.remove(service);
            self.service_interfaces.remove(service);

            // Remove the subnet from the cluster AllowedIPs only when we still
            // know it (the in-memory entry survived). This also needs both the
            // cluster transport and the local WireGuard public key to be set.
            if let Some(ref bridge) = removed {
                if let Some(ref cluster) = self.global_transport {
                    if let Some(ref pubkey) = self.local_wg_pubkey {
                        if let Err(e) = cluster.remove_allowed_ip(pubkey, bridge.subnet).await {
                            tracing::warn!(
                                service = %service,
                                subnet = %bridge.subnet,
                                error = %e,
                                "Failed to remove service subnet from cluster AllowedIPs (non-fatal)"
                            );
                        }
                    }
                }
            }

            // Delete the physical bridge by its DETERMINISTIC name, regardless of
            // whether the in-memory entry survived. After an overlayd restart the
            // `service_bridges` map is empty, so a delete gated on `Some(..)` would
            // silently leak the `zl-…-b` link forever (the observed orphan/linkdown
            // bridges). `delete_bridge` no-ops on ENODEV, so deleting an absent link
            // is safe — and the `-b` suffix never collides with a Shared service's
            // shared `-sh` bridge, so this can't tear down the wrong thing.
            let bridge_name = removed.as_ref().map_or_else(
                || make_interface_name(&[&self.deployment, &self.instance_id, service], "b"),
                |b| b.name.clone(),
            );
            if let Err(e) = crate::netlink::delete_bridge(&bridge_name).await {
                tracing::warn!(service = %service, bridge = %bridge_name, error = %e, "delete_bridge failed (non-fatal)");
            }

            // Release the subnet-registry slot by service name (works whether or
            // not the in-memory entry survived). The result is ignored.
            if let Some(registry) = self.service_subnet_registry.as_mut() {
                let node_key = self.local_node_id.to_string();
                let _ = registry.release(service, &node_key);
            }

            // Log level reflects whether we actually knew about the bridge.
            if removed.is_some() {
                tracing::info!(service = %service, bridge = %bridge_name, "Tore down service bridge");
            } else {
                tracing::debug!(service = %service, bridge = %bridge_name, "best-effort delete of (possibly absent) service bridge by name");
            }
        }
        #[cfg(not(target_os = "linux"))]
        {
            if let Some(iface) = self.service_interfaces.remove(service) {
                tracing::info!(service = %service, interface = %iface, "Removed service overlay interface (placeholder, non-Linux)");
            }
        }

        // Dedicated-mode teardown (cross-platform): tear down the per-service
        // transport, free its port, and drop its marker entry. No-op when the
        // service ran in Shared mode (nothing in `service_transports`).
        if let Some(mut st) = self.service_transports.remove(service) {
            st.transport.shutdown();
            self.dedicated_ports.release(st.listen_port);

            // Release the subnet assignment. The dedicated subnet lives in the
            // same registry as the bridge subnets, so release it here for the
            // dedicated case on every OS. On Linux the block above has already
            // issued the same release for this key; the result is ignored
            // either way.
            if let Some(registry) = self.service_subnet_registry.as_mut() {
                let node_key = self.local_node_id.to_string();
                let _ = registry.release(service, &node_key);
            }

            // Remove this service's entry from the on-disk network-state marker
            // and persist the file only when an entry was actually removed.
            let marker_path =
                zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();
            let mut marker = NetworkState::load(&marker_path);
            let removed_entry = marker.remove(&owner_for_service(service));
            if removed_entry.is_some() {
                if let Err(e) = marker.save(&marker_path) {
                    tracing::warn!(service = %service, error = %e, path = %marker_path.display(), "failed to persist dedicated-overlay marker removal");
                }
            }

            // Windows: delete the per-service HCN Internal network this service
            // owned. The marker entry's `id` is the bare HCN GUID (set by
            // `ensure_service_network`); delete the network so a dedicated
            // service tears down cleanly without waiting for a full uninstall.
            // Also drop the per-service container-IP allocator.
            #[cfg(target_os = "windows")]
            {
                self.service_ip_allocators.remove(service);
                if let Some(entry) = removed_entry.as_ref() {
                    if entry.kind == "hcn-internal" {
                        if let Ok(guid) = windows::core::GUID::try_from(entry.id.as_str()) {
                            match zlayer_hns::network::Network::delete(guid) {
                                Ok(()) => {
                                    tracing::info!(service = %service, id = %entry.id, "deleted per-service HCN network");
                                }
                                Err(e) => {
                                    tracing::warn!(service = %service, id = %entry.id, error = %e, "failed to delete per-service HCN network (may leak until uninstall)");
                                }
                            }
                        } else {
                            tracing::warn!(service = %service, id = %entry.id, "per-service marker has unparseable HCN GUID; skipping network delete");
                        }
                    }
                }
            }
            // Off Windows the removed entry is only needed for the save check
            // above; drop it explicitly so it counts as used on every target.
            #[cfg(not(target_os = "windows"))]
            drop(removed_entry);

            tracing::info!(
                service = %service,
                interface = %st.interface,
                listen_port = st.listen_port,
                "Tore down dedicated per-service overlay device"
            );
        }
    }

    /// Reclaim orphaned per-service host bridges (and their stale device veths)
    /// that no live deployment still owns. `live_bridge_names` is the full set of
    /// `zl-…-b` bridge names every currently-restored service SHOULD own,
    /// computed by the main daemon from storage.
    ///
    /// For every host link whose name looks like one of OUR per-service bridge
    /// (`…-b`) or dedicated device (`…-d`) interfaces, that is NOT in `live`, is
    /// NOT the active global (`-g`) or shared (`-sh`) interface or a recorded
    /// dedicated transport device, and has zero bridge members, we:
    ///   1. release its service-subnet registry assignment + cluster `AllowedIPs`
    ///      when the `(service, node)` key can be recovered from the registry
    ///      snapshot by reproducing the deterministic bridge name,
    ///   2. delete the link (idempotent — ENODEV is success); when the delete
    ///      fails the link is skipped: step 3 is not run and the name is not
    ///      reported as reclaimed, and
    ///   3. drop any stale in-memory `service_bridges`/`service_interfaces`
    ///      entries pointing at it.
    ///
    /// Best-effort + idempotent: a failure on one link is logged and the sweep
    /// continues. Returns the names actually reclaimed (empty when the host
    /// links cannot be listed).
    #[cfg(target_os = "linux")]
    async fn prune_orphan_bridges(&mut self, live_bridge_names: &[String]) -> Vec<String> {
        use std::collections::HashSet;

        // Without a link listing there is nothing to sweep; report no reclaims.
        let links = match crate::netlink::list_all_links().await {
            Ok(links) => links,
            Err(e) => {
                tracing::warn!(error = %e, "prune_orphan_bridges: failed to list host links");
                return Vec::new();
            }
        };

        let live: HashSet<&str> = live_bridge_names.iter().map(String::as_str).collect();

        // The interfaces we must NEVER reclaim even though they carry the `zl-`
        // prefix: the active global transport device and the node-wide shared
        // bridge. (Container veths `veth-…`/`vc-…` are handled by the separate
        // PID-keyed `sweep_orphan_veths`; here we only target service bridges +
        // dedicated device interfaces, which `sweep_orphan_veths` never touches.)
        let mut protected: HashSet<String> = HashSet::new();
        if let Some(g) = self.global_interface.clone() {
            protected.insert(g);
        }
        if let Some(ref sh) = self.shared_bridge {
            protected.insert(sh.name.clone());
        }
        // Protect every dedicated-service WireGuard transport (`…-d`) by name. A
        // `-d` is a WG device, not a bridge — it has no `brif`, so the zero-member
        // guard below treats it as 0 members, and the daemon's `live` set only
        // carries `…-b` names; without this it would be reaped as a live device.
        //
        // We deliberately do NOT blanket-protect `service_bridges` (`…-b`) here.
        // That map holds BOTH managed-service bridges AND standalone/per-job
        // bridges (e.g. a Runner's per-job network), and overlayd cannot tell
        // them apart — a standalone container's `DetachContainer` releases the
        // veth/IP but never removes the bridge or its `service_bridges` entry, so
        // a blanket protect shielded those orphans forever (only a restart, which
        // wipes the map, ever cleared them). Managed bridges stay protected by
        // being in the daemon's authoritative `live` set; standalone bridges are
        // not in storage, so they fall through to the zero-member guard and are
        // reclaimed once idle.
        for st in self.service_transports.values() {
            protected.insert(st.interface.clone());
        }

        // Snapshot the subnet registry once so we can recover the `(service,
        // node)` key for an orphan by reproducing its deterministic bridge/device
        // name. The registry has no release-by-subnet API, so we map name ->
        // (service, node, subnet) here. Both the `-b` and the `-d` name of each
        // assignment point at the same key.
        let mut name_to_key: HashMap<String, (String, String, ipnet::IpNet)> = HashMap::new();
        if let Some(registry) = self.service_subnet_registry.as_ref() {
            for ((service, node), subnet) in registry.snapshot().assignments {
                let bridge =
                    make_interface_name(&[&self.deployment, &self.instance_id, &service], "b");
                let device =
                    make_interface_name(&[&self.deployment, &self.instance_id, &service], "d");
                name_to_key.insert(bridge, (service.clone(), node.clone(), subnet));
                name_to_key.insert(device, (service, node, subnet));
            }
        }

        let mut reclaimed = Vec::new();
        for (_index, name) in links {
            // Only consider OUR per-service bridge (`-b`) or dedicated device
            // (`-d`) interfaces that are neither live nor protected. The pure
            // predicate (unit-tested in `orphan_bridge_selection`) keeps us off
            // unrelated host links, the global/shared interfaces, and the veth
            // namespaces.
            if !is_orphan_service_bridge(&name, &live, &protected) {
                continue;
            }

            // Zero-member guard: only reclaim a non-live candidate once it is
            // IDLE — no member links. A `-b` bridge with a running container has
            // ≥1 veth in its `brif`, so an in-use standalone bridge is left
            // alone (the only 0-member window for a bridge that is about to be
            // used is the sub-ms gap between creation and the first attach); an
            // orphan `-d` has no `brif` (0) and is correctly reaped.
            // This is what makes dropping the `service_bridges` blanket-protect
            // safe — a live managed bridge is already excluded by `live`, and any
            // other in-use bridge is excluded here.
            if crate::netlink::bridge_member_count(&name).await > 0 {
                continue;
            }

            tracing::info!(link = %name, "prune_orphan_bridges: reclaiming orphan service bridge/device");

            // 1. Release the subnet + cluster AllowedIPs when we can recover the
            //    owning service key from the registry.
            // NOTE(review): this runs BEFORE the link delete below, so when the
            // delete fails the subnet slot is already released while the link
            // still exists — confirm this ordering is intended.
            if let Some((service, node, subnet)) = name_to_key.get(&name).cloned() {
                if let Some(ref cluster) = self.global_transport {
                    if let Some(ref pubkey) = self.local_wg_pubkey {
                        if let Err(e) = cluster.remove_allowed_ip(pubkey, subnet).await {
                            tracing::warn!(
                                link = %name,
                                subnet = %subnet,
                                error = %e,
                                "prune_orphan_bridges: remove_allowed_ip failed (non-fatal)"
                            );
                        }
                    }
                }
                if let Some(registry) = self.service_subnet_registry.as_mut() {
                    let _ = registry.release(&service, &node);
                }
            }

            // 2. Delete the link itself (idempotent). On failure, skip the
            //    bookkeeping cleanup and do not count the link as reclaimed.
            if let Err(e) = crate::netlink::delete_bridge(&name).await {
                tracing::warn!(link = %name, error = %e, "prune_orphan_bridges: delete_bridge failed (non-fatal)");
                continue;
            }

            // 3. Drop any stale in-memory bookkeeping pointing at this link.
            self.service_bridges.retain(|_, b| b.name != name);
            self.service_interfaces.retain(|_, iface| *iface != name);

            reclaimed.push(name);
        }

        if !reclaimed.is_empty() {
            tracing::info!(count = reclaimed.len(), bridges = ?reclaimed, "prune_orphan_bridges: reclaimed orphaned service bridges/devices");
        }
        reclaimed
    }

    /// Non-Linux variant: per-service bridges are a Linux-only mechanic (Windows
    /// uses HCN networks torn down in `teardown_service_overlay`; macOS rides VZ
    /// NAT), so there are no host bridge links to sweep.
    ///
    /// `_live_bridge_names` is ignored. Always returns an empty list; the
    /// `async` signature is kept only so this variant has the same shape as
    /// the Linux one.
    #[cfg(not(target_os = "linux"))]
    #[allow(clippy::unused_async, clippy::unused_self)]
    async fn prune_orphan_bridges(&mut self, _live_bridge_names: &[String]) -> Vec<String> {
        Vec::new()
    }

    /// Lazily build the local fallback `ServiceSubnetRegistry` from the
    /// configured cluster CIDR. Does nothing when a registry already exists.
    /// Called on first `setup_service_overlay` use.
    ///
    /// The node's own overlay IP (when known) is reserved in the new registry
    /// before it is stored on `self`.
    ///
    /// # Errors
    /// Returns an error when no cluster CIDR is configured, when the CIDR
    /// cannot be converted to an `ipnet::IpNet`, or when the registry cannot be
    /// built.
    fn ensure_service_subnet_registry(&mut self) -> Result<(), OverlaydError> {
        use zlayer_overlay::allocator::ServiceSubnetRegistry;

        // Per-service bridge slice prefix. `/26` (V4) = ~61 usable container
        // IPs per service per node — keep in sync with
        // `zlayer_scheduler::raft::DEFAULT_SERVICE_SUBNET_SLICE_PREFIX` (the
        // canonical default; not imported here to avoid a dependency cycle).
        // The older `/28` (13 usable) exhausted under CI churn.
        const V4_SLICE_PREFIX: u8 = 26;
        const V6_SLICE_PREFIX: u8 = 120;

        // Already initialized: nothing to do.
        if self.service_subnet_registry.is_some() {
            return Ok(());
        }

        let Some(cluster_cidr) = self.cluster_cidr else {
            return Err(OverlaydError::Other(
                "service subnet registry needs a cluster CIDR (SetupGlobalOverlay first)"
                    .to_string(),
            ));
        };

        // Round-trip through the textual form to obtain the `ipnet` type the
        // registry is built from.
        let cluster_ipnet = cluster_cidr
            .to_string()
            .parse::<ipnet::IpNet>()
            .map_err(|e| {
                OverlaydError::Other(format!(
                    "failed to convert cluster CIDR {cluster_cidr} to ipnet::IpNet: {e}"
                ))
            })?;

        let slice_prefix = match cluster_ipnet {
            ipnet::IpNet::V4(_) => V4_SLICE_PREFIX,
            ipnet::IpNet::V6(_) => V6_SLICE_PREFIX,
        };

        let mut registry = ServiceSubnetRegistry::new(cluster_ipnet, slice_prefix)
            .map_err(|e| {
                OverlaydError::Other(format!("failed to build ServiceSubnetRegistry: {e}"))
            })?;

        // Reserve the node's own overlay IP so no per-service bridge subnet
        // overlaps it — the overlay DNS server listens on `<node_ip>:53`, and a
        // bridge subnet containing that IP would black-hole its containers' DNS
        // (they'd ARP for the node IP on their bridge, where nothing answers).
        if let Some(node_ip) = self.node_ip {
            registry.reserve_ip(node_ip);
        }

        self.service_subnet_registry = Some(registry);
        Ok(())
    }

    // -- IP allocation -------------------------------------------------------

    /// Allocate the primary overlay IP for `service`.
    ///
    /// On Linux a Shared-mode service draws from the node-wide shared bridge
    /// and every other mode draws from the service's own bridge; when the
    /// selected bridge does not exist (and on every non-Linux target) the node
    /// slice allocator is used instead.
    ///
    /// `join_global` is accepted but has no effect here: the companion
    /// global-overlay IP (eth1) is reserved at attach time, so only the primary
    /// (service / slice) IP is returned.
    ///
    /// # Errors
    /// Returns an error if the relevant pool is exhausted.
    fn allocate_ip(&mut self, service: &str, join_global: bool) -> Result<IpAddr, OverlaydError> {
        // Intentionally unused; see the doc comment.
        let _ = join_global;
        #[cfg(target_os = "linux")]
        {
            let shared = self
                .service_modes
                .get(service)
                .copied()
                .unwrap_or_default()
                .uses_shared_bridge();
            // Pick the one bridge pool this service is allowed to draw from.
            let pool = if shared {
                self.shared_bridge.as_mut()
            } else {
                self.service_bridges.get_mut(service)
            };
            if let Some(bridge) = pool {
                return match bridge.ip_allocator.allocate() {
                    Some(ip) => Ok(ip),
                    None if shared => Err(OverlaydError::Overlay(format!(
                        "shared bridge {} subnet {} exhausted",
                        bridge.name, bridge.subnet
                    ))),
                    None => Err(OverlaydError::Overlay(format!(
                        "service bridge {} subnet {} exhausted",
                        bridge.name, bridge.subnet
                    ))),
                };
            }
        }
        // Keeps `service` "used" on targets where the block above is compiled out.
        let _ = service;
        self.ip_allocator.allocate()
    }

    /// Hand `ip` back to the pool it belongs to.
    ///
    /// On Linux the shared bridge is checked first, then each per-service
    /// bridge; the first bridge whose subnet contains `ip` gets the release.
    /// When no bridge matches (and on every non-Linux target) the IP is
    /// released to the node slice allocator.
    fn release_ip(&mut self, ip: IpAddr) {
        #[cfg(target_os = "linux")]
        {
            // Shared bridge first, then the per-service bridges.
            let owner = self
                .shared_bridge
                .iter_mut()
                .chain(self.service_bridges.values_mut())
                .find(|bridge| bridge.subnet.contains(&ip));
            if let Some(bridge) = owner {
                bridge.ip_allocator.release(ip);
                return;
            }
        }
        self.ip_allocator.release(ip);
    }

    // -- container attach (Linux) -------------------------------------------

    /// Wire a container into the overlay and return its [`AttachResult`].
    ///
    /// Any supplied `dns_server` / `dns_domain` is first recorded on the server
    /// (the resolver on port 53), then the call is dispatched on the handle
    /// kind: Linux PID, Windows container, or host-shared. Linux and
    /// host-shared attaches yield only an IP, so their result carries no
    /// namespace GUID; the Windows helper's result is returned as-is.
    ///
    /// # Errors
    /// Returns an error if the platform-specific attach fails, or if `handle`
    /// is guest-managed (those must go through `attach_container_guest`).
    #[allow(clippy::too_many_arguments)]
    async fn attach_container(
        &mut self,
        handle: AttachHandle,
        service: &str,
        join_global: bool,
        ephemeral: bool,
        dns_server: Option<IpAddr>,
        dns_domain: Option<String>,
        isolation_network: Option<String>,
    ) -> Result<AttachResult, OverlaydError> {
        // Remember the overlay DNS resolver/zone the main daemon staged for
        // this node so later attaches (and the Windows HCN endpoint `Dns`
        // schema) can fall back to them when a per-attach value isn't supplied.
        if let Some(resolver) = dns_server {
            self.dns_server_addr = Some(SocketAddr::new(resolver, 53));
        }
        if dns_domain.is_some() {
            self.dns_domain.clone_from(&dns_domain);
        }

        // Arms that only produce an IP fall through to the shared result
        // construction below; the others return directly.
        let ip = match handle {
            AttachHandle::GuestManaged { .. } => {
                return Err(OverlaydError::Other(
                    "guest-managed attach must go through attach_container_guest, not attach_container"
                        .to_string(),
                ));
            }
            AttachHandle::WindowsContainer { container_id, ip } => {
                return self
                    .attach_container_windows(
                        &container_id,
                        service,
                        ip,
                        dns_server,
                        dns_domain,
                        isolation_network,
                    )
                    .await;
            }
            AttachHandle::LinuxPid { pid } => {
                self.attach_container_linux(pid, service, join_global, ephemeral, isolation_network)
                    .await?
            }
            AttachHandle::HostShared { id } => {
                self.attach_container_host_shared(&id, service, ephemeral, isolation_network)
                    .await?
            }
        };

        Ok(AttachResult {
            ip,
            namespace_guid: None,
        })
    }

    /// Tear down a container's overlay attachment by dispatching on the handle
    /// kind to the matching platform-specific detach helper.
    ///
    /// # Errors
    /// Propagates whatever error the selected detach helper returns. A
    /// guest-managed handle is always rejected: those must go through
    /// `detach_container_guest`.
    async fn detach_container(&mut self, handle: AttachHandle) -> Result<(), OverlaydError> {
        match handle {
            // Guest-managed attachments have their own dedicated detach path.
            AttachHandle::GuestManaged { .. } => Err(OverlaydError::Other(String::from(
                "guest-managed detach must go through detach_container_guest, not detach_container",
            ))),
            AttachHandle::HostShared { id: host_id } => {
                self.detach_container_host_shared(&host_id).await
            }
            AttachHandle::WindowsContainer {
                container_id: windows_id,
                ..
            } => self.detach_container_windows(&windows_id).await,
            AttachHandle::LinuxPid { pid } => self.detach_container_linux(pid).await,
        }
    }

    // -- container attach (guest-managed) -----------------------------------

    /// Guest-managed overlay attach: allocate the overlay identity for a VM guest
    /// that brings up its own kernel `WireGuard` device.
    ///
    /// overlayd cannot enter the guest's network namespace (it is a VM, not a
    /// host process), so instead of a veth/HCN endpoint it:
    /// 1. allocates the overlay IP from the SAME pool the Linux attach uses (the
    ///    per-service bridge pool when one exists, otherwise the node slice) so
    ///    guest addresses never collide with container addresses;
    /// 2. generates a fresh `WireGuard` keypair for the guest;
    /// 3. builds the peer set the guest must configure — every GLOBAL peer the
    ///    host already knows, plus THIS node itself (so the guest can reach the
    ///    host node over the overlay; carries a keepalive so the guest keeps its
    ///    NAT mapping open from behind VZ NAT);
    /// 4. registers the generated public key as a GLOBAL peer (host route to the
    ///    guest, roaming endpoint learned from the guest's keepalive) so remote
    ///    nodes and this node route to it;
    /// 5. records the attachment keyed by `id` so `DetachContainer` can release
    ///    the IP and remove the peer.
    ///
    /// Platform-agnostic: pure IPAM + keygen + peer bookkeeping (no netns/veth/
    /// HCN), so it compiles and runs on macOS (where the overlayd serving a VZ
    /// host lives) as well as Linux.
    ///
    /// # Errors
    /// Returns an error if the global overlay is not set up, the IP pool is
    /// exhausted, key generation fails, or registering the guest peer fails.
    #[allow(clippy::cast_possible_truncation, clippy::too_many_lines)]
    async fn attach_container_guest(
        &mut self,
        id: &str,
        service: &str,
        join_global: bool,
        dns_server: Option<IpAddr>,
        dns_domain: Option<String>,
        isolation_network: Option<String>,
    ) -> Result<GuestOverlayConfig, OverlaydError> {
        // The global transport must exist: we both register the guest as a peer
        // on it and advertise this node (its public key + listen port) to the
        // guest. Resolve both up front so we fail before allocating anything.
        let node_public_key = self.transport_public_key.clone().ok_or_else(|| {
            OverlaydError::Other(
                "guest-managed attach requires the global overlay to be set up first \
                 (no node WireGuard public key)"
                    .to_string(),
            )
        })?;
        if self.global_transport.is_none() {
            return Err(OverlaydError::Other(
                "guest-managed attach requires the global overlay to be set up first \
                 (no global transport)"
                    .to_string(),
            ));
        }

        // 1. Allocate the overlay IP from the same pool the Linux attach uses and
        //    derive the prefix length from that pool's network. On Linux a
        //    per-service bridge (when present) supplies both the IP and its
        //    subnet's prefix; otherwise (and on every non-Linux host) the node
        //    slice / cluster CIDR does.
        let (overlay_ip, prefix_len, pool_service, dedicated): (IpAddr, u8, Option<String>, bool) = {
            #[cfg(target_os = "linux")]
            {
                let use_shared = self
                    .service_modes
                    .get(service)
                    .copied()
                    .unwrap_or_default()
                    .uses_shared_bridge();
                let bridge = if use_shared {
                    self.shared_bridge.as_mut()
                } else {
                    self.service_bridges.get_mut(service)
                };
                if let Some(bridge) = bridge {
                    let ip = bridge.ip_allocator.allocate().ok_or_else(|| {
                        OverlaydError::Overlay(format!(
                            "service bridge {} subnet {} exhausted",
                            bridge.name, bridge.subnet
                        ))
                    })?;
                    let prefix = bridge.subnet.prefix_len();
                    (ip, prefix, Some(service.to_string()), false)
                } else {
                    let ip = self.ip_allocator.allocate()?;
                    (ip, self.slice_prefix_len(), None, false)
                }
            }
            #[cfg(not(target_os = "linux"))]
            {
                // A Dedicated service owns a second WireGuard device (own crypto +
                // subnet); its guest draws from that device's allocator and lands
                // on the dedicated subnet, not the global cluster mesh. Every other
                // mode hairpins through the node slice on the global transport.
                let dedicated = self
                    .service_modes
                    .get(service)
                    .copied()
                    .unwrap_or_default()
                    .uses_per_service_wg();
                if dedicated {
                    let st = self.service_transports.get_mut(service).ok_or_else(|| {
                        OverlaydError::Other(format!(
                            "Dedicated service {service} has no dedicated overlay; \
                             call setup_service_overlay first"
                        ))
                    })?;
                    let ip = st.ip_allocator.allocate().ok_or_else(|| {
                        OverlaydError::Overlay(format!(
                            "dedicated service {service} subnet {} exhausted",
                            st.subnet
                        ))
                    })?;
                    (ip, st.subnet.prefix_len(), Some(service.to_string()), true)
                } else {
                    let ip = self.ip_allocator.allocate()?;
                    (ip, self.slice_prefix_len(), None, false)
                }
            }
        };
        // `join_global` is informational for a guest-managed attach: the guest's
        // single WireGuard device IS its global-overlay endpoint, so there is no
        // separate eth1 IP to reserve. Touch it so callers stay consistent with
        // the Linux/Windows handles.
        let _ = join_global;

        // 2. Generate the guest's WireGuard keypair (reuse the transport's
        //    native x25519 keygen — never reimplement curve25519 here).
        let (private_key, public_key) = OverlayTransport::generate_keys().await.map_err(|e| {
            // Roll back the IP allocation so a keygen failure leaks nothing.
            self.release_guest_ip(overlay_ip, pool_service.as_deref());
            OverlaydError::Overlay(format!("failed to generate guest keys: {e}"))
        })?;

        // 3. Build the peer set. A VZ guest is behind the host's NAT and can only
        //    reach the LOCAL node (via its NAT gateway) — it cannot dial other
        //    nodes' or sibling guests' endpoints directly. So it gets exactly ONE
        //    peer: this node. ALL overlay traffic (including to sibling containers
        //    and remote nodes) routes through this node, which forwards/hairpins it
        //    (the node already holds a /32 peer for every container — step 4 — and
        //    the real inter-node peers). We deliberately do NOT add the per-guest
        //    /32 peers here: a /32 with no reachable endpoint would win
        //    longest-prefix routing and black-hole sibling traffic. The endpoint
        //    returned here is the node's overlay IP as a placeholder; the VZ
        //    runtime rewrites it to the guest's NAT gateway (the only host address
        //    the guest can reach) before delivering the config. Keepalive holds the
        //    guest's NAT mapping open so the node can reach back.
        //
        //    Dedicated mode: the single peer is this node's DEDICATED per-service
        //    device (its own pubkey + listen port + subnet as AllowedIPs), so the
        //    guest joins that service's isolated mesh. Every other mode peers with
        //    the global cluster device, AllowedIPs = the whole cluster CIDR.
        let (peer_pubkey, peer_listen_port, peer_allowed) = if dedicated {
            let st = self
                .service_transports
                .get(service)
                .expect("dedicated transport allocated above");
            (st.public_key.clone(), st.listen_port, st.subnet.to_string())
        } else {
            let node_allowed = self
                .cluster_cidr
                .or(self.slice_cidr)
                .map_or_else(|| String::from("0.0.0.0/0"), |c| c.to_string());
            (node_public_key, self.overlay_port, node_allowed)
        };
        let node_endpoint = self.node_endpoint_for_guest(peer_listen_port);
        let peers: Vec<PeerSpec> = vec![PeerSpec {
            public_key: peer_pubkey,
            endpoint: node_endpoint,
            allowed_ips: peer_allowed,
            persistent_keepalive_secs: 25,
            // The guest reaches the node via its NAT gateway (the only host
            // address it can route to); it does not run the host's ICE-lite
            // candidate exchange, so no candidates are advertised here.
            candidates: Vec::new(),
        }];

        // 4. Register the guest's public key as a GLOBAL peer (host route to the
        //    guest at <overlay_ip>/32, roaming endpoint learned from keepalive).
        //    Go through the same internal path `AddPeer { Global }` uses.
        let host_route = format!(
            "{}/{}",
            overlay_ip,
            if overlay_ip.is_ipv6() { 128 } else { 32 }
        );
        let guest_peer = PeerSpec {
            public_key: public_key.clone(),
            // Empty/roaming: the guest is behind NAT; boringtun learns its source
            // endpoint from the guest's first keepalive. `0.0.0.0:0` is the
            // wire-safe "unset endpoint" sentinel that still parses as a
            // SocketAddr (peer_spec_to_info requires a parseable endpoint).
            endpoint: "0.0.0.0:0".to_string(),
            allowed_ips: host_route,
            persistent_keepalive_secs: 0,
            // The guest's roaming endpoint is learned from its first keepalive;
            // it advertises no NAT candidates (the host learns the source).
            candidates: Vec::new(),
        };
        let guest_peer_info = peer_spec_to_info(&guest_peer)?;
        let scope = if dedicated {
            PeerScope::Service {
                service: service.to_string(),
            }
        } else {
            PeerScope::Global
        };
        {
            let transport = self.transport_for_scope(&scope)?;
            if let Err(e) = Self::add_peer_on(transport, &guest_peer_info).await {
                self.release_guest_ip(overlay_ip, pool_service.as_deref());
                return Err(e);
            }
        }
        // Track it among the global peers (so a *subsequent* guest attach also
        // learns about this guest) and record the attachment for detach.
        self.global_peers
            .insert(public_key.clone(), guest_peer.clone());
        // Per-network membership + node-side L3 isolation: record the guest's
        // overlay IP in its isolated network's member set, and enforce the
        // cross-platform isolation policy on THIS node. A VZ guest hairpins ALL
        // its overlay traffic through this node's WireGuard device, so the node
        // is the enforcement point: on macOS this dispatches to pf (a per-network
        // table + sub-anchor); on Linux it dispatches to iptables (harmless here
        // — guests do not run on Linux). The guest's own WireGuard AllowedIPs are
        // the in-guest belt; this is the node-side suspenders.
        if let Some(ref net) = isolation_network {
            let node_ip = self
                .node_ip
                .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::new(10, 200, 0, 1)));
            let cidr = self
                .cluster_cidr
                .map_or_else(|| "10.200.0.0/16".to_string(), |c| c.to_string());
            // Peers = current members of the network BEFORE inserting this guest.
            let peers: Vec<IpAddr> = self
                .network_members
                .get(net)
                .map(|m| m.iter().copied().collect())
                .unwrap_or_default();
            if let Err(e) = zlayer_overlay::firewall::ensure_member_isolation(
                net, overlay_ip, &peers, node_ip, &cidr,
            ) {
                tracing::warn!(network = %net, member = %overlay_ip, error = %e, "failed to install per-network L3 isolation for guest (non-fatal)");
            }
            self.network_members
                .entry(net.clone())
                .or_default()
                .insert(overlay_ip);
        }
        self.guest_attachments.insert(
            id.to_string(),
            GuestAttachInfo {
                overlay_ip,
                public_key: public_key.clone(),
                service_name: pool_service,
                isolation_network,
            },
        );

        // 5. Return the config the caller ships into the guest.
        Ok(GuestOverlayConfig {
            overlay_ip,
            prefix_len,
            private_key,
            public_key,
            // The guest's device listens on the same port as its single in-guest
            // peer (the node device it joins): the node's overlay WG port for the
            // global mesh, or the dedicated device's listen port in Dedicated mode.
            listen_port: peer_listen_port,
            peers,
            dns_server: dns_server.or_else(|| self.dns_server_addr.map(|s| s.ip())),
            dns_domain: dns_domain.or_else(|| self.dns_domain.clone()),
        })
    }

    /// Release a guest-managed attach by `id`: drop the guest's tracked global
    /// peer entry and its `WireGuard` peer, drain its per-network isolation
    /// membership, and return the allocated IP to its pool. Idempotent: an
    /// unknown (or already detached) `id` is a no-op.
    ///
    /// # Errors
    /// Teardown is best-effort and this currently always returns `Ok`. A failure
    /// to resolve the peer's transport, or to remove the peer from it, is logged
    /// and the remaining teardown (membership drain + IP release) still runs, so
    /// a failed peer removal can never strand the guest's overlay IP. The
    /// `Result` return type is kept for parity with the other detach paths.
    async fn detach_container_guest(&mut self, id: &str) -> Result<(), OverlaydError> {
        // No record for this id: nothing to undo.
        let Some(info) = self.guest_attachments.remove(id) else {
            return Ok(());
        };
        // Remove the guest's peer from the same scope it was registered on: a
        // Dedicated guest sits on its service's dedicated device, every other
        // guest on the global cluster device. Mirror the attach-time scope choice
        // so a dedicated guest peer does not leak on teardown.
        let scope = match info.service_name.as_deref() {
            Some(svc)
                if self
                    .service_modes
                    .get(svc)
                    .copied()
                    .unwrap_or_default()
                    .uses_per_service_wg() =>
            {
                PeerScope::Service {
                    service: svc.to_string(),
                }
            }
            _ => PeerScope::Global,
        };
        self.global_peers.remove(&info.public_key);
        match self.transport_for_scope(&scope) {
            Ok(transport) => {
                if let Err(e) = Self::remove_peer_on(transport, &info.public_key).await {
                    tracing::warn!(
                        guest = %id,
                        pubkey = %info.public_key,
                        scope = ?scope,
                        error = %e,
                        "failed to remove guest peer from its overlay transport"
                    );
                }
            }
            // The transport this guest was registered on could not be resolved,
            // so there is no device to remove the peer from. Record that instead
            // of silently skipping, then carry on with the rest of the teardown.
            Err(e) => {
                tracing::debug!(
                    guest = %id,
                    pubkey = %info.public_key,
                    scope = ?scope,
                    error = %e,
                    "no overlay transport for guest peer scope; skipping peer removal"
                );
            }
        }
        // Drain the per-network membership set for this guest and tear down the
        // node-side L3 isolation rule for it (pf on macOS, iptables on Linux —
        // the latter is a no-op for guests, which never run on Linux). Drop the
        // network entry once empty.
        if let Some(net) = info.isolation_network.as_deref() {
            if let Some(set) = self.network_members.get_mut(net) {
                set.remove(&info.overlay_ip);
            }
            let remaining_peers: Vec<IpAddr> = self
                .network_members
                .get(net)
                .map(|m| m.iter().copied().collect())
                .unwrap_or_default();
            // Same fallbacks as the attach path so the rule being removed is
            // addressed with the same node IP / CIDR it was installed with.
            let node_ip = self
                .node_ip
                .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::new(10, 200, 0, 1)));
            let cidr = self
                .cluster_cidr
                .map_or_else(|| "10.200.0.0/16".to_string(), |c| c.to_string());
            zlayer_overlay::firewall::remove_member_isolation(
                net,
                info.overlay_ip,
                &remaining_peers,
                node_ip,
                &cidr,
            );
            if self
                .network_members
                .get(net)
                .is_some_and(std::collections::HashSet::is_empty)
            {
                self.network_members.remove(net);
            }
        }
        // Return the IP to whichever pool it came from.
        self.release_guest_ip(info.overlay_ip, info.service_name.as_deref());
        Ok(())
    }

    // -- container attach (macOS host-shared) -------------------------------

    /// Host-shared overlay attach: give a macOS host-shared container
    /// ([`AttachHandle::HostShared`] — Seatbelt / native-VZ / libkrun) its own
    /// first-class L3 overlay membership.
    ///
    /// A host-shared container shares the node's host network namespace and its
    /// single cluster `utun`; it cannot get its own netns/veth (Seatbelt) or its
    /// own kernel `WireGuard` device (no guest VM to run one). So instead of a
    /// veth/HCN endpoint or a per-guest WG keypair, this:
    /// 1. allocates a DISTINCT overlay `/32` from the node slice (never the node
    ///    IP — `IpAllocator` reserves offset 1 — and never `None`). The node
    ///    slice is already advertised cluster-wide as this node's `AllowedIPs`,
    ///    so the `/32` auto-routes to this node with no peer reconfiguration;
    /// 2. adds that `/32` as an alias on the node's overlay `utun` so the kernel
    ///    delivers inbound overlay packets for it locally (boringtun decrypts
    ///    and writes the plaintext packet to the utun, which only delivers to a
    ///    configured local address);
    /// 3. records per-network membership + installs node-side L3 isolation when
    ///    `isolation_network` is set (pf on macOS), exactly like the guest path;
    /// 4. records the attachment keyed by `id` so `DetachContainer` can remove
    ///    the alias, drain the membership, and release the IP.
    ///
    /// HONEST CONSTRAINT: host-shared containers share the node's single cluster
    /// `utun`, so `OverlayMode::Dedicated`'s per-service `WireGuard` CRYPTO
    /// isolation cannot apply to them — there is no per-container WG device
    /// without a netns or a guest VM to host one. They still get a distinct
    /// overlay IP + L3 isolation (per-network membership / pf) + overlay DNS,
    /// which is full first-class L3 overlay membership. This is a real OS
    /// constraint of host-shared execution, not a stub.
    ///
    /// # Errors
    /// Returns an error if the node slice is exhausted, or if the global overlay
    /// interface is not set up (so there is no `utun` to alias the `/32` on).
    async fn attach_container_host_shared(
        &mut self,
        id: &str,
        service: &str,
        ephemeral: bool,
        isolation_network: Option<String>,
    ) -> Result<IpAddr, OverlaydError> {
        // 1. Allocate a distinct /32 from the node slice. Never the node IP
        //    (reserved at offset 1), never None — exhaustion maps to the same
        //    `OverlaydError::Overlay` the other attach paths surface.
        let ip = self.ip_allocator.allocate()?;
        let prefix_len: u8 = if ip.is_ipv6() { 128 } else { 32 };

        // 2. Make the /32 locally deliverable on the node's overlay utun via an
        //    alias on the single cluster transport's interface. Roll the IP
        //    allocation back on any failure so nothing leaks.
        let alias_res = if let Some(transport) = self.global_transport.as_ref() {
            transport
                .add_alias(ip, prefix_len)
                .await
                .map_err(|e| OverlaydError::Overlay(e.to_string()))
        } else {
            Err(OverlaydError::Other(
                "host-shared attach requires the global overlay to be set up first \
                 (no utun to alias the container /32 on)"
                    .to_string(),
            ))
        };
        if let Err(e) = alias_res {
            self.ip_allocator.release(ip);
            return Err(e);
        }

        // 3. Per-network membership + node-side L3 isolation (mirror the guest
        //    path). The host-shared container hairpins all overlay traffic
        //    through this node's WireGuard device, so the node is the
        //    enforcement point (pf on macOS).
        if let Some(ref net) = isolation_network {
            let node_ip = self
                .node_ip
                .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::new(10, 200, 0, 1)));
            let cidr = self
                .cluster_cidr
                .map_or_else(|| "10.200.0.0/16".to_string(), |c| c.to_string());
            // Peers = current members of the network BEFORE inserting this one.
            let peers: Vec<IpAddr> = self
                .network_members
                .get(net)
                .map(|m| m.iter().copied().collect())
                .unwrap_or_default();
            if let Err(e) =
                zlayer_overlay::firewall::ensure_member_isolation(net, ip, &peers, node_ip, &cidr)
            {
                tracing::warn!(network = %net, member = %ip, error = %e, "failed to install per-network L3 isolation for host-shared container (non-fatal)");
            }
            self.network_members
                .entry(net.clone())
                .or_default()
                .insert(ip);
        }

        // 4. Record the attachment so detach can reverse all of the above.
        self.host_shared_attachments.insert(
            id.to_string(),
            AttachInfo {
                service_ip: ip,
                service_name: Some(service.to_string()),
                // No separate global/eth1 IP: a host-shared container reaches the
                // global overlay through the SAME /32 aliased on the node utun.
                global_ip: None,
                ephemeral,
                isolation_network,
            },
        );

        Ok(ip)
    }

    /// Release a host-shared attach by `id`: remove the utun `/32` alias, drain
    /// its per-network L3 isolation membership, and return the IP to the node
    /// slice. Idempotent. Mirrors [`Self::detach_container_guest`].
    ///
    /// # Errors
    /// Returns `Ok` even when removing the alias fails (best-effort, logged) —
    /// the IP is always returned to the pool so it can never leak.
    async fn detach_container_host_shared(&mut self, id: &str) -> Result<(), OverlaydError> {
        let Some(info) = self.host_shared_attachments.remove(id) else {
            return Ok(());
        };
        // Drain the per-network membership set and tear down the node-side L3
        // isolation rule for this container; drop the network entry once empty.
        if let Some(net) = info.isolation_network.as_deref() {
            if let Some(set) = self.network_members.get_mut(net) {
                set.remove(&info.service_ip);
            }
            let remaining_peers: Vec<IpAddr> = self
                .network_members
                .get(net)
                .map(|m| m.iter().copied().collect())
                .unwrap_or_default();
            let node_ip = self
                .node_ip
                .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::new(10, 200, 0, 1)));
            let cidr = self
                .cluster_cidr
                .map_or_else(|| "10.200.0.0/16".to_string(), |c| c.to_string());
            zlayer_overlay::firewall::remove_member_isolation(
                net,
                info.service_ip,
                &remaining_peers,
                node_ip,
                &cidr,
            );
            if self
                .network_members
                .get(net)
                .is_some_and(std::collections::HashSet::is_empty)
            {
                self.network_members.remove(net);
            }
        }
        // Remove the utun /32 alias (best-effort: a failed removal must not
        // strand the IP, so we log and still release below).
        let prefix_len: u8 = if info.service_ip.is_ipv6() { 128 } else { 32 };
        if let Some(transport) = self.global_transport.as_ref() {
            if let Err(e) = transport.remove_alias(info.service_ip, prefix_len).await {
                tracing::warn!(
                    container = %id,
                    ip = %info.service_ip,
                    error = %e,
                    "failed to remove host-shared overlay /32 alias from utun (non-fatal)"
                );
            }
        }
        // Return the IP to the node slice.
        self.ip_allocator.release(info.service_ip);

        // Per-job segment lifecycle observability. Unlike the Linux veth path —
        // which reaps a per-service BRIDGE on the last ephemeral detach — a
        // host-shared container shares the node's single cluster utun and owns
        // no per-service bridge or dedicated WG device to tear down (see the
        // HONEST CONSTRAINT note on `attach_container_host_shared`). The only
        // per-segment state is its overlay `/32` + per-network membership, both
        // already reversed above. So `ephemeral` and `service_name` drive the
        // last-leaver TRACE here (mirroring the Linux ephemeral-teardown log)
        // rather than a bridge teardown: an ephemeral (per-job) segment's IP
        // return is logged at info level for reclamation traceability, a
        // managed service's at debug.
        let service = info.service_name.as_deref().unwrap_or("<none>");
        if info.ephemeral {
            tracing::info!(
                container = %id,
                service = %service,
                ip = %info.service_ip,
                "ephemeral host-shared overlay member detached — per-job segment /32 returned to node slice"
            );
        } else {
            tracing::debug!(
                container = %id,
                service = %service,
                ip = %info.service_ip,
                "host-shared overlay member detached — /32 returned to node slice"
            );
        }
        Ok(())
    }

    /// Hand a guest overlay IP back to the pool it was drawn from.
    ///
    /// Lookup order:
    /// - Linux: the node-wide shared bridge when its subnet contains `ip`, then
    ///   the per-service bridge named by `service` if it still exists;
    /// - non-Linux: the per-service transport's allocator named by `service`;
    /// - otherwise (any platform): the node slice allocator.
    fn release_guest_ip(&mut self, ip: IpAddr, service: Option<&str>) {
        #[cfg(target_os = "linux")]
        {
            // The shared bridge is keyed by subnet rather than service name, so
            // it is matched by containment and checked first.
            if let Some(shared) = self
                .shared_bridge
                .as_mut()
                .filter(|b| b.subnet.contains(&ip))
            {
                shared.ip_allocator.release(ip);
                return;
            }
            if let Some(owned) = service.and_then(|svc| self.service_bridges.get_mut(svc)) {
                owned.ip_allocator.release(ip);
                return;
            }
        }
        #[cfg(not(target_os = "linux"))]
        {
            // Dedicated-mode guests draw from the per-service transport's own
            // allocator; returning the address there keeps the dedicated subnet
            // from leaking addresses across guest churn.
            if let Some(dedicated) = service.and_then(|svc| self.service_transports.get_mut(svc)) {
                dedicated.ip_allocator.release(ip);
                return;
            }
        }
        // No more specific pool claimed the address: it came from the node slice.
        self.ip_allocator.release(ip);
    }

    /// Prefix length of the pool guest IPs come from when no per-service bridge
    /// is involved: the node slice's prefix if one is assigned, else the cluster
    /// CIDR's. With neither known, falls back to `/64` when the node IP is IPv6
    /// and `/24` otherwise.
    fn slice_prefix_len(&self) -> u8 {
        if let Some(pool) = self.slice_cidr.or(self.cluster_cidr) {
            return pool.prefix();
        }
        let node_is_v6 = self.node_ip.is_some_and(|ip| ip.is_ipv6());
        if node_is_v6 {
            64
        } else {
            24
        }
    }

    /// `WireGuard` endpoint string for THIS node, as advertised to a guest peer
    /// on `listen_port` (the node's global overlay port, or a Dedicated
    /// service's per-service device port).
    ///
    /// overlayd has no public reflexive address at this layer, so the endpoint
    /// is the node's overlay-listen identity, `node_ip:listen_port`; the caller
    /// (the VZ runtime that ships the config into the guest) rewrites it to the
    /// concrete VZ-NAT gateway endpoint the guest can dial. When no node IP has
    /// been assigned yet the unspecified IPv4 address is used instead.
    fn node_endpoint_for_guest(&self, listen_port: u16) -> String {
        let unassigned = IpAddr::from(Ipv4Addr::UNSPECIFIED);
        let host = self.node_ip.unwrap_or(unassigned);
        SocketAddr::from((host, listen_port)).to_string()
    }

    /// Linux veth/netns attach. On non-Linux this returns the node's overlay IP
    /// (host networking) and is never wired for a `LinuxPid` handle in practice.
    #[cfg(target_os = "linux")]
    #[allow(clippy::too_many_lines)]
    async fn attach_container_linux(
        &mut self,
        container_pid: u32,
        service: &str,
        join_global: bool,
        ephemeral: bool,
        isolation_network: Option<String>,
    ) -> Result<IpAddr, OverlaydError> {
        // Resolve which bridge backs this service. A `Shared`-mode service
        // attaches onto the SINGLE node-wide shared bridge; every other mode
        // (`Auto`, `Dedicated`) attaches onto its own per-service bridge. The
        // mode was recorded at `setup_service_overlay` time.
        let use_shared = self
            .service_modes
            .get(service)
            .copied()
            .unwrap_or_default()
            .uses_shared_bridge();

        let (bridge_name, bridge_subnet, bridge_gateway, container_ip) = if use_shared {
            let bridge = self.shared_bridge.as_mut().ok_or_else(|| {
                OverlaydError::Other(format!(
                    "no shared bridge for Shared-mode service {service}; call setup_service_overlay() first"
                ))
            })?;
            let ip = bridge.ip_allocator.allocate().ok_or_else(|| {
                OverlaydError::Overlay(format!(
                    "shared bridge {} subnet {} exhausted",
                    bridge.name, bridge.subnet
                ))
            })?;
            (bridge.name.clone(), bridge.subnet, bridge.gateway, ip)
        } else {
            let bridge = self.service_bridges.get_mut(service).ok_or_else(|| {
                OverlaydError::Other(format!(
                    "no service bridge for service {service}; call setup_service_overlay() first"
                ))
            })?;
            let ip = bridge.ip_allocator.allocate().ok_or_else(|| {
                OverlaydError::Overlay(format!(
                    "service bridge {} subnet {} exhausted",
                    bridge.name, bridge.subnet
                ))
            })?;
            (bridge.name.clone(), bridge.subnet, bridge.gateway, ip)
        };

        let bridge_params = BridgeAttachParams {
            bridge_name: &bridge_name,
            gateway: bridge_gateway,
            subnet_prefix_len: bridge_subnet.prefix_len(),
        };
        if let Err(e) = self
            .attach_to_interface(
                container_pid,
                container_ip,
                "s",
                "eth0",
                Some(&bridge_params),
            )
            .await
        {
            if use_shared {
                if let Some(bridge) = self.shared_bridge.as_mut() {
                    bridge.ip_allocator.release(container_ip);
                }
            } else if let Some(bridge) = self.service_bridges.get_mut(service) {
                bridge.ip_allocator.release(container_ip);
            }
            return Err(e);
        }

        let mut global_ip: Option<IpAddr> = None;
        if join_global && self.global_interface.is_some() {
            let g_ip = self.ip_allocator.allocate()?;
            self.attach_to_interface(container_pid, g_ip, "g", "eth1", None)
                .await?;
            global_ip = Some(g_ip);
        }

        // Per-network L3 isolation: when this attach joins a named isolated
        // network, install the Docker-style iptables rules pinning this member
        // to its own network's members + node + egress, then record it in the
        // membership map. Non-fatal: a host without iptables logs and proceeds.
        if let Some(ref net) = isolation_network {
            let node_ip = self
                .node_ip
                .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::new(10, 200, 0, 1)));
            let cidr = self
                .cluster_cidr
                .map_or_else(|| "10.200.0.0/16".to_string(), |c| c.to_string());
            let peers: Vec<IpAddr> = self
                .network_members
                .get(net)
                .map(|m| m.iter().copied().collect())
                .unwrap_or_default();
            if let Err(e) = zlayer_overlay::firewall::ensure_member_isolation(
                net,
                container_ip,
                &peers,
                node_ip,
                &cidr,
            ) {
                tracing::warn!(network = %net, member = %container_ip, error = %e, "failed to install per-network L3 isolation (non-fatal)");
            }
            self.network_members
                .entry(net.clone())
                .or_default()
                .insert(container_ip);
        }

        self.attached.insert(
            container_pid,
            AttachInfo {
                service_ip: container_ip,
                service_name: Some(service.to_string()),
                global_ip,
                ephemeral,
                isolation_network,
            },
        );

        Ok(container_ip)
    }

    /// Non-Linux stand-in for the PID-keyed attach. Off Linux containers share
    /// the host network, so nothing is wired or allocated: the result is the
    /// node's overlay IP, or IPv4 loopback when the node has none assigned.
    #[cfg(not(target_os = "linux"))]
    #[allow(clippy::unused_async)]
    async fn attach_container_linux(
        &mut self,
        _container_pid: u32,
        service: &str,
        _join_global: bool,
        _ephemeral: bool,
        _isolation_network: Option<String>,
    ) -> Result<IpAddr, OverlaydError> {
        tracing::debug!(service = %service, "LinuxPid attach is a no-op off Linux; using node overlay IP");
        let loopback = IpAddr::from(Ipv4Addr::LOCALHOST);
        let host_ip = self.node_ip.unwrap_or(loopback);
        Ok(host_ip)
    }

    /// Release the overlay resources held by a Linux container PID. Idempotent.
    #[cfg(target_os = "linux")]
    async fn detach_container_linux(&mut self, pid: u32) -> Result<(), OverlaydError> {
        // "Process id or not, kill the adapter": the host-side veth name is
        // deterministic (`veth-<pid>-{s,g}`), so delete it UNCONDITIONALLY by
        // name — even when no attach record survives (a previous daemon crashed
        // before recording it, or it was already reaped). Without this, a missing
        // record left the host veth orphaned until the PID-keyed periodic sweep
        // (which only fires once the PID is dead). The deletes are idempotent
        // (ENODEV = success), so the always-on `-g` delete is harmless when the
        // container never joined the global overlay.
        let info = self.attached.remove(&pid);

        let veth_s = format!("veth-{pid}-s");
        if let Err(e) = crate::netlink::delete_link_by_name(&veth_s).await {
            tracing::warn!(link = %veth_s, pid, error = %e, "Failed to delete service veth");
        }
        let veth_g = format!("veth-{pid}-g");
        if let Err(e) = crate::netlink::delete_link_by_name(&veth_g).await {
            tracing::warn!(link = %veth_g, pid, error = %e, "Failed to delete global veth");
        }

        // No attach record -> nothing more to release (IP/registry bookkeeping
        // is keyed off the record). The veths above are already gone.
        let Some(info) = info else {
            return Ok(());
        };

        // Return the service IP to whichever pool owns it. A Shared-mode service
        // drew its IP from the single node-wide shared bridge (no per-service
        // bridge exists for it), so try the shared bridge by subnet containment
        // before the named per-service bridge.
        if self.shared_bridge.as_mut().is_some_and(|b| {
            b.subnet.contains(&info.service_ip) && b.ip_allocator.release(info.service_ip)
        }) {
            // released into the shared bridge
        } else if let Some(svc) = info.service_name.as_deref() {
            if let Some(bridge) = self.service_bridges.get_mut(svc) {
                bridge.ip_allocator.release(info.service_ip);
            } else {
                tracing::debug!(service = %svc, ip = %info.service_ip, "detach: service bridge already torn down; dropping service IP release");
            }
        } else {
            self.ip_allocator.release(info.service_ip);
        }
        if let Some(g) = info.global_ip {
            self.ip_allocator.release(g);
        }

        // Per-network L3 isolation drain: remove this member from its isolated
        // network's membership set and tear down its iptables rules against the
        // remaining members. Drop the network entry once empty.
        if let Some(net) = info.isolation_network.as_deref() {
            if let Some(set) = self.network_members.get_mut(net) {
                set.remove(&info.service_ip);
            }
            let still: Vec<IpAddr> = self
                .network_members
                .get(net)
                .map(|m| m.iter().copied().collect())
                .unwrap_or_default();
            let node_ip = self
                .node_ip
                .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::new(10, 200, 0, 1)));
            let cidr = self
                .cluster_cidr
                .map_or_else(|| "10.200.0.0/16".to_string(), |c| c.to_string());
            zlayer_overlay::firewall::remove_member_isolation(
                net,
                info.service_ip,
                &still,
                node_ip,
                &cidr,
            );
            if self
                .network_members
                .get(net)
                .is_some_and(std::collections::HashSet::is_empty)
            {
                self.network_members.remove(net);
            }
        }

        // Ephemeral last-leaver teardown: a standalone/per-job bridge is reclaimed
        // the moment its LAST container leaves (the periodic prune is only the
        // ~300s backstop). Managed attaches use ephemeral=false so their bridge
        // persists across scale-to-0. Route through teardown_service_overlay so
        // overlayd's in-memory state stays synced — never a hand `ip link del`.
        // This container's veth is already removed above, so a 0 member count
        // means no containers remain on the bridge.
        if info.ephemeral {
            if let Some(svc) = info.service_name.clone() {
                if let Some(bridge_name) = self.service_bridges.get(&svc).map(|b| b.name.clone()) {
                    if crate::netlink::bridge_member_count(&bridge_name).await == 0 {
                        tracing::info!(service = %svc, bridge = %bridge_name, "ephemeral overlay bridge idle after last detach — tearing down");
                        self.teardown_service_overlay(&svc).await;
                    }
                }
            }
        }
        Ok(())
    }

    /// Non-Linux fallback for the PID-keyed detach: there is nothing to detach
    /// (host networking), so this ignores `_pid` and always succeeds.
    ///
    /// # Errors
    /// Never returns `Err`; the `Result` type only mirrors the Linux
    /// implementation's signature.
    #[cfg(not(target_os = "linux"))]
    #[allow(clippy::unused_async)]
    async fn detach_container_linux(&mut self, _pid: u32) -> Result<(), OverlaydError> {
        // The non-Linux attach fallback allocates no IP and creates no link, so
        // there is no state to release here.
        Ok(())
    }

    /// Garbage-collect veth endpoints whose owning container process is gone.
    ///
    /// Every link named `veth-<pid>-*` or `vc-<pid>-*` is inspected; when
    /// `/proc/<pid>` does not exist the link is deleted. Best-effort: a failure
    /// to list or delete links is logged and otherwise ignored.
    #[cfg(target_os = "linux")]
    async fn sweep_orphan_veths() {
        let all_links = match crate::netlink::list_all_links().await {
            Err(e) => {
                tracing::warn!(error = %e, "Failed to list links for orphan sweep");
                return;
            }
            Ok(found) => found,
        };

        for (_index, name) in all_links {
            // The owning PID is the first `-`-separated field after the
            // `veth-` / `vc-` prefix; anything that doesn't fit is not ours.
            let owner_pid = name
                .strip_prefix("veth-")
                .or_else(|| name.strip_prefix("vc-"))
                .and_then(|rest| rest.split('-').next())
                .and_then(|field| field.parse::<u32>().ok());
            let Some(pid) = owner_pid else {
                continue;
            };

            let owner_alive = Path::new(&format!("/proc/{pid}")).exists();
            if !owner_alive {
                tracing::info!(link = %name, pid = pid, "Deleting orphan veth");
                if let Err(e) = crate::netlink::delete_link_by_name(&name).await {
                    tracing::warn!(link = %name, error = %e, "Failed to delete orphan veth");
                }
            }
        }
    }

    /// Connect a container's network namespace to the host through a veth pair.
    ///
    /// The host end is named `veth-<pid>-<tag>`; the peer is created as
    /// `vc-<pid>-<tag>`, moved into the namespace at `/proc/<pid>/ns/net` and
    /// renamed to `container_iface`. Inside the namespace the peer receives
    /// `ip`, is brought up together with `lo`, and — when `bridge` is given —
    /// gets a default route via the bridge gateway. On the host the veth is
    /// brought up and then either enslaved to the bridge or, without a bridge,
    /// reached through a host route (`/32` for IPv4, `/128` for IPv6).
    ///
    /// The in-container address prefix is the bridge's `subnet_prefix_len` when
    /// bridged, otherwise `/64` (IPv6) or `/24` (IPv4).
    ///
    /// # Errors
    /// Returns [`OverlaydError::Overlay`] if the namespace cannot be opened, the
    /// stale-link pre-cleanup fails, or any netlink step fails. When a step
    /// after the pre-cleanup fails, both link names are deleted (best-effort)
    /// before the error is returned.
    #[cfg(target_os = "linux")]
    #[allow(clippy::too_many_lines)]
    async fn attach_to_interface(
        &mut self,
        container_pid: u32,
        ip: IpAddr,
        tag: &str,
        container_iface: &str,
        bridge: Option<&BridgeAttachParams<'_>>,
    ) -> Result<(), OverlaydError> {
        // A previous daemon crash may have left veths behind; clear them first
        // (best-effort, never fails the attach).
        Self::sweep_orphan_veths().await;

        let is_v6 = ip.is_ipv6();
        let prefix_len: u8 = match bridge {
            Some(params) => params.subnet_prefix_len,
            None if is_v6 => 64,
            None => 24,
        };
        let host_prefix: u8 = if is_v6 { 128 } else { 32 };

        let veth_host = format!("veth-{container_pid}-{tag}");
        let veth_pending = format!("vc-{container_pid}-{tag}");
        let iface_in_container = container_iface.to_string();

        // Open the namespace handle up front so a vanished container is
        // reported before any link is created.
        let netns_file =
            std::fs::File::open(format!("/proc/{container_pid}/ns/net")).map_err(|e| {
                OverlaydError::Overlay(format!("Failed to open /proc/{container_pid}/ns/net: {e}"))
            })?;
        let container_ns_fd = std::os::fd::OwnedFd::from(netns_file);

        // Remove leftovers that carry the names we are about to create.
        crate::netlink::delete_link_by_name(&veth_host)
            .await
            .map_err(|e| OverlaydError::Overlay(format!("pre-cleanup delete {veth_host}: {e}")))?;
        crate::netlink::delete_link_by_name(&veth_pending)
            .await
            .map_err(|e| {
                OverlaydError::Overlay(format!("pre-cleanup delete {veth_pending}: {e}"))
            })?;

        let gateway: Option<IpAddr> = bridge.map(|params| params.gateway);
        let bridge_name: Option<String> = bridge.map(|params| params.bridge_name.to_string());
        let node_ip = self.node_ip;

        // Everything that must be rolled back on failure lives in this block.
        let result: Result<(), OverlaydError> = async {
            crate::netlink::create_veth_pair(&veth_host, &veth_pending)
                .await
                .map_err(|e| OverlaydError::Overlay(format!("create veth pair: {e}")))?;

            crate::netlink::move_link_into_netns_fd_and_rename(
                &veth_pending,
                AsFd::as_fd(&container_ns_fd),
                &iface_in_container,
            )
            .map_err(|e| OverlaydError::Overlay(format!("move veth into netns: {e}")))?;

            // Address + link state + default route are configured from inside
            // the container namespace, on a blocking thread.
            let iface = iface_in_container.clone();
            let netns_setup = tokio::task::spawn_blocking(move || {
                crate::netlink::with_netns_fd_async(container_ns_fd, move || async move {
                    crate::netlink::add_address_to_link_by_name(&iface, ip, prefix_len).await?;
                    crate::netlink::set_link_up_by_name(&iface).await?;
                    crate::netlink::set_link_up_by_name("lo").await?;
                    if let Some(gw) = gateway {
                        crate::netlink::add_default_route_via_gateway(gw).await?;
                    }
                    Ok(())
                })
            });
            netns_setup
                .await
                .map_err(|e| OverlaydError::Overlay(format!("container netns task panicked: {e}")))?
                .map_err(|e| OverlaydError::Overlay(format!("container netns ops: {e}")))?;

            crate::netlink::set_link_up_by_name(&veth_host)
                .await
                .map_err(|e| OverlaydError::Overlay(format!("set {veth_host} up: {e}")))?;

            match bridge_name.as_deref() {
                // Bridged: the host end becomes a bridge port.
                Some(bname) => {
                    crate::netlink::add_link_to_bridge(&veth_host, bname)
                        .await
                        .map_err(|e| {
                            OverlaydError::Overlay(format!(
                                "enslave {veth_host} to bridge {bname}: {e}"
                            ))
                        })?;
                }
                // Bridgeless: reach the container through a host route.
                None => {
                    crate::netlink::replace_route_via_dev(ip, host_prefix, &veth_host, node_ip)
                        .await
                        .map_err(|e| {
                            OverlaydError::Overlay(format!(
                                "host route for {ip}/{host_prefix}: {e}"
                            ))
                        })?;
                }
            }

            Ok(())
        }
        .await;

        if result.is_ok() {
            // Turn on IP forwarding so the host routes between the overlay
            // device(s) and the egress NIC. CRITICAL: scoped to the address
            // family in use and, for IPv6, to the specific overlay devices —
            // NEVER `net.ipv6.conf.all.forwarding`. That global knob has the
            // documented kernel side effect of forcing `accept_ra=0` +
            // `autoconf=0` on every IPv6 interface (the public NIC included),
            // which drops the RA-learned default route / path-MTU and
            // blackholes the host's own larger replies (e.g. inbound SSH stalls
            // after key exchange). It runs outside the `result` block so a
            // forwarding-sysctl failure can never roll back a good attach, and
            // it is tracked so teardown reverts it.
            self.enable_forwarding_for_attach(is_v6, &veth_host, bridge_name.as_deref());

            // Remember what this attach created on the host so a clean global
            // teardown can revert it. The host veth exists on both paths; the
            // host route exists only on the bridgeless path. Deletions are
            // idempotent, so an earlier per-container detach is harmless.
            self.created_veths.insert(veth_host.clone());
            if bridge_name.is_none() {
                self.created_host_routes
                    .push((ip, host_prefix, veth_host.clone()));
            }
        } else {
            // Roll back whatever half-built links remain; errors are ignored.
            let _ = crate::netlink::delete_link_by_name(&veth_host).await;
            let _ = crate::netlink::delete_link_by_name(&veth_pending).await;
        }
        result
    }

    // -- container attach (Windows HCN) -------------------------------------

    /// Windows attach: ensure the target HCN network exists, allocate or
    /// validate the container IP, create the per-container HCN endpoint +
    /// namespace, and return the bare-lowercase namespace GUID for the agent to
    /// embed in the compute-system document.
    ///
    /// The network and the IP pool are picked by the first rule that matches:
    /// 1. `isolation_network` is set — the per-isolation-network Internal
    ///    network, with IPs from an allocator bounded to that network's subnet.
    /// 2. the service is recorded as Shared mode — the single shared HCN NAT
    ///    network, with IPs from the node-slice allocator.
    /// 3. the service has a dedicated subnet — its own per-service Internal
    ///    network, with IPs from an allocator bounded to the service subnet.
    /// 4. otherwise — the base overlay network, with IPs from the node slice.
    ///
    /// An `ip_override` is used verbatim; for rules 1 and 3 it must lie inside
    /// the corresponding subnet. `dns_server` / `dns_domain` fall back to the
    /// server-wide values when `None`.
    ///
    /// If endpoint creation fails after this call drew an address from an
    /// allocator, the address is handed back to that allocator before the error
    /// is returned, so a failed attach does not consume an address.
    ///
    /// # Errors
    /// Returns an error if the network/endpoint cannot be created, the node
    /// slice is not assigned yet (rules 2 and 4), an `ip_override` is outside
    /// the required subnet, or the pool is exhausted.
    #[cfg(target_os = "windows")]
    #[allow(clippy::too_many_lines)]
    async fn attach_container_windows(
        &mut self,
        container_id: &str,
        service: &str,
        ip_override: Option<IpAddr>,
        dns_server: Option<IpAddr>,
        dns_domain: Option<String>,
        isolation_network: Option<String>,
    ) -> Result<AttachResult, OverlaydError> {
        // Resolve whether THIS service has a dedicated per-service overlay. It
        // does iff a live dedicated transport exists OR a `hcn-internal` marker
        // entry is recorded under `owner_for_service(service)` (the network
        // survives daemon restarts even if the transport map is empty mid-init).
        // Dedicated services attach onto their OWN per-service Internal network
        // and draw IPs from the service subnet; everything else uses the node's
        // base/shared overlay network and the node slice.
        let dedicated_subnet = self.dedicated_service_subnet(service);
        // A `Shared`-mode service attaches onto the SINGLE shared HCN NAT network
        // reused across all Shared services (container ports are exposed via the
        // userspace free-port L4 proxy). The mode was recorded at setup time.
        let use_shared_nat = self
            .service_modes
            .get(service)
            .copied()
            .unwrap_or_default()
            .uses_shared_bridge();

        // `pool_key` remembers where `ip` came from so a failed endpoint create
        // can give it back: `Some(key)` names an entry in
        // `service_ip_allocators`, `None` means the node-slice allocator.
        let (net_id, ip, prefix_length, pool_key) = if let Some(net) =
            isolation_network.as_deref()
        {
            // ----- per-isolation-network Internal HCN network path -----
            //
            // An "isolated" ZLayer network routes its members onto a dedicated
            // HCN Internal vSwitch keyed by the isolation-network NAME (not the
            // service). HCN Internal vSwitches are mutually isolated by default,
            // so same-network members share one vSwitch (reach each other +
            // egress via the network gateway + the node), while different
            // isolation networks land on separate vSwitches and cannot reach
            // each other — L3 isolation with NO ACLs and NO per-member churn.
            // This mirrors the Dedicated per-service branch below, but keyed by
            // the isolation-network name and drawing IPs from a per-network
            // subnet carved deterministically from the node slice.
            let iso_subnet = self.isolation_network_subnet(net)?;
            let net_id = self.ensure_isolation_network(net, iso_subnet).await?;

            // Per-network container IPs come from the isolation network's own
            // subnet (never the node slice), via a lazily-created allocator
            // bounded to that subnet. The allocator is keyed by the isolation
            // network's owner key so it never collides with a same-named
            // dedicated service's allocator. An `ip_override` is honored only
            // when it falls inside the isolation subnet.
            let iso_ipnetwork: IpNetwork = iso_subnet.to_string().parse().map_err(|e| {
                OverlaydError::Other(format!(
                    "failed to parse isolation subnet {iso_subnet}: {e}"
                ))
            })?;
            let alloc_key = crate::network_state::owner_for_isolation_network(net);
            let allocator = self
                .service_ip_allocators
                .entry(alloc_key.clone())
                .or_insert_with(|| IpAllocator::new(iso_ipnetwork));
            let ip = match ip_override {
                Some(ip) if iso_subnet.contains(&ip) => ip,
                Some(ip) => {
                    return Err(OverlaydError::Other(format!(
                        "overridden IP {ip} is not inside isolation network subnet {iso_subnet} for network {net}"
                    )));
                }
                None => allocator.allocate()?,
            };
            (net_id, ip, iso_subnet.prefix_len(), Some(alloc_key))
        } else if use_shared_nat {
            // ----- shared HCN NAT network path -----
            let slice = self.slice_cidr.ok_or_else(|| {
                OverlaydError::Other(
                    "no node slice assigned yet (SetupGlobalOverlay with slice_cidr first)"
                        .to_string(),
                )
            })?;
            let slice_ipnet: ipnet::IpNet = slice.to_string().parse().map_err(|e| {
                OverlaydError::Other(format!("failed to parse slice CIDR {slice}: {e}"))
            })?;
            let net_id = self.ensure_shared_nat_network(slice_ipnet).await?;
            let ip = match ip_override {
                Some(ip) => ip,
                None => self.ip_allocator.allocate()?,
            };
            (net_id, ip, slice_ipnet.prefix_len(), None)
        } else if let Some(svc_subnet) = dedicated_subnet {
            // ----- dedicated per-service network path -----
            let net_id = self.ensure_service_network(service, svc_subnet).await?;

            // Allocate (or validate) the IP from the SERVICE subnet, not the
            // node slice. A per-service allocator is created lazily and bounded
            // to the service subnet so addresses stay inside the dedicated
            // network. An `ip_override` inside the service subnet is honored;
            // one outside it is rejected so a slice-allocated IP can't leak onto
            // the dedicated network.
            let svc_ipnetwork: IpNetwork = svc_subnet.to_string().parse().map_err(|e| {
                OverlaydError::Other(format!("failed to parse service subnet {svc_subnet}: {e}"))
            })?;
            let allocator = self
                .service_ip_allocators
                .entry(service.to_string())
                .or_insert_with(|| IpAllocator::new(svc_ipnetwork));
            let ip = match ip_override {
                Some(ip) if svc_subnet.contains(&ip) => ip,
                Some(ip) => {
                    return Err(OverlaydError::Other(format!(
                        "overridden IP {ip} is not inside dedicated service subnet {svc_subnet} for service {service}"
                    )));
                }
                None => allocator.allocate()?,
            };
            (net_id, ip, svc_subnet.prefix_len(), Some(service.to_string()))
        } else {
            // ----- shared base overlay network path (unchanged) -----
            let slice = self.slice_cidr.ok_or_else(|| {
                OverlaydError::Other(
                    "no node slice assigned yet (SetupGlobalOverlay with slice_cidr first)"
                        .to_string(),
                )
            })?;
            let slice_ipnet: ipnet::IpNet = slice.to_string().parse().map_err(|e| {
                OverlaydError::Other(format!("failed to parse slice CIDR {slice}: {e}"))
            })?;
            let net_id = self.ensure_overlay_network(slice_ipnet).await?;
            let ip = match ip_override {
                Some(ip) => ip,
                None => self.ip_allocator.allocate()?,
            };
            (net_id, ip, slice_ipnet.prefix_len(), None)
        };

        // Create the endpoint + per-container namespace on the chosen network.
        let dns_server_eff = dns_server.or_else(|| self.dns_server_addr.map(|a| a.ip()));
        let dns_domain_for_attach = dns_domain.or_else(|| self.dns_domain.clone());
        let cluster_cidr = self.cluster_cidr.map(|c| c.to_string()).unwrap_or_default();
        let owner_tag = owner_tag(&self.deployment_or_default());
        let cid = container_id.to_string();

        let attach_outcome = tokio::task::spawn_blocking(move || {
            zlayer_hns::attach::EndpointAttachment::create_overlay(
                net_id,
                &owner_tag,
                cid.as_str(),
                ip,
                prefix_length,
                &cluster_cidr,
                dns_server_eff,
                dns_domain_for_attach.as_deref(),
            )
        })
        .await
        .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))
        .and_then(|created| {
            created.map_err(|e| {
                OverlaydError::Overlay(format!("HCN overlay endpoint attach failed: {e}"))
            })
        });

        let attachment = match attach_outcome {
            Ok(attachment) => attachment,
            Err(err) => {
                // Nothing was recorded in `hcn_cleanup` yet, so a later detach
                // can never find this address. Give it back now — but only if
                // this call allocated it; an overridden IP was never drawn from
                // a pool here.
                if ip_override.is_none() {
                    if let Some(key) = pool_key.as_ref() {
                        if let Some(allocator) = self.service_ip_allocators.get_mut(key) {
                            allocator.release(ip);
                        }
                    } else {
                        self.ip_allocator.release(ip);
                    }
                }
                return Err(err);
            }
        };

        let namespace_id = attachment.namespace_id();
        let bare_guid = format_guid_bare(namespace_id);

        // Per-network membership: record the container's IP in its isolated
        // network's member set. Windows enforcement is an HCN ACL — a
        // Linux-incompatible mechanism wired separately; overlayd only maintains
        // the membership map here and does NOT call the iptables firewall helper.
        if let Some(ref net) = isolation_network {
            self.network_members
                .entry(net.clone())
                .or_default()
                .insert(ip);
        }

        // Record for autoclean keyed by namespace GUID.
        self.hcn_cleanup
            .insert(namespace_id, (service.to_string(), ip, isolation_network));

        tracing::info!(
            ns = %bare_guid,
            service = %service,
            ip = %ip,
            "Attached container to HCN overlay"
        );

        Ok(AttachResult {
            ip,
            namespace_guid: Some(bare_guid),
        })
    }

    /// Non-Windows path: a `WindowsContainer` handle has no meaning off Windows.
    ///
    /// Every argument is ignored.
    ///
    /// # Errors
    /// Always returns [`OverlaydError::Other`] stating that `WindowsContainer`
    /// attach is only supported on Windows.
    #[cfg(not(target_os = "windows"))]
    #[allow(clippy::unused_async)]
    async fn attach_container_windows(
        &mut self,
        _container_id: &str,
        _service: &str,
        _ip_override: Option<IpAddr>,
        _dns_server: Option<IpAddr>,
        _dns_domain: Option<String>,
        _isolation_network: Option<String>,
    ) -> Result<AttachResult, OverlaydError> {
        Err(OverlaydError::Other(
            "WindowsContainer attach is only supported on Windows".to_string(),
        ))
    }

    /// Detach a Windows container by its bare namespace GUID and release its IP.
    /// Idempotent: an unparseable or unknown id is a no-op that returns `Ok(())`.
    ///
    /// The address goes back to the pool it was drawn from:
    /// - an isolation-network member releases into the per-network allocator
    ///   (keyed by the isolation owner key);
    /// - a member of a service that has its own per-service allocator (keyed by
    ///   the service name) releases into that allocator;
    /// - everything else, or any case where the expected allocator no longer
    ///   exists, releases into the node-slice allocator.
    ///
    /// When the last member of an isolation network leaves, that network's HCN
    /// network, marker entry and allocator are reclaimed.
    ///
    /// # Errors
    /// Currently never fails; the `Result` matches the non-Windows variant.
    #[cfg(target_os = "windows")]
    async fn detach_container_windows(
        &mut self,
        namespace_guid: &str,
    ) -> Result<(), OverlaydError> {
        use windows::core::GUID;

        let Ok(guid) = GUID::try_from(namespace_guid) else {
            tracing::warn!(ns = %namespace_guid, "detach: unparseable namespace GUID");
            return Ok(());
        };
        let Some((service, ip, isolation_network)) = self.hcn_cleanup.remove(&guid) else {
            return Ok(());
        };

        // Pick the allocator key the attach would have used. Isolation members
        // use the isolation owner key; otherwise the service name, under which
        // an allocator exists only for services that drew from their own
        // service subnet. Releasing those into the node slice would leak the
        // address in the service subnet.
        // NOTE(review): assumes a service does not switch between slice-backed
        // and dedicated addressing while containers are attached — confirm.
        let pool_key = match isolation_network.as_deref() {
            Some(net) => crate::network_state::owner_for_isolation_network(net),
            None => service.clone(),
        };
        if let Some(allocator) = self.service_ip_allocators.get_mut(&pool_key) {
            allocator.release(ip);
        } else {
            self.ip_allocator.release(ip);
        }

        // Drain the per-network membership set and note whether it emptied.
        let mut net_now_empty: Option<String> = None;
        if let Some(net) = isolation_network.as_deref() {
            let emptied = self.network_members.get_mut(net).is_some_and(|members| {
                members.remove(&ip);
                members.is_empty()
            });
            if emptied {
                self.network_members.remove(net);
                net_now_empty = Some(net.to_string());
            }
        }
        tracing::info!(ns = %namespace_guid, service = %service, ip = %ip, "Released HCN overlay attachment");

        // Last-member teardown: when the final member of an isolation network
        // leaves, reclaim its per-network HCN Internal network (mirroring the
        // per-service network teardown in `teardown_service_overlay`) so we
        // don't leak an HCN vSwitch until the next full uninstall. Drop the
        // per-network IP allocator and the marker entry too.
        if let Some(net) = net_now_empty {
            self.teardown_isolation_network(&net).await;
        }
        Ok(())
    }

    /// Tear down everything kept for the isolation network `net`.
    ///
    /// Drops the network's container-IP allocator, removes its entry from the
    /// on-disk network marker, and — when that entry is of kind `hcn-internal`
    /// with a parseable GUID — deletes the HCN network itself. Best-effort:
    /// every failure is logged and swallowed, and a missing marker entry means
    /// there is nothing further to do. Called once the last member of the
    /// isolation network detaches; mirrors the per-service network teardown in
    /// [`Self::teardown_service_overlay`].
    #[cfg(target_os = "windows")]
    async fn teardown_isolation_network(&mut self, net: &str) {
        let owner = crate::network_state::owner_for_isolation_network(net);

        // The per-network allocator goes first; it holds no host state.
        self.service_ip_allocators.remove(&owner);

        let marker_path =
            zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();
        let mut marker = crate::network_state::NetworkState::load(&marker_path);

        // No marker entry: nothing was recorded, so nothing to persist or delete.
        let Some(entry) = marker.remove(&owner) else {
            return;
        };
        if let Err(e) = marker.save(&marker_path) {
            tracing::warn!(network = %net, error = %e, path = %marker_path.display(), "failed to persist isolation-network marker removal");
        }

        // Only Internal networks recorded by us are deleted from the host.
        if entry.kind != "hcn-internal" {
            return;
        }
        let Ok(guid) = windows::core::GUID::try_from(entry.id.as_str()) else {
            tracing::warn!(network = %net, id = %entry.id, "isolation-network marker has unparseable HCN GUID; skipping network delete");
            return;
        };

        let id_str = entry.id.clone();
        let net_owned = net.to_string();
        let outcome =
            tokio::task::spawn_blocking(move || zlayer_hns::network::Network::delete(guid)).await;
        match outcome {
            Ok(Ok(())) => {
                tracing::info!(network = %net_owned, id = %id_str, "deleted per-isolation-network HCN network on last detach");
            }
            Ok(Err(e)) => {
                tracing::warn!(network = %net_owned, id = %id_str, error = %e, "failed to delete isolation-network HCN network (may leak until uninstall)");
            }
            Err(e) => {
                tracing::warn!(network = %net_owned, id = %id_str, error = %e, "spawn_blocking join failed deleting isolation-network HCN network");
            }
        }
    }

    /// Non-Windows path: there is no HCN attachment to release.
    ///
    /// Ignores `_namespace_guid`, performs no work, and always returns `Ok(())`.
    #[cfg(not(target_os = "windows"))]
    #[allow(clippy::unused_async)]
    async fn detach_container_windows(
        &mut self,
        _namespace_guid: &str,
    ) -> Result<(), OverlaydError> {
        Ok(())
    }

    /// Return the GUID of the per-daemon HCN overlay network, creating the
    /// network when the host does not have it yet.
    ///
    /// Lookup order:
    /// 1. the GUID recorded under the base owner in the
    ///    `{data_dir}/agent_network.json` marker, if that network still opens;
    /// 2. any host network whose queried name equals the derived overlay name;
    /// 3. otherwise a new network over `slice_cidr` — an Internal vSwitch with
    ///    no physical-NIC binding by default, or a Transparent network bound to
    ///    the primary adapter when `ZLAYER_HCN_UPLINK_ADAPTER` is set to a
    ///    non-blank value. A freshly created network is recorded in the marker.
    ///
    /// # Errors
    /// Propagates the underlying `zlayer_hns` error on create failure, and
    /// reports blocking-task join failures, GUID generation failures and (in
    /// Transparent mode) adapter discovery failures.
    #[cfg(target_os = "windows")]
    #[allow(clippy::too_many_lines)]
    async fn ensure_overlay_network(
        &mut self,
        slice_cidr: ipnet::IpNet,
    ) -> Result<windows::core::GUID, OverlaydError> {
        use windows::core::GUID;

        let deployment = self.deployment_or_default();
        let net_name = overlay_network_name(&deployment);
        let marker_path =
            zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();

        // 1) Marker hit: the recorded GUID is reused if the network still opens.
        let recorded = crate::network_state::NetworkState::load(&marker_path)
            .get(crate::network_state::OWNER_BASE)
            .and_then(|entry| GUID::try_from(entry.id.as_str()).ok());
        if let Some(recorded_id) = recorded {
            let handle = tokio::task::spawn_blocking(move || {
                zlayer_hns::network::Network::open(recorded_id).ok()
            })
            .await
            .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?;
            if handle.is_some() {
                tracing::info!(name = %net_name, "reusing HCN overlay network from marker");
                return Ok(recorded_id);
            }
        }

        // 2) Name hit: scan host networks for one carrying our derived name.
        //    Networks that fail to open or query are skipped.
        let wanted = net_name.clone();
        let by_name = tokio::task::spawn_blocking(move || -> Option<GUID> {
            zlayer_hns::network::list("{}")
                .ok()?
                .into_iter()
                .find(|candidate| {
                    zlayer_hns::network::Network::open(*candidate).is_ok_and(|network| {
                        matches!(network.query("{}"), Ok(props) if props.name == wanted)
                    })
                })
        })
        .await
        .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?;
        if let Some(existing_id) = by_name {
            tracing::info!(name = %net_name, "reusing existing HCN overlay network");
            return Ok(existing_id);
        }

        // 3) Nothing reusable: create the network.
        let net_id = GUID::new()
            .map_err(|e| OverlaydError::Other(format!("GUID::new for overlay network: {e}")))?;
        let subnet_str = slice_cidr.to_string();

        // Internal (no physical-NIC binding) is the default, so container
        // traffic never touches the operator's gateway adapter. A non-blank
        // ZLAYER_HCN_UPLINK_ADAPTER opts into the legacy Transparent model.
        let use_transparent = std::env::var(zlayer_hns::adapter::ZLAYER_UPLINK_ENV)
            .is_ok_and(|value| !value.trim().is_empty());
        let (mode_label, marker_kind) = if use_transparent {
            ("Transparent", "hcn-transparent")
        } else {
            ("Internal", "hcn-internal")
        };

        let create_name = net_name.clone();
        let create_subnet = subnet_str.clone();
        if use_transparent {
            let uplink = zlayer_hns::adapter::find_primary_adapter()
                .map_err(|e| OverlaydError::Other(format!("find_primary_adapter: {e}")))?;
            tracing::warn!(uplink = %uplink, "ZLAYER_HCN_UPLINK_ADAPTER set: creating HCN *Transparent* overlay bound to a physical NIC");
            tokio::task::spawn_blocking(move || {
                zlayer_hns::network::Network::create_transparent(
                    net_id,
                    &create_name,
                    &create_subnet,
                    &uplink,
                )
            })
            .await
            .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?
            .map_err(|e| {
                OverlaydError::Overlay(format!("HcnCreateNetwork transparent ({net_name}): {e}"))
            })?;
        } else {
            tokio::task::spawn_blocking(move || {
                zlayer_hns::network::Network::create_internal(net_id, &create_name, &create_subnet)
            })
            .await
            .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?
            .map_err(|e| {
                OverlaydError::Overlay(format!("HcnCreateNetwork internal ({net_name}): {e}"))
            })?;
        }

        // HCN's Static IPAM needs ~1-2s after network create to settle its
        // address pool; without the pause the first endpoint frequently fails
        // with HCN_E_ADDR_INVALID_OR_RESERVED.
        tokio::time::sleep(std::time::Duration::from_secs(2)).await;

        tracing::info!(
            subnet = %subnet_str,
            mode = mode_label,
            "created HCN overlay network"
        );

        // Record the network so later runs reuse it by GUID and a full
        // uninstall knows to delete it. Best-effort: a save failure is logged.
        let mut marker = crate::network_state::NetworkState::load(&marker_path);
        marker.upsert(crate::network_state::ManagedNetwork {
            owner: crate::network_state::OWNER_BASE.to_string(),
            kind: marker_kind.to_string(),
            name: net_name.clone(),
            id: format_guid_bare(net_id),
            subnet: subnet_str.clone(),
            // Base/Shared HCN network: no dedicated WireGuard identity.
            wg_port: None,
            wg_private_key: None,
            wg_public_key: None,
            interface: None,
        });
        if let Err(e) = marker.save(&marker_path) {
            tracing::warn!(error = %e, path = %marker_path.display(), "failed to persist agent network marker (network still reusable by name)");
        }

        Ok(net_id)
    }

    /// Ensure the SINGLE shared HCN **NAT** network exists on the host, reusing
    /// one recorded under the [`OWNER_SHARED_NAT`] marker owner (or discoverable
    /// by its derived name) and recording it on create. Reused across every
    /// `OverlayMode::Shared` service on this node.
    ///
    /// NAT gives Shared containers outbound connectivity and lets the userspace
    /// free-port L4 proxy (`proxy_manager.rs`) forward `host:FREEPORT` ->
    /// `container_ip:port` without a per-service vSwitch — the Windows analogue
    /// of the Linux node-wide shared bridge. Modeled on
    /// [`Self::ensure_overlay_network`] but keyed on [`OWNER_SHARED_NAT`] and
    /// forced to the NAT network type.
    ///
    /// Lookup order: a marker entry of kind `hcn-nat` whose GUID still opens,
    /// then any host network named `<base overlay name>-shared`, then a fresh
    /// network over `slice_cidr`. Only a freshly created network is written to
    /// the marker.
    ///
    /// Returns the network GUID.
    ///
    /// # Errors
    /// Propagates the underlying `zlayer_hns` error on create failure, or an
    /// error if the slice CIDR has no usable gateway host, a blocking task
    /// cannot be joined, or a GUID cannot be generated.
    #[cfg(target_os = "windows")]
    #[allow(clippy::too_many_lines)]
    async fn ensure_shared_nat_network(
        &mut self,
        slice_cidr: ipnet::IpNet,
    ) -> Result<windows::core::GUID, OverlaydError> {
        use windows::core::GUID;

        let daemon_name = self.deployment_or_default();
        // Shared NAT network name: `<base overlay name>-shared` so it is
        // unambiguously distinct from the base network and per-service networks.
        let net_name = format!("{}-shared", overlay_network_name(&daemon_name));
        let owner = crate::network_state::OWNER_SHARED_NAT.to_string();
        let marker_path =
            zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();

        // Fast path: marker names a NAT network GUID that still exists; reopen it.
        let recorded_id = crate::network_state::NetworkState::load(&marker_path)
            .get(&owner)
            .filter(|entry| entry.kind == "hcn-nat")
            .and_then(|entry| GUID::try_from(entry.id.as_str()).ok());
        if let Some(recorded_id) = recorded_id {
            let reopened = tokio::task::spawn_blocking(move || {
                zlayer_hns::network::Network::open(recorded_id).ok()
            })
            .await
            .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?;
            if reopened.is_some() {
                tracing::info!(name = %net_name, "reusing shared HCN NAT network from marker");
                return Ok(recorded_id);
            }
        }

        // Idempotency: reuse a host network whose queried name matches ours.
        // Networks that cannot be opened or queried are skipped.
        let target_name = net_name.clone();
        let existing = tokio::task::spawn_blocking(move || -> Option<GUID> {
            let guids = zlayer_hns::network::list("{}").ok()?;
            for guid in guids {
                let Ok(network) = zlayer_hns::network::Network::open(guid) else {
                    continue;
                };
                if matches!(network.query("{}"), Ok(props) if props.name == target_name) {
                    return Some(guid);
                }
            }
            None
        })
        .await
        .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?;

        if let Some(existing_id) = existing {
            tracing::info!(name = %net_name, "reusing existing shared HCN NAT network");
            return Ok(existing_id);
        }

        let net_id = GUID::new()
            .map_err(|e| OverlaydError::Other(format!("GUID::new for shared NAT network: {e}")))?;
        let subnet_str = slice_cidr.to_string();
        let settings = shared_nat_settings(&net_name, &subnet_str).ok_or_else(|| {
            OverlaydError::Other(format!(
                "shared NAT network: slice CIDR '{subnet_str}' has no usable gateway host"
            ))
        })?;

        // `settings` already carries the network name, so the blocking closure
        // only needs the GUID and the settings.
        tokio::task::spawn_blocking(move || {
            zlayer_hns::network::Network::create(net_id, &settings)
        })
        .await
        .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?
        .map_err(|e| OverlaydError::Overlay(format!("HcnCreateNetwork NAT ({net_name}): {e}")))?;

        // HCN's IPAM needs ~1-2s after network create to settle its address pool
        // (same wait as the base/Internal networks).
        tokio::time::sleep(std::time::Duration::from_secs(2)).await;

        tracing::info!(subnet = %subnet_str, "created shared HCN NAT network");

        // Record the new network so later runs find it by GUID. Best-effort: a
        // save failure is only logged, since the name scan can still find it.
        let mut marker = crate::network_state::NetworkState::load(&marker_path);
        marker.upsert(crate::network_state::ManagedNetwork {
            owner,
            kind: "hcn-nat".to_string(),
            name: net_name.clone(),
            id: format_guid_bare(net_id),
            subnet: subnet_str.clone(),
            // Shared NAT network: no dedicated WireGuard identity.
            wg_port: None,
            wg_private_key: None,
            wg_public_key: None,
            interface: None,
        });
        if let Err(e) = marker.save(&marker_path) {
            tracing::warn!(error = %e, path = %marker_path.display(), "failed to persist shared NAT network marker (network still reusable by name)");
        }

        Ok(net_id)
    }

    /// Make sure the dedicated HCN **Internal** network for `service` is present
    /// on this host and hand back its GUID.
    ///
    /// Resolution order:
    /// 1. the GUID recorded in the network-state marker under
    ///    [`owner_for_service`]`(service)` (only when the entry's kind is
    ///    `hcn-internal` and the network can still be opened);
    /// 2. any host network whose queried name equals the derived
    ///    `<base overlay name>-svc-<service>` name;
    /// 3. otherwise a brand-new Internal network is created and recorded.
    ///
    /// This is the Windows counterpart of the Linux per-service bridge: an
    /// `OverlayMode::Dedicated` service gets its OWN internal vSwitch (no
    /// physical-NIC binding), separate from the node's shared base overlay
    /// network, so dedicated-service traffic is segregated at the vSwitch layer.
    /// It mirrors [`Self::ensure_overlay_network`] but is always of the Internal
    /// type, never Transparent (the on-box test asserts zero external vSwitches
    /// for dedicated services).
    ///
    /// # Errors
    /// Fails when a blocking task cannot be joined, a GUID cannot be generated,
    /// or `zlayer_hns` rejects the network creation. Marker persistence is
    /// best-effort and only logs on failure.
    #[cfg(target_os = "windows")]
    #[allow(clippy::too_many_lines)]
    async fn ensure_service_network(
        &mut self,
        service: &str,
        subnet: ipnet::IpNet,
    ) -> Result<windows::core::GUID, OverlaydError> {
        use windows::core::GUID;

        // `<base overlay name>-svc-<service>`: cannot be confused with the base
        // network or with another service's network.
        let base_name = overlay_network_name(&self.deployment_or_default());
        let net_name = format!("{base_name}-svc-{service}");
        let owner = owner_for_service(service);
        let marker_path =
            zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();

        // Step 1 - marker lookup. The same owner key can also hold a Dedicated
        // WireGuard entry (`kind == "wg-dedicated"`) whose `id` is a transport
        // public key rather than an HCN GUID, so only `hcn-internal` entries are
        // eligible here.
        let marker_guid = crate::network_state::NetworkState::load(&marker_path)
            .get(&owner)
            .filter(|entry| entry.kind == "hcn-internal")
            .map(|entry| entry.id.as_str())
            .and_then(|raw| GUID::try_from(raw).ok());
        if let Some(recorded_id) = marker_guid {
            let probe = tokio::task::spawn_blocking(move || {
                zlayer_hns::network::Network::open(recorded_id).ok()
            });
            let reopened = probe
                .await
                .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?;
            if reopened.is_some() {
                tracing::info!(name = %net_name, service = %service, "reusing per-service HCN network from marker");
                return Ok(recorded_id);
            }
        }

        // Step 2 - idempotency by name: scan the host's networks for one whose
        // queried name is ours. Networks that fail to open are skipped.
        let wanted = net_name.clone();
        let by_name = tokio::task::spawn_blocking(move || -> Option<GUID> {
            zlayer_hns::network::list("{}")
                .ok()?
                .into_iter()
                .find(|candidate| match zlayer_hns::network::Network::open(*candidate) {
                    Ok(network) => {
                        matches!(network.query("{}"), Ok(props) if props.name == wanted)
                    }
                    Err(_) => false,
                })
        })
        .await
        .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?;
        if let Some(existing_id) = by_name {
            tracing::info!(name = %net_name, service = %service, "reusing existing per-service HCN network");
            return Ok(existing_id);
        }

        // Step 3 - create. ALWAYS Internal for a dedicated service: isolation is
        // the requirement, and an Internal network binds no physical NIC.
        let net_id = GUID::new()
            .map_err(|e| OverlaydError::Other(format!("GUID::new for per-service network: {e}")))?;
        let subnet_str = subnet.to_string();
        let create_name = net_name.clone();
        let create_subnet = subnet_str.clone();
        tokio::task::spawn_blocking(move || {
            zlayer_hns::network::Network::create_internal(net_id, &create_name, &create_subnet)
        })
        .await
        .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?
        .map_err(|e| {
            OverlaydError::Overlay(format!("HcnCreateNetwork internal ({net_name}): {e}"))
        })?;

        // Give HCN's Static IPAM ~1-2s to settle its address pool; otherwise the
        // first endpoint frequently fails with HCN_E_ADDR_INVALID_OR_RESERVED
        // (same wait as the base network).
        tokio::time::sleep(std::time::Duration::from_secs(2)).await;

        tracing::info!(
            service = %service,
            subnet = %subnet_str,
            "created per-service HCN Internal network"
        );

        // Best-effort marker write (owner = `service:<name>`, kind =
        // `hcn-internal`) so later runs reuse the network by GUID and a full
        // uninstall (`purge_managed_networks`, which sweeps every `kind` starting
        // with `hcn`) removes it.
        //
        // On Windows a dedicated service keeps two facts under this one owner
        // key: the dedicated WireGuard identity (written by the cross-platform
        // core in `setup_service_overlay_dedicated`, kind `wg-dedicated`) and
        // this network's GUID. Because the marker is keyed by owner, the WG
        // identity fields of any existing entry are copied into the rewritten
        // `hcn-internal` entry, so one entry carries both the GUID (`id`) and the
        // WG identity (`wg_*` / `interface`) and the WG private key survives
        // restarts. (The core re-asserts the `wg-dedicated` shape on the next
        // setup; this path re-asserts `hcn-internal` again right after — both are
        // self-healing because the network is also reusable by name.)
        let mut marker = crate::network_state::NetworkState::load(&marker_path);
        let (wg_port, wg_private_key, wg_public_key, interface) = match marker.get(&owner) {
            Some(previous) => (
                previous.wg_port,
                previous.wg_private_key.clone(),
                previous.wg_public_key.clone(),
                previous.interface.clone(),
            ),
            None => (None, None, None, None),
        };
        marker.upsert(crate::network_state::ManagedNetwork {
            owner,
            kind: "hcn-internal".to_string(),
            name: net_name.clone(),
            id: format_guid_bare(net_id),
            subnet: subnet_str.clone(),
            wg_port,
            wg_private_key,
            wg_public_key,
            interface,
        });
        if let Err(e) = marker.save(&marker_path) {
            tracing::warn!(service = %service, error = %e, path = %marker_path.display(), "failed to persist per-service network marker (network still reusable by name)");
        }

        Ok(net_id)
    }

    /// Resolve the per-isolation-network subnet for `net`, carving a fixed-size
    /// sub-block out of the node slice deterministically by name hash.
    ///
    /// Isolation networks attach onto a dedicated HCN Internal vSwitch and need
    /// their OWN address pool (never the node slice's shared pool) so a member's
    /// IP is on-link with its network's gateway. Unlike dedicated services,
    /// isolation networks aren't registered in the cluster's
    /// [`ServiceSubnetRegistry`] (a standalone isolated container may use the
    /// base overlay, where no `SetupServiceOverlay` ran), so the subnet is
    /// derived locally: the node slice is viewed as a sequence of
    /// `/<sub_prefix>` blocks (`/28` for IPv4, `/124` for IPv6, clamped to the
    /// slice's own prefix) and the hash of the network name selects one.
    ///
    /// The selected block is computed arithmetically
    /// (`network address | index << host bits`) instead of enumerating every
    /// block, so a wide IPv6 slice (e.g. a `/64`, which holds 2^60 `/124`
    /// blocks) costs the same as a small IPv4 one. The block chosen is the same
    /// one that indexing the ascending block list with `hash % block_count`
    /// would give.
    ///
    /// # Errors
    /// Returns an error if no node slice is assigned yet, the slice CIDR is
    /// unparseable, or the carved block cannot be represented as a network.
    #[cfg(target_os = "windows")]
    fn isolation_network_subnet(&self, net: &str) -> Result<ipnet::IpNet, OverlaydError> {
        use std::hash::{Hash, Hasher};

        let slice = self.slice_cidr.ok_or_else(|| {
            OverlaydError::Other(
                "no node slice assigned yet (SetupGlobalOverlay with slice_cidr first)".to_string(),
            )
        })?;
        let slice_ipnet: ipnet::IpNet = slice.to_string().parse().map_err(|e| {
            OverlaydError::Other(format!("failed to parse slice CIDR {slice}: {e}"))
        })?;

        // Normalise both families to a u128 address space so the block
        // arithmetic below is written once. `network()` clears any host bits
        // the slice CIDR may carry.
        let (base, addr_bits, target_prefix): (u128, u8, u8) = match slice_ipnet {
            ipnet::IpNet::V4(v4) => (u128::from(u32::from(v4.network())), 32, 28),
            ipnet::IpNet::V6(v6) => (u128::from(v6.network()), 128, 124),
        };

        // A `/28` (V4) gives ~13 usable container IPs per isolation network per
        // node — enough for the isolated-container use case — while leaving
        // room for several distinct isolation networks inside one node slice.
        // Clamp to the slice prefix so a slice already more specific than the
        // target just yields itself (a single block).
        let slice_prefix = slice_ipnet.prefix_len();
        let sub_prefix = target_prefix.max(slice_prefix);

        // sub_prefix >= slice_prefix and sub_prefix <= addr_bits by
        // construction, so neither subtraction underflows; both shift amounts
        // stay below 128.
        let block_count: u128 = 1u128 << u32::from(sub_prefix - slice_prefix);
        let host_bits = u32::from(addr_bits - sub_prefix);

        // NOTE(review): `DefaultHasher::new()` is deterministic for a given
        // toolchain, but the standard library does not promise the algorithm
        // stays the same across Rust releases. If the name -> block mapping
        // must survive a toolchain upgrade, switch to a fixed, documented hash
        // (with a migration for already-created networks).
        let mut hasher = std::collections::hash_map::DefaultHasher::new();
        net.hash(&mut hasher);
        let idx = u128::from(hasher.finish()) % block_count;

        // The base has its low (addr_bits - slice_prefix) bits clear and the
        // shifted index fits entirely inside them, so OR-ing is the same as
        // adding.
        let block_addr = base | (idx << host_bits);

        let carved: Result<ipnet::IpNet, String> = match slice_ipnet {
            ipnet::IpNet::V4(_) => u32::try_from(block_addr)
                .map_err(|e| e.to_string())
                .and_then(|raw| {
                    ipnet::Ipv4Net::new(std::net::Ipv4Addr::from(raw), sub_prefix)
                        .map_err(|e| e.to_string())
                })
                .map(ipnet::IpNet::V4),
            ipnet::IpNet::V6(_) => {
                ipnet::Ipv6Net::new(std::net::Ipv6Addr::from(block_addr), sub_prefix)
                    .map(ipnet::IpNet::V6)
                    .map_err(|e| e.to_string())
            }
        };
        carved.map_err(|e| {
            OverlaydError::Other(format!(
                "failed to subnet slice {slice_ipnet} into /{sub_prefix} blocks: {e}"
            ))
        })
    }

    /// Make sure the HCN **Internal** network backing the isolation network
    /// `net` is present on this host and hand back its GUID.
    ///
    /// The network is looked up first by the GUID recorded under the
    /// [`owner_for_isolation_network`] marker owner, then by its derived name
    /// `<overlay>-iso-<net>`; only when both lookups miss is a new network
    /// created (and recorded in the marker).
    ///
    /// This is the Windows mechanism for per-network L3 isolation: every
    /// `ZLayer` "isolated" network gets its OWN internal vSwitch with NO
    /// physical-NIC binding. HCN Internal vSwitches are mutually isolated by
    /// default, so members sharing this vSwitch reach each other + egress + the
    /// node, while members of a different isolation network sit on a different
    /// vSwitch and cannot reach them. No ACLs, no per-member churn. The flow
    /// mirrors [`Self::ensure_service_network`].
    ///
    /// # Errors
    /// Fails when a blocking task cannot be joined, a GUID cannot be generated,
    /// or `zlayer_hns` rejects the network creation. Marker persistence is
    /// best-effort and only logs on failure.
    #[cfg(target_os = "windows")]
    async fn ensure_isolation_network(
        &mut self,
        net: &str,
        subnet: ipnet::IpNet,
    ) -> Result<windows::core::GUID, OverlaydError> {
        use windows::core::GUID;

        // `<base overlay name>-iso-<net>`: distinct from the base network and
        // from every per-service network.
        let base_name = overlay_network_name(&self.deployment_or_default());
        let net_name = format!("{base_name}-iso-{net}");
        let owner = crate::network_state::owner_for_isolation_network(net);
        let marker_path =
            zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();

        // Lookup 1: GUID recorded in the marker, provided it still opens.
        let marker_guid = crate::network_state::NetworkState::load(&marker_path)
            .get(&owner)
            .filter(|entry| entry.kind == "hcn-internal")
            .map(|entry| entry.id.as_str())
            .and_then(|raw| GUID::try_from(raw).ok());
        if let Some(recorded_id) = marker_guid {
            let probe = tokio::task::spawn_blocking(move || {
                zlayer_hns::network::Network::open(recorded_id).ok()
            });
            let reopened = probe
                .await
                .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?;
            if reopened.is_some() {
                tracing::info!(name = %net_name, network = %net, "reusing per-isolation-network HCN network from marker");
                return Ok(recorded_id);
            }
        }

        // Lookup 2: any host network whose queried name is ours. Networks that
        // fail to open are skipped.
        let wanted = net_name.clone();
        let by_name = tokio::task::spawn_blocking(move || -> Option<GUID> {
            zlayer_hns::network::list("{}")
                .ok()?
                .into_iter()
                .find(|candidate| match zlayer_hns::network::Network::open(*candidate) {
                    Ok(network) => {
                        matches!(network.query("{}"), Ok(props) if props.name == wanted)
                    }
                    Err(_) => false,
                })
        })
        .await
        .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?;
        if let Some(existing_id) = by_name {
            tracing::info!(name = %net_name, network = %net, "reusing existing per-isolation-network HCN network");
            return Ok(existing_id);
        }

        // Nothing to reuse: create. ALWAYS Internal — the isolation requirement
        // is exactly the Internal-vSwitch property (no physical-NIC binding,
        // mutually isolated from other Internal vSwitches).
        let net_id = GUID::new().map_err(|e| {
            OverlaydError::Other(format!("GUID::new for per-isolation-network network: {e}"))
        })?;
        let subnet_str = subnet.to_string();
        let create_name = net_name.clone();
        let create_subnet = subnet_str.clone();
        tokio::task::spawn_blocking(move || {
            zlayer_hns::network::Network::create_internal(net_id, &create_name, &create_subnet)
        })
        .await
        .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?
        .map_err(|e| {
            OverlaydError::Overlay(format!("HcnCreateNetwork internal ({net_name}): {e}"))
        })?;

        // Give HCN's Static IPAM ~1-2s to settle its address pool; otherwise the
        // first endpoint frequently fails with HCN_E_ADDR_INVALID_OR_RESERVED
        // (same wait as the per-service network).
        tokio::time::sleep(std::time::Duration::from_secs(2)).await;

        tracing::info!(
            network = %net,
            subnet = %subnet_str,
            "created per-isolation-network HCN Internal network"
        );

        // Best-effort marker write (owner = `iso:<net>`, kind = `hcn-internal`)
        // so later runs reuse the network by GUID and a full uninstall
        // (`purge_managed_networks`, which sweeps every `kind` starting with
        // `hcn`) removes it. An isolation network has no dedicated WireGuard
        // identity, hence the empty `wg_*` / `interface` fields.
        let record = crate::network_state::ManagedNetwork {
            owner,
            kind: "hcn-internal".to_string(),
            name: net_name.clone(),
            id: format_guid_bare(net_id),
            subnet: subnet_str.clone(),
            wg_port: None,
            wg_private_key: None,
            wg_public_key: None,
            interface: None,
        };
        let mut marker = crate::network_state::NetworkState::load(&marker_path);
        marker.upsert(record);
        if let Err(e) = marker.save(&marker_path) {
            tracing::warn!(network = %net, error = %e, path = %marker_path.display(), "failed to persist per-isolation-network marker (network still reusable by name)");
        }

        Ok(net_id)
    }

    /// Look up the dedicated per-service subnet of `service`, if this node knows
    /// of one.
    ///
    /// Two sources are consulted, in this order:
    /// 1. The live [`ServiceTransport`] entry in `service_transports` (the
    ///    normal case once `SetupServiceOverlay` has run in this process).
    /// 2. A persisted marker entry of kind `hcn-internal` stored under
    ///    [`owner_for_service`]`(service)` — this covers the window where the
    ///    HCN network exists from a prior run but the transport map is still
    ///    empty. A marker subnet that does not parse is treated as absent.
    ///
    /// `None` means neither source has an entry; Shared-mode services fall in
    /// this bucket and attach onto the base network.
    #[cfg(target_os = "windows")]
    fn dedicated_service_subnet(&self, service: &str) -> Option<ipnet::IpNet> {
        self.service_transports
            .get(service)
            .map(|live| live.subnet)
            .or_else(|| {
                let state_file = zlayer_paths::ZLayerDirs::new(self.data_dir.clone())
                    .agent_network_state();
                let state = crate::network_state::NetworkState::load(&state_file);
                let entry = state.get(&owner_for_service(service))?;
                if entry.kind == "hcn-internal" {
                    entry.subnet.parse::<ipnet::IpNet>().ok()
                } else {
                    None
                }
            })
    }

    /// Name of the daemon as used when deriving HCN network / owner names.
    ///
    /// Falls back to `"zlayer"` while `self.deployment` is still empty.
    #[cfg(target_os = "windows")]
    fn deployment_or_default(&self) -> String {
        match self.deployment.as_str() {
            "" => "zlayer".to_string(),
            configured => configured.to_string(),
        }
    }

    // -- peers ---------------------------------------------------------------

    /// Pick the live [`OverlayTransport`] that operations for `scope` act on.
    ///
    /// * `PeerScope::Global` selects the single cluster transport.
    /// * `PeerScope::Service { service }` selects the dedicated transport
    ///   registered for that service (Dedicated mode only).
    ///
    /// # Errors
    /// Fails when the global transport has not been set up (`Global`), or when
    /// `service_transports` has no entry for the named service (`Service`).
    fn transport_for_scope(&self, scope: &PeerScope) -> Result<&OverlayTransport, OverlaydError> {
        match scope {
            PeerScope::Global => match self.global_transport.as_ref() {
                Some(global) => Ok(global),
                None => Err(OverlaydError::Other("global overlay not set up".into())),
            },
            PeerScope::Service { service } => match self.service_transports.get(service) {
                Some(dedicated) => Ok(&dedicated.transport),
                None => Err(OverlaydError::Other(format!(
                    "no dedicated overlay for service {service}"
                ))),
            },
        }
    }

    /// Register `peer` on an already-resolved transport.
    ///
    /// # Errors
    /// A transport failure is reported as [`OverlaydError::Overlay`] with an
    /// `add_peer failed` prefix.
    async fn add_peer_on(
        transport: &OverlayTransport,
        peer: &PeerInfo,
    ) -> Result<(), OverlaydError> {
        let outcome = transport.add_peer(peer).await;
        outcome.map_err(|e| OverlaydError::Overlay(format!("add_peer failed: {e}")))
    }

    /// Drop the peer identified by its base64 public key from an
    /// already-resolved transport.
    ///
    /// # Errors
    /// A transport failure is reported as [`OverlaydError::Overlay`] with a
    /// `remove_peer failed` prefix.
    async fn remove_peer_on(
        transport: &OverlayTransport,
        pubkey: &str,
    ) -> Result<(), OverlaydError> {
        let outcome = transport.remove_peer(pubkey).await;
        outcome.map_err(|e| OverlaydError::Overlay(format!("remove_peer failed: {e}")))
    }

    /// Add `cidr` to the `AllowedIPs` of peer `pubkey` on an already-resolved
    /// transport.
    ///
    /// The CIDR text is validated (parsed as an [`ipnet::IpNet`]) before the
    /// transport is touched.
    ///
    /// # Errors
    /// [`OverlaydError::Other`] when `cidr` does not parse;
    /// [`OverlaydError::Overlay`] when the transport call fails.
    async fn add_allowed_ip_on(
        transport: &OverlayTransport,
        pubkey: &str,
        cidr: &str,
    ) -> Result<(), OverlaydError> {
        let parsed = match cidr.parse::<ipnet::IpNet>() {
            Ok(network) => network,
            Err(e) => return Err(OverlaydError::Other(format!("invalid CIDR {cidr}: {e}"))),
        };
        let outcome = transport.add_allowed_ip(pubkey, parsed).await;
        outcome.map_err(|e| OverlaydError::Overlay(format!("add_allowed_ip failed: {e}")))
    }

    /// Take `cidr` out of the `AllowedIPs` of peer `pubkey` on an
    /// already-resolved transport.
    ///
    /// The CIDR text is validated (parsed as an [`ipnet::IpNet`]) before the
    /// transport is touched.
    ///
    /// # Errors
    /// [`OverlaydError::Other`] when `cidr` does not parse;
    /// [`OverlaydError::Overlay`] when the transport call fails.
    async fn remove_allowed_ip_on(
        transport: &OverlayTransport,
        pubkey: &str,
        cidr: &str,
    ) -> Result<(), OverlaydError> {
        let parsed = match cidr.parse::<ipnet::IpNet>() {
            Ok(network) => network,
            Err(e) => return Err(OverlaydError::Other(format!("invalid CIDR {cidr}: {e}"))),
        };
        let outcome = transport.remove_allowed_ip(pubkey, parsed).await;
        outcome.map_err(|e| OverlaydError::Overlay(format!("remove_allowed_ip failed: {e}")))
    }

    // -- DNS -----------------------------------------------------------------

    /// Store an overlay DNS A/AAAA record mapping `name` to `ip`.
    ///
    /// Any record already stored under `name` is replaced; the previous value
    /// is discarded.
    fn register_dns(&mut self, name: String, ip: IpAddr) {
        let _previous = self.dns_records.insert(name, ip);
    }

    /// Forget the overlay DNS record stored under `name`.
    ///
    /// Removing a name that has no record is a no-op.
    fn unregister_dns(&mut self, name: &str) {
        let _removed = self.dns_records.remove(name);
    }

    // -- NAT -----------------------------------------------------------------

    /// Periodic NAT traversal maintenance: lazily start NAT (and the built-in
    /// relay server), re-probe STUN, refresh relays, and run the connect-half —
    /// hole-punching / relaying toward every peer whose direct endpoint has not
    /// produced a recent `WireGuard` handshake.
    ///
    /// No-op when NAT traversal is disabled in the resolved [`NatConfig`].
    ///
    /// # Errors
    /// Returns an error when the underlying STUN refresh fails.
    async fn nat_maintenance_tick(&mut self) -> Result<(), OverlaydError> {
        // Lazily start NAT traversal on the first tick if a config asks for it.
        if self.nat_traversal.is_none() {
            let config = self.nat_config.clone().unwrap_or_default();
            if config.enabled {
                // Stand up the built-in relay server here (once) when the
                // resolved config carries a `relay_server`. The auth credential
                // MUST be cluster-wide-shared (every node's relay *client*
                // derives the same BLAKE2b key via `derive_auth_key`), so it
                // comes from `cluster_relay_credential` — the cluster HS256
                // secret the main daemon stamped into
                // `NatConfigSpec.relay_server.auth_credential`, NOT the node's
                // per-node WireGuard key. When no credential was supplied the
                // relay derives a key from the empty string (only same-config
                // nodes can use it).
                if let Some(relay_cfg) = config.relay_server.clone() {
                    if self.relay_server.is_none() {
                        let credential = self.cluster_relay_credential.clone().unwrap_or_default();
                        let relay = RelayServer::new(&relay_cfg, &credential);
                        match relay.start().await {
                            Ok(bound) => {
                                tracing::info!(
                                    bound = %bound,
                                    external = %relay_cfg.external_addr,
                                    "Built-in relay server started"
                                );
                                self.relay_bound_addr = Some(bound);
                                self.relay_server = Some(relay);
                            }
                            Err(e) => {
                                tracing::warn!(error = %e, "Built-in relay server failed to start");
                            }
                        }
                    }
                }

                let mut nat = NatTraversal::new(config, self.overlay_port);
                match nat.gather_candidates().await {
                    Ok(candidates) => {
                        tracing::info!(count = candidates.len(), "Gathered NAT candidates");
                        self.nat_last_refresh.store(now_unix(), Ordering::SeqCst);
                        self.nat_traversal = Some(nat);
                    }
                    Err(e) => {
                        tracing::warn!(error = %e, "NAT candidate gathering failed");
                        return Ok(());
                    }
                }
                // First-tick connect: try to establish toward every already-known
                // peer (peers added before NAT came up).
                self.nat_connect_known_peers().await;
            } else {
                return Ok(());
            }
        }

        // Refresh STUN/relay state, then run the connect-half for peers that
        // still lack a recent handshake.
        if let Some(nat) = self.nat_traversal.as_mut() {
            match nat.refresh().await {
                Ok(changed) => {
                    if changed {
                        tracing::info!("NAT reflexive address changed during refresh");
                    }
                    self.nat_last_refresh.store(now_unix(), Ordering::SeqCst);
                }
                Err(e) => {
                    return Err(OverlaydError::Overlay(format!(
                        "NAT maintenance tick failed: {e}"
                    )));
                }
            }
        }
        self.nat_connect_known_peers().await;
        Ok(())
    }

    /// The NAT connect-half: for every peer with advertised candidates that has
    /// no recent `WireGuard` handshake, call [`NatTraversal::connect_to_peer`]
    /// (which itself updates the live device's peer endpoint) and record the
    /// resulting [`ConnectionType`].
    ///
    /// Best-effort: with no live global transport, no NAT orchestrator, or no
    /// known candidates this returns immediately; a peer whose traversal fails
    /// is left untouched (its persistent direct endpoint keeps retrying).
    ///
    /// Borrowing: the loop only needs shared borrows of three distinct fields
    /// (`global_transport`, `nat_traversal`, `peer_candidates`), so it iterates
    /// the candidate map in place. Outcomes are buffered in a local `Vec` and
    /// written into `peer_connection_type` once the loop — and with it every
    /// shared borrow — has ended.
    async fn nat_connect_known_peers(&mut self) {
        // No host transport (VM-only overlay) or no NAT orchestrator → nothing
        // to connect on this node.
        let (Some(transport), Some(nat)) =
            (self.global_transport.as_ref(), self.nat_traversal.as_ref())
        else {
            return;
        };
        if self.peer_candidates.is_empty() {
            return;
        }

        // Peers whose handshake is older than this cutoff (or never seen) are
        // candidates for a (re)connect attempt. WireGuard's default keepalive is
        // 25s; 3× that is a generous "the direct endpoint is clearly not
        // establishing" threshold that avoids churning healthy peers.
        let cutoff = now_unix().saturating_sub(75);

        let mut results: Vec<(String, ConnectionType)> = Vec::new();
        for (pubkey, candidates) in &self.peer_candidates {
            // Skip peers that already have a fresh handshake on the live device.
            match transport.check_peer_handshake(pubkey, cutoff).await {
                Ok(true) => continue,
                Ok(false) => {}
                Err(e) => {
                    tracing::debug!(peer = %pubkey, error = %e, "handshake check failed; attempting connect anyway");
                }
            }
            match nat.connect_to_peer(transport, pubkey, candidates).await {
                Ok(connection_type) => {
                    tracing::info!(
                        peer = %pubkey,
                        connection = %connection_type,
                        "NAT traversal established connection to peer"
                    );
                    results.push((pubkey.clone(), connection_type));
                }
                Err(e) => {
                    tracing::debug!(peer = %pubkey, error = %e, "NAT traversal could not connect to peer this tick");
                }
            }
        }

        // Shared borrows are done; record the outcomes (later entries for the
        // same key overwrite earlier ones, as with repeated `insert`).
        self.peer_connection_type.extend(results);
    }

    /// Assemble a [`NatStatusWire`] describing the current NAT state.
    ///
    /// The snapshot carries three things: this node's local candidates (empty
    /// when no NAT orchestrator is running), one entry per peer recorded in
    /// `peer_connection_type` — enriched with that peer's current remote
    /// endpoint when the live device's status dump lists one — and the
    /// timestamp of the last STUN refresh. A failed status dump simply leaves
    /// every `remote_endpoint` unset.
    async fn nat_status_snapshot(&self) -> NatStatusWire {
        let candidates = match self.nat_traversal.as_ref() {
            Some(nat) => nat
                .local_candidates()
                .iter()
                .map(candidate_to_wire)
                .collect(),
            None => Default::default(),
        };

        // hex-pubkey -> current remote endpoint, taken from the live device's
        // UAPI dump. The dump keys peers by hex while `peer_connection_type`
        // keys by base64, so the join further down converts base64 to hex.
        let mut endpoints: HashMap<String, String> = HashMap::new();
        if let Some(transport) = self.global_transport.as_ref() {
            if let Ok(dump) = transport.status().await {
                endpoints.extend(
                    parse_peer_status(&dump)
                        .into_iter()
                        .filter(|peer| !peer.endpoint.is_empty())
                        .map(|peer| (peer.public_key, peer.endpoint)),
                );
            }
        }

        let mut peers = Vec::with_capacity(self.peer_connection_type.len());
        for (pubkey, connection) in &self.peer_connection_type {
            let remote_endpoint = zlayer_overlay::nat::pubkey_b64_to_hex(pubkey)
                .and_then(|hex| endpoints.get(&hex).cloned());
            peers.push(NatPeerWire {
                node_id: pubkey.clone(),
                connection_type: connection.to_string(),
                remote_endpoint,
            });
        }

        let last_refresh = self.nat_last_refresh.load(Ordering::SeqCst);
        NatStatusWire {
            candidates,
            peers,
            last_refresh,
        }
    }

    // -- status --------------------------------------------------------------

    /// Produce the [`StatusSnapshot`] describing the overlay as it is right now.
    ///
    /// Peer information is best-effort: when there is no global transport, or
    /// its status call fails, the peer list is empty instead of the whole
    /// request failing. Each dedicated per-service device is queried the same
    /// way, and a failed query is reported as zero peers.
    async fn status_snapshot(&self) -> StatusSnapshot {
        let peers: Vec<PeerStatus> = match self.global_transport.as_ref() {
            Some(transport) => transport
                .status()
                .await
                .map(|dump| parse_peer_status(&dump))
                .unwrap_or_default(),
            None => Vec::new(),
        };

        // Counts saturate at `u32::MAX` rather than failing the conversion.
        let peer_count = u32::try_from(peers.len()).unwrap_or(u32::MAX);
        let service_count = u32::try_from(self.service_count()).unwrap_or(u32::MAX);

        // One entry per dedicated per-service overlay device, with its peers
        // counted from that device's own status dump.
        let mut dedicated_services: Vec<DedicatedServiceStatus> =
            Vec::with_capacity(self.service_transports.len());
        for (svc, st) in &self.service_transports {
            let device_peer_count = st.transport.status().await.map_or(0, |dump| {
                u32::try_from(parse_peer_status(&dump).len()).unwrap_or(u32::MAX)
            });
            dedicated_services.push(DedicatedServiceStatus {
                service: svc.clone(),
                interface: st.interface.clone(),
                public_key: st.public_key.clone(),
                listen_port: st.listen_port,
                overlay_ip: st.overlay_ip,
                subnet: st.subnet.to_string(),
                peer_count: device_peer_count,
            });
        }

        StatusSnapshot {
            interface: self.global_interface.clone(),
            node_ip: self.node_ip,
            public_key: self.transport_public_key.clone(),
            overlay_cidr: self.cluster_cidr.map(|c| c.to_string()),
            slice_cidr: self.slice_cidr.map(|c| c.to_string()),
            peer_count,
            service_count,
            peers,
            dedicated_services,
        }
    }

    /// Count the per-service overlays present on this node.
    ///
    /// Every entry in `service_interfaces` counts once; a service that only
    /// appears in `service_transports` is added on top, so a service present in
    /// both maps is never counted twice.
    fn service_count(&self) -> usize {
        let mut total = self.service_interfaces.len();
        for svc in self.service_transports.keys() {
            if !self.service_interfaces.contains_key(svc) {
                total += 1;
            }
        }
        total
    }

    // -- config helper -------------------------------------------------------

    /// Build the [`OverlayConfig`] for a WireGuard device.
    ///
    /// `ip`/`mask` become the device's `overlay_cidr`; `listen_port` is the
    /// port of `local_endpoint`. The address half of `local_endpoint` is chosen
    /// as follows:
    ///
    /// 1. Rootless (`ZLAYER_ROOTLESS` set and `rootless_forces_unspecified`
    ///    says so): always the family-matched UNSPECIFIED address.
    /// 2. `physical_egress_ip` present with the same family as `ip`: that IP.
    /// 3. `physical_egress_ip` present with the other family: warn, then
    ///    UNSPECIFIED.
    /// 4. No `physical_egress_ip`: UNSPECIFIED.
    ///
    /// The server's stored NAT config and UAPI socket directory, when set,
    /// override the corresponding `OverlayConfig` defaults.
    fn build_config(
        &self,
        private_key: String,
        public_key: String,
        ip: IpAddr,
        mask: u8,
        listen_port: u16,
        physical_egress_ip: Option<IpAddr>,
    ) -> OverlayConfig {
        // Family-matched wildcard (`0.0.0.0` / `::`): with it the kernel picks
        // a source address per outgoing packet.
        let unspecified: IpAddr = if ip.is_ipv4() {
            Ipv4Addr::UNSPECIFIED.into()
        } else {
            Ipv6Addr::UNSPECIFIED.into()
        };

        // Rootless: detect_physical_egress() resolves pasta's in-netns tap IP
        // (e.g. 192.168.68.x), which remote peers cannot use as a WG
        // source/advertised endpoint. The reachable endpoint comes from the
        // advertise_addr path + pasta forwarding instead.
        let rootless = rootless_forces_unspecified(std::env::var_os("ZLAYER_ROOTLESS").is_some());

        // Pinning `local_endpoint` to the physical NIC's IP makes the data
        // socket source from, and advertise, the real NIC rather than whatever
        // the default route (possibly a VPN mesh) would select.
        //
        // boringtun limitation: boringtun 0.7's `DeviceConfig` cannot inject or
        // pin the WireGuard DATA socket (its `uapi_fd` is the UAPI CONTROL
        // socket only), so `SO_BINDTODEVICE` on the data socket is not
        // available; source-address selection via `local_endpoint` is the
        // realistic scope of control.
        let local_addr = match physical_egress_ip {
            _ if rootless => unspecified,
            Some(egress) if egress.is_ipv4() == ip.is_ipv4() => egress,
            Some(egress) => {
                // A v4 egress cannot source a v6 overlay (and vice versa).
                tracing::warn!(
                    physical_egress_ip = %egress,
                    overlay_ip = %ip,
                    "physical egress IP family does not match overlay IP family; \
                             falling back to UNSPECIFIED for WireGuard local_endpoint"
                );
                unspecified
            }
            None => unspecified,
        };

        let overlay_cidr = format!("{ip}/{mask}");
        let local_endpoint = SocketAddr::new(local_addr, listen_port);
        let mut config = OverlayConfig {
            overlay_cidr,
            local_endpoint,
            public_key,
            private_key,
            ..OverlayConfig::default()
        };
        if let Some(nat) = self.nat_config.as_ref() {
            config.nat = nat.clone();
        }
        if let Some(dir) = self.uapi_sock_dir.as_ref() {
            config.uapi_sock_dir = dir.clone();
        }
        config
    }
}

/// Describe an `Auto`-mode service overlay.
///
/// `name` is the per-service bridge/placeholder name. `Auto` carries the
/// service subnet on the single cluster-wide `WireGuard` device, so all four
/// dedicated-device identity fields are reported as `None`.
fn cluster_wg_overlay_info(name: String) -> ServiceOverlayInfo {
    let mode = OverlayMode::Auto;
    ServiceOverlayInfo {
        mode,
        name,
        subnet: None,
        overlay_ip: None,
        wg_port: None,
        wg_public_key: None,
    }
}

/// Describe a `Shared`-mode service overlay.
///
/// `name` is the node-wide shared bridge/placeholder name. Shared mode uses
/// the single cluster device and the node-wide bridge (ports are exposed by
/// the userspace free-port L4 proxy), so all four dedicated-device identity
/// fields are reported as `None`.
fn shared_overlay_info(name: String) -> ServiceOverlayInfo {
    let mode = OverlayMode::Shared;
    ServiceOverlayInfo {
        mode,
        name,
        subnet: None,
        overlay_ip: None,
        wg_port: None,
        wg_public_key: None,
    }
}

/// Describe a `Dedicated`-mode service overlay from its device identity.
///
/// `name` is the container-attach handle (bridge name on Linux, the dedicated
/// interface elsewhere). The device's public key, listen port, overlay IP and
/// subnet are all reported; the subnet is rendered in its `Display` form.
fn dedicated_overlay_info(
    name: String,
    public_key: &str,
    listen_port: u16,
    overlay_ip: IpAddr,
    subnet: ipnet::IpNet,
) -> ServiceOverlayInfo {
    let wg_public_key = Some(public_key.to_owned());
    let subnet = Some(subnet.to_string());
    ServiceOverlayInfo {
        mode: OverlayMode::Dedicated,
        name,
        wg_public_key,
        wg_port: Some(listen_port),
        overlay_ip: Some(overlay_ip),
        subnet,
    }
}

/// Translate a wire-level [`PeerSpec`] into the overlay crate's [`PeerInfo`].
///
/// The public key and allowed-IP list are passed through unchanged; the
/// keepalive is converted from whole seconds to a [`std::time::Duration`].
///
/// # Errors
/// Returns [`OverlaydError::Other`] when `endpoint` is not a valid
/// [`SocketAddr`] literal (`ip:port`; hostnames are not resolved).
pub fn peer_spec_to_info(spec: &PeerSpec) -> Result<PeerInfo, OverlaydError> {
    let endpoint = match spec.endpoint.parse::<SocketAddr>() {
        Ok(addr) => addr,
        Err(e) => {
            return Err(OverlaydError::Other(format!(
                "invalid peer endpoint {}: {e}",
                spec.endpoint
            )));
        }
    };
    let keepalive = std::time::Duration::from_secs(spec.persistent_keepalive_secs);

    Ok(PeerInfo::new(
        spec.public_key.clone(),
        endpoint,
        &spec.allowed_ips,
        keepalive,
    ))
}

/// Turn a `wg`-style UAPI/`status` dump into a list of [`PeerStatus`] entries.
///
/// The dump is read line by line as `key=value` pairs (lines without `=` are
/// ignored, and keys/values are trimmed):
///
/// - `public_key` / `peer` opens a new peer block;
/// - `endpoint` sets the open peer's endpoint;
/// - each `allowed_ip` / `allowed_ips` value is appended to the open peer's
///   comma-separated allowed-IP list;
/// - `latest_handshake` / `last_handshake_time_sec` sets the handshake time,
///   falling back to `0` when the value is not a number.
///
/// Peer-scoped keys seen before the first peer block are dropped, as are all
/// unrecognised keys.
fn parse_peer_status(dump: &str) -> Vec<PeerStatus> {
    let mut peers: Vec<PeerStatus> = Vec::new();
    // How many allowed-IP values the peer at the tail has received so far;
    // decides whether a separating comma is needed before the next one.
    let mut allowed_seen = 0usize;

    for raw in dump.lines() {
        let Some((key, value)) = raw.trim().split_once('=') else {
            continue;
        };
        let (key, value) = (key.trim(), value.trim());

        if matches!(key, "public_key" | "peer") {
            peers.push(PeerStatus {
                public_key: value.to_string(),
                endpoint: String::new(),
                allowed_ips: String::new(),
                last_handshake_unix_secs: 0,
            });
            allowed_seen = 0;
            continue;
        }

        // Everything else only makes sense inside a peer block.
        let Some(peer) = peers.last_mut() else {
            continue;
        };
        match key {
            "endpoint" => peer.endpoint = value.to_string(),
            "allowed_ip" | "allowed_ips" => {
                if allowed_seen > 0 {
                    peer.allowed_ips.push(',');
                }
                peer.allowed_ips.push_str(value);
                allowed_seen += 1;
            }
            "latest_handshake" | "last_handshake_time_sec" => {
                peer.last_handshake_unix_secs = value.parse().unwrap_or(0);
            }
            _ => {}
        }
    }
    peers
}

/// Expand a wire [`NatConfigSpec`] into the live [`NatConfig`] overlayd drives.
///
/// A sparsely-populated spec is completed from [`NatConfig::default`]: an empty
/// STUN server list and any zero-valued timeout/interval/pair-count take the
/// default's value. TURN servers are copied one-to-one with no region set.
///
/// The relay server's `auth_credential` is intentionally not carried over:
/// `RelayServerConfig` has no credential field, so the server keeps it
/// separately (`cluster_relay_credential`). Only the listen port, external
/// address and max-sessions are produced here, with `max_sessions == 0`
/// replaced by [`default_max_relay_sessions`].
fn nat_config_spec_to_config(spec: NatConfigSpec) -> NatConfig {
    /// `value`, unless it is the type's zero, in which case `fallback`.
    fn nonzero_or<T: Default + PartialEq>(value: T, fallback: T) -> T {
        if value == T::default() {
            fallback
        } else {
            value
        }
    }

    let fallback = NatConfig::default();

    let stun_servers = if spec.stun_servers.is_empty() {
        fallback.stun_servers
    } else {
        spec.stun_servers
            .into_iter()
            .map(|address| StunServerConfig {
                address,
                label: None,
            })
            .collect()
    };

    let turn_servers = spec
        .turn_servers
        .into_iter()
        .map(|turn| TurnServerConfig {
            address: turn.addr,
            username: turn.username,
            credential: turn.credential,
            region: None,
        })
        .collect();

    let relay_server = spec.relay_server.map(|relay| RelayServerConfig {
        listen_port: relay.listen_port,
        external_addr: relay.external_addr,
        max_sessions: nonzero_or(relay.max_sessions, default_max_relay_sessions()),
    });

    NatConfig {
        enabled: spec.enabled,
        stun_servers,
        turn_servers,
        hole_punch_timeout_secs: nonzero_or(
            spec.hole_punch_timeout_secs,
            fallback.hole_punch_timeout_secs,
        ),
        stun_refresh_interval_secs: nonzero_or(
            spec.stun_refresh_interval_secs,
            fallback.stun_refresh_interval_secs,
        ),
        max_candidate_pairs: nonzero_or(spec.max_candidate_pairs, fallback.max_candidate_pairs),
        relay_server,
    }
}

/// Relay `max_sessions` applied when a spec leaves the field at `0`.
///
/// Kept in step with the private `default_max_relay_sessions` in
/// `zlayer_overlay::nat::config` (100).
const fn default_max_relay_sessions() -> usize {
    const DEFAULT_MAX_RELAY_SESSIONS: usize = 100;
    DEFAULT_MAX_RELAY_SESSIONS
}

/// Decode a wire [`NatCandidateWire`] into a live [`Candidate`].
///
/// Yields `None` when the address is not a valid [`SocketAddr`] literal or the
/// type string is not one of `"host"`, `"server-reflexive"` or `"relay"`. The
/// priority is copied verbatim from the wire (the advertiser already computed
/// it), so the receiver honors the peer's own preference ordering.
fn wire_to_candidate(w: &NatCandidateWire) -> Option<Candidate> {
    let Ok(address) = w.address.parse::<SocketAddr>() else {
        return None;
    };
    let candidate_type = match w.candidate_type.as_str() {
        "host" => Some(CandidateType::Host),
        "server-reflexive" => Some(CandidateType::ServerReflexive),
        "relay" => Some(CandidateType::Relay),
        _ => None,
    }?;

    let mut candidate = Candidate::new(candidate_type, address);
    candidate.priority = w.priority;
    Some(candidate)
}

/// Encode a live [`Candidate`] as the [`NatCandidateWire`] sent in a
/// `NatStatus` response.
///
/// The candidate type becomes one of `"host"`, `"server-reflexive"` or
/// `"relay"`; the address is rendered with its `Display` form and the priority
/// is copied as-is.
fn candidate_to_wire(c: &Candidate) -> NatCandidateWire {
    let type_label: &str = match c.candidate_type {
        CandidateType::Host => "host",
        CandidateType::ServerReflexive => "server-reflexive",
        CandidateType::Relay => "relay",
    };
    let address = c.address.to_string();
    NatCandidateWire {
        candidate_type: type_label.to_owned(),
        address,
        priority: c.priority,
    }
}

/// Seconds elapsed since the Unix epoch, truncated to whole seconds.
///
/// Reports `0` if the system clock is set before the epoch.
fn now_unix() -> u64 {
    use std::time::{SystemTime, UNIX_EPOCH};

    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs(),
        Err(_) => 0,
    }
}

/// Offset (relative to the slice's network address) reserved for the node's
/// own overlay IP.
///
/// Offset 1 is `base + 1`, the first host address after the network address,
/// so the node IP is deterministic for a given slice regardless of allocation
/// order. [`IpAllocator`] starts container allocation at the offset after this
/// one and refuses to take this address back through `release`.
const NODE_RESERVED_OFFSET: u64 = 1;

/// Simple IP address allocator supporting both IPv4 and IPv6, bounded to a
/// specific CIDR (typically a per-node `/28` slice). Allocations past the last
/// usable host return an exhaustion error.
///
/// Offset [`NODE_RESERVED_OFFSET`] (the first usable host) is reserved for the
/// node's own overlay IP and is never handed out by [`IpAllocator::allocate`],
/// so the node IP stays deterministic across restarts and immune to container
/// allocation order. Use [`IpAllocator::node_ip`] to read it.
///
/// All methods take `&self`: the counter is an atomic and the free list sits
/// behind a mutex, so no `&mut` access is needed to allocate or release.
struct IpAllocator {
    /// CIDR the allocator is bounded to. Every freshly computed address is
    /// checked against it before being returned.
    cidr: IpNetwork,
    /// Base (network) address of the CIDR; offsets are added to this.
    base: IpAddr,
    /// Monotonic counter for the next allocation offset relative to `base`.
    /// Starts at [`NODE_RESERVED_OFFSET`] + 1 so the node's reserved IP is
    /// never returned to a container.
    next_offset: AtomicU64,
    /// IPs returned by `release(...)`. `allocate()` drains this first before
    /// incrementing `next_offset`. Entries are popped from the end, so the
    /// most recently released address is reused first.
    released: parking_lot::Mutex<Vec<IpAddr>>,
}

impl IpAllocator {
    /// Create an allocator bounded to `cidr`.
    ///
    /// The first container allocation is `base + 2`: offset 0 is the network
    /// address and offset [`NODE_RESERVED_OFFSET`] belongs to the node.
    fn new(cidr: IpNetwork) -> Self {
        Self {
            base: cidr.network(),
            cidr,
            // Reserve offset 1 for the node's own overlay IP; container
            // allocation starts at offset 2.
            next_offset: AtomicU64::new(NODE_RESERVED_OFFSET + 1),
            released: parking_lot::Mutex::new(Vec::new()),
        }
    }

    /// The node's own overlay IP for this slice: the first usable host
    /// (`base + 1`), reserved so no container ever receives it. Deterministic
    /// for a given slice CIDR, independent of allocation order or restarts.
    fn node_ip(&self) -> IpAddr {
        self.compute_addr(NODE_RESERVED_OFFSET)
    }

    /// Address at `offset` from the slice's base, using wrapping arithmetic.
    ///
    /// The result is not range-checked here; callers compare it against the
    /// CIDR. For IPv4 the offset is deliberately truncated to 32 bits.
    #[allow(clippy::cast_possible_truncation)]
    fn compute_addr(&self, offset: u64) -> IpAddr {
        match self.base {
            IpAddr::V4(base_v4) => {
                let base_u32 = u32::from_be_bytes(base_v4.octets());
                let addr = base_u32.wrapping_add(offset as u32);
                IpAddr::V4(Ipv4Addr::from(addr.to_be_bytes()))
            }
            IpAddr::V6(base_v6) => {
                let base_u128 = u128::from(base_v6);
                let addr = base_u128.wrapping_add(u128::from(offset));
                IpAddr::V6(Ipv6Addr::from(addr))
            }
        }
    }

    /// Distance of `ip` from the slice's base address.
    ///
    /// `None` when `ip` is of the other address family or lies below the base.
    fn offset_of(&self, ip: IpAddr) -> Option<u128> {
        match (self.base, ip) {
            (IpAddr::V4(base), IpAddr::V4(addr)) => u32::from(addr)
                .checked_sub(u32::from(base))
                .map(u128::from),
            (IpAddr::V6(base), IpAddr::V6(addr)) => {
                u128::from(addr).checked_sub(u128::from(base))
            }
            _ => None,
        }
    }

    /// Whether `addr` is the IPv4 broadcast address of the slice. Always
    /// `false` for IPv6 slices.
    fn is_v4_broadcast(&self, addr: IpAddr) -> bool {
        matches!(
            (&self.cidr, &addr),
            (IpNetwork::V4(v4), IpAddr::V4(a)) if *a == v4.broadcast()
        )
    }

    /// Whether `ip` is an address `allocate()` could have handed out: inside
    /// the slice, not the network/node/broadcast address, and below the
    /// monotonic counter's current position.
    fn is_releasable(&self, ip: IpAddr) -> bool {
        if !self.cidr.contains(ip) || self.is_v4_broadcast(ip) {
            return false;
        }
        let Some(offset) = self.offset_of(ip) else {
            return false;
        };
        let next = u128::from(self.next_offset.load(Ordering::SeqCst));
        // Offsets 0 (network) and NODE_RESERVED_OFFSET (node) are never
        // container addresses. Offsets at or past `next` have not been issued
        // yet; pooling one would let it be handed out twice, once from the
        // pool and once when the counter reaches it.
        offset > u128::from(NODE_RESERVED_OFFSET) && offset < next
    }

    /// Allocate the next IP in the slice, reusing released IPs first.
    ///
    /// # Errors
    /// Returns [`OverlaydError::Overlay`] when the CIDR is exhausted, i.e. the
    /// next fresh address falls outside the slice or is its IPv4 broadcast
    /// address.
    fn allocate(&self) -> Result<IpAddr, OverlaydError> {
        if let Some(ip) = self.released.lock().pop() {
            return Ok(ip);
        }
        let offset = self.next_offset.fetch_add(1, Ordering::SeqCst);
        let addr = self.compute_addr(offset);

        if !self.cidr.contains(addr) || self.is_v4_broadcast(addr) {
            return Err(OverlaydError::Overlay(format!(
                "IP allocator exhausted: next address {addr} is outside slice {}",
                self.cidr
            )));
        }
        Ok(addr)
    }

    /// Return an IP to the free pool. Idempotent.
    ///
    /// Only addresses this allocator could have issued are accepted. The
    /// node's reserved IP, the network and broadcast addresses, anything
    /// outside the slice, and addresses the counter has not reached yet are
    /// silently ignored, so a later `allocate()` can never return an unusable
    /// address or the same address twice.
    fn release(&self, ip: IpAddr) {
        if !self.is_releasable(ip) {
            return;
        }
        let mut released = self.released.lock();
        if !released.contains(&ip) {
            released.push(ip);
        }
    }
}

// -- Windows HCN helpers (ported from the agent's hcs runtime) --------------

/// Owner tag stamped onto every HCN endpoint this server creates.
///
/// The tag is the daemon name used verbatim, so two daemons running
/// side-by-side never sweep each other's endpoints. For the legacy
/// single-instance daemon name `"zlayer"` this yields the legacy tag
/// `"zlayer"`, so no special case is needed.
#[cfg(target_os = "windows")]
fn owner_tag(daemon_name: &str) -> String {
    daemon_name.to_string()
}

/// Name of the per-daemon HCN overlay network on the host:
/// `"<daemon_name>-overlay"`.
///
/// For the legacy single-instance daemon name `"zlayer"` this yields the
/// legacy network name `"zlayer-overlay"`, so no special case is needed.
#[cfg(target_os = "windows")]
fn overlay_network_name(daemon_name: &str) -> String {
    format!("{daemon_name}-overlay")
}

/// Describe the single shared HCN **NAT** network as a
/// [`zlayer_hns::schema::HostComputeNetwork`] document.
///
/// A NAT network gives every attached container outbound connectivity and
/// host-port forwarding (driven by the userspace free-port L4 proxy) without a
/// per-service vSwitch; it is the Windows counterpart of the Linux node-wide
/// shared bridge. The Static IPAM carries one default route via the subnet
/// gateway (`network + 1`) so HCN reserves only the gateway, the same
/// `HCN_E_ADDR_INVALID_OR_RESERVED` avoidance the Internal/Transparent paths
/// use.
///
/// Returns `None` when `subnet` does not parse, is not IPv4, or is too small
/// (`/31`, `/32`) to have a usable gateway host.
#[cfg(target_os = "windows")]
fn shared_nat_settings(name: &str, subnet: &str) -> Option<zlayer_hns::schema::HostComputeNetwork> {
    use zlayer_hns::schema::{HostComputeNetwork, Ipam, NetworkType, Route, SchemaVersion, Subnet};

    let v4 = match subnet.parse::<ipnet::IpNet>().ok()? {
        ipnet::IpNet::V4(v4) => v4,
        // HCN's NAT IPAM is IPv4 in the current schema.
        ipnet::IpNet::V6(_) => return None,
    };
    if v4.prefix_len() >= 31 {
        return None;
    }

    // Gateway is the first address after the network address.
    let gateway_bits = u32::from(v4.network()).checked_add(1)?;
    let default_route = Route {
        next_hop: std::net::Ipv4Addr::from(gateway_bits).to_string(),
        destination_prefix: "0.0.0.0/0".to_string(),
        metric: None,
    };
    let static_ipam = Ipam {
        ty: "Static".to_string(),
        subnets: vec![Subnet {
            ip_address_prefix: subnet.to_string(),
            routes: vec![default_route],
            policies: Vec::new(),
        }],
    };

    Some(HostComputeNetwork {
        id: None,
        name: name.to_string(),
        ty: NetworkType::Nat,
        policies: Vec::new(),
        mac_pool: None,
        dns: None,
        ipams: vec![static_ipam],
        flags: 0,
        schema_version: SchemaVersion::default(),
    })
}

/// Render a GUID in the bare, lowercase, un-braced form HCN/HCS expect for a
/// namespace id in a compute-system document's
/// `Container.Networking.Namespace` field (e.g. `aabbccdd-eeff-...`).
///
/// Starts from the GUID's `Debug` rendering, strips any surrounding braces and
/// lowercases the ASCII letters.
#[cfg(target_os = "windows")]
fn format_guid_bare(id: windows::core::GUID) -> String {
    let debug_form = format!("{id:?}");
    let unbraced = debug_form.trim_matches(['{', '}']);
    unbraced.to_ascii_lowercase()
}

/// Delete every host-level HCN network this server created for `daemon_name` and
/// clear the persistent marker. Called on a full uninstall — never on a routine
/// stop/restart. Best-effort throughout. Synchronous (HCN calls are blocking).
///
/// Works in three steps:
/// 1. delete each HCN network recorded in the marker file, by GUID;
/// 2. sweep all host networks and delete any whose name matches this daemon's
///    overlay network name, in case its marker entry was never written;
/// 3. remove the marker file itself.
///
/// No step returns an error to the caller: failures are logged as warnings and
/// the purge carries on with the next item.
#[cfg(target_os = "windows")]
pub fn purge_managed_networks(data_dir: &Path, daemon_name: &str) {
    use windows::core::GUID;

    // The marker file lives under the daemon's data dir and lists the networks
    // this server recorded as created.
    let marker_path = zlayer_paths::ZLayerDirs::new(data_dir.to_path_buf()).agent_network_state();
    let state = crate::network_state::NetworkState::load(&marker_path);

    // Pass 1: delete recorded HCN networks by GUID.
    for entry in &state.networks {
        // Entries whose kind is not HCN-prefixed are not HCN networks and are
        // left alone here.
        if !entry.kind.starts_with("hcn") {
            continue;
        }
        match GUID::try_from(entry.id.as_str()) {
            Ok(guid) => match zlayer_hns::network::Network::delete(guid) {
                Ok(()) => {
                    tracing::info!(name = %entry.name, id = %entry.id, "deleted managed HCN network");
                }
                Err(e) => {
                    tracing::warn!(name = %entry.name, id = %entry.id, error = %e, "failed to delete managed HCN network");
                }
            },
            Err(e) => {
                // A corrupt id cannot be deleted by GUID; pass 2 may still
                // catch the overlay network by name.
                tracing::warn!(id = %entry.id, error = %e, "managed network marker has unparseable GUID");
            }
        }
    }

    // Pass 2: name-sweep fallback for an overlay network whose marker entry was
    // lost (crash between create and marker write).
    let overlay_name = overlay_network_name(daemon_name);
    if let Ok(guids) = zlayer_hns::network::list("{}") {
        for guid in guids {
            // Networks that cannot be opened or queried are skipped.
            let Ok(network) = zlayer_hns::network::Network::open(guid) else {
                continue;
            };
            let is_ours = matches!(network.query("{}"), Ok(props) if props.name == overlay_name);
            // Release the open handle before attempting the delete.
            drop(network);
            if is_ours {
                match zlayer_hns::network::Network::delete(guid) {
                    Ok(()) => {
                        tracing::info!(name = %overlay_name, "deleted overlay HCN network (name sweep)");
                    }
                    Err(e) => {
                        tracing::warn!(name = %overlay_name, error = %e, "failed to delete overlay network (name sweep)");
                    }
                }
            }
        }
    }

    // Finally drop the marker so a later install starts from a clean slate.
    if marker_path.exists() {
        if let Err(e) = std::fs::remove_file(&marker_path) {
            tracing::warn!(error = %e, path = %marker_path.display(), "failed to remove agent network marker");
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[cfg(target_os = "linux")]
    #[test]
    fn orphan_bridge_selection() {
        use std::collections::HashSet;

        // Two live per-service bridges the daemon says SHOULD exist.
        let live: HashSet<&str> = ["zl-prod-0-web-b", "zl-prod-0-api-b"].into_iter().collect();
        // The active global device and node-wide shared bridge are protected,
        // plus a live in-memory dedicated device.
        let protected: HashSet<String> = ["zl-prod-0-g", "zl-prod-0-shared-sh", "zl-prod-0-db-d"]
            .into_iter()
            .map(String::from)
            .collect();

        // The full set of host links the kernel would report.
        let host_links = [
            // Live -> keep.
            "zl-prod-0-web-b",
            "zl-prod-0-api-b",
            // Protected global / shared / live dedicated device -> keep.
            "zl-prod-0-g",
            "zl-prod-0-shared-sh",
            "zl-prod-0-db-d",
            // Orphan bridges (the user's observed leaks) -> reclaim.
            "zl-1ca4568944-b",
            "zl-81c6bc17c7-b",
            // Orphan dedicated device -> reclaim.
            "zl-prod-0-gone-d",
            // Container veths owned by the PID-keyed sweep, never here -> skip.
            "veth-4242-s",
            "vc-4242-g",
            // Unrelated host links -> skip.
            "eth0",
            "lo",
            "docker0",
            "zl-not-a-bridge",
        ];

        let orphans: Vec<&str> = host_links
            .into_iter()
            .filter(|n| is_orphan_service_bridge(n, &live, &protected))
            .collect();

        assert_eq!(
            orphans,
            vec!["zl-1ca4568944-b", "zl-81c6bc17c7-b", "zl-prod-0-gone-d"],
            "only orphaned -b/-d service bridges/devices are selected; \
             live, protected (-g/-sh/live -d), veth, and unrelated links are excluded"
        );
    }

    #[test]
    fn peer_spec_to_info_parses_endpoint_and_keepalive() {
        let spec = PeerSpec {
            public_key: "base64key".to_string(),
            endpoint: "1.2.3.4:51820".to_string(),
            allowed_ips: "10.200.0.5/32,10.200.1.0/24".to_string(),
            persistent_keepalive_secs: 25,
            candidates: Vec::new(),
        };
        let info = peer_spec_to_info(&spec).expect("valid spec");
        assert_eq!(info.public_key, "base64key");
        assert_eq!(info.endpoint, "1.2.3.4:51820".parse().unwrap());
        assert_eq!(info.allowed_ips, "10.200.0.5/32,10.200.1.0/24");
        assert_eq!(
            info.persistent_keepalive_interval,
            std::time::Duration::from_secs(25)
        );
    }

    #[test]
    fn peer_spec_to_info_rejects_bad_endpoint() {
        let spec = PeerSpec {
            public_key: "k".to_string(),
            endpoint: "not-a-socket-addr".to_string(),
            allowed_ips: String::new(),
            persistent_keepalive_secs: 0,
            candidates: Vec::new(),
        };
        assert!(peer_spec_to_info(&spec).is_err());
    }

    #[test]
    fn interface_name_never_exceeds_limit() {
        let cases: Vec<(&[&str], &str)> = vec![
            (&["a"], "g"),
            (&["zlayer-manager"], "g"),
            (&["my-very-long-deployment-name-that-goes-on-and-on"], "g"),
            (&["zlayer", "manager"], "s"),
            (
                &["abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxyz"],
                "s",
            ),
            (&["x"], ""),
        ];
        for (parts, suffix) in &cases {
            let name = make_interface_name(parts, suffix);
            assert!(name.len() <= MAX_IFNAME_LEN, "Name '{name}' too long");
            assert!(name.starts_with("zl-"));
        }
    }

    #[test]
    fn node_ip_is_first_usable_and_reserved() {
        let cidr: IpNetwork = "10.200.0.0/26".parse().unwrap();
        let alloc = IpAllocator::new(cidr);

        // The node IP is the deterministic first-usable host of the slice.
        let expected_node_ip: IpAddr = "10.200.0.1".parse().unwrap();
        assert_eq!(alloc.node_ip(), expected_node_ip);

        // Several container allocations must NEVER hand out the node IP, and
        // the node IP stays put regardless of allocation order.
        let mut handed_out = Vec::new();
        for _ in 0..10 {
            let ip = alloc.allocate().expect("slice not exhausted");
            assert_ne!(
                ip, expected_node_ip,
                "allocate() returned the reserved node IP"
            );
            handed_out.push(ip);
        }
        // Reservation holds after the allocations.
        assert_eq!(alloc.node_ip(), expected_node_ip);

        // First container allocation is offset 2 (base + 2), proving offset 1
        // (the node) was reserved and skipped.
        assert_eq!(handed_out[0], "10.200.0.2".parse::<IpAddr>().unwrap());

        // Releasing the node IP must not pollute the free pool with it.
        alloc.release(expected_node_ip);
        let next = alloc.allocate().expect("slice not exhausted");
        assert_ne!(
            next, expected_node_ip,
            "node IP leaked back into the pool via release()"
        );
    }

    #[test]
    fn node_ip_ipv6_is_first_usable() {
        let cidr: IpNetwork = "fd00:200::/64".parse().unwrap();
        let alloc = IpAllocator::new(cidr);
        let expected: IpAddr = "fd00:200::1".parse().unwrap();
        assert_eq!(alloc.node_ip(), expected);
        for _ in 0..5 {
            assert_ne!(alloc.allocate().unwrap(), expected);
        }
        assert_eq!(alloc.node_ip(), expected);
    }

    #[test]
    fn interface_name_is_deterministic() {
        assert_eq!(
            make_interface_name(&["zlayer-manager"], "g"),
            make_interface_name(&["zlayer-manager"], "g")
        );
    }

    #[test]
    fn parse_peer_status_splits_blocks() {
        let dump = "\
public_key=AAA
endpoint=1.2.3.4:51820
allowed_ip=10.200.0.2/32
allowed_ip=10.200.1.0/24
latest_handshake=1700000000
public_key=BBB
endpoint=5.6.7.8:51820
allowed_ip=10.200.0.3/32
latest_handshake=0
";
        let peers = parse_peer_status(dump);
        assert_eq!(peers.len(), 2);
        assert_eq!(peers[0].public_key, "AAA");
        assert_eq!(peers[0].endpoint, "1.2.3.4:51820");
        assert_eq!(peers[0].allowed_ips, "10.200.0.2/32,10.200.1.0/24");
        assert_eq!(peers[0].last_handshake_unix_secs, 1_700_000_000);
        assert_eq!(peers[1].public_key, "BBB");
        assert_eq!(peers[1].last_handshake_unix_secs, 0);
    }

    #[tokio::test]
    async fn status_snapshot_before_setup_is_empty() {
        let server = OverlaydServer::new(std::path::PathBuf::from("/tmp/zlayer-overlayd-test"));
        let snap = server.status_snapshot().await;
        assert!(snap.interface.is_none());
        assert!(snap.node_ip.is_none());
        assert!(snap.public_key.is_none());
        assert_eq!(snap.peer_count, 0);
        assert_eq!(snap.service_count, 0);
        assert!(snap.peers.is_empty());
    }

    #[tokio::test]
    async fn allocate_and_release_ip_round_trip() {
        let mut server = OverlaydServer::new(std::path::PathBuf::from("/tmp/zlayer-overlayd-test"));
        let a = server.allocate_ip("svc", false).expect("alloc a");
        let b = server.allocate_ip("svc", false).expect("alloc b");
        assert_ne!(a, b);
        server.release_ip(a);
        // Released IP is handed back before the monotonic counter advances.
        let c = server.allocate_ip("svc", false).expect("alloc c");
        assert_eq!(c, a);
    }

    /// Build a throwaway server bound to a unique temp data dir so the marker
    /// file (rehydrated in `new`) never collides between tests.
    ///
    /// The directory name combines the process id, the current Unix time and a
    /// process-wide sequence number. The pid and a seconds-resolution timestamp
    /// alone are identical for every test that starts within the same second
    /// of one test-binary run, so the sequence number is what keeps tests in
    /// the same process apart.
    fn test_server() -> OverlaydServer {
        static NEXT_DIR_SEQ: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);

        // Only uniqueness matters here, not ordering relative to other memory.
        let seq = NEXT_DIR_SEQ.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
        let dir = std::env::temp_dir().join(format!(
            "zlayer-overlayd-scope-{}-{}-{}",
            std::process::id(),
            now_unix(),
            seq
        ));
        OverlaydServer::new(dir)
    }

    /// Covers both halves of `nat_config_spec_to_config`: an empty wire spec
    /// must yield the `NatConfig::default` values for the checked fields, and a
    /// populated spec must have its values carried across unchanged.
    #[test]
    fn nat_config_spec_to_config_fills_defaults_and_copies() {
        // Sparse case: a default spec is compared against the library defaults
        // and must not configure a relay server.
        let defaults = NatConfig::default();
        let from_empty = nat_config_spec_to_config(NatConfigSpec::default());
        assert_eq!(from_empty.stun_servers.len(), defaults.stun_servers.len());
        assert_eq!(
            from_empty.hole_punch_timeout_secs,
            defaults.hole_punch_timeout_secs
        );
        assert_eq!(from_empty.max_candidate_pairs, defaults.max_candidate_pairs);
        assert!(from_empty.relay_server.is_none());

        // Populated case: every field is set explicitly. The relay spec carries
        // an auth credential; this test only checks the relay's port and session
        // limit on the produced config, not where the credential ends up.
        let turn = zlayer_types::nat_wire::TurnServerSpec {
            addr: String::from("turn.example:3478"),
            username: String::from("u"),
            credential: String::from("p"),
        };
        let relay_spec = zlayer_types::nat_wire::RelayServerSpec {
            listen_port: 3478,
            external_addr: String::from("1.2.3.4:3478"),
            max_sessions: 7,
            auth_credential: Some(String::from("cluster-secret")),
        };
        let populated = NatConfigSpec {
            enabled: true,
            stun_servers: vec![String::from("stun.example:3478")],
            turn_servers: vec![turn],
            hole_punch_timeout_secs: 9,
            stun_refresh_interval_secs: 40,
            max_candidate_pairs: 3,
            relay_server: Some(relay_spec),
        };

        let converted = nat_config_spec_to_config(populated);
        assert_eq!(converted.stun_servers.len(), 1);
        assert_eq!(converted.stun_servers[0].address, "stun.example:3478");
        assert_eq!(converted.turn_servers.len(), 1);
        assert_eq!(converted.hole_punch_timeout_secs, 9);
        assert_eq!(converted.max_candidate_pairs, 3);

        let relay_cfg = converted.relay_server.expect("relay present");
        assert_eq!(relay_cfg.listen_port, 3478);
        assert_eq!(relay_cfg.max_sessions, 7);
    }

    /// A well-formed wire candidate survives `wire_to_candidate` followed by
    /// `candidate_to_wire` unchanged, while a candidate with an unparseable
    /// address or an unknown type is rejected with `None`.
    #[test]
    fn candidate_wire_conversions_round_trip() {
        // Happy path: wire -> Candidate -> wire gives back the original value.
        let original = NatCandidateWire {
            candidate_type: String::from("server-reflexive"),
            address: String::from("203.0.113.5:51820"),
            priority: 50,
        };
        let parsed = wire_to_candidate(&original).expect("valid candidate");
        assert_eq!(parsed.candidate_type, CandidateType::ServerReflexive);
        assert_eq!(parsed.priority, 50);
        assert_eq!(candidate_to_wire(&parsed), original);

        // Rejection 1: known type, but the address is not a socket address.
        let bad_address = NatCandidateWire {
            candidate_type: String::from("host"),
            address: String::from("not-an-addr"),
            priority: 1,
        };
        assert!(wire_to_candidate(&bad_address).is_none());

        // Rejection 2: valid address, but the candidate type is unknown.
        let bad_type = NatCandidateWire {
            candidate_type: String::from("bogus"),
            address: String::from("1.2.3.4:5"),
            priority: 1,
        };
        assert!(wire_to_candidate(&bad_type).is_none());
    }

    /// A Global-scope `AddPeer` that carries one valid candidate must leave
    /// exactly one entry for that peer in `peer_candidates`, and the matching
    /// `RemovePeer` must drop the entry again.
    #[tokio::test]
    async fn add_peer_records_candidates_and_remove_clears_them() {
        let mut server = test_server();
        let pubkey = "base64key".to_string();

        // Add the peer together with a single host candidate.
        let host_candidate = NatCandidateWire {
            candidate_type: "host".to_string(),
            address: "192.168.1.5:51820".to_string(),
            priority: 100,
        };
        let add_request = OverlaydRequest::AddPeer {
            peer: PeerSpec {
                public_key: pubkey.clone(),
                endpoint: "1.2.3.4:51820".to_string(),
                allowed_ips: "10.200.0.2/32".to_string(),
                persistent_keepalive_secs: 25,
                candidates: vec![host_candidate],
            },
            scope: PeerScope::Global,
        };
        let added = server.handle(add_request).await;
        assert!(matches!(added, OverlaydResponse::Ok));

        let recorded = server.peer_candidates.get(&pubkey).map(Vec::len);
        assert_eq!(recorded, Some(1), "candidates must be recorded");

        // Removing the peer must also remove its candidate entry.
        let remove_request = OverlaydRequest::RemovePeer {
            pubkey: pubkey.clone(),
            scope: PeerScope::Global,
        };
        let removed = server.handle(remove_request).await;
        assert!(matches!(removed, OverlaydResponse::Ok));
        assert!(!server.peer_candidates.contains_key(&pubkey));
    }

    /// Sending `NatStatus` to a fresh server must come back as an
    /// `OverlaydResponse::NatStatus` whose candidate and peer lists are both
    /// empty, which shows the request is routed through `handle`.
    #[tokio::test]
    async fn nat_status_request_returns_wire_snapshot() {
        let mut server = test_server();
        let wire = match server.handle(OverlaydRequest::NatStatus).await {
            OverlaydResponse::NatStatus(wire) => wire,
            other => panic!("expected NatStatus response, got {other:?}"),
        };
        assert!(wire.candidates.is_empty());
        assert!(wire.peers.is_empty());
    }

    /// True when the process can mutate netlink + `/proc/sys` (root). The
    /// teardown-completeness test below is `#[ignore]`d and additionally skips
    /// (not fails) when run via `--ignored` without privileges, matching the
    /// crate's "skip gracefully when not root" convention.
    #[cfg(target_os = "linux")]
    fn is_root() -> bool {
        // SAFETY: `geteuid` is a pure read of the caller's effective uid.
        #[allow(unsafe_code)]
        let euid = unsafe { libc::geteuid() };
        euid == 0
    }

    /// End-to-end teardown completeness: populate the server's
    /// `created_veths` / `created_bridges` / `created_host_routes` tracking sets
    /// with REAL host resources created via netlink, snapshot
    /// `net.ipv4.ip_forward`, force it to `1` (recording the prior value in
    /// `prev_ipv4_forward` exactly as `enable_forwarding_for_attach` does), then
    /// drive the same teardown the `Shutdown` request triggers
    /// (`handle(OverlaydRequest::Shutdown)`), and assert: every tracked veth /
    /// bridge / route is gone at the kernel level AND `ip_forward` is restored to
    /// the snapshot.
    ///
    /// This is the regression for the full teardown fix (revert routes + veths +
    /// bridges + forwarding sysctl on shutdown). Names are unique and <=15 chars;
    /// a belt-and-braces cleanup runs before the asserts so a failed assertion
    /// still leaves the host clean. Skips (returns) when not root.
    #[cfg(target_os = "linux")]
    #[tokio::test(flavor = "multi_thread")]
    #[ignore = "needs CAP_NET_ADMIN + /proc/sys write; run on a privileged Linux host"]
    async fn shutdown_teardown_reverts_resources_and_ip_forward() {
        if !is_root() {
            eprintln!("skipping shutdown_teardown_reverts_resources_and_ip_forward: requires root");
            return;
        }

        // Low 24 bits of the timestamp in hex: at most 6 characters, so the
        // longest name below ("zlb-" + suffix) is 10 characters.
        let suffix = format!("{:x}", now_unix() & 0xff_ffff);
        let veth_host = format!("vh-{suffix}");
        let veth_peer = format!("vp-{suffix}");
        let bridge = format!("zlb-{suffix}");
        assert!(veth_host.len() <= 15, "veth host name exceeds IFNAMSIZ");
        assert!(veth_peer.len() <= 15, "veth peer name exceeds IFNAMSIZ");
        assert!(bridge.len() <= 15, "bridge name exceeds IFNAMSIZ");

        // A single /32 host route pointed at the host-side veth.
        let dest = IpAddr::V4(Ipv4Addr::new(10, 233, 0, 9));
        let prefix: u8 = 32;

        // --- create real host resources and register them with the server's
        // teardown-tracking sets, exactly as the attach paths do. ---
        crate::netlink::create_veth_pair(&veth_host, &veth_peer)
            .await
            .expect("create_veth_pair");
        crate::netlink::create_bridge(&bridge)
            .await
            .expect("create_bridge");
        crate::netlink::replace_route_via_dev(dest, prefix, &veth_host, None)
            .await
            .expect("replace_route_via_dev");

        // Only the host end of the veth pair is registered for teardown; the
        // peer end is handled solely by the belt-and-braces cleanup further down.
        let mut server = test_server();
        server.created_veths.insert(veth_host.clone());
        server.created_bridges.insert(bridge.clone());
        server
            .created_host_routes
            .push((dest, prefix, veth_host.clone()));

        // Snapshot ip_forward, then flip it to 1 and record the prior value the
        // way enable_forwarding_for_attach does so revert_forwarding restores it.
        // If the sysctl cannot be read, the snapshot falls back to "0".
        let snapshot =
            crate::netlink::read_sysctl("net.ipv4.ip_forward").unwrap_or_else(|_| "0".to_string());
        server.prev_ipv4_forward = Some(snapshot.clone());
        crate::netlink::set_sysctl("net.ipv4.ip_forward", "1").expect("set ip_forward=1");

        // --- drive teardown via the real Shutdown dispatch path ---
        let resp = server.handle(OverlaydRequest::Shutdown).await;
        assert!(
            matches!(resp, OverlaydResponse::Ok),
            "Shutdown should return Ok, got {resp:?}"
        );

        // Snapshot kernel state AFTER teardown. The results are captured into
        // locals here and only asserted after the cleanup below has run.
        let veth_gone = !std::path::Path::new(&format!("/sys/class/net/{veth_host}")).exists();
        let bridge_gone = !std::path::Path::new(&format!("/sys/class/net/{bridge}")).exists();
        // The route counts as gone when `ip` cannot be spawned, exits non-zero,
        // or prints nothing for this destination/device.
        let route_gone = {
            let target = format!("10.233.0.9/{prefix}");
            std::process::Command::new("ip")
                .args(["route", "show", &target, "dev", &veth_host])
                .output()
                .map_or(true, |o| !o.status.success() || o.stdout.is_empty())
        };
        let ip_forward_after = crate::netlink::read_sysctl("net.ipv4.ip_forward")
            .unwrap_or_else(|_| "unknown".to_string());

        // Belt-and-braces cleanup before asserting so the host stays clean even
        // if an assertion fails (teardown should have done all of this already).
        // Errors are ignored: the resources are expected to be gone by now.
        let _ = crate::netlink::delete_route_via_dev(dest, prefix, &veth_host).await;
        let _ = crate::netlink::delete_link_by_name(&veth_host).await;
        let _ = crate::netlink::delete_link_by_name(&veth_peer).await;
        let _ = crate::netlink::delete_link_by_name(&bridge).await;
        // Restore ip_forward to the snapshot regardless of teardown outcome.
        let _ = crate::netlink::set_sysctl("net.ipv4.ip_forward", &snapshot);

        // --- assertions ---
        assert!(veth_gone, "teardown should delete the tracked host veth");
        assert!(bridge_gone, "teardown should delete the tracked bridge");
        assert!(
            route_gone,
            "teardown should delete the tracked /32 host route"
        );
        // Compared after trimming so surrounding whitespace/newlines in the
        // sysctl text do not cause a spurious mismatch.
        assert_eq!(
            ip_forward_after.trim(),
            snapshot.trim(),
            "teardown should restore net.ipv4.ip_forward to its pre-overlay value"
        );

        // Tracking sets must be drained by teardown so a re-run starts clean.
        assert!(
            server.created_veths.is_empty(),
            "created_veths should be drained by teardown"
        );
        assert!(
            server.created_bridges.is_empty(),
            "created_bridges should be drained by teardown"
        );
        assert!(
            server.created_host_routes.is_empty(),
            "created_host_routes should be drained by teardown"
        );
    }

    /// With an IPv4 overlay address and an IPv4 physical egress, `build_config`
    /// must bind the local endpoint to that egress address on the given port.
    #[test]
    fn build_config_uses_matching_physical_egress_ipv4() {
        let egress = IpAddr::V4(Ipv4Addr::new(192, 0, 2, 10));
        let overlay_ip = IpAddr::V4(Ipv4Addr::new(10, 200, 0, 1));
        let expected_endpoint = SocketAddr::new(egress, 51820);

        let server = test_server();
        let private_key = "priv".to_string();
        let public_key = "pub".to_string();
        let config = server.build_config(private_key, public_key, overlay_ip, 16, 51820, Some(egress));

        assert_eq!(config.local_endpoint, expected_endpoint);
    }

    /// Without a physical egress address, `build_config` must bind the local
    /// endpoint to the IPv4 unspecified address on the given port.
    #[test]
    fn build_config_falls_back_to_unspecified_when_none() {
        let overlay_ip = IpAddr::V4(Ipv4Addr::new(10, 200, 0, 1));
        let expected_endpoint = SocketAddr::new(IpAddr::V4(Ipv4Addr::UNSPECIFIED), 51820);

        let server = test_server();
        let private_key = "priv".to_string();
        let public_key = "pub".to_string();
        let config = server.build_config(private_key, public_key, overlay_ip, 16, 51820, None);

        assert_eq!(config.local_endpoint, expected_endpoint);
    }

    /// When the overlay address is IPv6 but the resolved physical egress is
    /// IPv4, the egress cannot be used for source selection, so `build_config`
    /// must fall back to the IPv6 unspecified address.
    #[test]
    fn build_config_falls_back_to_unspecified_on_family_mismatch() {
        // fd00::1 (v6 overlay) paired with 192.0.2.10 (v4 egress).
        let overlay_ip = IpAddr::V6(Ipv6Addr::new(0xfd00, 0, 0, 0, 0, 0, 0, 1));
        let egress = IpAddr::V4(Ipv4Addr::new(192, 0, 2, 10));
        let expected_endpoint = SocketAddr::new(IpAddr::V6(Ipv6Addr::UNSPECIFIED), 51820);

        let server = test_server();
        let private_key = "priv".to_string();
        let public_key = "pub".to_string();
        let config = server.build_config(private_key, public_key, overlay_ip, 64, 51820, Some(egress));

        assert_eq!(config.local_endpoint, expected_endpoint);
    }

    /// Pins the `rootless_forces_unspecified` decision table: it returns `true`
    /// for rootless mode and `false` otherwise.
    #[test]
    fn rootless_forces_unspecified_decision() {
        // (rootless, expected decision)
        //  - rootless: the WG local_endpoint must be forced to UNSPECIFIED,
        //    because detect_physical_egress() resolves pasta's in-netns tap IP.
        //  - non-rootless: the existing physical-egress selection path is kept.
        let table = [(true, true), (false, false)];
        for (rootless, expected) in table {
            assert_eq!(rootless_forces_unspecified(rootless), expected);
        }
    }

    /// On a fresh server (no global overlay set up), resolving the transport for
    /// `PeerScope::Global` must fail with an `OverlaydError::Other` whose message
    /// mentions that the global overlay is not set up.
    #[tokio::test]
    async fn transport_for_scope_global_requires_setup() {
        let server = test_server();
        let scope = PeerScope::Global;
        // A plain `match` is used instead of `expect_err` because the Ok type,
        // `&OverlayTransport`, is not `Debug`.
        match server.transport_for_scope(&scope) {
            Err(OverlaydError::Other(m)) => {
                assert!(m.contains("global overlay not set up"), "got: {m}");
            }
            Err(other) => panic!("unexpected error: {other:?}"),
            Ok(_) => panic!("global overlay should not be set up"),
        }
    }

    /// Resolving the transport for a service scope that was never given a
    /// dedicated overlay must fail with the exact "no dedicated overlay" message.
    #[tokio::test]
    async fn transport_for_scope_unset_service_errors() {
        let server = test_server();
        let scope = PeerScope::Service {
            service: "x".to_string(),
        };
        match server.transport_for_scope(&scope) {
            Err(OverlaydError::Other(m)) => {
                assert_eq!(m, "no dedicated overlay for service x");
            }
            Err(other) => panic!("unexpected error: {other:?}"),
            Ok(_) => panic!("no dedicated overlay should exist for x"),
        }
    }

    /// A Service-scoped `AddPeer` sent through `handle` before that service has
    /// a dedicated overlay must come back as an `Err` response carrying the
    /// exact "no dedicated overlay" message.
    #[tokio::test]
    async fn add_peer_service_scope_before_setup_errors_via_dispatch() {
        let mut server = test_server();

        let peer = PeerSpec {
            public_key: "k".to_string(),
            endpoint: "1.2.3.4:51820".to_string(),
            allowed_ips: "10.200.0.2/32".to_string(),
            persistent_keepalive_secs: 0,
            candidates: Vec::new(),
        };
        let scope = PeerScope::Service {
            service: "x".to_string(),
        };
        let resp = server.handle(OverlaydRequest::AddPeer { peer, scope }).await;

        if let OverlaydResponse::Err { message } = &resp {
            assert_eq!(message, "no dedicated overlay for service x");
        } else {
            panic!("expected Err response, got {resp:?}");
        }
    }

    /// Pins the host-adapter degrade decision made by
    /// `host_adapter_failure_is_fatal`. A `create_interface()` failure is meant
    /// to be fatal on Linux (the kernel TUN IS the container data path) and to
    /// degrade to a VM-only overlay on macOS/Windows (containers mesh VM-to-VM,
    /// the host utun/Wintun is off the data path). A real utun/Wintun syscall
    /// failure cannot be provoked from a test, so the pure classifier is checked
    /// instead: for a non-mandatory adapter its answer must equal
    /// `cfg!(target_os = "linux")` on whichever platform the test is built for,
    /// and for a mandatory adapter it must always be `true`.
    #[test]
    fn host_adapter_failure_fatal_decision() {
        // Non-mandatory: platform-driven — fatal on Linux, degrade on macOS/Windows.
        let fatal_when_optional = host_adapter_failure_is_fatal(false);
        let built_for_linux = cfg!(target_os = "linux");
        assert_eq!(
            fatal_when_optional, built_for_linux,
            "non-mandatory host-adapter failure is fatal only on Linux (kernel TUN is the data path)"
        );

        // Mandatory (host-shared macOS nodes where the utun IS the container data
        // path): fatal on every platform.
        let fatal_when_mandatory = host_adapter_failure_is_fatal(true);
        assert!(
            fatal_when_mandatory,
            "a mandatory host adapter must make failure fatal on every platform"
        );
    }

    /// With no host adapter (`global_transport == None`, the state a VM-only
    /// overlay is left in), a Global-scope `AddPeer` must not error: the dispatch
    /// is expected to skip the on-device install, answer `Ok`, and still mirror
    /// the peer into `global_peers` so it can be pushed to guests. A fresh test
    /// server is used as the stand-in for the degraded host-adapter path, since
    /// it has no transport without needing a real utun/Wintun failure.
    #[tokio::test]
    async fn add_global_peer_with_no_host_adapter_skips_and_records() {
        let mut server = test_server();
        assert!(
            server.global_transport.is_none(),
            "fresh server has no host adapter (VM-only precondition)"
        );

        let pubkey = "k".to_string();
        let peer = PeerSpec {
            public_key: pubkey.clone(),
            endpoint: "1.2.3.4:51820".to_string(),
            allowed_ips: "10.200.0.2/32".to_string(),
            persistent_keepalive_secs: 0,
            candidates: Vec::new(),
        };
        let request = OverlaydRequest::AddPeer {
            peer,
            scope: PeerScope::Global,
        };

        let resp = server.handle(request).await;
        assert!(
            matches!(resp, OverlaydResponse::Ok),
            "expected Ok (warn-and-skip), got {resp:?}"
        );
        assert!(
            server.global_peers.contains_key(&pubkey),
            "Global peer must still be mirrored for guest-config push"
        );
    }

    /// End-to-end check of `OverlayMode::Dedicated`: after the global overlay is
    /// up, setting up a dedicated service overlay must produce a device whose
    /// port, interface name and public key all differ from the global one, and a
    /// Service-scoped `AddPeer` must then succeed. Requires a real TUN device, so
    /// it is `#[ignore]`d and meant for a privileged Linux host, like the crate's
    /// other privileged overlay e2e tests.
    #[cfg(target_os = "linux")]
    #[tokio::test]
    #[ignore = "needs CAP_NET_ADMIN; run on a privileged Linux host"]
    async fn dedicated_setup_creates_distinct_device_and_routes_service_peer() {
        let mut server = test_server();

        // Step 1: the global overlay comes first, so the cluster CIDR and the
        // global device exist for the dedicated device to be compared against.
        let global_setup = server.setup_global_overlay(
            "dep".to_string(),
            "i0".to_string(),
            "10.200.0.0/16",
            Some("10.200.0.0/28"),
            zlayer_core::DEFAULT_WG_PORT,
            None,
            false,
        );
        let global_name = global_setup.await.expect("global overlay up");
        assert!(!global_name.is_empty());

        // Step 2: bring up the dedicated overlay for service "web".
        let info = server
            .setup_service_overlay("web", OverlayMode::Dedicated)
            .await
            .expect("dedicated service overlay up");
        assert_eq!(info.mode, OverlayMode::Dedicated);
        let port = info.wg_port.expect("dedicated port");
        assert_ne!(
            port, server.overlay_port,
            "dedicated device must not share the global port"
        );

        // Step 3: the recorded per-service transport must agree with `info` and
        // differ from the global device in interface and key.
        let recorded = server
            .service_transports
            .get("web")
            .expect("service transport recorded");
        assert_eq!(recorded.listen_port, port);
        assert_ne!(
            recorded.interface, global_name,
            "dedicated interface must differ from global"
        );
        assert_eq!(
            info.wg_public_key.as_ref(),
            Some(&recorded.public_key),
            "info pubkey matches recorded transport"
        );
        assert_ne!(
            server.transport_public_key.as_ref(),
            Some(&recorded.public_key),
            "dedicated key must differ from global key"
        );

        // Step 4: a Service-scoped AddPeer with a freshly generated key must be
        // accepted, which shows scope routing reaches the per-service transport.
        let (_peer_private, peer_public) = OverlayTransport::generate_keys().await.unwrap();
        let request = OverlaydRequest::AddPeer {
            peer: PeerSpec {
                public_key: peer_public,
                endpoint: "5.6.7.8:51999".to_string(),
                allowed_ips: "10.201.0.2/32".to_string(),
                persistent_keepalive_secs: 25,
                candidates: Vec::new(),
            },
            scope: PeerScope::Service {
                service: "web".to_string(),
            },
        };
        let resp = server.handle(request).await;
        assert!(
            matches!(resp, OverlaydResponse::Ok),
            "service-scoped add_peer should land on the dedicated device, got {resp:?}"
        );
    }

    /// A guest-managed `AttachContainer` sent to a server without a global
    /// overlay must be answered with an `Err` response (mentioning that the
    /// global overlay needs to be set up) and must not record any attachment.
    #[tokio::test]
    async fn guest_attach_requires_global_overlay() {
        let mut server = test_server();

        let request = OverlaydRequest::AttachContainer {
            handle: AttachHandle::GuestManaged {
                id: "vm-1".to_string(),
            },
            service: "web".to_string(),
            join_global: true,
            dns_server: None,
            dns_domain: None,
            ephemeral: false,
            isolation_network: None,
        };
        let resp = server.handle(request).await;

        if let OverlaydResponse::Err { message } = &resp {
            assert!(
                message.contains("global overlay to be set up"),
                "got: {message}"
            );
        } else {
            panic!("expected Err response, got {resp:?}");
        }

        // The failed attach must leave the attachment table untouched.
        assert!(server.guest_attachments.is_empty());
    }

    /// Detaching a guest id that was never attached must succeed (return `Ok`)
    /// rather than error or panic.
    #[tokio::test]
    async fn detach_unknown_guest_is_idempotent() {
        let mut server = test_server();
        let unknown_id = "never-attached";
        let outcome = server.detach_container_guest(unknown_id).await;
        outcome.expect("detach of unknown guest is a no-op");
    }

    /// Round-trip of a guest-managed attach followed by a detach. After the
    /// global overlay is up and one global peer is seeded, attaching guest
    /// `vm-1` must return a `GuestConfig` with a fresh key pair, the node's
    /// overlay port, the requested DNS server, and a peer list containing both
    /// the seeded peer and this node; detaching must then remove the attachment
    /// and the guest's global-peer entry. Requires a real TUN device, so it is
    /// `#[ignore]`d and meant for a privileged Linux host, like the crate's other
    /// privileged overlay e2e tests.
    #[cfg(target_os = "linux")]
    #[tokio::test]
    #[ignore = "needs CAP_NET_ADMIN; run on a privileged Linux host"]
    async fn guest_attach_allocates_config_and_detach_releases() {
        let mut server = test_server();
        let global_setup = server.setup_global_overlay(
            "dep".to_string(),
            "i0".to_string(),
            "10.200.0.0/16",
            Some("10.200.0.0/28"),
            zlayer_core::DEFAULT_WG_PORT,
            None,
            false,
        );
        global_setup.await.expect("global overlay up");

        // Seed one global peer; the guest config is expected to include it.
        let (_seed_private, other_pub) = OverlayTransport::generate_keys().await.unwrap();
        let seed_request = OverlaydRequest::AddPeer {
            peer: PeerSpec {
                public_key: other_pub.clone(),
                endpoint: "9.9.9.9:51820".to_string(),
                allowed_ips: "10.200.1.0/28".to_string(),
                persistent_keepalive_secs: 25,
                candidates: Vec::new(),
            },
            scope: PeerScope::Global,
        };
        let add = server.handle(seed_request).await;
        assert!(
            matches!(add, OverlaydResponse::Ok),
            "seed peer add: {add:?}"
        );

        // Attach the guest and unwrap the returned configuration.
        let attach_request = OverlaydRequest::AttachContainer {
            handle: AttachHandle::GuestManaged {
                id: "vm-1".to_string(),
            },
            service: "web".to_string(),
            join_global: true,
            dns_server: Some("10.200.0.1".parse().unwrap()),
            dns_domain: Some("overlay".to_string()),
            ephemeral: false,
            isolation_network: None,
        };
        let config = match server.handle(attach_request).await {
            OverlaydResponse::GuestConfig(c) => c,
            other => panic!("expected GuestConfig, got {other:?}"),
        };

        // Key material and basic settings.
        assert!(!config.private_key.is_empty());
        assert!(!config.public_key.is_empty());
        assert_ne!(config.private_key, config.public_key);
        assert_eq!(config.listen_port, server.overlay_port);
        assert_eq!(config.dns_server, Some("10.200.0.1".parse().unwrap()));

        // Peer list: must contain the seeded global peer and this node itself.
        let knows_seeded_peer = config.peers.iter().any(|p| p.public_key == other_pub);
        assert!(knows_seeded_peer, "guest must learn the seeded global peer");
        let node_key = server.transport_public_key.as_ref();
        let knows_this_node = config
            .peers
            .iter()
            .any(|p| Some(&p.public_key) == node_key);
        assert!(knows_this_node, "guest must learn THIS node as a peer");

        // Server-side bookkeeping: the guest's key is a global peer, and the
        // recorded attachment carries the same overlay IP as the config.
        assert!(server.global_peers.contains_key(&config.public_key));
        let attachment = server
            .guest_attachments
            .get("vm-1")
            .expect("attachment recorded");
        assert_eq!(attachment.overlay_ip, config.overlay_ip);

        // Detach must drop both the attachment and the guest's peer entry.
        let detach_request = OverlaydRequest::DetachContainer {
            handle: AttachHandle::GuestManaged {
                id: "vm-1".to_string(),
            },
        };
        let det = server.handle(detach_request).await;
        assert!(matches!(det, OverlaydResponse::Ok), "detach: {det:?}");
        assert!(!server.guest_attachments.contains_key("vm-1"));
        assert!(!server.global_peers.contains_key(&config.public_key));
    }

    /// `setup_service_overlay` must cope with every `OverlayMode` (`Auto`,
    /// `Shared` and `Dedicated`) without panicking. The call's own result is
    /// deliberately ignored, since the downstream bridge/transport bring-up may
    /// succeed or fail depending on `CAP_NET_ADMIN`; what is asserted is that the
    /// resolved mode ends up in `service_modes` either way. This guards against
    /// the former `unreachable!("resolve never returns Auto")` arm, which would
    /// panic on the default mode now that `resolve()` is the identity.
    #[cfg(target_os = "linux")]
    #[tokio::test]
    async fn dispatch_handles_all_three_modes_without_panic() {
        let modes = [
            OverlayMode::Auto,
            OverlayMode::Shared,
            OverlayMode::Dedicated,
        ];
        for mode in modes {
            // A fresh server and a mode-specific service name per iteration.
            let mut server = test_server();
            let service = format!("svc-{mode:?}");

            // Ok or Err are both acceptable here; a panic is not.
            let _ = server.setup_service_overlay(&service, mode).await;

            // The resolved mode must have been stored for the attach path.
            let recorded = server.service_modes.get(&service).copied();
            assert_eq!(
                recorded,
                Some(mode.resolve()),
                "mode {mode:?} must be recorded for the attach path"
            );
        }
    }

    /// Bridge-sharing semantics across modes: two different `Shared` services
    /// must end up on one and the same node-wide shared bridge, whereas an `Auto`
    /// service must be given a per-service bridge of its own. Creating bridges
    /// needs `CAP_NET_ADMIN`, so this is `#[ignore]`d like the crate's other
    /// privileged overlay e2e tests.
    #[cfg(target_os = "linux")]
    #[tokio::test]
    #[ignore = "needs CAP_NET_ADMIN; run on a privileged Linux host"]
    async fn shared_services_reuse_one_bridge_auto_gets_its_own() {
        let mut server = test_server();
        let global_setup = server.setup_global_overlay(
            "dep".to_string(),
            "i0".to_string(),
            "10.200.0.0/16",
            Some("10.200.0.0/26"),
            zlayer_core::DEFAULT_WG_PORT,
            None,
            false,
        );
        global_setup.await.expect("global overlay up");

        // "web" is the first Shared service, so it brings the shared bridge
        // into existence.
        let web_info = server
            .setup_service_overlay("web", OverlayMode::Shared)
            .await
            .expect("shared service web up");
        assert_eq!(web_info.mode, OverlayMode::Shared);
        let shared_name = {
            let bridge = server.shared_bridge.as_ref().expect("shared bridge created");
            bridge.name.clone()
        };
        assert_eq!(web_info.name, shared_name);
        // No per-service bridge may be recorded for a Shared service.
        assert!(
            !server.service_bridges.contains_key("web"),
            "Shared service must not create a per-service bridge"
        );

        // "api" is the second Shared service and must land on the same bridge.
        let api_info = server
            .setup_service_overlay("api", OverlayMode::Shared)
            .await
            .expect("shared service api up");
        assert_eq!(
            api_info.name, shared_name,
            "a second Shared service must reuse the SAME node-wide bridge"
        );
        assert!(!server.service_bridges.contains_key("api"));
        // The shared bridge object is still the one created for "web".
        assert_eq!(
            server.shared_bridge.as_ref().map(|b| &b.name),
            Some(&shared_name)
        );

        // "batch" runs in Auto mode and must get a separate per-service bridge.
        let batch_info = server
            .setup_service_overlay("batch", OverlayMode::Auto)
            .await
            .expect("auto service batch up");
        assert_eq!(batch_info.mode, OverlayMode::Auto);
        assert!(
            server.service_bridges.contains_key("batch"),
            "Auto service must get its own per-service bridge"
        );
        assert_ne!(
            batch_info.name, shared_name,
            "Auto per-service bridge must differ from the shared bridge"
        );

        // service_interfaces: both Shared services map to the shared bridge,
        // the Auto service maps to something else.
        for shared_service in ["web", "api"] {
            assert_eq!(
                server.service_interfaces.get(shared_service),
                Some(&shared_name)
            );
        }
        assert_ne!(server.service_interfaces.get("batch"), Some(&shared_name));
    }

    /// Attaching a container for a service that is marked `Shared` while no
    /// shared bridge has been set up must fail with an `OverlaydError::Other`
    /// mentioning "no shared bridge" rather than panic. The test runs without an
    /// `#[ignore]` gate and expects the failure to be reported as described.
    #[cfg(target_os = "linux")]
    #[tokio::test]
    async fn attach_shared_without_setup_errors_cleanly() {
        let mut server = test_server();

        // Record the mode only; the shared bridge itself is never created.
        let service = "web";
        server
            .service_modes
            .insert(service.to_string(), OverlayMode::Shared);

        let attach = server.attach_container_linux(424_242, service, false, false, None);
        let err = attach
            .await
            .expect_err("attach must fail without a shared bridge");

        if let OverlaydError::Other(m) = &err {
            assert!(
                m.contains("no shared bridge"),
                "expected shared-bridge error, got: {m}"
            );
        } else {
            panic!("unexpected error variant: {err:?}");
        }
    }

    /// Attaching a container onto a NAMED isolated network must record the IP
    /// returned by the attach under that network's key in `network_members`, and
    /// the subsequent detach must remove the `net-a` entry again. Bringing up the
    /// bridge + veth needs `CAP_NET_ADMIN`, so this test is ignored by default
    /// like the crate's other privileged overlay e2e tests.
    #[cfg(target_os = "linux")]
    #[tokio::test]
    #[ignore = "needs CAP_NET_ADMIN; run on a privileged Linux host"]
    async fn attach_linux_isolated_network_records_membership() {
        // The attach target is this very process: a live PID with a real netns.
        let pid = std::process::id();

        let mut server = test_server();
        server
            .setup_global_overlay(
                String::from("dep"),
                String::from("i0"),
                "10.200.0.0/16",
                Some("10.200.0.0/26"),
                zlayer_core::DEFAULT_WG_PORT,
                None,
                false,
            )
            .await
            .expect("global overlay up");

        // Setting "web" up in Auto mode provides a per-service bridge for the
        // attach below to land on.
        server
            .setup_service_overlay("web", OverlayMode::Auto)
            .await
            .expect("auto service web up");

        // Join the named isolated network "net-a" and keep the returned IP.
        let ip = server
            .attach_container_linux(pid, "web", false, true, Some(String::from("net-a")))
            .await
            .expect("attach onto isolated network");

        // The returned IP must now be listed under "net-a".
        let net_a_members = server
            .network_members
            .get("net-a")
            .expect("net-a membership recorded");
        assert!(
            net_a_members.contains(&ip),
            "network_members[net-a] must contain the attached member IP {ip}"
        );

        // After the detach, no "net-a" entry may remain in the membership map.
        server
            .detach_container_linux(pid)
            .await
            .expect("detach succeeds");
        assert!(
            server.network_members.get("net-a").is_none(),
            "empty isolated network must be dropped from network_members on last detach"
        );
    }

    /// Owner keys for an isolation network and for a dedicated per-service
    /// network live in separate namespaces: deriving both from the same name
    /// must yield different keys, so the two never collide on one
    /// marker/allocator key. Platform-agnostic.
    #[test]
    fn isolation_owner_key_distinct_from_service_owner_key() {
        // Feed the very same name to both key builders.
        let shared_name = "alpha";
        let (iso, svc) = (
            crate::network_state::owner_for_isolation_network(shared_name),
            crate::network_state::owner_for_service(shared_name),
        );

        assert_ne!(
            iso, svc,
            "isolation and service owner keys must not collide for the same name"
        );

        // Pin the exact prefix each namespace uses.
        assert_eq!(iso, "iso:alpha");
        assert_eq!(svc, "service:alpha");
    }

    /// Pins three properties of `isolation_network_subnet`:
    ///
    /// * deterministic — the same name yields the same block on every call (so a
    ///   reused HCN network keeps its subnet across restarts);
    /// * contained — the block is a /28 lying wholly INSIDE the node slice;
    /// * disjoint — a different name that lands on a different block must not
    ///   overlap the first one (distinct L3-isolated networks must not share an
    ///   address range).
    ///
    /// Windows-only (the method is `cfg(windows)`); exercised by
    /// `cargo xwin test`.
    #[cfg(target_os = "windows")]
    #[test]
    fn isolation_network_subnet_is_deterministic_disjoint_and_inside_slice() {
        // One literal feeds both views of the node slice: the `IpNetwork` stored
        // on the server and the `ipnet::IpNet` used for containment checks.
        let slice_text = "10.200.5.0/26";
        let slice_net: ipnet::IpNet = slice_text.parse().unwrap();

        let mut server = test_server();
        server.slice_cidr = Some(slice_text.parse::<IpNetwork>().unwrap());

        // Determinism: two lookups of one name must agree.
        let a1 = server.isolation_network_subnet("alpha").unwrap();
        let a1_again = server.isolation_network_subnet("alpha").unwrap();
        assert_eq!(
            a1, a1_again,
            "same isolation network must map to the same subnet"
        );

        // Containment: both the first and the last address of the block must
        // fall inside the slice, and the block must be a /28.
        let inside_slice = [a1.network(), a1.broadcast()]
            .iter()
            .all(|addr| slice_net.contains(addr));
        assert!(
            inside_slice,
            "isolation subnet {a1} must be wholly inside the node slice {slice_net}"
        );
        assert_eq!(a1.prefix_len(), 28, "expected a /28 isolation sub-block");

        // Disjointness: a candidate name may resolve to the same block as
        // `alpha`, so walk several names and keep the first one whose block
        // differs before asserting that the two do not overlap.
        let candidates = ["beta", "gamma", "delta", "omega", "zeta"];
        let other = candidates
            .iter()
            .copied()
            .map(|name| server.isolation_network_subnet(name).unwrap())
            .find(|block| block != &a1)
            .expect("at least one other name must land on a different /28 block");
        let disjoint = !a1.contains(&other.network()) && !other.contains(&a1.network());
        assert!(
            disjoint,
            "distinct isolation networks must occupy disjoint subnets ({a1} vs {other})"
        );
    }
}