Skip to main content

zlayer_overlayd/
server.rs

1//! The overlayd server engine.
2//!
3//! [`OverlaydServer`] is a near 1:1 migration of the *mechanics* half of the
4//! agent's `OverlayManager`: it owns the single cluster `WireGuard`
5//! [`OverlayTransport`], the per-service Linux bridges (Linux) / HCN Internal
6//! network + endpoints (Windows), the per-node IP allocator, DNS config, and
7//! NAT traversal. The cluster-brain half (Raft, scheduler, service registry)
8//! stays in the main daemon, which drives this server over the IPC contract in
9//! [`zlayer_types::overlayd`].
10//!
11//! Every [`OverlaydRequest`] maps to a method here via [`OverlaydServer::handle`].
12
13use std::collections::HashMap;
14use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr};
15#[cfg(target_os = "linux")]
16use std::os::fd::AsFd;
17use std::path::{Path, PathBuf};
18use std::sync::atomic::{AtomicU64, Ordering};
19
20use ipnetwork::IpNetwork;
21use zlayer_overlay::{NatConfig, NatTraversal, OverlayConfig, OverlayTransport, PeerInfo};
22use zlayer_types::overlayd::{
23    AttachHandle, AttachResult, DedicatedServiceStatus, GuestOverlayConfig, OverlayMode,
24    OverlaydRequest, OverlaydResponse, PeerScope, PeerSpec, PeerStatus, ServiceOverlayInfo,
25    StatusSnapshot,
26};
27
28use crate::error::OverlaydError;
29use crate::network_state::{
30    owner_for_service, DedicatedPortAllocator, ManagedNetwork, NetworkState,
31};
32
/// Longest interface name Linux accepts (`IFNAMSIZ` minus the trailing NUL).
const MAX_IFNAME_LEN: usize = 15;

/// Build an overlay interface name that is at most [`MAX_IFNAME_LEN`] bytes.
///
/// The readable form is `zl-<parts joined by '-'>`, followed by `-<suffix>`
/// when `suffix` is non-empty. If that form fits the limit it is returned
/// unchanged.
///
/// Otherwise every part and the suffix are fed to a hasher and the lowercase
/// hex digest replaces the readable parts:
///
/// * empty suffix: `zl-<up to 12 hex chars>`;
/// * suffix that leaves room for at least one hex char:
///   `zl-<hex>-<suffix>`;
/// * suffix so long that no hex char would fit: the suffix is dropped from the
///   name (it still contributes to the digest) and the result is
///   `zl-<up to 12 hex chars>`.
///
/// NOTE(review): the digest comes from `DefaultHasher`, whose algorithm the
/// standard library does not promise to keep stable across Rust releases, so
/// hashed names are only reproducible for a given toolchain — confirm this is
/// acceptable wherever names must survive an upgrade.
#[must_use]
pub fn make_interface_name(parts: &[&str], suffix: &str) -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    const PREFIX: &str = "zl-";

    // Fast path: the human-readable name already fits.
    let mut readable = format!("{PREFIX}{}", parts.join("-"));
    if !suffix.is_empty() {
        readable.push('-');
        readable.push_str(suffix);
    }
    if readable.len() <= MAX_IFNAME_LEN {
        return readable;
    }

    // Slow path: fold all inputs into one hex digest.
    let digest = {
        let mut state = DefaultHasher::new();
        parts.iter().for_each(|part| part.hash(&mut state));
        suffix.hash(&mut state);
        format!("{:x}", state.finish())
    };

    // Bytes left for the digest once the prefix is accounted for.
    let room_after_prefix = MAX_IFNAME_LEN - PREFIX.len();
    // Bytes left for the digest when "-<suffix>" is kept as well (0 = no room).
    let room_with_suffix = room_after_prefix.saturating_sub(1 + suffix.len());

    let keep_suffix = !suffix.is_empty() && room_with_suffix > 0;
    if keep_suffix {
        let take = room_with_suffix.min(digest.len());
        format!("zl-{}-{}", &digest[..take], suffix)
    } else {
        let take = room_after_prefix.min(digest.len());
        format!("zl-{}", &digest[..take])
    }
}
81
82/// First usable host address in `subnet`.
83///
84/// For IPv4 this is `network() + 1` (skipping the network address). For IPv6
85/// the same rule applies — the network address is conventionally reserved.
86fn first_usable_ip(subnet: ipnet::IpNet) -> IpAddr {
87    match subnet {
88        ipnet::IpNet::V4(v4) => {
89            let net = u32::from(v4.network());
90            IpAddr::V4(Ipv4Addr::from(net.wrapping_add(1)))
91        }
92        ipnet::IpNet::V6(v6) => {
93            let net = u128::from(v6.network());
94            IpAddr::V6(Ipv6Addr::from(net.wrapping_add(1)))
95        }
96    }
97}
98
/// Bridge-specific inputs handed to [`OverlaydServer::attach_to_interface`]
/// when a container joins a per-service Linux bridge.
#[cfg(target_os = "linux")]
#[derive(Debug)]
struct BridgeAttachParams<'a> {
    /// Name of the host bridge that the host end of the veth is enslaved to.
    bridge_name: &'a str,
    /// L3 gateway address of the bridge; used as the container's default route.
    gateway: IpAddr,
    /// Prefix length of the subnet the bridge serves.
    subnet_prefix_len: u8,
}
111
/// Bookkeeping stored by [`OverlaydServer::attach_container`] for each
/// container that attaches successfully on Linux, so `detach_container` knows
/// what to undo.
#[cfg(target_os = "linux")]
#[derive(Debug, Clone)]
struct AttachInfo {
    /// Address handed out on the per-service overlay (the container's eth0).
    service_ip: IpAddr,
    /// Service whose bridge owns `service_ip`, when there is one.
    service_name: Option<String>,
    /// Address handed out on the global overlay (eth1), if the container
    /// joined it.
    global_ip: Option<IpAddr>,
    /// `true` exactly when the container is also attached to the global
    /// overlay (eth1).
    joined_global: bool,
}
126
/// Bookkeeping stored by [`OverlaydServer::attach_container_guest`] for a
/// guest-managed attach.
///
/// Nothing here is platform-specific (no netns, veth, or HCN state): the guest
/// runs its own `WireGuard` device, and the host's part was limited to
/// allocating an address and registering the guest's public key as a global
/// peer.
#[derive(Debug, Clone)]
struct GuestAttachInfo {
    /// Overlay address given to the guest; released again on detach.
    overlay_ip: IpAddr,
    /// Base64 public key registered for the guest on the global transport;
    /// removed again on detach.
    public_key: String,
    /// Service whose bridge pool `overlay_ip` came from (Linux service-bridge
    /// path), or `None` when it came from the node slice. Kept, like
    /// `AttachInfo::service_name`, so detach hands the address back to the
    /// pool it was drawn from.
    service_name: Option<String>,
}
143
144/// Per-service Linux bridge state. One bridge per service per node; containers
145/// attach to it via veth pairs and cross-node packets ride the single cluster
146/// `OverlayTransport` with the service subnet plumbed into its `AllowedIPs`.
147#[cfg(target_os = "linux")]
148#[derive(Debug)]
149struct ServiceBridge {
150    /// Linux bridge name, kept under IFNAMSIZ-1 by [`make_interface_name`].
151    name: String,
152    /// CIDR of the service's subnet on this node.
153    subnet: ipnet::IpNet,
154    /// Gateway IP within the subnet (first usable address).
155    gateway: IpAddr,
156    /// Per-service IP allocator covering `subnet`.
157    ip_allocator: zlayer_overlay::allocator::IpAllocator,
158}
159
160/// A dedicated per-service `WireGuard` transport (`OverlayMode::Dedicated`).
161///
162/// Unlike Shared mode — where every service subnet is plumbed onto the single
163/// cluster [`OverlayTransport`] via multi-CIDR `AllowedIPs` — a Dedicated
164/// service owns a *second* real `WireGuard` device with its own crypto context,
165/// listen port, overlay IP, and subnet. The device is portable (boringtun
166/// userspace `WireGuard` works on Linux/macOS/Windows), so this struct is
167/// cross-platform; only the bridge/HCN *attachment* of containers onto it is
168/// platform-gated.
169struct ServiceTransport {
170    /// The live dedicated `WireGuard` device. Dropping it tears down the TUN.
171    transport: OverlayTransport,
172    /// Actual interface name (kernel-assigned `utunN` on macOS).
173    interface: String,
174    /// base64 public key of this dedicated device.
175    public_key: String,
176    /// UDP listen port handed out by [`DedicatedPortAllocator`].
177    listen_port: u16,
178    /// This node's overlay IP on the dedicated device.
179    overlay_ip: std::net::IpAddr,
180    /// The service's subnet carried by the dedicated device.
181    subnet: ipnet::IpNet,
182}
183
184/// The overlay daemon engine.
185pub struct OverlaydServer {
186    /// Deployment name (used for network naming). Set by `SetupGlobalOverlay`.
187    deployment: String,
188    /// Per-daemon-process disambiguator included in overlay link names. Set by
189    /// `SetupGlobalOverlay`.
190    instance_id: String,
191    /// Root data directory; HCN markers, IPAM state, etc. live under it.
192    data_dir: PathBuf,
193    /// Global overlay interface name.
194    global_interface: Option<String>,
195    /// Global overlay transport (kept alive for the TUN device lifetime). The
196    /// SINGLE cluster-wide `WireGuard` transport; every service subnet is
197    /// plumbed through its `AllowedIPs`.
198    global_transport: Option<OverlayTransport>,
199    /// Service-name -> per-service Linux bridge / placeholder name.
200    service_interfaces: HashMap<String, String>,
201    /// Service-name -> dedicated per-service `WireGuard` transport (Dedicated
202    /// mode). Coexists with `global_transport`. Empty for Shared-only nodes.
203    service_transports: HashMap<String, ServiceTransport>,
204    /// Port allocator for dedicated devices (band above the global WG port).
205    dedicated_ports: DedicatedPortAllocator,
206    /// Per-service bridge state (Linux only).
207    #[cfg(target_os = "linux")]
208    service_bridges: HashMap<String, ServiceBridge>,
209    /// Local fallback `ServiceSubnetRegistry`. Used by the Linux Shared bridge
210    /// path and by the cross-platform Dedicated path (subnets stay globally
211    /// unique regardless of mode/OS).
212    service_subnet_registry: Option<zlayer_overlay::allocator::ServiceSubnetRegistry>,
213    /// Local raft node id used as the partition key for service-subnet assign.
214    local_node_id: u64,
215    /// Base64 `WireGuard` public key of THIS node's cluster transport, as told
216    /// by the main daemon via `SetLocalWgPubkey` (used for service-subnet
217    /// `AllowedIPs` plumbing).
218    local_wg_pubkey: Option<String>,
219    /// Public key generated for the live global transport, recorded at
220    /// `setup_global_overlay` time so `Status` can surface it (the transport
221    /// itself exposes no public-key accessor).
222    transport_public_key: Option<String>,
223    /// IP allocator for the node's overlay slice.
224    ip_allocator: IpAllocator,
225    /// This node's IP on the global overlay network.
226    node_ip: Option<IpAddr>,
227    /// `WireGuard` listen port for the overlay network.
228    overlay_port: u16,
229    /// Full cluster CIDR (e.g. `10.200.0.0/16`).
230    cluster_cidr: Option<IpNetwork>,
231    /// Per-node slice CIDR.
232    slice_cidr: Option<IpNetwork>,
233    /// Map of HCN namespace GUID -> (`service_name`, `allocated_ip`) for autoclean.
234    #[cfg(target_os = "windows")]
235    hcn_cleanup: HashMap<windows::core::GUID, (String, std::net::IpAddr)>,
236    /// Per-service container-IP allocators for Windows dedicated services. Each
237    /// is bounded to that service's subnet (not the node slice) so dedicated
238    /// containers draw addresses from their own isolated network. Keyed by
239    /// service name; created lazily on the first dedicated attach.
240    #[cfg(target_os = "windows")]
241    service_ip_allocators: HashMap<String, IpAllocator>,
242    /// Per-PID tracking of overlay attachments on Linux.
243    #[cfg(target_os = "linux")]
244    attached: HashMap<u32, AttachInfo>,
245    /// Peers installed on the GLOBAL transport via `AddPeer { Global }`, keyed by
246    /// base64 public key. Tracked here (in wire-safe [`PeerSpec`] form, with the
247    /// keys kept base64 — the boringtun UAPI dump only exposes hex keys) so a
248    /// guest-managed attach can hand the guest the exact peer set the host's own
249    /// global device carries. Platform-agnostic: the guest path runs on macOS.
250    global_peers: HashMap<String, PeerSpec>,
251    /// Guest-managed overlay attachments, keyed by the opaque container `id` from
252    /// [`AttachHandle::GuestManaged`]. Records the allocated overlay IP and the
253    /// generated public key registered in the mesh so `DetachContainer` can
254    /// release the IP and remove the peer.
255    guest_attachments: HashMap<String, GuestAttachInfo>,
256    /// Overlay DNS server listen address, if one was bootstrapped.
257    dns_server_addr: Option<SocketAddr>,
258    /// DNS domain for overlay service discovery.
259    dns_domain: Option<String>,
260    /// Overlay DNS A/AAAA records this node owns (name -> ip).
261    dns_records: HashMap<String, IpAddr>,
262    /// NAT traversal configuration threaded into every `OverlayConfig`.
263    nat_config: Option<NatConfig>,
264    /// Override for `OverlayConfig::uapi_sock_dir`.
265    uapi_sock_dir: Option<PathBuf>,
266    /// Live NAT traversal orchestrator.
267    nat_traversal: Option<NatTraversal>,
268    /// Unix-epoch seconds of the last successful candidate gather / STUN refresh.
269    nat_last_refresh: AtomicU64,
270    /// Set when a `Shutdown` request has been received.
271    shutdown_requested: bool,
272}
273
274impl OverlaydServer {
275    /// Create a fresh server bound to `data_dir`. The overlay itself is brought
276    /// up lazily by `SetupGlobalOverlay` (which carries the deployment, slice,
277    /// port, and NAT toggle from the main daemon).
278    ///
279    /// # Panics
280    /// Panics only if the compile-time-constant default CIDR `10.200.0.0/16`
281    /// fails to parse (impossible).
282    #[must_use]
283    pub fn new(data_dir: PathBuf) -> Self {
284        // Until SetupGlobalOverlay arrives, the allocator is bounded to the
285        // default cluster /16. SetupGlobalOverlay re-binds it to the node slice.
286        let default_cidr: IpNetwork = "10.200.0.0/16".parse().expect("compile-time constant CIDR");
287        let overlay_port = zlayer_core::DEFAULT_WG_PORT;
288
289        // Rehydrate the dedicated-port allocator from the on-disk marker so a
290        // service that already owns a dedicated overlay re-binds the exact UDP
291        // port it had before this process started.
292        let marker_path = zlayer_paths::ZLayerDirs::new(data_dir.clone()).agent_network_state();
293        let recorded_dedicated_ports: Vec<u16> = NetworkState::load(&marker_path)
294            .networks
295            .iter()
296            .filter(|n| n.owner.starts_with("service:"))
297            .filter_map(|n| n.wg_port)
298            .collect();
299
300        Self {
301            deployment: String::new(),
302            instance_id: String::new(),
303            data_dir,
304            global_interface: None,
305            global_transport: None,
306            service_interfaces: HashMap::new(),
307            service_transports: HashMap::new(),
308            dedicated_ports: DedicatedPortAllocator::new(overlay_port, recorded_dedicated_ports),
309            #[cfg(target_os = "linux")]
310            service_bridges: HashMap::new(),
311            service_subnet_registry: None,
312            local_node_id: 0,
313            local_wg_pubkey: None,
314            transport_public_key: None,
315            ip_allocator: IpAllocator::new(default_cidr),
316            node_ip: None,
317            overlay_port,
318            cluster_cidr: Some(default_cidr),
319            slice_cidr: None,
320            #[cfg(target_os = "windows")]
321            hcn_cleanup: HashMap::new(),
322            #[cfg(target_os = "windows")]
323            service_ip_allocators: HashMap::new(),
324            #[cfg(target_os = "linux")]
325            attached: HashMap::new(),
326            global_peers: HashMap::new(),
327            guest_attachments: HashMap::new(),
328            dns_server_addr: None,
329            dns_domain: None,
330            dns_records: HashMap::new(),
331            nat_config: None,
332            uapi_sock_dir: None,
333            nat_traversal: None,
334            nat_last_refresh: AtomicU64::new(0),
335            shutdown_requested: false,
336        }
337    }
338
339    /// Override the `WireGuard` UAPI socket directory for every overlay
340    /// transport built by this server.
341    #[must_use]
342    pub fn with_uapi_sock_dir(mut self, dir: impl Into<PathBuf>) -> Self {
343        self.uapi_sock_dir = Some(dir.into());
344        self
345    }
346
347    /// Whether a `Shutdown` request has been received.
348    #[must_use]
349    pub fn shutdown_requested(&self) -> bool {
350        self.shutdown_requested
351    }
352
353    /// The root data directory this server was constructed with. Used by the
354    /// uninstall path (`purge_managed_networks`) and for HCN marker resolution.
355    #[must_use]
356    pub fn data_dir(&self) -> &Path {
357        &self.data_dir
358    }
359
360    // -- request dispatch ----------------------------------------------------
361
362    /// Execute one [`OverlaydRequest`], producing the [`OverlaydResponse`] the
363    /// server sends back over IPC. Any internal error is folded into
364    /// [`OverlaydResponse::Err`].
365    pub async fn handle(&mut self, req: OverlaydRequest) -> OverlaydResponse {
366        match self.dispatch(req).await {
367            Ok(resp) => resp,
368            Err(e) => OverlaydResponse::Err {
369                message: e.to_string(),
370            },
371        }
372    }
373
374    #[allow(clippy::too_many_lines)]
375    async fn dispatch(&mut self, req: OverlaydRequest) -> Result<OverlaydResponse, OverlaydError> {
376        match req {
377            OverlaydRequest::SetLocalNodeId { node_id } => {
378                self.local_node_id = node_id;
379                Ok(OverlaydResponse::Ok)
380            }
381            OverlaydRequest::SetLocalWgPubkey { pubkey } => {
382                self.local_wg_pubkey = Some(pubkey);
383                Ok(OverlaydResponse::Ok)
384            }
385            OverlaydRequest::SetupGlobalOverlay {
386                deployment,
387                instance_id,
388                cluster_cidr,
389                slice_cidr,
390                wg_port,
391                nat_enabled,
392            } => {
393                let name = self
394                    .setup_global_overlay(
395                        deployment,
396                        instance_id,
397                        &cluster_cidr,
398                        slice_cidr.as_deref(),
399                        wg_port,
400                        nat_enabled,
401                    )
402                    .await?;
403                Ok(OverlaydResponse::BridgeName { name })
404            }
405            OverlaydRequest::TeardownGlobalOverlay => {
406                self.teardown_global_overlay();
407                Ok(OverlaydResponse::Ok)
408            }
409            OverlaydRequest::SetupServiceOverlay { service, mode } => {
410                let info = self.setup_service_overlay(&service, mode).await?;
411                Ok(OverlaydResponse::ServiceOverlay(info))
412            }
413            OverlaydRequest::TeardownServiceOverlay { service } => {
414                self.teardown_service_overlay(&service).await;
415                Ok(OverlaydResponse::Ok)
416            }
417            OverlaydRequest::AllocateIp {
418                service,
419                join_global,
420            } => {
421                let ip = self.allocate_ip(&service, join_global)?;
422                Ok(OverlaydResponse::Ip { ip })
423            }
424            OverlaydRequest::ReleaseIp { ip } => {
425                self.release_ip(ip);
426                Ok(OverlaydResponse::Ok)
427            }
428            OverlaydRequest::AttachContainer {
429                handle,
430                service,
431                join_global,
432                dns_server,
433                dns_domain,
434            } => {
435                // A guest-managed attach takes a wholly separate path: it cannot
436                // build a veth/HCN endpoint (the target is a VM, not a host
437                // process), so it allocates the overlay identity + peer set and
438                // returns it as `GuestConfig`. PID/HCN handles keep the existing
439                // veth/HCN attach and return `Attached`.
440                if let AttachHandle::GuestManaged { id } = handle {
441                    // Record the overlay DNS resolver/zone the daemon staged for
442                    // this node so the guest config can fall back to them (same
443                    // bookkeeping `attach_container` does for the other handles).
444                    if let Some(server) = dns_server {
445                        self.dns_server_addr = Some(SocketAddr::new(server, 53));
446                    }
447                    if dns_domain.is_some() {
448                        self.dns_domain.clone_from(&dns_domain);
449                    }
450                    let config = self
451                        .attach_container_guest(&id, &service, join_global, dns_server, dns_domain)
452                        .await?;
453                    Ok(OverlaydResponse::GuestConfig(config))
454                } else {
455                    let result = self
456                        .attach_container(handle, &service, join_global, dns_server, dns_domain)
457                        .await?;
458                    Ok(OverlaydResponse::Attached(result))
459                }
460            }
461            OverlaydRequest::DetachContainer { handle } => {
462                if let AttachHandle::GuestManaged { id } = handle {
463                    self.detach_container_guest(&id).await?;
464                } else {
465                    self.detach_container(handle).await?;
466                }
467                Ok(OverlaydResponse::Ok)
468            }
469            // `scope` selects the target device: `Global` (default) = the single
470            // cluster transport; `Service { service }` = that service's
471            // dedicated per-service transport.
472            OverlaydRequest::AddPeer { peer, scope } => {
473                let info = peer_spec_to_info(&peer)?;
474                let transport = self.transport_for_scope(&scope)?;
475                Self::add_peer_on(transport, &info).await?;
476                // Mirror Global peers into `global_peers` so a guest-managed
477                // attach can reproduce the host's global peer set for the guest.
478                if matches!(scope, PeerScope::Global) {
479                    self.global_peers.insert(peer.public_key.clone(), peer);
480                }
481                Ok(OverlaydResponse::Ok)
482            }
483            OverlaydRequest::RemovePeer { pubkey, scope } => {
484                let transport = self.transport_for_scope(&scope)?;
485                Self::remove_peer_on(transport, &pubkey).await?;
486                if matches!(scope, PeerScope::Global) {
487                    self.global_peers.remove(&pubkey);
488                }
489                Ok(OverlaydResponse::Ok)
490            }
491            OverlaydRequest::AddAllowedIp {
492                pubkey,
493                cidr,
494                scope,
495            } => {
496                let transport = self.transport_for_scope(&scope)?;
497                Self::add_allowed_ip_on(transport, &pubkey, &cidr).await?;
498                Ok(OverlaydResponse::Ok)
499            }
500            OverlaydRequest::RemoveAllowedIp {
501                pubkey,
502                cidr,
503                scope,
504            } => {
505                let transport = self.transport_for_scope(&scope)?;
506                Self::remove_allowed_ip_on(transport, &pubkey, &cidr).await?;
507                Ok(OverlaydResponse::Ok)
508            }
509            OverlaydRequest::RegisterDns { name, ip } => {
510                self.register_dns(name, ip);
511                Ok(OverlaydResponse::Ok)
512            }
513            OverlaydRequest::UnregisterDns { name } => {
514                self.unregister_dns(&name);
515                Ok(OverlaydResponse::Ok)
516            }
517            OverlaydRequest::Status => Ok(OverlaydResponse::Status(self.status_snapshot().await)),
518            OverlaydRequest::NatTick => {
519                self.nat_maintenance_tick().await?;
520                Ok(OverlaydResponse::Ok)
521            }
522            OverlaydRequest::Shutdown => {
523                self.shutdown_requested = true;
524                self.teardown_global_overlay();
525                Ok(OverlaydResponse::Ok)
526            }
527        }
528    }
529
530    // -- global overlay ------------------------------------------------------
531
532    /// Bring up (or reuse) this node's base/global overlay.
533    ///
534    /// Idempotent: if a global transport is already live, reuse it (recreating
535    /// without this guard could yank the kernel TUN out from under the running
536    /// boringtun worker). Re-binds the IP allocator to `slice_cidr` if one is
537    /// supplied so container IPs never collide across nodes.
538    ///
539    /// # Errors
540    /// Returns an error if key generation or interface creation fails.
541    async fn setup_global_overlay(
542        &mut self,
543        deployment: String,
544        instance_id: String,
545        cluster_cidr: &str,
546        slice_cidr: Option<&str>,
547        wg_port: u16,
548        nat_enabled: bool,
549    ) -> Result<String, OverlaydError> {
550        self.deployment = deployment;
551        self.instance_id = instance_id;
552        self.overlay_port = wg_port;
553
554        let cluster: IpNetwork = cluster_cidr.parse().map_err(|e| {
555            OverlaydError::Other(format!("invalid cluster CIDR {cluster_cidr}: {e}"))
556        })?;
557        self.cluster_cidr = Some(cluster);
558        if let Some(slice) = slice_cidr {
559            let slice_net: IpNetwork = slice
560                .parse()
561                .map_err(|e| OverlaydError::Other(format!("invalid slice CIDR {slice}: {e}")))?;
562            self.slice_cidr = Some(slice_net);
563            self.ip_allocator = IpAllocator::new(slice_net);
564        }
565        // NAT defaults to enabled (NatConfig::default()); honor an explicit
566        // disable from the main daemon by stamping a disabled config.
567        if !nat_enabled {
568            self.nat_config = Some(NatConfig {
569                enabled: false,
570                ..NatConfig::default()
571            });
572        }
573
574        if let Some(name) = self.global_interface.clone() {
575            if self.global_transport.is_some() {
576                tracing::debug!(
577                    deployment = %self.deployment,
578                    "Global overlay already active, reusing existing transport"
579                );
580                return Ok(name);
581            }
582        }
583
584        let interface_name = make_interface_name(&[&self.deployment, &self.instance_id], "g");
585
586        let (private_key, public_key) = OverlayTransport::generate_keys()
587            .await
588            .map_err(|e| OverlaydError::Overlay(format!("Failed to generate keys: {e}")))?;
589
590        let node_ip = self.ip_allocator.allocate()?;
591        self.transport_public_key = Some(public_key.clone());
592        let physical_egress_ip = match zlayer_overlay::detect_physical_egress().await {
593            Ok(egress) => Some(egress.ip),
594            Err(e) => {
595                tracing::warn!(
596                    error = %e,
597                    "failed to detect physical egress; WireGuard local_endpoint \
598                     will bind UNSPECIFIED for the global overlay"
599                );
600                None
601            }
602        };
603        let config = self.build_config(
604            private_key,
605            public_key,
606            node_ip,
607            16,
608            self.overlay_port,
609            physical_egress_ip,
610        );
611        let mut transport = OverlayTransport::new(config, interface_name);
612
613        transport
614            .create_interface()
615            .await
616            .map_err(|e| OverlaydError::Overlay(format!("Failed to create global overlay: {e}")))?;
617        transport.configure(&[]).await.map_err(|e| {
618            OverlaydError::Overlay(format!("Failed to configure global overlay: {e}"))
619        })?;
620
621        // Read back the actual interface name (on macOS, the kernel assigns utunN).
622        let actual_name = transport.interface_name().to_string();
623
624        self.node_ip = Some(node_ip);
625        self.global_interface = Some(actual_name.clone());
626        self.global_transport = Some(transport);
627        Ok(actual_name)
628    }
629
630    /// Tear down the node's base overlay (e.g. on full uninstall / shutdown).
631    fn teardown_global_overlay(&mut self) {
632        if let Some(mut transport) = self.global_transport.take() {
633            tracing::info!("Shutting down global overlay");
634            transport.shutdown();
635        }
636        self.global_interface = None;
637        self.transport_public_key = None;
638    }
639
640    // -- service overlay -----------------------------------------------------
641
642    /// Set up the per-service Linux bridge that backs `service` on this node.
643    ///
644    /// Returns the bridge name on success.
645    ///
646    /// # Errors
647    /// Returns an error if subnet assignment fails (exhaustion), if the bridge
648    /// cannot be created, or if the cluster transport rejects the `AllowedIPs`
649    /// update.
650    #[cfg(target_os = "linux")]
651    async fn setup_service_overlay(
652        &mut self,
653        service: &str,
654        mode: OverlayMode,
655    ) -> Result<ServiceOverlayInfo, OverlaydError> {
656        match mode.resolve() {
657            OverlayMode::Shared => self.setup_service_overlay_shared(service).await,
658            OverlayMode::Dedicated => self.setup_service_overlay_dedicated(service).await,
659            OverlayMode::Auto => unreachable!("OverlayMode::resolve never returns Auto"),
660        }
661    }
662
663    /// Shared-mode per-service overlay (Linux): the per-service bridge backed by
664    /// the single cluster transport. This is the original `setup_service_overlay`
665    /// body verbatim, now returning a [`ServiceOverlayInfo`] with the bridge name
666    /// and all identity fields `None` (Shared mode shares the cluster device).
667    ///
668    /// Returns the bridge name on success.
669    ///
670    /// # Errors
671    /// Returns an error if subnet assignment fails (exhaustion), if the bridge
672    /// cannot be created, or if the cluster transport rejects the `AllowedIPs`
673    /// update.
674    #[cfg(target_os = "linux")]
675    #[allow(clippy::too_many_lines)]
676    async fn setup_service_overlay_shared(
677        &mut self,
678        service: &str,
679    ) -> Result<ServiceOverlayInfo, OverlaydError> {
680        // 1. Idempotency check.
681        if let Some(existing) = self.service_bridges.get(service) {
682            let name = existing.name.clone();
683            tracing::debug!(service = %service, bridge = %name, "Service bridge already active, reusing");
684            return Ok(shared_overlay_info(name));
685        }
686
687        // 2. Assign subnet via the (currently local) ServiceSubnetRegistry.
688        self.ensure_service_subnet_registry()?;
689        let subnet: ipnet::IpNet = {
690            let registry = self
691                .service_subnet_registry
692                .as_mut()
693                .expect("ensure_service_subnet_registry leaves Some");
694            let node_key = self.local_node_id.to_string();
695            registry.assign(service, &node_key).map_err(|e| {
696                OverlaydError::Overlay(format!(
697                    "ServiceSubnetRegistry::assign({service}, {node_key}) failed: {e}"
698                ))
699            })?
700        };
701
702        // 3+4+6. Create the per-service Linux bridge, assign its gateway, bring
703        // it up, build the per-service IpAllocator, and record it.
704        let bridge_name = self.create_service_bridge(service, subnet).await?;
705
706        // 5. Plumb subnet into the cluster transport's local AllowedIPs so the
707        // single cluster device carries this service's cross-node traffic
708        // (Shared mode shares one crypto context for every service).
709        if let Some(ref cluster) = self.global_transport {
710            if let Some(ref pubkey) = self.local_wg_pubkey {
711                if let Err(e) = cluster.add_allowed_ip(pubkey, subnet).await {
712                    tracing::warn!(
713                        service = %service,
714                        subnet = %subnet,
715                        error = %e,
716                        "Failed to add service subnet to cluster transport AllowedIPs (non-fatal)"
717                    );
718                }
719            } else {
720                tracing::debug!(service = %service, "local_wg_pubkey not yet set; skipping cluster AllowedIPs update");
721            }
722        }
723
724        Ok(shared_overlay_info(bridge_name))
725    }
726
727    /// Create the per-service Linux bridge for `service` on `subnet`, assign its
728    /// gateway, bring it up, build the per-service [`IpAllocator`], and record it
729    /// in `service_bridges` + `service_interfaces`. Returns the bridge name.
730    ///
731    /// Shared and Dedicated mode share this bridge mechanic verbatim — the ONLY
732    /// difference between the two modes is which `WireGuard` device the service
733    /// subnet/peers are plumbed onto (the single cluster transport for Shared,
734    /// the dedicated per-service transport for Dedicated). This helper does NOT
735    /// touch any transport's `AllowedIPs`; the caller does that against the
736    /// device it owns.
737    ///
738    /// # Errors
739    /// Returns an error if the bridge cannot be created, addressed, or brought
740    /// up, or if the per-service `IpAllocator` cannot be built.
741    #[cfg(target_os = "linux")]
742    async fn create_service_bridge(
743        &mut self,
744        service: &str,
745        subnet: ipnet::IpNet,
746    ) -> Result<String, OverlaydError> {
747        use zlayer_overlay::allocator::IpAllocator as OverlayIpAllocator;
748
749        let bridge_name = make_interface_name(&[&self.deployment, &self.instance_id, service], "b");
750
751        if let Err(e) = crate::netlink::create_bridge(&bridge_name).await {
752            return Err(OverlaydError::Overlay(format!(
753                "create_bridge({bridge_name}) failed: {e}"
754            )));
755        }
756        if let Err(e) = crate::netlink::set_bridge_stp(&bridge_name, false) {
757            tracing::warn!(bridge = %bridge_name, error = %e, "set_bridge_stp(off) failed (non-fatal)");
758        }
759
760        // Gateway = first usable host in the subnet, assigned to the bridge.
761        let gateway = first_usable_ip(subnet);
762        if let Err(e) =
763            crate::netlink::add_address_to_link_by_name(&bridge_name, gateway, subnet.prefix_len())
764                .await
765        {
766            let _ = crate::netlink::delete_bridge(&bridge_name).await;
767            return Err(OverlaydError::Overlay(format!(
768                "add_address_to_link_by_name({bridge_name}, {gateway}/{}) failed: {e}",
769                subnet.prefix_len()
770            )));
771        }
772        if let Err(e) = crate::netlink::set_link_up_by_name(&bridge_name).await {
773            let _ = crate::netlink::delete_bridge(&bridge_name).await;
774            return Err(OverlaydError::Overlay(format!(
775                "set_link_up_by_name({bridge_name}) failed: {e}"
776            )));
777        }
778
779        // Build per-service IpAllocator, reserve the gateway.
780        let mut ip_allocator = OverlayIpAllocator::new(&subnet.to_string()).map_err(|e| {
781            OverlaydError::Overlay(format!("IpAllocator::new({subnet}) failed: {e}"))
782        })?;
783        let _ = ip_allocator.allocate_specific(gateway);
784
785        self.service_bridges.insert(
786            service.to_string(),
787            ServiceBridge {
788                name: bridge_name.clone(),
789                subnet,
790                gateway,
791                ip_allocator,
792            },
793        );
794        self.service_interfaces
795            .insert(service.to_string(), bridge_name.clone());
796
797        tracing::info!(service = %service, bridge = %bridge_name, subnet = %subnet, gateway = %gateway, "Service bridge created");
798        Ok(bridge_name)
799    }
800
/// Non-Linux variant of `setup_service_overlay`: resolves `mode` and hands
/// the request to the matching per-mode setup routine
/// (`setup_service_overlay_shared` or `setup_service_overlay_dedicated`).
///
/// # Errors
/// Propagates whatever the selected per-mode routine returns.
#[cfg(not(target_os = "linux"))]
async fn setup_service_overlay(
    &mut self,
    service: &str,
    mode: OverlayMode,
) -> Result<ServiceOverlayInfo, OverlaydError> {
    let resolved = mode.resolve();
    match resolved {
        OverlayMode::Auto => unreachable!("OverlayMode::resolve never returns Auto"),
        OverlayMode::Dedicated => self.setup_service_overlay_dedicated(service).await,
        OverlayMode::Shared => self.setup_service_overlay_shared(service).await,
    }
}
820
/// Shared-mode per-service overlay (non-Linux). No bridge is created here:
/// on Windows the per-service segment is the HCN Internal network created
/// lazily at attach time, and on macOS containers fall through to host
/// networking. The service is only registered in `service_interfaces` under
/// a placeholder interface name so presence checks work.
///
/// # Errors
/// Never fails on non-Linux; the `Result` is preserved for ABI parity.
#[cfg(not(target_os = "linux"))]
#[allow(clippy::unused_async)]
async fn setup_service_overlay_shared(
    &mut self,
    service: &str,
) -> Result<ServiceOverlayInfo, OverlaydError> {
    // Same naming scheme (and "b" suffix) the Linux bridge would get.
    let name_parts = [&self.deployment, &self.instance_id, service];
    let placeholder_name = make_interface_name(&name_parts, "b");

    let key = service.to_string();
    self.service_interfaces.insert(key, placeholder_name.clone());

    tracing::debug!(service = %service, "Service overlay bridge setup is Linux-only; using direct networking placeholder");
    Ok(shared_overlay_info(placeholder_name))
}
840
841    /// Dedicated-mode per-service overlay: stand up a *second* real `WireGuard`
842    /// device for `service` with its own crypto context, listen port, overlay
843    /// IP, and subnet — distinct from the single cluster transport.
844    ///
845    /// The cross-platform core (identity, subnet assign, transport bring-up,
846    /// marker persist, status) runs on every OS; only the *attachment* of
847    /// containers onto the device is platform-gated:
848    /// - Linux: a per-service bridge (same mechanic as Shared) routed over the
849    ///   dedicated device instead of the cluster device.
850    /// - Windows: a per-service HCN Internal network (a later task; a clearly
851    ///   marked seam returns an error here for now).
852    /// - macOS: nothing further — the utun device is the attachment.
853    ///
854    /// # Errors
855    /// Returns an error if port/key/subnet allocation, transport bring-up,
856    /// marker persistence, or the platform attachment fails.
857    #[allow(clippy::too_many_lines)]
858    async fn setup_service_overlay_dedicated(
859        &mut self,
860        service: &str,
861    ) -> Result<ServiceOverlayInfo, OverlaydError> {
862        // ----- cross-platform core (runs on every OS) -----
863
864        // 1. Idempotency: an existing dedicated transport returns its identity.
865        if let Some(st) = self.service_transports.get(service) {
866            return Ok(dedicated_overlay_info(
867                st.interface.clone(),
868                &st.public_key,
869                st.listen_port,
870                st.overlay_ip,
871                st.subnet,
872            ));
873        }
874
875        // 2. Identity: reuse a stable identity from the marker if one exists
876        //    (so the device re-binds the same key + port across restarts),
877        //    otherwise mint a fresh port + keypair + interface name.
878        let marker_path =
879            zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();
880        let recorded = NetworkState::load(&marker_path)
881            .get(&owner_for_service(service))
882            .cloned();
883
884        let (private_key, public_key, listen_port, iface_hint) = match recorded.as_ref() {
885            Some(entry)
886                if entry.wg_private_key.is_some()
887                    && entry.wg_public_key.is_some()
888                    && entry.wg_port.is_some()
889                    && entry.interface.is_some() =>
890            {
891                let port = entry.wg_port.expect("checked above");
892                self.dedicated_ports.reserve(port);
893                (
894                    entry.wg_private_key.clone().expect("checked above"),
895                    entry.wg_public_key.clone().expect("checked above"),
896                    port,
897                    entry.interface.clone().expect("checked above"),
898                )
899            }
900            _ => {
901                let port = self.dedicated_ports.allocate()?;
902                let (priv_key, pub_key) = OverlayTransport::generate_keys()
903                    .await
904                    .map_err(|e| OverlaydError::Overlay(format!("Failed to generate keys: {e}")))?;
905                let iface =
906                    make_interface_name(&[&self.deployment, &self.instance_id, service], "d");
907                (priv_key, pub_key, port, iface)
908            }
909        };
910
911        // 3. Subnet: assign from the same registry Shared uses, so per-service
912        //    subnets stay globally unique regardless of mode.
913        self.ensure_service_subnet_registry()?;
914        let subnet: ipnet::IpNet = {
915            let registry = self
916                .service_subnet_registry
917                .as_mut()
918                .expect("ensure_service_subnet_registry leaves Some");
919            let node_key = self.local_node_id.to_string();
920            registry.assign(service, &node_key).map_err(|e| {
921                OverlaydError::Overlay(format!(
922                    "ServiceSubnetRegistry::assign({service}, {node_key}) failed: {e}"
923                ))
924            })?
925        };
926        let overlay_ip = first_usable_ip(subnet);
927
928        // 4. Build + bring up the dedicated transport. The device's overlay CIDR
929        //    is the service subnet (so boringtun routes that subnet over THIS
930        //    device), and its listen port is the dedicated port.
931        let physical_egress_ip = match zlayer_overlay::detect_physical_egress().await {
932            Ok(egress) => Some(egress.ip),
933            Err(e) => {
934                tracing::warn!(
935                    error = %e,
936                    service = %service,
937                    "failed to detect physical egress; WireGuard local_endpoint \
938                     will bind UNSPECIFIED for the dedicated overlay"
939                );
940                None
941            }
942        };
943        let config = self.build_config(
944            private_key.clone(),
945            public_key.clone(),
946            overlay_ip,
947            subnet.prefix_len(),
948            listen_port,
949            physical_egress_ip,
950        );
951        let mut transport = OverlayTransport::new(config, iface_hint);
952        transport.create_interface().await.map_err(|e| {
953            OverlaydError::Overlay(format!(
954                "Failed to create dedicated overlay for {service}: {e}"
955            ))
956        })?;
957        transport.configure(&[]).await.map_err(|e| {
958            OverlaydError::Overlay(format!(
959                "Failed to configure dedicated overlay for {service}: {e}"
960            ))
961        })?;
962        let actual_iface = transport.interface_name().to_string();
963
964        // 5. Persist the marker so the identity survives restarts. Match the
965        //    base/Shared entry shape (owner/kind/name/id/subnet) plus the
966        //    dedicated WG fields.
967        let mut marker = NetworkState::load(&marker_path);
968        marker.upsert(ManagedNetwork {
969            owner: owner_for_service(service),
970            kind: "wg-dedicated".to_string(),
971            name: actual_iface.clone(),
972            id: public_key.clone(),
973            subnet: subnet.to_string(),
974            wg_port: Some(listen_port),
975            wg_private_key: Some(private_key),
976            wg_public_key: Some(public_key.clone()),
977            interface: Some(actual_iface.clone()),
978        });
979        if let Err(e) = marker.save(&marker_path) {
980            tracing::warn!(service = %service, error = %e, path = %marker_path.display(), "failed to persist dedicated-overlay marker (device still live)");
981        }
982
983        // 6. Record the live transport.
984        self.service_transports.insert(
985            service.to_string(),
986            ServiceTransport {
987                transport,
988                interface: actual_iface.clone(),
989                public_key: public_key.clone(),
990                listen_port,
991                overlay_ip,
992                subnet,
993            },
994        );
995
996        tracing::info!(
997            service = %service,
998            interface = %actual_iface,
999            listen_port,
1000            subnet = %subnet,
1001            overlay_ip = %overlay_ip,
1002            "Dedicated per-service overlay device created"
1003        );
1004
1005        // ----- platform-gated attachment -----
1006        // `name` in the returned info is the container-attach handle: the bridge
1007        // name on Linux, the dedicated interface elsewhere.
1008        let name = self
1009            .attach_dedicated_service(service, subnet, overlay_ip)
1010            .await?;
1011
1012        Ok(dedicated_overlay_info(
1013            name,
1014            &public_key,
1015            listen_port,
1016            overlay_ip,
1017            subnet,
1018        ))
1019    }
1020
1021    /// Linux attachment for a dedicated per-service overlay: create the same
1022    /// per-service bridge Shared uses, but route the service subnet over the
1023    /// DEDICATED device rather than the cluster device.
1024    ///
1025    /// Concretely, the dedicated transport's overlay CIDR already covers
1026    /// `subnet` (set at `build_config` time in the core), so boringtun routes
1027    /// `subnet` out the dedicated TUN; we additionally plumb `subnet` onto this
1028    /// node's own `AllowedIPs` entry on the dedicated device so locally
1029    /// originated packets to the subnet are accepted. Returns the bridge name.
1030    ///
1031    /// # Errors
1032    /// Returns an error if the bridge cannot be created.
1033    #[cfg(target_os = "linux")]
1034    async fn attach_dedicated_service(
1035        &mut self,
1036        service: &str,
1037        subnet: ipnet::IpNet,
1038        overlay_ip: IpAddr,
1039    ) -> Result<String, OverlaydError> {
1040        let _ = overlay_ip;
1041        let bridge_name = self.create_service_bridge(service, subnet).await?;
1042
1043        // Plumb the service subnet onto the DEDICATED device (not the cluster
1044        // device). The dedicated transport's overlay CIDR already routes the
1045        // subnet out its TUN; adding it to our own pubkey's AllowedIPs keeps the
1046        // local-accept side consistent with the Shared path's cluster plumbing.
1047        if let Some(st) = self.service_transports.get(service) {
1048            if let Some(ref pubkey) = self.local_wg_pubkey {
1049                if let Err(e) = st.transport.add_allowed_ip(pubkey, subnet).await {
1050                    tracing::warn!(
1051                        service = %service,
1052                        subnet = %subnet,
1053                        error = %e,
1054                        "Failed to add service subnet to dedicated transport AllowedIPs (non-fatal)"
1055                    );
1056                }
1057            } else {
1058                tracing::debug!(service = %service, "local_wg_pubkey not yet set; skipping dedicated AllowedIPs update");
1059            }
1060        }
1061
1062        Ok(bridge_name)
1063    }
1064
/// Windows attachment for a dedicated per-service overlay.
///
/// By the time this runs the cross-platform core has already brought up the
/// dedicated transport for the service subnet. This step adds the
/// *container-facing* side: it ensures the per-service HCN **Internal**
/// network exists (through [`Self::ensure_service_network`]) and returns that
/// network's name, which the caller reports as the
/// [`ServiceOverlayInfo::name`] attach handle.
///
/// # Errors
/// Propagates any error from [`Self::ensure_service_network`].
#[cfg(target_os = "windows")]
async fn attach_dedicated_service(
    &mut self,
    service: &str,
    subnet: ipnet::IpNet,
    _overlay_ip: IpAddr,
) -> Result<String, OverlaydError> {
    // Create (or reuse) the per-service Internal HCN network. The returned
    // id is not needed here; only the success of the call matters.
    let _network_id = self.ensure_service_network(service, subnet).await?;

    // Attach handle = "<base overlay network name>-svc-<service>".
    let daemon_name = self.deployment_or_default();
    let base_network = overlay_network_name(&daemon_name);
    let handle = format!("{}-svc-{service}", base_network);
    Ok(handle)
}
1095
/// Attachment for a dedicated per-service overlay on targets that are neither
/// Linux nor Windows (macOS): there is no bridge, so the attach handle is the
/// interface name of the dedicated transport recorded for `service`.
///
/// Returns an empty string when no transport is recorded for `service`.
///
/// # Errors
/// Never fails; the `Result` matches the other platform variants.
#[cfg(all(not(target_os = "linux"), not(target_os = "windows")))]
#[allow(clippy::unused_async)]
async fn attach_dedicated_service(
    &mut self,
    service: &str,
    _subnet: ipnet::IpNet,
    _overlay_ip: IpAddr,
) -> Result<String, OverlaydError> {
    let handle = match self.service_transports.get(service) {
        Some(recorded) => recorded.interface.clone(),
        None => String::new(),
    };
    Ok(handle)
}
1114
1115    /// Tear down the per-service segment for `service`. Idempotent.
1116    // Only the Linux body awaits (netlink + cluster AllowedIPs); other targets
1117    // are synchronous (transport shutdown is sync) but must keep the async
1118    // signature for the dispatch call.
1119    #[cfg_attr(not(target_os = "linux"), allow(clippy::unused_async))]
1120    async fn teardown_service_overlay(&mut self, service: &str) {
1121        // Shared-mode segment teardown (bridge on Linux, placeholder elsewhere).
1122        #[cfg(target_os = "linux")]
1123        {
1124            let removed = self.service_bridges.remove(service);
1125            self.service_interfaces.remove(service);
1126            if let Some(bridge) = removed {
1127                if let Some(ref cluster) = self.global_transport {
1128                    if let Some(ref pubkey) = self.local_wg_pubkey {
1129                        if let Err(e) = cluster.remove_allowed_ip(pubkey, bridge.subnet).await {
1130                            tracing::warn!(
1131                                service = %service,
1132                                subnet = %bridge.subnet,
1133                                error = %e,
1134                                "Failed to remove service subnet from cluster AllowedIPs (non-fatal)"
1135                            );
1136                        }
1137                    }
1138                }
1139
1140                if let Err(e) = crate::netlink::delete_bridge(&bridge.name).await {
1141                    tracing::warn!(service = %service, bridge = %bridge.name, error = %e, "delete_bridge failed (non-fatal)");
1142                }
1143
1144                if let Some(registry) = self.service_subnet_registry.as_mut() {
1145                    let node_key = self.local_node_id.to_string();
1146                    let _ = registry.release(service, &node_key);
1147                }
1148
1149                tracing::info!(service = %service, bridge = %bridge.name, "Tore down service bridge");
1150            }
1151        }
1152        #[cfg(not(target_os = "linux"))]
1153        {
1154            if let Some(iface) = self.service_interfaces.remove(service) {
1155                tracing::info!(service = %service, interface = %iface, "Removed service overlay interface (placeholder, non-Linux)");
1156            }
1157        }
1158
1159        // Dedicated-mode teardown (cross-platform): tear down the per-service
1160        // transport, free its port, and drop its marker entry. No-op when the
1161        // service ran in Shared mode (nothing in `service_transports`).
1162        if let Some(mut st) = self.service_transports.remove(service) {
1163            st.transport.shutdown();
1164            self.dedicated_ports.release(st.listen_port);
1165
1166            // Release the subnet assignment (Shared releases it inside the
1167            // Linux block above; the dedicated subnet lives in the same
1168            // registry, so release it here for the dedicated case on every OS).
1169            if let Some(registry) = self.service_subnet_registry.as_mut() {
1170                let node_key = self.local_node_id.to_string();
1171                let _ = registry.release(service, &node_key);
1172            }
1173
1174            let marker_path =
1175                zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();
1176            let mut marker = NetworkState::load(&marker_path);
1177            let removed_entry = marker.remove(&owner_for_service(service));
1178            if removed_entry.is_some() {
1179                if let Err(e) = marker.save(&marker_path) {
1180                    tracing::warn!(service = %service, error = %e, path = %marker_path.display(), "failed to persist dedicated-overlay marker removal");
1181                }
1182            }
1183
1184            // Windows: delete the per-service HCN Internal network this service
1185            // owned. The marker entry's `id` is the bare HCN GUID (set by
1186            // `ensure_service_network`); delete the network so a dedicated
1187            // service tears down cleanly without waiting for a full uninstall.
1188            // Also drop the per-service container-IP allocator.
1189            #[cfg(target_os = "windows")]
1190            {
1191                self.service_ip_allocators.remove(service);
1192                if let Some(entry) = removed_entry.as_ref() {
1193                    if entry.kind == "hcn-internal" {
1194                        if let Ok(guid) = windows::core::GUID::try_from(entry.id.as_str()) {
1195                            match zlayer_hns::network::Network::delete(guid) {
1196                                Ok(()) => {
1197                                    tracing::info!(service = %service, id = %entry.id, "deleted per-service HCN network");
1198                                }
1199                                Err(e) => {
1200                                    tracing::warn!(service = %service, id = %entry.id, error = %e, "failed to delete per-service HCN network (may leak until uninstall)");
1201                                }
1202                            }
1203                        } else {
1204                            tracing::warn!(service = %service, id = %entry.id, "per-service marker has unparseable HCN GUID; skipping network delete");
1205                        }
1206                    }
1207                }
1208            }
1209            #[cfg(not(target_os = "windows"))]
1210            drop(removed_entry);
1211
1212            tracing::info!(
1213                service = %service,
1214                interface = %st.interface,
1215                listen_port = st.listen_port,
1216                "Tore down dedicated per-service overlay device"
1217            );
1218        }
1219    }
1220
1221    /// Initialize the local fallback `ServiceSubnetRegistry` from the configured
1222    /// cluster CIDR. Called on first `setup_service_overlay` use.
1223    ///
1224    /// # Errors
1225    /// Returns an error when no cluster CIDR is configured or the registry
1226    /// cannot be built.
1227    fn ensure_service_subnet_registry(&mut self) -> Result<(), OverlaydError> {
1228        use zlayer_overlay::allocator::ServiceSubnetRegistry;
1229
1230        if self.service_subnet_registry.is_some() {
1231            return Ok(());
1232        }
1233        let cluster_cidr = self.cluster_cidr.ok_or_else(|| {
1234            OverlaydError::Other(
1235                "service subnet registry needs a cluster CIDR (SetupGlobalOverlay first)"
1236                    .to_string(),
1237            )
1238        })?;
1239        let cluster_ipnet: ipnet::IpNet = cluster_cidr.to_string().parse().map_err(|e| {
1240            OverlaydError::Other(format!(
1241                "failed to convert cluster CIDR {cluster_cidr} to ipnet::IpNet: {e}"
1242            ))
1243        })?;
1244        let slice_prefix: u8 = match cluster_ipnet {
1245            ipnet::IpNet::V4(_) => 28,
1246            ipnet::IpNet::V6(_) => 120,
1247        };
1248        let registry = ServiceSubnetRegistry::new(cluster_ipnet, slice_prefix).map_err(|e| {
1249            OverlaydError::Other(format!("failed to build ServiceSubnetRegistry: {e}"))
1250        })?;
1251        self.service_subnet_registry = Some(registry);
1252        Ok(())
1253    }
1254
1255    // -- IP allocation -------------------------------------------------------
1256
1257    /// Allocate an overlay IP from the per-service bridge (Linux) or the node
1258    /// slice (otherwise). `join_global` reserves a second global-overlay IP too,
1259    /// matching the eth1 attach behavior.
1260    ///
1261    /// # Errors
1262    /// Returns an error if the relevant pool is exhausted.
1263    fn allocate_ip(&mut self, service: &str, join_global: bool) -> Result<IpAddr, OverlaydError> {
1264        // `join_global` does not allocate a second IP here: the companion
1265        // global-overlay IP (eth1) is reserved at attach time. `AllocateIp`
1266        // returns only the primary (service / slice) IP the caller asked for.
1267        let _ = join_global;
1268        #[cfg(target_os = "linux")]
1269        {
1270            if let Some(bridge) = self.service_bridges.get_mut(service) {
1271                return bridge.ip_allocator.allocate().ok_or_else(|| {
1272                    OverlaydError::Overlay(format!(
1273                        "service bridge {} subnet {} exhausted",
1274                        bridge.name, bridge.subnet
1275                    ))
1276                });
1277            }
1278        }
1279        let _ = service;
1280        self.ip_allocator.allocate()
1281    }
1282
1283    /// Return an overlay IP to the allocator (service-bridge pool when known,
1284    /// otherwise the node slice).
1285    fn release_ip(&mut self, ip: IpAddr) {
1286        #[cfg(target_os = "linux")]
1287        {
1288            for bridge in self.service_bridges.values_mut() {
1289                if bridge.subnet.contains(&ip) {
1290                    bridge.ip_allocator.release(ip);
1291                    return;
1292                }
1293            }
1294        }
1295        self.ip_allocator.release(ip);
1296    }
1297
1298    // -- container attach (Linux) -------------------------------------------
1299
1300    /// Wire a container into the overlay and return its [`AttachResult`].
1301    ///
1302    /// # Errors
1303    /// Returns an error if the container cannot be attached.
1304    async fn attach_container(
1305        &mut self,
1306        handle: AttachHandle,
1307        service: &str,
1308        join_global: bool,
1309        dns_server: Option<IpAddr>,
1310        dns_domain: Option<String>,
1311    ) -> Result<AttachResult, OverlaydError> {
1312        // Record the overlay DNS resolver/zone the main daemon staged for this
1313        // node so later attaches (and the Windows HCN endpoint `Dns` schema)
1314        // can fall back to them when a per-attach value isn't supplied.
1315        if let Some(server) = dns_server {
1316            self.dns_server_addr = Some(SocketAddr::new(server, 53));
1317        }
1318        if dns_domain.is_some() {
1319            self.dns_domain.clone_from(&dns_domain);
1320        }
1321        match handle {
1322            AttachHandle::LinuxPid { pid } => {
1323                let ip = self
1324                    .attach_container_linux(pid, service, join_global)
1325                    .await?;
1326                Ok(AttachResult {
1327                    ip,
1328                    namespace_guid: None,
1329                })
1330            }
1331            AttachHandle::WindowsContainer { container_id, ip } => {
1332                self.attach_container_windows(&container_id, service, ip, dns_server, dns_domain)
1333                    .await
1334            }
1335            AttachHandle::GuestManaged { .. } => Err(OverlaydError::Other(
1336                "guest-managed attach must go through attach_container_guest, not attach_container"
1337                    .to_string(),
1338            )),
1339        }
1340    }
1341
1342    /// Tear down a container's overlay attachment and release its IP.
1343    ///
1344    /// # Errors
1345    /// Returns an error only if a netlink delete fails for a reason other than
1346    /// "link not found".
1347    async fn detach_container(&mut self, handle: AttachHandle) -> Result<(), OverlaydError> {
1348        match handle {
1349            AttachHandle::LinuxPid { pid } => self.detach_container_linux(pid).await,
1350            AttachHandle::WindowsContainer { container_id, .. } => {
1351                self.detach_container_windows(&container_id).await
1352            }
1353            AttachHandle::GuestManaged { .. } => Err(OverlaydError::Other(
1354                "guest-managed detach must go through detach_container_guest, not detach_container"
1355                    .to_string(),
1356            )),
1357        }
1358    }
1359
1360    // -- container attach (guest-managed) -----------------------------------
1361
1362    /// Guest-managed overlay attach: allocate the overlay identity for a VM guest
1363    /// that brings up its own kernel `WireGuard` device.
1364    ///
1365    /// overlayd cannot enter the guest's network namespace (it is a VM, not a
1366    /// host process), so instead of a veth/HCN endpoint it:
1367    /// 1. allocates the overlay IP from the SAME pool the Linux attach uses (the
1368    ///    per-service bridge pool when one exists, otherwise the node slice) so
1369    ///    guest addresses never collide with container addresses;
1370    /// 2. generates a fresh `WireGuard` keypair for the guest;
1371    /// 3. builds the peer set the guest must configure — every GLOBAL peer the
1372    ///    host already knows, plus THIS node itself (so the guest can reach the
1373    ///    host node over the overlay; carries a keepalive so the guest keeps its
1374    ///    NAT mapping open from behind VZ NAT);
1375    /// 4. registers the generated public key as a GLOBAL peer (host route to the
1376    ///    guest, roaming endpoint learned from the guest's keepalive) so remote
1377    ///    nodes and this node route to it;
1378    /// 5. records the attachment keyed by `id` so `DetachContainer` can release
1379    ///    the IP and remove the peer.
1380    ///
1381    /// Platform-agnostic: pure IPAM + keygen + peer bookkeeping (no netns/veth/
1382    /// HCN), so it compiles and runs on macOS (where the overlayd serving a VZ
1383    /// host lives) as well as Linux.
1384    ///
1385    /// # Errors
1386    /// Returns an error if the global overlay is not set up, the IP pool is
1387    /// exhausted, key generation fails, or registering the guest peer fails.
1388    #[allow(clippy::cast_possible_truncation)]
1389    async fn attach_container_guest(
1390        &mut self,
1391        id: &str,
1392        service: &str,
1393        join_global: bool,
1394        dns_server: Option<IpAddr>,
1395        dns_domain: Option<String>,
1396    ) -> Result<GuestOverlayConfig, OverlaydError> {
1397        // The global transport must exist: we both register the guest as a peer
1398        // on it and advertise this node (its public key + listen port) to the
1399        // guest. Resolve both up front so we fail before allocating anything.
1400        let node_public_key = self.transport_public_key.clone().ok_or_else(|| {
1401            OverlaydError::Other(
1402                "guest-managed attach requires the global overlay to be set up first \
1403                 (no node WireGuard public key)"
1404                    .to_string(),
1405            )
1406        })?;
1407        if self.global_transport.is_none() {
1408            return Err(OverlaydError::Other(
1409                "guest-managed attach requires the global overlay to be set up first \
1410                 (no global transport)"
1411                    .to_string(),
1412            ));
1413        }
1414
1415        // 1. Allocate the overlay IP from the same pool the Linux attach uses and
1416        //    derive the prefix length from that pool's network. On Linux a
1417        //    per-service bridge (when present) supplies both the IP and its
1418        //    subnet's prefix; otherwise (and on every non-Linux host) the node
1419        //    slice / cluster CIDR does.
1420        let (overlay_ip, prefix_len, pool_service): (IpAddr, u8, Option<String>) = {
1421            #[cfg(target_os = "linux")]
1422            {
1423                if let Some(bridge) = self.service_bridges.get_mut(service) {
1424                    let ip = bridge.ip_allocator.allocate().ok_or_else(|| {
1425                        OverlaydError::Overlay(format!(
1426                            "service bridge {} subnet {} exhausted",
1427                            bridge.name, bridge.subnet
1428                        ))
1429                    })?;
1430                    let prefix = bridge.subnet.prefix_len();
1431                    (ip, prefix, Some(service.to_string()))
1432                } else {
1433                    let ip = self.ip_allocator.allocate()?;
1434                    (ip, self.slice_prefix_len(), None)
1435                }
1436            }
1437            #[cfg(not(target_os = "linux"))]
1438            {
1439                let _ = service;
1440                let ip = self.ip_allocator.allocate()?;
1441                (ip, self.slice_prefix_len(), None)
1442            }
1443        };
1444        // `join_global` is informational for a guest-managed attach: the guest's
1445        // single WireGuard device IS its global-overlay endpoint, so there is no
1446        // separate eth1 IP to reserve. Touch it so callers stay consistent with
1447        // the Linux/Windows handles.
1448        let _ = join_global;
1449
1450        // 2. Generate the guest's WireGuard keypair (reuse the transport's
1451        //    native x25519 keygen — never reimplement curve25519 here).
1452        let (private_key, public_key) = OverlayTransport::generate_keys().await.map_err(|e| {
1453            // Roll back the IP allocation so a keygen failure leaks nothing.
1454            self.release_guest_ip(overlay_ip, pool_service.as_deref());
1455            OverlaydError::Overlay(format!("failed to generate guest keys: {e}"))
1456        })?;
1457
1458        // 3. Build the peer set. A VZ guest is behind the host's NAT and can only
1459        //    reach the LOCAL node (via its NAT gateway) — it cannot dial other
1460        //    nodes' or sibling guests' endpoints directly. So it gets exactly ONE
1461        //    peer: this node, with AllowedIPs covering the whole cluster CIDR.
1462        //    ALL overlay traffic (including to sibling containers and remote
1463        //    nodes) routes through this node, which forwards/hairpins it (the
1464        //    node already holds a /32 peer for every container — step 4 — and the
1465        //    real inter-node peers). We deliberately do NOT add the per-guest /32
1466        //    peers here: a /32 with no reachable endpoint would win longest-prefix
1467        //    routing and black-hole sibling traffic. The endpoint returned here is
1468        //    the node's overlay IP as a placeholder; the VZ runtime rewrites it to
1469        //    the guest's NAT gateway (the only host address the guest can reach)
1470        //    before delivering the config. Keepalive holds the guest's NAT mapping
1471        //    open so the node can reach back.
1472        let node_allowed = self
1473            .cluster_cidr
1474            .or(self.slice_cidr)
1475            .map_or_else(|| String::from("0.0.0.0/0"), |c| c.to_string());
1476        let node_endpoint = self.node_endpoint_for_guest();
1477        let peers: Vec<PeerSpec> = vec![PeerSpec {
1478            public_key: node_public_key,
1479            endpoint: node_endpoint,
1480            allowed_ips: node_allowed,
1481            persistent_keepalive_secs: 25,
1482        }];
1483
1484        // 4. Register the guest's public key as a GLOBAL peer (host route to the
1485        //    guest at <overlay_ip>/32, roaming endpoint learned from keepalive).
1486        //    Go through the same internal path `AddPeer { Global }` uses.
1487        let host_route = format!(
1488            "{}/{}",
1489            overlay_ip,
1490            if overlay_ip.is_ipv6() { 128 } else { 32 }
1491        );
1492        let guest_peer = PeerSpec {
1493            public_key: public_key.clone(),
1494            // Empty/roaming: the guest is behind NAT; boringtun learns its source
1495            // endpoint from the guest's first keepalive. `0.0.0.0:0` is the
1496            // wire-safe "unset endpoint" sentinel that still parses as a
1497            // SocketAddr (peer_spec_to_info requires a parseable endpoint).
1498            endpoint: "0.0.0.0:0".to_string(),
1499            allowed_ips: host_route,
1500            persistent_keepalive_secs: 0,
1501        };
1502        let guest_peer_info = peer_spec_to_info(&guest_peer)?;
1503        {
1504            let transport = self.transport_for_scope(&PeerScope::Global)?;
1505            if let Err(e) = Self::add_peer_on(transport, &guest_peer_info).await {
1506                self.release_guest_ip(overlay_ip, pool_service.as_deref());
1507                return Err(e);
1508            }
1509        }
1510        // Track it among the global peers (so a *subsequent* guest attach also
1511        // learns about this guest) and record the attachment for detach.
1512        self.global_peers
1513            .insert(public_key.clone(), guest_peer.clone());
1514        self.guest_attachments.insert(
1515            id.to_string(),
1516            GuestAttachInfo {
1517                overlay_ip,
1518                public_key: public_key.clone(),
1519                service_name: pool_service,
1520            },
1521        );
1522
1523        // 5. Return the config the caller ships into the guest.
1524        Ok(GuestOverlayConfig {
1525            overlay_ip,
1526            prefix_len,
1527            private_key,
1528            public_key,
1529            // The guest's device listens on the node's overlay WG port (the
1530            // convention every overlay device on this node uses).
1531            listen_port: self.overlay_port,
1532            peers,
1533            dns_server: dns_server.or_else(|| self.dns_server_addr.map(|s| s.ip())),
1534            dns_domain: dns_domain.or_else(|| self.dns_domain.clone()),
1535        })
1536    }
1537
1538    /// Release a guest-managed attach by `id`: drop the host route + global peer
1539    /// and return the allocated IP to its pool. Idempotent.
1540    ///
1541    /// # Errors
1542    /// Returns an error only if removing the peer from the global transport fails
1543    /// for a reason other than "peer not found".
1544    async fn detach_container_guest(&mut self, id: &str) -> Result<(), OverlaydError> {
1545        let Some(info) = self.guest_attachments.remove(id) else {
1546            return Ok(());
1547        };
1548        // Remove the guest's global peer (mirror the RemovePeer { Global } path).
1549        self.global_peers.remove(&info.public_key);
1550        if let Ok(transport) = self.transport_for_scope(&PeerScope::Global) {
1551            if let Err(e) = Self::remove_peer_on(transport, &info.public_key).await {
1552                tracing::warn!(
1553                    guest = %id,
1554                    pubkey = %info.public_key,
1555                    error = %e,
1556                    "failed to remove guest peer from global transport"
1557                );
1558            }
1559        }
1560        // Return the IP to whichever pool it came from.
1561        self.release_guest_ip(info.overlay_ip, info.service_name.as_deref());
1562        Ok(())
1563    }
1564
1565    /// Release a guest overlay IP back to the pool it was drawn from: the named
1566    /// service bridge's allocator (Linux) when `service` is set and the bridge
1567    /// still exists, otherwise the node slice allocator.
1568    fn release_guest_ip(&mut self, ip: IpAddr, service: Option<&str>) {
1569        #[cfg(target_os = "linux")]
1570        {
1571            if let Some(svc) = service {
1572                if let Some(bridge) = self.service_bridges.get_mut(svc) {
1573                    bridge.ip_allocator.release(ip);
1574                    return;
1575                }
1576            }
1577        }
1578        let _ = service;
1579        self.ip_allocator.release(ip);
1580    }
1581
1582    /// Prefix length of the address pool guest IPs are drawn from when not using
1583    /// a per-service bridge: the node slice if assigned, else the cluster CIDR.
1584    fn slice_prefix_len(&self) -> u8 {
1585        self.slice_cidr.or(self.cluster_cidr).map_or(
1586            if self.node_ip.is_some_and(|ip| ip.is_ipv6()) {
1587                64
1588            } else {
1589                24
1590            },
1591            |c| c.prefix(),
1592        )
1593    }
1594
1595    /// Reachable `WireGuard` endpoint for THIS node, advertised to a guest as a
1596    /// peer. overlayd has no public reflexive address at this layer, so it uses
1597    /// the node's overlay-listen identity (`node_ip:overlay_port`); the caller
1598    /// (the VZ runtime that ships the config into the guest) rewrites it to the
1599    /// concrete VZ-NAT gateway endpoint the guest can dial. Falls back to the
1600    /// unspecified address when no node IP is assigned yet.
1601    fn node_endpoint_for_guest(&self) -> String {
1602        let ip = self.node_ip.unwrap_or(IpAddr::V4(Ipv4Addr::UNSPECIFIED));
1603        SocketAddr::new(ip, self.overlay_port).to_string()
1604    }
1605
1606    /// Linux veth/netns attach. On non-Linux this returns the node's overlay IP
1607    /// (host networking) and is never wired for a `LinuxPid` handle in practice.
1608    #[cfg(target_os = "linux")]
1609    async fn attach_container_linux(
1610        &mut self,
1611        container_pid: u32,
1612        service: &str,
1613        join_global: bool,
1614    ) -> Result<IpAddr, OverlaydError> {
1615        // Look up the per-service bridge.
1616        let (bridge_name, bridge_subnet, bridge_gateway, container_ip) = {
1617            let bridge = self.service_bridges.get_mut(service).ok_or_else(|| {
1618                OverlaydError::Other(format!(
1619                    "no service bridge for service {service}; call setup_service_overlay() first"
1620                ))
1621            })?;
1622            let ip = bridge.ip_allocator.allocate().ok_or_else(|| {
1623                OverlaydError::Overlay(format!(
1624                    "service bridge {} subnet {} exhausted",
1625                    bridge.name, bridge.subnet
1626                ))
1627            })?;
1628            (bridge.name.clone(), bridge.subnet, bridge.gateway, ip)
1629        };
1630
1631        let bridge_params = BridgeAttachParams {
1632            bridge_name: &bridge_name,
1633            gateway: bridge_gateway,
1634            subnet_prefix_len: bridge_subnet.prefix_len(),
1635        };
1636        if let Err(e) = self
1637            .attach_to_interface(
1638                container_pid,
1639                container_ip,
1640                "s",
1641                "eth0",
1642                Some(&bridge_params),
1643            )
1644            .await
1645        {
1646            if let Some(bridge) = self.service_bridges.get_mut(service) {
1647                bridge.ip_allocator.release(container_ip);
1648            }
1649            return Err(e);
1650        }
1651
1652        let mut global_ip: Option<IpAddr> = None;
1653        if join_global && self.global_interface.is_some() {
1654            let g_ip = self.ip_allocator.allocate()?;
1655            self.attach_to_interface(container_pid, g_ip, "g", "eth1", None)
1656                .await?;
1657            global_ip = Some(g_ip);
1658        }
1659
1660        self.attached.insert(
1661            container_pid,
1662            AttachInfo {
1663                service_ip: container_ip,
1664                service_name: Some(service.to_string()),
1665                global_ip,
1666                joined_global: global_ip.is_some(),
1667            },
1668        );
1669
1670        Ok(container_ip)
1671    }
1672
1673    /// Non-Linux fallback: containers share the host network, so return the
1674    /// node's overlay IP (or loopback).
1675    #[cfg(not(target_os = "linux"))]
1676    #[allow(clippy::unused_async)]
1677    async fn attach_container_linux(
1678        &mut self,
1679        _container_pid: u32,
1680        service: &str,
1681        _join_global: bool,
1682    ) -> Result<IpAddr, OverlaydError> {
1683        tracing::debug!(service = %service, "LinuxPid attach is a no-op off Linux; using node overlay IP");
1684        Ok(self.node_ip.unwrap_or(IpAddr::V4(Ipv4Addr::LOCALHOST)))
1685    }
1686
1687    /// Release the overlay resources held by a Linux container PID. Idempotent.
1688    #[cfg(target_os = "linux")]
1689    async fn detach_container_linux(&mut self, pid: u32) -> Result<(), OverlaydError> {
1690        let Some(info) = self.attached.remove(&pid) else {
1691            return Ok(());
1692        };
1693
1694        let veth_s = format!("veth-{pid}-s");
1695        if let Err(e) = crate::netlink::delete_link_by_name(&veth_s).await {
1696            tracing::warn!(link = %veth_s, pid, error = %e, "Failed to delete service veth");
1697        }
1698        if info.joined_global {
1699            let veth_g = format!("veth-{pid}-g");
1700            if let Err(e) = crate::netlink::delete_link_by_name(&veth_g).await {
1701                tracing::warn!(link = %veth_g, pid, error = %e, "Failed to delete global veth");
1702            }
1703        }
1704
1705        if let Some(svc) = info.service_name.as_deref() {
1706            if let Some(bridge) = self.service_bridges.get_mut(svc) {
1707                bridge.ip_allocator.release(info.service_ip);
1708            } else {
1709                tracing::debug!(service = %svc, ip = %info.service_ip, "detach: service bridge already torn down; dropping service IP release");
1710            }
1711        } else {
1712            self.ip_allocator.release(info.service_ip);
1713        }
1714        if let Some(g) = info.global_ip {
1715            self.ip_allocator.release(g);
1716        }
1717        Ok(())
1718    }
1719
    /// Non-Linux fallback: nothing to detach (host networking).
    ///
    /// The PID is ignored and the call always succeeds; it exists so the
    /// `LinuxPid` detach dispatch compiles on every target.
    #[cfg(not(target_os = "linux"))]
    #[allow(clippy::unused_async)]
    async fn detach_container_linux(&mut self, _pid: u32) -> Result<(), OverlaydError> {
        Ok(())
    }
1726
1727    /// Best-effort sweep of orphan veth endpoints whose owning container PID is
1728    /// no longer alive. Names matching `veth-<pid>-*` / `vc-<pid>-*` where
1729    /// `/proc/<pid>` does not exist are deleted.
1730    #[cfg(target_os = "linux")]
1731    async fn sweep_orphan_veths() {
1732        let links = match crate::netlink::list_all_links().await {
1733            Ok(links) => links,
1734            Err(e) => {
1735                tracing::warn!(error = %e, "Failed to list links for orphan sweep");
1736                return;
1737            }
1738        };
1739        for (_index, name) in links {
1740            let remainder = if let Some(r) = name.strip_prefix("veth-") {
1741                r
1742            } else if let Some(r) = name.strip_prefix("vc-") {
1743                r
1744            } else {
1745                continue;
1746            };
1747            let Some(pid_str) = remainder.split('-').next() else {
1748                continue;
1749            };
1750            let pid: u32 = match pid_str.parse() {
1751                Ok(p) => p,
1752                Err(_) => continue,
1753            };
1754            if Path::new(&format!("/proc/{pid}")).exists() {
1755                continue;
1756            }
1757            tracing::info!(link = %name, pid = pid, "Deleting orphan veth");
1758            if let Err(e) = crate::netlink::delete_link_by_name(&name).await {
1759                tracing::warn!(link = %name, error = %e, "Failed to delete orphan veth");
1760            }
1761        }
1762    }
1763
    /// Wire one veth pair between the host and the network namespace of
    /// `container_pid`, assigning `ip` to the container end.
    ///
    /// * `tag` distinguishes the pair: the host end is `veth-<pid>-<tag>` and the
    ///   temporary peer is `vc-<pid>-<tag>`.
    /// * `container_iface` is the name the peer is given once it has been moved
    ///   into the container's namespace.
    /// * `bridge`, when present, supplies the prefix length, the default-route
    ///   gateway configured inside the container, and the bridge the host end is
    ///   enslaved to. Without it the address gets a `/24` (IPv4) or `/64` (IPv6)
    ///   prefix and the host installs a `/32` / `/128` route to `ip` via the
    ///   host end instead.
    ///
    /// # Errors
    /// Returns [`OverlaydError::Overlay`] if the namespace file cannot be
    /// opened, the pre-cleanup deletes fail, or any netlink step fails. When a
    /// step after the pre-cleanup fails, both link names are deleted
    /// (best-effort) before the error is returned.
    #[cfg(target_os = "linux")]
    #[allow(clippy::too_many_lines)]
    async fn attach_to_interface(
        &self,
        container_pid: u32,
        ip: IpAddr,
        tag: &str,
        container_iface: &str,
        bridge: Option<&BridgeAttachParams<'_>>,
    ) -> Result<(), OverlaydError> {
        // Best-effort cleanup of orphan veths left by a previous daemon crash.
        Self::sweep_orphan_veths().await;

        let is_v6 = ip.is_ipv6();
        // Prefix configured on the container-side address: the bridge subnet's
        // when attaching to a bridge, otherwise a fixed per-family default.
        let prefix_len: u8 = if let Some(b) = bridge {
            b.subnet_prefix_len
        } else if is_v6 {
            64
        } else {
            24
        };
        // Prefix of the single-address host route used on the non-bridge path.
        let host_prefix: u8 = if is_v6 { 128 } else { 32 };

        let veth_host = format!("veth-{container_pid}-{tag}");
        let veth_pending = format!("vc-{container_pid}-{tag}");
        let veth_container = container_iface.to_string();

        // Hold the container's network namespace open via its /proc entry.
        let container_ns_fd = std::os::fd::OwnedFd::from(
            std::fs::File::open(format!("/proc/{container_pid}/ns/net")).map_err(|e| {
                OverlaydError::Overlay(format!("Failed to open /proc/{container_pid}/ns/net: {e}"))
            })?,
        );

        // Pre-cleanup: remove any links still carrying the names we are about
        // to create. NOTE(review): presumably `delete_link_by_name` treats a
        // missing link as success — confirm in `crate::netlink`.
        crate::netlink::delete_link_by_name(&veth_host)
            .await
            .map_err(|e| OverlaydError::Overlay(format!("pre-cleanup delete {veth_host}: {e}")))?;
        crate::netlink::delete_link_by_name(&veth_pending)
            .await
            .map_err(|e| {
                OverlaydError::Overlay(format!("pre-cleanup delete {veth_pending}: {e}"))
            })?;

        // Owned copies of everything the setup steps below need.
        let bridge_gateway: Option<IpAddr> = bridge.map(|b| b.gateway);
        let bridge_name: Option<String> = bridge.map(|b| b.bridge_name.to_string());
        let node_ip = self.node_ip;

        // All setup steps run inside one async block so a failure at any point
        // falls through to the shared link cleanup at the bottom.
        let result: Result<(), OverlaydError> = async {
            crate::netlink::create_veth_pair(&veth_host, &veth_pending)
                .await
                .map_err(|e| OverlaydError::Overlay(format!("create veth pair: {e}")))?;

            // Move the pending end into the container namespace, renaming it to
            // the requested interface name.
            crate::netlink::move_link_into_netns_fd_and_rename(
                &veth_pending,
                AsFd::as_fd(&container_ns_fd),
                &veth_container,
            )
            .map_err(|e| OverlaydError::Overlay(format!("move veth into netns: {e}")))?;

            let vc = veth_container.clone();
            let bridge_gateway_for_netns = bridge_gateway;
            // Container-side configuration runs on a blocking thread.
            // NOTE(review): presumably `with_netns_fd_async` executes the closure
            // inside the namespace behind `container_ns_fd` — confirm in
            // `crate::netlink`.
            tokio::task::spawn_blocking(move || {
                crate::netlink::with_netns_fd_async(container_ns_fd, move || async move {
                    crate::netlink::add_address_to_link_by_name(&vc, ip, prefix_len).await?;
                    crate::netlink::set_link_up_by_name(&vc).await?;
                    crate::netlink::set_link_up_by_name("lo").await?;
                    // A default route is only installed for bridge attachments.
                    if let Some(gw) = bridge_gateway_for_netns {
                        crate::netlink::add_default_route_via_gateway(gw).await?;
                    }
                    Ok(())
                })
            })
            .await
            .map_err(|e| OverlaydError::Overlay(format!("container netns task panicked: {e}")))?
            .map_err(|e| OverlaydError::Overlay(format!("container netns ops: {e}")))?;

            crate::netlink::set_link_up_by_name(&veth_host)
                .await
                .map_err(|e| OverlaydError::Overlay(format!("set {veth_host} up: {e}")))?;

            // Bridge attach: enslave the host end to the bridge. Otherwise
            // install a host route to the container address via the host end.
            if let Some(bname) = bridge_name.as_deref() {
                crate::netlink::add_link_to_bridge(&veth_host, bname)
                    .await
                    .map_err(|e| {
                        OverlaydError::Overlay(format!(
                            "enslave {veth_host} to bridge {bname}: {e}"
                        ))
                    })?;
            } else {
                crate::netlink::replace_route_via_dev(ip, host_prefix, &veth_host, node_ip)
                    .await
                    .map_err(|e| {
                        OverlaydError::Overlay(format!("host route for {ip}/{host_prefix}: {e}"))
                    })?;
            }

            // Enable forwarding; failures are deliberately ignored.
            let _ = crate::netlink::set_sysctl("net.ipv4.ip_forward", "1");
            let _ = crate::netlink::set_sysctl("net.ipv6.conf.all.forwarding", "1");

            Ok(())
        }
        .await;

        // On failure, remove whatever was created; deletion errors are ignored
        // so the original error is the one reported.
        if result.is_err() {
            let _ = crate::netlink::delete_link_by_name(&veth_host).await;
            let _ = crate::netlink::delete_link_by_name(&veth_pending).await;
        }
        result
    }
1872
1873    // -- container attach (Windows HCN) -------------------------------------
1874
1875    /// Windows attach: ensure the overlay HCN Internal network exists, allocate
1876    /// or validate the IP, create the per-container HCN endpoint + namespace,
1877    /// and return the bare-lowercase namespace GUID for the agent to embed in
1878    /// the compute-system document.
1879    ///
1880    /// # Errors
1881    /// Returns an error if the network/endpoint cannot be created or the slice
1882    /// is exhausted.
1883    #[cfg(target_os = "windows")]
1884    async fn attach_container_windows(
1885        &mut self,
1886        container_id: &str,
1887        service: &str,
1888        ip_override: Option<IpAddr>,
1889        dns_server: Option<IpAddr>,
1890        dns_domain: Option<String>,
1891    ) -> Result<AttachResult, OverlaydError> {
1892        // Resolve whether THIS service has a dedicated per-service overlay. It
1893        // does iff a live dedicated transport exists OR a `hcn-internal` marker
1894        // entry is recorded under `owner_for_service(service)` (the network
1895        // survives daemon restarts even if the transport map is empty mid-init).
1896        // Dedicated services attach onto their OWN per-service Internal network
1897        // and draw IPs from the service subnet; everything else uses the node's
1898        // shared base overlay network and the node slice.
1899        let dedicated_subnet = self.dedicated_service_subnet(service);
1900
1901        let (net_id, ip, prefix_length) = if let Some(svc_subnet) = dedicated_subnet {
1902            // ----- dedicated per-service network path -----
1903            let net_id = self.ensure_service_network(service, svc_subnet).await?;
1904
1905            // Allocate (or validate) the IP from the SERVICE subnet, not the
1906            // node slice. A per-service allocator is created lazily and bounded
1907            // to the service subnet so addresses stay inside the dedicated
1908            // network. An `ip_override` inside the service subnet is honored;
1909            // one outside it is rejected so a slice-allocated IP can't leak onto
1910            // the dedicated network.
1911            let svc_ipnetwork: IpNetwork = svc_subnet.to_string().parse().map_err(|e| {
1912                OverlaydError::Other(format!("failed to parse service subnet {svc_subnet}: {e}"))
1913            })?;
1914            let allocator = self
1915                .service_ip_allocators
1916                .entry(service.to_string())
1917                .or_insert_with(|| IpAllocator::new(svc_ipnetwork));
1918            let ip = match ip_override {
1919                Some(ip) if svc_subnet.contains(&ip) => ip,
1920                Some(ip) => {
1921                    return Err(OverlaydError::Other(format!(
1922                        "overridden IP {ip} is not inside dedicated service subnet {svc_subnet} for service {service}"
1923                    )));
1924                }
1925                None => allocator.allocate()?,
1926            };
1927            (net_id, ip, svc_subnet.prefix_len())
1928        } else {
1929            // ----- shared base overlay network path (unchanged) -----
1930            let slice = self.slice_cidr.ok_or_else(|| {
1931                OverlaydError::Other(
1932                    "no node slice assigned yet (SetupGlobalOverlay with slice_cidr first)"
1933                        .to_string(),
1934                )
1935            })?;
1936            let slice_ipnet: ipnet::IpNet = slice.to_string().parse().map_err(|e| {
1937                OverlaydError::Other(format!("failed to parse slice CIDR {slice}: {e}"))
1938            })?;
1939            let net_id = self.ensure_overlay_network(slice_ipnet).await?;
1940            let ip = match ip_override {
1941                Some(ip) => ip,
1942                None => self.ip_allocator.allocate()?,
1943            };
1944            (net_id, ip, slice_ipnet.prefix_len())
1945        };
1946
1947        // 3. Create the endpoint + per-container namespace on the network.
1948        let dns_server_eff = dns_server.or_else(|| self.dns_server_addr.map(|a| a.ip()));
1949        let dns_domain_for_attach = dns_domain.or_else(|| self.dns_domain.clone());
1950        let cluster_cidr = self.cluster_cidr.map(|c| c.to_string()).unwrap_or_default();
1951        let owner_tag = owner_tag(&self.deployment_or_default());
1952        let cid = container_id.to_string();
1953
1954        let attachment = tokio::task::spawn_blocking(move || {
1955            zlayer_hns::attach::EndpointAttachment::create_overlay(
1956                net_id,
1957                &owner_tag,
1958                cid.as_str(),
1959                ip,
1960                prefix_length,
1961                &cluster_cidr,
1962                dns_server_eff,
1963                dns_domain_for_attach.as_deref(),
1964            )
1965        })
1966        .await
1967        .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?
1968        .map_err(|e| OverlaydError::Overlay(format!("HCN overlay endpoint attach failed: {e}")))?;
1969
1970        let namespace_id = attachment.namespace_id();
1971        let bare_guid = format_guid_bare(namespace_id);
1972
1973        // Record for autoclean keyed by namespace GUID.
1974        self.hcn_cleanup
1975            .insert(namespace_id, (service.to_string(), ip));
1976
1977        tracing::info!(
1978            ns = %bare_guid,
1979            service = %service,
1980            ip = %ip,
1981            "Attached container to HCN overlay"
1982        );
1983
1984        Ok(AttachResult {
1985            ip,
1986            namespace_guid: Some(bare_guid),
1987        })
1988    }
1989
1990    /// Non-Windows path: a `WindowsContainer` handle has no meaning off Windows.
1991    #[cfg(not(target_os = "windows"))]
1992    #[allow(clippy::unused_async)]
1993    async fn attach_container_windows(
1994        &mut self,
1995        _container_id: &str,
1996        _service: &str,
1997        _ip_override: Option<IpAddr>,
1998        _dns_server: Option<IpAddr>,
1999        _dns_domain: Option<String>,
2000    ) -> Result<AttachResult, OverlaydError> {
2001        Err(OverlaydError::Other(
2002            "WindowsContainer attach is only supported on Windows".to_string(),
2003        ))
2004    }
2005
2006    /// Detach a Windows container by its bare namespace GUID and release its IP.
2007    /// Idempotent: unknown ids are a no-op.
2008    #[cfg(target_os = "windows")]
2009    #[allow(clippy::unused_async)]
2010    async fn detach_container_windows(
2011        &mut self,
2012        namespace_guid: &str,
2013    ) -> Result<(), OverlaydError> {
2014        use windows::core::GUID;
2015
2016        let Ok(guid) = GUID::try_from(namespace_guid) else {
2017            tracing::warn!(ns = %namespace_guid, "detach: unparseable namespace GUID");
2018            return Ok(());
2019        };
2020        if let Some((service, ip)) = self.hcn_cleanup.remove(&guid) {
2021            self.ip_allocator.release(ip);
2022            tracing::info!(ns = %namespace_guid, service = %service, ip = %ip, "Released HCN overlay attachment");
2023        }
2024        Ok(())
2025    }
2026
    /// Non-Windows path: there is no HCN attachment to release, so the GUID is
    /// ignored and the call always succeeds.
    #[cfg(not(target_os = "windows"))]
    #[allow(clippy::unused_async)]
    async fn detach_container_windows(
        &mut self,
        _namespace_guid: &str,
    ) -> Result<(), OverlaydError> {
        Ok(())
    }
2036
    /// Ensure the per-daemon HCN overlay (Internal vSwitch, no physical-NIC
    /// binding) exists on the host, reusing one recorded in the
    /// `{data_dir}/agent_network.json` marker or discoverable by name, and
    /// recording it in the marker on create.
    ///
    /// Lookup order:
    /// 1. The GUID recorded under the base owner in the marker file, if a
    ///    network with that GUID can still be opened.
    /// 2. Any host network whose queried name equals the derived overlay name.
    ///    This path returns the GUID without rewriting the marker.
    /// 3. Otherwise a new network is created (Internal by default, Transparent
    ///    when the uplink environment variable is set to a non-blank value),
    ///    followed by a fixed 2 s settle delay and a best-effort marker write.
    ///
    /// Returns the GUID of the reused or newly created network.
    ///
    /// # Errors
    /// Propagates the underlying `zlayer_hns` error on create failure. Also
    /// fails when a `spawn_blocking` task cannot be joined, when a fresh GUID
    /// cannot be generated, or (Transparent mode only) when the primary
    /// adapter lookup fails. A failure to persist the marker is only logged.
    #[cfg(target_os = "windows")]
    #[allow(clippy::too_many_lines)]
    async fn ensure_overlay_network(
        &mut self,
        slice_cidr: ipnet::IpNet,
    ) -> Result<windows::core::GUID, OverlaydError> {
        use windows::core::GUID;

        let daemon_name = self.deployment_or_default();
        let net_name = overlay_network_name(&daemon_name);
        let marker_path =
            zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();

        // Fast path: marker names a network GUID that still exists; reopen it.
        if let Some(recorded_id) = crate::network_state::NetworkState::load(&marker_path)
            .get(crate::network_state::OWNER_BASE)
            .and_then(|entry| GUID::try_from(entry.id.as_str()).ok())
        {
            // The open call runs on the blocking pool; an open failure is
            // mapped to `None` so we fall through to the by-name search.
            let reopened = tokio::task::spawn_blocking(move || {
                zlayer_hns::network::Network::open(recorded_id).ok()
            })
            .await
            .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?;
            if reopened.is_some() {
                tracing::info!(name = %net_name, "reusing HCN overlay network from marker");
                return Ok(recorded_id);
            }
        }

        // Idempotency: reuse a host network whose queried name matches ours.
        let target_name = net_name.clone();
        let existing = tokio::task::spawn_blocking(move || -> Option<GUID> {
            // A failed enumeration is treated the same as "no match".
            let guids = zlayer_hns::network::list("{}").ok()?;
            for guid in guids {
                // Networks that cannot be opened or queried are skipped.
                let Ok(network) = zlayer_hns::network::Network::open(guid) else {
                    continue;
                };
                if matches!(network.query("{}"), Ok(props) if props.name == target_name) {
                    return Some(guid);
                }
            }
            None
        })
        .await
        .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?;

        if let Some(existing_id) = existing {
            tracing::info!(name = %net_name, "reusing existing HCN overlay network");
            return Ok(existing_id);
        }

        // Nothing reusable: mint a new GUID and create the network.
        let net_id = GUID::new()
            .map_err(|e| OverlaydError::Other(format!("GUID::new for overlay network: {e}")))?;
        let subnet_str = slice_cidr.to_string();

        // Default: an HCN Internal network — an internal vSwitch with NO
        // physical-NIC binding — so container traffic never touches the
        // operator's gateway adapter. Setting ZLAYER_HCN_UPLINK_ADAPTER opts
        // into the legacy Transparent model bound to that named uplink.
        let use_transparent = std::env::var(zlayer_hns::adapter::ZLAYER_UPLINK_ENV)
            .ok()
            .is_some_and(|v| !v.trim().is_empty());

        // Owned copies for the `move` closures below; `net_name` and
        // `subnet_str` are still needed afterwards for errors, logs and the marker.
        let net_name_for_create = net_name.clone();
        let subnet_for_create = subnet_str.clone();
        if use_transparent {
            // NOTE(review): the env value itself is not passed here —
            // presumably `find_primary_adapter` consults it internally; confirm.
            let uplink = zlayer_hns::adapter::find_primary_adapter()
                .map_err(|e| OverlaydError::Other(format!("find_primary_adapter: {e}")))?;
            tracing::warn!(uplink = %uplink, "ZLAYER_HCN_UPLINK_ADAPTER set: creating HCN *Transparent* overlay bound to a physical NIC");
            tokio::task::spawn_blocking(move || {
                zlayer_hns::network::Network::create_transparent(
                    net_id,
                    &net_name_for_create,
                    &subnet_for_create,
                    &uplink,
                )
            })
            .await
            .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?
            .map_err(|e| {
                OverlaydError::Overlay(format!("HcnCreateNetwork transparent ({net_name}): {e}"))
            })?;
        } else {
            tokio::task::spawn_blocking(move || {
                zlayer_hns::network::Network::create_internal(
                    net_id,
                    &net_name_for_create,
                    &subnet_for_create,
                )
            })
            .await
            .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?
            .map_err(|e| {
                OverlaydError::Overlay(format!("HcnCreateNetwork internal ({net_name}): {e}"))
            })?;
        }

        // HCN's Static IPAM needs ~1-2s after network create to settle its
        // address pool; without this the first endpoint frequently fails with
        // HCN_E_ADDR_INVALID_OR_RESERVED.
        tokio::time::sleep(std::time::Duration::from_millis(2000)).await;

        tracing::info!(
            subnet = %subnet_str,
            mode = if use_transparent { "Transparent" } else { "Internal" },
            "created HCN overlay network"
        );

        // Persist the marker so subsequent runs reuse this network by GUID and a
        // full uninstall knows to delete it. Best-effort.
        let mut marker = crate::network_state::NetworkState::load(&marker_path);
        marker.upsert(crate::network_state::ManagedNetwork {
            owner: crate::network_state::OWNER_BASE.to_string(),
            kind: if use_transparent {
                "hcn-transparent"
            } else {
                "hcn-internal"
            }
            .to_string(),
            name: net_name.clone(),
            id: format_guid_bare(net_id),
            subnet: subnet_str.clone(),
            // Base/Shared HCN network: no dedicated WireGuard identity.
            wg_port: None,
            wg_private_key: None,
            wg_public_key: None,
            interface: None,
        });
        if let Err(e) = marker.save(&marker_path) {
            tracing::warn!(error = %e, path = %marker_path.display(), "failed to persist agent network marker (network still reusable by name)");
        }

        Ok(net_id)
    }
2178
    /// Ensure the per-service HCN **Internal** network for `service` exists on
    /// the host, reusing one recorded under the `service:<name>` marker owner
    /// (or discoverable by its derived name) and recording it on create.
    ///
    /// This is the Windows analogue of the Linux per-service bridge: a
    /// dedicated (`OverlayMode::Dedicated`) service gets its OWN isolated HCN
    /// Internal network — an internal vSwitch with NO physical-NIC binding —
    /// distinct from the node's shared base overlay network. Containers attach
    /// to it (rather than the base network) so dedicated-service traffic is
    /// segregated at the vSwitch layer. Modeled on [`Self::ensure_overlay_network`]
    /// but keyed on [`owner_for_service`] and forced to the Internal type (never
    /// Transparent — the on-box test asserts zero external vSwitches for
    /// dedicated services).
    ///
    /// Lookup order: (1) the GUID in an `hcn-internal` marker entry for this
    /// service, if that network can still be opened; (2) any host network whose
    /// queried name equals `<base overlay name>-svc-<service>` (returned without
    /// rewriting the marker); (3) create a new Internal network, wait 2 s, and
    /// write the marker best-effort.
    ///
    /// Returns the network GUID.
    ///
    /// # Errors
    /// Propagates the underlying `zlayer_hns` error on create failure. Also
    /// fails when a `spawn_blocking` task cannot be joined or a fresh GUID
    /// cannot be generated. A failure to persist the marker is only logged.
    #[cfg(target_os = "windows")]
    #[allow(clippy::too_many_lines)]
    async fn ensure_service_network(
        &mut self,
        service: &str,
        subnet: ipnet::IpNet,
    ) -> Result<windows::core::GUID, OverlaydError> {
        use windows::core::GUID;

        let daemon_name = self.deployment_or_default();
        // Per-service network name: `<base overlay name>-svc-<service>` so it is
        // unambiguously distinct from the base network and from other services.
        let net_name = format!("{}-svc-{service}", overlay_network_name(&daemon_name));
        let owner = owner_for_service(service);
        let marker_path =
            zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();

        // Fast path: marker names a network GUID that still exists; reopen it.
        // Only honor the recorded id when it belongs to an HCN-internal entry —
        // a Dedicated WireGuard marker (`kind == "wg-dedicated"`) stores the
        // transport public key in `id`, NOT an HCN GUID, so it must be ignored
        // for HCN reuse.
        let recorded_hcn_id = crate::network_state::NetworkState::load(&marker_path)
            .get(&owner)
            .filter(|entry| entry.kind == "hcn-internal")
            .and_then(|entry| GUID::try_from(entry.id.as_str()).ok());
        if let Some(recorded_id) = recorded_hcn_id {
            // Open on the blocking pool; a failed open becomes `None` and we
            // fall through to the by-name search.
            let reopened = tokio::task::spawn_blocking(move || {
                zlayer_hns::network::Network::open(recorded_id).ok()
            })
            .await
            .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?;
            if reopened.is_some() {
                tracing::info!(name = %net_name, service = %service, "reusing per-service HCN network from marker");
                return Ok(recorded_id);
            }
        }

        // Idempotency: reuse a host network whose queried name matches ours.
        let target_name = net_name.clone();
        let existing = tokio::task::spawn_blocking(move || -> Option<GUID> {
            // A failed enumeration is treated the same as "no match".
            let guids = zlayer_hns::network::list("{}").ok()?;
            for guid in guids {
                // Networks that cannot be opened or queried are skipped.
                let Ok(network) = zlayer_hns::network::Network::open(guid) else {
                    continue;
                };
                if matches!(network.query("{}"), Ok(props) if props.name == target_name) {
                    return Some(guid);
                }
            }
            None
        })
        .await
        .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?;

        if let Some(existing_id) = existing {
            tracing::info!(name = %net_name, service = %service, "reusing existing per-service HCN network");
            return Ok(existing_id);
        }

        // Nothing reusable: mint a new GUID and create the network.
        let net_id = GUID::new()
            .map_err(|e| OverlaydError::Other(format!("GUID::new for per-service network: {e}")))?;
        let subnet_str = subnet.to_string();

        // ALWAYS Internal for a dedicated service — never Transparent. The
        // dedicated requirement is isolation; an Internal network binds NO
        // physical NIC (no external vSwitch), which is what the on-box test
        // asserts.
        // Owned copies for the `move` closure; the originals are still needed
        // afterwards for the error message, log line and marker entry.
        let net_name_for_create = net_name.clone();
        let subnet_for_create = subnet_str.clone();
        tokio::task::spawn_blocking(move || {
            zlayer_hns::network::Network::create_internal(
                net_id,
                &net_name_for_create,
                &subnet_for_create,
            )
        })
        .await
        .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?
        .map_err(|e| {
            OverlaydError::Overlay(format!("HcnCreateNetwork internal ({net_name}): {e}"))
        })?;

        // HCN's Static IPAM needs ~1-2s after network create to settle its
        // address pool; without this the first endpoint frequently fails with
        // HCN_E_ADDR_INVALID_OR_RESERVED (same wait as the base network).
        tokio::time::sleep(std::time::Duration::from_millis(2000)).await;

        tracing::info!(
            service = %service,
            subnet = %subnet_str,
            "created per-service HCN Internal network"
        );

        // Persist the marker (owner = `service:<name>`, kind = `hcn-internal`)
        // so subsequent runs reuse this network by GUID and a full uninstall
        // (`purge_managed_networks`, which sweeps every `kind` starting with
        // `hcn`) deletes it. Best-effort.
        //
        // A dedicated Windows service shares the SAME owner key for two facts:
        // the dedicated WireGuard identity (written by the cross-platform core
        // in `setup_service_overlay_dedicated`, kind `wg-dedicated`) and this
        // HCN network's GUID. The marker is keyed by owner, so carry the WG
        // identity fields over when we rewrite the entry to `hcn-internal` — the
        // single entry then holds both the HCN GUID (in `id`) and the WG
        // identity (in the `wg_*`/`interface` fields), and the WG private key
        // survives restarts. (The core re-asserts the `wg-dedicated` shape on
        // the next setup; this path re-asserts `hcn-internal` again right after
        // — both are self-healing because the network is also reusable by name.)
        let mut marker = crate::network_state::NetworkState::load(&marker_path);
        // Snapshot whatever entry currently sits under this owner (any kind)
        // before `upsert` replaces it, so its WG fields can be copied across.
        let carried = marker.get(&owner).cloned();
        marker.upsert(crate::network_state::ManagedNetwork {
            owner,
            kind: "hcn-internal".to_string(),
            name: net_name.clone(),
            id: format_guid_bare(net_id),
            subnet: subnet_str.clone(),
            wg_port: carried.as_ref().and_then(|c| c.wg_port),
            wg_private_key: carried.as_ref().and_then(|c| c.wg_private_key.clone()),
            wg_public_key: carried.as_ref().and_then(|c| c.wg_public_key.clone()),
            interface: carried.as_ref().and_then(|c| c.interface.clone()),
        });
        if let Err(e) = marker.save(&marker_path) {
            tracing::warn!(service = %service, error = %e, path = %marker_path.display(), "failed to persist per-service network marker (network still reusable by name)");
        }

        Ok(net_id)
    }
2325
2326    /// Resolve the dedicated per-service subnet for `service`, if the service
2327    /// runs in `OverlayMode::Dedicated` on this node.
2328    ///
2329    /// Source of truth, in order:
2330    /// 1. The live [`ServiceTransport`] in `service_transports` (the normal
2331    ///    case once `SetupServiceOverlay` has run this process).
2332    /// 2. A persisted `hcn-internal` marker entry under
2333    ///    [`owner_for_service`]`(service)` — covers the window where the HCN
2334    ///    network exists from a prior run but the transport map is still empty.
2335    ///
2336    /// Returns `None` for Shared-mode services (attach onto the base network).
2337    #[cfg(target_os = "windows")]
2338    fn dedicated_service_subnet(&self, service: &str) -> Option<ipnet::IpNet> {
2339        if let Some(st) = self.service_transports.get(service) {
2340            return Some(st.subnet);
2341        }
2342        let marker_path =
2343            zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();
2344        crate::network_state::NetworkState::load(&marker_path)
2345            .get(&owner_for_service(service))
2346            .filter(|entry| entry.kind == "hcn-internal")
2347            .and_then(|entry| entry.subnet.parse::<ipnet::IpNet>().ok())
2348    }
2349
2350    /// The daemon name used for HCN network/owner naming, defaulting to
2351    /// `"zlayer"` when no deployment has been set yet.
2352    #[cfg(target_os = "windows")]
2353    fn deployment_or_default(&self) -> String {
2354        if self.deployment.is_empty() {
2355            "zlayer".to_string()
2356        } else {
2357            self.deployment.clone()
2358        }
2359    }
2360
2361    // -- peers ---------------------------------------------------------------
2362
2363    /// Resolve a [`PeerScope`] to the live [`OverlayTransport`] its ops target.
2364    ///
2365    /// `Global` -> the single cluster transport; `Service { service }` -> that
2366    /// service's dedicated per-service transport (Dedicated mode only).
2367    ///
2368    /// # Errors
2369    /// Returns an error if the global overlay is not up (for `Global`) or no
2370    /// dedicated overlay exists for the named service (for `Service`).
2371    fn transport_for_scope(&self, scope: &PeerScope) -> Result<&OverlayTransport, OverlaydError> {
2372        match scope {
2373            PeerScope::Global => self
2374                .global_transport
2375                .as_ref()
2376                .ok_or_else(|| OverlaydError::Other("global overlay not set up".into())),
2377            PeerScope::Service { service } => self
2378                .service_transports
2379                .get(service)
2380                .map(|s| &s.transport)
2381                .ok_or_else(|| {
2382                    OverlaydError::Other(format!("no dedicated overlay for service {service}"))
2383                }),
2384        }
2385    }
2386
2387    /// Add a peer to a resolved transport.
2388    ///
2389    /// # Errors
2390    /// Wraps the underlying transport error.
2391    async fn add_peer_on(
2392        transport: &OverlayTransport,
2393        peer: &PeerInfo,
2394    ) -> Result<(), OverlaydError> {
2395        transport
2396            .add_peer(peer)
2397            .await
2398            .map_err(|e| OverlaydError::Overlay(format!("add_peer failed: {e}")))
2399    }
2400
2401    /// Remove a peer (by base64 public key) from a resolved transport.
2402    ///
2403    /// # Errors
2404    /// Wraps the underlying transport error.
2405    async fn remove_peer_on(
2406        transport: &OverlayTransport,
2407        pubkey: &str,
2408    ) -> Result<(), OverlaydError> {
2409        transport
2410            .remove_peer(pubkey)
2411            .await
2412            .map_err(|e| OverlaydError::Overlay(format!("remove_peer failed: {e}")))
2413    }
2414
2415    /// Plumb a CIDR into a peer's `AllowedIPs` on a resolved transport.
2416    ///
2417    /// # Errors
2418    /// Returns an error when the CIDR is invalid or the UAPI write fails.
2419    async fn add_allowed_ip_on(
2420        transport: &OverlayTransport,
2421        pubkey: &str,
2422        cidr: &str,
2423    ) -> Result<(), OverlaydError> {
2424        let net: ipnet::IpNet = cidr
2425            .parse()
2426            .map_err(|e| OverlaydError::Other(format!("invalid CIDR {cidr}: {e}")))?;
2427        transport
2428            .add_allowed_ip(pubkey, net)
2429            .await
2430            .map_err(|e| OverlaydError::Overlay(format!("add_allowed_ip failed: {e}")))
2431    }
2432
2433    /// Remove a CIDR from a peer's `AllowedIPs` on a resolved transport.
2434    ///
2435    /// # Errors
2436    /// Returns an error when the CIDR is invalid or the UAPI write fails.
2437    async fn remove_allowed_ip_on(
2438        transport: &OverlayTransport,
2439        pubkey: &str,
2440        cidr: &str,
2441    ) -> Result<(), OverlaydError> {
2442        let net: ipnet::IpNet = cidr
2443            .parse()
2444            .map_err(|e| OverlaydError::Other(format!("invalid CIDR {cidr}: {e}")))?;
2445        transport
2446            .remove_allowed_ip(pubkey, net)
2447            .await
2448            .map_err(|e| OverlaydError::Overlay(format!("remove_allowed_ip failed: {e}")))
2449    }
2450
2451    // -- DNS -----------------------------------------------------------------
2452
    /// Register an overlay DNS A/AAAA record.
    ///
    /// Stores `ip` under `name` in the in-memory `dns_records` table; the
    /// address family of `ip` determines whether it is an A or AAAA record.
    // NOTE(review): presumably re-registering an existing name replaces its
    // address (map-style insert) — confirm against the `dns_records` type.
    fn register_dns(&mut self, name: String, ip: IpAddr) {
        self.dns_records.insert(name, ip);
    }
2457
    /// Remove an overlay DNS record.
    ///
    /// Deletes the entry stored under `name` in the in-memory `dns_records`
    /// table. The result of the removal is discarded, so a name that was never
    /// registered is not reported to the caller.
    fn unregister_dns(&mut self, name: &str) {
        self.dns_records.remove(name);
    }
2462
2463    // -- NAT -----------------------------------------------------------------
2464
2465    /// Periodic NAT traversal maintenance: re-probe STUN, refresh relays.
2466    /// No-op when NAT traversal has not been started.
2467    ///
2468    /// # Errors
2469    /// Returns an error when the underlying STUN refresh fails.
2470    async fn nat_maintenance_tick(&mut self) -> Result<(), OverlaydError> {
2471        // Lazily start NAT traversal on the first tick if a config asks for it.
2472        if self.nat_traversal.is_none() {
2473            let config = self.nat_config.clone().unwrap_or_default();
2474            if config.enabled {
2475                let mut nat = NatTraversal::new(config, self.overlay_port);
2476                match nat.gather_candidates().await {
2477                    Ok(candidates) => {
2478                        tracing::info!(count = candidates.len(), "Gathered NAT candidates");
2479                        self.nat_last_refresh.store(now_unix(), Ordering::SeqCst);
2480                        self.nat_traversal = Some(nat);
2481                    }
2482                    Err(e) => {
2483                        tracing::warn!(error = %e, "NAT candidate gathering failed");
2484                        return Ok(());
2485                    }
2486                }
2487            } else {
2488                return Ok(());
2489            }
2490        }
2491
2492        let Some(nat) = self.nat_traversal.as_mut() else {
2493            return Ok(());
2494        };
2495        match nat.refresh().await {
2496            Ok(changed) => {
2497                if changed {
2498                    tracing::info!("NAT reflexive address changed during refresh");
2499                }
2500                self.nat_last_refresh.store(now_unix(), Ordering::SeqCst);
2501                Ok(())
2502            }
2503            Err(e) => Err(OverlaydError::Overlay(format!(
2504                "NAT maintenance tick failed: {e}"
2505            ))),
2506        }
2507    }
2508
2509    // -- status --------------------------------------------------------------
2510
2511    /// Build a [`StatusSnapshot`] from current overlay state.
2512    async fn status_snapshot(&self) -> StatusSnapshot {
2513        let mut peers: Vec<PeerStatus> = Vec::new();
2514        let public_key = self.transport_public_key.clone();
2515
2516        if let Some(transport) = self.global_transport.as_ref() {
2517            // Parse the UAPI dump for per-peer state. Best-effort: a parse
2518            // failure leaves the peer list empty rather than failing Status.
2519            if let Ok(dump) = transport.status().await {
2520                peers = parse_peer_status(&dump);
2521            }
2522        }
2523
2524        let service_count = u32::try_from(self.service_count()).unwrap_or(u32::MAX);
2525        let peer_count = u32::try_from(peers.len()).unwrap_or(u32::MAX);
2526
2527        // Per dedicated per-service overlay device: count its peers the same
2528        // way the global status does (parse the UAPI/status dump).
2529        let mut dedicated_services: Vec<DedicatedServiceStatus> = Vec::new();
2530        for (svc, st) in &self.service_transports {
2531            let peer_count = match st.transport.status().await {
2532                Ok(dump) => u32::try_from(parse_peer_status(&dump).len()).unwrap_or(u32::MAX),
2533                Err(_) => 0,
2534            };
2535            dedicated_services.push(DedicatedServiceStatus {
2536                service: svc.clone(),
2537                interface: st.interface.clone(),
2538                public_key: st.public_key.clone(),
2539                listen_port: st.listen_port,
2540                overlay_ip: st.overlay_ip,
2541                subnet: st.subnet.to_string(),
2542                peer_count,
2543            });
2544        }
2545
2546        StatusSnapshot {
2547            interface: self.global_interface.clone(),
2548            node_ip: self.node_ip,
2549            public_key,
2550            overlay_cidr: self.cluster_cidr.map(|c| c.to_string()),
2551            slice_cidr: self.slice_cidr.map(|c| c.to_string()),
2552            peer_count,
2553            service_count,
2554            peers,
2555            dedicated_services,
2556        }
2557    }
2558
2559    /// Number of per-service overlays set up on this node (Shared bridges /
2560    /// placeholders plus any Dedicated transports not already counted there).
2561    fn service_count(&self) -> usize {
2562        let extra_dedicated = self
2563            .service_transports
2564            .keys()
2565            .filter(|svc| !self.service_interfaces.contains_key(*svc))
2566            .count();
2567        self.service_interfaces.len() + extra_dedicated
2568    }
2569
2570    // -- config helper -------------------------------------------------------
2571
2572    fn build_config(
2573        &self,
2574        private_key: String,
2575        public_key: String,
2576        ip: IpAddr,
2577        mask: u8,
2578        listen_port: u16,
2579        physical_egress_ip: Option<IpAddr>,
2580    ) -> OverlayConfig {
2581        // Pick the source/advertised address for the WireGuard endpoint.
2582        //
2583        // Default is the family-matched UNSPECIFIED (`0.0.0.0` / `::`), which lets
2584        // the kernel pick a source per outgoing packet. When the caller resolved a
2585        // physical-egress IP (see `detect_physical_egress`) *and* its family
2586        // matches the overlay IP's family, we pin `local_endpoint` to that IP so
2587        // boringtun's data socket sources from — and advertises — the real NIC
2588        // rather than whatever the default route (possibly a VPN mesh) would pick.
2589        //
2590        // Family mismatch (e.g. physical egress is v4 but this overlay is v6) is
2591        // unusable for source selection, so we warn and fall back to UNSPECIFIED.
2592        //
2593        // boringtun limitation: boringtun 0.7's `DeviceConfig` exposes no way to
2594        // inject or pin the WireGuard DATA socket (its `uapi_fd` is the UAPI
2595        // CONTROL socket only), so `SO_BINDTODEVICE` on the data socket is
2596        // impossible today. Setting `local_endpoint` to the physical IP governs
2597        // source-address selection and the advertised endpoint, which is the
2598        // realistic scope of control we have.
2599        let unspecified = match ip {
2600            IpAddr::V4(_) => IpAddr::V4(Ipv4Addr::UNSPECIFIED),
2601            IpAddr::V6(_) => IpAddr::V6(Ipv6Addr::UNSPECIFIED),
2602        };
2603        let local_addr = match physical_egress_ip {
2604            Some(egress) if egress.is_ipv4() == ip.is_ipv4() => egress,
2605            Some(egress) => {
2606                tracing::warn!(
2607                    physical_egress_ip = %egress,
2608                    overlay_ip = %ip,
2609                    "physical egress IP family does not match overlay IP family; \
2610                     falling back to UNSPECIFIED for WireGuard local_endpoint"
2611                );
2612                unspecified
2613            }
2614            None => unspecified,
2615        };
2616        let mut config = OverlayConfig {
2617            local_endpoint: SocketAddr::new(local_addr, listen_port),
2618            private_key,
2619            public_key,
2620            overlay_cidr: format!("{ip}/{mask}"),
2621            ..OverlayConfig::default()
2622        };
2623        if let Some(nat) = self.nat_config.clone() {
2624            config.nat = nat;
2625        }
2626        if let Some(dir) = self.uapi_sock_dir.clone() {
2627            config.uapi_sock_dir = dir;
2628        }
2629        config
2630    }
2631}
2632
2633/// Build a Shared-mode [`ServiceOverlayInfo`]: the bridge/placeholder name with
2634/// every dedicated-device identity field left `None` (Shared mode shares the
2635/// single cluster device).
2636fn shared_overlay_info(name: String) -> ServiceOverlayInfo {
2637    ServiceOverlayInfo {
2638        name,
2639        mode: OverlayMode::Shared,
2640        wg_public_key: None,
2641        wg_port: None,
2642        overlay_ip: None,
2643        subnet: None,
2644    }
2645}
2646
2647/// Build a Dedicated-mode [`ServiceOverlayInfo`] from a dedicated device's
2648/// identity. `name` is the container-attach handle (bridge name on Linux, the
2649/// dedicated interface elsewhere).
2650fn dedicated_overlay_info(
2651    name: String,
2652    public_key: &str,
2653    listen_port: u16,
2654    overlay_ip: IpAddr,
2655    subnet: ipnet::IpNet,
2656) -> ServiceOverlayInfo {
2657    ServiceOverlayInfo {
2658        name,
2659        mode: OverlayMode::Dedicated,
2660        wg_public_key: Some(public_key.to_string()),
2661        wg_port: Some(listen_port),
2662        overlay_ip: Some(overlay_ip),
2663        subnet: Some(subnet.to_string()),
2664    }
2665}
2666
2667/// Convert a wire [`PeerSpec`] into a `zlayer_overlay::PeerInfo`.
2668///
2669/// # Errors
2670/// Returns an error if `endpoint` cannot be parsed as a `host:port`
2671/// [`SocketAddr`].
2672pub fn peer_spec_to_info(spec: &PeerSpec) -> Result<PeerInfo, OverlaydError> {
2673    let endpoint: SocketAddr = spec.endpoint.parse().map_err(|e| {
2674        OverlaydError::Other(format!("invalid peer endpoint {}: {e}", spec.endpoint))
2675    })?;
2676    Ok(PeerInfo::new(
2677        spec.public_key.clone(),
2678        endpoint,
2679        &spec.allowed_ips,
2680        std::time::Duration::from_secs(spec.persistent_keepalive_secs),
2681    ))
2682}
2683
2684/// Parse a `wg`-style UAPI/`status` dump into [`PeerStatus`] entries.
2685///
2686/// The dump is a series of `key=value` lines; each `public_key=` line starts a
2687/// new peer block, and subsequent `endpoint=` / `allowed_ip=` /
2688/// `latest_handshake=` lines belong to it.
2689fn parse_peer_status(dump: &str) -> Vec<PeerStatus> {
2690    let mut peers: Vec<PeerStatus> = Vec::new();
2691    let mut current: Option<PeerStatus> = None;
2692    let mut allowed: Vec<String> = Vec::new();
2693
2694    let flush = |peers: &mut Vec<PeerStatus>,
2695                 current: &mut Option<PeerStatus>,
2696                 allowed: &mut Vec<String>| {
2697        if let Some(mut p) = current.take() {
2698            p.allowed_ips = allowed.join(",");
2699            peers.push(p);
2700        }
2701        allowed.clear();
2702    };
2703
2704    for line in dump.lines() {
2705        let line = line.trim();
2706        let Some((key, value)) = line.split_once('=') else {
2707            continue;
2708        };
2709        match key.trim() {
2710            "public_key" | "peer" => {
2711                flush(&mut peers, &mut current, &mut allowed);
2712                current = Some(PeerStatus {
2713                    public_key: value.trim().to_string(),
2714                    endpoint: String::new(),
2715                    allowed_ips: String::new(),
2716                    last_handshake_unix_secs: 0,
2717                });
2718            }
2719            "endpoint" => {
2720                if let Some(p) = current.as_mut() {
2721                    p.endpoint = value.trim().to_string();
2722                }
2723            }
2724            "allowed_ip" | "allowed_ips" => {
2725                if current.is_some() {
2726                    allowed.push(value.trim().to_string());
2727                }
2728            }
2729            "latest_handshake" | "last_handshake_time_sec" => {
2730                if let Some(p) = current.as_mut() {
2731                    p.last_handshake_unix_secs = value.trim().parse().unwrap_or(0);
2732                }
2733            }
2734            _ => {}
2735        }
2736    }
2737    flush(&mut peers, &mut current, &mut allowed);
2738    peers
2739}
2740
/// Whole seconds elapsed since the Unix epoch, per the system clock.
///
/// A clock set before the epoch yields `0` instead of an error.
fn now_unix() -> u64 {
    use std::time::{SystemTime, UNIX_EPOCH};

    SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map_or(0, |since_epoch| since_epoch.as_secs())
}
2748
/// Simple IP address allocator supporting both IPv4 and IPv6, bounded to a
/// specific CIDR (typically a per-node `/28` slice). Allocations past the last
/// usable host return an exhaustion error.
///
/// Fresh addresses are derived as `base + offset`; addresses handed back via
/// `release(...)` are reused before the offset counter advances.
struct IpAllocator {
    /// CIDR the allocator is bounded to.
    cidr: IpNetwork,
    /// Base (network) address of the CIDR.
    base: IpAddr,
    /// Monotonic counter for the next allocation offset relative to `base`.
    /// Initialised to 1 by `new`, so the network address itself is never
    /// produced as a fresh allocation.
    next_offset: AtomicU64,
    /// IPs returned by `release(...)`. `allocate()` drains this first before
    /// incrementing `next_offset`.
    released: parking_lot::Mutex<Vec<IpAddr>>,
}
2763
2764impl IpAllocator {
2765    fn new(cidr: IpNetwork) -> Self {
2766        Self {
2767            base: cidr.network(),
2768            cidr,
2769            next_offset: AtomicU64::new(1),
2770            released: parking_lot::Mutex::new(Vec::new()),
2771        }
2772    }
2773
2774    #[allow(clippy::cast_possible_truncation)]
2775    fn compute_addr(&self, offset: u64) -> IpAddr {
2776        match self.base {
2777            IpAddr::V4(base_v4) => {
2778                let base_u32 = u32::from_be_bytes(base_v4.octets());
2779                let addr = base_u32.wrapping_add(offset as u32);
2780                IpAddr::V4(Ipv4Addr::from(addr.to_be_bytes()))
2781            }
2782            IpAddr::V6(base_v6) => {
2783                let base_u128 = u128::from(base_v6);
2784                let addr = base_u128.wrapping_add(u128::from(offset));
2785                IpAddr::V6(Ipv6Addr::from(addr))
2786            }
2787        }
2788    }
2789
2790    /// Allocate the next IP in the slice, reusing released IPs first.
2791    ///
2792    /// # Errors
2793    /// Returns [`OverlaydError::Overlay`] when the CIDR is exhausted.
2794    fn allocate(&self) -> Result<IpAddr, OverlaydError> {
2795        if let Some(ip) = self.released.lock().pop() {
2796            return Ok(ip);
2797        }
2798        let offset = self.next_offset.fetch_add(1, Ordering::SeqCst);
2799        let addr = self.compute_addr(offset);
2800
2801        let in_cidr = self.cidr.contains(addr);
2802        let is_v4_broadcast = matches!(
2803            (&self.cidr, &addr),
2804            (IpNetwork::V4(v4), IpAddr::V4(a)) if *a == v4.broadcast()
2805        );
2806        if !in_cidr || is_v4_broadcast {
2807            return Err(OverlaydError::Overlay(format!(
2808                "IP allocator exhausted: next address {addr} is outside slice {}",
2809                self.cidr
2810            )));
2811        }
2812        Ok(addr)
2813    }
2814
2815    /// Return an IP to the free pool. Idempotent.
2816    fn release(&self, ip: IpAddr) {
2817        let mut released = self.released.lock();
2818        if !released.contains(&ip) {
2819            released.push(ip);
2820        }
2821    }
2822}
2823
2824// -- Windows HCN helpers (ported from the agent's hcs runtime) --------------
2825
/// Owner tag written onto each HCN endpoint created by this server.
///
/// The legacy single-instance daemon name `"zlayer"` maps to the tag
/// `"zlayer"`; every other daemon name is used as the tag unchanged, so two
/// daemons running side-by-side never sweep each other's endpoints.
#[cfg(target_os = "windows")]
fn owner_tag(daemon_name: &str) -> String {
    match daemon_name {
        // Legacy single-instance tag.
        "zlayer" => String::from("zlayer"),
        // Any other daemon name is its own tag.
        other => other.to_owned(),
    }
}
2837
/// Host-side name of the HCN overlay network belonging to `daemon_name`.
///
/// The legacy single-instance daemon `"zlayer"` yields `"zlayer-overlay"`;
/// any other name yields `"<daemon_name>-overlay"`.
#[cfg(target_os = "windows")]
fn overlay_network_name(daemon_name: &str) -> String {
    const LEGACY_DAEMON: &str = "zlayer";

    if daemon_name != LEGACY_DAEMON {
        return format!("{daemon_name}-overlay");
    }
    String::from("zlayer-overlay")
}
2849
/// Render a GUID as the bare, lowercase, un-braced string that HCN/HCS use to
/// identify a namespace in a compute-system document's
/// `Container.Networking.Namespace` field (e.g. `aabbccdd-eeff-...`).
///
/// Starts from the GUID's `Debug` rendering, strips any leading or trailing
/// `{` / `}`, and lowercases the ASCII letters.
#[cfg(target_os = "windows")]
fn format_guid_bare(id: windows::core::GUID) -> String {
    let debug_form = format!("{id:?}");
    let unbraced = debug_form.trim_matches(|c: char| matches!(c, '{' | '}'));
    unbraced.to_ascii_lowercase()
}
2859
/// Remove every host-level HCN network this server created for `daemon_name`,
/// then delete the persistent marker file.
///
/// Meant for a full uninstall, never for a routine stop/restart. Everything
/// is best-effort: failures are logged and the sweep carries on. The function
/// is synchronous (HCN calls are blocking).
#[cfg(target_os = "windows")]
pub fn purge_managed_networks(data_dir: &Path, daemon_name: &str) {
    use windows::core::GUID;
    use zlayer_hns::network::Network;

    let marker_path = zlayer_paths::ZLayerDirs::new(data_dir.to_path_buf()).agent_network_state();
    let state = crate::network_state::NetworkState::load(&marker_path);

    // Pass 1: walk the marker and delete each recorded HCN network by GUID.
    for entry in &state.networks {
        if !entry.kind.starts_with("hcn") {
            continue;
        }
        let guid = match GUID::try_from(entry.id.as_str()) {
            Ok(guid) => guid,
            Err(e) => {
                tracing::warn!(id = %entry.id, error = %e, "managed network marker has unparseable GUID");
                continue;
            }
        };
        if let Err(e) = Network::delete(guid) {
            tracing::warn!(name = %entry.name, id = %entry.id, error = %e, "failed to delete managed HCN network");
        } else {
            tracing::info!(name = %entry.name, id = %entry.id, "deleted managed HCN network");
        }
    }

    // Pass 2: name-based sweep, covering an overlay network whose marker entry
    // was lost (crash between create and marker write). A failed listing is
    // treated as "nothing to sweep".
    let overlay_name = overlay_network_name(daemon_name);
    for guid in zlayer_hns::network::list("{}").into_iter().flatten() {
        let Ok(network) = Network::open(guid) else {
            continue;
        };
        let is_ours = match network.query("{}") {
            Ok(props) => props.name == overlay_name,
            Err(_) => false,
        };
        // Release the handle before attempting the delete.
        drop(network);
        if !is_ours {
            continue;
        }
        if let Err(e) = Network::delete(guid) {
            tracing::warn!(name = %overlay_name, error = %e, "failed to delete overlay network (name sweep)");
        } else {
            tracing::info!(name = %overlay_name, "deleted overlay HCN network (name sweep)");
        }
    }

    // Finally clear the marker itself, if it is still on disk.
    if !marker_path.exists() {
        return;
    }
    if let Err(e) = std::fs::remove_file(&marker_path) {
        tracing::warn!(error = %e, path = %marker_path.display(), "failed to remove agent network marker");
    }
}
2919
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a throwaway server bound to a unique temp data dir so the marker
    /// file (rehydrated in `new`) never collides between tests.
    ///
    /// The directory name combines the process id, the current Unix second and
    /// a per-process counter. The counter is what guarantees uniqueness: every
    /// test in one test binary shares the process id, and tests that start
    /// within the same second share the timestamp.
    fn test_server() -> OverlaydServer {
        static NEXT_DIR_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);

        let dir = std::env::temp_dir().join(format!(
            "zlayer-overlayd-scope-{}-{}-{}",
            std::process::id(),
            now_unix(),
            NEXT_DIR_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed)
        ));
        OverlaydServer::new(dir)
    }

    /// A well-formed spec converts field-for-field, with the keepalive turned
    /// into a `Duration`.
    #[test]
    fn peer_spec_to_info_parses_endpoint_and_keepalive() {
        let spec = PeerSpec {
            public_key: "base64key".to_string(),
            endpoint: "1.2.3.4:51820".to_string(),
            allowed_ips: "10.200.0.5/32,10.200.1.0/24".to_string(),
            persistent_keepalive_secs: 25,
        };
        let info = peer_spec_to_info(&spec).expect("valid spec");
        assert_eq!(info.public_key, "base64key");
        assert_eq!(info.endpoint, "1.2.3.4:51820".parse().unwrap());
        assert_eq!(info.allowed_ips, "10.200.0.5/32,10.200.1.0/24");
        assert_eq!(
            info.persistent_keepalive_interval,
            std::time::Duration::from_secs(25)
        );
    }

    /// An endpoint that is not a socket address is rejected.
    #[test]
    fn peer_spec_to_info_rejects_bad_endpoint() {
        let spec = PeerSpec {
            public_key: "k".to_string(),
            endpoint: "not-a-socket-addr".to_string(),
            allowed_ips: String::new(),
            persistent_keepalive_secs: 0,
        };
        assert!(peer_spec_to_info(&spec).is_err());
    }

    /// Short, long and multi-part inputs all stay within the kernel limit and
    /// keep the `zl-` prefix.
    #[test]
    fn interface_name_never_exceeds_limit() {
        let cases: Vec<(&[&str], &str)> = vec![
            (&["a"], "g"),
            (&["zlayer-manager"], "g"),
            (&["my-very-long-deployment-name-that-goes-on-and-on"], "g"),
            (&["zlayer", "manager"], "s"),
            (
                &["abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxyz"],
                "s",
            ),
            (&["x"], ""),
        ];
        for (parts, suffix) in &cases {
            let name = make_interface_name(parts, suffix);
            assert!(name.len() <= MAX_IFNAME_LEN, "Name '{name}' too long");
            assert!(name.starts_with("zl-"));
        }
    }

    /// The same inputs always produce the same interface name.
    #[test]
    fn interface_name_is_deterministic() {
        assert_eq!(
            make_interface_name(&["zlayer-manager"], "g"),
            make_interface_name(&["zlayer-manager"], "g")
        );
    }

    /// Each `public_key=` line starts a new peer; repeated `allowed_ip=` lines
    /// are joined with commas.
    #[test]
    fn parse_peer_status_splits_blocks() {
        let dump = "\
public_key=AAA
endpoint=1.2.3.4:51820
allowed_ip=10.200.0.2/32
allowed_ip=10.200.1.0/24
latest_handshake=1700000000
public_key=BBB
endpoint=5.6.7.8:51820
allowed_ip=10.200.0.3/32
latest_handshake=0
";
        let peers = parse_peer_status(dump);
        assert_eq!(peers.len(), 2);
        assert_eq!(peers[0].public_key, "AAA");
        assert_eq!(peers[0].endpoint, "1.2.3.4:51820");
        assert_eq!(peers[0].allowed_ips, "10.200.0.2/32,10.200.1.0/24");
        assert_eq!(peers[0].last_handshake_unix_secs, 1_700_000_000);
        assert_eq!(peers[1].public_key, "BBB");
        assert_eq!(peers[1].last_handshake_unix_secs, 0);
    }

    /// A `/30` has exactly two usable hosts; the third allocation would be the
    /// broadcast address and must fail, and keep failing.
    #[test]
    fn ip_allocator_stops_at_end_of_slice() {
        let allocator = IpAllocator::new("10.200.0.0/30".parse().expect("valid cidr"));
        let first: IpAddr = "10.200.0.1".parse().unwrap();
        let second: IpAddr = "10.200.0.2".parse().unwrap();
        assert_eq!(allocator.allocate().expect("first host"), first);
        assert_eq!(allocator.allocate().expect("second host"), second);
        assert!(allocator.allocate().is_err());
        assert!(allocator.allocate().is_err());
    }

    /// A freshly constructed server reports nothing before any overlay setup.
    #[tokio::test]
    async fn status_snapshot_before_setup_is_empty() {
        // Use the per-test data dir rather than a shared fixed path so this
        // test cannot interfere with others through the marker file.
        let server = test_server();
        let snap = server.status_snapshot().await;
        assert!(snap.interface.is_none());
        assert!(snap.node_ip.is_none());
        assert!(snap.public_key.is_none());
        assert_eq!(snap.peer_count, 0);
        assert_eq!(snap.service_count, 0);
        assert!(snap.peers.is_empty());
    }

    /// Releasing an allocated IP makes it the next one handed out.
    #[tokio::test]
    async fn allocate_and_release_ip_round_trip() {
        let mut server = test_server();
        let a = server.allocate_ip("svc", false).expect("alloc a");
        let b = server.allocate_ip("svc", false).expect("alloc b");
        assert_ne!(a, b);
        server.release_ip(a);
        // Released IP is handed back before the monotonic counter advances.
        let c = server.allocate_ip("svc", false).expect("alloc c");
        assert_eq!(c, a);
    }

    /// A v4 egress address with a v4 overlay is used as the local endpoint.
    #[test]
    fn build_config_uses_matching_physical_egress_ipv4() {
        let server = test_server();
        let overlay_ip: IpAddr = "10.200.0.1".parse().unwrap();
        let egress: IpAddr = "192.0.2.10".parse().unwrap();
        let config = server.build_config(
            "priv".to_string(),
            "pub".to_string(),
            overlay_ip,
            16,
            51820,
            Some(egress),
        );
        assert_eq!(config.local_endpoint, SocketAddr::new(egress, 51820));
    }

    /// Without an egress address the local endpoint is the v4 UNSPECIFIED
    /// address.
    #[test]
    fn build_config_falls_back_to_unspecified_when_none() {
        let server = test_server();
        let overlay_ip: IpAddr = "10.200.0.1".parse().unwrap();
        let config = server.build_config(
            "priv".to_string(),
            "pub".to_string(),
            overlay_ip,
            16,
            51820,
            None,
        );
        assert_eq!(
            config.local_endpoint,
            SocketAddr::new(IpAddr::V4(Ipv4Addr::UNSPECIFIED), 51820)
        );
    }

    #[test]
    fn build_config_falls_back_to_unspecified_on_family_mismatch() {
        let server = test_server();
        // Overlay is v6 but the resolved physical egress is v4: unusable for
        // source selection, so we must fall back to the v6 UNSPECIFIED address.
        let overlay_ip: IpAddr = "fd00::1".parse().unwrap();
        let egress: IpAddr = "192.0.2.10".parse().unwrap();
        let config = server.build_config(
            "priv".to_string(),
            "pub".to_string(),
            overlay_ip,
            64,
            51820,
            Some(egress),
        );
        assert_eq!(
            config.local_endpoint,
            SocketAddr::new(IpAddr::V6(Ipv6Addr::UNSPECIFIED), 51820)
        );
    }

    #[tokio::test]
    async fn transport_for_scope_global_requires_setup() {
        let server = test_server();
        // No global overlay set up yet -> Global scope errors. (Can't use
        // `expect_err` because `&OverlayTransport` is not `Debug`.)
        match server.transport_for_scope(&PeerScope::Global) {
            Ok(_) => panic!("global overlay should not be set up"),
            Err(OverlaydError::Other(m)) => {
                assert!(m.contains("global overlay not set up"), "got: {m}");
            }
            Err(other) => panic!("unexpected error: {other:?}"),
        }
    }

    /// Service scope for a service with no dedicated overlay is an error.
    #[tokio::test]
    async fn transport_for_scope_unset_service_errors() {
        let server = test_server();
        match server.transport_for_scope(&PeerScope::Service {
            service: "x".to_string(),
        }) {
            Ok(_) => panic!("no dedicated overlay should exist for x"),
            Err(OverlaydError::Other(m)) => {
                assert_eq!(m, "no dedicated overlay for service x");
            }
            Err(other) => panic!("unexpected error: {other:?}"),
        }
    }

    /// The same error surfaces through the request dispatcher as an `Err`
    /// response.
    #[tokio::test]
    async fn add_peer_service_scope_before_setup_errors_via_dispatch() {
        let mut server = test_server();
        let resp = server
            .handle(OverlaydRequest::AddPeer {
                peer: PeerSpec {
                    public_key: "k".to_string(),
                    endpoint: "1.2.3.4:51820".to_string(),
                    allowed_ips: "10.200.0.2/32".to_string(),
                    persistent_keepalive_secs: 0,
                },
                scope: PeerScope::Service {
                    service: "x".to_string(),
                },
            })
            .await;
        match resp {
            OverlaydResponse::Err { message } => {
                assert_eq!(message, "no dedicated overlay for service x");
            }
            other => panic!("expected Err response, got {other:?}"),
        }
    }

    /// End-to-end Dedicated setup. Needs a real TUN device, so it is ignored by
    /// default and only runs on a privileged Linux host (mirrors the crate's
    /// other privileged overlay e2e tests).
    #[cfg(target_os = "linux")]
    #[tokio::test]
    #[ignore = "needs CAP_NET_ADMIN; run on a privileged Linux host"]
    async fn dedicated_setup_creates_distinct_device_and_routes_service_peer() {
        let mut server = test_server();
        // Bring up the global overlay first so the cluster CIDR + global device
        // exist (the dedicated device must get a distinct port and key).
        let global_name = server
            .setup_global_overlay(
                "dep".to_string(),
                "i0".to_string(),
                "10.200.0.0/16",
                Some("10.200.0.0/28"),
                zlayer_core::DEFAULT_WG_PORT,
                false,
            )
            .await
            .expect("global overlay up");
        assert!(!global_name.is_empty());

        // Dedicated service setup.
        let info = server
            .setup_service_overlay("web", OverlayMode::Dedicated)
            .await
            .expect("dedicated service overlay up");
        assert_eq!(info.mode, OverlayMode::Dedicated);
        let port = info.wg_port.expect("dedicated port");
        assert_ne!(
            port, server.overlay_port,
            "dedicated device must not share the global port"
        );

        let st = server
            .service_transports
            .get("web")
            .expect("service transport recorded");
        assert_eq!(st.listen_port, port);
        assert_ne!(
            st.interface, global_name,
            "dedicated interface must differ from global"
        );
        assert_eq!(
            Some(st.public_key.clone()),
            info.wg_public_key,
            "info pubkey matches recorded transport"
        );
        assert_ne!(
            Some(st.public_key.clone()),
            server.transport_public_key,
            "dedicated key must differ from global key"
        );

        // A Service-scoped AddPeer must land on the dedicated device (succeeds),
        // proving scope routing targets the per-service transport.
        let resp = server
            .handle(OverlaydRequest::AddPeer {
                peer: PeerSpec {
                    public_key: {
                        let (_priv, pubk) = OverlayTransport::generate_keys().await.unwrap();
                        pubk
                    },
                    endpoint: "5.6.7.8:51999".to_string(),
                    allowed_ips: "10.201.0.2/32".to_string(),
                    persistent_keepalive_secs: 25,
                },
                scope: PeerScope::Service {
                    service: "web".to_string(),
                },
            })
            .await;
        assert!(
            matches!(resp, OverlaydResponse::Ok),
            "service-scoped add_peer should land on the dedicated device, got {resp:?}"
        );
    }

    #[tokio::test]
    async fn guest_attach_requires_global_overlay() {
        // Without a global overlay (no node public key / transport) a
        // guest-managed attach must error rather than allocate anything.
        let mut server = test_server();
        let resp = server
            .handle(OverlaydRequest::AttachContainer {
                handle: AttachHandle::GuestManaged {
                    id: "vm-1".to_string(),
                },
                service: "web".to_string(),
                join_global: true,
                dns_server: None,
                dns_domain: None,
            })
            .await;
        match resp {
            OverlaydResponse::Err { message } => {
                assert!(
                    message.contains("global overlay to be set up"),
                    "got: {message}"
                );
            }
            other => panic!("expected Err response, got {other:?}"),
        }
        // Nothing was recorded.
        assert!(server.guest_attachments.is_empty());
    }

    #[tokio::test]
    async fn detach_unknown_guest_is_idempotent() {
        let mut server = test_server();
        // No such guest -> Ok (idempotent), no panic.
        server
            .detach_container_guest("never-attached")
            .await
            .expect("detach of unknown guest is a no-op");
    }

    /// Full guest-managed attach/detach round-trip. Needs a real TUN device (the
    /// global overlay must be live so the guest peer can be installed), so it is
    /// ignored by default and only runs on a privileged Linux host — mirrors the
    /// crate's other privileged overlay e2e tests.
    #[cfg(target_os = "linux")]
    #[tokio::test]
    #[ignore = "needs CAP_NET_ADMIN; run on a privileged Linux host"]
    async fn guest_attach_allocates_config_and_detach_releases() {
        let mut server = test_server();
        server
            .setup_global_overlay(
                "dep".to_string(),
                "i0".to_string(),
                "10.200.0.0/16",
                Some("10.200.0.0/28"),
                zlayer_core::DEFAULT_WG_PORT,
                false,
            )
            .await
            .expect("global overlay up");

        // Seed a global peer so the guest config carries it through.
        let (_p, other_pub) = OverlayTransport::generate_keys().await.unwrap();
        let add = server
            .handle(OverlaydRequest::AddPeer {
                peer: PeerSpec {
                    public_key: other_pub.clone(),
                    endpoint: "9.9.9.9:51820".to_string(),
                    allowed_ips: "10.200.1.0/28".to_string(),
                    persistent_keepalive_secs: 25,
                },
                scope: PeerScope::Global,
            })
            .await;
        assert!(
            matches!(add, OverlaydResponse::Ok),
            "seed peer add: {add:?}"
        );

        let resp = server
            .handle(OverlaydRequest::AttachContainer {
                handle: AttachHandle::GuestManaged {
                    id: "vm-1".to_string(),
                },
                service: "web".to_string(),
                join_global: true,
                dns_server: Some("10.200.0.1".parse().unwrap()),
                dns_domain: Some("overlay".to_string()),
            })
            .await;
        let config = match resp {
            OverlaydResponse::GuestConfig(c) => c,
            other => panic!("expected GuestConfig, got {other:?}"),
        };
        assert!(!config.private_key.is_empty());
        assert!(!config.public_key.is_empty());
        assert_ne!(config.private_key, config.public_key);
        assert_eq!(config.listen_port, server.overlay_port);
        assert_eq!(config.dns_server, Some("10.200.0.1".parse().unwrap()));
        // Peers = the seeded global peer + this node (self) + nothing else.
        assert!(
            config.peers.iter().any(|p| p.public_key == other_pub),
            "guest must learn the seeded global peer"
        );
        assert!(
            config
                .peers
                .iter()
                .any(|p| Some(&p.public_key) == server.transport_public_key.as_ref()),
            "guest must learn THIS node as a peer"
        );
        // The guest's own key is registered as a global peer (host route).
        assert!(server.global_peers.contains_key(&config.public_key));
        let info = server
            .guest_attachments
            .get("vm-1")
            .expect("attachment recorded");
        assert_eq!(info.overlay_ip, config.overlay_ip);

        // Detach releases the peer + IP.
        let det = server
            .handle(OverlaydRequest::DetachContainer {
                handle: AttachHandle::GuestManaged {
                    id: "vm-1".to_string(),
                },
            })
            .await;
        assert!(matches!(det, OverlaydResponse::Ok), "detach: {det:?}");
        assert!(!server.guest_attachments.contains_key("vm-1"));
        assert!(!server.global_peers.contains_key(&config.public_key));
    }
}