zlayer_overlayd/server.rs
1//! The overlayd server engine.
2//!
3//! [`OverlaydServer`] is a near 1:1 migration of the *mechanics* half of the
4//! agent's `OverlayManager`: it owns the single cluster `WireGuard`
5//! [`OverlayTransport`], the per-service Linux bridges (Linux) / HCN Internal
6//! network + endpoints (Windows), the per-node IP allocator, DNS config, and
7//! NAT traversal. The cluster-brain half (Raft, scheduler, service registry)
8//! stays in the main daemon, which drives this server over the IPC contract in
9//! [`zlayer_types::overlayd`].
10//!
11//! Every [`OverlaydRequest`] maps to a method here via [`OverlaydServer::handle`].
12
13use std::collections::HashMap;
14use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr};
15#[cfg(target_os = "linux")]
16use std::os::fd::AsFd;
17use std::path::{Path, PathBuf};
18use std::sync::atomic::{AtomicU64, Ordering};
19
20use ipnetwork::IpNetwork;
21use zlayer_overlay::nat::{RelayServerConfig, StunServerConfig, TurnServerConfig};
22use zlayer_overlay::{
23 Candidate, CandidateType, ConnectionType, NatConfig, NatTraversal, OverlayConfig,
24 OverlayTransport, PeerInfo, RelayServer,
25};
26use zlayer_types::overlayd::{
27 AttachHandle, AttachResult, DedicatedServiceStatus, GuestOverlayConfig, NatCandidateWire,
28 NatConfigSpec, NatPeerWire, NatStatusWire, OverlayMode, OverlaydRequest, OverlaydResponse,
29 PeerScope, PeerSpec, PeerStatus, ServiceOverlayInfo, StatusSnapshot,
30};
31
32use crate::error::OverlaydError;
33use crate::network_state::{
34 owner_for_service, DedicatedPortAllocator, ManagedNetwork, NetworkState,
35};
36
/// Longest interface name the Linux kernel accepts: `IFNAMSIZ` (16) minus the
/// trailing NUL byte.
const MAX_IFNAME_LEN: usize = 15;

/// Key under which the single node-wide shared bridge (`OverlayMode::Shared`)
/// is registered in [`zlayer_overlay::allocator::ServiceSubnetRegistry`].
///
/// The value is wrapped in double underscores. Service names come from
/// deployment specs and are DNS-label-shaped, so they never contain an
/// underscore and cannot collide with this sentinel. The shared bridge
/// therefore always resolves to one stable subnet that is distinct from every
/// per-service subnet.
#[cfg(target_os = "linux")]
const SHARED_BRIDGE_REGISTRY_KEY: &str = "__zlayer_shared_bridge__";

/// Build an interface name that fits the kernel's 15-byte limit.
///
/// The readable form is `zl-<parts joined by '-'>`, followed by `-<suffix>`
/// when `suffix` is non-empty. If that form fits in [`MAX_IFNAME_LEN`] bytes it
/// is returned unchanged.
///
/// Otherwise the parts and the suffix are fed into a hasher and the hex digest
/// replaces the readable middle section:
///
/// * with a suffix that leaves room for at least one hex digit, the result is
///   `zl-<hex>-<suffix>`;
/// * with no suffix, or a suffix so long that no hex digit would fit, the
///   result is `zl-<up to 12 hex digits>` and the suffix is dropped.
///
/// NOTE(review): the digest comes from `DefaultHasher`, whose algorithm the
/// standard library does not promise to keep stable across Rust releases, so
/// hashed names are only guaranteed to be reproducible for a given toolchain.
#[must_use]
pub fn make_interface_name(parts: &[&str], suffix: &str) -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    // Length of the fixed "zl-" prefix.
    const PREFIX_LEN: usize = 3;

    let mut readable = format!("zl-{}", parts.join("-"));
    if !suffix.is_empty() {
        readable = format!("{readable}-{suffix}");
    }
    if readable.len() <= MAX_IFNAME_LEN {
        return readable;
    }

    // Too long: derive the name from a digest of every part plus the suffix.
    // Each part is hashed on its own (not the slice as a whole) and the suffix
    // is always hashed, even when empty.
    let mut state = DefaultHasher::new();
    parts.iter().for_each(|part| part.hash(&mut state));
    suffix.hash(&mut state);
    let digest = format!("{:x}", state.finish());

    // Hex digits that fit between "zl-" and "-<suffix>". Zero means "do not
    // append the suffix": either there is none, or it leaves no room.
    let room_beside_suffix = if suffix.is_empty() {
        0
    } else {
        MAX_IFNAME_LEN.saturating_sub(PREFIX_LEN + 1 + suffix.len())
    };

    if room_beside_suffix == 0 {
        let keep = (MAX_IFNAME_LEN - PREFIX_LEN).min(digest.len());
        format!("zl-{}", &digest[..keep])
    } else {
        let keep = room_beside_suffix.min(digest.len());
        format!("zl-{}-{}", &digest[..keep], suffix)
    }
}
93
/// Decide whether the interface `name` should be reaped by
/// [`OverlaydServer::prune_orphan_bridges`].
///
/// An interface is an orphan only when ALL of the following hold:
///
/// * it carries our `zl-` prefix (so unrelated host links are never touched);
/// * it ends in `-b` (per-service bridge) or `-d` (dedicated device), which
///   excludes the global `-g` device, the shared `-sh` bridge and the
///   `veth-…` / `vc-…` container veths (the original notes those are reclaimed
///   by the PID-keyed `sweep_orphan_veths` instead);
/// * it appears in neither `live` (names the daemon says should exist) nor
///   `protected` (names that must be kept regardless).
///
/// The function has no side effects; it only inspects its arguments.
#[cfg(target_os = "linux")]
fn is_orphan_service_bridge(
    name: &str,
    live: &std::collections::HashSet<&str>,
    protected: &std::collections::HashSet<String>,
) -> bool {
    let is_ours = name.starts_with("zl-");
    let is_per_service = name.ends_with("-b") || name.ends_with("-d");
    let is_wanted = live.contains(name) || protected.contains(name);

    is_ours && is_per_service && !is_wanted
}
118
119/// First usable host address in `subnet`.
120///
121/// For IPv4 this is `network() + 1` (skipping the network address). For IPv6
122/// the same rule applies — the network address is conventionally reserved.
123fn first_usable_ip(subnet: ipnet::IpNet) -> IpAddr {
124 match subnet {
125 ipnet::IpNet::V4(v4) => {
126 let net = u32::from(v4.network());
127 IpAddr::V4(Ipv4Addr::from(net.wrapping_add(1)))
128 }
129 ipnet::IpNet::V6(v6) => {
130 let net = u128::from(v6.network());
131 IpAddr::V6(Ipv6Addr::from(net.wrapping_add(1)))
132 }
133 }
134}
135
/// Bridge-specific arguments handed to [`OverlaydServer::attach_to_interface`]
/// when the container being attached goes onto a per-service Linux bridge.
#[cfg(target_os = "linux")]
#[derive(Debug)]
struct BridgeAttachParams<'a> {
    /// Name of the host bridge that the host-side veth end is enslaved to.
    bridge_name: &'a str,
    /// L3 gateway address of the bridge; used as the container's default route.
    gateway: IpAddr,
    /// Prefix length of the subnet the bridge serves.
    subnet_prefix_len: u8,
}
148
/// Bookkeeping written by [`OverlaydServer::attach_container`] after a
/// successful attach and consumed by `detach_container`.
///
/// Two maps store this record: the per-PID `attached` map (Linux veth
/// attaches) and the `host_shared_attachments` map (host-shared containers,
/// exercised on macOS). The type is deliberately not platform-gated so both
/// paths can share it.
#[derive(Debug, Clone)]
struct AttachInfo {
    /// Address handed to the container on the per-service overlay (its eth0).
    service_ip: IpAddr,
    /// Service whose bridge owns `service_ip`, when there is one.
    service_name: Option<String>,
    /// Address on the global overlay (eth1). `Some` exactly when the container
    /// also joined the global overlay. Detach removes `veth-<pid>-g`
    /// unconditionally (the delete is idempotent), so no separate
    /// "joined global" flag is kept.
    ///
    /// Only the Linux veth attach/detach paths allocate and read this value.
    /// Host-shared containers (macOS/Windows) ride the node's single cluster
    /// utun and reach the global overlay through their node `/32` alias, so
    /// off Linux this stays `None` and is never read.
    #[cfg_attr(not(target_os = "linux"), allow(dead_code))]
    global_ip: Option<IpAddr>,
    /// `true` when the attach asked for the per-service bridge to be reaped as
    /// soon as the LAST container detaches (ephemeral / per-job networks).
    /// `false` for managed services, whose bridge survives scale-to-0.
    ephemeral: bool,
    /// Name of the isolated network the container joined, if any. Detach uses
    /// it to clean up the per-network L3 isolation membership.
    isolation_network: Option<String>,
}
180
/// Bookkeeping written by [`OverlaydServer::attach_container_guest`] for a
/// guest-managed attach.
///
/// Nothing here is platform-specific (no netns, veth or HCN handle): the guest
/// runs its own `WireGuard` device, and the host's part was limited to
/// allocating an address and registering the guest's public key as a global
/// peer.
#[derive(Debug, Clone)]
struct GuestAttachInfo {
    /// Overlay address allocated to the guest; released on detach.
    overlay_ip: IpAddr,
    /// Base64 public key registered for the guest on the global transport;
    /// removed on detach.
    public_key: String,
    /// Service whose bridge pool supplied `overlay_ip` (Linux service-bridge
    /// path), or `None` when the address came from the node slice. Plays the
    /// same role as `AttachInfo::service_name`: it tells detach which pool the
    /// address goes back to.
    service_name: Option<String>,
    /// Name of the isolated network the guest joined, if any; used to clean up
    /// per-network membership on detach. Enforcement inside the guest
    /// (`WireGuard` `AllowedIPs`) is wired separately — overlayd only keeps
    /// the membership map.
    isolation_network: Option<String>,
}
202
203/// Per-service Linux bridge state. One bridge per service per node; containers
204/// attach to it via veth pairs and cross-node packets ride the single cluster
205/// `OverlayTransport` with the service subnet plumbed into its `AllowedIPs`.
206#[cfg(target_os = "linux")]
207#[derive(Debug)]
208struct ServiceBridge {
209 /// Linux bridge name, kept under IFNAMSIZ-1 by [`make_interface_name`].
210 name: String,
211 /// CIDR of the service's subnet on this node.
212 subnet: ipnet::IpNet,
213 /// Gateway IP within the subnet (first usable address).
214 gateway: IpAddr,
215 /// Per-service IP allocator covering `subnet`.
216 ip_allocator: zlayer_overlay::allocator::IpAllocator,
217}
218
219/// A dedicated per-service `WireGuard` transport (`OverlayMode::Dedicated`).
220///
221/// Unlike Shared mode — where every service subnet is plumbed onto the single
222/// cluster [`OverlayTransport`] via multi-CIDR `AllowedIPs` — a Dedicated
223/// service owns a *second* real `WireGuard` device with its own crypto context,
224/// listen port, overlay IP, and subnet. The device is portable (boringtun
225/// userspace `WireGuard` works on Linux/macOS/Windows), so this struct is
226/// cross-platform; only the bridge/HCN *attachment* of containers onto it is
227/// platform-gated.
228struct ServiceTransport {
229 /// The live dedicated `WireGuard` device. Dropping it tears down the TUN.
230 transport: OverlayTransport,
231 /// Actual interface name (kernel-assigned `utunN` on macOS).
232 interface: String,
233 /// base64 public key of this dedicated device.
234 public_key: String,
235 /// UDP listen port handed out by [`DedicatedPortAllocator`].
236 listen_port: u16,
237 /// This node's overlay IP on the dedicated device.
238 overlay_ip: std::net::IpAddr,
239 /// The service's subnet carried by the dedicated device.
240 subnet: ipnet::IpNet,
241 /// Guest-attach IPAM bounded to `subnet`. VZ-Linux / WSL2 guests that join
242 /// this Dedicated service draw their overlay IP from here so they land on
243 /// the dedicated device's subnet (own crypto) rather than the node slice.
244 /// The node's own `overlay_ip` is reserved at setup so guests never collide
245 /// with it. Unused on Linux, where dedicated containers attach via a
246 /// per-service bridge that owns its own allocator.
247 #[cfg_attr(target_os = "linux", allow(dead_code))]
248 ip_allocator: zlayer_overlay::allocator::IpAllocator,
249}
250
251/// The overlay daemon engine.
252pub struct OverlaydServer {
253 /// Deployment name (used for network naming). Set by `SetupGlobalOverlay`.
254 deployment: String,
255 /// Per-daemon-process disambiguator included in overlay link names. Set by
256 /// `SetupGlobalOverlay`.
257 instance_id: String,
258 /// Root data directory; HCN markers, IPAM state, etc. live under it.
259 data_dir: PathBuf,
260 /// Global overlay interface name.
261 global_interface: Option<String>,
262 /// Global overlay transport (kept alive for the TUN device lifetime). The
263 /// SINGLE cluster-wide `WireGuard` transport; every service subnet is
264 /// plumbed through its `AllowedIPs`.
265 global_transport: Option<OverlayTransport>,
266 /// Service-name -> per-service Linux bridge / placeholder name.
267 service_interfaces: HashMap<String, String>,
268 /// Service-name -> dedicated per-service `WireGuard` transport (Dedicated
269 /// mode). Coexists with `global_transport`. Empty for Shared-only nodes.
270 service_transports: HashMap<String, ServiceTransport>,
271 /// Port allocator for dedicated devices (band above the global WG port).
272 dedicated_ports: DedicatedPortAllocator,
273 /// Per-service bridge state (Linux only).
274 #[cfg(target_os = "linux")]
275 service_bridges: HashMap<String, ServiceBridge>,
276 /// The SINGLE node-wide shared bridge backing every `OverlayMode::Shared`
277 /// service (Linux only). Created once on the first Shared-service setup and
278 /// reused for all subsequent ones; container ports are exposed via the
279 /// userspace free-port L4 proxy (`proxy_manager.rs`), not per-service
280 /// bridges. `None` until the first Shared service is set up.
281 #[cfg(target_os = "linux")]
282 shared_bridge: Option<ServiceBridge>,
283 /// Resolved per-service overlay mode, recorded at `setup_service_overlay_*`
284 /// time so the container ATTACH path knows which data-plane a service uses
285 /// (per-service bridge for `Auto`/`Dedicated` vs the single shared bridge
286 /// for `Shared`) without re-deriving it. Cross-platform.
287 service_modes: HashMap<String, OverlayMode>,
288 /// Local fallback `ServiceSubnetRegistry`. Used by the Linux Shared bridge
289 /// path and by the cross-platform Dedicated path (subnets stay globally
290 /// unique regardless of mode/OS).
291 service_subnet_registry: Option<zlayer_overlay::allocator::ServiceSubnetRegistry>,
292 /// Local raft node id used as the partition key for service-subnet assign.
293 local_node_id: u64,
294 /// Base64 `WireGuard` public key of THIS node's cluster transport, as told
295 /// by the main daemon via `SetLocalWgPubkey` (used for service-subnet
296 /// `AllowedIPs` plumbing).
297 local_wg_pubkey: Option<String>,
298 /// Public key generated for the live global transport, recorded at
299 /// `setup_global_overlay` time so `Status` can surface it (the transport
300 /// itself exposes no public-key accessor).
301 transport_public_key: Option<String>,
302 /// IP allocator for the node's overlay slice.
303 ip_allocator: IpAllocator,
304 /// This node's IP on the global overlay network.
305 node_ip: Option<IpAddr>,
306 /// `WireGuard` listen port for the overlay network.
307 overlay_port: u16,
308 /// Full cluster CIDR (e.g. `10.200.0.0/16`).
309 cluster_cidr: Option<IpNetwork>,
310 /// Per-node slice CIDR.
311 slice_cidr: Option<IpNetwork>,
312 /// Map of HCN namespace GUID -> (`service_name`, `allocated_ip`,
313 /// `isolation_network`) for autoclean. The trailing `isolation_network` lets
314 /// detach drain the per-network membership map for this container.
315 #[cfg(target_os = "windows")]
316 hcn_cleanup: HashMap<windows::core::GUID, (String, std::net::IpAddr, Option<String>)>,
317 /// Per-service container-IP allocators for Windows dedicated services. Each
318 /// is bounded to that service's subnet (not the node slice) so dedicated
319 /// containers draw addresses from their own isolated network. Keyed by
320 /// service name; created lazily on the first dedicated attach.
321 #[cfg(target_os = "windows")]
322 service_ip_allocators: HashMap<String, IpAllocator>,
323 /// Per-PID tracking of overlay attachments on Linux.
324 #[cfg(target_os = "linux")]
325 attached: HashMap<u32, AttachInfo>,
326 /// Per-isolated-network membership: network name -> the set of member
327 /// overlay (service) IPs currently attached to it. Drives per-network L3
328 /// isolation (a member reaches only its own network's members + node +
329 /// egress). Populated on attach, drained on detach, across all platforms.
330 network_members: std::collections::HashMap<String, std::collections::HashSet<IpAddr>>,
331 /// Peers installed on the GLOBAL transport via `AddPeer { Global }`, keyed by
332 /// base64 public key. Tracked here (in wire-safe [`PeerSpec`] form, with the
333 /// keys kept base64 — the boringtun UAPI dump only exposes hex keys) so a
334 /// guest-managed attach can hand the guest the exact peer set the host's own
335 /// global device carries. Platform-agnostic: the guest path runs on macOS.
336 global_peers: HashMap<String, PeerSpec>,
337 /// Guest-managed overlay attachments, keyed by the opaque container `id` from
338 /// [`AttachHandle::GuestManaged`]. Records the allocated overlay IP and the
339 /// generated public key registered in the mesh so `DetachContainer` can
340 /// release the IP and remove the peer.
341 guest_attachments: HashMap<String, GuestAttachInfo>,
342 /// Host-shared overlay attachments, keyed by the opaque container `id` from
343 /// [`AttachHandle::HostShared`] (macOS Seatbelt / native-VZ / libkrun
344 /// containers that share the node's host network namespace and its single
345 /// cluster `utun`). Records the distinct overlay `/32` allocated for the
346 /// container so `DetachContainer` can remove the utun alias, drain the
347 /// per-network L3 isolation membership, and release the IP. Cross-platform
348 /// (the host-shared path compiles everywhere; it is exercised on macOS).
349 host_shared_attachments: HashMap<String, AttachInfo>,
350 /// Overlay DNS server listen address, if one was bootstrapped.
351 dns_server_addr: Option<SocketAddr>,
352 /// DNS domain for overlay service discovery.
353 dns_domain: Option<String>,
354 /// Overlay DNS A/AAAA records this node owns (name -> ip).
355 dns_records: HashMap<String, IpAddr>,
356 /// NAT traversal configuration threaded into every `OverlayConfig`.
357 nat_config: Option<NatConfig>,
358 /// Override for `OverlayConfig::uapi_sock_dir`.
359 uapi_sock_dir: Option<PathBuf>,
360 /// Live NAT traversal orchestrator.
361 nat_traversal: Option<NatTraversal>,
362 /// Unix-epoch seconds of the last successful candidate gather / STUN refresh.
363 nat_last_refresh: AtomicU64,
364 /// NAT-traversal candidates each peer advertised, keyed by base64 public
365 /// key. Populated from `AddPeer { Global }` (the join-time candidate
366 /// exchange); the NAT maintenance tick feeds these into
367 /// `NatTraversal::connect_to_peer` to hole-punch / relay toward a peer whose
368 /// direct endpoint has not produced a recent `WireGuard` handshake.
369 peer_candidates: HashMap<String, Vec<Candidate>>,
370 /// The [`ConnectionType`] last negotiated to each peer (keyed by base64
371 /// public key), recorded by the connect loop so `NatStatus` can report
372 /// direct / hole-punched / relayed per peer.
373 peer_connection_type: HashMap<String, ConnectionType>,
374 /// Built-in relay server, started lazily on the first NAT tick when the
375 /// resolved [`NatConfig::relay_server`] is `Some`. Kept alive for the
376 /// daemon's lifetime so its background accept loop keeps running.
377 relay_server: Option<RelayServer>,
378 /// The address the built-in [`Self::relay_server`] actually bound (the real
379 /// port when `listen_port == 0`).
380 relay_bound_addr: Option<SocketAddr>,
381 /// Cluster-shared credential used to derive the built-in relay server's
382 /// `BLAKE2b` auth key. Carried in `NatConfigSpec.relay_server.auth_credential`
383 /// (the main daemon sets it from the cluster HS256 secret) so every node's
384 /// relay client derives the *same* key. `None` when no credential was
385 /// supplied (the relay then derives a key from the empty string — only nodes
386 /// that likewise have no credential can use it).
387 cluster_relay_credential: Option<String>,
388 /// Set when a `Shutdown` request has been received.
389 shutdown_requested: bool,
390 /// IPv4 `net.ipv4.ip_forward` value observed BEFORE the daemon first
391 /// enabled forwarding for an overlay container attach. `Some(prev)` is
392 /// recorded exactly once (the first time we flip it to `1`); teardown
393 /// restores `prev` so a clean shutdown reverts host routing state the
394 /// daemon turned on without clobbering an operator who set it. `None`
395 /// means the daemon never enabled IPv4 forwarding (nothing to revert).
396 #[cfg(target_os = "linux")]
397 prev_ipv4_forward: Option<String>,
398 /// Per-interface IPv6 `net.ipv6.conf.<dev>.forwarding` was enabled on
399 /// these device names for overlay routing. We enable forwarding
400 /// PER-INTERFACE (never `net.ipv6.conf.all.forwarding`, which has the
401 /// documented side effect of forcing `accept_ra=0` + `autoconf=0` on
402 /// every IPv6 interface — including the public NIC — and silently
403 /// dropping the RA-learned default route / path-MTU, which blackholes
404 /// the host's own larger reply packets). Teardown clears forwarding on
405 /// exactly these devices.
406 #[cfg(target_os = "linux")]
407 ipv6_forward_ifaces: std::collections::HashSet<String>,
408 /// Host-side veth device names THIS daemon created (`veth-<pid>-<tag>`),
409 /// recorded right after a successful `create_veth_pair`. A clean global
410 /// teardown deletes each so no host veth half is left dangling once the
411 /// overlay stops. Per-container detach may delete some of these first;
412 /// deletion is idempotent (a missing device is ignored). Only names this
413 /// daemon created are tracked — never a blanket prefix sweep that could
414 /// catch a concurrent overlay's interfaces.
415 #[cfg(target_os = "linux")]
416 created_veths: std::collections::HashSet<String>,
417 /// `zl-*` bridge device names THIS daemon created (per-service and the
418 /// node-wide shared bridge), recorded right after a successful
419 /// `create_bridge` + address + up. Deleting the bridge link on teardown
420 /// also drops its gateway address and up state, so the name alone is enough
421 /// to fully revert it.
422 #[cfg(target_os = "linux")]
423 created_bridges: std::collections::HashSet<String>,
424 /// Host `/32` (`/128`) routes to a container IP via a host-side veth that
425 /// THIS daemon installed via `replace_route_via_dev` (the bridgeless attach
426 /// path). Each entry is `(dest, prefix_len, dev)` — enough to delete the
427 /// exact route on teardown via `delete_route_via_dev`. Deletion is
428 /// idempotent (a route a prior detach already removed is ignored).
429 #[cfg(target_os = "linux")]
430 created_host_routes: Vec<(IpAddr, u8, String)>,
431}
432
/// Report whether the `WireGuard` `local_endpoint` must be forced to
/// UNSPECIFIED, given whether the daemon is running rootless.
///
/// Per the original rationale: in rootless mode `detect_physical_egress()`
/// executes inside the daemon's netns and picks up pasta's in-netns tap IP,
/// which remote peers cannot use as a WG source or advertised endpoint.
///
/// The decision is a plain function of its argument so tests can cover it
/// without writing the process-global `ZLAYER_ROOTLESS` environment variable
/// (environment writes race when tests run in parallel).
fn rootless_forces_unspecified(rootless: bool) -> bool {
    // Rootless mode is the sole trigger, so the answer is the flag itself.
    rootless
}
443
/// Decide whether failing to create the HOST overlay adapter must abort
/// overlay setup for this node.
///
/// * Linux: always fatal. The host adapter (a kernel TUN brought up through
///   netlink, with the rootless userns+netns path as fallback) IS the
///   container data path.
/// * macOS / Windows: fatal only when the caller passes
///   `host_adapter_mandatory = true`. There, Linux containers run in a VZ VM /
///   WSL2 distro that creates its OWN overlay device and meshes VM-to-VM over
///   UDP; the host adapter (utun / Wintun, which needs root / Administrator)
///   is just the host's own overlay membership and is not on the container
///   data path. A failure should therefore degrade to a VM-only overlay (warn
///   and continue) rather than abort.
///
/// Kept as a small `cfg!`-driven pure function so the decision can be unit
/// tested without provoking a real utun / Wintun failure.
fn host_adapter_failure_is_fatal(host_adapter_mandatory: bool) -> bool {
    let adapter_is_data_path = cfg!(target_os = "linux");
    adapter_is_data_path || host_adapter_mandatory
}
460
461impl OverlaydServer {
462 /// Create a fresh server bound to `data_dir`. The overlay itself is brought
463 /// up lazily by `SetupGlobalOverlay` (which carries the deployment, slice,
464 /// port, and NAT toggle from the main daemon).
465 ///
466 /// # Panics
467 /// Panics only if the compile-time-constant default CIDR `10.200.0.0/16`
468 /// fails to parse (impossible).
469 #[must_use]
470 pub fn new(data_dir: PathBuf) -> Self {
471 // Until SetupGlobalOverlay arrives, the allocator is bounded to the
472 // default cluster /16. SetupGlobalOverlay re-binds it to the node slice.
473 let default_cidr: IpNetwork = "10.200.0.0/16".parse().expect("compile-time constant CIDR");
474 let overlay_port = zlayer_core::DEFAULT_WG_PORT;
475
476 // Rehydrate the dedicated-port allocator from the on-disk marker so a
477 // service that already owns a dedicated overlay re-binds the exact UDP
478 // port it had before this process started.
479 let marker_path = zlayer_paths::ZLayerDirs::new(data_dir.clone()).agent_network_state();
480 let recorded_dedicated_ports: Vec<u16> = NetworkState::load(&marker_path)
481 .networks
482 .iter()
483 .filter(|n| n.owner.starts_with("service:"))
484 .filter_map(|n| n.wg_port)
485 .collect();
486
487 Self {
488 deployment: String::new(),
489 instance_id: String::new(),
490 data_dir,
491 global_interface: None,
492 global_transport: None,
493 service_interfaces: HashMap::new(),
494 service_transports: HashMap::new(),
495 dedicated_ports: DedicatedPortAllocator::new(overlay_port, recorded_dedicated_ports),
496 #[cfg(target_os = "linux")]
497 service_bridges: HashMap::new(),
498 #[cfg(target_os = "linux")]
499 shared_bridge: None,
500 service_modes: HashMap::new(),
501 service_subnet_registry: None,
502 local_node_id: 0,
503 local_wg_pubkey: None,
504 transport_public_key: None,
505 ip_allocator: IpAllocator::new(default_cidr),
506 node_ip: None,
507 overlay_port,
508 cluster_cidr: Some(default_cidr),
509 slice_cidr: None,
510 #[cfg(target_os = "windows")]
511 hcn_cleanup: HashMap::new(),
512 #[cfg(target_os = "windows")]
513 service_ip_allocators: HashMap::new(),
514 #[cfg(target_os = "linux")]
515 attached: HashMap::new(),
516 network_members: std::collections::HashMap::new(),
517 global_peers: HashMap::new(),
518 guest_attachments: HashMap::new(),
519 host_shared_attachments: HashMap::new(),
520 dns_server_addr: None,
521 dns_domain: None,
522 dns_records: HashMap::new(),
523 nat_config: None,
524 uapi_sock_dir: None,
525 nat_traversal: None,
526 nat_last_refresh: AtomicU64::new(0),
527 peer_candidates: HashMap::new(),
528 peer_connection_type: HashMap::new(),
529 relay_server: None,
530 relay_bound_addr: None,
531 cluster_relay_credential: None,
532 shutdown_requested: false,
533 #[cfg(target_os = "linux")]
534 prev_ipv4_forward: None,
535 #[cfg(target_os = "linux")]
536 ipv6_forward_ifaces: std::collections::HashSet::new(),
537 #[cfg(target_os = "linux")]
538 created_veths: std::collections::HashSet::new(),
539 #[cfg(target_os = "linux")]
540 created_bridges: std::collections::HashSet::new(),
541 #[cfg(target_os = "linux")]
542 created_host_routes: Vec::new(),
543 }
544 }
545
546 /// Override the `WireGuard` UAPI socket directory for every overlay
547 /// transport built by this server.
548 #[must_use]
549 pub fn with_uapi_sock_dir(mut self, dir: impl Into<PathBuf>) -> Self {
550 self.uapi_sock_dir = Some(dir.into());
551 self
552 }
553
554 /// Whether a `Shutdown` request has been received.
555 #[must_use]
556 pub fn shutdown_requested(&self) -> bool {
557 self.shutdown_requested
558 }
559
560 /// The root data directory this server was constructed with. Used by the
561 /// uninstall path (`purge_managed_networks`) and for HCN marker resolution.
562 #[must_use]
563 pub fn data_dir(&self) -> &Path {
564 &self.data_dir
565 }
566
567 // -- request dispatch ----------------------------------------------------
568
569 /// Execute one [`OverlaydRequest`], producing the [`OverlaydResponse`] the
570 /// server sends back over IPC. Any internal error is folded into
571 /// [`OverlaydResponse::Err`].
572 pub async fn handle(&mut self, req: OverlaydRequest) -> OverlaydResponse {
573 match self.dispatch(req).await {
574 Ok(resp) => resp,
575 Err(e) => OverlaydResponse::Err {
576 message: e.to_string(),
577 },
578 }
579 }
580
581 #[allow(clippy::too_many_lines)]
582 async fn dispatch(&mut self, req: OverlaydRequest) -> Result<OverlaydResponse, OverlaydError> {
583 match req {
584 OverlaydRequest::SetLocalNodeId { node_id } => {
585 self.local_node_id = node_id;
586 Ok(OverlaydResponse::Ok)
587 }
588 OverlaydRequest::SetLocalWgPubkey { pubkey } => {
589 self.local_wg_pubkey = Some(pubkey);
590 Ok(OverlaydResponse::Ok)
591 }
592 OverlaydRequest::SetupGlobalOverlay {
593 deployment,
594 instance_id,
595 cluster_cidr,
596 slice_cidr,
597 wg_port,
598 nat,
599 host_adapter_mandatory,
600 } => {
601 let name = self
602 .setup_global_overlay(
603 deployment,
604 instance_id,
605 &cluster_cidr,
606 slice_cidr.as_deref(),
607 wg_port,
608 nat,
609 host_adapter_mandatory,
610 )
611 .await?;
612 Ok(OverlaydResponse::BridgeName { name })
613 }
614 OverlaydRequest::TeardownGlobalOverlay => {
615 self.teardown_global_overlay();
616 Ok(OverlaydResponse::Ok)
617 }
618 OverlaydRequest::SetupServiceOverlay { service, mode } => {
619 let info = self.setup_service_overlay(&service, mode).await?;
620 Ok(OverlaydResponse::ServiceOverlay(info))
621 }
622 OverlaydRequest::TeardownServiceOverlay { service } => {
623 self.teardown_service_overlay(&service).await;
624 Ok(OverlaydResponse::Ok)
625 }
626 OverlaydRequest::AllocateIp {
627 service,
628 join_global,
629 } => {
630 let ip = self.allocate_ip(&service, join_global)?;
631 Ok(OverlaydResponse::Ip { ip })
632 }
633 OverlaydRequest::ReleaseIp { ip } => {
634 self.release_ip(ip);
635 Ok(OverlaydResponse::Ok)
636 }
637 OverlaydRequest::AttachContainer {
638 handle,
639 service,
640 join_global,
641 dns_server,
642 dns_domain,
643 ephemeral,
644 isolation_network,
645 } => {
646 // A guest-managed attach takes a wholly separate path: it cannot
647 // build a veth/HCN endpoint (the target is a VM, not a host
648 // process), so it allocates the overlay identity + peer set and
649 // returns it as `GuestConfig`. PID/HCN handles keep the existing
650 // veth/HCN attach and return `Attached`.
651 if let AttachHandle::GuestManaged { id } = handle {
652 // Record the overlay DNS resolver/zone the daemon staged for
653 // this node so the guest config can fall back to them (same
654 // bookkeeping `attach_container` does for the other handles).
655 if let Some(server) = dns_server {
656 self.dns_server_addr = Some(SocketAddr::new(server, 53));
657 }
658 if dns_domain.is_some() {
659 self.dns_domain.clone_from(&dns_domain);
660 }
661 let config = self
662 .attach_container_guest(
663 &id,
664 &service,
665 join_global,
666 dns_server,
667 dns_domain,
668 isolation_network,
669 )
670 .await?;
671 Ok(OverlaydResponse::GuestConfig(config))
672 } else {
673 let result = self
674 .attach_container(
675 handle,
676 &service,
677 join_global,
678 ephemeral,
679 dns_server,
680 dns_domain,
681 isolation_network,
682 )
683 .await?;
684 Ok(OverlaydResponse::Attached(result))
685 }
686 }
687 OverlaydRequest::DetachContainer { handle } => {
688 if let AttachHandle::GuestManaged { id } = handle {
689 self.detach_container_guest(&id).await?;
690 } else {
691 self.detach_container(handle).await?;
692 }
693 Ok(OverlaydResponse::Ok)
694 }
695 // `scope` selects the target device: `Global` (default) = the single
696 // cluster transport; `Service { service }` = that service's
697 // dedicated per-service transport.
698 OverlaydRequest::AddPeer { peer, scope } => {
699 let info = peer_spec_to_info(&peer)?;
700 // VM-only overlay (macOS/Windows host adapter unavailable):
701 // there is no host transport to program for the Global scope, so
702 // WARN-AND-SKIP the on-device install instead of erroring. The
703 // peer is still mirrored into `global_peers` below so guests can
704 // reproduce the global peer set via the separate guest-config
705 // push — the host simply doesn't join. `Some` transports are
706 // unaffected.
707 if matches!(scope, PeerScope::Global) && self.global_transport.is_none() {
708 tracing::warn!(
709 peer = %peer.public_key,
710 "global overlay has no host adapter (VM-only overlay); \
711 skipping host peer install — guests receive this peer via \
712 guest-config push"
713 );
714 } else {
715 let transport = self.transport_for_scope(&scope)?;
716 Self::add_peer_on(transport, &info).await?;
717 }
718 // Record the peer's advertised NAT candidates (if any) so the
719 // NAT maintenance tick can hole-punch / relay toward it. Stored
720 // for both scopes keyed by public key (the cluster transport is
721 // the one carrying packets either way). Empty candidate lists
722 // are dropped from the map so the tick's borrow loop stays cheap.
723 if peer.candidates.is_empty() {
724 self.peer_candidates.remove(&peer.public_key);
725 } else {
726 let parsed: Vec<Candidate> = peer
727 .candidates
728 .iter()
729 .filter_map(wire_to_candidate)
730 .collect();
731 if parsed.is_empty() {
732 self.peer_candidates.remove(&peer.public_key);
733 } else {
734 self.peer_candidates.insert(peer.public_key.clone(), parsed);
735 }
736 }
737 // Mirror Global peers into `global_peers` so a guest-managed
738 // attach can reproduce the host's global peer set for the guest.
739 if matches!(scope, PeerScope::Global) {
740 self.global_peers.insert(peer.public_key.clone(), peer);
741 }
742 Ok(OverlaydResponse::Ok)
743 }
744 OverlaydRequest::RemovePeer { pubkey, scope } => {
745 // VM-only overlay: no host transport for the Global scope, so the
746 // on-device removal is a no-op — just drop it from `global_peers`
747 // below. `Some` transports are unaffected.
748 if matches!(scope, PeerScope::Global) && self.global_transport.is_none() {
749 tracing::warn!(
750 peer = %pubkey,
751 "global overlay has no host adapter (VM-only overlay); \
752 skipping host peer removal"
753 );
754 } else {
755 let transport = self.transport_for_scope(&scope)?;
756 Self::remove_peer_on(transport, &pubkey).await?;
757 }
758 if matches!(scope, PeerScope::Global) {
759 self.global_peers.remove(&pubkey);
760 }
761 self.peer_candidates.remove(&pubkey);
762 self.peer_connection_type.remove(&pubkey);
763 Ok(OverlaydResponse::Ok)
764 }
765 OverlaydRequest::AddAllowedIp {
766 pubkey,
767 cidr,
768 scope,
769 } => {
770 // VM-only overlay: no host device to plumb AllowedIPs into for the
771 // Global scope — warn-and-skip. `Some` transports are unaffected.
772 if matches!(scope, PeerScope::Global) && self.global_transport.is_none() {
773 tracing::warn!(
774 peer = %pubkey,
775 cidr = %cidr,
776 "global overlay has no host adapter (VM-only overlay); \
777 skipping host AllowedIP add"
778 );
779 } else {
780 let transport = self.transport_for_scope(&scope)?;
781 Self::add_allowed_ip_on(transport, &pubkey, &cidr).await?;
782 }
783 Ok(OverlaydResponse::Ok)
784 }
785 OverlaydRequest::RemoveAllowedIp {
786 pubkey,
787 cidr,
788 scope,
789 } => {
790 // VM-only overlay: no host device for the Global scope — the
791 // removal is a no-op. `Some` transports are unaffected.
792 if matches!(scope, PeerScope::Global) && self.global_transport.is_none() {
793 tracing::warn!(
794 peer = %pubkey,
795 cidr = %cidr,
796 "global overlay has no host adapter (VM-only overlay); \
797 skipping host AllowedIP removal"
798 );
799 } else {
800 let transport = self.transport_for_scope(&scope)?;
801 Self::remove_allowed_ip_on(transport, &pubkey, &cidr).await?;
802 }
803 Ok(OverlaydResponse::Ok)
804 }
805 OverlaydRequest::RegisterDns { name, ip } => {
806 self.register_dns(name, ip);
807 Ok(OverlaydResponse::Ok)
808 }
809 OverlaydRequest::UnregisterDns { name } => {
810 self.unregister_dns(&name);
811 Ok(OverlaydResponse::Ok)
812 }
813 OverlaydRequest::WriteScopedResolver {
814 zone,
815 node_ip,
816 port,
817 } => {
818 #[cfg(target_os = "macos")]
819 {
820 zlayer_overlay::dns::write_scoped_resolver(&zone, node_ip, port).map_err(
821 |e| OverlaydError::Overlay(format!("write_scoped_resolver({zone}): {e}")),
822 )?;
823 Ok(OverlaydResponse::Ok)
824 }
825 #[cfg(not(target_os = "macos"))]
826 {
827 let _ = (zone, node_ip, port);
828 Err(OverlaydError::Overlay(
829 "scoped resolver is macOS-only".into(),
830 ))
831 }
832 }
833 OverlaydRequest::RemoveScopedResolver { zone } => {
834 #[cfg(target_os = "macos")]
835 {
836 zlayer_overlay::dns::remove_scoped_resolver(&zone).map_err(|e| {
837 OverlaydError::Overlay(format!("remove_scoped_resolver({zone}): {e}"))
838 })?;
839 Ok(OverlaydResponse::Ok)
840 }
841 #[cfg(not(target_os = "macos"))]
842 {
843 let _ = zone;
844 Err(OverlaydError::Overlay(
845 "scoped resolver is macOS-only".into(),
846 ))
847 }
848 }
849 OverlaydRequest::PruneOrphanBridges { live_bridge_names } => {
850 let reclaimed = self.prune_orphan_bridges(&live_bridge_names).await;
851 Ok(OverlaydResponse::PrunedBridges { reclaimed })
852 }
853 OverlaydRequest::Status => Ok(OverlaydResponse::Status(self.status_snapshot().await)),
854 OverlaydRequest::NatTick => {
855 self.nat_maintenance_tick().await?;
856 Ok(OverlaydResponse::Ok)
857 }
858 OverlaydRequest::NatStatus => Ok(OverlaydResponse::NatStatus(
859 self.nat_status_snapshot().await,
860 )),
861 OverlaydRequest::Shutdown => {
862 self.shutdown_requested = true;
863 self.teardown_global_overlay();
864 Ok(OverlaydResponse::Ok)
865 }
866 }
867 }
868
869 // -- global overlay ------------------------------------------------------
870
871 /// Bring up (or reuse) this node's base/global overlay.
872 ///
873 /// Idempotent: if a global transport is already live, reuse it (recreating
874 /// without this guard could yank the kernel TUN out from under the running
875 /// boringtun worker). Re-binds the IP allocator to `slice_cidr` if one is
876 /// supplied so container IPs never collide across nodes.
877 ///
878 /// # Errors
879 /// Returns an error if key generation or interface creation fails.
880 #[allow(clippy::too_many_lines)]
881 #[allow(clippy::too_many_arguments)]
882 async fn setup_global_overlay(
883 &mut self,
884 deployment: String,
885 instance_id: String,
886 cluster_cidr: &str,
887 slice_cidr: Option<&str>,
888 wg_port: u16,
889 nat: Option<NatConfigSpec>,
890 host_adapter_mandatory: bool,
891 ) -> Result<String, OverlaydError> {
892 self.deployment = deployment;
893 self.instance_id = instance_id;
894 self.overlay_port = wg_port;
895
896 let cluster: IpNetwork = cluster_cidr.parse().map_err(|e| {
897 OverlaydError::Other(format!("invalid cluster CIDR {cluster_cidr}: {e}"))
898 })?;
899 self.cluster_cidr = Some(cluster);
900 if let Some(slice) = slice_cidr {
901 let slice_net: IpNetwork = slice
902 .parse()
903 .map_err(|e| OverlaydError::Other(format!("invalid slice CIDR {slice}: {e}")))?;
904 self.slice_cidr = Some(slice_net);
905 self.ip_allocator = IpAllocator::new(slice_net);
906 }
907 // Thread the full operator-supplied NAT config (STUN/TURN servers,
908 // timeouts, relay-server bind + credential) into overlayd. `None` means
909 // the main daemon supplied no explicit config, so overlayd keeps its
910 // built-in `NatConfig::default()` (NAT enabled, Google STUN). A `Some`
911 // spec is converted verbatim — including the relay credential, stashed
912 // separately so the relay server can be stood up with a cluster-shared
913 // auth key on the first NAT tick.
914 if let Some(spec) = nat {
915 self.cluster_relay_credential = spec
916 .relay_server
917 .as_ref()
918 .and_then(|r| r.auth_credential.clone());
919 self.nat_config = Some(nat_config_spec_to_config(spec));
920 }
921
922 if let Some(name) = self.global_interface.clone() {
923 if self.global_transport.is_some() {
924 tracing::debug!(
925 deployment = %self.deployment,
926 "Global overlay already active, reusing existing transport"
927 );
928 return Ok(name);
929 }
930 }
931
932 let interface_name = make_interface_name(&[&self.deployment, &self.instance_id], "g");
933
934 let (private_key, public_key) = OverlayTransport::generate_keys()
935 .await
936 .map_err(|e| OverlaydError::Overlay(format!("Failed to generate keys: {e}")))?;
937
938 // The node's own overlay IP is the deterministic first-usable host of
939 // its slice (reserved offset 1), NOT a racy `allocate()` that drifts by
940 // allocation order. Containers draw from offset 2 onward, so the node
941 // IP is stable across restarts and never collides with a container.
942 let node_ip = self.ip_allocator.node_ip();
943 self.transport_public_key = Some(public_key.clone());
944 let physical_egress_ip = match zlayer_overlay::detect_physical_egress().await {
945 Ok(egress) => Some(egress.ip),
946 Err(e) => {
947 tracing::warn!(
948 error = %e,
949 "failed to detect physical egress; WireGuard local_endpoint \
950 will bind UNSPECIFIED for the global overlay"
951 );
952 None
953 }
954 };
955 let config = self.build_config(
956 private_key,
957 public_key,
958 node_ip,
959 16,
960 self.overlay_port,
961 physical_egress_ip,
962 );
963 // Remove any stale `-g` interface with this (now deterministic) name
964 // left by a previous daemon instance, so the create below cleanly
965 // REPLACES it instead of failing "File exists" or orphaning the old
966 // one. With a stable per-host instance id the name is constant across
967 // restarts, so exactly one global interface ever exists.
968 #[cfg(target_os = "linux")]
969 let _ = crate::netlink::delete_link_by_name(&interface_name).await;
970 let mut transport = OverlayTransport::new(config, interface_name);
971
972 // Creating the host overlay adapter is fatal on Linux (the kernel TUN IS
973 // the container data path) but only DEGRADES on macOS/Windows: there,
974 // Linux containers run in a VZ VM / WSL2 distro that creates its own
975 // overlay device and meshes VM-to-VM over UDP, so the host adapter
976 // (utun/Wintun, needs root/Administrator) is just the host's own overlay
977 // membership and is NOT on the container data path. The allocator and
978 // `node_ip` are already bound above, so guest-config push + IP allocation
979 // keep working even when the host adapter is unavailable.
980 // Map the (non-`Send`) `Box<dyn Error>` to an owned `String` BEFORE the
981 // match so no non-`Send` value is held across the `configure().await`
982 // below — the daemon's request handler future must stay `Send`.
983 let create_result = transport
984 .create_interface()
985 .await
986 .map_err(|e| e.to_string());
987 let actual_name = match create_result {
988 Ok(()) => {
989 transport.configure(&[]).await.map_err(|e| {
990 OverlaydError::Overlay(format!("Failed to configure global overlay: {e}"))
991 })?;
992 // Read back the actual interface name (on macOS, the kernel
993 // assigns utunN).
994 let actual_name = transport.interface_name().to_string();
995 self.node_ip = Some(node_ip);
996 self.global_interface = Some(actual_name.clone());
997 self.global_transport = Some(transport);
998 actual_name
999 }
1000 Err(e) if !host_adapter_failure_is_fatal(host_adapter_mandatory) => {
1001 // macOS / Windows: continue with a VM-only overlay. Leave
1002 // `global_transport == None` (the natural "no host adapter"
1003 // signal), keep `node_ip` so allocation/guest config are
1004 // unaffected, and SKIP `configure` (no device to program).
1005 tracing::warn!(
1006 error = %e,
1007 "host overlay adapter unavailable (needs root/Administrator); \
1008 continuing with VM-only overlay — the host will not join the \
1009 overlay, but containers running in the VM mesh VM-to-VM and IP \
1010 allocation/guest config are unaffected"
1011 );
1012 self.node_ip = Some(node_ip);
1013 self.global_interface = None;
1014 self.global_transport = None;
1015 // No real device exists; return an honest marker so the IPC
1016 // response is a success without implying a live adapter.
1017 "(host-adapter-disabled)".to_string()
1018 }
1019 Err(e) => {
1020 // Linux (and any future fatal-on-failure target): unchanged —
1021 // a host-adapter creation failure aborts overlay setup.
1022 return Err(OverlaydError::Overlay(format!(
1023 "Failed to create global overlay: {e}"
1024 )));
1025 }
1026 };
1027
1028 // In rootless mode the daemon runs in its own network namespace and
1029 // `pasta` provides egress NAT + inbound port forwarding; the host-table
1030 // iptables setup below is at best a no-op inside the netns and at worst
1031 // spurious, so skip it entirely. Otherwise install the host firewall
1032 // rules as usual.
1033 if std::env::var_os("ZLAYER_ROOTLESS").is_none() {
1034 // Stop systemd-networkd / NetworkManager from managing the overlay
1035 // links overlayd just created. With a permissive default match they
1036 // try to bring `zl-*` up / run DHCP and (seen on a CI runner)
1037 // SIGABRT on the networkd watchdog while processing a `zl-*` Link
1038 // UP. Best-effort; reverted in `teardown_global_overlay`.
1039 zlayer_overlay::networkd::mark_overlay_interfaces_unmanaged();
1040
1041 // Allow overlay traffic through the host firewall (UFW / firewalld /
1042 // a bare `iptables -P FORWARD DROP`). Without this, a container's DNS
1043 // query to the node overlay IP — and inter-service overlay traffic —
1044 // is dropped by the host's INPUT/FORWARD policy before it reaches
1045 // ZLayer's resolver. Best-effort: a host without `iptables` logs a
1046 // warning rather than aborting overlay setup.
1047 if let Err(e) =
1048 zlayer_overlay::firewall::ensure_overlay_subnet_rules(&cluster.to_string())
1049 {
1050 tracing::warn!(
1051 error = %e,
1052 cidr = %cluster,
1053 "failed to install overlay firewall allow-rules; service DNS / \
1054 cross-service traffic may be blocked by the host firewall"
1055 );
1056 }
1057
1058 // SNAT overlay-sourced egress so containers can reach the LAN/internet.
1059 // The allow-rules above + `ip_forward` only get the packet *forwarded*
1060 // out the WAN NIC; without masquerade it leaves with a private
1061 // `10.200.0.0/16` source and replies never route back (ENETUNREACH /
1062 // hangs for `wget http://<public-ip>`). Best-effort, same as above.
1063 if let Err(e) =
1064 zlayer_overlay::firewall::ensure_overlay_masquerade(&cluster.to_string())
1065 {
1066 tracing::warn!(
1067 error = %e,
1068 cidr = %cluster,
1069 "failed to install overlay egress masquerade; overlay containers \
1070 may be unable to reach the LAN / internet"
1071 );
1072 }
1073 } else {
1074 tracing::info!(
1075 "rootless mode: skipping host iptables (pasta provides egress + port forwarding)"
1076 );
1077 }
1078
1079 Ok(actual_name)
1080 }
1081
1082 /// Tear down the node's base overlay (e.g. on full uninstall / shutdown).
1083 fn teardown_global_overlay(&mut self) {
1084 if let Some(mut transport) = self.global_transport.take() {
1085 tracing::info!("Shutting down global overlay");
1086 transport.shutdown();
1087 }
1088 self.global_interface = None;
1089 self.transport_public_key = None;
1090
1091 // Revert host network state this daemon mutated so a clean stop
1092 // recovers connectivity WITHOUT requiring a reboot. Forwarding
1093 // sysctls and the overlay iptables chains are otherwise sticky:
1094 // they survive both the daemon stop and an `iptables -F`, so prior
1095 // to this the only way to undo them was a reboot.
1096 #[cfg(target_os = "linux")]
1097 self.revert_forwarding();
1098 zlayer_overlay::firewall::remove_overlay_masquerade();
1099 zlayer_overlay::firewall::remove_overlay_subnet_rules();
1100 // `remove_member_isolation` deliberately leaves the ZLAYER-OVERLAY-ISO
1101 // chain + its FORWARD jump resident (other members may still use them);
1102 // on a full overlay teardown remove the whole chain so nothing leaks.
1103 zlayer_overlay::firewall::remove_overlay_isolation();
1104 // macOS: strip the pf overlay anchor + the two marked `/etc/pf.conf`
1105 // lines this node installs for the cluster/DNS ports. Without this they
1106 // leak past daemon stop (the anchor file and `/etc/pf.conf` refs are
1107 // sticky on disk). Idempotent: a missing anchor / not-root / disabled-pf
1108 // case is treated as a successful no-op by the backend. cfg-gated so
1109 // Linux/Windows teardown behaviour is unchanged.
1110 #[cfg(target_os = "macos")]
1111 if let Err(e) = zlayer_overlay::firewall::remove_overlay_rules() {
1112 tracing::warn!(error = %e, "failed to remove macOS pf overlay rules during teardown");
1113 }
1114 // Remove the systemd-networkd / NetworkManager "unmanaged" drop-ins we
1115 // installed at setup so a clean stop fully reverts host network state.
1116 zlayer_overlay::networkd::unmark_overlay_interfaces_unmanaged();
1117
1118 // Revert the host-side netlink resources this daemon created (veths,
1119 // host /32 routes, bridges). The netlink helpers are async; this fn must
1120 // keep its sync signature, so bridge to the surrounding multi-thread
1121 // tokio runtime via block_in_place + Handle::block_on. Order matters:
1122 // delete routes first (they reference the veth as their oif), then the
1123 // host-side veths, then the bridges (deleting a bridge link drops its
1124 // address + up state). Every delete is best-effort + idempotent: a
1125 // resource a prior per-container detach already removed surfaces as
1126 // NotFound/ESRCH which the helpers treat as success, and a genuine
1127 // failure is logged and skipped so a partial teardown never aborts the
1128 // rest.
1129 #[cfg(target_os = "linux")]
1130 {
1131 let routes: Vec<(IpAddr, u8, String)> = std::mem::take(&mut self.created_host_routes);
1132 let veths: Vec<String> = self.created_veths.drain().collect();
1133 let bridges: Vec<String> = self.created_bridges.drain().collect();
1134
1135 let delete_all = || async {
1136 for (dest, prefix, dev) in &routes {
1137 if let Err(e) = crate::netlink::delete_route_via_dev(*dest, *prefix, dev).await
1138 {
1139 tracing::warn!(
1140 dest = %dest, prefix, dev = %dev, error = %e,
1141 "teardown: failed to delete host route (continuing)"
1142 );
1143 }
1144 }
1145 for veth in &veths {
1146 if let Err(e) = crate::netlink::delete_link_by_name(veth).await {
1147 tracing::warn!(
1148 veth = %veth, error = %e,
1149 "teardown: failed to delete host-side veth (continuing)"
1150 );
1151 }
1152 }
1153 for bridge in &bridges {
1154 if let Err(e) = crate::netlink::delete_link_by_name(bridge).await {
1155 tracing::warn!(
1156 bridge = %bridge, error = %e,
1157 "teardown: failed to delete bridge (continuing)"
1158 );
1159 }
1160 }
1161 };
1162
1163 match tokio::runtime::Handle::try_current() {
1164 Ok(handle) => {
1165 tokio::task::block_in_place(|| handle.block_on(delete_all()));
1166 }
1167 Err(_) => {
1168 // No ambient runtime (e.g. a non-async shutdown path): spin
1169 // up a throwaway current-thread runtime to drive the deletes.
1170 match tokio::runtime::Builder::new_current_thread()
1171 .enable_all()
1172 .build()
1173 {
1174 Ok(rt) => rt.block_on(delete_all()),
1175 Err(e) => tracing::warn!(
1176 error = %e,
1177 "teardown: could not build a runtime to revert netlink \
1178 resources; veths/routes/bridges left in place"
1179 ),
1180 }
1181 }
1182 }
1183 }
1184 }
1185
1186 /// Enable IP forwarding for an overlay container attach, scoped to the
1187 /// address family in use and (for IPv6) to the specific overlay devices.
1188 ///
1189 /// IPv4 has no per-interface forwarding knob that affects routing the way
1190 /// we need, so `net.ipv4.ip_forward` is global — but that is harmless for
1191 /// the host's own INPUT / reply path (it only permits the box to route
1192 /// transit traffic). We snapshot its prior value once so teardown can
1193 /// restore it.
1194 ///
1195 /// IPv6 is the dangerous case: `net.ipv6.conf.all.forwarding=1` forces
1196 /// `accept_ra=0` + `autoconf=0` on EVERY IPv6 interface, which drops the
1197 /// RA-learned default route and path-MTU on the public NIC and blackholes
1198 /// the host's own larger reply packets. We therefore enable forwarding
1199 /// only on the specific overlay device(s) via
1200 /// `net.ipv6.conf.<dev>.forwarding`, which routes overlay traffic without
1201 /// touching the physical NIC's RA / PMTU state.
1202 #[cfg(target_os = "linux")]
1203 fn enable_forwarding_for_attach(
1204 &mut self,
1205 is_v6: bool,
1206 veth_host: &str,
1207 bridge_name: Option<&str>,
1208 ) {
1209 // IPv4 forwarding (global) — required for v4 overlay egress, benign
1210 // for INPUT. Snapshot the prior value exactly once.
1211 if self.prev_ipv4_forward.is_none() {
1212 let prev = crate::netlink::read_sysctl("net.ipv4.ip_forward")
1213 .unwrap_or_else(|_| "0".to_string());
1214 self.prev_ipv4_forward = Some(prev);
1215 }
1216 let _ = crate::netlink::set_sysctl("net.ipv4.ip_forward", "1");
1217
1218 // IPv6 forwarding — PER-INTERFACE only. Enable on the host-side veth
1219 // and (when bridged) the bridge so the overlay routes, without the
1220 // `all.forwarding` RA/PMTU side effect on the physical NIC. The Linux
1221 // sysctl name uses '/' for the interface segment escaped to '.' by
1222 // set_sysctl's dot-translation — so pass the device name with any
1223 // literal dots intact (overlay device names never contain dots).
1224 if is_v6 {
1225 for dev in std::iter::once(veth_host).chain(bridge_name) {
1226 let key = format!("net.ipv6.conf.{dev}.forwarding");
1227 if crate::netlink::set_sysctl(&key, "1").is_ok() {
1228 self.ipv6_forward_ifaces.insert(dev.to_string());
1229 }
1230 }
1231 }
1232 }
1233
1234 /// Revert the forwarding sysctls this daemon enabled (counterpart of
1235 /// [`Self::enable_forwarding_for_attach`]). Restores the snapshotted IPv4
1236 /// value and clears per-interface IPv6 forwarding on exactly the devices
1237 /// we touched. Best-effort: a failed write (device already gone, `/proc`
1238 /// not writable) is ignored — the worst case is the pre-existing sticky
1239 /// state, never a crash on shutdown.
1240 #[cfg(target_os = "linux")]
1241 fn revert_forwarding(&mut self) {
1242 if let Some(prev) = self.prev_ipv4_forward.take() {
1243 let _ = crate::netlink::set_sysctl("net.ipv4.ip_forward", &prev);
1244 }
1245 for dev in self.ipv6_forward_ifaces.drain() {
1246 let key = format!("net.ipv6.conf.{dev}.forwarding");
1247 let _ = crate::netlink::set_sysctl(&key, "0");
1248 }
1249 }
1250
1251 // -- service overlay -----------------------------------------------------
1252
1253 /// Set up the per-service Linux bridge that backs `service` on this node.
1254 ///
1255 /// Returns the bridge name on success.
1256 ///
1257 /// # Errors
1258 /// Returns an error if subnet assignment fails (exhaustion), if the bridge
1259 /// cannot be created, or if the cluster transport rejects the `AllowedIPs`
1260 /// update.
1261 #[cfg(target_os = "linux")]
1262 async fn setup_service_overlay(
1263 &mut self,
1264 service: &str,
1265 mode: OverlayMode,
1266 ) -> Result<ServiceOverlayInfo, OverlaydError> {
1267 // Decision surface is the two predicates on `OverlayMode` (see
1268 // `zlayer_types::overlay`), not an ad-hoc variant match:
1269 // - uses_shared_bridge() -> the single node-wide shared bridge (+ the
1270 // userspace free-port L4 proxy wired in `proxy_manager.rs`).
1271 // - uses_per_service_wg() -> a dedicated per-service WireGuard device.
1272 // - uses_isolation_scope() -> Isolated: Auto topology here; the L3
1273 // fence is applied at ATTACH time via `isolation_network`.
1274 // - otherwise (Auto) -> per-service Linux bridge carried on the
1275 // single cluster-wide WireGuard interface (today's default).
1276 // Record the resolved mode so the container ATTACH path can branch.
1277 let resolved = mode.resolve();
1278 self.service_modes.insert(service.to_string(), resolved);
1279 if resolved.uses_shared_bridge() {
1280 self.setup_service_overlay_shared_bridge(service).await
1281 } else if resolved.uses_per_service_wg() {
1282 self.setup_service_overlay_dedicated(service).await
1283 } else if resolved.uses_isolation_scope() {
1284 // Isolated == Auto topology (per-service bridge on the cluster-wide
1285 // WireGuard); the L3 fence is applied at ATTACH time via
1286 // `isolation_network`, not in segment setup. Same target as the
1287 // default, made explicit so a new mode can't silently fall through.
1288 self.setup_service_overlay_cluster_wg(service).await
1289 } else {
1290 self.setup_service_overlay_cluster_wg(service).await
1291 }
1292 }
1293
1294 /// `Auto`-mode per-service overlay (Linux): a per-service Linux bridge backed
1295 /// by the SINGLE cluster-wide `WireGuard` transport (the service subnet is
1296 /// plumbed onto the cluster device's `AllowedIPs`). This is the original
1297 /// default `setup_service_overlay` body, returning a [`ServiceOverlayInfo`]
1298 /// with the bridge name and all dedicated-device identity fields `None`
1299 /// (`Auto` shares the cluster device).
1300 ///
1301 /// Returns the bridge name on success.
1302 ///
1303 /// # Errors
1304 /// Returns an error if subnet assignment fails (exhaustion), if the bridge
1305 /// cannot be created, or if the cluster transport rejects the `AllowedIPs`
1306 /// update.
1307 #[cfg(target_os = "linux")]
1308 #[allow(clippy::too_many_lines)]
1309 async fn setup_service_overlay_cluster_wg(
1310 &mut self,
1311 service: &str,
1312 ) -> Result<ServiceOverlayInfo, OverlaydError> {
1313 // 1. Idempotency check.
1314 if let Some(existing) = self.service_bridges.get(service) {
1315 let name = existing.name.clone();
1316 tracing::debug!(service = %service, bridge = %name, "Service bridge already active, reusing");
1317 return Ok(cluster_wg_overlay_info(name));
1318 }
1319
1320 // 2. Assign subnet via the (currently local) ServiceSubnetRegistry.
1321 self.ensure_service_subnet_registry()?;
1322 let subnet: ipnet::IpNet = {
1323 let registry = self
1324 .service_subnet_registry
1325 .as_mut()
1326 .expect("ensure_service_subnet_registry leaves Some");
1327 let node_key = self.local_node_id.to_string();
1328 registry.assign(service, &node_key).map_err(|e| {
1329 OverlaydError::Overlay(format!(
1330 "ServiceSubnetRegistry::assign({service}, {node_key}) failed: {e}"
1331 ))
1332 })?
1333 };
1334
1335 // 3+4+6. Create the per-service Linux bridge, assign its gateway, bring
1336 // it up, build the per-service IpAllocator, and record it.
1337 let bridge_name = self.create_service_bridge(service, subnet).await?;
1338
1339 // 5. Plumb subnet into the cluster transport's local AllowedIPs so the
1340 // single cluster device carries this service's cross-node traffic
1341 // (Shared mode shares one crypto context for every service).
1342 if let Some(ref cluster) = self.global_transport {
1343 if let Some(ref pubkey) = self.local_wg_pubkey {
1344 if let Err(e) = cluster.add_allowed_ip(pubkey, subnet).await {
1345 tracing::warn!(
1346 service = %service,
1347 subnet = %subnet,
1348 error = %e,
1349 "Failed to add service subnet to cluster transport AllowedIPs (non-fatal)"
1350 );
1351 }
1352 } else {
1353 tracing::debug!(service = %service, "local_wg_pubkey not yet set; skipping cluster AllowedIPs update");
1354 }
1355 }
1356
1357 Ok(cluster_wg_overlay_info(bridge_name))
1358 }
1359
1360 /// `Shared`-mode per-service overlay (Linux): attach `service` onto the
1361 /// SINGLE node-wide shared Linux bridge (created once, reused by every
1362 /// Shared service on this node), carried on the cluster-wide `WireGuard`
1363 /// interface. There is NO per-service bridge and NO per-service `WireGuard`;
1364 /// container ports are exposed via the userspace free-port L4 proxy
1365 /// (`proxy_manager.rs`). Returns the shared bridge name.
1366 ///
1367 /// Idempotent: the shared bridge is allocated a single subnet and brought up
1368 /// exactly once; subsequent Shared services reuse it. The service is recorded
1369 /// in `service_interfaces` (pointing at the shared bridge) so presence checks
1370 /// and the attach path resolve it.
1371 ///
1372 /// # Errors
1373 /// Returns an error if the one-time shared-subnet assignment fails
1374 /// (exhaustion), if the shared bridge cannot be created, or if the cluster
1375 /// transport rejects the `AllowedIPs` update.
1376 #[cfg(target_os = "linux")]
1377 async fn setup_service_overlay_shared_bridge(
1378 &mut self,
1379 service: &str,
1380 ) -> Result<ServiceOverlayInfo, OverlaydError> {
1381 let bridge_name = self.ensure_shared_bridge().await?;
1382 // Point this service at the shared bridge so presence checks succeed and
1383 // the attach path resolves it to the shared bridge.
1384 self.service_interfaces
1385 .insert(service.to_string(), bridge_name.clone());
1386 tracing::info!(service = %service, bridge = %bridge_name, "Service attached to shared node-wide bridge");
1387 Ok(shared_overlay_info(bridge_name))
1388 }
1389
1390 /// Ensure the single node-wide shared Linux bridge exists, returning its
1391 /// name. Created once with its own subnet (drawn from the same
1392 /// `ServiceSubnetRegistry` every service subnet comes from, under a fixed
1393 /// reserved key so it never collides with a real service) and plumbed onto
1394 /// the cluster transport's `AllowedIPs` so shared containers are
1395 /// mesh-reachable across nodes. Subsequent calls return the existing name.
1396 ///
1397 /// # Errors
1398 /// Returns an error if subnet assignment fails or the bridge cannot be
1399 /// created/addressed/brought up.
1400 #[cfg(target_os = "linux")]
1401 async fn ensure_shared_bridge(&mut self) -> Result<String, OverlaydError> {
1402 use zlayer_overlay::allocator::IpAllocator as OverlayIpAllocator;
1403
1404 if let Some(existing) = self.shared_bridge.as_ref() {
1405 return Ok(existing.name.clone());
1406 }
1407
1408 // One subnet for the whole shared bridge. Use a fixed reserved key in the
1409 // registry (never a real service name) so the shared bridge gets exactly
1410 // one stable subnet, distinct from every per-service subnet.
1411 self.ensure_service_subnet_registry()?;
1412 let subnet: ipnet::IpNet = {
1413 let registry = self
1414 .service_subnet_registry
1415 .as_mut()
1416 .expect("ensure_service_subnet_registry leaves Some");
1417 let node_key = self.local_node_id.to_string();
1418 registry.assign(SHARED_BRIDGE_REGISTRY_KEY, &node_key).map_err(|e| {
1419 OverlaydError::Overlay(format!(
1420 "ServiceSubnetRegistry::assign({SHARED_BRIDGE_REGISTRY_KEY}, {node_key}) failed: {e}"
1421 ))
1422 })?
1423 };
1424
1425 // Deterministic, IFNAMSIZ-safe shared-bridge name (one per node). Use the
1426 // same naming helper as per-service bridges with a fixed key so it stays
1427 // <= 15 chars and is unambiguous (`zl-...-sh`).
1428 let bridge_name =
1429 make_interface_name(&[&self.deployment, &self.instance_id, "shared"], "sh");
1430
1431 if let Err(e) = crate::netlink::create_bridge(&bridge_name).await {
1432 return Err(OverlaydError::Overlay(format!(
1433 "create_bridge({bridge_name}) failed: {e}"
1434 )));
1435 }
1436 if let Err(e) = crate::netlink::set_bridge_stp(&bridge_name, false) {
1437 tracing::warn!(bridge = %bridge_name, error = %e, "set_bridge_stp(off) failed (non-fatal)");
1438 }
1439
1440 // Flush stale addresses first: `create_bridge` is idempotent on EEXIST, so
1441 // a shared bridge that survived a restart would otherwise accumulate a
1442 // second gateway (the same dual-address bug fixed for per-service bridges).
1443 let gateway = first_usable_ip(subnet);
1444 if let Err(e) = crate::netlink::flush_addresses_on_link_by_name(&bridge_name).await {
1445 tracing::warn!(bridge = %bridge_name, error = %e, "flush_addresses_on_link_by_name failed (non-fatal)");
1446 }
1447 if let Err(e) =
1448 crate::netlink::add_address_to_link_by_name(&bridge_name, gateway, subnet.prefix_len())
1449 .await
1450 {
1451 let _ = crate::netlink::delete_bridge(&bridge_name).await;
1452 return Err(OverlaydError::Overlay(format!(
1453 "add_address_to_link_by_name({bridge_name}, {gateway}/{}) failed: {e}",
1454 subnet.prefix_len()
1455 )));
1456 }
1457 if let Err(e) = crate::netlink::set_link_up_by_name(&bridge_name).await {
1458 let _ = crate::netlink::delete_bridge(&bridge_name).await;
1459 return Err(OverlaydError::Overlay(format!(
1460 "set_link_up_by_name({bridge_name}) failed: {e}"
1461 )));
1462 }
1463
1464 // Track the shared bridge for global teardown (deleting the link drops
1465 // its gateway address + up state).
1466 self.created_bridges.insert(bridge_name.clone());
1467
1468 let mut ip_allocator = OverlayIpAllocator::new(&subnet.to_string()).map_err(|e| {
1469 OverlaydError::Overlay(format!("IpAllocator::new({subnet}) failed: {e}"))
1470 })?;
1471 let _ = ip_allocator.allocate_specific(gateway);
1472
1473 // Plumb the shared subnet onto the cluster transport's AllowedIPs so the
1474 // single cluster device carries shared-bridge cross-node traffic (same
1475 // mechanism the cluster-WG per-service path uses).
1476 if let Some(ref cluster) = self.global_transport {
1477 if let Some(ref pubkey) = self.local_wg_pubkey {
1478 if let Err(e) = cluster.add_allowed_ip(pubkey, subnet).await {
1479 tracing::warn!(
1480 subnet = %subnet,
1481 error = %e,
1482 "Failed to add shared-bridge subnet to cluster transport AllowedIPs (non-fatal)"
1483 );
1484 }
1485 } else {
1486 tracing::debug!(
1487 "local_wg_pubkey not yet set; skipping shared-bridge cluster AllowedIPs update"
1488 );
1489 }
1490 }
1491
1492 self.shared_bridge = Some(ServiceBridge {
1493 name: bridge_name.clone(),
1494 subnet,
1495 gateway,
1496 ip_allocator,
1497 });
1498
1499 tracing::info!(bridge = %bridge_name, subnet = %subnet, gateway = %gateway, "Shared node-wide bridge created");
1500 Ok(bridge_name)
1501 }
1502
1503 /// Create the per-service Linux bridge for `service` on `subnet`, assign its
1504 /// gateway, bring it up, build the per-service [`IpAllocator`], and record it
1505 /// in `service_bridges` + `service_interfaces`. Returns the bridge name.
1506 ///
1507 /// Shared and Dedicated mode share this bridge mechanic verbatim — the ONLY
1508 /// difference between the two modes is which `WireGuard` device the service
1509 /// subnet/peers are plumbed onto (the single cluster transport for Shared,
1510 /// the dedicated per-service transport for Dedicated). This helper does NOT
1511 /// touch any transport's `AllowedIPs`; the caller does that against the
1512 /// device it owns.
1513 ///
1514 /// # Errors
1515 /// Returns an error if the bridge cannot be created, addressed, or brought
1516 /// up, or if the per-service `IpAllocator` cannot be built.
1517 #[cfg(target_os = "linux")]
1518 async fn create_service_bridge(
1519 &mut self,
1520 service: &str,
1521 subnet: ipnet::IpNet,
1522 ) -> Result<String, OverlaydError> {
1523 use zlayer_overlay::allocator::IpAllocator as OverlayIpAllocator;
1524
1525 let bridge_name = make_interface_name(&[&self.deployment, &self.instance_id, service], "b");
1526
1527 if let Err(e) = crate::netlink::create_bridge(&bridge_name).await {
1528 return Err(OverlaydError::Overlay(format!(
1529 "create_bridge({bridge_name}) failed: {e}"
1530 )));
1531 }
1532 if let Err(e) = crate::netlink::set_bridge_stp(&bridge_name, false) {
1533 tracing::warn!(bridge = %bridge_name, error = %e, "set_bridge_stp(off) failed (non-fatal)");
1534 }
1535
1536 // Gateway = first usable host in the subnet, assigned to the bridge.
1537 // Flush any pre-existing addresses FIRST: `create_bridge` is idempotent
1538 // on EEXIST, so a bridge that survived a restart would otherwise keep its
1539 // old gateway and we'd stack the new one on top (the observed dual
1540 // /28 + /26 bug). Flushing makes the assignment idempotent and self-heals
1541 // such bridges. Non-fatal: on a brand-new bridge there is nothing to flush.
1542 let gateway = first_usable_ip(subnet);
1543 if let Err(e) = crate::netlink::flush_addresses_on_link_by_name(&bridge_name).await {
1544 tracing::warn!(bridge = %bridge_name, error = %e, "flush_addresses_on_link_by_name failed (non-fatal)");
1545 }
1546 if let Err(e) =
1547 crate::netlink::add_address_to_link_by_name(&bridge_name, gateway, subnet.prefix_len())
1548 .await
1549 {
1550 let _ = crate::netlink::delete_bridge(&bridge_name).await;
1551 return Err(OverlaydError::Overlay(format!(
1552 "add_address_to_link_by_name({bridge_name}, {gateway}/{}) failed: {e}",
1553 subnet.prefix_len()
1554 )));
1555 }
1556 if let Err(e) = crate::netlink::set_link_up_by_name(&bridge_name).await {
1557 let _ = crate::netlink::delete_bridge(&bridge_name).await;
1558 return Err(OverlaydError::Overlay(format!(
1559 "set_link_up_by_name({bridge_name}) failed: {e}"
1560 )));
1561 }
1562
1563 // Track the per-service bridge for global teardown (deleting the link
1564 // drops its gateway address + up state).
1565 self.created_bridges.insert(bridge_name.clone());
1566
1567 // Build per-service IpAllocator, reserve the gateway.
1568 let mut ip_allocator = OverlayIpAllocator::new(&subnet.to_string()).map_err(|e| {
1569 OverlaydError::Overlay(format!("IpAllocator::new({subnet}) failed: {e}"))
1570 })?;
1571 let _ = ip_allocator.allocate_specific(gateway);
1572
1573 self.service_bridges.insert(
1574 service.to_string(),
1575 ServiceBridge {
1576 name: bridge_name.clone(),
1577 subnet,
1578 gateway,
1579 ip_allocator,
1580 },
1581 );
1582 self.service_interfaces
1583 .insert(service.to_string(), bridge_name.clone());
1584
1585 tracing::info!(service = %service, bridge = %bridge_name, subnet = %subnet, gateway = %gateway, "Service bridge created");
1586 Ok(bridge_name)
1587 }
1588
/// Non-Linux variant of `setup_service_overlay`: record the resolved mode for
/// `service`, then dispatch to the matching per-mode setup.
///
/// On Windows the per-service segment is the HCN network created lazily at
/// attach time, and on macOS containers fall through to host networking, so
/// for the non-dedicated modes this only registers a placeholder in
/// `service_interfaces` so presence checks work.
///
/// # Errors
/// The `Auto`, `Shared` and `Isolated` paths only register a placeholder.
/// The dedicated path stands up a real `WireGuard` device and propagates any
/// error from [`Self::setup_service_overlay_dedicated`].
#[cfg(not(target_os = "linux"))]
async fn setup_service_overlay(
    &mut self,
    service: &str,
    mode: OverlayMode,
) -> Result<ServiceOverlayInfo, OverlaydError> {
    // Same predicate-driven decision surface as Linux (see
    // `zlayer_types::overlay`). The container ATTACH path differentiates the
    // modes per-OS; here we only record the resolved mode and register the
    // appropriate placeholder/info so presence checks and `Status` work.
    let resolved = mode.resolve();
    self.service_modes.insert(service.to_string(), resolved);

    // uses_per_service_wg() -> the cross-platform dedicated path (a real
    // per-service WireGuard device; on Windows it also stands up a
    // per-service HCN Internal network at attach time).
    if resolved.uses_per_service_wg() {
        return self.setup_service_overlay_dedicated(service).await;
    }

    // uses_shared_bridge() -> no per-service WireGuard device. On macOS this
    // relies on VZ NAT + host-port forwarding; on Windows the attach path
    // reads the recorded mode to send `Shared` containers onto a shared HCN
    // NAT network.
    if resolved.uses_shared_bridge() {
        return self.setup_service_overlay_shared_bridge(service).await;
    }

    // Everything else — `Auto` and `Isolated` — uses the cluster-WG
    // topology. Isolated differs only by the L3 fence, which is applied at
    // ATTACH time via `isolation_network`, not in segment setup.
    self.setup_service_overlay_cluster_wg(service).await
}
1635
/// `Auto`-mode per-service overlay on non-Linux hosts.
///
/// Nothing is provisioned here: on Windows the segment is the node's base
/// overlay HCN network used at attach time, and on macOS containers ride VZ
/// NAT. The service is only registered in `service_interfaces` under a
/// deterministic placeholder name so presence checks succeed.
///
/// # Errors
/// Infallible on non-Linux; the `Result` is preserved for ABI parity.
#[cfg(not(target_os = "linux"))]
#[allow(clippy::unused_async)]
async fn setup_service_overlay_cluster_wg(
    &mut self,
    service: &str,
) -> Result<ServiceOverlayInfo, OverlaydError> {
    // Same name the Linux path would give the per-service bridge.
    let iface_name = make_interface_name(&[&self.deployment, &self.instance_id, service], "b");
    let key = service.to_string();
    self.service_interfaces.insert(key, iface_name.clone());

    tracing::debug!(service = %service, "Service overlay bridge setup is Linux-only; using direct networking placeholder");

    let info = cluster_wg_overlay_info(iface_name);
    Ok(info)
}
1655
/// `Shared`-mode per-service overlay on non-Linux hosts. No per-service
/// `WireGuard` device and no per-service bridge exist in this mode:
/// - macOS: the container is a VZ VM behind VZ NAT (a single shared host
///   adapter with host-port forwarding); its ports are exposed by the
///   userspace free-port L4 proxy, so only a placeholder is needed here.
/// - Windows: containers attach to a SINGLE shared HCN NAT network reused by
///   all Shared services and created lazily at attach time; again only a
///   placeholder interface is registered here.
///
/// The placeholder is recorded in `service_interfaces` under the service
/// name.
///
/// # Errors
/// Infallible on non-Linux; the `Result` is preserved for ABI parity.
#[cfg(not(target_os = "linux"))]
#[allow(clippy::unused_async)]
async fn setup_service_overlay_shared_bridge(
    &mut self,
    service: &str,
) -> Result<ServiceOverlayInfo, OverlaydError> {
    // One name for every Shared service on this node: it identifies the
    // shared data-plane rather than a per-service interface.
    let shared_name = make_interface_name(&[&self.deployment, &self.instance_id, "shared"], "sh");
    let key = service.to_string();
    self.service_interfaces.insert(key, shared_name.clone());

    tracing::debug!(service = %service, "Shared-mode service uses the node-wide shared data-plane (VZ NAT on macOS / shared HCN NAT on Windows)");

    let info = shared_overlay_info(shared_name);
    Ok(info)
}
1685
1686 /// Dedicated-mode per-service overlay: stand up a *second* real `WireGuard`
1687 /// device for `service` with its own crypto context, listen port, overlay
1688 /// IP, and subnet — distinct from the single cluster transport.
1689 ///
1690 /// The cross-platform core (identity, subnet assign, transport bring-up,
1691 /// marker persist, status) runs on every OS; only the *attachment* of
1692 /// containers onto the device is platform-gated:
1693 /// - Linux: a per-service bridge (same mechanic as Shared) routed over the
1694 /// dedicated device instead of the cluster device.
1695 /// - Windows: a per-service HCN Internal network (a later task; a clearly
1696 /// marked seam returns an error here for now).
1697 /// - macOS: nothing further — the utun device is the attachment.
1698 ///
1699 /// # Errors
1700 /// Returns an error if port/key/subnet allocation, transport bring-up,
1701 /// marker persistence, or the platform attachment fails.
1702 #[allow(clippy::too_many_lines)]
1703 async fn setup_service_overlay_dedicated(
1704 &mut self,
1705 service: &str,
1706 ) -> Result<ServiceOverlayInfo, OverlaydError> {
1707 // ----- cross-platform core (runs on every OS) -----
1708
1709 // 1. Idempotency: an existing dedicated transport returns its identity.
1710 if let Some(st) = self.service_transports.get(service) {
1711 return Ok(dedicated_overlay_info(
1712 st.interface.clone(),
1713 &st.public_key,
1714 st.listen_port,
1715 st.overlay_ip,
1716 st.subnet,
1717 ));
1718 }
1719
1720 // 2. Identity: reuse a stable identity from the marker if one exists
1721 // (so the device re-binds the same key + port across restarts),
1722 // otherwise mint a fresh port + keypair + interface name.
1723 let marker_path =
1724 zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();
1725 let recorded = NetworkState::load(&marker_path)
1726 .get(&owner_for_service(service))
1727 .cloned();
1728
1729 let (private_key, public_key, listen_port, iface_hint) = match recorded.as_ref() {
1730 Some(entry)
1731 if entry.wg_private_key.is_some()
1732 && entry.wg_public_key.is_some()
1733 && entry.wg_port.is_some()
1734 && entry.interface.is_some() =>
1735 {
1736 let port = entry.wg_port.expect("checked above");
1737 self.dedicated_ports.reserve(port);
1738 (
1739 entry.wg_private_key.clone().expect("checked above"),
1740 entry.wg_public_key.clone().expect("checked above"),
1741 port,
1742 entry.interface.clone().expect("checked above"),
1743 )
1744 }
1745 _ => {
1746 let port = self.dedicated_ports.allocate()?;
1747 let (priv_key, pub_key) = OverlayTransport::generate_keys()
1748 .await
1749 .map_err(|e| OverlaydError::Overlay(format!("Failed to generate keys: {e}")))?;
1750 let iface =
1751 make_interface_name(&[&self.deployment, &self.instance_id, service], "d");
1752 (priv_key, pub_key, port, iface)
1753 }
1754 };
1755
1756 // 3. Subnet: assign from the same registry Shared uses, so per-service
1757 // subnets stay globally unique regardless of mode.
1758 self.ensure_service_subnet_registry()?;
1759 let subnet: ipnet::IpNet = {
1760 let registry = self
1761 .service_subnet_registry
1762 .as_mut()
1763 .expect("ensure_service_subnet_registry leaves Some");
1764 let node_key = self.local_node_id.to_string();
1765 registry.assign(service, &node_key).map_err(|e| {
1766 OverlaydError::Overlay(format!(
1767 "ServiceSubnetRegistry::assign({service}, {node_key}) failed: {e}"
1768 ))
1769 })?
1770 };
1771 let overlay_ip = first_usable_ip(subnet);
1772
1773 // 4. Build + bring up the dedicated transport. The device's overlay CIDR
1774 // is the service subnet (so boringtun routes that subnet over THIS
1775 // device), and its listen port is the dedicated port.
1776 let physical_egress_ip = match zlayer_overlay::detect_physical_egress().await {
1777 Ok(egress) => Some(egress.ip),
1778 Err(e) => {
1779 tracing::warn!(
1780 error = %e,
1781 service = %service,
1782 "failed to detect physical egress; WireGuard local_endpoint \
1783 will bind UNSPECIFIED for the dedicated overlay"
1784 );
1785 None
1786 }
1787 };
1788 let config = self.build_config(
1789 private_key.clone(),
1790 public_key.clone(),
1791 overlay_ip,
1792 subnet.prefix_len(),
1793 listen_port,
1794 physical_egress_ip,
1795 );
1796 let mut transport = OverlayTransport::new(config, iface_hint);
1797 transport.create_interface().await.map_err(|e| {
1798 OverlaydError::Overlay(format!(
1799 "Failed to create dedicated overlay for {service}: {e}"
1800 ))
1801 })?;
1802 transport.configure(&[]).await.map_err(|e| {
1803 OverlaydError::Overlay(format!(
1804 "Failed to configure dedicated overlay for {service}: {e}"
1805 ))
1806 })?;
1807 let actual_iface = transport.interface_name().to_string();
1808
1809 // 5. Persist the marker so the identity survives restarts. Match the
1810 // base/Shared entry shape (owner/kind/name/id/subnet) plus the
1811 // dedicated WG fields.
1812 let mut marker = NetworkState::load(&marker_path);
1813 marker.upsert(ManagedNetwork {
1814 owner: owner_for_service(service),
1815 kind: "wg-dedicated".to_string(),
1816 name: actual_iface.clone(),
1817 id: public_key.clone(),
1818 subnet: subnet.to_string(),
1819 wg_port: Some(listen_port),
1820 wg_private_key: Some(private_key),
1821 wg_public_key: Some(public_key.clone()),
1822 interface: Some(actual_iface.clone()),
1823 });
1824 if let Err(e) = marker.save(&marker_path) {
1825 tracing::warn!(service = %service, error = %e, path = %marker_path.display(), "failed to persist dedicated-overlay marker (device still live)");
1826 }
1827
1828 // 6. Record the live transport. Build the guest-attach IPAM bounded to
1829 // the service subnet, reserving the node's own dedicated-device IP so
1830 // a joining guest never draws it.
1831 let mut ip_allocator = zlayer_overlay::allocator::IpAllocator::new(&subnet.to_string())
1832 .map_err(|e| {
1833 OverlaydError::Overlay(format!("IpAllocator::new({subnet}) failed: {e}"))
1834 })?;
1835 let _ = ip_allocator.allocate_specific(overlay_ip);
1836 self.service_transports.insert(
1837 service.to_string(),
1838 ServiceTransport {
1839 transport,
1840 interface: actual_iface.clone(),
1841 public_key: public_key.clone(),
1842 listen_port,
1843 overlay_ip,
1844 subnet,
1845 ip_allocator,
1846 },
1847 );
1848
1849 tracing::info!(
1850 service = %service,
1851 interface = %actual_iface,
1852 listen_port,
1853 subnet = %subnet,
1854 overlay_ip = %overlay_ip,
1855 "Dedicated per-service overlay device created"
1856 );
1857
1858 // ----- platform-gated attachment -----
1859 // `name` in the returned info is the container-attach handle: the bridge
1860 // name on Linux, the dedicated interface elsewhere.
1861 let name = self
1862 .attach_dedicated_service(service, subnet, overlay_ip)
1863 .await?;
1864
1865 Ok(dedicated_overlay_info(
1866 name,
1867 &public_key,
1868 listen_port,
1869 overlay_ip,
1870 subnet,
1871 ))
1872 }
1873
1874 /// Linux attachment for a dedicated per-service overlay: create the same
1875 /// per-service bridge Shared uses, but route the service subnet over the
1876 /// DEDICATED device rather than the cluster device.
1877 ///
1878 /// Concretely, the dedicated transport's overlay CIDR already covers
1879 /// `subnet` (set at `build_config` time in the core), so boringtun routes
1880 /// `subnet` out the dedicated TUN; we additionally plumb `subnet` onto this
1881 /// node's own `AllowedIPs` entry on the dedicated device so locally
1882 /// originated packets to the subnet are accepted. Returns the bridge name.
1883 ///
1884 /// # Errors
1885 /// Returns an error if the bridge cannot be created.
1886 #[cfg(target_os = "linux")]
1887 async fn attach_dedicated_service(
1888 &mut self,
1889 service: &str,
1890 subnet: ipnet::IpNet,
1891 overlay_ip: IpAddr,
1892 ) -> Result<String, OverlaydError> {
1893 let _ = overlay_ip;
1894 let bridge_name = self.create_service_bridge(service, subnet).await?;
1895
1896 // Plumb the service subnet onto the DEDICATED device (not the cluster
1897 // device). The dedicated transport's overlay CIDR already routes the
1898 // subnet out its TUN; adding it to our own pubkey's AllowedIPs keeps the
1899 // local-accept side consistent with the Shared path's cluster plumbing.
1900 if let Some(st) = self.service_transports.get(service) {
1901 if let Some(ref pubkey) = self.local_wg_pubkey {
1902 if let Err(e) = st.transport.add_allowed_ip(pubkey, subnet).await {
1903 tracing::warn!(
1904 service = %service,
1905 subnet = %subnet,
1906 error = %e,
1907 "Failed to add service subnet to dedicated transport AllowedIPs (non-fatal)"
1908 );
1909 }
1910 } else {
1911 tracing::debug!(service = %service, "local_wg_pubkey not yet set; skipping dedicated AllowedIPs update");
1912 }
1913 }
1914
1915 Ok(bridge_name)
1916 }
1917
/// Windows attachment for a dedicated per-service overlay.
///
/// By the time this runs the cross-platform core has already brought up the
/// dedicated Wintun transport (the encrypted node-to-node path for the
/// service subnet). This step provides the *container-facing* side: a
/// per-service HCN **Internal** network that the agent's containers attach
/// to instead of the node's shared base overlay network, so
/// dedicated-service traffic is isolated at the vSwitch layer.
///
/// Returns the per-service network's name, which the caller records as the
/// [`ServiceOverlayInfo::name`] attach handle.
///
/// # Errors
/// Propagates any error from [`Self::ensure_service_network`].
#[cfg(target_os = "windows")]
async fn attach_dedicated_service(
    &mut self,
    service: &str,
    subnet: ipnet::IpNet,
    _overlay_ip: IpAddr,
) -> Result<String, OverlaydError> {
    // Create (or reuse) the per-service Internal HCN network. Its GUID is
    // recorded in the marker under `owner_for_service(service)`, and the
    // `AttachContainer` handler finds it again through that same lookup, so
    // the value itself is not needed here.
    let _network_id = self.ensure_service_network(service, subnet).await?;

    // Attach handle = "<base overlay network name>-svc-<service>".
    let daemon = self.deployment_or_default();
    let handle = format!("{}-svc-{service}", overlay_network_name(&daemon));
    Ok(handle)
}
1948
/// macOS attachment for a dedicated per-service overlay.
///
/// The cross-platform core has already brought up a utun device and there is
/// no bridge on this platform, so the attach handle is simply the recorded
/// interface name of the service's dedicated transport. If no transport is
/// recorded for `service`, an empty name is returned.
#[cfg(all(not(target_os = "linux"), not(target_os = "windows")))]
#[allow(clippy::unused_async)]
async fn attach_dedicated_service(
    &mut self,
    service: &str,
    _subnet: ipnet::IpNet,
    _overlay_ip: IpAddr,
) -> Result<String, OverlaydError> {
    match self.service_transports.get(service) {
        Some(st) => Ok(st.interface.clone()),
        None => Ok(String::new()),
    }
}
1967
1968 /// Tear down the per-service segment for `service`. Idempotent.
1969 // Only the Linux body awaits (netlink + cluster AllowedIPs); other targets
1970 // are synchronous (transport shutdown is sync) but must keep the async
1971 // signature for the dispatch call.
1972 #[cfg_attr(not(target_os = "linux"), allow(clippy::unused_async))]
1973 async fn teardown_service_overlay(&mut self, service: &str) {
1974 // Drop the recorded mode; a `Shared` service's containers no longer route
1975 // to the shared bridge once it is gone. The node-wide shared bridge
1976 // itself is deliberately NOT torn down here — other Shared services reuse
1977 // it (it is reclaimed only on full overlay teardown / uninstall).
1978 self.service_modes.remove(service);
1979
1980 // Auto-mode segment teardown (per-service bridge on Linux, placeholder
1981 // elsewhere). A Shared-mode service has no per-service bridge, so
1982 // `service_bridges.remove` is a no-op for it (its `service_interfaces`
1983 // placeholder pointing at the shared bridge is removed below).
1984 #[cfg(target_os = "linux")]
1985 {
1986 let removed = self.service_bridges.remove(service);
1987 self.service_interfaces.remove(service);
1988
1989 // Remove the subnet from the cluster AllowedIPs only when we still
1990 // know it (the in-memory entry survived).
1991 if let Some(ref bridge) = removed {
1992 if let Some(ref cluster) = self.global_transport {
1993 if let Some(ref pubkey) = self.local_wg_pubkey {
1994 if let Err(e) = cluster.remove_allowed_ip(pubkey, bridge.subnet).await {
1995 tracing::warn!(
1996 service = %service,
1997 subnet = %bridge.subnet,
1998 error = %e,
1999 "Failed to remove service subnet from cluster AllowedIPs (non-fatal)"
2000 );
2001 }
2002 }
2003 }
2004 }
2005
2006 // Delete the physical bridge by its DETERMINISTIC name, regardless of
2007 // whether the in-memory entry survived. After an overlayd restart the
2008 // `service_bridges` map is empty, so a delete gated on `Some(..)` would
2009 // silently leak the `zl-…-b` link forever (the observed orphan/linkdown
2010 // bridges). `delete_bridge` no-ops on ENODEV, so deleting an absent link
2011 // is safe — and the `-b` suffix never collides with a Shared service's
2012 // shared `-sh` bridge, so this can't tear down the wrong thing.
2013 let bridge_name = removed.as_ref().map_or_else(
2014 || make_interface_name(&[&self.deployment, &self.instance_id, service], "b"),
2015 |b| b.name.clone(),
2016 );
2017 if let Err(e) = crate::netlink::delete_bridge(&bridge_name).await {
2018 tracing::warn!(service = %service, bridge = %bridge_name, error = %e, "delete_bridge failed (non-fatal)");
2019 }
2020
2021 // Release the subnet-registry slot by service name (works whether or
2022 // not the in-memory entry survived).
2023 if let Some(registry) = self.service_subnet_registry.as_mut() {
2024 let node_key = self.local_node_id.to_string();
2025 let _ = registry.release(service, &node_key);
2026 }
2027
2028 if removed.is_some() {
2029 tracing::info!(service = %service, bridge = %bridge_name, "Tore down service bridge");
2030 } else {
2031 tracing::debug!(service = %service, bridge = %bridge_name, "best-effort delete of (possibly absent) service bridge by name");
2032 }
2033 }
2034 #[cfg(not(target_os = "linux"))]
2035 {
2036 if let Some(iface) = self.service_interfaces.remove(service) {
2037 tracing::info!(service = %service, interface = %iface, "Removed service overlay interface (placeholder, non-Linux)");
2038 }
2039 }
2040
2041 // Dedicated-mode teardown (cross-platform): tear down the per-service
2042 // transport, free its port, and drop its marker entry. No-op when the
2043 // service ran in Shared mode (nothing in `service_transports`).
2044 if let Some(mut st) = self.service_transports.remove(service) {
2045 st.transport.shutdown();
2046 self.dedicated_ports.release(st.listen_port);
2047
2048 // Release the subnet assignment (Shared releases it inside the
2049 // Linux block above; the dedicated subnet lives in the same
2050 // registry, so release it here for the dedicated case on every OS).
2051 if let Some(registry) = self.service_subnet_registry.as_mut() {
2052 let node_key = self.local_node_id.to_string();
2053 let _ = registry.release(service, &node_key);
2054 }
2055
2056 let marker_path =
2057 zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();
2058 let mut marker = NetworkState::load(&marker_path);
2059 let removed_entry = marker.remove(&owner_for_service(service));
2060 if removed_entry.is_some() {
2061 if let Err(e) = marker.save(&marker_path) {
2062 tracing::warn!(service = %service, error = %e, path = %marker_path.display(), "failed to persist dedicated-overlay marker removal");
2063 }
2064 }
2065
2066 // Windows: delete the per-service HCN Internal network this service
2067 // owned. The marker entry's `id` is the bare HCN GUID (set by
2068 // `ensure_service_network`); delete the network so a dedicated
2069 // service tears down cleanly without waiting for a full uninstall.
2070 // Also drop the per-service container-IP allocator.
2071 #[cfg(target_os = "windows")]
2072 {
2073 self.service_ip_allocators.remove(service);
2074 if let Some(entry) = removed_entry.as_ref() {
2075 if entry.kind == "hcn-internal" {
2076 if let Ok(guid) = windows::core::GUID::try_from(entry.id.as_str()) {
2077 match zlayer_hns::network::Network::delete(guid) {
2078 Ok(()) => {
2079 tracing::info!(service = %service, id = %entry.id, "deleted per-service HCN network");
2080 }
2081 Err(e) => {
2082 tracing::warn!(service = %service, id = %entry.id, error = %e, "failed to delete per-service HCN network (may leak until uninstall)");
2083 }
2084 }
2085 } else {
2086 tracing::warn!(service = %service, id = %entry.id, "per-service marker has unparseable HCN GUID; skipping network delete");
2087 }
2088 }
2089 }
2090 }
2091 #[cfg(not(target_os = "windows"))]
2092 drop(removed_entry);
2093
2094 tracing::info!(
2095 service = %service,
2096 interface = %st.interface,
2097 listen_port = st.listen_port,
2098 "Tore down dedicated per-service overlay device"
2099 );
2100 }
2101 }
2102
2103 /// Reclaim orphaned per-service host bridges (and their stale device veths)
2104 /// that no live deployment still owns. `live_bridge_names` is the full set of
2105 /// `zl-…-b` bridge names every currently-restored service SHOULD own,
2106 /// computed by the main daemon from storage.
2107 ///
2108 /// For every host link whose name looks like one of OUR per-service bridge
2109 /// (`…-b`) or dedicated device (`…-d`) interfaces but is NOT in `live` and is
2110 /// NOT the active global (`-g`) or shared (`-sh`) interface, we:
2111 /// 1. delete the link (idempotent — ENODEV is success),
2112 /// 2. release its service-subnet registry assignment + cluster `AllowedIPs`
2113 /// when the `(service, node)` key can be recovered from the registry
2114 /// snapshot by reproducing the deterministic bridge name, and
2115 /// 3. drop any stale in-memory `service_bridges`/`service_interfaces`
2116 /// entries pointing at it.
2117 ///
2118 /// Best-effort + idempotent: a failure on one link is logged and the sweep
2119 /// continues. Returns the names actually reclaimed.
2120 #[cfg(target_os = "linux")]
2121 async fn prune_orphan_bridges(&mut self, live_bridge_names: &[String]) -> Vec<String> {
2122 use std::collections::HashSet;
2123
2124 let links = match crate::netlink::list_all_links().await {
2125 Ok(links) => links,
2126 Err(e) => {
2127 tracing::warn!(error = %e, "prune_orphan_bridges: failed to list host links");
2128 return Vec::new();
2129 }
2130 };
2131
2132 let live: HashSet<&str> = live_bridge_names.iter().map(String::as_str).collect();
2133
2134 // The interfaces we must NEVER reclaim even though they carry the `zl-`
2135 // prefix: the active global transport device and the node-wide shared
2136 // bridge. (Container veths `veth-…`/`vc-…` are handled by the separate
2137 // PID-keyed `sweep_orphan_veths`; here we only target service bridges +
2138 // dedicated device interfaces, which `sweep_orphan_veths` never touches.)
2139 let mut protected: HashSet<String> = HashSet::new();
2140 if let Some(g) = self.global_interface.clone() {
2141 protected.insert(g);
2142 }
2143 if let Some(ref sh) = self.shared_bridge {
2144 protected.insert(sh.name.clone());
2145 }
2146 // Protect every dedicated-service WireGuard transport (`…-d`) by name. A
2147 // `-d` is a WG device, not a bridge — it has no `brif`, so the zero-member
2148 // guard below treats it as 0 members, and the daemon's `live` set only
2149 // carries `…-b` names; without this it would be reaped as a live device.
2150 //
2151 // We deliberately do NOT blanket-protect `service_bridges` (`…-b`) here.
2152 // That map holds BOTH managed-service bridges AND standalone/per-job
2153 // bridges (e.g. a Runner's per-job network), and overlayd cannot tell
2154 // them apart — a standalone container's `DetachContainer` releases the
2155 // veth/IP but never removes the bridge or its `service_bridges` entry, so
2156 // a blanket protect shielded those orphans forever (only a restart, which
2157 // wipes the map, ever cleared them). Managed bridges stay protected by
2158 // being in the daemon's authoritative `live` set; standalone bridges are
2159 // not in storage, so they fall through to the zero-member guard and are
2160 // reclaimed once idle.
2161 for st in self.service_transports.values() {
2162 protected.insert(st.interface.clone());
2163 }
2164
2165 // Snapshot the subnet registry once so we can recover the `(service,
2166 // node)` key for an orphan by reproducing its deterministic bridge/device
2167 // name. The registry has no release-by-subnet API, so we map name ->
2168 // (service, node) here.
2169 let mut name_to_key: HashMap<String, (String, String, ipnet::IpNet)> = HashMap::new();
2170 if let Some(registry) = self.service_subnet_registry.as_ref() {
2171 for ((service, node), subnet) in registry.snapshot().assignments {
2172 let bridge =
2173 make_interface_name(&[&self.deployment, &self.instance_id, &service], "b");
2174 let device =
2175 make_interface_name(&[&self.deployment, &self.instance_id, &service], "d");
2176 name_to_key.insert(bridge, (service.clone(), node.clone(), subnet));
2177 name_to_key.insert(device, (service, node, subnet));
2178 }
2179 }
2180
2181 let mut reclaimed = Vec::new();
2182 for (_index, name) in links {
2183 // Only consider OUR per-service bridge (`-b`) or dedicated device
2184 // (`-d`) interfaces that are neither live nor protected. The pure
2185 // predicate (unit-tested in `orphan_bridge_selection`) keeps us off
2186 // unrelated host links, the global/shared interfaces, and the veth
2187 // namespaces.
2188 if !is_orphan_service_bridge(&name, &live, &protected) {
2189 continue;
2190 }
2191
2192 // Zero-member guard: only reclaim a non-live candidate once it is
2193 // IDLE — no member links. A `-b` bridge with a running container has
2194 // ≥1 veth in its `brif`, so an in-use (or a sub-ms mid-creation,
2195 // pre-attach is the only 0-member window) standalone bridge is left
2196 // alone; an orphan `-d` has no `brif` (0) and is correctly reaped.
2197 // This is what makes dropping the `service_bridges` blanket-protect
2198 // safe — a live managed bridge is already excluded by `live`, and any
2199 // other in-use bridge is excluded here.
2200 if crate::netlink::bridge_member_count(&name).await > 0 {
2201 continue;
2202 }
2203
2204 tracing::info!(link = %name, "prune_orphan_bridges: reclaiming orphan service bridge/device");
2205
2206 // 1. Release the subnet + cluster AllowedIPs when we can recover the
2207 // owning service key from the registry.
2208 if let Some((service, node, subnet)) = name_to_key.get(&name).cloned() {
2209 if let Some(ref cluster) = self.global_transport {
2210 if let Some(ref pubkey) = self.local_wg_pubkey {
2211 if let Err(e) = cluster.remove_allowed_ip(pubkey, subnet).await {
2212 tracing::warn!(
2213 link = %name,
2214 subnet = %subnet,
2215 error = %e,
2216 "prune_orphan_bridges: remove_allowed_ip failed (non-fatal)"
2217 );
2218 }
2219 }
2220 }
2221 if let Some(registry) = self.service_subnet_registry.as_mut() {
2222 let _ = registry.release(&service, &node);
2223 }
2224 }
2225
2226 // 2. Delete the link itself (idempotent).
2227 if let Err(e) = crate::netlink::delete_bridge(&name).await {
2228 tracing::warn!(link = %name, error = %e, "prune_orphan_bridges: delete_bridge failed (non-fatal)");
2229 continue;
2230 }
2231
2232 // 3. Drop any stale in-memory bookkeeping pointing at this link.
2233 self.service_bridges.retain(|_, b| b.name != name);
2234 self.service_interfaces.retain(|_, iface| *iface != name);
2235
2236 reclaimed.push(name);
2237 }
2238
2239 if !reclaimed.is_empty() {
2240 tracing::info!(count = reclaimed.len(), bridges = ?reclaimed, "prune_orphan_bridges: reclaimed orphaned service bridges/devices");
2241 }
2242 reclaimed
2243 }
2244
/// Orphan-bridge sweep for hosts that are not Linux.
///
/// Per-service bridges are a Linux-only mechanic: Windows uses HCN networks
/// (torn down in `teardown_service_overlay`) and macOS rides VZ NAT. There is
/// therefore no host bridge link to reclaim here, so the live-name list is
/// ignored and the returned list of reclaimed links is always empty.
#[cfg(not(target_os = "linux"))]
#[allow(clippy::unused_async, clippy::unused_self)]
async fn prune_orphan_bridges(&mut self, _live_bridge_names: &[String]) -> Vec<String> {
    // Nothing to sweep on this platform.
    vec![]
}
2253
2254 /// Initialize the local fallback `ServiceSubnetRegistry` from the configured
2255 /// cluster CIDR. Called on first `setup_service_overlay` use.
2256 ///
2257 /// # Errors
2258 /// Returns an error when no cluster CIDR is configured or the registry
2259 /// cannot be built.
2260 fn ensure_service_subnet_registry(&mut self) -> Result<(), OverlaydError> {
2261 use zlayer_overlay::allocator::ServiceSubnetRegistry;
2262
2263 if self.service_subnet_registry.is_some() {
2264 return Ok(());
2265 }
2266 let cluster_cidr = self.cluster_cidr.ok_or_else(|| {
2267 OverlaydError::Other(
2268 "service subnet registry needs a cluster CIDR (SetupGlobalOverlay first)"
2269 .to_string(),
2270 )
2271 })?;
2272 let cluster_ipnet: ipnet::IpNet = cluster_cidr.to_string().parse().map_err(|e| {
2273 OverlaydError::Other(format!(
2274 "failed to convert cluster CIDR {cluster_cidr} to ipnet::IpNet: {e}"
2275 ))
2276 })?;
2277 // Per-service bridge slice prefix. `/26` (V4) = ~61 usable container
2278 // IPs per service per node — keep in sync with
2279 // `zlayer_scheduler::raft::DEFAULT_SERVICE_SUBNET_SLICE_PREFIX` (the
2280 // canonical default; not imported here to avoid a dependency cycle).
2281 // The older `/28` (13 usable) exhausted under CI churn.
2282 let slice_prefix: u8 = match cluster_ipnet {
2283 ipnet::IpNet::V4(_) => 26,
2284 ipnet::IpNet::V6(_) => 120,
2285 };
2286 let mut registry =
2287 ServiceSubnetRegistry::new(cluster_ipnet, slice_prefix).map_err(|e| {
2288 OverlaydError::Other(format!("failed to build ServiceSubnetRegistry: {e}"))
2289 })?;
2290 // Reserve the node's own overlay IP so no per-service bridge subnet
2291 // overlaps it — the overlay DNS server listens on `<node_ip>:53`, and a
2292 // bridge subnet containing that IP would black-hole its containers' DNS
2293 // (they'd ARP for the node IP on their bridge, where nothing answers).
2294 if let Some(node_ip) = self.node_ip {
2295 registry.reserve_ip(node_ip);
2296 }
2297 self.service_subnet_registry = Some(registry);
2298 Ok(())
2299 }
2300
2301 // -- IP allocation -------------------------------------------------------
2302
2303 /// Allocate an overlay IP from the per-service bridge (Linux) or the node
2304 /// slice (otherwise). `join_global` reserves a second global-overlay IP too,
2305 /// matching the eth1 attach behavior.
2306 ///
2307 /// # Errors
2308 /// Returns an error if the relevant pool is exhausted.
2309 fn allocate_ip(&mut self, service: &str, join_global: bool) -> Result<IpAddr, OverlaydError> {
2310 // `join_global` does not allocate a second IP here: the companion
2311 // global-overlay IP (eth1) is reserved at attach time. `AllocateIp`
2312 // returns only the primary (service / slice) IP the caller asked for.
2313 let _ = join_global;
2314 #[cfg(target_os = "linux")]
2315 {
2316 // A Shared-mode service draws from the single node-wide shared bridge;
2317 // every other mode draws from its own per-service bridge.
2318 let use_shared = self
2319 .service_modes
2320 .get(service)
2321 .copied()
2322 .unwrap_or_default()
2323 .uses_shared_bridge();
2324 if use_shared {
2325 if let Some(bridge) = self.shared_bridge.as_mut() {
2326 return bridge.ip_allocator.allocate().ok_or_else(|| {
2327 OverlaydError::Overlay(format!(
2328 "shared bridge {} subnet {} exhausted",
2329 bridge.name, bridge.subnet
2330 ))
2331 });
2332 }
2333 } else if let Some(bridge) = self.service_bridges.get_mut(service) {
2334 return bridge.ip_allocator.allocate().ok_or_else(|| {
2335 OverlaydError::Overlay(format!(
2336 "service bridge {} subnet {} exhausted",
2337 bridge.name, bridge.subnet
2338 ))
2339 });
2340 }
2341 }
2342 let _ = service;
2343 self.ip_allocator.allocate()
2344 }
2345
2346 /// Return an overlay IP to the allocator (service-bridge pool when known,
2347 /// otherwise the node slice).
2348 fn release_ip(&mut self, ip: IpAddr) {
2349 #[cfg(target_os = "linux")]
2350 {
2351 if let Some(bridge) = self.shared_bridge.as_mut() {
2352 if bridge.subnet.contains(&ip) {
2353 bridge.ip_allocator.release(ip);
2354 return;
2355 }
2356 }
2357 for bridge in self.service_bridges.values_mut() {
2358 if bridge.subnet.contains(&ip) {
2359 bridge.ip_allocator.release(ip);
2360 return;
2361 }
2362 }
2363 }
2364 self.ip_allocator.release(ip);
2365 }
2366
2367 // -- container attach (Linux) -------------------------------------------
2368
2369 /// Wire a container into the overlay and return its [`AttachResult`].
2370 ///
2371 /// # Errors
2372 /// Returns an error if the container cannot be attached.
2373 #[allow(clippy::too_many_arguments)]
2374 async fn attach_container(
2375 &mut self,
2376 handle: AttachHandle,
2377 service: &str,
2378 join_global: bool,
2379 ephemeral: bool,
2380 dns_server: Option<IpAddr>,
2381 dns_domain: Option<String>,
2382 isolation_network: Option<String>,
2383 ) -> Result<AttachResult, OverlaydError> {
2384 // Record the overlay DNS resolver/zone the main daemon staged for this
2385 // node so later attaches (and the Windows HCN endpoint `Dns` schema)
2386 // can fall back to them when a per-attach value isn't supplied.
2387 if let Some(server) = dns_server {
2388 self.dns_server_addr = Some(SocketAddr::new(server, 53));
2389 }
2390 if dns_domain.is_some() {
2391 self.dns_domain.clone_from(&dns_domain);
2392 }
2393 match handle {
2394 AttachHandle::LinuxPid { pid } => {
2395 let ip = self
2396 .attach_container_linux(pid, service, join_global, ephemeral, isolation_network)
2397 .await?;
2398 Ok(AttachResult {
2399 ip,
2400 namespace_guid: None,
2401 })
2402 }
2403 AttachHandle::WindowsContainer { container_id, ip } => {
2404 self.attach_container_windows(
2405 &container_id,
2406 service,
2407 ip,
2408 dns_server,
2409 dns_domain,
2410 isolation_network,
2411 )
2412 .await
2413 }
2414 AttachHandle::HostShared { id } => {
2415 let ip = self
2416 .attach_container_host_shared(&id, service, ephemeral, isolation_network)
2417 .await?;
2418 Ok(AttachResult {
2419 ip,
2420 namespace_guid: None,
2421 })
2422 }
2423 AttachHandle::GuestManaged { .. } => Err(OverlaydError::Other(
2424 "guest-managed attach must go through attach_container_guest, not attach_container"
2425 .to_string(),
2426 )),
2427 }
2428 }
2429
2430 /// Tear down a container's overlay attachment and release its IP.
2431 ///
2432 /// # Errors
2433 /// Returns an error only if a netlink delete fails for a reason other than
2434 /// "link not found".
2435 async fn detach_container(&mut self, handle: AttachHandle) -> Result<(), OverlaydError> {
2436 match handle {
2437 AttachHandle::LinuxPid { pid } => self.detach_container_linux(pid).await,
2438 AttachHandle::WindowsContainer { container_id, .. } => {
2439 self.detach_container_windows(&container_id).await
2440 }
2441 AttachHandle::HostShared { id } => self.detach_container_host_shared(&id).await,
2442 AttachHandle::GuestManaged { .. } => Err(OverlaydError::Other(
2443 "guest-managed detach must go through detach_container_guest, not detach_container"
2444 .to_string(),
2445 )),
2446 }
2447 }
2448
2449 // -- container attach (guest-managed) -----------------------------------
2450
/// Guest-managed overlay attach: allocate the overlay identity for a VM guest
/// that brings up its own kernel `WireGuard` device.
///
/// overlayd cannot enter the guest's network namespace (it is a VM, not a
/// host process), so instead of a veth/HCN endpoint it:
/// 1. allocates the overlay IP. On Linux this comes from the shared bridge
///    (Shared mode) or the service's own bridge when one exists, otherwise the
///    node slice. On non-Linux hosts it comes from the service's dedicated
///    transport when the service mode uses a per-service `WireGuard` device,
///    otherwise the node slice;
/// 2. generates a fresh `WireGuard` keypair for the guest;
/// 3. builds the peer set the guest must configure. This is exactly ONE peer:
///    this node (its dedicated per-service device in Dedicated mode, the
///    global cluster device otherwise), with a 25 s keepalive so the guest
///    keeps its NAT mapping open from behind VZ NAT. All other overlay traffic
///    is routed through this node;
/// 4. registers the generated public key as a peer on the matching transport
///    (`PeerScope::Service` in Dedicated mode, `PeerScope::Global` otherwise)
///    with a host route (`/32`, or `/128` for IPv6) and the unset
///    `0.0.0.0:0` endpoint, so the roaming endpoint is learned from the
///    guest's keepalive;
/// 5. records per-network membership / node-side L3 isolation when
///    `isolation_network` is set, and records the attachment keyed by `id` so
///    `DetachContainer` can release the IP and remove the peer.
///
/// Platform-agnostic: pure IPAM + keygen + peer bookkeeping (no netns/veth/
/// HCN), so it compiles and runs on macOS (where the overlayd serving a VZ
/// host lives) as well as Linux.
///
/// # Errors
/// Returns an error if the global overlay is not set up, the IP pool is
/// exhausted, key generation fails, the guest peer spec cannot be converted,
/// or registering the guest peer fails.
#[allow(clippy::cast_possible_truncation, clippy::too_many_lines)]
async fn attach_container_guest(
    &mut self,
    id: &str,
    service: &str,
    join_global: bool,
    dns_server: Option<IpAddr>,
    dns_domain: Option<String>,
    isolation_network: Option<String>,
) -> Result<GuestOverlayConfig, OverlaydError> {
    // The global transport must exist: we both register the guest as a peer
    // on it and advertise this node (its public key + listen port) to the
    // guest. Resolve both up front so we fail before allocating anything.
    let node_public_key = self.transport_public_key.clone().ok_or_else(|| {
        OverlaydError::Other(
            "guest-managed attach requires the global overlay to be set up first \
             (no node WireGuard public key)"
                .to_string(),
        )
    })?;
    if self.global_transport.is_none() {
        return Err(OverlaydError::Other(
            "guest-managed attach requires the global overlay to be set up first \
             (no global transport)"
                .to_string(),
        ));
    }

    // 1. Allocate the overlay IP from the same pool the Linux attach uses and
    //    derive the prefix length from that pool's network. On Linux a
    //    per-service bridge (when present) supplies both the IP and its
    //    subnet's prefix; otherwise (and on every non-Linux host) the node
    //    slice / cluster CIDR does.
    //
    //    The tuple is (IP, prefix length, owning service for release — `None`
    //    means the node slice — and whether the dedicated per-service device
    //    is in use). On Linux `dedicated` is always `false`.
    let (overlay_ip, prefix_len, pool_service, dedicated): (IpAddr, u8, Option<String>, bool) = {
        #[cfg(target_os = "linux")]
        {
            let use_shared = self
                .service_modes
                .get(service)
                .copied()
                .unwrap_or_default()
                .uses_shared_bridge();
            let bridge = if use_shared {
                self.shared_bridge.as_mut()
            } else {
                self.service_bridges.get_mut(service)
            };
            if let Some(bridge) = bridge {
                let ip = bridge.ip_allocator.allocate().ok_or_else(|| {
                    OverlaydError::Overlay(format!(
                        "service bridge {} subnet {} exhausted",
                        bridge.name, bridge.subnet
                    ))
                })?;
                let prefix = bridge.subnet.prefix_len();
                (ip, prefix, Some(service.to_string()), false)
            } else {
                let ip = self.ip_allocator.allocate()?;
                (ip, self.slice_prefix_len(), None, false)
            }
        }
        #[cfg(not(target_os = "linux"))]
        {
            // A Dedicated service owns a second WireGuard device (own crypto +
            // subnet); its guest draws from that device's allocator and lands
            // on the dedicated subnet, not the global cluster mesh. Every other
            // mode hairpins through the node slice on the global transport.
            let dedicated = self
                .service_modes
                .get(service)
                .copied()
                .unwrap_or_default()
                .uses_per_service_wg();
            if dedicated {
                let st = self.service_transports.get_mut(service).ok_or_else(|| {
                    OverlaydError::Other(format!(
                        "Dedicated service {service} has no dedicated overlay; \
                         call setup_service_overlay first"
                    ))
                })?;
                let ip = st.ip_allocator.allocate().ok_or_else(|| {
                    OverlaydError::Overlay(format!(
                        "dedicated service {service} subnet {} exhausted",
                        st.subnet
                    ))
                })?;
                (ip, st.subnet.prefix_len(), Some(service.to_string()), true)
            } else {
                let ip = self.ip_allocator.allocate()?;
                (ip, self.slice_prefix_len(), None, false)
            }
        }
    };
    // `join_global` is informational for a guest-managed attach: the guest's
    // single WireGuard device IS its global-overlay endpoint, so there is no
    // separate eth1 IP to reserve. Touch it so callers stay consistent with
    // the Linux/Windows handles.
    let _ = join_global;

    // 2. Generate the guest's WireGuard keypair (reuse the transport's
    //    native x25519 keygen — never reimplement curve25519 here).
    let (private_key, public_key) = OverlayTransport::generate_keys().await.map_err(|e| {
        // Roll back the IP allocation so a keygen failure leaks nothing.
        self.release_guest_ip(overlay_ip, pool_service.as_deref());
        OverlaydError::Overlay(format!("failed to generate guest keys: {e}"))
    })?;

    // 3. Build the peer set. A VZ guest is behind the host's NAT and can only
    //    reach the LOCAL node (via its NAT gateway) — it cannot dial other
    //    nodes' or sibling guests' endpoints directly. So it gets exactly ONE
    //    peer: this node. ALL overlay traffic (including to sibling containers
    //    and remote nodes) routes through this node, which forwards/hairpins it
    //    (the node already holds a /32 peer for every container — step 4 — and
    //    the real inter-node peers). We deliberately do NOT add the per-guest
    //    /32 peers here: a /32 with no reachable endpoint would win
    //    longest-prefix routing and black-hole sibling traffic. The endpoint
    //    returned here is the node's overlay IP as a placeholder; the VZ
    //    runtime rewrites it to the guest's NAT gateway (the only host address
    //    the guest can reach) before delivering the config. Keepalive holds the
    //    guest's NAT mapping open so the node can reach back.
    //
    //    Dedicated mode: the single peer is this node's DEDICATED per-service
    //    device (its own pubkey + listen port + subnet as AllowedIPs), so the
    //    guest joins that service's isolated mesh. Every other mode peers with
    //    the global cluster device, AllowedIPs = the whole cluster CIDR
    //    (falling back to the slice CIDR, then to `0.0.0.0/0`, when unset).
    let (peer_pubkey, peer_listen_port, peer_allowed) = if dedicated {
        // `dedicated` is only true after the `service_transports` lookup in
        // step 1 succeeded, so the entry is present here.
        let st = self
            .service_transports
            .get(service)
            .expect("dedicated transport allocated above");
        (st.public_key.clone(), st.listen_port, st.subnet.to_string())
    } else {
        let node_allowed = self
            .cluster_cidr
            .or(self.slice_cidr)
            .map_or_else(|| String::from("0.0.0.0/0"), |c| c.to_string());
        (node_public_key, self.overlay_port, node_allowed)
    };
    let node_endpoint = self.node_endpoint_for_guest(peer_listen_port);
    let peers: Vec<PeerSpec> = vec![PeerSpec {
        public_key: peer_pubkey,
        endpoint: node_endpoint,
        allowed_ips: peer_allowed,
        persistent_keepalive_secs: 25,
        // The guest reaches the node via its NAT gateway (the only host
        // address it can route to); it does not run the host's ICE-lite
        // candidate exchange, so no candidates are advertised here.
        candidates: Vec::new(),
    }];

    // 4. Register the guest's public key as a peer on the transport matching
    //    its scope (the service's dedicated device in Dedicated mode, the
    //    global cluster device otherwise): host route to the guest at
    //    <overlay_ip>/32 (or /128 for IPv6), roaming endpoint learned from
    //    keepalive.
    let host_route = format!(
        "{}/{}",
        overlay_ip,
        if overlay_ip.is_ipv6() { 128 } else { 32 }
    );
    let guest_peer = PeerSpec {
        public_key: public_key.clone(),
        // Empty/roaming: the guest is behind NAT; boringtun learns its source
        // endpoint from the guest's first keepalive. `0.0.0.0:0` is the
        // wire-safe "unset endpoint" sentinel that still parses as a
        // SocketAddr (peer_spec_to_info requires a parseable endpoint).
        endpoint: "0.0.0.0:0".to_string(),
        allowed_ips: host_route,
        persistent_keepalive_secs: 0,
        // The guest's roaming endpoint is learned from its first keepalive;
        // it advertises no NAT candidates (the host learns the source).
        candidates: Vec::new(),
    };
    // NOTE(review): an error here returns without releasing `overlay_ip`,
    // unlike the keygen and add-peer failure paths — confirm this conversion
    // cannot fail for the constant endpoint / formatted host route above.
    let guest_peer_info = peer_spec_to_info(&guest_peer)?;
    let scope = if dedicated {
        PeerScope::Service {
            service: service.to_string(),
        }
    } else {
        PeerScope::Global
    };
    {
        // NOTE(review): a `transport_for_scope` failure also returns without
        // releasing `overlay_ip`; presumably unreachable given the checks
        // above — verify.
        let transport = self.transport_for_scope(&scope)?;
        if let Err(e) = Self::add_peer_on(transport, &guest_peer_info).await {
            // Roll back the IP allocation so a failed registration leaks nothing.
            self.release_guest_ip(overlay_ip, pool_service.as_deref());
            return Err(e);
        }
    }
    // Track the guest's peer spec in `global_peers` and (below) record the
    // attachment for detach.
    // NOTE(review): this insert happens for Dedicated-scope guests too, and the
    // peer set built in step 3 never reads `global_peers` — confirm the
    // bookkeeping is intended for both scopes.
    self.global_peers
        .insert(public_key.clone(), guest_peer.clone());
    // Per-network membership + node-side L3 isolation: record the guest's
    // overlay IP in its isolated network's member set, and enforce the
    // cross-platform isolation policy on THIS node. A VZ guest hairpins ALL
    // its overlay traffic through this node's WireGuard device, so the node
    // is the enforcement point: on macOS this dispatches to pf (a per-network
    // table + sub-anchor); on Linux it dispatches to iptables (harmless here
    // — guests do not run on Linux). The guest's own WireGuard AllowedIPs are
    // the in-guest belt; this is the node-side suspenders.
    if let Some(ref net) = isolation_network {
        // Fall back to 10.200.0.1 / 10.200.0.0/16 when the node IP or cluster
        // CIDR has not been configured.
        let node_ip = self
            .node_ip
            .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::new(10, 200, 0, 1)));
        let cidr = self
            .cluster_cidr
            .map_or_else(|| "10.200.0.0/16".to_string(), |c| c.to_string());
        // Peers = current members of the network BEFORE inserting this guest.
        let peers: Vec<IpAddr> = self
            .network_members
            .get(net)
            .map(|m| m.iter().copied().collect())
            .unwrap_or_default();
        // Best-effort: an isolation failure is logged and the attach proceeds.
        if let Err(e) = zlayer_overlay::firewall::ensure_member_isolation(
            net, overlay_ip, &peers, node_ip, &cidr,
        ) {
            tracing::warn!(network = %net, member = %overlay_ip, error = %e, "failed to install per-network L3 isolation for guest (non-fatal)");
        }
        self.network_members
            .entry(net.clone())
            .or_default()
            .insert(overlay_ip);
    }
    self.guest_attachments.insert(
        id.to_string(),
        GuestAttachInfo {
            overlay_ip,
            public_key: public_key.clone(),
            service_name: pool_service,
            isolation_network,
        },
    );

    // 5. Return the config the caller ships into the guest.
    Ok(GuestOverlayConfig {
        overlay_ip,
        prefix_len,
        private_key,
        public_key,
        // The guest's device listens on the same port as its single in-guest
        // peer (the node device it joins): the node's overlay WG port for the
        // global mesh, or the dedicated device's listen port in Dedicated mode.
        listen_port: peer_listen_port,
        peers,
        // Per-attach DNS values win; otherwise fall back to the values staged
        // on the server.
        dns_server: dns_server.or_else(|| self.dns_server_addr.map(|s| s.ip())),
        dns_domain: dns_domain.or_else(|| self.dns_domain.clone()),
    })
}
2723
2724 /// Release a guest-managed attach by `id`: drop the host route + global peer
2725 /// and return the allocated IP to its pool. Idempotent.
2726 ///
2727 /// # Errors
2728 /// Returns an error only if removing the peer from the global transport fails
2729 /// for a reason other than "peer not found".
2730 async fn detach_container_guest(&mut self, id: &str) -> Result<(), OverlaydError> {
2731 let Some(info) = self.guest_attachments.remove(id) else {
2732 return Ok(());
2733 };
2734 // Remove the guest's peer from the same scope it was registered on: a
2735 // Dedicated guest sits on its service's dedicated device, every other
2736 // guest on the global cluster device. Mirror the attach-time scope choice
2737 // so a dedicated guest peer does not leak on teardown.
2738 let scope = match info.service_name.as_deref() {
2739 Some(svc)
2740 if self
2741 .service_modes
2742 .get(svc)
2743 .copied()
2744 .unwrap_or_default()
2745 .uses_per_service_wg() =>
2746 {
2747 PeerScope::Service {
2748 service: svc.to_string(),
2749 }
2750 }
2751 _ => PeerScope::Global,
2752 };
2753 self.global_peers.remove(&info.public_key);
2754 if let Ok(transport) = self.transport_for_scope(&scope) {
2755 if let Err(e) = Self::remove_peer_on(transport, &info.public_key).await {
2756 tracing::warn!(
2757 guest = %id,
2758 pubkey = %info.public_key,
2759 scope = ?scope,
2760 error = %e,
2761 "failed to remove guest peer from its overlay transport"
2762 );
2763 }
2764 }
2765 // Drain the per-network membership set for this guest and tear down the
2766 // node-side L3 isolation rule for it (pf on macOS, iptables on Linux —
2767 // the latter is a no-op for guests, which never run on Linux). Drop the
2768 // network entry once empty.
2769 if let Some(net) = info.isolation_network.as_deref() {
2770 if let Some(set) = self.network_members.get_mut(net) {
2771 set.remove(&info.overlay_ip);
2772 }
2773 let remaining_peers: Vec<IpAddr> = self
2774 .network_members
2775 .get(net)
2776 .map(|m| m.iter().copied().collect())
2777 .unwrap_or_default();
2778 let node_ip = self
2779 .node_ip
2780 .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::new(10, 200, 0, 1)));
2781 let cidr = self
2782 .cluster_cidr
2783 .map_or_else(|| "10.200.0.0/16".to_string(), |c| c.to_string());
2784 zlayer_overlay::firewall::remove_member_isolation(
2785 net,
2786 info.overlay_ip,
2787 &remaining_peers,
2788 node_ip,
2789 &cidr,
2790 );
2791 if self
2792 .network_members
2793 .get(net)
2794 .is_some_and(std::collections::HashSet::is_empty)
2795 {
2796 self.network_members.remove(net);
2797 }
2798 }
2799 // Return the IP to whichever pool it came from.
2800 self.release_guest_ip(info.overlay_ip, info.service_name.as_deref());
2801 Ok(())
2802 }
2803
2804 // -- container attach (macOS host-shared) -------------------------------
2805
2806 /// Host-shared overlay attach: give a macOS host-shared container
2807 /// ([`AttachHandle::HostShared`] — Seatbelt / native-VZ / libkrun) its own
2808 /// first-class L3 overlay membership.
2809 ///
2810 /// A host-shared container shares the node's host network namespace and its
2811 /// single cluster `utun`; it cannot get its own netns/veth (Seatbelt) or its
2812 /// own kernel `WireGuard` device (no guest VM to run one). So instead of a
2813 /// veth/HCN endpoint or a per-guest WG keypair, this:
2814 /// 1. allocates a DISTINCT overlay `/32` from the node slice (never the node
2815 /// IP — `IpAllocator` reserves offset 1 — and never `None`). The node
2816 /// slice is already advertised cluster-wide as this node's `AllowedIPs`,
2817 /// so the `/32` auto-routes to this node with no peer reconfiguration;
2818 /// 2. adds that `/32` as an alias on the node's overlay `utun` so the kernel
2819 /// delivers inbound overlay packets for it locally (boringtun decrypts
2820 /// and writes the plaintext packet to the utun, which only delivers to a
2821 /// configured local address);
2822 /// 3. records per-network membership + installs node-side L3 isolation when
2823 /// `isolation_network` is set (pf on macOS), exactly like the guest path;
2824 /// 4. records the attachment keyed by `id` so `DetachContainer` can remove
2825 /// the alias, drain the membership, and release the IP.
2826 ///
2827 /// HONEST CONSTRAINT: host-shared containers share the node's single cluster
2828 /// `utun`, so `OverlayMode::Dedicated`'s per-service `WireGuard` CRYPTO
2829 /// isolation cannot apply to them — there is no per-container WG device
2830 /// without a netns or a guest VM to host one. They still get a distinct
2831 /// overlay IP + L3 isolation (per-network membership / pf) + overlay DNS,
2832 /// which is full first-class L3 overlay membership. This is a real OS
2833 /// constraint of host-shared execution, not a stub.
2834 ///
2835 /// # Errors
2836 /// Returns an error if the node slice is exhausted, or if the global overlay
2837 /// interface is not set up (so there is no `utun` to alias the `/32` on).
2838 async fn attach_container_host_shared(
2839 &mut self,
2840 id: &str,
2841 service: &str,
2842 ephemeral: bool,
2843 isolation_network: Option<String>,
2844 ) -> Result<IpAddr, OverlaydError> {
2845 // 1. Allocate a distinct /32 from the node slice. Never the node IP
2846 // (reserved at offset 1), never None — exhaustion maps to the same
2847 // `OverlaydError::Overlay` the other attach paths surface.
2848 let ip = self.ip_allocator.allocate()?;
2849 let prefix_len: u8 = if ip.is_ipv6() { 128 } else { 32 };
2850
2851 // 2. Make the /32 locally deliverable on the node's overlay utun via an
2852 // alias on the single cluster transport's interface. Roll the IP
2853 // allocation back on any failure so nothing leaks.
2854 let alias_res = if let Some(transport) = self.global_transport.as_ref() {
2855 transport
2856 .add_alias(ip, prefix_len)
2857 .await
2858 .map_err(|e| OverlaydError::Overlay(e.to_string()))
2859 } else {
2860 Err(OverlaydError::Other(
2861 "host-shared attach requires the global overlay to be set up first \
2862 (no utun to alias the container /32 on)"
2863 .to_string(),
2864 ))
2865 };
2866 if let Err(e) = alias_res {
2867 self.ip_allocator.release(ip);
2868 return Err(e);
2869 }
2870
2871 // 3. Per-network membership + node-side L3 isolation (mirror the guest
2872 // path). The host-shared container hairpins all overlay traffic
2873 // through this node's WireGuard device, so the node is the
2874 // enforcement point (pf on macOS).
2875 if let Some(ref net) = isolation_network {
2876 let node_ip = self
2877 .node_ip
2878 .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::new(10, 200, 0, 1)));
2879 let cidr = self
2880 .cluster_cidr
2881 .map_or_else(|| "10.200.0.0/16".to_string(), |c| c.to_string());
2882 // Peers = current members of the network BEFORE inserting this one.
2883 let peers: Vec<IpAddr> = self
2884 .network_members
2885 .get(net)
2886 .map(|m| m.iter().copied().collect())
2887 .unwrap_or_default();
2888 if let Err(e) =
2889 zlayer_overlay::firewall::ensure_member_isolation(net, ip, &peers, node_ip, &cidr)
2890 {
2891 tracing::warn!(network = %net, member = %ip, error = %e, "failed to install per-network L3 isolation for host-shared container (non-fatal)");
2892 }
2893 self.network_members
2894 .entry(net.clone())
2895 .or_default()
2896 .insert(ip);
2897 }
2898
2899 // 4. Record the attachment so detach can reverse all of the above.
2900 self.host_shared_attachments.insert(
2901 id.to_string(),
2902 AttachInfo {
2903 service_ip: ip,
2904 service_name: Some(service.to_string()),
2905 // No separate global/eth1 IP: a host-shared container reaches the
2906 // global overlay through the SAME /32 aliased on the node utun.
2907 global_ip: None,
2908 ephemeral,
2909 isolation_network,
2910 },
2911 );
2912
2913 Ok(ip)
2914 }
2915
2916 /// Release a host-shared attach by `id`: remove the utun `/32` alias, drain
2917 /// its per-network L3 isolation membership, and return the IP to the node
2918 /// slice. Idempotent. Mirrors [`Self::detach_container_guest`].
2919 ///
2920 /// # Errors
2921 /// Returns `Ok` even when removing the alias fails (best-effort, logged) —
2922 /// the IP is always returned to the pool so it can never leak.
2923 async fn detach_container_host_shared(&mut self, id: &str) -> Result<(), OverlaydError> {
2924 let Some(info) = self.host_shared_attachments.remove(id) else {
2925 return Ok(());
2926 };
2927 // Drain the per-network membership set and tear down the node-side L3
2928 // isolation rule for this container; drop the network entry once empty.
2929 if let Some(net) = info.isolation_network.as_deref() {
2930 if let Some(set) = self.network_members.get_mut(net) {
2931 set.remove(&info.service_ip);
2932 }
2933 let remaining_peers: Vec<IpAddr> = self
2934 .network_members
2935 .get(net)
2936 .map(|m| m.iter().copied().collect())
2937 .unwrap_or_default();
2938 let node_ip = self
2939 .node_ip
2940 .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::new(10, 200, 0, 1)));
2941 let cidr = self
2942 .cluster_cidr
2943 .map_or_else(|| "10.200.0.0/16".to_string(), |c| c.to_string());
2944 zlayer_overlay::firewall::remove_member_isolation(
2945 net,
2946 info.service_ip,
2947 &remaining_peers,
2948 node_ip,
2949 &cidr,
2950 );
2951 if self
2952 .network_members
2953 .get(net)
2954 .is_some_and(std::collections::HashSet::is_empty)
2955 {
2956 self.network_members.remove(net);
2957 }
2958 }
2959 // Remove the utun /32 alias (best-effort: a failed removal must not
2960 // strand the IP, so we log and still release below).
2961 let prefix_len: u8 = if info.service_ip.is_ipv6() { 128 } else { 32 };
2962 if let Some(transport) = self.global_transport.as_ref() {
2963 if let Err(e) = transport.remove_alias(info.service_ip, prefix_len).await {
2964 tracing::warn!(
2965 container = %id,
2966 ip = %info.service_ip,
2967 error = %e,
2968 "failed to remove host-shared overlay /32 alias from utun (non-fatal)"
2969 );
2970 }
2971 }
2972 // Return the IP to the node slice.
2973 self.ip_allocator.release(info.service_ip);
2974
2975 // Per-job segment lifecycle observability. Unlike the Linux veth path —
2976 // which reaps a per-service BRIDGE on the last ephemeral detach — a
2977 // host-shared container shares the node's single cluster utun and owns
2978 // no per-service bridge or dedicated WG device to tear down (see the
2979 // HONEST CONSTRAINT note on `attach_container_host_shared`). The only
2980 // per-segment state is its overlay `/32` + per-network membership, both
2981 // already reversed above. So `ephemeral` and `service_name` drive the
2982 // last-leaver TRACE here (mirroring the Linux ephemeral-teardown log)
2983 // rather than a bridge teardown: an ephemeral (per-job) segment's IP
2984 // return is logged at info level for reclamation traceability, a
2985 // managed service's at debug.
2986 let service = info.service_name.as_deref().unwrap_or("<none>");
2987 if info.ephemeral {
2988 tracing::info!(
2989 container = %id,
2990 service = %service,
2991 ip = %info.service_ip,
2992 "ephemeral host-shared overlay member detached — per-job segment /32 returned to node slice"
2993 );
2994 } else {
2995 tracing::debug!(
2996 container = %id,
2997 service = %service,
2998 ip = %info.service_ip,
2999 "host-shared overlay member detached — /32 returned to node slice"
3000 );
3001 }
3002 Ok(())
3003 }
3004
3005 /// Release a guest overlay IP back to the pool it was drawn from: the named
3006 /// service bridge's allocator (Linux) when `service` is set and the bridge
3007 /// still exists, otherwise the node slice allocator.
3008 fn release_guest_ip(&mut self, ip: IpAddr, service: Option<&str>) {
3009 #[cfg(target_os = "linux")]
3010 {
3011 // A Shared-mode service drew from the single node-wide shared bridge,
3012 // which is keyed by subnet, not by service name. Try it first.
3013 if let Some(bridge) = self.shared_bridge.as_mut() {
3014 if bridge.subnet.contains(&ip) {
3015 bridge.ip_allocator.release(ip);
3016 return;
3017 }
3018 }
3019 if let Some(svc) = service {
3020 if let Some(bridge) = self.service_bridges.get_mut(svc) {
3021 bridge.ip_allocator.release(ip);
3022 return;
3023 }
3024 }
3025 }
3026 #[cfg(not(target_os = "linux"))]
3027 {
3028 // A Dedicated-mode guest drew its IP from the per-service transport's
3029 // allocator (keyed by service name); return it there so the dedicated
3030 // subnet does not leak addresses across guest churn.
3031 if let Some(svc) = service {
3032 if let Some(st) = self.service_transports.get_mut(svc) {
3033 st.ip_allocator.release(ip);
3034 return;
3035 }
3036 }
3037 }
3038 let _ = service;
3039 self.ip_allocator.release(ip);
3040 }
3041
3042 /// Prefix length of the address pool guest IPs are drawn from when not using
3043 /// a per-service bridge: the node slice if assigned, else the cluster CIDR.
3044 fn slice_prefix_len(&self) -> u8 {
3045 self.slice_cidr.or(self.cluster_cidr).map_or(
3046 if self.node_ip.is_some_and(|ip| ip.is_ipv6()) {
3047 64
3048 } else {
3049 24
3050 },
3051 |c| c.prefix(),
3052 )
3053 }
3054
3055 /// Reachable `WireGuard` endpoint for THIS node, advertised to a guest as a
3056 /// peer on `listen_port` (the node's global overlay port, or a Dedicated
3057 /// service's per-service device port). overlayd has no public reflexive
3058 /// address at this layer, so it uses the node's overlay-listen identity
3059 /// (`node_ip:listen_port`); the caller (the VZ runtime that ships the config
3060 /// into the guest) rewrites it to the concrete VZ-NAT gateway endpoint the
3061 /// guest can dial. Falls back to the unspecified address when no node IP is
3062 /// assigned yet.
3063 fn node_endpoint_for_guest(&self, listen_port: u16) -> String {
3064 let ip = self.node_ip.unwrap_or(IpAddr::V4(Ipv4Addr::UNSPECIFIED));
3065 SocketAddr::new(ip, listen_port).to_string()
3066 }
3067
3068 /// Linux veth/netns attach. On non-Linux this returns the node's overlay IP
3069 /// (host networking) and is never wired for a `LinuxPid` handle in practice.
3070 #[cfg(target_os = "linux")]
3071 #[allow(clippy::too_many_lines)]
3072 async fn attach_container_linux(
3073 &mut self,
3074 container_pid: u32,
3075 service: &str,
3076 join_global: bool,
3077 ephemeral: bool,
3078 isolation_network: Option<String>,
3079 ) -> Result<IpAddr, OverlaydError> {
3080 // Resolve which bridge backs this service. A `Shared`-mode service
3081 // attaches onto the SINGLE node-wide shared bridge; every other mode
3082 // (`Auto`, `Dedicated`) attaches onto its own per-service bridge. The
3083 // mode was recorded at `setup_service_overlay` time.
3084 let use_shared = self
3085 .service_modes
3086 .get(service)
3087 .copied()
3088 .unwrap_or_default()
3089 .uses_shared_bridge();
3090
3091 let (bridge_name, bridge_subnet, bridge_gateway, container_ip) = if use_shared {
3092 let bridge = self.shared_bridge.as_mut().ok_or_else(|| {
3093 OverlaydError::Other(format!(
3094 "no shared bridge for Shared-mode service {service}; call setup_service_overlay() first"
3095 ))
3096 })?;
3097 let ip = bridge.ip_allocator.allocate().ok_or_else(|| {
3098 OverlaydError::Overlay(format!(
3099 "shared bridge {} subnet {} exhausted",
3100 bridge.name, bridge.subnet
3101 ))
3102 })?;
3103 (bridge.name.clone(), bridge.subnet, bridge.gateway, ip)
3104 } else {
3105 let bridge = self.service_bridges.get_mut(service).ok_or_else(|| {
3106 OverlaydError::Other(format!(
3107 "no service bridge for service {service}; call setup_service_overlay() first"
3108 ))
3109 })?;
3110 let ip = bridge.ip_allocator.allocate().ok_or_else(|| {
3111 OverlaydError::Overlay(format!(
3112 "service bridge {} subnet {} exhausted",
3113 bridge.name, bridge.subnet
3114 ))
3115 })?;
3116 (bridge.name.clone(), bridge.subnet, bridge.gateway, ip)
3117 };
3118
3119 let bridge_params = BridgeAttachParams {
3120 bridge_name: &bridge_name,
3121 gateway: bridge_gateway,
3122 subnet_prefix_len: bridge_subnet.prefix_len(),
3123 };
3124 if let Err(e) = self
3125 .attach_to_interface(
3126 container_pid,
3127 container_ip,
3128 "s",
3129 "eth0",
3130 Some(&bridge_params),
3131 )
3132 .await
3133 {
3134 if use_shared {
3135 if let Some(bridge) = self.shared_bridge.as_mut() {
3136 bridge.ip_allocator.release(container_ip);
3137 }
3138 } else if let Some(bridge) = self.service_bridges.get_mut(service) {
3139 bridge.ip_allocator.release(container_ip);
3140 }
3141 return Err(e);
3142 }
3143
3144 let mut global_ip: Option<IpAddr> = None;
3145 if join_global && self.global_interface.is_some() {
3146 let g_ip = self.ip_allocator.allocate()?;
3147 self.attach_to_interface(container_pid, g_ip, "g", "eth1", None)
3148 .await?;
3149 global_ip = Some(g_ip);
3150 }
3151
3152 // Per-network L3 isolation: when this attach joins a named isolated
3153 // network, install the Docker-style iptables rules pinning this member
3154 // to its own network's members + node + egress, then record it in the
3155 // membership map. Non-fatal: a host without iptables logs and proceeds.
3156 if let Some(ref net) = isolation_network {
3157 let node_ip = self
3158 .node_ip
3159 .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::new(10, 200, 0, 1)));
3160 let cidr = self
3161 .cluster_cidr
3162 .map_or_else(|| "10.200.0.0/16".to_string(), |c| c.to_string());
3163 let peers: Vec<IpAddr> = self
3164 .network_members
3165 .get(net)
3166 .map(|m| m.iter().copied().collect())
3167 .unwrap_or_default();
3168 if let Err(e) = zlayer_overlay::firewall::ensure_member_isolation(
3169 net,
3170 container_ip,
3171 &peers,
3172 node_ip,
3173 &cidr,
3174 ) {
3175 tracing::warn!(network = %net, member = %container_ip, error = %e, "failed to install per-network L3 isolation (non-fatal)");
3176 }
3177 self.network_members
3178 .entry(net.clone())
3179 .or_default()
3180 .insert(container_ip);
3181 }
3182
3183 self.attached.insert(
3184 container_pid,
3185 AttachInfo {
3186 service_ip: container_ip,
3187 service_name: Some(service.to_string()),
3188 global_ip,
3189 ephemeral,
3190 isolation_network,
3191 },
3192 );
3193
3194 Ok(container_ip)
3195 }
3196
3197 /// Non-Linux fallback: containers share the host network, so return the
3198 /// node's overlay IP (or loopback).
3199 #[cfg(not(target_os = "linux"))]
3200 #[allow(clippy::unused_async)]
3201 async fn attach_container_linux(
3202 &mut self,
3203 _container_pid: u32,
3204 service: &str,
3205 _join_global: bool,
3206 _ephemeral: bool,
3207 _isolation_network: Option<String>,
3208 ) -> Result<IpAddr, OverlaydError> {
3209 tracing::debug!(service = %service, "LinuxPid attach is a no-op off Linux; using node overlay IP");
3210 Ok(self.node_ip.unwrap_or(IpAddr::V4(Ipv4Addr::LOCALHOST)))
3211 }
3212
3213 /// Release the overlay resources held by a Linux container PID. Idempotent.
3214 #[cfg(target_os = "linux")]
3215 async fn detach_container_linux(&mut self, pid: u32) -> Result<(), OverlaydError> {
3216 // "Process id or not, kill the adapter": the host-side veth name is
3217 // deterministic (`veth-<pid>-{s,g}`), so delete it UNCONDITIONALLY by
3218 // name — even when no attach record survives (a previous daemon crashed
3219 // before recording it, or it was already reaped). Without this, a missing
3220 // record left the host veth orphaned until the PID-keyed periodic sweep
3221 // (which only fires once the PID is dead). The deletes are idempotent
3222 // (ENODEV = success), so the always-on `-g` delete is harmless when the
3223 // container never joined the global overlay.
3224 let info = self.attached.remove(&pid);
3225
3226 let veth_s = format!("veth-{pid}-s");
3227 if let Err(e) = crate::netlink::delete_link_by_name(&veth_s).await {
3228 tracing::warn!(link = %veth_s, pid, error = %e, "Failed to delete service veth");
3229 }
3230 let veth_g = format!("veth-{pid}-g");
3231 if let Err(e) = crate::netlink::delete_link_by_name(&veth_g).await {
3232 tracing::warn!(link = %veth_g, pid, error = %e, "Failed to delete global veth");
3233 }
3234
3235 // No attach record -> nothing more to release (IP/registry bookkeeping
3236 // is keyed off the record). The veths above are already gone.
3237 let Some(info) = info else {
3238 return Ok(());
3239 };
3240
3241 // Return the service IP to whichever pool owns it. A Shared-mode service
3242 // drew its IP from the single node-wide shared bridge (no per-service
3243 // bridge exists for it), so try the shared bridge by subnet containment
3244 // before the named per-service bridge.
3245 if self.shared_bridge.as_mut().is_some_and(|b| {
3246 b.subnet.contains(&info.service_ip) && b.ip_allocator.release(info.service_ip)
3247 }) {
3248 // released into the shared bridge
3249 } else if let Some(svc) = info.service_name.as_deref() {
3250 if let Some(bridge) = self.service_bridges.get_mut(svc) {
3251 bridge.ip_allocator.release(info.service_ip);
3252 } else {
3253 tracing::debug!(service = %svc, ip = %info.service_ip, "detach: service bridge already torn down; dropping service IP release");
3254 }
3255 } else {
3256 self.ip_allocator.release(info.service_ip);
3257 }
3258 if let Some(g) = info.global_ip {
3259 self.ip_allocator.release(g);
3260 }
3261
3262 // Per-network L3 isolation drain: remove this member from its isolated
3263 // network's membership set and tear down its iptables rules against the
3264 // remaining members. Drop the network entry once empty.
3265 if let Some(net) = info.isolation_network.as_deref() {
3266 if let Some(set) = self.network_members.get_mut(net) {
3267 set.remove(&info.service_ip);
3268 }
3269 let still: Vec<IpAddr> = self
3270 .network_members
3271 .get(net)
3272 .map(|m| m.iter().copied().collect())
3273 .unwrap_or_default();
3274 let node_ip = self
3275 .node_ip
3276 .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::new(10, 200, 0, 1)));
3277 let cidr = self
3278 .cluster_cidr
3279 .map_or_else(|| "10.200.0.0/16".to_string(), |c| c.to_string());
3280 zlayer_overlay::firewall::remove_member_isolation(
3281 net,
3282 info.service_ip,
3283 &still,
3284 node_ip,
3285 &cidr,
3286 );
3287 if self
3288 .network_members
3289 .get(net)
3290 .is_some_and(std::collections::HashSet::is_empty)
3291 {
3292 self.network_members.remove(net);
3293 }
3294 }
3295
3296 // Ephemeral last-leaver teardown: a standalone/per-job bridge is reclaimed
3297 // the moment its LAST container leaves (the periodic prune is only the
3298 // ~300s backstop). Managed attaches use ephemeral=false so their bridge
3299 // persists across scale-to-0. Route through teardown_service_overlay so
3300 // overlayd's in-memory state stays synced — never a hand `ip link del`.
3301 // This container's veth is already removed above, so a 0 member count
3302 // means no containers remain on the bridge.
3303 if info.ephemeral {
3304 if let Some(svc) = info.service_name.clone() {
3305 if let Some(bridge_name) = self.service_bridges.get(&svc).map(|b| b.name.clone()) {
3306 if crate::netlink::bridge_member_count(&bridge_name).await == 0 {
3307 tracing::info!(service = %svc, bridge = %bridge_name, "ephemeral overlay bridge idle after last detach — tearing down");
3308 self.teardown_service_overlay(&svc).await;
3309 }
3310 }
3311 }
3312 }
3313 Ok(())
3314 }
3315
3316 /// Non-Linux fallback: nothing to detach (host networking).
3317 #[cfg(not(target_os = "linux"))]
3318 #[allow(clippy::unused_async)]
3319 async fn detach_container_linux(&mut self, _pid: u32) -> Result<(), OverlaydError> {
3320 Ok(())
3321 }
3322
3323 /// Best-effort sweep of orphan veth endpoints whose owning container PID is
3324 /// no longer alive. Names matching `veth-<pid>-*` / `vc-<pid>-*` where
3325 /// `/proc/<pid>` does not exist are deleted.
3326 #[cfg(target_os = "linux")]
3327 async fn sweep_orphan_veths() {
3328 let links = match crate::netlink::list_all_links().await {
3329 Ok(links) => links,
3330 Err(e) => {
3331 tracing::warn!(error = %e, "Failed to list links for orphan sweep");
3332 return;
3333 }
3334 };
3335 for (_index, name) in links {
3336 let remainder = if let Some(r) = name.strip_prefix("veth-") {
3337 r
3338 } else if let Some(r) = name.strip_prefix("vc-") {
3339 r
3340 } else {
3341 continue;
3342 };
3343 let Some(pid_str) = remainder.split('-').next() else {
3344 continue;
3345 };
3346 let pid: u32 = match pid_str.parse() {
3347 Ok(p) => p,
3348 Err(_) => continue,
3349 };
3350 if Path::new(&format!("/proc/{pid}")).exists() {
3351 continue;
3352 }
3353 tracing::info!(link = %name, pid = pid, "Deleting orphan veth");
3354 if let Err(e) = crate::netlink::delete_link_by_name(&name).await {
3355 tracing::warn!(link = %name, error = %e, "Failed to delete orphan veth");
3356 }
3357 }
3358 }
3359
    /// Wire one veth leg between the host and the network namespace of
    /// `container_pid`, assigning `ip` to the container side.
    ///
    /// The host end is named `veth-<pid>-<tag>`; the peer is created as
    /// `vc-<pid>-<tag>`, moved into the container's netns and renamed to
    /// `container_iface`. Inside the netns the address is added, the interface
    /// and `lo` are brought up, and (bridged attaches only) a default route via
    /// the bridge gateway is installed. On the host the veth is brought up and
    /// then either enslaved to the bridge named in `bridge`, or — when `bridge`
    /// is `None` — given a host route for `ip` with a `/32` (`/128` for IPv6)
    /// prefix.
    ///
    /// The address prefix inside the container is the bridge subnet's prefix
    /// length when bridged, otherwise `/64` for IPv6 and `/24` for IPv4.
    ///
    /// # Errors
    /// Returns `OverlaydError::Overlay` when the netns cannot be opened, the
    /// pre-cleanup deletes fail, or any wiring step fails. If a wiring step
    /// fails, both veth names are deleted (best-effort) before the error is
    /// returned.
    #[cfg(target_os = "linux")]
    #[allow(clippy::too_many_lines)]
    async fn attach_to_interface(
        &mut self,
        container_pid: u32,
        ip: IpAddr,
        tag: &str,
        container_iface: &str,
        bridge: Option<&BridgeAttachParams<'_>>,
    ) -> Result<(), OverlaydError> {
        // Best-effort cleanup of orphan veths left by a previous daemon crash.
        Self::sweep_orphan_veths().await;

        let is_v6 = ip.is_ipv6();
        // Prefix assigned to the container-side address: the bridge subnet's
        // when bridged, otherwise a family-based default.
        let prefix_len: u8 = if let Some(b) = bridge {
            b.subnet_prefix_len
        } else if is_v6 {
            64
        } else {
            24
        };
        // Single-host prefix used for the bridgeless host route.
        let host_prefix: u8 = if is_v6 { 128 } else { 32 };

        let veth_host = format!("veth-{container_pid}-{tag}");
        let veth_pending = format!("vc-{container_pid}-{tag}");
        let veth_container = container_iface.to_string();

        // Hold the container's netns open by fd for the move + in-netns
        // configuration below.
        let container_ns_fd = std::os::fd::OwnedFd::from(
            std::fs::File::open(format!("/proc/{container_pid}/ns/net")).map_err(|e| {
                OverlaydError::Overlay(format!("Failed to open /proc/{container_pid}/ns/net: {e}"))
            })?,
        );

        // Clear any stale links with the names about to be created. Unlike the
        // rollback at the end of this function, a failure here aborts the
        // attach.
        crate::netlink::delete_link_by_name(&veth_host)
            .await
            .map_err(|e| OverlaydError::Overlay(format!("pre-cleanup delete {veth_host}: {e}")))?;
        crate::netlink::delete_link_by_name(&veth_pending)
            .await
            .map_err(|e| {
                OverlaydError::Overlay(format!("pre-cleanup delete {veth_pending}: {e}"))
            })?;

        let bridge_gateway: Option<IpAddr> = bridge.map(|b| b.gateway);
        let bridge_name: Option<String> = bridge.map(|b| b.bridge_name.to_string());
        let node_ip = self.node_ip;

        // Everything that can fail after link creation runs inside this block
        // so a single rollback at the bottom covers all of it.
        let result: Result<(), OverlaydError> = async {
            crate::netlink::create_veth_pair(&veth_host, &veth_pending)
                .await
                .map_err(|e| OverlaydError::Overlay(format!("create veth pair: {e}")))?;

            crate::netlink::move_link_into_netns_fd_and_rename(
                &veth_pending,
                AsFd::as_fd(&container_ns_fd),
                &veth_container,
            )
            .map_err(|e| OverlaydError::Overlay(format!("move veth into netns: {e}")))?;

            let vc = veth_container.clone();
            let bridge_gateway_for_netns = bridge_gateway;
            // The in-netns configuration runs on a blocking thread and takes
            // ownership of the netns fd.
            tokio::task::spawn_blocking(move || {
                crate::netlink::with_netns_fd_async(container_ns_fd, move || async move {
                    crate::netlink::add_address_to_link_by_name(&vc, ip, prefix_len).await?;
                    crate::netlink::set_link_up_by_name(&vc).await?;
                    crate::netlink::set_link_up_by_name("lo").await?;
                    if let Some(gw) = bridge_gateway_for_netns {
                        crate::netlink::add_default_route_via_gateway(gw).await?;
                    }
                    Ok(())
                })
            })
            .await
            .map_err(|e| OverlaydError::Overlay(format!("container netns task panicked: {e}")))?
            .map_err(|e| OverlaydError::Overlay(format!("container netns ops: {e}")))?;

            crate::netlink::set_link_up_by_name(&veth_host)
                .await
                .map_err(|e| OverlaydError::Overlay(format!("set {veth_host} up: {e}")))?;

            // Bridged: enslave the host end to the bridge. Bridgeless: route
            // the container's address to the host end directly.
            if let Some(bname) = bridge_name.as_deref() {
                crate::netlink::add_link_to_bridge(&veth_host, bname)
                    .await
                    .map_err(|e| {
                        OverlaydError::Overlay(format!(
                            "enslave {veth_host} to bridge {bname}: {e}"
                        ))
                    })?;
            } else {
                crate::netlink::replace_route_via_dev(ip, host_prefix, &veth_host, node_ip)
                    .await
                    .map_err(|e| {
                        OverlaydError::Overlay(format!("host route for {ip}/{host_prefix}: {e}"))
                    })?;
            }

            Ok(())
        }
        .await;

        // Enable IP forwarding so the host routes between the overlay device(s)
        // and the egress NIC. CRITICAL: this is scoped to the address family
        // actually in use and (for IPv6) to the specific overlay devices —
        // NEVER `net.ipv6.conf.all.forwarding`, whose documented kernel side
        // effect is to force `accept_ra=0` + `autoconf=0` on every IPv6
        // interface (including the public NIC), dropping the RA-learned default
        // route / path-MTU and blackholing the host's own larger reply packets
        // (e.g. inbound SSH stalls after key exchange). Done outside the
        // attach `result` block so a forwarding-sysctl failure can never roll
        // back a successful veth attach. Tracked so teardown reverts it.
        if result.is_ok() {
            self.enable_forwarding_for_attach(is_v6, &veth_host, bridge_name.as_deref());

            // Track the host-side resources this attach created so a clean
            // global teardown reverts every host mutation. The host-side veth
            // half exists in both the bridged and bridgeless paths; the host
            // `/32`(`/128`) route is installed ONLY on the bridgeless path
            // (`replace_route_via_dev` above), so record it only when there was
            // no bridge to enslave into. All deletions are idempotent, so a
            // resource a later per-container detach removes first is harmless.
            self.created_veths.insert(veth_host.clone());
            if bridge_name.is_none() {
                self.created_host_routes
                    .push((ip, host_prefix, veth_host.clone()));
            }
        }

        // Rollback: remove whatever half-built links exist. Errors are ignored
        // because the original failure is what the caller needs to see.
        if result.is_err() {
            let _ = crate::netlink::delete_link_by_name(&veth_host).await;
            let _ = crate::netlink::delete_link_by_name(&veth_pending).await;
        }
        result
    }
3492
3493 // -- container attach (Windows HCN) -------------------------------------
3494
3495 /// Windows attach: ensure the overlay HCN Internal network exists, allocate
3496 /// or validate the IP, create the per-container HCN endpoint + namespace,
3497 /// and return the bare-lowercase namespace GUID for the agent to embed in
3498 /// the compute-system document.
3499 ///
3500 /// # Errors
3501 /// Returns an error if the network/endpoint cannot be created or the slice
3502 /// is exhausted.
3503 #[cfg(target_os = "windows")]
3504 #[allow(clippy::too_many_lines)]
3505 async fn attach_container_windows(
3506 &mut self,
3507 container_id: &str,
3508 service: &str,
3509 ip_override: Option<IpAddr>,
3510 dns_server: Option<IpAddr>,
3511 dns_domain: Option<String>,
3512 isolation_network: Option<String>,
3513 ) -> Result<AttachResult, OverlaydError> {
3514 // Resolve whether THIS service has a dedicated per-service overlay. It
3515 // does iff a live dedicated transport exists OR a `hcn-internal` marker
3516 // entry is recorded under `owner_for_service(service)` (the network
3517 // survives daemon restarts even if the transport map is empty mid-init).
3518 // Dedicated services attach onto their OWN per-service Internal network
3519 // and draw IPs from the service subnet; everything else uses the node's
3520 // base/shared overlay network and the node slice.
3521 let dedicated_subnet = self.dedicated_service_subnet(service);
3522 // A `Shared`-mode service attaches onto the SINGLE shared HCN NAT network
3523 // reused across all Shared services (container ports are exposed via the
3524 // userspace free-port L4 proxy). The mode was recorded at setup time.
3525 let use_shared_nat = self
3526 .service_modes
3527 .get(service)
3528 .copied()
3529 .unwrap_or_default()
3530 .uses_shared_bridge();
3531
3532 let (net_id, ip, prefix_length) = if let Some(net) = isolation_network.as_deref() {
3533 // ----- per-isolation-network Internal HCN network path -----
3534 //
3535 // An "isolated" ZLayer network routes its members onto a dedicated
3536 // HCN Internal vSwitch keyed by the isolation-network NAME (not the
3537 // service). HCN Internal vSwitches are mutually isolated by default,
3538 // so same-network members share one vSwitch (reach each other +
3539 // egress via the network gateway + the node), while different
3540 // isolation networks land on separate vSwitches and cannot reach
3541 // each other — L3 isolation with NO ACLs and NO per-member churn.
3542 // This mirrors the Dedicated per-service branch below, but keyed by
3543 // the isolation-network name and drawing IPs from a per-network
3544 // subnet carved deterministically from the node slice.
3545 let iso_subnet = self.isolation_network_subnet(net)?;
3546 let net_id = self.ensure_isolation_network(net, iso_subnet).await?;
3547
3548 // Per-network container IPs come from the isolation network's own
3549 // subnet (never the node slice), via a lazily-created allocator
3550 // bounded to that subnet. The allocator is keyed by the isolation
3551 // network's owner key so it never collides with a same-named
3552 // dedicated service's allocator. An `ip_override` is honored only
3553 // when it falls inside the isolation subnet.
3554 let iso_ipnetwork: IpNetwork = iso_subnet.to_string().parse().map_err(|e| {
3555 OverlaydError::Other(format!(
3556 "failed to parse isolation subnet {iso_subnet}: {e}"
3557 ))
3558 })?;
3559 let alloc_key = crate::network_state::owner_for_isolation_network(net);
3560 let allocator = self
3561 .service_ip_allocators
3562 .entry(alloc_key)
3563 .or_insert_with(|| IpAllocator::new(iso_ipnetwork));
3564 let ip = match ip_override {
3565 Some(ip) if iso_subnet.contains(&ip) => ip,
3566 Some(ip) => {
3567 return Err(OverlaydError::Other(format!(
3568 "overridden IP {ip} is not inside isolation network subnet {iso_subnet} for network {net}"
3569 )));
3570 }
3571 None => allocator.allocate()?,
3572 };
3573 (net_id, ip, iso_subnet.prefix_len())
3574 } else if use_shared_nat {
3575 // ----- shared HCN NAT network path -----
3576 let slice = self.slice_cidr.ok_or_else(|| {
3577 OverlaydError::Other(
3578 "no node slice assigned yet (SetupGlobalOverlay with slice_cidr first)"
3579 .to_string(),
3580 )
3581 })?;
3582 let slice_ipnet: ipnet::IpNet = slice.to_string().parse().map_err(|e| {
3583 OverlaydError::Other(format!("failed to parse slice CIDR {slice}: {e}"))
3584 })?;
3585 let net_id = self.ensure_shared_nat_network(slice_ipnet).await?;
3586 let ip = match ip_override {
3587 Some(ip) => ip,
3588 None => self.ip_allocator.allocate()?,
3589 };
3590 (net_id, ip, slice_ipnet.prefix_len())
3591 } else if let Some(svc_subnet) = dedicated_subnet {
3592 // ----- dedicated per-service network path -----
3593 let net_id = self.ensure_service_network(service, svc_subnet).await?;
3594
3595 // Allocate (or validate) the IP from the SERVICE subnet, not the
3596 // node slice. A per-service allocator is created lazily and bounded
3597 // to the service subnet so addresses stay inside the dedicated
3598 // network. An `ip_override` inside the service subnet is honored;
3599 // one outside it is rejected so a slice-allocated IP can't leak onto
3600 // the dedicated network.
3601 let svc_ipnetwork: IpNetwork = svc_subnet.to_string().parse().map_err(|e| {
3602 OverlaydError::Other(format!("failed to parse service subnet {svc_subnet}: {e}"))
3603 })?;
3604 let allocator = self
3605 .service_ip_allocators
3606 .entry(service.to_string())
3607 .or_insert_with(|| IpAllocator::new(svc_ipnetwork));
3608 let ip = match ip_override {
3609 Some(ip) if svc_subnet.contains(&ip) => ip,
3610 Some(ip) => {
3611 return Err(OverlaydError::Other(format!(
3612 "overridden IP {ip} is not inside dedicated service subnet {svc_subnet} for service {service}"
3613 )));
3614 }
3615 None => allocator.allocate()?,
3616 };
3617 (net_id, ip, svc_subnet.prefix_len())
3618 } else {
3619 // ----- shared base overlay network path (unchanged) -----
3620 let slice = self.slice_cidr.ok_or_else(|| {
3621 OverlaydError::Other(
3622 "no node slice assigned yet (SetupGlobalOverlay with slice_cidr first)"
3623 .to_string(),
3624 )
3625 })?;
3626 let slice_ipnet: ipnet::IpNet = slice.to_string().parse().map_err(|e| {
3627 OverlaydError::Other(format!("failed to parse slice CIDR {slice}: {e}"))
3628 })?;
3629 let net_id = self.ensure_overlay_network(slice_ipnet).await?;
3630 let ip = match ip_override {
3631 Some(ip) => ip,
3632 None => self.ip_allocator.allocate()?,
3633 };
3634 (net_id, ip, slice_ipnet.prefix_len())
3635 };
3636
3637 // 3. Create the endpoint + per-container namespace on the network.
3638 let dns_server_eff = dns_server.or_else(|| self.dns_server_addr.map(|a| a.ip()));
3639 let dns_domain_for_attach = dns_domain.or_else(|| self.dns_domain.clone());
3640 let cluster_cidr = self.cluster_cidr.map(|c| c.to_string()).unwrap_or_default();
3641 let owner_tag = owner_tag(&self.deployment_or_default());
3642 let cid = container_id.to_string();
3643
3644 let attachment = tokio::task::spawn_blocking(move || {
3645 zlayer_hns::attach::EndpointAttachment::create_overlay(
3646 net_id,
3647 &owner_tag,
3648 cid.as_str(),
3649 ip,
3650 prefix_length,
3651 &cluster_cidr,
3652 dns_server_eff,
3653 dns_domain_for_attach.as_deref(),
3654 )
3655 })
3656 .await
3657 .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?
3658 .map_err(|e| OverlaydError::Overlay(format!("HCN overlay endpoint attach failed: {e}")))?;
3659
3660 let namespace_id = attachment.namespace_id();
3661 let bare_guid = format_guid_bare(namespace_id);
3662
3663 // Per-network membership: record the container's IP in its isolated
3664 // network's member set. Windows enforcement is an HCN ACL — a
3665 // Linux-incompatible mechanism wired separately; overlayd only maintains
3666 // the membership map here and does NOT call the iptables firewall helper.
3667 if let Some(ref net) = isolation_network {
3668 self.network_members
3669 .entry(net.clone())
3670 .or_default()
3671 .insert(ip);
3672 }
3673
3674 // Record for autoclean keyed by namespace GUID.
3675 self.hcn_cleanup
3676 .insert(namespace_id, (service.to_string(), ip, isolation_network));
3677
3678 tracing::info!(
3679 ns = %bare_guid,
3680 service = %service,
3681 ip = %ip,
3682 "Attached container to HCN overlay"
3683 );
3684
3685 Ok(AttachResult {
3686 ip,
3687 namespace_guid: Some(bare_guid),
3688 })
3689 }
3690
3691 /// Non-Windows path: a `WindowsContainer` handle has no meaning off Windows.
3692 #[cfg(not(target_os = "windows"))]
3693 #[allow(clippy::unused_async)]
3694 async fn attach_container_windows(
3695 &mut self,
3696 _container_id: &str,
3697 _service: &str,
3698 _ip_override: Option<IpAddr>,
3699 _dns_server: Option<IpAddr>,
3700 _dns_domain: Option<String>,
3701 _isolation_network: Option<String>,
3702 ) -> Result<AttachResult, OverlaydError> {
3703 Err(OverlaydError::Other(
3704 "WindowsContainer attach is only supported on Windows".to_string(),
3705 ))
3706 }
3707
3708 /// Detach a Windows container by its bare namespace GUID and release its IP.
3709 /// Idempotent: unknown ids are a no-op.
3710 #[cfg(target_os = "windows")]
3711 async fn detach_container_windows(
3712 &mut self,
3713 namespace_guid: &str,
3714 ) -> Result<(), OverlaydError> {
3715 use windows::core::GUID;
3716
3717 let Ok(guid) = GUID::try_from(namespace_guid) else {
3718 tracing::warn!(ns = %namespace_guid, "detach: unparseable namespace GUID");
3719 return Ok(());
3720 };
3721 if let Some((service, ip, isolation_network)) = self.hcn_cleanup.remove(&guid) {
3722 // Release the IP into the pool it was drawn from. An isolation-network
3723 // member drew from the per-network allocator (keyed by the isolation
3724 // owner key), NOT the node slice; release it there so the isolation
3725 // subnet doesn't leak addresses. Everything else came from the node
3726 // slice.
3727 if let Some(net) = isolation_network.as_deref() {
3728 let alloc_key = crate::network_state::owner_for_isolation_network(net);
3729 if let Some(allocator) = self.service_ip_allocators.get_mut(&alloc_key) {
3730 allocator.release(ip);
3731 } else {
3732 self.ip_allocator.release(ip);
3733 }
3734 } else {
3735 self.ip_allocator.release(ip);
3736 }
3737 // Drain the per-network membership set.
3738 let mut net_now_empty: Option<String> = None;
3739 if let Some(net) = isolation_network.as_deref() {
3740 if let Some(set) = self.network_members.get_mut(net) {
3741 set.remove(&ip);
3742 }
3743 if self
3744 .network_members
3745 .get(net)
3746 .is_some_and(std::collections::HashSet::is_empty)
3747 {
3748 self.network_members.remove(net);
3749 net_now_empty = Some(net.to_string());
3750 }
3751 }
3752 tracing::info!(ns = %namespace_guid, service = %service, ip = %ip, "Released HCN overlay attachment");
3753
3754 // Last-member teardown: when the final member of an isolation network
3755 // leaves, reclaim its per-network HCN Internal network (mirroring the
3756 // per-service network teardown in `teardown_service_overlay`) so we
3757 // don't leak an HCN vSwitch until the next full uninstall. Drop the
3758 // per-network IP allocator and the marker entry too.
3759 if let Some(net) = net_now_empty {
3760 self.teardown_isolation_network(&net).await;
3761 }
3762 }
3763 Ok(())
3764 }
3765
/// Tear down the host-side state of isolation network `net` once its last
/// member has detached.
///
/// In order:
/// 1. discard the per-network container-IP allocator keyed by the network's
///    marker owner;
/// 2. remove the network's entry from the on-disk network-state marker and
///    persist the marker;
/// 3. if the removed entry is an `hcn-internal` network with a parseable GUID,
///    delete that HCN network on a blocking thread.
///
/// Best-effort: every failure is logged and swallowed, and when the marker has
/// no entry for `net` nothing beyond step 1 happens, so repeated calls are
/// harmless. Counterpart of the per-service network teardown in
/// [`Self::teardown_service_overlay`].
#[cfg(target_os = "windows")]
async fn teardown_isolation_network(&mut self, net: &str) {
    let owner = crate::network_state::owner_for_isolation_network(net);

    // The per-network container-IP allocator is dropped unconditionally.
    self.service_ip_allocators.remove(&owner);

    let state_file = zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();
    let mut state = crate::network_state::NetworkState::load(&state_file);

    // Nothing recorded for this network: nothing further to reclaim.
    let Some(entry) = state.remove(&owner) else {
        return;
    };
    // NOTE(review): the marker entry is dropped before the HCN delete is
    // attempted, so a failed delete leaves a network the marker no longer
    // records — confirm the uninstall sweep can still find it.
    if let Err(e) = state.save(&state_file) {
        tracing::warn!(network = %net, error = %e, path = %state_file.display(), "failed to persist isolation-network marker removal");
    }

    // Only `hcn-internal` entries are deleted through HCN here.
    if entry.kind != "hcn-internal" {
        return;
    }
    let Ok(guid) = windows::core::GUID::try_from(entry.id.as_str()) else {
        tracing::warn!(network = %net, id = %entry.id, "isolation-network marker has unparseable HCN GUID; skipping network delete");
        return;
    };

    // The HCN call is synchronous, so it runs off the async executor.
    let network_id = entry.id;
    let outcome =
        tokio::task::spawn_blocking(move || zlayer_hns::network::Network::delete(guid)).await;
    match outcome {
        Ok(Ok(())) => {
            tracing::info!(network = %net, id = %network_id, "deleted per-isolation-network HCN network on last detach");
        }
        Ok(Err(e)) => {
            tracing::warn!(network = %net, id = %network_id, error = %e, "failed to delete isolation-network HCN network (may leak until uninstall)");
        }
        Err(e) => {
            tracing::warn!(network = %net, id = %network_id, error = %e, "spawn_blocking join failed deleting isolation-network HCN network");
        }
    }
}
3817
3818 /// Non-Windows path.
3819 #[cfg(not(target_os = "windows"))]
3820 #[allow(clippy::unused_async)]
3821 async fn detach_container_windows(
3822 &mut self,
3823 _namespace_guid: &str,
3824 ) -> Result<(), OverlaydError> {
3825 Ok(())
3826 }
3827
/// Make sure this daemon's base HCN overlay network exists on the host and
/// return its GUID.
///
/// Resolution order:
/// 1. the GUID recorded under the base owner in the
///    `{data_dir}/agent_network.json` marker, if HCN can still open it;
/// 2. any host network whose queried name equals this daemon's overlay
///    network name;
/// 3. otherwise a newly created network over `slice_cidr`, which is then
///    recorded in the marker.
///
/// A new network is an HCN *Internal* network (no physical-NIC binding)
/// unless the environment variable named by
/// `zlayer_hns::adapter::ZLAYER_UPLINK_ENV` is set to a non-blank value, in
/// which case a *Transparent* network bound to the adapter returned by
/// `find_primary_adapter` is created instead.
///
/// # Errors
/// Returns an error when a blocking task cannot be joined, GUID generation
/// fails, the uplink adapter lookup fails, or `zlayer_hns` fails to create the
/// network. A failure to persist the marker is only logged.
#[cfg(target_os = "windows")]
#[allow(clippy::too_many_lines)]
async fn ensure_overlay_network(
    &mut self,
    slice_cidr: ipnet::IpNet,
) -> Result<windows::core::GUID, OverlaydError> {
    use windows::core::GUID;

    // One shape for every `spawn_blocking` join failure below.
    let join_failed = |e: tokio::task::JoinError| {
        OverlaydError::Other(format!("spawn_blocking join failed: {e}"))
    };

    let net_name = overlay_network_name(&self.deployment_or_default());
    let state_file = zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();

    // Step 1: a GUID remembered in the marker that HCN can still open.
    let from_marker = crate::network_state::NetworkState::load(&state_file)
        .get(crate::network_state::OWNER_BASE)
        .and_then(|entry| GUID::try_from(entry.id.as_str()).ok());
    if let Some(marker_id) = from_marker {
        let handle = tokio::task::spawn_blocking(move || {
            zlayer_hns::network::Network::open(marker_id).ok()
        })
        .await
        .map_err(join_failed)?;
        if handle.is_some() {
            tracing::info!(name = %net_name, "reusing HCN overlay network from marker");
            return Ok(marker_id);
        }
    }

    // Step 2: any host network already carrying our name. Networks that
    // cannot be opened or queried are skipped.
    let wanted = net_name.clone();
    let by_name = tokio::task::spawn_blocking(move || -> Option<GUID> {
        zlayer_hns::network::list("{}")
            .ok()?
            .into_iter()
            .find(|guid| match zlayer_hns::network::Network::open(*guid) {
                Ok(network) => {
                    matches!(network.query("{}"), Ok(props) if props.name == wanted)
                }
                Err(_) => false,
            })
    })
    .await
    .map_err(join_failed)?;
    if let Some(found) = by_name {
        tracing::info!(name = %net_name, "reusing existing HCN overlay network");
        return Ok(found);
    }

    // Step 3: create a fresh network.
    let net_id = GUID::new()
        .map_err(|e| OverlaydError::Other(format!("GUID::new for overlay network: {e}")))?;
    let subnet_text = slice_cidr.to_string();

    // Internal (no physical NIC) is the default; a non-blank uplink env var
    // opts into the legacy Transparent model bound to a named uplink.
    let transparent = matches!(
        std::env::var(zlayer_hns::adapter::ZLAYER_UPLINK_ENV),
        Ok(value) if !value.trim().is_empty()
    );

    let create_name = net_name.clone();
    let create_subnet = subnet_text.clone();
    if transparent {
        let uplink = zlayer_hns::adapter::find_primary_adapter()
            .map_err(|e| OverlaydError::Other(format!("find_primary_adapter: {e}")))?;
        tracing::warn!(uplink = %uplink, "ZLAYER_HCN_UPLINK_ADAPTER set: creating HCN *Transparent* overlay bound to a physical NIC");
        let created = tokio::task::spawn_blocking(move || {
            zlayer_hns::network::Network::create_transparent(
                net_id,
                &create_name,
                &create_subnet,
                &uplink,
            )
        })
        .await
        .map_err(join_failed)?;
        created.map_err(|e| {
            OverlaydError::Overlay(format!("HcnCreateNetwork transparent ({net_name}): {e}"))
        })?;
    } else {
        let created = tokio::task::spawn_blocking(move || {
            zlayer_hns::network::Network::create_internal(net_id, &create_name, &create_subnet)
        })
        .await
        .map_err(join_failed)?;
        created.map_err(|e| {
            OverlaydError::Overlay(format!("HcnCreateNetwork internal ({net_name}): {e}"))
        })?;
    }

    // Give HCN's Static IPAM ~1-2s to settle its address pool; without the
    // pause the first endpoint frequently fails with
    // HCN_E_ADDR_INVALID_OR_RESERVED.
    tokio::time::sleep(std::time::Duration::from_secs(2)).await;

    tracing::info!(
        subnet = %subnet_text,
        mode = if transparent { "Transparent" } else { "Internal" },
        "created HCN overlay network"
    );

    // Record the network so later runs find it by GUID and a full uninstall
    // knows to delete it. Best-effort: a save failure is only logged.
    let kind = if transparent {
        "hcn-transparent"
    } else {
        "hcn-internal"
    };
    let mut state = crate::network_state::NetworkState::load(&state_file);
    state.upsert(crate::network_state::ManagedNetwork {
        owner: crate::network_state::OWNER_BASE.to_string(),
        kind: kind.to_string(),
        name: net_name,
        id: format_guid_bare(net_id),
        subnet: subnet_text,
        // The base network carries no dedicated WireGuard identity.
        wg_port: None,
        wg_private_key: None,
        wg_public_key: None,
        interface: None,
    });
    if let Err(e) = state.save(&state_file) {
        tracing::warn!(error = %e, path = %state_file.display(), "failed to persist agent network marker (network still reusable by name)");
    }

    Ok(net_id)
}
3969
/// Ensure the SINGLE shared HCN **NAT** network exists on the host, reusing
/// one recorded under the [`OWNER_SHARED_NAT`] marker owner (or discoverable
/// by its derived name) and recording it on create. Reused across every
/// `OverlayMode::Shared` service on this node.
///
/// NAT gives Shared containers outbound connectivity and lets the userspace
/// free-port L4 proxy (`proxy_manager.rs`) forward `host:FREEPORT` ->
/// `container_ip:port` without a per-service vSwitch — the Windows analogue
/// of the Linux node-wide shared bridge. Modeled on
/// [`Self::ensure_overlay_network`] but keyed on [`OWNER_SHARED_NAT`] and
/// forced to the NAT network type.
///
/// Resolution order: the `hcn-nat` GUID recorded in the marker (if HCN can
/// still open it), then any host network named `<base overlay name>-shared`,
/// then a freshly created NAT network over `slice_cidr`.
///
/// Returns the network GUID.
///
/// # Errors
/// Returns an error when a blocking task cannot be joined, GUID generation
/// fails, `shared_nat_settings` yields no settings for the slice CIDR (no
/// usable gateway host), or `zlayer_hns` fails to create the network. A
/// failure to persist the marker is only logged.
#[cfg(target_os = "windows")]
#[allow(clippy::too_many_lines)]
async fn ensure_shared_nat_network(
    &mut self,
    slice_cidr: ipnet::IpNet,
) -> Result<windows::core::GUID, OverlaydError> {
    use windows::core::GUID;

    let daemon_name = self.deployment_or_default();
    // Shared NAT network name: `<base overlay name>-shared` so it is
    // unambiguously distinct from the base network and per-service networks.
    let net_name = format!("{}-shared", overlay_network_name(&daemon_name));
    let owner = crate::network_state::OWNER_SHARED_NAT.to_string();
    let marker_path =
        zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();

    // Fast path: marker names a network GUID that still exists; reopen it.
    // Only `hcn-nat` entries are honoured for this owner.
    let recorded_id = crate::network_state::NetworkState::load(&marker_path)
        .get(&owner)
        .filter(|entry| entry.kind == "hcn-nat")
        .and_then(|entry| GUID::try_from(entry.id.as_str()).ok());
    if let Some(recorded_id) = recorded_id {
        let reopened = tokio::task::spawn_blocking(move || {
            zlayer_hns::network::Network::open(recorded_id).ok()
        })
        .await
        .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?;
        if reopened.is_some() {
            tracing::info!(name = %net_name, "reusing shared HCN NAT network from marker");
            return Ok(recorded_id);
        }
    }

    // Idempotency: reuse a host network whose queried name matches ours.
    // Networks that cannot be opened or queried are skipped.
    let target_name = net_name.clone();
    let existing = tokio::task::spawn_blocking(move || -> Option<GUID> {
        let guids = zlayer_hns::network::list("{}").ok()?;
        for guid in guids {
            let Ok(network) = zlayer_hns::network::Network::open(guid) else {
                continue;
            };
            if matches!(network.query("{}"), Ok(props) if props.name == target_name) {
                return Some(guid);
            }
        }
        None
    })
    .await
    .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?;

    if let Some(existing_id) = existing {
        tracing::info!(name = %net_name, "reusing existing shared HCN NAT network");
        return Ok(existing_id);
    }

    let net_id = GUID::new()
        .map_err(|e| OverlaydError::Other(format!("GUID::new for shared NAT network: {e}")))?;
    let subnet_str = slice_cidr.to_string();
    let settings = shared_nat_settings(&net_name, &subnet_str).ok_or_else(|| {
        OverlaydError::Other(format!(
            "shared NAT network: slice CIDR '{subnet_str}' has no usable gateway host"
        ))
    })?;

    // The blocking create needs only the GUID and the prepared settings; the
    // network name is already carried inside `settings`' inputs, so no extra
    // copy of it is moved into the closure.
    tokio::task::spawn_blocking(move || {
        zlayer_hns::network::Network::create(net_id, &settings)
    })
    .await
    .map_err(|e| OverlaydError::Other(format!("spawn_blocking join failed: {e}")))?
    .map_err(|e| OverlaydError::Overlay(format!("HcnCreateNetwork NAT ({net_name}): {e}")))?;

    // HCN's IPAM needs ~1-2s after network create to settle its address pool
    // (same wait as the base/Internal networks).
    tokio::time::sleep(std::time::Duration::from_secs(2)).await;

    tracing::info!(subnet = %subnet_str, "created shared HCN NAT network");

    // Record the network so later runs reuse it by GUID. Best-effort: a save
    // failure is only logged because the network stays discoverable by name.
    let mut marker = crate::network_state::NetworkState::load(&marker_path);
    marker.upsert(crate::network_state::ManagedNetwork {
        owner,
        kind: "hcn-nat".to_string(),
        name: net_name,
        id: format_guid_bare(net_id),
        subnet: subnet_str,
        // Shared NAT network: no dedicated WireGuard identity.
        wg_port: None,
        wg_private_key: None,
        wg_public_key: None,
        interface: None,
    });
    if let Err(e) = marker.save(&marker_path) {
        tracing::warn!(error = %e, path = %marker_path.display(), "failed to persist shared NAT network marker (network still reusable by name)");
    }

    Ok(net_id)
}
4084
/// Make sure the per-service HCN **Internal** network for `service` exists
/// on the host and return its GUID.
///
/// A dedicated (`OverlayMode::Dedicated`) service gets its own isolated HCN
/// Internal network — an internal vSwitch with no physical-NIC binding —
/// separate from the node's base overlay network; it is the Windows analogue
/// of the Linux per-service bridge. The network is named
/// `<base overlay name>-svc-<service>` and tracked in the marker under
/// [`owner_for_service`]. Unlike [`Self::ensure_overlay_network`] it is never
/// created as Transparent (the on-box test asserts zero external vSwitches
/// for dedicated services).
///
/// Resolution order: the `hcn-internal` GUID recorded in the marker (if HCN
/// can still open it), then any host network with the derived name, then a
/// freshly created Internal network over `subnet`, which is recorded in the
/// marker.
///
/// # Errors
/// Returns an error when a blocking task cannot be joined, GUID generation
/// fails, or `zlayer_hns` fails to create the network. A failure to persist
/// the marker is only logged.
#[cfg(target_os = "windows")]
#[allow(clippy::too_many_lines)]
async fn ensure_service_network(
    &mut self,
    service: &str,
    subnet: ipnet::IpNet,
) -> Result<windows::core::GUID, OverlaydError> {
    use windows::core::GUID;

    // One shape for every `spawn_blocking` join failure below.
    let join_failed = |e: tokio::task::JoinError| {
        OverlaydError::Other(format!("spawn_blocking join failed: {e}"))
    };

    let base_name = overlay_network_name(&self.deployment_or_default());
    let net_name = format!("{base_name}-svc-{service}");
    let owner = owner_for_service(service);
    let state_file = zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();

    // Step 1: a GUID remembered in the marker. A `wg-dedicated` entry under
    // the same owner keeps the transport public key in `id`, not an HCN GUID,
    // so only `hcn-internal` entries are considered.
    let from_marker = crate::network_state::NetworkState::load(&state_file)
        .get(&owner)
        .and_then(|entry| {
            if entry.kind == "hcn-internal" {
                GUID::try_from(entry.id.as_str()).ok()
            } else {
                None
            }
        });
    if let Some(marker_id) = from_marker {
        let handle = tokio::task::spawn_blocking(move || {
            zlayer_hns::network::Network::open(marker_id).ok()
        })
        .await
        .map_err(join_failed)?;
        if handle.is_some() {
            tracing::info!(name = %net_name, service = %service, "reusing per-service HCN network from marker");
            return Ok(marker_id);
        }
    }

    // Step 2: any host network already carrying the derived name. Networks
    // that cannot be opened or queried are skipped.
    let wanted = net_name.clone();
    let by_name = tokio::task::spawn_blocking(move || -> Option<GUID> {
        zlayer_hns::network::list("{}")
            .ok()?
            .into_iter()
            .find(|guid| match zlayer_hns::network::Network::open(*guid) {
                Ok(network) => {
                    matches!(network.query("{}"), Ok(props) if props.name == wanted)
                }
                Err(_) => false,
            })
    })
    .await
    .map_err(join_failed)?;
    if let Some(found) = by_name {
        tracing::info!(name = %net_name, service = %service, "reusing existing per-service HCN network");
        return Ok(found);
    }

    // Step 3: create a fresh network — always Internal, never Transparent.
    let net_id = GUID::new()
        .map_err(|e| OverlaydError::Other(format!("GUID::new for per-service network: {e}")))?;
    let subnet_text = subnet.to_string();

    let create_name = net_name.clone();
    let create_subnet = subnet_text.clone();
    let created = tokio::task::spawn_blocking(move || {
        zlayer_hns::network::Network::create_internal(net_id, &create_name, &create_subnet)
    })
    .await
    .map_err(join_failed)?;
    created.map_err(|e| {
        OverlaydError::Overlay(format!("HcnCreateNetwork internal ({net_name}): {e}"))
    })?;

    // Give HCN's Static IPAM ~1-2s to settle its address pool; without the
    // pause the first endpoint frequently fails with
    // HCN_E_ADDR_INVALID_OR_RESERVED (same wait as the base network).
    tokio::time::sleep(std::time::Duration::from_secs(2)).await;

    tracing::info!(
        service = %service,
        subnet = %subnet_text,
        "created per-service HCN Internal network"
    );

    // Record the network (owner `service:<name>`, kind `hcn-internal`) so
    // later runs reuse it by GUID and a full uninstall deletes it. Best-effort.
    //
    // The same owner key may already hold the service's dedicated WireGuard
    // identity (a `wg-dedicated` entry). The marker keeps one entry per owner,
    // so the `wg_*` and `interface` fields of whatever is there are copied
    // into the rewritten entry; it then holds both the HCN GUID (in `id`) and
    // the WireGuard identity, and the private key is not lost.
    let mut state = crate::network_state::NetworkState::load(&state_file);
    let (wg_port, wg_private_key, wg_public_key, interface) = state
        .get(&owner)
        .map(|prev| {
            (
                prev.wg_port,
                prev.wg_private_key.clone(),
                prev.wg_public_key.clone(),
                prev.interface.clone(),
            )
        })
        .unwrap_or_default();
    state.upsert(crate::network_state::ManagedNetwork {
        owner,
        kind: "hcn-internal".to_string(),
        name: net_name,
        id: format_guid_bare(net_id),
        subnet: subnet_text,
        wg_port,
        wg_private_key,
        wg_public_key,
        interface,
    });
    if let Err(e) = state.save(&state_file) {
        tracing::warn!(service = %service, error = %e, path = %state_file.display(), "failed to persist per-service network marker (network still reusable by name)");
    }

    Ok(net_id)
}
4231
/// Resolve the per-isolation-network subnet for `net`, carving a fixed-size
/// sub-block out of the node slice deterministically by name hash.
///
/// Isolation networks attach onto a dedicated HCN Internal vSwitch and need
/// their OWN address pool (never the node slice's shared pool) so a member's
/// IP is on-link with its network's gateway. Unlike dedicated services,
/// isolation networks aren't registered in the cluster's
/// [`ServiceSubnetRegistry`], so the subnet is derived locally: the node slice
/// is viewed as consecutive `/28` (IPv4) or `/124` (IPv6) blocks and the hash
/// of the network name selects one. A slice already more specific than the
/// target prefix yields itself.
///
/// The selected block is computed arithmetically
/// (`slice network address | index << host bits`) instead of enumerating
/// every block, so the cost is constant even for an IPv6 slice that contains
/// an astronomically large number of `/124` blocks. The chosen block is the
/// same one that indexing the ascending list of sub-blocks would produce.
///
/// Two different names may hash to the same block; this function does not
/// detect that.
///
/// # Errors
/// Returns an error if no node slice is assigned yet, the slice CIDR is
/// unparseable, or the selected block cannot be expressed as a network of the
/// target prefix.
#[cfg(target_os = "windows")]
fn isolation_network_subnet(&self, net: &str) -> Result<ipnet::IpNet, OverlaydError> {
    use std::hash::{Hash, Hasher};

    let slice = self.slice_cidr.ok_or_else(|| {
        OverlaydError::Other(
            "no node slice assigned yet (SetupGlobalOverlay with slice_cidr first)".to_string(),
        )
    })?;
    let slice_ipnet: ipnet::IpNet = slice.to_string().parse().map_err(|e| {
        OverlaydError::Other(format!("failed to parse slice CIDR {slice}: {e}"))
    })?;

    // NOTE(review): `DefaultHasher::new()` is deterministic for a given
    // build, but std documents its algorithm as unspecified across Rust
    // releases — "same name -> same block" may not survive a toolchain
    // upgrade. Confirm whether a pinned hash is needed.
    let mut hasher = std::collections::hash_map::DefaultHasher::new();
    net.hash(&mut hasher);
    let name_hash = u128::from(hasher.finish());

    let prefix_error = |sub_prefix: u8, e: ipnet::PrefixLenError| {
        OverlaydError::Other(format!(
            "failed to subnet slice {slice_ipnet} into /{sub_prefix} blocks: {e}"
        ))
    };

    match slice_ipnet {
        ipnet::IpNet::V4(v4) => {
            // A `/28` gives ~13 usable container IPs per isolation network
            // while leaving room for several networks inside one slice.
            // Clamp so a more specific slice just yields itself.
            let sub_prefix = 28u8.max(v4.prefix_len());
            // Number of sub-blocks is 2^(sub_prefix - slice prefix) <= 2^28.
            let block_count = 1u128 << (sub_prefix - v4.prefix_len());
            // The remainder is < 2^28, so the conversion cannot fail.
            let idx = u32::try_from(name_hash % block_count).unwrap_or(0);
            // At most 4 host bits, and `idx << host_bits` stays inside the
            // slice's host range, so OR-ing onto the network address is exact.
            let host_bits = u32::from(32 - sub_prefix);
            let addr = Ipv4Addr::from(u32::from(v4.network()) | (idx << host_bits));
            ipnet::Ipv4Net::new(addr, sub_prefix)
                .map(ipnet::IpNet::V4)
                .map_err(|e| prefix_error(sub_prefix, e))
        }
        ipnet::IpNet::V6(v6) => {
            let sub_prefix = 124u8.max(v6.prefix_len());
            // Up to 2^124 sub-blocks: representable in `u128`, but far too
            // many to enumerate, hence the arithmetic selection.
            let block_count = 1u128 << (sub_prefix - v6.prefix_len());
            // The hash is 64 bits wide, so `idx` is below 2^64 and the shift
            // by at most 4 host bits cannot overflow.
            let idx = name_hash % block_count;
            let host_bits = u32::from(128 - sub_prefix);
            let addr = Ipv6Addr::from(u128::from(v6.network()) | (idx << host_bits));
            ipnet::Ipv6Net::new(addr, sub_prefix)
                .map(ipnet::IpNet::V6)
                .map_err(|e| prefix_error(sub_prefix, e))
        }
    }
}
4294
/// Make sure the per-isolation-network HCN **Internal** network for `net`
/// exists on the host and return its GUID.
///
/// This is the Windows mechanism for per-network L3 isolation: every `ZLayer`
/// "isolated" network gets its own HCN Internal vSwitch with no physical-NIC
/// binding. HCN Internal vSwitches are mutually isolated by default, so
/// members of the same isolation network share a vSwitch while members of
/// different isolation networks land on different ones — no ACLs and no
/// per-member churn. Shaped like [`Self::ensure_service_network`], but tracked
/// under [`owner_for_isolation_network`] and named `<overlay>-iso-<net>`.
///
/// Resolution order: the `hcn-internal` GUID recorded in the marker (if HCN
/// can still open it), then any host network with the derived name, then a
/// freshly created Internal network over `subnet`, which is recorded in the
/// marker.
///
/// # Errors
/// Returns an error when a blocking task cannot be joined, GUID generation
/// fails, or `zlayer_hns` fails to create the network. A failure to persist
/// the marker is only logged.
#[cfg(target_os = "windows")]
async fn ensure_isolation_network(
    &mut self,
    net: &str,
    subnet: ipnet::IpNet,
) -> Result<windows::core::GUID, OverlaydError> {
    use windows::core::GUID;

    // One shape for every `spawn_blocking` join failure below.
    let join_failed = |e: tokio::task::JoinError| {
        OverlaydError::Other(format!("spawn_blocking join failed: {e}"))
    };

    let base_name = overlay_network_name(&self.deployment_or_default());
    let net_name = format!("{base_name}-iso-{net}");
    let owner = crate::network_state::owner_for_isolation_network(net);
    let state_file = zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();

    // Step 1: an `hcn-internal` GUID remembered in the marker.
    let from_marker = crate::network_state::NetworkState::load(&state_file)
        .get(&owner)
        .and_then(|entry| {
            if entry.kind == "hcn-internal" {
                GUID::try_from(entry.id.as_str()).ok()
            } else {
                None
            }
        });
    if let Some(marker_id) = from_marker {
        let handle = tokio::task::spawn_blocking(move || {
            zlayer_hns::network::Network::open(marker_id).ok()
        })
        .await
        .map_err(join_failed)?;
        if handle.is_some() {
            tracing::info!(name = %net_name, network = %net, "reusing per-isolation-network HCN network from marker");
            return Ok(marker_id);
        }
    }

    // Step 2: any host network already carrying the derived name. Networks
    // that cannot be opened or queried are skipped.
    let wanted = net_name.clone();
    let by_name = tokio::task::spawn_blocking(move || -> Option<GUID> {
        zlayer_hns::network::list("{}")
            .ok()?
            .into_iter()
            .find(|guid| match zlayer_hns::network::Network::open(*guid) {
                Ok(network) => {
                    matches!(network.query("{}"), Ok(props) if props.name == wanted)
                }
                Err(_) => false,
            })
    })
    .await
    .map_err(join_failed)?;
    if let Some(found) = by_name {
        tracing::info!(name = %net_name, network = %net, "reusing existing per-isolation-network HCN network");
        return Ok(found);
    }

    // Step 3: create a fresh network — always Internal, never Transparent,
    // because the Internal-vSwitch property is exactly the isolation wanted.
    let net_id = GUID::new().map_err(|e| {
        OverlaydError::Other(format!("GUID::new for per-isolation-network network: {e}"))
    })?;
    let subnet_text = subnet.to_string();

    let create_name = net_name.clone();
    let create_subnet = subnet_text.clone();
    let created = tokio::task::spawn_blocking(move || {
        zlayer_hns::network::Network::create_internal(net_id, &create_name, &create_subnet)
    })
    .await
    .map_err(join_failed)?;
    created.map_err(|e| {
        OverlaydError::Overlay(format!("HcnCreateNetwork internal ({net_name}): {e}"))
    })?;

    // Give HCN's Static IPAM ~1-2s to settle its address pool; without the
    // pause the first endpoint frequently fails with
    // HCN_E_ADDR_INVALID_OR_RESERVED (same wait as the per-service network).
    tokio::time::sleep(std::time::Duration::from_secs(2)).await;

    tracing::info!(
        network = %net,
        subnet = %subnet_text,
        "created per-isolation-network HCN Internal network"
    );

    // Record the network (owner `iso:<net>`, kind `hcn-internal`) so later
    // runs reuse it by GUID and a full uninstall deletes it. Best-effort.
    let mut state = crate::network_state::NetworkState::load(&state_file);
    state.upsert(crate::network_state::ManagedNetwork {
        owner,
        kind: "hcn-internal".to_string(),
        name: net_name,
        id: format_guid_bare(net_id),
        subnet: subnet_text,
        // An isolation network carries no dedicated WireGuard identity.
        wg_port: None,
        wg_private_key: None,
        wg_public_key: None,
        interface: None,
    });
    if let Err(e) = state.save(&state_file) {
        tracing::warn!(network = %net, error = %e, path = %state_file.display(), "failed to persist per-isolation-network marker (network still reusable by name)");
    }

    Ok(net_id)
}
4426
/// Look up the dedicated per-service subnet of `service`, if one is known on
/// this node.
///
/// Two sources are consulted, in order:
/// 1. the live entry for `service` in `service_transports` (the normal case
///    once `SetupServiceOverlay` has run in this process);
/// 2. a persisted `hcn-internal` marker entry under
///    [`owner_for_service`]`(service)` whose `subnet` parses as a CIDR — this
///    covers an HCN network left by a prior run while the transport map is
///    still empty.
///
/// Returns `None` when neither source has it, which is the case for
/// Shared-mode services (they attach onto the base network).
#[cfg(target_os = "windows")]
fn dedicated_service_subnet(&self, service: &str) -> Option<ipnet::IpNet> {
    self.service_transports
        .get(service)
        .map(|live| live.subnet)
        .or_else(|| {
            // Fall back to what a previous run persisted on disk.
            let state_file =
                zlayer_paths::ZLayerDirs::new(self.data_dir.clone()).agent_network_state();
            let state = crate::network_state::NetworkState::load(&state_file);
            let entry = state.get(&owner_for_service(service))?;
            if entry.kind == "hcn-internal" {
                entry.subnet.parse::<ipnet::IpNet>().ok()
            } else {
                None
            }
        })
}
4450
/// Name of this daemon for HCN network naming: the configured deployment
/// name, or `"zlayer"` while that name is still empty.
#[cfg(target_os = "windows")]
fn deployment_or_default(&self) -> String {
    match self.deployment.as_str() {
        "" => "zlayer".to_string(),
        configured => configured.to_string(),
    }
}
4461
4462 // -- peers ---------------------------------------------------------------
4463
4464 /// Resolve a [`PeerScope`] to the live [`OverlayTransport`] its ops target.
4465 ///
4466 /// `Global` -> the single cluster transport; `Service { service }` -> that
4467 /// service's dedicated per-service transport (Dedicated mode only).
4468 ///
4469 /// # Errors
4470 /// Returns an error if the global overlay is not up (for `Global`) or no
4471 /// dedicated overlay exists for the named service (for `Service`).
4472 fn transport_for_scope(&self, scope: &PeerScope) -> Result<&OverlayTransport, OverlaydError> {
4473 match scope {
4474 PeerScope::Global => self
4475 .global_transport
4476 .as_ref()
4477 .ok_or_else(|| OverlaydError::Other("global overlay not set up".into())),
4478 PeerScope::Service { service } => self
4479 .service_transports
4480 .get(service)
4481 .map(|s| &s.transport)
4482 .ok_or_else(|| {
4483 OverlaydError::Other(format!("no dedicated overlay for service {service}"))
4484 }),
4485 }
4486 }
4487
4488 /// Add a peer to a resolved transport.
4489 ///
4490 /// # Errors
4491 /// Wraps the underlying transport error.
4492 async fn add_peer_on(
4493 transport: &OverlayTransport,
4494 peer: &PeerInfo,
4495 ) -> Result<(), OverlaydError> {
4496 transport
4497 .add_peer(peer)
4498 .await
4499 .map_err(|e| OverlaydError::Overlay(format!("add_peer failed: {e}")))
4500 }
4501
4502 /// Remove a peer (by base64 public key) from a resolved transport.
4503 ///
4504 /// # Errors
4505 /// Wraps the underlying transport error.
4506 async fn remove_peer_on(
4507 transport: &OverlayTransport,
4508 pubkey: &str,
4509 ) -> Result<(), OverlaydError> {
4510 transport
4511 .remove_peer(pubkey)
4512 .await
4513 .map_err(|e| OverlaydError::Overlay(format!("remove_peer failed: {e}")))
4514 }
4515
4516 /// Plumb a CIDR into a peer's `AllowedIPs` on a resolved transport.
4517 ///
4518 /// # Errors
4519 /// Returns an error when the CIDR is invalid or the UAPI write fails.
4520 async fn add_allowed_ip_on(
4521 transport: &OverlayTransport,
4522 pubkey: &str,
4523 cidr: &str,
4524 ) -> Result<(), OverlaydError> {
4525 let net: ipnet::IpNet = cidr
4526 .parse()
4527 .map_err(|e| OverlaydError::Other(format!("invalid CIDR {cidr}: {e}")))?;
4528 transport
4529 .add_allowed_ip(pubkey, net)
4530 .await
4531 .map_err(|e| OverlaydError::Overlay(format!("add_allowed_ip failed: {e}")))
4532 }
4533
4534 /// Remove a CIDR from a peer's `AllowedIPs` on a resolved transport.
4535 ///
4536 /// # Errors
4537 /// Returns an error when the CIDR is invalid or the UAPI write fails.
4538 async fn remove_allowed_ip_on(
4539 transport: &OverlayTransport,
4540 pubkey: &str,
4541 cidr: &str,
4542 ) -> Result<(), OverlaydError> {
4543 let net: ipnet::IpNet = cidr
4544 .parse()
4545 .map_err(|e| OverlaydError::Other(format!("invalid CIDR {cidr}: {e}")))?;
4546 transport
4547 .remove_allowed_ip(pubkey, net)
4548 .await
4549 .map_err(|e| OverlaydError::Overlay(format!("remove_allowed_ip failed: {e}")))
4550 }
4551
4552 // -- DNS -----------------------------------------------------------------
4553
4554 /// Register an overlay DNS A/AAAA record.
4555 fn register_dns(&mut self, name: String, ip: IpAddr) {
4556 self.dns_records.insert(name, ip);
4557 }
4558
4559 /// Remove an overlay DNS record.
4560 fn unregister_dns(&mut self, name: &str) {
4561 self.dns_records.remove(name);
4562 }
4563
4564 // -- NAT -----------------------------------------------------------------
4565
/// Periodic NAT traversal maintenance: lazily start NAT (and the built-in
/// relay server), re-probe STUN, refresh relays, and run the connect-half —
/// hole-punching / relaying toward every peer whose direct endpoint has not
/// produced a recent `WireGuard` handshake.
///
/// No-op when NAT traversal is disabled in the resolved [`NatConfig`].
///
/// Failure handling differs per step:
/// - a relay-server start failure is logged and otherwise ignored;
/// - a candidate-gathering failure on the first tick is logged and the tick
///   returns `Ok(())` without installing the orchestrator, so the next tick
///   retries the whole start-up path;
/// - a refresh failure is the only condition surfaced as an error.
///
/// # Errors
/// Returns [`OverlaydError::Overlay`] when the underlying
/// [`NatTraversal`] refresh fails.
async fn nat_maintenance_tick(&mut self) -> Result<(), OverlaydError> {
    // Lazily start NAT traversal on the first tick if a config asks for it.
    if self.nat_traversal.is_none() {
        // A missing config behaves like `NatConfig::default()`.
        let config = self.nat_config.clone().unwrap_or_default();
        if config.enabled {
            // Stand up the built-in relay server here (once) when the
            // resolved config carries a `relay_server`. The auth credential
            // MUST be cluster-wide-shared (every node's relay *client*
            // derives the same BLAKE2b key via `derive_auth_key`), so it
            // comes from `cluster_relay_credential` — the cluster HS256
            // secret the main daemon stamped into
            // `NatConfigSpec.relay_server.auth_credential`, NOT the node's
            // per-node WireGuard key. When no credential was supplied the
            // relay derives a key from the empty string (only same-config
            // nodes can use it).
            if let Some(relay_cfg) = config.relay_server.clone() {
                // `relay_server` stays `None` after a failed start, so a later
                // first-tick retry (after a gather failure) tries again.
                if self.relay_server.is_none() {
                    let credential = self.cluster_relay_credential.clone().unwrap_or_default();
                    let relay = RelayServer::new(&relay_cfg, &credential);
                    match relay.start().await {
                        Ok(bound) => {
                            tracing::info!(
                                bound = %bound,
                                external = %relay_cfg.external_addr,
                                "Built-in relay server started"
                            );
                            self.relay_bound_addr = Some(bound);
                            self.relay_server = Some(relay);
                        }
                        Err(e) => {
                            // Best-effort: NAT traversal still starts without
                            // the local relay.
                            tracing::warn!(error = %e, "Built-in relay server failed to start");
                        }
                    }
                }
            }

            let mut nat = NatTraversal::new(config, self.overlay_port);
            match nat.gather_candidates().await {
                Ok(candidates) => {
                    tracing::info!(count = candidates.len(), "Gathered NAT candidates");
                    self.nat_last_refresh.store(now_unix(), Ordering::SeqCst);
                    // Only a successfully gathered orchestrator is installed.
                    self.nat_traversal = Some(nat);
                }
                Err(e) => {
                    tracing::warn!(error = %e, "NAT candidate gathering failed");
                    // `nat_traversal` is still `None`, so the next tick retries.
                    return Ok(());
                }
            }
            // First-tick connect: try to establish toward every already-known
            // peer (peers added before NAT came up).
            // NOTE(review): execution falls through to the refresh + connect
            // below, so the first tick runs the connect pass twice and
            // refreshes right after gathering — confirm this is intended.
            self.nat_connect_known_peers().await;
        } else {
            // NAT traversal disabled: nothing to maintain.
            return Ok(());
        }
    }

    // Refresh STUN/relay state, then run the connect-half for peers that
    // still lack a recent handshake.
    if let Some(nat) = self.nat_traversal.as_mut() {
        match nat.refresh().await {
            Ok(changed) => {
                if changed {
                    tracing::info!("NAT reflexive address changed during refresh");
                }
                self.nat_last_refresh.store(now_unix(), Ordering::SeqCst);
            }
            Err(e) => {
                // The connect pass is skipped for this tick when refresh fails.
                return Err(OverlaydError::Overlay(format!(
                    "NAT maintenance tick failed: {e}"
                )));
            }
        }
    }
    self.nat_connect_known_peers().await;
    Ok(())
}
4651
4652 /// The NAT connect-half: for every peer with advertised candidates that has
4653 /// no recent `WireGuard` handshake, call [`NatTraversal::connect_to_peer`]
4654 /// (which itself updates the live device's peer endpoint) and record the
4655 /// resulting [`ConnectionType`].
4656 ///
4657 /// Best-effort: a peer with no live global transport, no candidates, or a
4658 /// failed traversal is left untouched (its persistent direct endpoint keeps
4659 /// retrying). Candidate sets are collected into a local `Vec` first so the
4660 /// borrow of `self.nat_traversal` / `self.global_transport` does not overlap
4661 /// the mutable borrow of `self.peer_connection_type`.
4662 async fn nat_connect_known_peers(&mut self) {
4663 // No host transport (VM-only overlay) or no NAT orchestrator → nothing
4664 // to connect on this node.
4665 let (Some(_), Some(_)) = (self.global_transport.as_ref(), self.nat_traversal.as_ref())
4666 else {
4667 return;
4668 };
4669 if self.peer_candidates.is_empty() {
4670 return;
4671 }
4672
4673 // Peers whose handshake is older than this cutoff (or never seen) are
4674 // candidates for a (re)connect attempt. WireGuard's default keepalive is
4675 // 25s; 3× that is a generous "the direct endpoint is clearly not
4676 // establishing" threshold that avoids churning healthy peers.
4677 let cutoff = now_unix().saturating_sub(75);
4678
4679 // Snapshot the (pubkey, candidates) work set up front to satisfy the
4680 // borrow checker (we borrow self.transport + self.nat below).
4681 let work: Vec<(String, Vec<Candidate>)> = self
4682 .peer_candidates
4683 .iter()
4684 .map(|(k, v)| (k.clone(), v.clone()))
4685 .collect();
4686
4687 let transport = self.global_transport.as_ref().expect("checked above");
4688 let nat = self.nat_traversal.as_ref().expect("checked above");
4689 let mut results: Vec<(String, ConnectionType)> = Vec::new();
4690
4691 for (pubkey, candidates) in &work {
4692 // Skip peers that already have a fresh handshake on the live device.
4693 match transport.check_peer_handshake(pubkey, cutoff).await {
4694 Ok(true) => continue,
4695 Ok(false) => {}
4696 Err(e) => {
4697 tracing::debug!(peer = %pubkey, error = %e, "handshake check failed; attempting connect anyway");
4698 }
4699 }
4700 match nat.connect_to_peer(transport, pubkey, candidates).await {
4701 Ok(connection_type) => {
4702 tracing::info!(
4703 peer = %pubkey,
4704 connection = %connection_type,
4705 "NAT traversal established connection to peer"
4706 );
4707 results.push((pubkey.clone(), connection_type));
4708 }
4709 Err(e) => {
4710 tracing::debug!(peer = %pubkey, error = %e, "NAT traversal could not connect to peer this tick");
4711 }
4712 }
4713 }
4714
4715 for (pubkey, ct) in results {
4716 self.peer_connection_type.insert(pubkey, ct);
4717 }
4718 }
4719
4720 /// Build a [`NatStatusWire`] from the live NAT orchestrator: this node's
4721 /// local candidates, the per-peer connection types recorded by the connect
4722 /// loop (with each peer's current remote endpoint parsed from the UAPI
4723 /// status dump), and the last STUN-refresh timestamp.
4724 async fn nat_status_snapshot(&self) -> NatStatusWire {
4725 let candidates = self
4726 .nat_traversal
4727 .as_ref()
4728 .map(|n| n.local_candidates().iter().map(candidate_to_wire).collect())
4729 .unwrap_or_default();
4730
4731 // Map hex-pubkey -> current remote endpoint from the live device's UAPI
4732 // dump. The dump keys peers by hex; `peer_connection_type` keys by
4733 // base64, so the join below converts each base64 key to hex.
4734 let mut endpoints: HashMap<String, String> = HashMap::new();
4735 if let Some(transport) = self.global_transport.as_ref() {
4736 if let Ok(dump) = transport.status().await {
4737 for p in parse_peer_status(&dump) {
4738 if !p.endpoint.is_empty() {
4739 endpoints.insert(p.public_key, p.endpoint);
4740 }
4741 }
4742 }
4743 }
4744
4745 let peers = self
4746 .peer_connection_type
4747 .iter()
4748 .map(|(pubkey, ct)| {
4749 let remote_endpoint = zlayer_overlay::nat::pubkey_b64_to_hex(pubkey)
4750 .and_then(|hex| endpoints.get(&hex).cloned());
4751 NatPeerWire {
4752 node_id: pubkey.clone(),
4753 connection_type: ct.to_string(),
4754 remote_endpoint,
4755 }
4756 })
4757 .collect();
4758
4759 NatStatusWire {
4760 candidates,
4761 peers,
4762 last_refresh: self.nat_last_refresh.load(Ordering::SeqCst),
4763 }
4764 }
4765
4766 // -- status --------------------------------------------------------------
4767
4768 /// Build a [`StatusSnapshot`] from current overlay state.
4769 async fn status_snapshot(&self) -> StatusSnapshot {
4770 let mut peers: Vec<PeerStatus> = Vec::new();
4771 let public_key = self.transport_public_key.clone();
4772
4773 if let Some(transport) = self.global_transport.as_ref() {
4774 // Parse the UAPI dump for per-peer state. Best-effort: a parse
4775 // failure leaves the peer list empty rather than failing Status.
4776 if let Ok(dump) = transport.status().await {
4777 peers = parse_peer_status(&dump);
4778 }
4779 }
4780
4781 let service_count = u32::try_from(self.service_count()).unwrap_or(u32::MAX);
4782 let peer_count = u32::try_from(peers.len()).unwrap_or(u32::MAX);
4783
4784 // Per dedicated per-service overlay device: count its peers the same
4785 // way the global status does (parse the UAPI/status dump).
4786 let mut dedicated_services: Vec<DedicatedServiceStatus> = Vec::new();
4787 for (svc, st) in &self.service_transports {
4788 let peer_count = match st.transport.status().await {
4789 Ok(dump) => u32::try_from(parse_peer_status(&dump).len()).unwrap_or(u32::MAX),
4790 Err(_) => 0,
4791 };
4792 dedicated_services.push(DedicatedServiceStatus {
4793 service: svc.clone(),
4794 interface: st.interface.clone(),
4795 public_key: st.public_key.clone(),
4796 listen_port: st.listen_port,
4797 overlay_ip: st.overlay_ip,
4798 subnet: st.subnet.to_string(),
4799 peer_count,
4800 });
4801 }
4802
4803 StatusSnapshot {
4804 interface: self.global_interface.clone(),
4805 node_ip: self.node_ip,
4806 public_key,
4807 overlay_cidr: self.cluster_cidr.map(|c| c.to_string()),
4808 slice_cidr: self.slice_cidr.map(|c| c.to_string()),
4809 peer_count,
4810 service_count,
4811 peers,
4812 dedicated_services,
4813 }
4814 }
4815
4816 /// Number of per-service overlays set up on this node (Shared bridges /
4817 /// placeholders plus any Dedicated transports not already counted there).
4818 fn service_count(&self) -> usize {
4819 let extra_dedicated = self
4820 .service_transports
4821 .keys()
4822 .filter(|svc| !self.service_interfaces.contains_key(*svc))
4823 .count();
4824 self.service_interfaces.len() + extra_dedicated
4825 }
4826
4827 // -- config helper -------------------------------------------------------
4828
4829 fn build_config(
4830 &self,
4831 private_key: String,
4832 public_key: String,
4833 ip: IpAddr,
4834 mask: u8,
4835 listen_port: u16,
4836 physical_egress_ip: Option<IpAddr>,
4837 ) -> OverlayConfig {
4838 // Pick the source/advertised address for the WireGuard endpoint.
4839 //
4840 // Default is the family-matched UNSPECIFIED (`0.0.0.0` / `::`), which lets
4841 // the kernel pick a source per outgoing packet. When the caller resolved a
4842 // physical-egress IP (see `detect_physical_egress`) *and* its family
4843 // matches the overlay IP's family, we pin `local_endpoint` to that IP so
4844 // boringtun's data socket sources from — and advertises — the real NIC
4845 // rather than whatever the default route (possibly a VPN mesh) would pick.
4846 //
4847 // Family mismatch (e.g. physical egress is v4 but this overlay is v6) is
4848 // unusable for source selection, so we warn and fall back to UNSPECIFIED.
4849 //
4850 // boringtun limitation: boringtun 0.7's `DeviceConfig` exposes no way to
4851 // inject or pin the WireGuard DATA socket (its `uapi_fd` is the UAPI
4852 // CONTROL socket only), so `SO_BINDTODEVICE` on the data socket is
4853 // impossible today. Setting `local_endpoint` to the physical IP governs
4854 // source-address selection and the advertised endpoint, which is the
4855 // realistic scope of control we have.
4856 let unspecified = match ip {
4857 IpAddr::V4(_) => IpAddr::V4(Ipv4Addr::UNSPECIFIED),
4858 IpAddr::V6(_) => IpAddr::V6(Ipv6Addr::UNSPECIFIED),
4859 };
4860 let local_addr =
4861 if rootless_forces_unspecified(std::env::var_os("ZLAYER_ROOTLESS").is_some()) {
4862 // Rootless: detect_physical_egress() resolves pasta's in-netns tap IP
4863 // (e.g. 192.168.68.x), which is useless as a WG source/advertised
4864 // endpoint to remote peers. Force UNSPECIFIED; the kernel picks the
4865 // source per packet and the real reachable endpoint comes from the
4866 // advertise_addr path + pasta forwarding.
4867 unspecified
4868 } else {
4869 match physical_egress_ip {
4870 Some(egress) if egress.is_ipv4() == ip.is_ipv4() => egress,
4871 Some(egress) => {
4872 tracing::warn!(
4873 physical_egress_ip = %egress,
4874 overlay_ip = %ip,
4875 "physical egress IP family does not match overlay IP family; \
4876 falling back to UNSPECIFIED for WireGuard local_endpoint"
4877 );
4878 unspecified
4879 }
4880 None => unspecified,
4881 }
4882 };
4883 let mut config = OverlayConfig {
4884 local_endpoint: SocketAddr::new(local_addr, listen_port),
4885 private_key,
4886 public_key,
4887 overlay_cidr: format!("{ip}/{mask}"),
4888 ..OverlayConfig::default()
4889 };
4890 if let Some(nat) = self.nat_config.clone() {
4891 config.nat = nat;
4892 }
4893 if let Some(dir) = self.uapi_sock_dir.clone() {
4894 config.uapi_sock_dir = dir;
4895 }
4896 config
4897 }
4898}
4899
4900/// Build an `Auto`-mode [`ServiceOverlayInfo`]: the per-service bridge/placeholder
4901/// name with every dedicated-device identity field left `None` (`Auto` carries
4902/// the service subnet on the single cluster-wide `WireGuard` device).
4903fn cluster_wg_overlay_info(name: String) -> ServiceOverlayInfo {
4904 ServiceOverlayInfo {
4905 name,
4906 mode: OverlayMode::Auto,
4907 wg_public_key: None,
4908 wg_port: None,
4909 overlay_ip: None,
4910 subnet: None,
4911 }
4912}
4913
4914/// Build a `Shared`-mode [`ServiceOverlayInfo`]: the shared node-wide
4915/// bridge/placeholder name with every dedicated-device identity field left
4916/// `None` (Shared mode shares the single cluster device and the node-wide
4917/// bridge; ports are exposed by the userspace free-port L4 proxy).
4918fn shared_overlay_info(name: String) -> ServiceOverlayInfo {
4919 ServiceOverlayInfo {
4920 name,
4921 mode: OverlayMode::Shared,
4922 wg_public_key: None,
4923 wg_port: None,
4924 overlay_ip: None,
4925 subnet: None,
4926 }
4927}
4928
4929/// Build a Dedicated-mode [`ServiceOverlayInfo`] from a dedicated device's
4930/// identity. `name` is the container-attach handle (bridge name on Linux, the
4931/// dedicated interface elsewhere).
4932fn dedicated_overlay_info(
4933 name: String,
4934 public_key: &str,
4935 listen_port: u16,
4936 overlay_ip: IpAddr,
4937 subnet: ipnet::IpNet,
4938) -> ServiceOverlayInfo {
4939 ServiceOverlayInfo {
4940 name,
4941 mode: OverlayMode::Dedicated,
4942 wg_public_key: Some(public_key.to_string()),
4943 wg_port: Some(listen_port),
4944 overlay_ip: Some(overlay_ip),
4945 subnet: Some(subnet.to_string()),
4946 }
4947}
4948
4949/// Convert a wire [`PeerSpec`] into a `zlayer_overlay::PeerInfo`.
4950///
4951/// # Errors
4952/// Returns an error if `endpoint` cannot be parsed as a `host:port`
4953/// [`SocketAddr`].
4954pub fn peer_spec_to_info(spec: &PeerSpec) -> Result<PeerInfo, OverlaydError> {
4955 let endpoint: SocketAddr = spec.endpoint.parse().map_err(|e| {
4956 OverlaydError::Other(format!("invalid peer endpoint {}: {e}", spec.endpoint))
4957 })?;
4958 Ok(PeerInfo::new(
4959 spec.public_key.clone(),
4960 endpoint,
4961 &spec.allowed_ips,
4962 std::time::Duration::from_secs(spec.persistent_keepalive_secs),
4963 ))
4964}
4965
4966/// Parse a `wg`-style UAPI/`status` dump into [`PeerStatus`] entries.
4967///
4968/// The dump is a series of `key=value` lines; each `public_key=` line starts a
4969/// new peer block, and subsequent `endpoint=` / `allowed_ip=` /
4970/// `latest_handshake=` lines belong to it.
4971fn parse_peer_status(dump: &str) -> Vec<PeerStatus> {
4972 let mut peers: Vec<PeerStatus> = Vec::new();
4973 let mut current: Option<PeerStatus> = None;
4974 let mut allowed: Vec<String> = Vec::new();
4975
4976 let flush = |peers: &mut Vec<PeerStatus>,
4977 current: &mut Option<PeerStatus>,
4978 allowed: &mut Vec<String>| {
4979 if let Some(mut p) = current.take() {
4980 p.allowed_ips = allowed.join(",");
4981 peers.push(p);
4982 }
4983 allowed.clear();
4984 };
4985
4986 for line in dump.lines() {
4987 let line = line.trim();
4988 let Some((key, value)) = line.split_once('=') else {
4989 continue;
4990 };
4991 match key.trim() {
4992 "public_key" | "peer" => {
4993 flush(&mut peers, &mut current, &mut allowed);
4994 current = Some(PeerStatus {
4995 public_key: value.trim().to_string(),
4996 endpoint: String::new(),
4997 allowed_ips: String::new(),
4998 last_handshake_unix_secs: 0,
4999 });
5000 }
5001 "endpoint" => {
5002 if let Some(p) = current.as_mut() {
5003 p.endpoint = value.trim().to_string();
5004 }
5005 }
5006 "allowed_ip" | "allowed_ips" if current.is_some() => {
5007 allowed.push(value.trim().to_string());
5008 }
5009 "latest_handshake" | "last_handshake_time_sec" => {
5010 if let Some(p) = current.as_mut() {
5011 p.last_handshake_unix_secs = value.trim().parse().unwrap_or(0);
5012 }
5013 }
5014 _ => {}
5015 }
5016 }
5017 flush(&mut peers, &mut current, &mut allowed);
5018 peers
5019}
5020
5021/// Convert a wire [`NatConfigSpec`] into the live [`NatConfig`] overlayd drives.
5022///
5023/// Sub-fields left at their zero value in the spec fall back to
5024/// [`NatConfig::default`]'s value (so a sparsely-populated spec still gets sane
5025/// STUN servers / timeouts). The `relay_server`'s `auth_credential` is stripped
5026/// here — it is carried separately on the server (`cluster_relay_credential`)
5027/// because `RelayServerConfig` has no credential field; this conversion only
5028/// produces the bind/external/max-sessions triple it does carry.
5029fn nat_config_spec_to_config(spec: NatConfigSpec) -> NatConfig {
5030 let defaults = NatConfig::default();
5031 NatConfig {
5032 enabled: spec.enabled,
5033 stun_servers: if spec.stun_servers.is_empty() {
5034 defaults.stun_servers
5035 } else {
5036 spec.stun_servers
5037 .into_iter()
5038 .map(|address| StunServerConfig {
5039 address,
5040 label: None,
5041 })
5042 .collect()
5043 },
5044 turn_servers: spec
5045 .turn_servers
5046 .into_iter()
5047 .map(|t| TurnServerConfig {
5048 address: t.addr,
5049 username: t.username,
5050 credential: t.credential,
5051 region: None,
5052 })
5053 .collect(),
5054 hole_punch_timeout_secs: if spec.hole_punch_timeout_secs == 0 {
5055 defaults.hole_punch_timeout_secs
5056 } else {
5057 spec.hole_punch_timeout_secs
5058 },
5059 stun_refresh_interval_secs: if spec.stun_refresh_interval_secs == 0 {
5060 defaults.stun_refresh_interval_secs
5061 } else {
5062 spec.stun_refresh_interval_secs
5063 },
5064 max_candidate_pairs: if spec.max_candidate_pairs == 0 {
5065 defaults.max_candidate_pairs
5066 } else {
5067 spec.max_candidate_pairs
5068 },
5069 relay_server: spec.relay_server.map(|r| RelayServerConfig {
5070 listen_port: r.listen_port,
5071 external_addr: r.external_addr,
5072 max_sessions: if r.max_sessions == 0 {
5073 default_max_relay_sessions()
5074 } else {
5075 r.max_sessions
5076 },
5077 }),
5078 }
5079}
5080
/// Relay `max_sessions` applied when a spec leaves the field at `0`.
///
/// Mirrors `zlayer_overlay::nat::config`'s private
/// `default_max_relay_sessions` (100).
const fn default_max_relay_sessions() -> usize {
    const FALLBACK_MAX_SESSIONS: usize = 100;
    FALLBACK_MAX_SESSIONS
}
5086
5087/// Parse a wire [`NatCandidateWire`] into a live [`Candidate`].
5088///
5089/// Returns `None` when the address does not parse as a `host:port` socket
5090/// address or the type string is unrecognized. Priority is taken verbatim from
5091/// the wire (the advertiser already computed it) so the receiver honors the
5092/// peer's own preference ordering.
5093fn wire_to_candidate(w: &NatCandidateWire) -> Option<Candidate> {
5094 let address: SocketAddr = w.address.parse().ok()?;
5095 let candidate_type = match w.candidate_type.as_str() {
5096 "host" => CandidateType::Host,
5097 "server-reflexive" => CandidateType::ServerReflexive,
5098 "relay" => CandidateType::Relay,
5099 _ => return None,
5100 };
5101 let mut c = Candidate::new(candidate_type, address);
5102 c.priority = w.priority;
5103 Some(c)
5104}
5105
5106/// Convert a live [`Candidate`] into its wire [`NatCandidateWire`] form for a
5107/// `NatStatus` response.
5108fn candidate_to_wire(c: &Candidate) -> NatCandidateWire {
5109 let candidate_type = match c.candidate_type {
5110 CandidateType::Host => "host",
5111 CandidateType::ServerReflexive => "server-reflexive",
5112 CandidateType::Relay => "relay",
5113 };
5114 NatCandidateWire {
5115 candidate_type: candidate_type.to_string(),
5116 address: c.address.to_string(),
5117 priority: c.priority,
5118 }
5119}
5120
/// Wall-clock time as whole seconds since the Unix epoch.
///
/// Returns `0` if the system clock reports a time before the epoch.
fn now_unix() -> u64 {
    use std::time::{SystemTime, UNIX_EPOCH};

    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs(),
        // Clock set before 1970: treated as the zero duration.
        Err(_) => 0,
    }
}
5128
/// Offset (relative to the slice's network address) reserved for the node's
/// own overlay IP. Offset 1 is always the first usable host of the slice, so
/// the node IP is deterministic (`base + 1`) regardless of allocation order.
///
/// [`IpAllocator`] starts handing out container addresses at this offset
/// plus one.
const NODE_RESERVED_OFFSET: u64 = 1;
5133
/// Simple IP address allocator supporting both IPv4 and IPv6, bounded to a
/// specific CIDR (typically a per-node `/28` slice). Allocations past the last
/// usable host return an exhaustion error.
///
/// Offset [`NODE_RESERVED_OFFSET`] (the first usable host) is reserved for the
/// node's own overlay IP and is never handed out by [`IpAllocator::allocate`],
/// so the node IP stays deterministic across restarts and immune to container
/// allocation order. Use [`IpAllocator::node_ip`] to read it.
///
/// All methods take `&self`: the counter is an atomic and the free list sits
/// behind a mutex.
struct IpAllocator {
    /// CIDR the allocator is bounded to.
    cidr: IpNetwork,
    /// Base (network) address of the CIDR.
    base: IpAddr,
    /// Monotonic counter for the next allocation offset relative to `base`.
    /// Starts at [`NODE_RESERVED_OFFSET`] + 1 so the node's reserved IP is
    /// never returned to a container.
    next_offset: AtomicU64,
    /// IPs returned by `release(...)`. `allocate()` drains this first before
    /// incrementing `next_offset`.
    released: parking_lot::Mutex<Vec<IpAddr>>,
}
5155
5156impl IpAllocator {
5157 fn new(cidr: IpNetwork) -> Self {
5158 Self {
5159 base: cidr.network(),
5160 cidr,
5161 // Reserve offset 1 for the node's own overlay IP; container
5162 // allocation starts at offset 2.
5163 next_offset: AtomicU64::new(NODE_RESERVED_OFFSET + 1),
5164 released: parking_lot::Mutex::new(Vec::new()),
5165 }
5166 }
5167
5168 /// The node's own overlay IP for this slice: the first usable host
5169 /// (`base + 1`), reserved so no container ever receives it. Deterministic
5170 /// for a given slice CIDR, independent of allocation order or restarts.
5171 fn node_ip(&self) -> IpAddr {
5172 self.compute_addr(NODE_RESERVED_OFFSET)
5173 }
5174
5175 #[allow(clippy::cast_possible_truncation)]
5176 fn compute_addr(&self, offset: u64) -> IpAddr {
5177 match self.base {
5178 IpAddr::V4(base_v4) => {
5179 let base_u32 = u32::from_be_bytes(base_v4.octets());
5180 let addr = base_u32.wrapping_add(offset as u32);
5181 IpAddr::V4(Ipv4Addr::from(addr.to_be_bytes()))
5182 }
5183 IpAddr::V6(base_v6) => {
5184 let base_u128 = u128::from(base_v6);
5185 let addr = base_u128.wrapping_add(u128::from(offset));
5186 IpAddr::V6(Ipv6Addr::from(addr))
5187 }
5188 }
5189 }
5190
5191 /// Allocate the next IP in the slice, reusing released IPs first.
5192 ///
5193 /// # Errors
5194 /// Returns [`OverlaydError::Overlay`] when the CIDR is exhausted.
5195 fn allocate(&self) -> Result<IpAddr, OverlaydError> {
5196 if let Some(ip) = self.released.lock().pop() {
5197 return Ok(ip);
5198 }
5199 let offset = self.next_offset.fetch_add(1, Ordering::SeqCst);
5200 let addr = self.compute_addr(offset);
5201
5202 let in_cidr = self.cidr.contains(addr);
5203 let is_v4_broadcast = matches!(
5204 (&self.cidr, &addr),
5205 (IpNetwork::V4(v4), IpAddr::V4(a)) if *a == v4.broadcast()
5206 );
5207 if !in_cidr || is_v4_broadcast {
5208 return Err(OverlaydError::Overlay(format!(
5209 "IP allocator exhausted: next address {addr} is outside slice {}",
5210 self.cidr
5211 )));
5212 }
5213 Ok(addr)
5214 }
5215
5216 /// Return an IP to the free pool. Idempotent. The node's reserved IP is
5217 /// never accepted back into the pool so it can never be handed to a
5218 /// container by a later `allocate()`.
5219 fn release(&self, ip: IpAddr) {
5220 if ip == self.node_ip() {
5221 return;
5222 }
5223 let mut released = self.released.lock();
5224 if !released.contains(&ip) {
5225 released.push(ip);
5226 }
5227 }
5228}
5229
5230// -- Windows HCN helpers (ported from the agent's hcs runtime) --------------
5231
/// Owner tag stamped onto every HCN endpoint this server creates.
///
/// The tag is the daemon name itself: the legacy single-instance daemon
/// `"zlayer"` is tagged `"zlayer"`, and any other name is used verbatim so two
/// daemons running side-by-side never sweep each other's endpoints.
#[cfg(target_os = "windows")]
fn owner_tag(daemon_name: &str) -> String {
    daemon_name.to_string()
}
5243
/// Name of the per-daemon HCN overlay network on the host:
/// `"<daemon_name>-overlay"`.
///
/// For the legacy single-instance daemon `"zlayer"` this yields the legacy
/// network name `"zlayer-overlay"`.
#[cfg(target_os = "windows")]
fn overlay_network_name(daemon_name: &str) -> String {
    format!("{daemon_name}-overlay")
}
5255
/// Build the [`zlayer_hns::schema::HostComputeNetwork`] document for the single
/// shared HCN **NAT** network named `name` over `subnet`.
///
/// A NAT network gives every attached container outbound connectivity and
/// host-port forwarding (driven by the userspace free-port L4 proxy) without a
/// per-service vSwitch — the Windows analogue of the Linux node-wide shared
/// bridge. The Static IPAM declares a default route (`0.0.0.0/0`) via the
/// subnet gateway, taken as the network address plus one, so HCN reserves only
/// the gateway (the same `HCN_E_ADDR_INVALID_OR_RESERVED` avoidance the
/// Internal/Transparent paths use).
///
/// Returns `None` when `subnet` does not parse, is not IPv4, has a prefix
/// length of 31 or more (no usable gateway host), or the gateway address
/// computation overflows.
#[cfg(target_os = "windows")]
fn shared_nat_settings(name: &str, subnet: &str) -> Option<zlayer_hns::schema::HostComputeNetwork> {
    use zlayer_hns::schema::{HostComputeNetwork, Ipam, NetworkType, Route, SchemaVersion, Subnet};

    let v4 = match subnet.parse::<ipnet::IpNet>() {
        Ok(ipnet::IpNet::V4(v4)) => v4,
        // Unparseable input, or IPv6: HCN's NAT IPAM is IPv4 in the current
        // schema.
        _ => return None,
    };
    if v4.prefix_len() >= 31 {
        return None;
    }

    let gateway_bits = u32::from(v4.network()).checked_add(1)?;
    let gateway = std::net::Ipv4Addr::from(gateway_bits).to_string();

    let default_route = Route {
        next_hop: gateway,
        destination_prefix: "0.0.0.0/0".to_string(),
        metric: None,
    };
    let static_subnet = Subnet {
        ip_address_prefix: subnet.to_string(),
        routes: vec![default_route],
        policies: Vec::new(),
    };
    let static_ipam = Ipam {
        ty: "Static".to_string(),
        subnets: vec![static_subnet],
    };

    Some(HostComputeNetwork {
        id: None,
        name: name.to_string(),
        ty: NetworkType::Nat,
        policies: Vec::new(),
        mac_pool: None,
        dns: None,
        ipams: vec![static_ipam],
        flags: 0,
        schema_version: SchemaVersion::default(),
    })
}
5301
/// Render a GUID as the bare, lowercase, un-braced string HCN/HCS use to
/// identify a namespace inside a compute-system document's
/// `Container.Networking.Namespace` field (e.g. `aabbccdd-eeff-...`).
///
/// Works from the GUID's `Debug` output: surrounding `{` / `}` characters are
/// trimmed and ASCII letters are lowercased.
#[cfg(target_os = "windows")]
fn format_guid_bare(id: windows::core::GUID) -> String {
    let debug_form = format!("{id:?}");
    let unbraced = debug_form.trim_matches(|c: char| matches!(c, '{' | '}'));
    unbraced.to_ascii_lowercase()
}
5311
/// Delete every host-level HCN network this server created for `daemon_name`
/// and clear the persistent marker. Called on a full uninstall — never on a
/// routine stop/restart. Synchronous (HCN calls are blocking).
///
/// Best-effort throughout: every failure is logged and the sweep continues.
/// Three steps run in order:
/// 1. delete each network recorded in the marker file whose `kind` starts
///    with `"hcn"`, by its recorded GUID;
/// 2. enumerate the host's HCN networks and delete any whose name equals this
///    daemon's overlay network name (covers a marker entry that was lost);
/// 3. remove the marker file if it exists.
#[cfg(target_os = "windows")]
pub fn purge_managed_networks(data_dir: &Path, daemon_name: &str) {
    use windows::core::GUID;

    let marker_path = zlayer_paths::ZLayerDirs::new(data_dir.to_path_buf()).agent_network_state();
    let state = crate::network_state::NetworkState::load(&marker_path);

    // Pass 1: delete recorded HCN networks by GUID.
    for entry in state
        .networks
        .iter()
        .filter(|entry| entry.kind.starts_with("hcn"))
    {
        let guid = match GUID::try_from(entry.id.as_str()) {
            Ok(guid) => guid,
            Err(e) => {
                tracing::warn!(id = %entry.id, error = %e, "managed network marker has unparseable GUID");
                continue;
            }
        };
        match zlayer_hns::network::Network::delete(guid) {
            Ok(()) => {
                tracing::info!(name = %entry.name, id = %entry.id, "deleted managed HCN network");
            }
            Err(e) => {
                tracing::warn!(name = %entry.name, id = %entry.id, error = %e, "failed to delete managed HCN network");
            }
        }
    }

    // Pass 2: name-sweep fallback for an overlay network whose marker entry was
    // lost (crash between create and marker write). A failed enumeration
    // simply yields nothing to sweep.
    let overlay_name = overlay_network_name(daemon_name);
    for guid in zlayer_hns::network::list("{}").into_iter().flatten() {
        let is_ours = match zlayer_hns::network::Network::open(guid) {
            Ok(network) => {
                let name_matches =
                    matches!(network.query("{}"), Ok(props) if props.name == overlay_name);
                // Close the handle before attempting the delete below.
                drop(network);
                name_matches
            }
            Err(_) => continue,
        };
        if !is_ours {
            continue;
        }
        match zlayer_hns::network::Network::delete(guid) {
            Ok(()) => {
                tracing::info!(name = %overlay_name, "deleted overlay HCN network (name sweep)");
            }
            Err(e) => {
                tracing::warn!(name = %overlay_name, error = %e, "failed to delete overlay network (name sweep)");
            }
        }
    }

    // Finally drop the marker so a later install starts from a clean slate.
    if !marker_path.exists() {
        return;
    }
    if let Err(e) = std::fs::remove_file(&marker_path) {
        tracing::warn!(error = %e, path = %marker_path.display(), "failed to remove agent network marker");
    }
}
5371
5372#[cfg(test)]
5373mod tests {
5374 use super::*;
5375
/// `is_orphan_service_bridge` must select exactly the leaked `-b` / `-d`
/// links: anything listed as live or protected, container veths, and
/// unrelated host links must be left alone. The expected result also pins the
/// order, which follows the order of `host_links`.
#[cfg(target_os = "linux")]
#[test]
fn orphan_bridge_selection() {
    use std::collections::HashSet;

    // Two live per-service bridges the daemon says SHOULD exist.
    let live: HashSet<&str> = ["zl-prod-0-web-b", "zl-prod-0-api-b"].into_iter().collect();
    // The active global device and node-wide shared bridge are protected,
    // plus a live in-memory dedicated device.
    let protected: HashSet<String> = ["zl-prod-0-g", "zl-prod-0-shared-sh", "zl-prod-0-db-d"]
        .into_iter()
        .map(String::from)
        .collect();

    // The full set of host links the kernel would report.
    let host_links = [
        // Live -> keep.
        "zl-prod-0-web-b",
        "zl-prod-0-api-b",
        // Protected global / shared / live dedicated device -> keep.
        "zl-prod-0-g",
        "zl-prod-0-shared-sh",
        "zl-prod-0-db-d",
        // Orphan bridges (the user's observed leaks) -> reclaim.
        "zl-1ca4568944-b",
        "zl-81c6bc17c7-b",
        // Orphan dedicated device -> reclaim.
        "zl-prod-0-gone-d",
        // Container veths owned by the PID-keyed sweep, never here -> skip.
        "veth-4242-s",
        "vc-4242-g",
        // Unrelated host links -> skip.
        "eth0",
        "lo",
        "docker0",
        "zl-not-a-bridge",
    ];

    // Apply the predicate to every link, preserving input order.
    let orphans: Vec<&str> = host_links
        .into_iter()
        .filter(|n| is_orphan_service_bridge(n, &live, &protected))
        .collect();

    assert_eq!(
        orphans,
        vec!["zl-1ca4568944-b", "zl-81c6bc17c7-b", "zl-prod-0-gone-d"],
        "only orphaned -b/-d service bridges/devices are selected; \
         live, protected (-g/-sh/live -d), veth, and unrelated links are excluded"
    );
}
5426
/// A well-formed [`PeerSpec`] converts field-for-field: the key and the
/// allowed-IPs string pass through unchanged, the endpoint is parsed into a
/// socket address, and the keepalive seconds become a `Duration`.
#[test]
fn peer_spec_to_info_parses_endpoint_and_keepalive() {
    let spec = PeerSpec {
        public_key: "base64key".to_string(),
        endpoint: "1.2.3.4:51820".to_string(),
        allowed_ips: "10.200.0.5/32,10.200.1.0/24".to_string(),
        persistent_keepalive_secs: 25,
        candidates: Vec::new(),
    };
    let info = peer_spec_to_info(&spec).expect("valid spec");
    assert_eq!(info.public_key, "base64key");
    assert_eq!(info.endpoint, "1.2.3.4:51820".parse().unwrap());
    // The comma-separated list is forwarded as a single string, not split.
    assert_eq!(info.allowed_ips, "10.200.0.5/32,10.200.1.0/24");
    assert_eq!(
        info.persistent_keepalive_interval,
        std::time::Duration::from_secs(25)
    );
}
5445
/// An endpoint that is not a socket address makes the conversion fail rather
/// than producing a peer with a bogus endpoint.
#[test]
fn peer_spec_to_info_rejects_bad_endpoint() {
    let spec = PeerSpec {
        public_key: "k".to_string(),
        // No port and not an IP literal: cannot parse as a `SocketAddr`.
        endpoint: "not-a-socket-addr".to_string(),
        allowed_ips: String::new(),
        persistent_keepalive_secs: 0,
        candidates: Vec::new(),
    };
    assert!(peer_spec_to_info(&spec).is_err());
}
5457
/// Whatever the part/suffix combination (short, long, multi-part, empty
/// suffix), the generated interface name stays within `MAX_IFNAME_LEN` and
/// keeps the `zl-` prefix.
#[test]
fn interface_name_never_exceeds_limit() {
    let cases: Vec<(&[&str], &str)> = vec![
        (&["a"], "g"),
        (&["zlayer-manager"], "g"),
        (&["my-very-long-deployment-name-that-goes-on-and-on"], "g"),
        (&["zlayer", "manager"], "s"),
        (
            &["abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxyz"],
            "s",
        ),
        (&["x"], ""),
    ];

    cases.iter().for_each(|&(parts, suffix)| {
        let name = make_interface_name(parts, suffix);
        assert!(name.len() <= MAX_IFNAME_LEN, "Name '{name}' too long");
        assert!(name.starts_with("zl-"));
    });
}
5477
/// For the IPv4 slice `10.200.0.0/26` the allocator reports `10.200.0.1` as
/// the node IP, never hands that address out, starts container allocations
/// at `10.200.0.2`, and does not re-issue the node IP after it is passed to
/// `release`.
#[test]
fn node_ip_is_first_usable_and_reserved() {
    let slice: IpNetwork = "10.200.0.0/26".parse().unwrap();
    let allocator = IpAllocator::new(slice);
    let node_ip = IpAddr::V4(Ipv4Addr::new(10, 200, 0, 1));

    // Deterministic node address: the first usable host of the slice.
    assert_eq!(allocator.node_ip(), node_ip);

    // Ten container allocations, none of which may be the node address.
    let handed_out: Vec<_> = (0..10)
        .map(|_| {
            let ip = allocator.allocate().expect("slice not exhausted");
            assert_ne!(ip, node_ip, "allocate() returned the reserved node IP");
            ip
        })
        .collect();

    // The node address is unaffected by the allocations above.
    assert_eq!(allocator.node_ip(), node_ip);

    // Offset 1 was reserved for the node, so containers start at offset 2.
    assert_eq!(handed_out[0], IpAddr::V4(Ipv4Addr::new(10, 200, 0, 2)));

    // Releasing the node address must not put it into the free pool.
    allocator.release(node_ip);
    let after_release = allocator.allocate().expect("slice not exhausted");
    assert_ne!(
        after_release, node_ip,
        "node IP leaked back into the pool via release()"
    );
}
5513
/// IPv6 counterpart of the node-IP reservation check: for `fd00:200::/64`
/// the node IP is `fd00:200::1` and five allocations never return it.
#[test]
fn node_ip_ipv6_is_first_usable() {
    let prefix: IpNetwork = "fd00:200::/64".parse().unwrap();
    let allocator = IpAllocator::new(prefix);
    let node_ip = IpAddr::V6(Ipv6Addr::new(0xfd00, 0x200, 0, 0, 0, 0, 0, 1));

    assert_eq!(allocator.node_ip(), node_ip);
    (0..5).for_each(|_| assert_ne!(allocator.allocate().unwrap(), node_ip));
    assert_eq!(allocator.node_ip(), node_ip);
}
5525
/// Two calls with the same parts and suffix must produce the same name.
#[test]
fn interface_name_is_deterministic() {
    let first = make_interface_name(&["zlayer-manager"], "g");
    let second = make_interface_name(&["zlayer-manager"], "g");
    assert_eq!(first, second);
}
5533
/// A key=value dump containing two `public_key=` blocks parses into two
/// peers; repeated `allowed_ip` lines are joined with commas and the
/// handshake timestamp is read as an integer.
#[test]
fn parse_peer_status_splits_blocks() {
    let dump = "\
public_key=AAA
endpoint=1.2.3.4:51820
allowed_ip=10.200.0.2/32
allowed_ip=10.200.1.0/24
latest_handshake=1700000000
public_key=BBB
endpoint=5.6.7.8:51820
allowed_ip=10.200.0.3/32
latest_handshake=0
";
    let parsed = parse_peer_status(dump);
    assert_eq!(parsed.len(), 2);

    let (first, second) = (&parsed[0], &parsed[1]);

    assert_eq!(first.public_key, "AAA");
    assert_eq!(first.endpoint, "1.2.3.4:51820");
    assert_eq!(first.allowed_ips, "10.200.0.2/32,10.200.1.0/24");
    assert_eq!(first.last_handshake_unix_secs, 1_700_000_000);

    assert_eq!(second.public_key, "BBB");
    assert_eq!(second.last_handshake_unix_secs, 0);
}
5556
5557 #[tokio::test]
5558 async fn status_snapshot_before_setup_is_empty() {
5559 let server = OverlaydServer::new(std::path::PathBuf::from("/tmp/zlayer-overlayd-test"));
5560 let snap = server.status_snapshot().await;
5561 assert!(snap.interface.is_none());
5562 assert!(snap.node_ip.is_none());
5563 assert!(snap.public_key.is_none());
5564 assert_eq!(snap.peer_count, 0);
5565 assert_eq!(snap.service_count, 0);
5566 assert!(snap.peers.is_empty());
5567 }
5568
5569 #[tokio::test]
5570 async fn allocate_and_release_ip_round_trip() {
5571 let mut server = OverlaydServer::new(std::path::PathBuf::from("/tmp/zlayer-overlayd-test"));
5572 let a = server.allocate_ip("svc", false).expect("alloc a");
5573 let b = server.allocate_ip("svc", false).expect("alloc b");
5574 assert_ne!(a, b);
5575 server.release_ip(a);
5576 // Released IP is handed back before the monotonic counter advances.
5577 let c = server.allocate_ip("svc", false).expect("alloc c");
5578 assert_eq!(c, a);
5579 }
5580
5581 /// Build a throwaway server bound to a unique temp data dir so the marker
5582 /// file (rehydrated in `new`) never collides between tests.
5583 fn test_server() -> OverlaydServer {
5584 let dir = std::env::temp_dir().join(format!(
5585 "zlayer-overlayd-scope-{}-{}",
5586 std::process::id(),
5587 now_unix()
5588 ));
5589 OverlaydServer::new(dir)
5590 }
5591
/// `nat_config_spec_to_config` on an empty spec yields values matching
/// `NatConfig::default()` (STUN server count, hole-punch timeout, candidate
/// pair limit, no relay); on a populated spec the supplied STUN/TURN
/// servers, timeouts, limits, and relay settings are carried over.
#[test]
fn nat_config_spec_to_config_fills_defaults_and_copies() {
    // Sparse input: compared fields must equal the library defaults.
    let from_empty = nat_config_spec_to_config(NatConfigSpec::default());
    let defaults = NatConfig::default();
    assert_eq!(from_empty.stun_servers.len(), defaults.stun_servers.len());
    assert_eq!(
        from_empty.hole_punch_timeout_secs,
        defaults.hole_punch_timeout_secs
    );
    assert_eq!(from_empty.max_candidate_pairs, defaults.max_candidate_pairs);
    assert!(from_empty.relay_server.is_none());

    // Fully populated input. The relay spec carries an auth credential; this
    // test only checks the port and session limit on the produced relay.
    let turn = zlayer_types::nat_wire::TurnServerSpec {
        addr: String::from("turn.example:3478"),
        username: String::from("u"),
        credential: String::from("p"),
    };
    let relay_spec = zlayer_types::nat_wire::RelayServerSpec {
        listen_port: 3478,
        external_addr: String::from("1.2.3.4:3478"),
        max_sessions: 7,
        auth_credential: Some(String::from("cluster-secret")),
    };
    let populated = NatConfigSpec {
        enabled: true,
        stun_servers: vec![String::from("stun.example:3478")],
        turn_servers: vec![turn],
        hole_punch_timeout_secs: 9,
        stun_refresh_interval_secs: 40,
        max_candidate_pairs: 3,
        relay_server: Some(relay_spec),
    };

    let converted = nat_config_spec_to_config(populated);
    assert_eq!(converted.stun_servers.len(), 1);
    assert_eq!(converted.stun_servers[0].address, "stun.example:3478");
    assert_eq!(converted.turn_servers.len(), 1);
    assert_eq!(converted.hole_punch_timeout_secs, 9);
    assert_eq!(converted.max_candidate_pairs, 3);

    let relay = converted.relay_server.expect("relay present");
    assert_eq!(relay.listen_port, 3478);
    assert_eq!(relay.max_sessions, 7);
}
5634
/// A valid wire candidate survives `wire_to_candidate` followed by
/// `candidate_to_wire` unchanged, while a candidate with an unparseable
/// address or an unknown type string converts to `None`.
#[test]
fn candidate_wire_conversions_round_trip() {
    let original = NatCandidateWire {
        candidate_type: String::from("server-reflexive"),
        address: String::from("203.0.113.5:51820"),
        priority: 50,
    };

    let parsed = wire_to_candidate(&original).expect("valid candidate");
    assert_eq!(parsed.candidate_type, CandidateType::ServerReflexive);
    assert_eq!(parsed.priority, 50);
    assert_eq!(candidate_to_wire(&parsed), original);

    // First pair: bad address with a valid type. Second pair: bad type with
    // a valid address. Both must be rejected.
    for (candidate_type, address) in [("host", "not-an-addr"), ("bogus", "1.2.3.4:5")] {
        let rejected = wire_to_candidate(&NatCandidateWire {
            candidate_type: candidate_type.to_string(),
            address: address.to_string(),
            priority: 1,
        });
        assert!(rejected.is_none());
    }
}
5664
5665 /// `AddPeer` carrying candidates records them in `peer_candidates`; a
5666 /// candidate-free add (or one with only-invalid candidates) leaves no entry,
5667 /// and `RemovePeer` clears them.
5668 #[tokio::test]
5669 async fn add_peer_records_candidates_and_remove_clears_them() {
5670 let mut server = test_server();
5671 let pubkey = "base64key".to_string();
5672 let resp = server
5673 .handle(OverlaydRequest::AddPeer {
5674 peer: PeerSpec {
5675 public_key: pubkey.clone(),
5676 endpoint: "1.2.3.4:51820".to_string(),
5677 allowed_ips: "10.200.0.2/32".to_string(),
5678 persistent_keepalive_secs: 25,
5679 candidates: vec![NatCandidateWire {
5680 candidate_type: "host".to_string(),
5681 address: "192.168.1.5:51820".to_string(),
5682 priority: 100,
5683 }],
5684 },
5685 scope: PeerScope::Global,
5686 })
5687 .await;
5688 assert!(matches!(resp, OverlaydResponse::Ok));
5689 assert_eq!(
5690 server.peer_candidates.get(&pubkey).map(Vec::len),
5691 Some(1),
5692 "candidates must be recorded"
5693 );
5694
5695 // Remove clears the candidate + connection-type bookkeeping.
5696 let resp = server
5697 .handle(OverlaydRequest::RemovePeer {
5698 pubkey: pubkey.clone(),
5699 scope: PeerScope::Global,
5700 })
5701 .await;
5702 assert!(matches!(resp, OverlaydResponse::Ok));
5703 assert!(!server.peer_candidates.contains_key(&pubkey));
5704 }
5705
5706 /// `NatStatus` returns a `NatStatusWire` (empty before any tick) — proving
5707 /// the new IPC pair is wired through `dispatch`.
5708 #[tokio::test]
5709 async fn nat_status_request_returns_wire_snapshot() {
5710 let mut server = test_server();
5711 let resp = server.handle(OverlaydRequest::NatStatus).await;
5712 match resp {
5713 OverlaydResponse::NatStatus(wire) => {
5714 assert!(wire.candidates.is_empty());
5715 assert!(wire.peers.is_empty());
5716 }
5717 other => panic!("expected NatStatus response, got {other:?}"),
5718 }
5719 }
5720
5721 /// True when the process can mutate netlink + `/proc/sys` (root). The
5722 /// teardown-completeness test below is `#[ignore]`d and additionally skips
5723 /// (not fails) when run via `--ignored` without privileges, matching the
5724 /// crate's "skip gracefully when not root" convention.
5725 #[cfg(target_os = "linux")]
5726 fn is_root() -> bool {
5727 // SAFETY: `geteuid` is a pure read of the caller's effective uid.
5728 #[allow(unsafe_code)]
5729 let euid = unsafe { libc::geteuid() };
5730 euid == 0
5731 }
5732
5733 /// End-to-end teardown completeness: populate the server's
5734 /// `created_veths` / `created_bridges` / `created_host_routes` tracking sets
5735 /// with REAL host resources created via netlink, snapshot
5736 /// `net.ipv4.ip_forward`, force it to `1` (recording the prior value in
5737 /// `prev_ipv4_forward` exactly as `enable_forwarding_for_attach` does), then
5738 /// drive the same teardown the `Shutdown` request triggers
5739 /// (`handle(OverlaydRequest::Shutdown)`), and assert: every tracked veth /
5740 /// bridge / route is gone at the kernel level AND `ip_forward` is restored to
5741 /// the snapshot.
5742 ///
5743 /// This is the regression for the full teardown fix (revert routes + veths +
5744 /// bridges + forwarding sysctl on shutdown). Names are unique and <=15 chars;
5745 /// a belt-and-braces cleanup runs before the asserts so a failed assertion
5746 /// still leaves the host clean. Skips (returns) when not root.
5747 #[cfg(target_os = "linux")]
5748 #[tokio::test(flavor = "multi_thread")]
5749 #[ignore = "needs CAP_NET_ADMIN + /proc/sys write; run on a privileged Linux host"]
5750 async fn shutdown_teardown_reverts_resources_and_ip_forward() {
5751 if !is_root() {
5752 eprintln!("skipping shutdown_teardown_reverts_resources_and_ip_forward: requires root");
5753 return;
5754 }
5755
5756 let suffix = format!("{:x}", now_unix() & 0xff_ffff);
5757 let veth_host = format!("vh-{suffix}");
5758 let veth_peer = format!("vp-{suffix}");
5759 let bridge = format!("zlb-{suffix}");
5760 assert!(veth_host.len() <= 15, "veth host name exceeds IFNAMSIZ");
5761 assert!(veth_peer.len() <= 15, "veth peer name exceeds IFNAMSIZ");
5762 assert!(bridge.len() <= 15, "bridge name exceeds IFNAMSIZ");
5763
5764 let dest = IpAddr::V4(Ipv4Addr::new(10, 233, 0, 9));
5765 let prefix: u8 = 32;
5766
5767 // --- create real host resources and register them with the server's
5768 // teardown-tracking sets, exactly as the attach paths do. ---
5769 crate::netlink::create_veth_pair(&veth_host, &veth_peer)
5770 .await
5771 .expect("create_veth_pair");
5772 crate::netlink::create_bridge(&bridge)
5773 .await
5774 .expect("create_bridge");
5775 crate::netlink::replace_route_via_dev(dest, prefix, &veth_host, None)
5776 .await
5777 .expect("replace_route_via_dev");
5778
5779 let mut server = test_server();
5780 server.created_veths.insert(veth_host.clone());
5781 server.created_bridges.insert(bridge.clone());
5782 server
5783 .created_host_routes
5784 .push((dest, prefix, veth_host.clone()));
5785
5786 // Snapshot ip_forward, then flip it to 1 and record the prior value the
5787 // way enable_forwarding_for_attach does so revert_forwarding restores it.
5788 let snapshot =
5789 crate::netlink::read_sysctl("net.ipv4.ip_forward").unwrap_or_else(|_| "0".to_string());
5790 server.prev_ipv4_forward = Some(snapshot.clone());
5791 crate::netlink::set_sysctl("net.ipv4.ip_forward", "1").expect("set ip_forward=1");
5792
5793 // --- drive teardown via the real Shutdown dispatch path ---
5794 let resp = server.handle(OverlaydRequest::Shutdown).await;
5795 assert!(
5796 matches!(resp, OverlaydResponse::Ok),
5797 "Shutdown should return Ok, got {resp:?}"
5798 );
5799
5800 // Snapshot kernel state AFTER teardown.
5801 let veth_gone = !std::path::Path::new(&format!("/sys/class/net/{veth_host}")).exists();
5802 let bridge_gone = !std::path::Path::new(&format!("/sys/class/net/{bridge}")).exists();
5803 let route_gone = {
5804 let target = format!("10.233.0.9/{prefix}");
5805 std::process::Command::new("ip")
5806 .args(["route", "show", &target, "dev", &veth_host])
5807 .output()
5808 .map_or(true, |o| !o.status.success() || o.stdout.is_empty())
5809 };
5810 let ip_forward_after = crate::netlink::read_sysctl("net.ipv4.ip_forward")
5811 .unwrap_or_else(|_| "unknown".to_string());
5812
5813 // Belt-and-braces cleanup before asserting so the host stays clean even
5814 // if an assertion fails (teardown should have done all of this already).
5815 let _ = crate::netlink::delete_route_via_dev(dest, prefix, &veth_host).await;
5816 let _ = crate::netlink::delete_link_by_name(&veth_host).await;
5817 let _ = crate::netlink::delete_link_by_name(&veth_peer).await;
5818 let _ = crate::netlink::delete_link_by_name(&bridge).await;
5819 // Restore ip_forward to the snapshot regardless of teardown outcome.
5820 let _ = crate::netlink::set_sysctl("net.ipv4.ip_forward", &snapshot);
5821
5822 // --- assertions ---
5823 assert!(veth_gone, "teardown should delete the tracked host veth");
5824 assert!(bridge_gone, "teardown should delete the tracked bridge");
5825 assert!(
5826 route_gone,
5827 "teardown should delete the tracked /32 host route"
5828 );
5829 assert_eq!(
5830 ip_forward_after.trim(),
5831 snapshot.trim(),
5832 "teardown should restore net.ipv4.ip_forward to its pre-overlay value"
5833 );
5834
5835 // Tracking sets must be drained by teardown so a re-run starts clean.
5836 assert!(
5837 server.created_veths.is_empty(),
5838 "created_veths should be drained by teardown"
5839 );
5840 assert!(
5841 server.created_bridges.is_empty(),
5842 "created_bridges should be drained by teardown"
5843 );
5844 assert!(
5845 server.created_host_routes.is_empty(),
5846 "created_host_routes should be drained by teardown"
5847 );
5848 }
5849
/// With an IPv4 overlay address and an IPv4 physical egress, the built
/// config's `local_endpoint` is that egress address on the requested port.
#[test]
fn build_config_uses_matching_physical_egress_ipv4() {
    let overlay_ip = IpAddr::V4(Ipv4Addr::new(10, 200, 0, 1));
    let egress = IpAddr::V4(Ipv4Addr::new(192, 0, 2, 10));
    let expected_endpoint = SocketAddr::new(egress, 51820);

    let server = test_server();
    let config = server.build_config(
        String::from("priv"),
        String::from("pub"),
        overlay_ip,
        16,
        51820,
        Some(egress),
    );

    assert_eq!(config.local_endpoint, expected_endpoint);
}
5865
/// Without a physical egress address, an IPv4 overlay config binds its
/// `local_endpoint` to the IPv4 unspecified address on the requested port.
#[test]
fn build_config_falls_back_to_unspecified_when_none() {
    let overlay_ip = IpAddr::V4(Ipv4Addr::new(10, 200, 0, 1));
    let expected_endpoint = SocketAddr::new(IpAddr::V4(Ipv4Addr::UNSPECIFIED), 51820);

    let server = test_server();
    let config = server.build_config(
        String::from("priv"),
        String::from("pub"),
        overlay_ip,
        16,
        51820,
        None,
    );

    assert_eq!(config.local_endpoint, expected_endpoint);
}
5883
/// When the overlay address is IPv6 but the supplied egress is IPv4, the
/// egress is not used: `local_endpoint` becomes the IPv6 unspecified address
/// on the requested port.
#[test]
fn build_config_falls_back_to_unspecified_on_family_mismatch() {
    // v6 overlay (`fd00::1`) paired with a v4 egress: the families differ.
    let overlay_ip = IpAddr::V6(Ipv6Addr::new(0xfd00, 0, 0, 0, 0, 0, 0, 1));
    let egress = IpAddr::V4(Ipv4Addr::new(192, 0, 2, 10));
    let expected_endpoint = SocketAddr::new(IpAddr::V6(Ipv6Addr::UNSPECIFIED), 51820);

    let server = test_server();
    let config = server.build_config(
        String::from("priv"),
        String::from("pub"),
        overlay_ip,
        64,
        51820,
        Some(egress),
    );

    assert_eq!(config.local_endpoint, expected_endpoint);
}
5904
/// `rootless_forces_unspecified` mirrors its input: rootless mode (`true`)
/// forces the unspecified local endpoint, non-rootless (`false`) keeps the
/// physical-egress selection path.
#[test]
fn rootless_forces_unspecified_decision() {
    for (rootless, expected) in [(true, true), (false, false)] {
        assert_eq!(rootless_forces_unspecified(rootless), expected);
    }
}
5913
5914 #[tokio::test]
5915 async fn transport_for_scope_global_requires_setup() {
5916 let server = test_server();
5917 // No global overlay set up yet -> Global scope errors. (Can't use
5918 // `expect_err` because `&OverlayTransport` is not `Debug`.)
5919 match server.transport_for_scope(&PeerScope::Global) {
5920 Ok(_) => panic!("global overlay should not be set up"),
5921 Err(OverlaydError::Other(m)) => {
5922 assert!(m.contains("global overlay not set up"), "got: {m}");
5923 }
5924 Err(other) => panic!("unexpected error: {other:?}"),
5925 }
5926 }
5927
5928 #[tokio::test]
5929 async fn transport_for_scope_unset_service_errors() {
5930 let server = test_server();
5931 match server.transport_for_scope(&PeerScope::Service {
5932 service: "x".to_string(),
5933 }) {
5934 Ok(_) => panic!("no dedicated overlay should exist for x"),
5935 Err(OverlaydError::Other(m)) => {
5936 assert_eq!(m, "no dedicated overlay for service x");
5937 }
5938 Err(other) => panic!("unexpected error: {other:?}"),
5939 }
5940 }
5941
5942 #[tokio::test]
5943 async fn add_peer_service_scope_before_setup_errors_via_dispatch() {
5944 let mut server = test_server();
5945 let resp = server
5946 .handle(OverlaydRequest::AddPeer {
5947 peer: PeerSpec {
5948 public_key: "k".to_string(),
5949 endpoint: "1.2.3.4:51820".to_string(),
5950 allowed_ips: "10.200.0.2/32".to_string(),
5951 persistent_keepalive_secs: 0,
5952 candidates: Vec::new(),
5953 },
5954 scope: PeerScope::Service {
5955 service: "x".to_string(),
5956 },
5957 })
5958 .await;
5959 match resp {
5960 OverlaydResponse::Err { message } => {
5961 assert_eq!(message, "no dedicated overlay for service x");
5962 }
5963 other => panic!("expected Err response, got {other:?}"),
5964 }
5965 }
5966
/// Checks the host-adapter failure classifier.
///
/// For a non-mandatory adapter the expected answer is platform-driven and is
/// computed here with `cfg!(target_os = "linux")`: fatal on Linux, not fatal
/// elsewhere. For a mandatory adapter the failure must be fatal regardless of
/// platform. Only the pure classifier is exercised; no real adapter failure
/// is provoked.
#[test]
fn host_adapter_failure_fatal_decision() {
    let on_linux = cfg!(target_os = "linux");

    // Optional adapter: the outcome follows the build target.
    assert_eq!(
        host_adapter_failure_is_fatal(false),
        on_linux,
        "non-mandatory host-adapter failure is fatal only on Linux (kernel TUN is the data path)"
    );

    // Required adapter: always fatal.
    assert!(
        host_adapter_failure_is_fatal(true),
        "a mandatory host adapter must make failure fatal on every platform"
    );
}
5990
5991 /// A VM-only overlay leaves `global_transport == None`. The Global-scope peer
5992 /// dispatch must then WARN-AND-SKIP the on-device install (guests get the
5993 /// peer via guest-config push) rather than erroring — assert the dispatch
5994 /// returns `Ok` and still mirrors the peer into `global_peers`. This is the
5995 /// Linux-runnable proxy for the degraded host-adapter path: it exercises the
5996 /// exact `None`-tolerant branch without needing a real utun/Wintun failure.
5997 #[tokio::test]
5998 async fn add_global_peer_with_no_host_adapter_skips_and_records() {
5999 let mut server = test_server();
6000 assert!(
6001 server.global_transport.is_none(),
6002 "fresh server has no host adapter (VM-only precondition)"
6003 );
6004 let pubkey = "k".to_string();
6005 let resp = server
6006 .handle(OverlaydRequest::AddPeer {
6007 peer: PeerSpec {
6008 public_key: pubkey.clone(),
6009 endpoint: "1.2.3.4:51820".to_string(),
6010 allowed_ips: "10.200.0.2/32".to_string(),
6011 persistent_keepalive_secs: 0,
6012 candidates: Vec::new(),
6013 },
6014 scope: PeerScope::Global,
6015 })
6016 .await;
6017 match resp {
6018 OverlaydResponse::Ok => {}
6019 other => panic!("expected Ok (warn-and-skip), got {other:?}"),
6020 }
6021 assert!(
6022 server.global_peers.contains_key(&pubkey),
6023 "Global peer must still be mirrored for guest-config push"
6024 );
6025 }
6026
6027 /// End-to-end Dedicated setup. Needs a real TUN device, so it is ignored by
6028 /// default and only runs on a privileged Linux host (mirrors the crate's
6029 /// other privileged overlay e2e tests).
6030 #[cfg(target_os = "linux")]
6031 #[tokio::test]
6032 #[ignore = "needs CAP_NET_ADMIN; run on a privileged Linux host"]
6033 async fn dedicated_setup_creates_distinct_device_and_routes_service_peer() {
6034 let mut server = test_server();
6035 // Bring up the global overlay first so the cluster CIDR + global device
6036 // exist (the dedicated device must get a distinct port and key).
6037 let global_name = server
6038 .setup_global_overlay(
6039 "dep".to_string(),
6040 "i0".to_string(),
6041 "10.200.0.0/16",
6042 Some("10.200.0.0/28"),
6043 zlayer_core::DEFAULT_WG_PORT,
6044 None,
6045 false,
6046 )
6047 .await
6048 .expect("global overlay up");
6049 assert!(!global_name.is_empty());
6050
6051 // Dedicated service setup.
6052 let info = server
6053 .setup_service_overlay("web", OverlayMode::Dedicated)
6054 .await
6055 .expect("dedicated service overlay up");
6056 assert_eq!(info.mode, OverlayMode::Dedicated);
6057 let port = info.wg_port.expect("dedicated port");
6058 assert_ne!(
6059 port, server.overlay_port,
6060 "dedicated device must not share the global port"
6061 );
6062
6063 let st = server
6064 .service_transports
6065 .get("web")
6066 .expect("service transport recorded");
6067 assert_eq!(st.listen_port, port);
6068 assert_ne!(
6069 st.interface, global_name,
6070 "dedicated interface must differ from global"
6071 );
6072 assert_eq!(
6073 Some(st.public_key.clone()),
6074 info.wg_public_key,
6075 "info pubkey matches recorded transport"
6076 );
6077 assert_ne!(
6078 Some(st.public_key.clone()),
6079 server.transport_public_key,
6080 "dedicated key must differ from global key"
6081 );
6082
6083 // A Service-scoped AddPeer must land on the dedicated device (succeeds),
6084 // proving scope routing targets the per-service transport.
6085 let resp = server
6086 .handle(OverlaydRequest::AddPeer {
6087 peer: PeerSpec {
6088 public_key: {
6089 let (_priv, pubk) = OverlayTransport::generate_keys().await.unwrap();
6090 pubk
6091 },
6092 endpoint: "5.6.7.8:51999".to_string(),
6093 allowed_ips: "10.201.0.2/32".to_string(),
6094 persistent_keepalive_secs: 25,
6095 candidates: Vec::new(),
6096 },
6097 scope: PeerScope::Service {
6098 service: "web".to_string(),
6099 },
6100 })
6101 .await;
6102 assert!(
6103 matches!(resp, OverlaydResponse::Ok),
6104 "service-scoped add_peer should land on the dedicated device, got {resp:?}"
6105 );
6106 }
6107
6108 #[tokio::test]
6109 async fn guest_attach_requires_global_overlay() {
6110 // Without a global overlay (no node public key / transport) a
6111 // guest-managed attach must error rather than allocate anything.
6112 let mut server = test_server();
6113 let resp = server
6114 .handle(OverlaydRequest::AttachContainer {
6115 handle: AttachHandle::GuestManaged {
6116 id: "vm-1".to_string(),
6117 },
6118 service: "web".to_string(),
6119 join_global: true,
6120 dns_server: None,
6121 dns_domain: None,
6122 ephemeral: false,
6123 isolation_network: None,
6124 })
6125 .await;
6126 match resp {
6127 OverlaydResponse::Err { message } => {
6128 assert!(
6129 message.contains("global overlay to be set up"),
6130 "got: {message}"
6131 );
6132 }
6133 other => panic!("expected Err response, got {other:?}"),
6134 }
6135 // Nothing was recorded.
6136 assert!(server.guest_attachments.is_empty());
6137 }
6138
6139 #[tokio::test]
6140 async fn detach_unknown_guest_is_idempotent() {
6141 let mut server = test_server();
6142 // No such guest -> Ok (idempotent), no panic.
6143 server
6144 .detach_container_guest("never-attached")
6145 .await
6146 .expect("detach of unknown guest is a no-op");
6147 }
6148
6149 /// Full guest-managed attach/detach round-trip. Needs a real TUN device (the
6150 /// global overlay must be live so the guest peer can be installed), so it is
6151 /// ignored by default and only runs on a privileged Linux host — mirrors the
6152 /// crate's other privileged overlay e2e tests.
6153 #[cfg(target_os = "linux")]
6154 #[tokio::test]
6155 #[ignore = "needs CAP_NET_ADMIN; run on a privileged Linux host"]
6156 async fn guest_attach_allocates_config_and_detach_releases() {
6157 let mut server = test_server();
6158 server
6159 .setup_global_overlay(
6160 "dep".to_string(),
6161 "i0".to_string(),
6162 "10.200.0.0/16",
6163 Some("10.200.0.0/28"),
6164 zlayer_core::DEFAULT_WG_PORT,
6165 None,
6166 false,
6167 )
6168 .await
6169 .expect("global overlay up");
6170
6171 // Seed a global peer so the guest config carries it through.
6172 let (_p, other_pub) = OverlayTransport::generate_keys().await.unwrap();
6173 let add = server
6174 .handle(OverlaydRequest::AddPeer {
6175 peer: PeerSpec {
6176 public_key: other_pub.clone(),
6177 endpoint: "9.9.9.9:51820".to_string(),
6178 allowed_ips: "10.200.1.0/28".to_string(),
6179 persistent_keepalive_secs: 25,
6180 candidates: Vec::new(),
6181 },
6182 scope: PeerScope::Global,
6183 })
6184 .await;
6185 assert!(
6186 matches!(add, OverlaydResponse::Ok),
6187 "seed peer add: {add:?}"
6188 );
6189
6190 let resp = server
6191 .handle(OverlaydRequest::AttachContainer {
6192 handle: AttachHandle::GuestManaged {
6193 id: "vm-1".to_string(),
6194 },
6195 service: "web".to_string(),
6196 join_global: true,
6197 dns_server: Some("10.200.0.1".parse().unwrap()),
6198 dns_domain: Some("overlay".to_string()),
6199 ephemeral: false,
6200 isolation_network: None,
6201 })
6202 .await;
6203 let config = match resp {
6204 OverlaydResponse::GuestConfig(c) => c,
6205 other => panic!("expected GuestConfig, got {other:?}"),
6206 };
6207 assert!(!config.private_key.is_empty());
6208 assert!(!config.public_key.is_empty());
6209 assert_ne!(config.private_key, config.public_key);
6210 assert_eq!(config.listen_port, server.overlay_port);
6211 assert_eq!(config.dns_server, Some("10.200.0.1".parse().unwrap()));
6212 // Peers = the seeded global peer + this node (self) + nothing else.
6213 assert!(
6214 config.peers.iter().any(|p| p.public_key == other_pub),
6215 "guest must learn the seeded global peer"
6216 );
6217 assert!(
6218 config
6219 .peers
6220 .iter()
6221 .any(|p| Some(&p.public_key) == server.transport_public_key.as_ref()),
6222 "guest must learn THIS node as a peer"
6223 );
6224 // The guest's own key is registered as a global peer (host route).
6225 assert!(server.global_peers.contains_key(&config.public_key));
6226 let info = server
6227 .guest_attachments
6228 .get("vm-1")
6229 .expect("attachment recorded");
6230 assert_eq!(info.overlay_ip, config.overlay_ip);
6231
6232 // Detach releases the peer + IP.
6233 let det = server
6234 .handle(OverlaydRequest::DetachContainer {
6235 handle: AttachHandle::GuestManaged {
6236 id: "vm-1".to_string(),
6237 },
6238 })
6239 .await;
6240 assert!(matches!(det, OverlaydResponse::Ok), "detach: {det:?}");
6241 assert!(!server.guest_attachments.contains_key("vm-1"));
6242 assert!(!server.global_peers.contains_key(&config.public_key));
6243 }
6244
6245 /// The `setup_service_overlay` dispatch must handle ALL THREE modes —
6246 /// including the default `Auto` — without panicking. `resolve()` is now the
6247 /// identity, so the old `unreachable!("resolve never returns Auto")` arm
6248 /// would panic on the default mode; this proves the arm is gone. Each mode
6249 /// is recorded in `service_modes` BEFORE any netlink/transport work, so we
6250 /// assert on that deterministically regardless of host privilege (the
6251 /// downstream bridge/transport bring-up may succeed or fail depending on
6252 /// `CAP_NET_ADMIN`, but it must never panic).
6253 #[cfg(target_os = "linux")]
6254 #[tokio::test]
6255 async fn dispatch_handles_all_three_modes_without_panic() {
6256 for mode in [
6257 OverlayMode::Auto,
6258 OverlayMode::Shared,
6259 OverlayMode::Dedicated,
6260 ] {
6261 let mut server = test_server();
6262 let service = format!("svc-{mode:?}");
6263 // Must return a Result (Ok or Err) — never panic via `unreachable!`.
6264 let _ = server.setup_service_overlay(&service, mode).await;
6265 // The resolved mode is recorded up front for the attach path.
6266 assert_eq!(
6267 server.service_modes.get(&service).copied(),
6268 Some(mode.resolve()),
6269 "mode {mode:?} must be recorded for the attach path"
6270 );
6271 }
6272 }
6273
6274 /// Two distinct `Shared` services must reuse the SAME node-wide shared
6275 /// bridge (one bridge, not two), while an `Auto` service gets its OWN
6276 /// per-service bridge. Needs `CAP_NET_ADMIN` to create the bridges, so it is
6277 /// ignored by default like the crate's other privileged overlay e2e tests.
6278 #[cfg(target_os = "linux")]
6279 #[tokio::test]
6280 #[ignore = "needs CAP_NET_ADMIN; run on a privileged Linux host"]
6281 async fn shared_services_reuse_one_bridge_auto_gets_its_own() {
6282 let mut server = test_server();
6283 server
6284 .setup_global_overlay(
6285 "dep".to_string(),
6286 "i0".to_string(),
6287 "10.200.0.0/16",
6288 Some("10.200.0.0/26"),
6289 zlayer_core::DEFAULT_WG_PORT,
6290 None,
6291 false,
6292 )
6293 .await
6294 .expect("global overlay up");
6295
6296 // First Shared service creates the shared bridge.
6297 let info_a = server
6298 .setup_service_overlay("web", OverlayMode::Shared)
6299 .await
6300 .expect("shared service web up");
6301 assert_eq!(info_a.mode, OverlayMode::Shared);
6302 let shared_name = server
6303 .shared_bridge
6304 .as_ref()
6305 .expect("shared bridge created")
6306 .name
6307 .clone();
6308 assert_eq!(info_a.name, shared_name);
6309 // Shared services are NOT per-service bridges.
6310 assert!(
6311 !server.service_bridges.contains_key("web"),
6312 "Shared service must not create a per-service bridge"
6313 );
6314
6315 // Second Shared service REUSES the same shared bridge — no new bridge.
6316 let info_b = server
6317 .setup_service_overlay("api", OverlayMode::Shared)
6318 .await
6319 .expect("shared service api up");
6320 assert_eq!(
6321 info_b.name, shared_name,
6322 "a second Shared service must reuse the SAME node-wide bridge"
6323 );
6324 assert!(!server.service_bridges.contains_key("api"));
6325 // Still exactly one shared bridge object.
6326 assert_eq!(
6327 server.shared_bridge.as_ref().map(|b| b.name.clone()),
6328 Some(shared_name.clone())
6329 );
6330
6331 // An Auto service gets its OWN per-service bridge, distinct from the
6332 // shared bridge.
6333 let info_c = server
6334 .setup_service_overlay("batch", OverlayMode::Auto)
6335 .await
6336 .expect("auto service batch up");
6337 assert_eq!(info_c.mode, OverlayMode::Auto);
6338 assert!(
6339 server.service_bridges.contains_key("batch"),
6340 "Auto service must get its own per-service bridge"
6341 );
6342 assert_ne!(
6343 info_c.name, shared_name,
6344 "Auto per-service bridge must differ from the shared bridge"
6345 );
6346
6347 // Both Shared services point their service_interfaces entry at the one
6348 // shared bridge; the Auto service points at its own.
6349 assert_eq!(server.service_interfaces.get("web"), Some(&shared_name));
6350 assert_eq!(server.service_interfaces.get("api"), Some(&shared_name));
6351 assert_ne!(server.service_interfaces.get("batch"), Some(&shared_name));
6352 }
6353
6354 /// A `Shared` service's container attach must draw its IP from the shared
6355 /// bridge pool and must fail cleanly (no panic, clear error) when the shared
6356 /// bridge has not been set up yet. Unprivileged: exercises only the
6357 /// pre-netlink resolution branch.
6358 #[cfg(target_os = "linux")]
6359 #[tokio::test]
6360 async fn attach_shared_without_setup_errors_cleanly() {
6361 let mut server = test_server();
6362 // Mark the service Shared but never set up the shared bridge.
6363 server
6364 .service_modes
6365 .insert("web".to_string(), OverlayMode::Shared);
6366 let err = server
6367 .attach_container_linux(424_242, "web", false, false, None)
6368 .await
6369 .expect_err("attach must fail without a shared bridge");
6370 match err {
6371 OverlaydError::Other(m) => {
6372 assert!(
6373 m.contains("no shared bridge"),
6374 "expected shared-bridge error, got: {m}"
6375 );
6376 }
6377 other => panic!("unexpected error variant: {other:?}"),
6378 }
6379 }
6380
6381 /// A container attached on a NAMED isolated network must be recorded in the
6382 /// per-network membership map (`network_members["net-a"]` gains the member's
6383 /// service IP). Needs `CAP_NET_ADMIN` to bring up the bridge + veth, so it is
6384 /// ignored by default like the crate's other privileged overlay e2e tests.
6385 #[cfg(target_os = "linux")]
6386 #[tokio::test]
6387 #[ignore = "needs CAP_NET_ADMIN; run on a privileged Linux host"]
6388 async fn attach_linux_isolated_network_records_membership() {
6389 let mut server = test_server();
6390 server
6391 .setup_global_overlay(
6392 "dep".to_string(),
6393 "i0".to_string(),
6394 "10.200.0.0/16",
6395 Some("10.200.0.0/26"),
6396 zlayer_core::DEFAULT_WG_PORT,
6397 None,
6398 false,
6399 )
6400 .await
6401 .expect("global overlay up");
6402
6403 // An Auto service gives us a real per-service bridge to attach onto.
6404 server
6405 .setup_service_overlay("web", OverlayMode::Auto)
6406 .await
6407 .expect("auto service web up");
6408
6409 // Attach this very process (a live PID with a real netns) onto the named
6410 // isolated network "net-a".
6411 let pid = std::process::id();
6412 let ip = server
6413 .attach_container_linux(pid, "web", false, true, Some("net-a".to_string()))
6414 .await
6415 .expect("attach onto isolated network");
6416
6417 // Membership map gained exactly this member under "net-a".
6418 let members = server
6419 .network_members
6420 .get("net-a")
6421 .expect("net-a membership recorded");
6422 assert!(
6423 members.contains(&ip),
6424 "network_members[net-a] must contain the attached member IP {ip}"
6425 );
6426
6427 // Detach drains the membership and drops the now-empty network entry.
6428 server
6429 .detach_container_linux(pid)
6430 .await
6431 .expect("detach succeeds");
6432 assert!(
6433 !server.network_members.contains_key("net-a"),
6434 "empty isolated network must be dropped from network_members on last detach"
6435 );
6436 }
6437
/// Owner keys for an isolation network and for a service with the same name
/// live in separate namespaces (`iso:` vs `service:`), so the two can never
/// collide on the same marker/allocator key. Platform-agnostic.
#[test]
fn isolation_owner_key_distinct_from_service_owner_key() {
    use crate::network_state::{owner_for_isolation_network, owner_for_service};

    let iso = owner_for_isolation_network("alpha");
    let svc = owner_for_service("alpha");

    assert_ne!(
        iso, svc,
        "isolation and service owner keys must not collide for the same name"
    );
    // Pin the exact key shapes as well, not just their inequality.
    assert_eq!(iso, "iso:alpha");
    assert_eq!(svc, "service:alpha");
}
6452
/// Checks three properties of `isolation_network_subnet`:
///
/// * deterministic — the same name yields the same block on every call (so a
///   reused HCN network keeps its subnet across restarts);
/// * contained — the block is a /28 lying wholly INSIDE the node slice;
/// * disjoint — a different isolation network that lands on a different block
///   shares no address range with it (the whole point of L3 isolation).
///
/// Windows-only (the method is `cfg(windows)`); exercised by `cargo xwin test`.
#[cfg(target_os = "windows")]
#[test]
fn isolation_network_subnet_is_deterministic_disjoint_and_inside_slice() {
    // The node slice, parsed once per address-library type below.
    const NODE_SLICE: &str = "10.200.5.0/26";

    let mut server = test_server();
    server.slice_cidr = Some(NODE_SLICE.parse::<IpNetwork>().unwrap());
    let slice_net: ipnet::IpNet = NODE_SLICE.parse().unwrap();

    // Determinism: two lookups of the same name agree.
    let a1 = server.isolation_network_subnet("alpha").unwrap();
    let a2 = server.isolation_network_subnet("alpha").unwrap();
    assert_eq!(a1, a2, "same isolation network must map to the same subnet");

    // Containment: both ends of the block fall inside the slice, and the block
    // has the /28 sub-prefix.
    let endpoints_inside = [a1.network(), a1.broadcast()]
        .iter()
        .all(|addr| slice_net.contains(addr));
    assert!(
        endpoints_inside,
        "isolation subnet {a1} must be wholly inside the node slice {slice_net}"
    );
    assert_eq!(a1.prefix_len(), 28, "expected a /28 isolation sub-block");

    // Disjointness: probe several other names and take the first one whose
    // block differs from `alpha`'s; that block must not overlap `alpha`'s.
    let other = ["beta", "gamma", "delta", "omega", "zeta"]
        .iter()
        .find_map(|name| {
            let candidate = server.isolation_network_subnet(name).unwrap();
            if candidate == a1 {
                None
            } else {
                Some(candidate)
            }
        })
        .expect("at least one other name must land on a different /28 block");
    let a1_covers_other = a1.contains(&other.network());
    let other_covers_a1 = other.contains(&a1.network());
    assert!(
        !(a1_covers_other || other_covers_a1),
        "distinct isolation networks must occupy disjoint subnets ({a1} vs {other})"
    );
}
6493}