Skip to main content

zlayer_overlayd/
network_state.rs

1//! Persistent marker for host-level networks `ZLayer` creates.
2//!
3//! Some network objects `ZLayer` provisions live at the **host** level, not the
4//! daemon-process level — most notably the Windows HCN overlay network, which
5//! the HCS runtime creates and HCN keeps alive until an explicit
6//! `HcnDeleteNetwork`. Such objects must:
7//!
8//!   * be **reused** across daemon restarts / binary updates / reinstalls
9//!     (look them up by their recorded id instead of blindly recreating), and
10//!   * be torn down **only** on a full uninstall (`daemon uninstall --purge`),
11//!     never on a routine stop/restart.
12//!
13//! This module is the on-disk record that makes that lifecycle possible. It is
14//! intentionally backend-agnostic (pure `serde` + `std::fs`) so the same marker
15//! file can track HCN networks on Windows, bridges on Linux, etc. The file
16//! lives at [`zlayer_paths::ZLayerDirs::agent_network_state`]
17//! (`{data_dir}/agent_network.json`).
18
19use std::path::Path;
20
21use serde::{Deserialize, Serialize};
22
/// On-disk schema version stamped into [`NetworkState::version`]. Increment it
/// whenever the marker layout changes in a backwards-incompatible way.
const CURRENT_VERSION: u32 = 1;

/// [`ManagedNetwork::owner`] value that identifies the node's single shared
/// base overlay network.
pub const OWNER_BASE: &str = "base";
28
/// Derive the `owner` key of the dedicated network belonging to `service`.
///
/// The result is the literal prefix `service:` followed by the service name
/// exactly as given (no trimming or escaping).
#[must_use]
pub fn owner_for_service(service: &str) -> String {
    ["service:", service].concat()
}
34
35/// One host-level network `ZLayer` is responsible for.
36#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
37pub struct ManagedNetwork {
38    /// Logical owner: [`OWNER_BASE`] for the node's shared overlay, or
39    /// `service:<name>` (see [`owner_for_service`]) for a dedicated per-service
40    /// network. Used as the upsert key.
41    pub owner: String,
42    /// Backend-specific kind, e.g. `"hcn-internal"`. Lets a reader (and the
43    /// uninstall path) know which API to use to delete the object.
44    pub kind: String,
45    /// Human-readable network name (e.g. `"zlayer-overlay"`).
46    pub name: String,
47    /// Host-addressable id — for HCN this is the network GUID string.
48    pub id: String,
49    /// CIDR the network was created with (informational / diagnostics).
50    pub subnet: String,
51    /// Dedicated-overlay `WireGuard` listen port (per-service transports only).
52    #[serde(default, skip_serializing_if = "Option::is_none")]
53    pub wg_port: Option<u16>,
54    /// Dedicated-overlay `WireGuard` private key, base64 (per-service only).
55    /// Persisted so the device identity survives overlayd restarts (no
56    /// per-service republish loop exists, so a stable key avoids a re-peer storm).
57    #[serde(default, skip_serializing_if = "Option::is_none")]
58    pub wg_private_key: Option<String>,
59    /// Dedicated-overlay public key, base64.
60    #[serde(default, skip_serializing_if = "Option::is_none")]
61    pub wg_public_key: Option<String>,
62    /// Dedicated-overlay interface name.
63    #[serde(default, skip_serializing_if = "Option::is_none")]
64    pub interface: Option<String>,
65}
66
67/// The full marker file: every host-level network this node manages.
68#[derive(Debug, Clone, Serialize, Deserialize)]
69pub struct NetworkState {
70    /// On-disk schema version.
71    #[serde(default = "default_version")]
72    pub version: u32,
73    /// Managed networks, keyed in practice by [`ManagedNetwork::owner`].
74    #[serde(default)]
75    pub networks: Vec<ManagedNetwork>,
76}
77
78fn default_version() -> u32 {
79    CURRENT_VERSION
80}
81
82impl Default for NetworkState {
83    fn default() -> Self {
84        Self {
85            version: CURRENT_VERSION,
86            networks: Vec::new(),
87        }
88    }
89}
90
91impl NetworkState {
92    /// Load the marker file. A missing or unparseable file yields an empty
93    /// state rather than an error — the marker is a best-effort cache that the
94    /// live-host enumeration paths can always rebuild.
95    #[must_use]
96    pub fn load(path: &Path) -> Self {
97        match std::fs::read(path) {
98            Ok(bytes) => serde_json::from_slice(&bytes).unwrap_or_default(),
99            Err(_) => Self::default(),
100        }
101    }
102
103    /// Persist the marker file, creating the parent directory if needed. Writes
104    /// to a sibling temp file and renames so a crash mid-write can't leave a
105    /// truncated marker.
106    ///
107    /// # Errors
108    ///
109    /// Returns any I/O error from creating the directory, writing, or renaming.
110    pub fn save(&self, path: &Path) -> std::io::Result<()> {
111        if let Some(parent) = path.parent() {
112            std::fs::create_dir_all(parent)?;
113        }
114        let json = serde_json::to_vec_pretty(self).map_err(std::io::Error::other)?;
115        let tmp = path.with_extension("json.tmp");
116        std::fs::write(&tmp, &json)?;
117        std::fs::rename(&tmp, path)?;
118        Ok(())
119    }
120
121    /// Look up a managed network by owner.
122    #[must_use]
123    pub fn get(&self, owner: &str) -> Option<&ManagedNetwork> {
124        self.networks.iter().find(|n| n.owner == owner)
125    }
126
127    /// Insert or replace the entry for `net.owner`.
128    pub fn upsert(&mut self, net: ManagedNetwork) {
129        if let Some(existing) = self.networks.iter_mut().find(|n| n.owner == net.owner) {
130            *existing = net;
131        } else {
132            self.networks.push(net);
133        }
134    }
135
136    /// Remove and return the entry for `owner`, if present.
137    pub fn remove(&mut self, owner: &str) -> Option<ManagedNetwork> {
138        self.networks
139            .iter()
140            .position(|n| n.owner == owner)
141            .map(|pos| self.networks.remove(pos))
142    }
143}
144
/// Width of the dedicated-overlay listen-port band scanned by
/// [`DedicatedPortAllocator`]. Ports are handed out from
/// `base+1 ..= base+DEDICATED_PORT_BAND`, so a base of `51820` yields the range
/// `51821..=52076` — 256 distinct per-service `WireGuard` transports,
/// comfortably more than any single node is expected to host.
///
/// Note that with a base of `51820` this band lies *inside* the IANA
/// dynamic/ephemeral range (49152–65535) and inside Linux's default
/// `ip_local_port_range` (32768–60999). The allocator only tracks ports it
/// handed out itself, so the OS may already have given one of them to an
/// unrelated socket; binding an allocated port can therefore still fail.
pub const DEDICATED_PORT_BAND: u16 = 256;
151
/// Hands out `WireGuard` listen ports for dedicated overlays, deterministically.
///
/// Every per-service [`OverlayMode::Dedicated`] overlay needs a UDP listen port
/// of its own, separate from the node's shared base-overlay port. The allocator
/// scans the band `base+1 ..= base+`[`DEDICATED_PORT_BAND`] in ascending order
/// and returns the lowest port not yet marked used — there is no randomness, so
/// the same sequence of calls always produces the same ports.
///
/// After a restart, callers are expected to feed the persisted
/// [`ManagedNetwork::wg_port`] of each dedicated service back in through
/// [`Self::reserve`] (or the constructor's `in_use` argument) so that each
/// service re-binds exactly the port it had before.
///
/// [`OverlayMode::Dedicated`]: # "consumed by a later task"
#[derive(Debug, Clone)]
pub struct DedicatedPortAllocator {
    /// Base port; the band starts one above it.
    base: u16,
    /// Ports currently considered taken (allocated or reserved).
    used: std::collections::BTreeSet<u16>,
}
169
170impl DedicatedPortAllocator {
171    /// Build an allocator over `base+1 ..= base+`[`DEDICATED_PORT_BAND`], seeding
172    /// the in-use set from `in_use` (e.g. ports already recorded in the marker).
173    ///
174    /// Ports in `in_use` that fall outside the band are kept in the used set —
175    /// they never collide with [`allocate`](Self::allocate) results and reserving
176    /// an out-of-band port is harmless — but they can never be re-allocated.
177    pub fn new(base: u16, in_use: impl IntoIterator<Item = u16>) -> Self {
178        Self {
179            base,
180            used: in_use.into_iter().collect(),
181        }
182    }
183
184    /// Lowest port in the band, i.e. `base + 1` (saturating).
185    fn band_start(&self) -> u16 {
186        self.base.saturating_add(1)
187    }
188
189    /// Highest port in the band, i.e. `base + `[`DEDICATED_PORT_BAND`] (saturating).
190    fn band_end(&self) -> u16 {
191        self.base.saturating_add(DEDICATED_PORT_BAND)
192    }
193
194    /// Allocate the lowest free port in the band, recording it as used.
195    ///
196    /// # Errors
197    ///
198    /// Returns [`OverlaydError::Other`] if every port in the band is taken.
199    pub fn allocate(&mut self) -> crate::error::Result<u16> {
200        for port in self.band_start()..=self.band_end() {
201            if !self.used.contains(&port) {
202                self.used.insert(port);
203                return Ok(port);
204            }
205        }
206        Err(crate::error::OverlaydError::Other(format!(
207            "dedicated-overlay port band exhausted ({}..={}, {} ports)",
208            self.band_start(),
209            self.band_end(),
210            DEDICATED_PORT_BAND
211        )))
212    }
213
214    /// Free a previously allocated port so it can be handed out again.
215    pub fn release(&mut self, port: u16) {
216        self.used.remove(&port);
217    }
218
219    /// Mark a specific port used without scanning — used to rehydrate the
220    /// allocator from persisted marker state so a service re-binds its port.
221    pub fn reserve(&mut self, port: u16) {
222        self.used.insert(port);
223    }
224
225    /// Whether `port` is currently recorded as in use.
226    #[must_use]
227    pub fn is_used(&self, port: u16) -> bool {
228        self.used.contains(&port)
229    }
230}
231
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture: an HCN-style entry with none of the dedicated-overlay fields set.
    fn sample(owner: &str, id: &str) -> ManagedNetwork {
        ManagedNetwork {
            owner: owner.into(),
            kind: "hcn-internal".into(),
            name: "zlayer-overlay".into(),
            id: id.into(),
            subnet: "10.200.0.0/28".into(),
            wg_port: None,
            wg_private_key: None,
            wg_public_key: None,
            interface: None,
        }
    }

    /// Per-process scratch directory for the test identified by `tag`. Any
    /// leftover from an earlier run is removed; the directory is NOT created.
    fn scratch_dir(tag: &str) -> std::path::PathBuf {
        let dir =
            std::env::temp_dir().join(format!("zlayer-netstate-{tag}-{}", std::process::id()));
        let _ = std::fs::remove_dir_all(&dir);
        dir
    }

    #[test]
    fn upsert_replaces_same_owner_and_get_finds_it() {
        let mut state = NetworkState::default();
        for id in ["guid-1", "guid-2"] {
            // Second iteration hits the same owner and must replace, not append.
            state.upsert(sample(OWNER_BASE, id));
        }
        assert_eq!(state.networks.len(), 1);
        assert_eq!(state.get(OWNER_BASE).unwrap().id, "guid-2");
    }

    #[test]
    fn distinct_owners_coexist_and_remove_targets_one() {
        let web_owner = owner_for_service("web");
        let mut state = NetworkState::default();
        state.upsert(sample(OWNER_BASE, "base-guid"));
        state.upsert(sample(&web_owner, "web-guid"));
        assert_eq!(state.networks.len(), 2);

        let removed = state.remove(OWNER_BASE).expect("base entry present");
        assert_eq!(removed.id, "base-guid");
        assert_eq!(state.networks.len(), 1);
        assert!(state.get(OWNER_BASE).is_none());
        assert_eq!(state.get(&web_owner).unwrap().id, "web-guid");
        assert!(state.remove("service:nope").is_none());
    }

    #[test]
    fn save_then_load_roundtrips() {
        let dir = scratch_dir("test");
        let path = dir.join("agent_network.json");

        let mut state = NetworkState::default();
        state.upsert(sample(OWNER_BASE, "guid-rt"));
        state.save(&path).expect("save must succeed");

        let loaded = NetworkState::load(&path);
        assert_eq!(loaded.version, CURRENT_VERSION);
        assert_eq!(loaded.networks, state.networks);

        let _ = std::fs::remove_dir_all(&dir);
    }

    #[test]
    fn load_missing_file_is_empty_default() {
        let path = std::env::temp_dir().join("zlayer-netstate-does-not-exist-xyz.json");
        let _ = std::fs::remove_file(&path);

        let state = NetworkState::load(&path);
        assert_eq!(state.version, CURRENT_VERSION);
        assert!(state.networks.is_empty());
    }

    #[test]
    fn dedicated_fields_survive_save_load_roundtrip() {
        let dir = scratch_dir("ded");
        let path = dir.join("agent_network.json");
        let web_owner = owner_for_service("web");

        let net = ManagedNetwork {
            wg_port: Some(51823),
            wg_private_key: Some("cHJpdmF0ZS1rZXktYjY0".to_string()),
            wg_public_key: Some("cHVibGljLWtleS1iNjQ=".to_string()),
            interface: Some("zl-web0".to_string()),
            ..sample(&web_owner, "ded-guid")
        };

        let mut state = NetworkState::default();
        state.upsert(net.clone());
        state.save(&path).expect("save must succeed");

        let loaded = NetworkState::load(&path);
        let got = loaded.get(&web_owner).expect("service entry present");
        assert_eq!(got.wg_port, Some(51823));
        assert_eq!(got.wg_private_key.as_deref(), Some("cHJpdmF0ZS1rZXktYjY0"));
        assert_eq!(got.wg_public_key.as_deref(), Some("cHVibGljLWtleS1iNjQ="));
        assert_eq!(got.interface.as_deref(), Some("zl-web0"));
        assert_eq!(got, &net);

        let _ = std::fs::remove_dir_all(&dir);
    }

    #[test]
    fn older_marker_without_dedicated_fields_still_loads() {
        // A marker written before the dedicated-overlay fields were added must
        // still deserialize, with every new field coming back as `None`.
        let dir = scratch_dir("bc");
        let path = dir.join("agent_network.json");
        std::fs::create_dir_all(&dir).expect("mkdir");

        let legacy = r#"{
            "version": 1,
            "networks": [
                {
                    "owner": "base",
                    "kind": "hcn-internal",
                    "name": "zlayer-overlay",
                    "id": "legacy-guid",
                    "subnet": "10.200.0.0/28"
                }
            ]
        }"#;
        std::fs::write(&path, legacy).expect("write legacy marker");

        let loaded = NetworkState::load(&path);
        let got = loaded.get(OWNER_BASE).expect("base entry present");
        assert_eq!(got.id, "legacy-guid");
        assert_eq!(got.wg_port, None);
        assert_eq!(got.wg_private_key, None);
        assert_eq!(got.wg_public_key, None);
        assert_eq!(got.interface, None);

        let _ = std::fs::remove_dir_all(&dir);
    }

    #[test]
    fn allocate_returns_distinct_ascending_ports() {
        let mut alloc = DedicatedPortAllocator::new(51820, std::iter::empty());
        let a = alloc.allocate().expect("port a");
        let b = alloc.allocate().expect("port b");
        let c = alloc.allocate().expect("port c");
        assert_eq!((a, b, c), (51821, 51822, 51823));
    }

    #[test]
    fn release_then_allocate_reuses_freed_port() {
        let mut alloc = DedicatedPortAllocator::new(51820, std::iter::empty());
        let a = alloc.allocate().expect("port a");
        let b = alloc.allocate().expect("port b");
        assert_eq!(a, 51821);
        assert_eq!(b, 51822);

        // Once `a` is released it is the lowest free port again.
        alloc.release(a);
        let reused = alloc.allocate().expect("reused port");
        assert_eq!(reused, 51821);
    }

    #[test]
    fn reserved_port_is_skipped_by_allocate() {
        // Seed as if 51821 had been persisted in the marker for another service.
        let mut alloc = DedicatedPortAllocator::new(51820, [51821]);
        assert!(alloc.is_used(51821));
        let first = alloc.allocate().expect("first allocation");
        assert_eq!(first, 51822);

        // A reservation made after construction is honored as well.
        alloc.reserve(51823);
        let next = alloc.allocate().expect("next allocation");
        assert_eq!(next, 51824);
    }

    #[test]
    fn band_exhaustion_errors() {
        // Seed the allocator with every port in the band so nothing is left.
        let base = 51820u16;
        let whole_band = base + 1..=base + DEDICATED_PORT_BAND;
        let mut alloc = DedicatedPortAllocator::new(base, whole_band);

        let err = alloc.allocate().expect_err("band must be exhausted");
        let is_exhaustion =
            matches!(err, crate::error::OverlaydError::Other(ref m) if m.contains("exhausted"));
        assert!(is_exhaustion, "unexpected error: {err:?}");
    }
}