// zlayer_overlay/bootstrap.rs
1//! Overlay network bootstrap functionality
2//!
3//! Provides initialization and joining capabilities for overlay networks,
4//! including keypair generation, interface creation, and peer management.
5
6use crate::allocator::IpAllocator;
7use crate::config::PeerInfo;
8use crate::dns::{peer_hostname, DnsConfig, DnsHandle, DnsServer, DEFAULT_DNS_PORT};
9use crate::error::{OverlayError, Result};
10#[cfg(feature = "nat")]
11use crate::nat::{Candidate, ConnectionType, NatTraversal, RelayServer};
12use crate::transport::OverlayTransport;
13use serde::{Deserialize, Serialize};
14use std::net::{IpAddr, SocketAddr};
15use std::path::{Path, PathBuf};
16use std::time::Duration;
17use tracing::{debug, info, warn};
18
/// Default overlay interface name for `ZLayer`
///
/// On macOS, this is `"utun"` which tells boringtun to let the kernel
/// auto-assign a `utunN` device. On Linux, a custom name is used.
#[cfg(target_os = "macos")]
pub const DEFAULT_INTERFACE_NAME: &str = "utun";
#[cfg(not(target_os = "macos"))]
pub const DEFAULT_INTERFACE_NAME: &str = "zl-overlay0";

/// Default overlay listen port (re-exported from `zlayer-core`).
pub use zlayer_core::DEFAULT_WG_PORT;

/// Default overlay network CIDR (IPv4)
pub const DEFAULT_OVERLAY_CIDR: &str = "10.200.0.0/16";

/// Default overlay network CIDR (IPv6)
///
/// Uses a ULA (Unique Local Address) prefix in the `fd00::/8` range.
/// The `fd00:200::/48` prefix mirrors the IPv4 `10.200.0.0/16` convention.
pub const DEFAULT_OVERLAY_CIDR_V6: &str = "fd00:200::/48";

/// Default persistent keepalive interval (seconds)
///
/// 25 seconds is the commonly used WireGuard persistent-keepalive value;
/// frequent enough to hold NAT mappings open without excessive traffic.
pub const DEFAULT_KEEPALIVE_SECS: u16 = 25;
42
/// Overlay network bootstrap configuration
///
/// Contains all configuration needed to initialize and manage
/// an overlay network on a node.
///
/// Note: this struct (including `private_key`) is serialized to disk by
/// `OverlayBootstrap::save()`, so the state file should be treated as
/// sensitive.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BootstrapConfig {
    /// Network CIDR (e.g., "10.200.0.0/16")
    pub cidr: String,

    /// This node's overlay IP address (IPv4 or IPv6)
    pub node_ip: IpAddr,

    /// Overlay interface name
    ///
    /// May be rewritten at runtime: on macOS the kernel picks the actual
    /// `utunN` name when the interface is created (see `start()`).
    pub interface: String,

    /// Overlay listen port
    pub port: u16,

    /// This node's overlay private key
    pub private_key: String,

    /// This node's overlay public key
    pub public_key: String,

    /// Whether this node is the cluster leader
    pub is_leader: bool,

    /// Creation timestamp (Unix epoch seconds)
    pub created_at: u64,
}
73
74impl BootstrapConfig {
75    /// Get the overlay IP with host prefix for allowed IPs
76    ///
77    /// Returns `/32` for IPv4 addresses and `/128` for IPv6 addresses.
78    #[must_use]
79    pub fn allowed_ip(&self) -> String {
80        let prefix = match self.node_ip {
81            IpAddr::V4(_) => 32,
82            IpAddr::V6(_) => 128,
83        };
84        format!("{}/{}", self.node_ip, prefix)
85    }
86}
87
/// Peer configuration for overlay network
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PeerConfig {
    /// Peer's node ID (for identification)
    pub node_id: String,

    /// Peer's overlay public key
    pub public_key: String,

    /// Peer's public endpoint (host:port)
    pub endpoint: String,

    /// Peer's overlay IP address (IPv4 or IPv6)
    ///
    /// On leader nodes this is overwritten by the allocator in `add_peer()`.
    pub overlay_ip: IpAddr,

    /// Optional persistent keepalive interval in seconds
    ///
    /// Falls back to `DEFAULT_KEEPALIVE_SECS` when `None` (see `to_peer_info`).
    #[serde(default)]
    pub keepalive: Option<u16>,

    /// Optional custom DNS hostname for this peer (without zone suffix)
    /// If provided, the peer will be registered with this name in addition
    /// to the auto-generated IP-based hostname.
    #[serde(default)]
    pub hostname: Option<String>,

    /// NAT traversal candidates for this peer
    ///
    /// Only present when built with the `nat` feature.
    #[serde(default)]
    #[cfg(feature = "nat")]
    pub candidates: Vec<Candidate>,

    /// How this peer is currently connected
    ///
    /// Only present when built with the `nat` feature.
    #[serde(default)]
    #[cfg(feature = "nat")]
    pub connection_type: ConnectionType,
}
123
124impl PeerConfig {
125    /// Create a new peer configuration
126    #[must_use]
127    pub fn new(node_id: String, public_key: String, endpoint: String, overlay_ip: IpAddr) -> Self {
128        Self {
129            node_id,
130            public_key,
131            endpoint,
132            overlay_ip,
133            keepalive: Some(DEFAULT_KEEPALIVE_SECS),
134            hostname: None,
135            #[cfg(feature = "nat")]
136            candidates: Vec::new(),
137            #[cfg(feature = "nat")]
138            connection_type: ConnectionType::default(),
139        }
140    }
141
142    /// Set a custom DNS hostname for this peer
143    #[must_use]
144    pub fn with_hostname(mut self, hostname: impl Into<String>) -> Self {
145        self.hostname = Some(hostname.into());
146        self
147    }
148
149    /// Convert to `PeerInfo` for overlay transport configuration
150    ///
151    /// # Errors
152    ///
153    /// Returns an error if the endpoint address cannot be parsed.
154    pub fn to_peer_info(&self) -> std::result::Result<PeerInfo, Box<dyn std::error::Error>> {
155        let endpoint: SocketAddr = self.endpoint.parse()?;
156        let keepalive =
157            Duration::from_secs(u64::from(self.keepalive.unwrap_or(DEFAULT_KEEPALIVE_SECS)));
158        let prefix = match self.overlay_ip {
159            IpAddr::V4(_) => 32,
160            IpAddr::V6(_) => 128,
161        };
162
163        Ok(PeerInfo::new(
164            self.public_key.clone(),
165            endpoint,
166            &format!("{}/{}", self.overlay_ip, prefix),
167            keepalive,
168        ))
169    }
170}
171
/// Persistent state for the overlay bootstrap
///
/// This is the exact JSON shape written to `overlay_bootstrap.json` by
/// `OverlayBootstrap::save()` and read back by `load()`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BootstrapState {
    /// Bootstrap configuration
    pub config: BootstrapConfig,

    /// List of configured peers
    pub peers: Vec<PeerConfig>,

    /// IP allocator state (only for leader)
    ///
    /// Omitted from the serialized form on worker nodes, which never
    /// carry an allocator.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub allocator_state: Option<crate::allocator::IpAllocatorState>,
}
185
/// Bootstrap manager for overlay network
///
/// Handles overlay network initialization, peer management,
/// and overlay transport interface configuration.
///
/// Construct via `init_leader`, `join`, or `load`; then call `start()`
/// to bring the interface (and optionally DNS / NAT traversal) up.
pub struct OverlayBootstrap {
    /// Bootstrap configuration
    config: BootstrapConfig,

    /// Configured peers
    peers: Vec<PeerConfig>,

    /// Data directory for persistent state
    data_dir: PathBuf,

    /// IP allocator (only for leader nodes)
    allocator: Option<IpAllocator>,

    /// DNS configuration (opt-in)
    ///
    /// Set via `with_dns`/`with_dns_default`; not persisted, so it must be
    /// re-enabled after `load()`.
    dns_config: Option<DnsConfig>,

    /// DNS handle for managing records (available after `start()` if DNS enabled)
    dns_handle: Option<DnsHandle>,

    /// Overlay transport (boringtun device handle).
    ///
    /// Must be kept alive for the overlay network lifetime; dropping the
    /// transport destroys the TUN device.
    transport: Option<OverlayTransport>,

    /// NAT traversal orchestrator (available after `start()` if NAT is enabled)
    #[cfg(feature = "nat")]
    nat_traversal: Option<NatTraversal>,

    /// Built-in relay server (available after `start()` if relay is configured)
    #[cfg(feature = "nat")]
    relay_server: Option<RelayServer>,
}
223
224impl OverlayBootstrap {
225    /// Initialize as cluster leader (first node in the overlay)
226    ///
227    /// This generates a new overlay keypair, allocates the first IP
228    /// in the CIDR range, and prepares the node as the overlay leader.
229    ///
230    /// # Arguments
231    /// * `cidr` - Overlay network CIDR (e.g., "10.200.0.0/16")
232    /// * `port` - Overlay listen port
233    /// * `data_dir` - Directory for persistent state
234    ///
235    /// # Example
236    /// ```ignore
237    /// let bootstrap = OverlayBootstrap::init_leader(
238    ///     "10.200.0.0/16",
239    ///     51820,
240    ///     Path::new("/var/lib/zlayer"),
241    /// ).await?;
242    /// ```
243    ///
244    /// # Errors
245    ///
246    /// Returns an error if already initialized, key generation fails, or state cannot be saved.
247    pub async fn init_leader(cidr: &str, port: u16, data_dir: &Path) -> Result<Self> {
248        // Check if already initialized
249        let config_path = data_dir.join("overlay_bootstrap.json");
250        if config_path.exists() {
251            return Err(OverlayError::AlreadyInitialized(
252                config_path.display().to_string(),
253            ));
254        }
255
256        // Ensure data directory exists
257        tokio::fs::create_dir_all(data_dir).await?;
258
259        // Generate overlay keypair
260        info!("Generating overlay keypair for leader");
261        let (private_key, public_key) = OverlayTransport::generate_keys()
262            .await
263            .map_err(|e| OverlayError::TransportCommand(e.to_string()))?;
264
265        // Initialize IP allocator and allocate first IP for leader
266        let mut allocator = IpAllocator::new(cidr)?;
267        let node_ip = allocator.allocate_first()?;
268
269        info!(node_ip = %node_ip, cidr = cidr, "Allocated leader IP");
270
271        // Create config
272        let config = BootstrapConfig {
273            cidr: cidr.to_string(),
274            node_ip,
275            interface: DEFAULT_INTERFACE_NAME.to_string(),
276            port,
277            private_key,
278            public_key,
279            is_leader: true,
280            created_at: current_timestamp(),
281        };
282
283        let bootstrap = Self {
284            config,
285            peers: Vec::new(),
286            data_dir: data_dir.to_path_buf(),
287            allocator: Some(allocator),
288            dns_config: None,
289            dns_handle: None,
290            transport: None,
291            #[cfg(feature = "nat")]
292            nat_traversal: None,
293            #[cfg(feature = "nat")]
294            relay_server: None,
295        };
296
297        // Persist state
298        bootstrap.save().await?;
299
300        Ok(bootstrap)
301    }
302
303    /// Join an existing overlay network
304    ///
305    /// Generates a new overlay keypair and configures this node
306    /// to connect to an existing overlay network.
307    ///
308    /// # Arguments
309    /// * `leader_cidr` - Leader's overlay network CIDR
310    /// * `leader_endpoint` - Leader's public endpoint (host:port)
311    /// * `leader_public_key` - Leader's overlay public key
312    /// * `leader_overlay_ip` - Leader's overlay IP address
313    /// * `allocated_ip` - IP address allocated for this node by the leader
314    /// * `port` - Overlay listen port for this node
315    /// * `data_dir` - Directory for persistent state
316    ///
317    /// # Errors
318    ///
319    /// Returns an error if already initialized, key generation fails, or state cannot be saved.
320    pub async fn join(
321        leader_cidr: &str,
322        leader_endpoint: &str,
323        leader_public_key: &str,
324        leader_overlay_ip: IpAddr,
325        allocated_ip: IpAddr,
326        port: u16,
327        data_dir: &Path,
328    ) -> Result<Self> {
329        // Check if already initialized
330        let config_path = data_dir.join("overlay_bootstrap.json");
331        if config_path.exists() {
332            return Err(OverlayError::AlreadyInitialized(
333                config_path.display().to_string(),
334            ));
335        }
336
337        // Ensure data directory exists
338        tokio::fs::create_dir_all(data_dir).await?;
339
340        // Generate overlay keypair for this node
341        info!("Generating overlay keypair for joining node");
342        let (private_key, public_key) = OverlayTransport::generate_keys()
343            .await
344            .map_err(|e| OverlayError::TransportCommand(e.to_string()))?;
345
346        // Create config
347        let config = BootstrapConfig {
348            cidr: leader_cidr.to_string(),
349            node_ip: allocated_ip,
350            interface: DEFAULT_INTERFACE_NAME.to_string(),
351            port,
352            private_key,
353            public_key,
354            is_leader: false,
355            created_at: current_timestamp(),
356        };
357
358        // Add leader as the first peer
359        let leader_peer = PeerConfig {
360            node_id: "leader".to_string(),
361            public_key: leader_public_key.to_string(),
362            endpoint: leader_endpoint.to_string(),
363            overlay_ip: leader_overlay_ip,
364            keepalive: Some(DEFAULT_KEEPALIVE_SECS),
365            hostname: None, // Leader gets its own DNS alias "leader.zone"
366            #[cfg(feature = "nat")]
367            candidates: Vec::new(),
368            #[cfg(feature = "nat")]
369            connection_type: ConnectionType::default(),
370        };
371
372        info!(
373            leader_endpoint = leader_endpoint,
374            overlay_ip = %allocated_ip,
375            "Configured leader as peer"
376        );
377
378        let bootstrap = Self {
379            config,
380            peers: vec![leader_peer],
381            data_dir: data_dir.to_path_buf(),
382            allocator: None, // Workers don't manage IP allocation
383            dns_config: None,
384            dns_handle: None,
385            transport: None,
386            #[cfg(feature = "nat")]
387            nat_traversal: None,
388            #[cfg(feature = "nat")]
389            relay_server: None,
390        };
391
392        // Persist state
393        bootstrap.save().await?;
394
395        Ok(bootstrap)
396    }
397
398    /// Load existing bootstrap state from disk
399    ///
400    /// # Errors
401    ///
402    /// Returns an error if the state file is missing, unreadable, or invalid.
403    pub async fn load(data_dir: &Path) -> Result<Self> {
404        let config_path = data_dir.join("overlay_bootstrap.json");
405
406        if !config_path.exists() {
407            return Err(OverlayError::NotInitialized);
408        }
409
410        let contents = tokio::fs::read_to_string(&config_path).await?;
411        let state: BootstrapState = serde_json::from_str(&contents)?;
412
413        let allocator = if let Some(alloc_state) = state.allocator_state {
414            Some(IpAllocator::from_state(alloc_state)?)
415        } else {
416            None
417        };
418
419        Ok(Self {
420            config: state.config,
421            peers: state.peers,
422            data_dir: data_dir.to_path_buf(),
423            allocator,
424            dns_config: None, // DNS config must be re-enabled after load
425            dns_handle: None,
426            transport: None,
427            #[cfg(feature = "nat")]
428            nat_traversal: None,
429            #[cfg(feature = "nat")]
430            relay_server: None,
431        })
432    }
433
434    /// Save bootstrap state to disk
435    ///
436    /// # Errors
437    ///
438    /// Returns an error if serialization or file writing fails.
439    pub async fn save(&self) -> Result<()> {
440        let config_path = self.data_dir.join("overlay_bootstrap.json");
441
442        let state = BootstrapState {
443            config: self.config.clone(),
444            peers: self.peers.clone(),
445            allocator_state: self
446                .allocator
447                .as_ref()
448                .map(super::allocator::IpAllocator::to_state),
449        };
450
451        let contents = serde_json::to_string_pretty(&state)?;
452        tokio::fs::write(&config_path, contents).await?;
453
454        debug!(path = %config_path.display(), "Saved bootstrap state");
455        Ok(())
456    }
457
458    /// Enable DNS service discovery for the overlay network
459    ///
460    /// When DNS is enabled, peers are automatically registered with both:
461    /// - An IP-based hostname: `node-X-Y.zone` (e.g., `node-0-5.overlay.local`)
462    /// - A custom hostname if provided in `PeerConfig`
463    ///
464    /// The leader node additionally gets a `leader.zone` alias.
465    ///
466    /// # Arguments
467    /// * `zone` - DNS zone (e.g., "overlay.local.")
468    /// * `port` - DNS server port (default: 15353 to avoid conflicts)
469    ///
470    /// # Example
471    /// ```ignore
472    /// let bootstrap = OverlayBootstrap::init_leader(cidr, port, data_dir)
473    ///     .await?
474    ///     .with_dns("overlay.local.", 15353)?;
475    /// bootstrap.start().await?;
476    /// ```
477    ///
478    /// # Errors
479    ///
480    /// This method currently always succeeds but returns `Result` for API consistency.
481    pub fn with_dns(mut self, zone: &str, port: u16) -> Result<Self> {
482        self.dns_config = Some(DnsConfig {
483            zone: zone.to_string(),
484            port,
485            bind_addr: self.config.node_ip,
486        });
487        Ok(self)
488    }
489
490    /// Enable DNS with default port (15353)
491    ///
492    /// # Errors
493    ///
494    /// This method currently always succeeds but returns `Result` for API consistency.
495    pub fn with_dns_default(self, zone: &str) -> Result<Self> {
496        self.with_dns(zone, DEFAULT_DNS_PORT)
497    }
498
499    /// Get the DNS handle for managing records
500    ///
501    /// Returns None if DNS is not enabled or `start()` hasn't been called yet.
502    #[must_use]
503    pub fn dns_handle(&self) -> Option<&DnsHandle> {
504        self.dns_handle.as_ref()
505    }
506
507    /// Check if DNS is enabled
508    #[must_use]
509    pub fn dns_enabled(&self) -> bool {
510        self.dns_config.is_some()
511    }
512
513    /// Start the overlay network (create and configure overlay transport)
514    ///
515    /// This creates the boringtun TUN interface, assigns the overlay IP,
516    /// configures all known peers, and starts the DNS server if enabled.
517    ///
518    /// # Errors
519    ///
520    /// Returns an error if interface creation, peer configuration, or DNS startup fails.
521    pub async fn start(&mut self) -> Result<()> {
522        info!(
523            interface = %self.config.interface,
524            overlay_ip = %self.config.node_ip,
525            port = self.config.port,
526            dns_enabled = self.dns_config.is_some(),
527            "Starting overlay network"
528        );
529
530        // Convert our config to OverlayConfig
531        let overlay_config = crate::config::OverlayConfig {
532            local_endpoint: SocketAddr::new(
533                std::net::IpAddr::V4(std::net::Ipv4Addr::UNSPECIFIED),
534                self.config.port,
535            ),
536            private_key: self.config.private_key.clone(),
537            public_key: self.config.public_key.clone(),
538            overlay_cidr: self.config.allowed_ip(),
539            peer_discovery_interval: Duration::from_secs(30),
540            #[cfg(feature = "nat")]
541            nat: crate::nat::NatConfig::default(),
542        };
543
544        #[cfg(feature = "nat")]
545        let nat_config = overlay_config.nat.clone();
546
547        // Create overlay transport
548        let mut transport = OverlayTransport::new(overlay_config, self.config.interface.clone());
549
550        // Create the interface
551        transport
552            .create_interface()
553            .await
554            .map_err(|e| OverlayError::TransportCommand(e.to_string()))?;
555
556        // On macOS, the kernel assigns a utunN name that may differ from
557        // the requested name. Update our config to reflect the actual name.
558        let actual_name = transport.interface_name().to_string();
559        if actual_name != self.config.interface {
560            info!(
561                requested = %self.config.interface,
562                actual = %actual_name,
563                "Interface name resolved by kernel"
564            );
565            self.config.interface = actual_name;
566        }
567
568        // Convert peers to PeerInfo
569        let peer_infos: Vec<PeerInfo> = self
570            .peers
571            .iter()
572            .filter_map(|p| match p.to_peer_info() {
573                Ok(info) => Some(info),
574                Err(e) => {
575                    warn!(peer = %p.node_id, error = %e, "Failed to parse peer info");
576                    None
577                }
578            })
579            .collect();
580
581        // Configure transport with peers
582        transport
583            .configure(&peer_infos)
584            .await
585            .map_err(|e| OverlayError::TransportCommand(e.to_string()))?;
586
587        // Store the transport so the TUN device stays alive for the overlay
588        // lifetime. Dropping the OverlayTransport destroys the boringtun device.
589        self.transport = Some(transport);
590
591        // NAT traversal: gather candidates and connect to peers
592        #[cfg(feature = "nat")]
593        self.start_nat_traversal(nat_config).await;
594
595        // Start DNS server if configured
596        self.start_dns().await?;
597
598        info!("Overlay network started successfully");
599        Ok(())
600    }
601
602    /// Start the DNS server and register all known peers.
603    async fn start_dns(&mut self) -> Result<()> {
604        let Some(dns_config) = &self.dns_config else {
605            return Ok(());
606        };
607
608        info!(
609            zone = %dns_config.zone,
610            port = dns_config.port,
611            "Starting DNS server for overlay"
612        );
613
614        let dns_server =
615            DnsServer::from_config(dns_config).map_err(|e| OverlayError::Dns(e.to_string()))?;
616
617        // Register self with IP-based hostname
618        let self_hostname = peer_hostname(self.config.node_ip);
619        dns_server
620            .add_record(&self_hostname, self.config.node_ip)
621            .await
622            .map_err(|e| OverlayError::Dns(e.to_string()))?;
623
624        // If leader, also register "leader" alias
625        if self.config.is_leader {
626            dns_server
627                .add_record("leader", self.config.node_ip)
628                .await
629                .map_err(|e| OverlayError::Dns(e.to_string()))?;
630            debug!(ip = %self.config.node_ip, "Registered leader.{}", dns_config.zone);
631        }
632
633        // Register existing peers
634        for peer in &self.peers {
635            // Always register IP-based hostname
636            let hostname = peer_hostname(peer.overlay_ip);
637            dns_server
638                .add_record(&hostname, peer.overlay_ip)
639                .await
640                .map_err(|e| OverlayError::Dns(e.to_string()))?;
641
642            // Also register custom hostname if provided
643            if let Some(custom) = &peer.hostname {
644                dns_server
645                    .add_record(custom, peer.overlay_ip)
646                    .await
647                    .map_err(|e| OverlayError::Dns(e.to_string()))?;
648                debug!(
649                    hostname = custom,
650                    ip = %peer.overlay_ip,
651                    "Registered custom hostname"
652                );
653            }
654        }
655
656        // Start the DNS server and store the handle
657        let handle = dns_server
658            .start()
659            .await
660            .map_err(|e| OverlayError::Dns(e.to_string()))?;
661        self.dns_handle = Some(handle);
662
663        info!("DNS server started successfully");
664        Ok(())
665    }
666
    /// Initialize NAT traversal, gather candidates, and connect to known peers.
    ///
    /// Best-effort: relay startup, candidate gathering, and per-peer hole
    /// punching all log warnings on failure rather than returning errors.
    /// On success the orchestrator is stored in `self.nat_traversal`.
    #[cfg(feature = "nat")]
    async fn start_nat_traversal(&mut self, nat_config: crate::nat::NatConfig) {
        if !nat_config.enabled {
            return;
        }

        // Optionally start built-in relay server
        // (failure here is non-fatal; direct connections may still work).
        if let Some(ref relay_config) = nat_config.relay_server {
            let relay = RelayServer::new(relay_config, &self.config.private_key);
            match relay.start().await {
                Ok(()) => {
                    info!("Built-in relay server started");
                    self.relay_server = Some(relay);
                }
                Err(e) => {
                    warn!(error = %e, "Failed to start relay server");
                }
            }
        }

        let mut nat = NatTraversal::new(nat_config, self.config.port);
        match nat.gather_candidates().await {
            Ok(candidates) => {
                info!(count = candidates.len(), "Gathered NAT candidates");
                // Requires a live transport; peers with no advertised
                // candidates are skipped entirely.
                if let Some(ref transport) = self.transport {
                    for peer in &mut self.peers {
                        if !peer.candidates.is_empty() {
                            match nat
                                .connect_to_peer(transport, &peer.public_key, &peer.candidates)
                                .await
                            {
                                Ok(ct) => {
                                    // Record how we ended up connected
                                    // (direct, hole-punched, relayed, ...).
                                    peer.connection_type = ct;
                                    info!(
                                        peer = %peer.node_id,
                                        connection = %ct,
                                        "NAT traversal succeeded"
                                    );
                                }
                                Err(e) => warn!(
                                    peer = %peer.node_id,
                                    error = %e,
                                    "NAT traversal failed"
                                ),
                            }
                        }
                    }
                }
                self.nat_traversal = Some(nat);
            }
            Err(e) => warn!(error = %e, "NAT candidate gathering failed"),
        }
    }
721
722    /// Stop the overlay network (shut down the boringtun transport)
723    ///
724    /// # Errors
725    ///
726    /// This method currently always succeeds but returns `Result` for API consistency.
727    #[allow(clippy::unused_async)]
728    pub async fn stop(&mut self) -> Result<()> {
729        info!(interface = %self.config.interface, "Stopping overlay network");
730
731        if let Some(mut transport) = self.transport.take() {
732            transport.shutdown();
733        }
734
735        Ok(())
736    }
737
738    /// Add a new peer to the overlay network
739    ///
740    /// For leader nodes, this also allocates an IP address for the peer.
741    ///
742    /// # Errors
743    ///
744    /// Returns an error if no IPs are available, DNS registration fails, or state cannot be saved.
745    pub async fn add_peer(&mut self, mut peer: PeerConfig) -> Result<IpAddr> {
746        // If we're the leader, allocate an IP for this peer
747        let overlay_ip = if let Some(ref mut allocator) = self.allocator {
748            let ip = allocator.allocate().ok_or(OverlayError::NoAvailableIps)?;
749            peer.overlay_ip = ip;
750            ip
751        } else {
752            peer.overlay_ip
753        };
754
755        // Add peer to overlay transport via UAPI
756        if let Ok(peer_info) = peer.to_peer_info() {
757            // Prefer the stored transport; fall back to a temporary instance
758            // (UAPI calls work via the Unix socket regardless of DeviceHandle)
759            let transport_ref: Option<&OverlayTransport> = self.transport.as_ref();
760
761            let result = if let Some(t) = transport_ref {
762                t.add_peer(&peer_info).await
763            } else {
764                let overlay_config = crate::config::OverlayConfig {
765                    local_endpoint: SocketAddr::new(
766                        std::net::IpAddr::V4(std::net::Ipv4Addr::UNSPECIFIED),
767                        self.config.port,
768                    ),
769                    private_key: self.config.private_key.clone(),
770                    public_key: self.config.public_key.clone(),
771                    overlay_cidr: self.config.allowed_ip(),
772                    peer_discovery_interval: Duration::from_secs(30),
773                    #[cfg(feature = "nat")]
774                    nat: crate::nat::NatConfig::default(),
775                };
776                let tmp = OverlayTransport::new(overlay_config, self.config.interface.clone());
777                tmp.add_peer(&peer_info).await
778            };
779
780            match result {
781                Ok(()) => debug!(peer = %peer.node_id, "Added peer to overlay"),
782                Err(e) => {
783                    warn!(peer = %peer.node_id, error = %e, "Failed to add peer to overlay (interface may not be up)");
784                }
785            }
786        }
787
788        // Register peer in DNS if enabled
789        if let Some(ref dns_handle) = self.dns_handle {
790            // IP-based hostname
791            let hostname = peer_hostname(overlay_ip);
792            dns_handle
793                .add_record(&hostname, overlay_ip)
794                .await
795                .map_err(|e| OverlayError::Dns(e.to_string()))?;
796            debug!(hostname = %hostname, ip = %overlay_ip, "Registered peer in DNS");
797
798            // Custom hostname alias if provided
799            if let Some(ref custom) = peer.hostname {
800                dns_handle
801                    .add_record(custom, overlay_ip)
802                    .await
803                    .map_err(|e| OverlayError::Dns(e.to_string()))?;
804                debug!(hostname = %custom, ip = %overlay_ip, "Registered custom hostname in DNS");
805            }
806        }
807
808        // NAT traversal for new peer
809        #[cfg(feature = "nat")]
810        {
811            if let (Some(ref nat), Some(ref transport)) = (&self.nat_traversal, &self.transport) {
812                if !peer.candidates.is_empty() {
813                    match nat
814                        .connect_to_peer(transport, &peer.public_key, &peer.candidates)
815                        .await
816                    {
817                        Ok(ct) => {
818                            peer.connection_type = ct;
819                            info!(
820                                peer = %peer.node_id,
821                                connection = %ct,
822                                "NAT traversal for new peer"
823                            );
824                        }
825                        Err(e) => warn!(
826                            peer = %peer.node_id,
827                            error = %e,
828                            "NAT failed for new peer"
829                        ),
830                    }
831                }
832            }
833        }
834
835        // Add to peer list
836        self.peers.push(peer);
837
838        // Persist state
839        self.save().await?;
840
841        info!(peer_ip = %overlay_ip, "Added peer to overlay");
842        Ok(overlay_ip)
843    }
844
845    /// Remove a peer from the overlay network
846    ///
847    /// # Errors
848    ///
849    /// Returns an error if the peer is not found, DNS removal fails, or state cannot be saved.
850    pub async fn remove_peer(&mut self, public_key: &str) -> Result<()> {
851        // Find the peer
852        let peer_idx = self
853            .peers
854            .iter()
855            .position(|p| p.public_key == public_key)
856            .ok_or_else(|| OverlayError::PeerNotFound(public_key.to_string()))?;
857
858        let peer = &self.peers[peer_idx];
859
860        // Capture peer info for DNS removal before we lose the reference
861        let peer_overlay_ip = peer.overlay_ip;
862        let peer_custom_hostname = peer.hostname.clone();
863
864        // Release IP if we're managing allocation
865        if let Some(ref mut allocator) = self.allocator {
866            allocator.release(peer_overlay_ip);
867        }
868
869        // Remove from DNS if enabled
870        if let Some(ref dns_handle) = self.dns_handle {
871            // Remove IP-based hostname
872            let hostname = peer_hostname(peer_overlay_ip);
873            dns_handle
874                .remove_record(&hostname)
875                .await
876                .map_err(|e| OverlayError::Dns(e.to_string()))?;
877            debug!(hostname = %hostname, "Removed peer from DNS");
878
879            // Remove custom hostname if it was set
880            if let Some(ref custom) = peer_custom_hostname {
881                dns_handle
882                    .remove_record(custom)
883                    .await
884                    .map_err(|e| OverlayError::Dns(e.to_string()))?;
885                debug!(hostname = %custom, "Removed custom hostname from DNS");
886            }
887        }
888
889        // Remove peer from overlay transport via UAPI
890        let transport_ref: Option<&OverlayTransport> = self.transport.as_ref();
891
892        let result = if let Some(t) = transport_ref {
893            t.remove_peer(public_key).await
894        } else {
895            let overlay_config = crate::config::OverlayConfig {
896                local_endpoint: SocketAddr::new(
897                    std::net::IpAddr::V4(std::net::Ipv4Addr::UNSPECIFIED),
898                    self.config.port,
899                ),
900                private_key: self.config.private_key.clone(),
901                public_key: self.config.public_key.clone(),
902                overlay_cidr: self.config.allowed_ip(),
903                peer_discovery_interval: Duration::from_secs(30),
904                #[cfg(feature = "nat")]
905                nat: crate::nat::NatConfig::default(),
906            };
907            let tmp = OverlayTransport::new(overlay_config, self.config.interface.clone());
908            tmp.remove_peer(public_key).await
909        };
910
911        match result {
912            Ok(()) => debug!(public_key = public_key, "Removed peer from overlay"),
913            Err(e) => {
914                warn!(public_key = public_key, error = %e, "Failed to remove peer from overlay");
915            }
916        }
917
918        // Remove from peer list
919        self.peers.remove(peer_idx);
920
921        // Persist state
922        self.save().await?;
923
924        info!(public_key = public_key, "Removed peer from overlay");
925        Ok(())
926    }
927
928    /// Get this node's public key
929    #[must_use]
930    pub fn public_key(&self) -> &str {
931        &self.config.public_key
932    }
933
934    /// Get this node's overlay IP (IPv4 or IPv6)
935    #[must_use]
936    pub fn node_ip(&self) -> IpAddr {
937        self.config.node_ip
938    }
939
940    /// Get the overlay CIDR
941    #[must_use]
942    pub fn cidr(&self) -> &str {
943        &self.config.cidr
944    }
945
946    /// Get the overlay interface name
947    #[must_use]
948    pub fn interface(&self) -> &str {
949        &self.config.interface
950    }
951
952    /// Get the overlay listen port
953    #[must_use]
954    pub fn port(&self) -> u16 {
955        self.config.port
956    }
957
958    /// Check if this node is the leader
959    #[must_use]
960    pub fn is_leader(&self) -> bool {
961        self.config.is_leader
962    }
963
964    /// Get configured peers
965    #[must_use]
966    pub fn peers(&self) -> &[PeerConfig] {
967        &self.peers
968    }
969
970    /// Get the bootstrap config
971    #[must_use]
972    pub fn config(&self) -> &BootstrapConfig {
973        &self.config
974    }
975
976    /// Allocate an IP for a new peer (leader only)
977    ///
978    /// This is used by the control plane when processing join requests.
979    ///
980    /// # Errors
981    ///
982    /// Returns an error if this node is not a leader or no IPs are available.
983    pub fn allocate_peer_ip(&mut self) -> Result<IpAddr> {
984        let allocator = self
985            .allocator
986            .as_mut()
987            .ok_or(OverlayError::Config("Not a leader node".to_string()))?;
988
989        allocator.allocate().ok_or(OverlayError::NoAvailableIps)
990    }
991
992    /// Get IP allocation statistics (leader only)
993    #[must_use]
994    #[allow(clippy::cast_possible_truncation)]
995    pub fn allocation_stats(&self) -> Option<(u32, u32)> {
996        self.allocator
997            .as_ref()
998            .map(|a| (a.allocated_count() as u32, a.total_hosts()))
999    }
1000
1001    /// Perform NAT maintenance: refresh STUN, attempt relay upgrades.
1002    ///
1003    /// Call this periodically from the runtime's main loop. Re-probes
1004    /// STUN servers to detect reflexive address changes and attempts
1005    /// to upgrade relayed connections to direct or hole-punched.
1006    ///
1007    /// # Errors
1008    ///
1009    /// Returns an error if STUN refresh fails.
1010    #[cfg(feature = "nat")]
1011    pub async fn nat_maintenance_tick(&mut self) -> Result<()> {
1012        let (Some(nat), Some(transport)) = (&mut self.nat_traversal, &self.transport) else {
1013            return Ok(());
1014        };
1015
1016        if nat.refresh().await? {
1017            info!("Reflexive address changed");
1018        }
1019
1020        for peer in &mut self.peers {
1021            if peer.connection_type == ConnectionType::Relayed && !peer.candidates.is_empty() {
1022                if let Ok(Some(upgraded)) = nat
1023                    .attempt_upgrade(transport, &peer.public_key, &peer.candidates)
1024                    .await
1025                {
1026                    peer.connection_type = upgraded;
1027                    info!(
1028                        peer = %peer.node_id,
1029                        connection = %upgraded,
1030                        "Upgraded relayed connection"
1031                    );
1032                }
1033            }
1034        }
1035
1036        Ok(())
1037    }
1038
1039    /// Get this node's NAT candidates for sharing with peers.
1040    ///
1041    /// Returns an empty vec if NAT traversal has not been initialized
1042    /// or no candidates were gathered.
1043    #[cfg(feature = "nat")]
1044    #[must_use]
1045    pub fn nat_candidates(&self) -> Vec<Candidate> {
1046        self.nat_traversal
1047            .as_ref()
1048            .map(|n| n.local_candidates().to_vec())
1049            .unwrap_or_default()
1050    }
1051}
1052
/// Seconds since the Unix epoch; 0 if the system clock reads before the epoch.
fn current_timestamp() -> u64 {
    std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map_or(0, |elapsed| elapsed.as_secs())
}
1060
#[cfg(test)]
mod tests {
    use super::*;
    use std::net::Ipv4Addr;

    /// Build a leader `BootstrapConfig` with the given addressing, key pair
    /// (private, public), and creation timestamp.
    fn make_config(
        cidr: &str,
        node_ip: IpAddr,
        keys: (&str, &str),
        created_at: u64,
    ) -> BootstrapConfig {
        BootstrapConfig {
            cidr: cidr.to_string(),
            node_ip,
            interface: DEFAULT_INTERFACE_NAME.to_string(),
            port: DEFAULT_WG_PORT,
            private_key: keys.0.to_string(),
            public_key: keys.1.to_string(),
            is_leader: true,
            created_at,
        }
    }

    /// Build a `PeerConfig` named "node-1" with public key "pubkey123".
    fn sample_peer(endpoint: &str, overlay_ip: IpAddr) -> PeerConfig {
        PeerConfig::new(
            "node-1".to_string(),
            "pubkey123".to_string(),
            endpoint.to_string(),
            overlay_ip,
        )
    }

    /// Round-trip a `BootstrapState` through pretty-printed JSON.
    fn roundtrip(state: &BootstrapState) -> BootstrapState {
        let json = serde_json::to_string_pretty(state).unwrap();
        serde_json::from_str(&json).unwrap()
    }

    #[test]
    fn test_bootstrap_config_allowed_ip_v4() {
        let config = make_config(
            "10.200.0.0/16",
            IpAddr::V4(Ipv4Addr::new(10, 200, 0, 1)),
            ("test_private", "test_public"),
            0,
        );
        assert_eq!(config.allowed_ip(), "10.200.0.1/32");
    }

    #[test]
    fn test_bootstrap_config_allowed_ip_v6() {
        let config = make_config(
            "fd00:200::/48",
            "fd00:200::1".parse::<IpAddr>().unwrap(),
            ("test_private", "test_public"),
            0,
        );
        assert_eq!(config.allowed_ip(), "fd00:200::1/128");
    }

    #[test]
    fn test_peer_config_new_v4() {
        let peer = sample_peer(
            "192.168.1.100:51820",
            IpAddr::V4(Ipv4Addr::new(10, 200, 0, 5)),
        );
        assert_eq!(peer.node_id, "node-1");
        assert_eq!(peer.keepalive, Some(DEFAULT_KEEPALIVE_SECS));
        assert_eq!(peer.hostname, None);
    }

    #[test]
    fn test_peer_config_new_v6() {
        let peer = sample_peer("[::1]:51820", "fd00:200::5".parse::<IpAddr>().unwrap());
        assert_eq!(peer.node_id, "node-1");
        assert_eq!(peer.keepalive, Some(DEFAULT_KEEPALIVE_SECS));
        assert_eq!(peer.hostname, None);
    }

    #[test]
    fn test_peer_config_with_hostname() {
        let peer = sample_peer(
            "192.168.1.100:51820",
            IpAddr::V4(Ipv4Addr::new(10, 200, 0, 5)),
        )
        .with_hostname("web-server");

        assert_eq!(peer.hostname, Some("web-server".to_string()));
    }

    #[test]
    fn test_peer_config_to_peer_info_v4() {
        let peer_info = sample_peer(
            "192.168.1.100:51820",
            IpAddr::V4(Ipv4Addr::new(10, 200, 0, 5)),
        )
        .to_peer_info()
        .unwrap();

        assert_eq!(peer_info.public_key, "pubkey123");
        assert_eq!(peer_info.allowed_ips, "10.200.0.5/32");
    }

    #[test]
    fn test_peer_config_to_peer_info_v6() {
        let peer_info = sample_peer("[::1]:51820", "fd00:200::5".parse::<IpAddr>().unwrap())
            .to_peer_info()
            .unwrap();

        assert_eq!(peer_info.public_key, "pubkey123");
        assert_eq!(peer_info.allowed_ips, "fd00:200::5/128");
    }

    #[test]
    fn test_bootstrap_state_serialization_v4() {
        let state = BootstrapState {
            config: make_config(
                "10.200.0.0/16",
                IpAddr::V4(Ipv4Addr::new(10, 200, 0, 1)),
                ("private", "public"),
                1_234_567_890,
            ),
            peers: vec![],
            allocator_state: None,
        };

        let deserialized = roundtrip(&state);
        assert_eq!(deserialized.config.cidr, "10.200.0.0/16");
        assert_eq!(deserialized.config.node_ip.to_string(), "10.200.0.1");
    }

    #[test]
    fn test_bootstrap_state_serialization_v6() {
        let state = BootstrapState {
            config: make_config(
                "fd00:200::/48",
                "fd00:200::1".parse::<IpAddr>().unwrap(),
                ("private", "public"),
                1_234_567_890,
            ),
            peers: vec![],
            allocator_state: None,
        };

        let deserialized = roundtrip(&state);
        assert_eq!(deserialized.config.cidr, "fd00:200::/48");
        assert_eq!(deserialized.config.node_ip.to_string(), "fd00:200::1");
    }

    #[test]
    fn test_default_overlay_cidr_v6_constant() {
        // Verify the IPv6 CIDR constant is a valid /48 IPv6 network.
        let net: ipnet::IpNet = DEFAULT_OVERLAY_CIDR_V6.parse().unwrap();
        assert!(matches!(net, ipnet::IpNet::V6(_)));
        assert_eq!(net.prefix_len(), 48);
    }
}