Skip to main content

nucleus/network/
bridge.rs

1use super::{egress, netlink, netns};
2use crate::error::{NucleusError, Result, StateTransition};
3use crate::network::config::{BridgeConfig, EgressPolicy, PortForward};
4use crate::network::NetworkState;
5use std::fs::OpenOptions;
6use std::net::Ipv4Addr;
7use std::os::fd::FromRawFd;
8use std::os::unix::fs::FileTypeExt;
9use std::os::unix::fs::OpenOptionsExt;
10use std::os::unix::io::AsRawFd;
11use std::process::Command;
12use tracing::{debug, info, warn};
13
/// Bridge network manager.
///
/// Holds the values recorded by a successful setup so that teardown can
/// remove the host-side resources again.
pub struct BridgeNetwork {
    /// Bridge configuration, cloned from the caller's config at setup time.
    config: BridgeConfig,
    /// IPv4 address reserved for the container, in dotted-quad text form.
    container_ip: String,
    /// Name of the host-side veth interface (`veth-<pid in hex>`).
    veth_host: String,
    /// Key of the IP reservation file (`<container_id>.ip`).
    container_id: String,
    /// Trimmed contents of `/proc/sys/net/ipv4/ip_forward` as read before
    /// setup wrote `1` to it; `None` when the file could not be read.
    prev_ip_forward: Option<String>,
    /// Lifecycle state; `Active` after setup and `Cleaned` after teardown.
    state: NetworkState,
}
23
24impl BridgeNetwork {
25    fn open_dev_urandom() -> Result<std::fs::File> {
26        let file = OpenOptions::new()
27            .read(true)
28            .custom_flags(libc::O_NOFOLLOW | libc::O_CLOEXEC)
29            .open("/dev/urandom")
30            .map_err(|e| {
31                NucleusError::NetworkError(format!("Failed to open /dev/urandom: {}", e))
32            })?;
33
34        let metadata = file.metadata().map_err(|e| {
35            NucleusError::NetworkError(format!("Failed to stat /dev/urandom: {}", e))
36        })?;
37        if !metadata.file_type().is_char_device() {
38            return Err(NucleusError::NetworkError(
39                "/dev/urandom is not a character device".to_string(),
40            ));
41        }
42
43        Ok(file)
44    }
45
46    /// Set up bridge networking for a container
47    ///
48    /// Creates bridge, veth pair, assigns IPs, enables NAT.
49    /// Must be called from the parent process after fork (needs host netns).
50    ///
51    /// State transitions: Unconfigured -> Configuring -> Active
52    pub fn setup(pid: u32, config: &BridgeConfig) -> Result<Self> {
53        Self::setup_for(pid, config, &format!("{:x}", pid))
54    }
55
56    /// Set up bridge networking with an explicit container ID for IP tracking.
57    pub fn setup_with_id(pid: u32, config: &BridgeConfig, container_id: &str) -> Result<Self> {
58        Self::setup_for(pid, config, container_id)
59    }
60
61    fn setup_for(pid: u32, config: &BridgeConfig, container_id: &str) -> Result<Self> {
62        // Validate all network parameters before using them in shell commands
63        config.validate()?;
64
65        let mut net_state = NetworkState::Unconfigured;
66        net_state = net_state.transition(NetworkState::Configuring)?;
67
68        let alloc_dir = Self::ip_alloc_dir();
69        let container_ip = Self::reserve_ip_in_dir(
70            &alloc_dir,
71            container_id,
72            &config.subnet,
73            config.container_ip.as_deref(),
74        )?;
75        let prefix = Self::subnet_prefix(&config.subnet);
76
77        // Linux interface names max 15 chars; truncate if needed
78        let veth_host_full = format!("veth-{:x}", pid);
79        let veth_cont_full = format!("vethc-{:x}", pid);
80        let veth_host = veth_host_full[..veth_host_full.len().min(15)].to_string();
81        let veth_container = veth_cont_full[..veth_cont_full.len().min(15)].to_string();
82        let mut rollback = SetupRollback::new(
83            veth_host.clone(),
84            config.subnet.clone(),
85            Some((alloc_dir.clone(), container_id.to_string())),
86        );
87
88        // 1. Create bridge if it doesn't exist
89        Self::ensure_bridge_for(&config.bridge_name, &config.subnet)?;
90
91        // 2. Create veth pair
92        netlink::create_veth(&veth_host, &veth_container)?;
93        rollback.veth_created = true;
94
95        // 3. Attach host end to bridge
96        netlink::set_link_master(&veth_host, &config.bridge_name)?;
97        netlink::set_link_up(&veth_host)?;
98
99        // 4. Move container end to container's network namespace
100        netlink::set_link_netns(&veth_container, pid)?;
101
102        // 5. Configure container interface (inside container netns via setns).
103        // Capture the process start time from /proc to detect PID recycling
104        // between the caller passing the PID and our netns operations.
105        let start_ticks = Self::read_pid_start_ticks(pid);
106        if start_ticks == 0 {
107            drop(rollback);
108            return Err(NucleusError::NetworkError(format!(
109                "Cannot read start_ticks for PID {} – process may have exited",
110                pid
111            )));
112        }
113
114        let container_addr: Ipv4Addr = container_ip.parse().map_err(|e| {
115            NucleusError::NetworkError(format!("invalid container IP '{}': {}", container_ip, e))
116        })?;
117        {
118            let vc = veth_container.clone();
119            netns::in_netns(pid, move || {
120                netlink::add_addr(&vc, container_addr, prefix)?;
121                netlink::set_link_up(&vc)?;
122                netlink::set_link_up("lo")?;
123                Ok(())
124            })?;
125        }
126
127        // Verify PID was not recycled during netns operations
128        let current_ticks = Self::read_pid_start_ticks(pid);
129        if current_ticks != start_ticks {
130            drop(rollback);
131            return Err(NucleusError::NetworkError(format!(
132                "PID {} was recycled during network setup (start_ticks changed: {} -> {})",
133                pid, start_ticks, current_ticks
134            )));
135        }
136
137        // 6. Set default route in container
138        let gateway = Self::gateway_from_subnet(&config.subnet);
139        let gateway_addr: Ipv4Addr = gateway.parse().map_err(|e| {
140            NucleusError::NetworkError(format!("invalid gateway IP '{}': {}", gateway, e))
141        })?;
142        netns::in_netns(pid, move || netlink::add_default_route(gateway_addr))?;
143
144        // 7. Enable NAT (masquerade) on the host
145        Self::run_cmd(
146            "iptables",
147            &[
148                "-t",
149                "nat",
150                "-A",
151                "POSTROUTING",
152                "-s",
153                &config.subnet,
154                "-j",
155                "MASQUERADE",
156            ],
157        )?;
158        rollback.nat_added = true;
159
160        // 8. Enable IP forwarding (save previous value for restore on cleanup)
161        let prev_ip_forward = match std::fs::read_to_string("/proc/sys/net/ipv4/ip_forward") {
162            Ok(v) => Some(v.trim().to_string()),
163            Err(e) => {
164                warn!(
165                    "Could not read ip_forward state (will not restore on cleanup): {}",
166                    e
167                );
168                None
169            }
170        };
171        rollback.prev_ip_forward = prev_ip_forward;
172        std::fs::write("/proc/sys/net/ipv4/ip_forward", "1").map_err(|e| {
173            NucleusError::NetworkError(format!("Failed to enable IP forwarding: {}", e))
174        })?;
175
176        // 9. Set up port forwarding rules
177        for pf in &config.port_forwards {
178            Self::setup_port_forward_for(&container_ip, pf)?;
179            rollback
180                .port_forwards
181                .push((container_ip.clone(), pf.clone()));
182        }
183
184        net_state = net_state.transition(NetworkState::Active)?;
185
186        info!(
187            "Bridge network configured: {} -> {} (IP: {})",
188            veth_host, veth_container, container_ip
189        );
190        let prev_ip_forward = rollback.prev_ip_forward.clone();
191        rollback.disarm();
192
193        Ok(Self {
194            config: config.clone(),
195            container_ip,
196            veth_host,
197            container_id: container_id.to_string(),
198            prev_ip_forward,
199            state: net_state,
200        })
201    }
202
203    /// Apply egress policy rules inside the container's network namespace.
204    ///
205    /// Uses iptables OUTPUT chain to restrict outbound connections.
206    /// Must be called after bridge setup while the container netns is reachable.
207    pub fn apply_egress_policy(&self, pid: u32, policy: &EgressPolicy) -> Result<()> {
208        egress::apply_egress_policy(pid, &self.config.dns, policy, false)
209    }
210
211    /// Clean up bridge networking
212    ///
213    /// State transition: Active -> Cleaned
214    pub fn cleanup(mut self) -> Result<()> {
215        self.state = self.state.transition(NetworkState::Cleaned)?;
216
217        // Release the IP allocation
218        Self::release_allocated_ip(&self.container_id);
219
220        // Remove port forwarding rules
221        for pf in &self.config.port_forwards {
222            if let Err(e) = self.cleanup_port_forward(pf) {
223                warn!("Failed to cleanup port forward: {}", e);
224            }
225        }
226
227        // Remove NAT rule
228        let _ = Self::run_cmd(
229            "iptables",
230            &[
231                "-t",
232                "nat",
233                "-D",
234                "POSTROUTING",
235                "-s",
236                &self.config.subnet,
237                "-j",
238                "MASQUERADE",
239            ],
240        );
241
242        // Delete veth pair (deleting one end removes both)
243        let _ = netlink::del_link(&self.veth_host);
244
245        // Restore previous ip_forward state if we changed it
246        if let Some(ref prev) = self.prev_ip_forward {
247            if prev == "0" {
248                if let Err(e) = std::fs::write("/proc/sys/net/ipv4/ip_forward", "0") {
249                    warn!("Failed to restore ip_forward to 0: {}", e);
250                } else {
251                    info!("Restored net.ipv4.ip_forward to 0");
252                }
253            }
254        }
255
256        info!("Bridge network cleaned up");
257        Ok(())
258    }
259
260    /// Best-effort cleanup for use in Drop. Performs the same teardown as
261    /// `cleanup()` but ignores all errors and skips the state transition
262    /// (which requires ownership).
263    fn cleanup_best_effort(&mut self) {
264        if self.state == NetworkState::Cleaned {
265            return;
266        }
267
268        Self::release_allocated_ip(&self.container_id);
269
270        for pf in &self.config.port_forwards {
271            let _ = self.cleanup_port_forward(pf);
272        }
273
274        let _ = Self::run_cmd(
275            "iptables",
276            &[
277                "-t",
278                "nat",
279                "-D",
280                "POSTROUTING",
281                "-s",
282                &self.config.subnet,
283                "-j",
284                "MASQUERADE",
285            ],
286        );
287
288        let _ = netlink::del_link(&self.veth_host);
289
290        if let Some(ref prev) = self.prev_ip_forward {
291            if prev == "0" {
292                let _ = std::fs::write("/proc/sys/net/ipv4/ip_forward", "0");
293            }
294        }
295
296        self.state = NetworkState::Cleaned;
297        debug!("Bridge network cleaned up (best-effort via drop)");
298    }
299
300    /// Detect and remove orphaned iptables rules from previous Nucleus runs.
301    ///
302    /// Checks for stale MASQUERADE rules referencing the nucleus subnet that
303    /// have no corresponding running container. Prevents gradual degradation
304    /// of network isolation from accumulated orphaned rules.
305    pub fn cleanup_orphaned_rules(subnet: &str) {
306        // List NAT rules and look for nucleus-related MASQUERADE entries
307        let output = match Command::new("iptables")
308            .args(["-t", "nat", "-L", "POSTROUTING", "-n"])
309            .output()
310        {
311            Ok(o) => o,
312            Err(e) => {
313                debug!("Cannot check iptables for orphaned rules: {}", e);
314                return;
315            }
316        };
317
318        let stdout = String::from_utf8_lossy(&output.stdout);
319        let mut orphaned_count = 0u32;
320        for line in stdout.lines() {
321            if line.contains("MASQUERADE") && line.contains(subnet) {
322                // Try to remove it; if it fails, it may be actively used
323                let _ = Self::run_cmd(
324                    "iptables",
325                    &[
326                        "-t",
327                        "nat",
328                        "-D",
329                        "POSTROUTING",
330                        "-s",
331                        subnet,
332                        "-j",
333                        "MASQUERADE",
334                    ],
335                );
336                orphaned_count += 1;
337            }
338        }
339
340        if orphaned_count > 0 {
341            info!(
342                "Cleaned up {} orphaned iptables MASQUERADE rule(s) for subnet {}",
343                orphaned_count, subnet
344            );
345        }
346    }
347
348    fn ensure_bridge_for(bridge_name: &str, subnet: &str) -> Result<()> {
349        if netlink::link_exists(bridge_name) {
350            return Ok(());
351        }
352
353        netlink::create_bridge(bridge_name)?;
354
355        let gateway = Self::gateway_from_subnet(subnet);
356        let gateway_addr: Ipv4Addr = gateway.parse().map_err(|e| {
357            NucleusError::NetworkError(format!("invalid bridge gateway '{}': {}", gateway, e))
358        })?;
359        netlink::add_addr(bridge_name, gateway_addr, Self::subnet_prefix(subnet))?;
360        netlink::set_link_up(bridge_name)?;
361
362        info!("Created bridge {}", bridge_name);
363        Ok(())
364    }
365
366    fn setup_port_forward_for(container_ip: &str, pf: &PortForward) -> Result<()> {
367        for chain in ["PREROUTING", "OUTPUT"] {
368            let args = Self::port_forward_rule_args("-A", chain, container_ip, pf);
369            Self::run_cmd_owned("iptables", &args)?;
370        }
371
372        let host_ip = pf
373            .host_ip
374            .map(|ip| ip.to_string())
375            .unwrap_or_else(|| "0.0.0.0".to_string());
376        info!(
377            "Port forward: {}:{} -> {}:{}/{}",
378            host_ip, pf.host_port, container_ip, pf.container_port, pf.protocol
379        );
380        Ok(())
381    }
382
383    fn cleanup_port_forward(&self, pf: &PortForward) -> Result<()> {
384        for chain in ["OUTPUT", "PREROUTING"] {
385            let args = Self::port_forward_rule_args("-D", chain, &self.container_ip, pf);
386            Self::run_cmd_owned("iptables", &args)?;
387        }
388        Ok(())
389    }
390
391    /// Allocate a container IP from the subnet using /dev/urandom.
392    ///
393    /// Checks both host-visible interfaces (via `ip addr`) and IPs assigned to
394    /// other Nucleus containers (via state files) to avoid duplicates. Container
395    /// IPs inside network namespaces are invisible to `ip addr show` on the host.
396    fn allocate_ip_with_reserved(
397        subnet: &str,
398        reserved: &std::collections::HashSet<String>,
399    ) -> Result<String> {
400        let base = subnet.split('/').next().unwrap_or("10.0.42.0");
401        let parts: Vec<&str> = base.split('.').collect();
402        if parts.len() != 4 {
403            return Ok("10.0.42.2".to_string());
404        }
405
406        // Use rejection sampling to avoid modulo bias.
407        // Range is 2..=254 (253 values). We reject random bytes >= 253 to
408        // ensure uniform distribution, then add 2 to shift into the valid range.
409        // Open /dev/urandom once and read all randomness in a single batch.
410        // 128 bytes gives ~125 valid candidates (byte < 253), making exhaustion
411        // in a populated subnet far less likely than the previous 32-byte buffer.
412        let mut rand_buf = [0u8; 128];
413        let mut urandom = Self::open_dev_urandom()?;
414        std::io::Read::read_exact(&mut urandom, &mut rand_buf).map_err(|e| {
415            NucleusError::NetworkError(format!("Failed to read /dev/urandom: {}", e))
416        })?;
417        for &byte in &rand_buf {
418            // Rejection sampling: discard values that would cause modulo bias
419            if byte >= 253 {
420                continue;
421            }
422            let offset = byte as u32 + 2;
423            let candidate = format!("{}.{}.{}.{}", parts[0], parts[1], parts[2], offset);
424            if reserved.contains(&candidate) {
425                continue;
426            }
427            if !Self::is_ip_in_use(&candidate)? {
428                // Lock is released when lock_file is dropped
429                return Ok(candidate);
430            }
431        }
432
433        Err(NucleusError::NetworkError(format!(
434            "Failed to allocate free IP in subnet {}",
435            subnet
436        )))
437    }
438
439    fn reserve_ip_in_dir(
440        alloc_dir: &std::path::Path,
441        container_id: &str,
442        subnet: &str,
443        requested_ip: Option<&str>,
444    ) -> Result<String> {
445        Self::ensure_alloc_dir(alloc_dir)?;
446        let lock_path = alloc_dir.join(".lock");
447        let lock_file = std::fs::OpenOptions::new()
448            .create(true)
449            .write(true)
450            .truncate(false)
451            .open(&lock_path)
452            .map_err(|e| {
453                NucleusError::NetworkError(format!("Failed to open IP alloc lock: {}", e))
454            })?;
455        // SAFETY: lock_file is a valid open fd. LOCK_EX is a blocking exclusive
456        // lock that is released when the fd is closed (end of scope).
457        let lock_ret = unsafe { libc::flock(lock_file.as_raw_fd(), libc::LOCK_EX) };
458        if lock_ret != 0 {
459            return Err(NucleusError::NetworkError(format!(
460                "Failed to acquire IP alloc lock: {}",
461                std::io::Error::last_os_error()
462            )));
463        }
464
465        let reserved = Self::collect_reserved_ips_in_dir(alloc_dir);
466        let ip = match requested_ip {
467            Some(ip) => {
468                if reserved.contains(ip) || Self::is_ip_in_use(ip)? {
469                    return Err(NucleusError::NetworkError(format!(
470                        "Requested container IP {} is already in use",
471                        ip
472                    )));
473                }
474                ip.to_string()
475            }
476            None => Self::allocate_ip_with_reserved(subnet, &reserved)?,
477        };
478
479        Self::record_allocated_ip_in_dir(alloc_dir, container_id, &ip)?;
480        Ok(ip)
481    }
482
483    /// Scan the Nucleus IP allocation directory for IPs already assigned.
484    fn collect_reserved_ips_in_dir(
485        alloc_dir: &std::path::Path,
486    ) -> std::collections::HashSet<String> {
487        let mut ips = std::collections::HashSet::new();
488        if let Ok(entries) = std::fs::read_dir(alloc_dir) {
489            for entry in entries.flatten() {
490                if let Some(name) = entry.file_name().to_str() {
491                    if name.ends_with(".ip") {
492                        if let Ok(ip) = std::fs::read_to_string(entry.path()) {
493                            let ip = ip.trim().to_string();
494                            if !ip.is_empty() {
495                                ips.insert(ip);
496                            }
497                        }
498                    }
499                }
500            }
501        }
502        ips
503    }
504
505    /// Persist the allocated IP for this container so other containers can see it.
506    fn record_allocated_ip_in_dir(
507        alloc_dir: &std::path::Path,
508        container_id: &str,
509        ip: &str,
510    ) -> Result<()> {
511        Self::ensure_alloc_dir(alloc_dir)?;
512        let path = alloc_dir.join(format!("{}.ip", container_id));
513        std::fs::write(&path, ip).map_err(|e| {
514            NucleusError::NetworkError(format!("Failed to record IP allocation: {}", e))
515        })?;
516        Ok(())
517    }
518
519    /// Remove the persisted IP allocation for a container.
520    fn release_allocated_ip(container_id: &str) {
521        let alloc_dir = Self::ip_alloc_dir();
522        Self::release_allocated_ip_in_dir(&alloc_dir, container_id);
523    }
524
525    fn release_allocated_ip_in_dir(alloc_dir: &std::path::Path, container_id: &str) {
526        let path = alloc_dir.join(format!("{}.ip", container_id));
527        let _ = std::fs::remove_file(path);
528    }
529
530    /// Create the IP allocation directory with restrictive permissions (0700)
531    /// and reject symlinked paths to prevent symlink attacks.
532    fn ensure_alloc_dir(alloc_dir: &std::path::Path) -> Result<()> {
533        // L11: Check for symlinks BEFORE creating directories to avoid TOCTOU.
534        // If the path already exists, verify it's not a symlink.
535        if alloc_dir.exists() {
536            if let Ok(meta) = std::fs::symlink_metadata(alloc_dir) {
537                if meta.file_type().is_symlink() {
538                    return Err(NucleusError::NetworkError(format!(
539                        "IP alloc dir {:?} is a symlink, refusing to use",
540                        alloc_dir
541                    )));
542                }
543            }
544        }
545        // Also check parent directory for symlinks
546        if let Some(parent) = alloc_dir.parent() {
547            if let Ok(meta) = std::fs::symlink_metadata(parent) {
548                if meta.file_type().is_symlink() {
549                    return Err(NucleusError::NetworkError(format!(
550                        "IP alloc dir parent {:?} is a symlink, refusing to use",
551                        parent
552                    )));
553                }
554            }
555        }
556
557        std::fs::create_dir_all(alloc_dir).map_err(|e| {
558            NucleusError::NetworkError(format!("Failed to create IP alloc dir: {}", e))
559        })?;
560
561        // Restrict permissions to owner-only atomically after creation
562        use std::os::unix::fs::PermissionsExt;
563        let perms = std::fs::Permissions::from_mode(0o700);
564        std::fs::set_permissions(alloc_dir, perms).map_err(|e| {
565            NucleusError::NetworkError(format!(
566                "Failed to set permissions on IP alloc dir {:?}: {}",
567                alloc_dir, e
568            ))
569        })?;
570
571        // Re-verify no symlink replacement after permissions were set
572        if let Ok(meta) = std::fs::symlink_metadata(alloc_dir) {
573            if meta.file_type().is_symlink() {
574                return Err(NucleusError::NetworkError(format!(
575                    "IP alloc dir {:?} was replaced with a symlink during setup",
576                    alloc_dir
577                )));
578            }
579        }
580        Ok(())
581    }
582
583    fn ip_alloc_dir() -> std::path::PathBuf {
584        if nix::unistd::Uid::effective().is_root() {
585            std::path::PathBuf::from("/var/run/nucleus/ip-alloc")
586        } else {
587            dirs::runtime_dir()
588                .map(|d| d.join("nucleus/ip-alloc"))
589                .or_else(|| dirs::data_local_dir().map(|d| d.join("nucleus/ip-alloc")))
590                .unwrap_or_else(|| {
591                    dirs::home_dir()
592                        .map(|h| h.join(".nucleus/ip-alloc"))
593                        .unwrap_or_else(|| std::path::PathBuf::from("/var/run/nucleus/ip-alloc"))
594                })
595        }
596    }
597
598    /// Read the start time (field 22) from /proc/<pid>/stat to detect PID recycling.
599    /// Returns 0 if the process does not exist or the field cannot be parsed.
600    fn read_pid_start_ticks(pid: u32) -> u64 {
601        let stat_path = format!("/proc/{}/stat", pid);
602        if let Ok(content) = std::fs::read_to_string(&stat_path) {
603            // Field 22 is starttime. The comm field (2) may contain spaces/parens,
604            // so find the last ')' and count fields from there.
605            if let Some(after_comm) = content.rfind(')') {
606                return content[after_comm + 2..]
607                    .split_whitespace()
608                    .nth(19) // field 22 is 20th after the ')' + state field
609                    .and_then(|s| s.parse().ok())
610                    .unwrap_or(0);
611            }
612        }
613        0
614    }
615
616    /// Get gateway IP from subnet (first usable address)
617    fn gateway_from_subnet(subnet: &str) -> String {
618        let base = subnet.split('/').next().unwrap_or("10.0.42.0");
619        let parts: Vec<&str> = base.split('.').collect();
620        if parts.len() == 4 {
621            format!("{}.{}.{}.1", parts[0], parts[1], parts[2])
622        } else {
623            "10.0.42.1".to_string()
624        }
625    }
626
627    fn subnet_prefix(subnet: &str) -> u8 {
628        subnet
629            .split_once('/')
630            .and_then(|(_, p)| p.parse::<u8>().ok())
631            .filter(|p| *p <= 32)
632            .unwrap_or(24)
633    }
634
635    /// Resolve a system binary to a validated absolute path.
636    ///
637    /// When running as root, searches known sysadmin paths and validates
638    /// ownership and permissions before use. When unprivileged, uses
639    /// `which`-style PATH resolution but still validates the result.
640    /// Returns an error if no valid binary is found.
641    pub(crate) fn resolve_bin(name: &str) -> Result<String> {
642        let search_dirs: &[&str] = match name {
643            "iptables" => &["/usr/sbin/iptables", "/sbin/iptables", "/usr/bin/iptables"],
644            "slirp4netns" => &[
645                "/usr/bin/slirp4netns",
646                "/bin/slirp4netns",
647                "/run/current-system/sw/bin/slirp4netns",
648            ],
649            _ => &[],
650        };
651
652        for path in search_dirs {
653            let p = std::path::Path::new(path);
654            if p.exists() {
655                Self::validate_network_binary(p, name)?;
656                return Ok(path.to_string());
657            }
658        }
659
660        // Fallback: resolve via PATH, but validate the result
661        if let Ok(output) = Command::new("which").arg(name).output() {
662            if output.status.success() {
663                let resolved = String::from_utf8_lossy(&output.stdout).trim().to_string();
664                if !resolved.is_empty() {
665                    let p = std::path::Path::new(&resolved);
666                    Self::validate_network_binary(p, name)?;
667                    return Ok(resolved);
668                }
669            }
670        }
671
672        Err(NucleusError::NetworkError(format!(
673            "Required binary '{}' not found or failed validation",
674            name
675        )))
676    }
677
678    /// Validate a network binary's ownership and permissions.
679    /// Rejects binaries that are group/world-writable or not owned by root/euid.
680    fn validate_network_binary(path: &std::path::Path, name: &str) -> Result<()> {
681        use std::os::unix::fs::MetadataExt;
682
683        let meta = std::fs::metadata(path)
684            .map_err(|e| NucleusError::NetworkError(format!("Cannot stat {}: {}", name, e)))?;
685        let mode = meta.mode();
686        if mode & 0o022 != 0 {
687            return Err(NucleusError::NetworkError(format!(
688                "Binary '{}' at {:?} is writable by group/others (mode {:o}), refusing to execute",
689                name, path, mode
690            )));
691        }
692        let owner = meta.uid();
693        let euid = nix::unistd::Uid::effective().as_raw();
694        if owner != 0 && owner != euid {
695            return Err(NucleusError::NetworkError(format!(
696                "Binary '{}' at {:?} owned by UID {} (expected root or euid {}), refusing to execute",
697                name, path, owner, euid
698            )));
699        }
700        Ok(())
701    }
702
703    fn run_cmd(program: &str, args: &[&str]) -> Result<()> {
704        let resolved = Self::resolve_bin(program)?;
705        let output = Command::new(&resolved).args(args).output().map_err(|e| {
706            NucleusError::NetworkError(format!("Failed to run {} {:?}: {}", resolved, args, e))
707        })?;
708
709        if !output.status.success() {
710            let stderr = String::from_utf8_lossy(&output.stderr);
711            return Err(NucleusError::NetworkError(format!(
712                "{} {:?} failed: {}",
713                program, args, stderr
714            )));
715        }
716
717        Ok(())
718    }
719
720    fn run_cmd_owned(program: &str, args: &[String]) -> Result<()> {
721        let refs: Vec<&str> = args.iter().map(String::as_str).collect();
722        Self::run_cmd(program, &refs)
723    }
724
725    fn port_forward_rule_args(
726        operation: &str,
727        chain: &str,
728        container_ip: &str,
729        pf: &PortForward,
730    ) -> Vec<String> {
731        let mut args = vec![
732            "-t".to_string(),
733            "nat".to_string(),
734            operation.to_string(),
735            chain.to_string(),
736            "-p".to_string(),
737            pf.protocol.as_str().to_string(),
738        ];
739
740        if chain == "OUTPUT" {
741            args.extend([
742                "-m".to_string(),
743                "addrtype".to_string(),
744                "--dst-type".to_string(),
745                "LOCAL".to_string(),
746            ]);
747        }
748
749        if let Some(host_ip) = pf.host_ip {
750            args.extend(["-d".to_string(), host_ip.to_string()]);
751        }
752
753        args.extend([
754            "--dport".to_string(),
755            pf.host_port.to_string(),
756            "-j".to_string(),
757            "DNAT".to_string(),
758            "--to-destination".to_string(),
759            format!("{}:{}", container_ip, pf.container_port),
760        ]);
761
762        args
763    }
764
765    fn is_ip_in_use(ip: &str) -> Result<bool> {
766        let addr: Ipv4Addr = ip
767            .parse()
768            .map_err(|e| NucleusError::NetworkError(format!("invalid IP '{}': {}", ip, e)))?;
769        netlink::is_addr_in_use(&addr)
770    }
771
772    /// Write resolv.conf inside container (for writable /etc, e.g. agent mode)
773    pub fn write_resolv_conf(root: &std::path::Path, dns: &[String]) -> Result<()> {
774        let resolv_path = root.join("etc/resolv.conf");
775        let content: String = dns
776            .iter()
777            .map(|server| format!("nameserver {}\n", server))
778            .collect();
779        std::fs::write(&resolv_path, content).map_err(|e| {
780            NucleusError::NetworkError(format!("Failed to write resolv.conf: {}", e))
781        })?;
782        Ok(())
783    }
784
785    /// Bind-mount a resolv.conf over a read-only /etc (for production rootfs mode).
786    ///
787    /// Creates a memfd-backed resolv.conf and bind-mounts it over
788    /// /etc/resolv.conf so it works even when the rootfs /etc is read-only.
789    /// The memfd is cleaned up when the container exits.
790    pub fn bind_mount_resolv_conf(root: &std::path::Path, dns: &[String]) -> Result<()> {
791        use nix::mount::{mount, MsFlags};
792
793        let content: String = dns
794            .iter()
795            .map(|server| format!("nameserver {}\n", server))
796            .collect();
797
798        // Create a memfd-backed file to avoid leaving staging files on disk
799        let memfd_name = std::ffi::CString::new("nucleus-resolv").map_err(|e| {
800            NucleusError::NetworkError(format!("Failed to create memfd name: {}", e))
801        })?;
802        // SAFETY: memfd_name is a valid NUL-terminated CString. memfd_create
803        // returns a new fd or -1 on error; we check for error below.
804        let raw_fd = unsafe { libc::memfd_create(memfd_name.as_ptr(), 0) };
805        if raw_fd < 0 {
806            // Fallback to staging file if memfd_create is unavailable
807            return Self::bind_mount_resolv_conf_staging(root, dns);
808        }
809        // SAFETY: raw_fd is a valid, newly-created fd from memfd_create.
810        // OwnedFd takes ownership and will close it exactly once on drop,
811        // preventing double-close on any error path.
812        let memfd = unsafe { std::os::fd::OwnedFd::from_raw_fd(raw_fd) };
813
814        // Write content to memfd using File I/O to handle partial writes correctly.
815        use std::io::Write as _;
816        let mut memfd_file = std::fs::File::from(memfd);
817        if memfd_file.write_all(content.as_bytes()).is_err() {
818            // memfd_file dropped here, closing the fd automatically
819            return Self::bind_mount_resolv_conf_staging(root, dns);
820        }
821        // Re-extract the OwnedFd for the proc path below
822        use std::os::fd::IntoRawFd;
823        let memfd = {
824            let raw = memfd_file.into_raw_fd();
825            // SAFETY: raw is the valid fd we just extracted from the File.
826            unsafe { std::os::fd::OwnedFd::from_raw_fd(raw) }
827        };
828
829        // Ensure the mount target exists
830        let target = root.join("etc/resolv.conf");
831        if !target.exists() {
832            let _ = std::fs::write(&target, "");
833        }
834
835        // Bind mount the memfd over the read-only resolv.conf
836        let memfd_path = format!("/proc/self/fd/{}", memfd.as_raw_fd());
837        mount(
838            Some(memfd_path.as_str()),
839            &target,
840            None::<&str>,
841            MsFlags::MS_BIND,
842            None::<&str>,
843        )
844        .map_err(|e| {
845            // memfd dropped here via the returned Err, closing the fd automatically
846            NucleusError::NetworkError(format!("Failed to bind mount resolv.conf: {}", e))
847        })?;
848
849        // memfd dropped here – the mount holds a kernel reference to the file,
850        // so it survives the fd close.
851
852        info!("Bind-mounted resolv.conf for bridge networking (rootfs mode, memfd)");
853        Ok(())
854    }
855
856    /// Fallback: bind-mount a staging resolv.conf file.
857    fn bind_mount_resolv_conf_staging(root: &std::path::Path, dns: &[String]) -> Result<()> {
858        use nix::mount::{mount, MsFlags};
859
860        let content: String = dns
861            .iter()
862            .map(|server| format!("nameserver {}\n", server))
863            .collect();
864
865        // Write to a staging file outside /etc
866        let staging = root.join("tmp/.resolv.conf.nucleus");
867        if let Some(parent) = staging.parent() {
868            std::fs::create_dir_all(parent).map_err(|e| {
869                NucleusError::NetworkError(format!(
870                    "Failed to create resolv.conf staging parent: {}",
871                    e
872                ))
873            })?;
874        }
875        std::fs::write(&staging, content).map_err(|e| {
876            NucleusError::NetworkError(format!("Failed to write staging resolv.conf: {}", e))
877        })?;
878
879        // Ensure the mount target exists
880        let target = root.join("etc/resolv.conf");
881        if !target.exists() {
882            let _ = std::fs::write(&target, "");
883        }
884
885        // Bind mount the staging file over the read-only resolv.conf
886        mount(
887            Some(staging.as_path()),
888            &target,
889            None::<&str>,
890            MsFlags::MS_BIND,
891            None::<&str>,
892        )
893        .map_err(|e| {
894            NucleusError::NetworkError(format!("Failed to bind mount resolv.conf: {}", e))
895        })?;
896
897        // The bind mount holds a reference to the inode, so we can safely
898        // unlink the staging path to avoid leaking DNS server info on disk.
899        if let Err(e) = std::fs::remove_file(&staging) {
900            warn!("Failed to remove staging resolv.conf {:?}: {}", staging, e);
901        }
902
903        info!("Bind-mounted resolv.conf for bridge networking (rootfs mode, staging)");
904        Ok(())
905    }
906}
907
908impl Drop for BridgeNetwork {
909    fn drop(&mut self) {
910        self.cleanup_best_effort();
911    }
912}
913
914struct SetupRollback {
915    veth_host: String,
916    subnet: String,
917    veth_created: bool,
918    nat_added: bool,
919    port_forwards: Vec<(String, PortForward)>,
920    prev_ip_forward: Option<String>,
921    reserved_ip: Option<(std::path::PathBuf, String)>,
922    armed: bool,
923}
924
925impl SetupRollback {
926    fn new(
927        veth_host: String,
928        subnet: String,
929        reserved_ip: Option<(std::path::PathBuf, String)>,
930    ) -> Self {
931        Self {
932            veth_host,
933            subnet,
934            veth_created: false,
935            nat_added: false,
936            port_forwards: Vec::new(),
937            prev_ip_forward: None,
938            reserved_ip,
939            armed: true,
940        }
941    }
942
943    fn disarm(&mut self) {
944        self.armed = false;
945    }
946}
947
948impl Drop for SetupRollback {
949    fn drop(&mut self) {
950        if !self.armed {
951            return;
952        }
953
954        for (container_ip, pf) in self.port_forwards.iter().rev() {
955            for chain in ["OUTPUT", "PREROUTING"] {
956                let args = BridgeNetwork::port_forward_rule_args("-D", chain, container_ip, pf);
957                if let Err(e) = BridgeNetwork::run_cmd_owned("iptables", &args) {
958                    warn!(
959                        "Rollback: failed to remove iptables {} rule for {}: {}",
960                        chain, container_ip, e
961                    );
962                }
963            }
964        }
965
966        if self.nat_added {
967            if let Err(e) = BridgeNetwork::run_cmd(
968                "iptables",
969                &[
970                    "-t",
971                    "nat",
972                    "-D",
973                    "POSTROUTING",
974                    "-s",
975                    &self.subnet,
976                    "-j",
977                    "MASQUERADE",
978                ],
979            ) {
980                warn!("Rollback: failed to remove NAT rule: {}", e);
981            }
982        }
983
984        if self.veth_created {
985            if let Err(e) = netlink::del_link(&self.veth_host) {
986                warn!("Rollback: failed to delete veth {}: {}", self.veth_host, e);
987            }
988        }
989
990        if let Some((alloc_dir, container_id)) = &self.reserved_ip {
991            BridgeNetwork::release_allocated_ip_in_dir(alloc_dir, container_id);
992        }
993    }
994}
995
#[cfg(test)]
mod tests {
    use super::*;

    /// H-5: accepted random bytes map into 2..=254; bytes >= 253 are rejected.
    #[test]
    fn test_ip_allocation_rejection_sampling_range() {
        // Every accepted byte (0..253) shifted by 2 must land in 2..=254,
        // so there is no modulo bias.
        for byte in 0u8..253 {
            let offset = u32::from(byte) + 2;
            assert!(
                (2..=254).contains(&offset),
                "offset {} out of range",
                offset
            );
        }
        // Values 253, 254, 255 must be rejected
        for byte in 253u8..=255 {
            assert!(byte >= 253);
        }
    }

    /// Requesting an address that another container already holds must fail.
    #[test]
    fn test_reserve_ip_blocks_duplicate_requested_address() {
        let alloc_dir = tempfile::tempdir().unwrap();
        BridgeNetwork::record_allocated_ip_in_dir(alloc_dir.path(), "one", "10.0.42.2").unwrap();

        let second_attempt = BridgeNetwork::reserve_ip_in_dir(
            alloc_dir.path(),
            "two",
            "10.0.42.0/24",
            Some("10.0.42.2"),
        );
        let failure = second_attempt.unwrap_err();

        assert!(
            failure.to_string().contains("already in use"),
            "second reservation of the same IP must fail"
        );
    }

    /// Dropping an armed rollback guard must release its IP reservation file.
    #[test]
    fn test_setup_rollback_releases_reserved_ip() {
        let alloc_dir = tempfile::tempdir().unwrap();
        BridgeNetwork::record_allocated_ip_in_dir(alloc_dir.path(), "rollback", "10.0.42.3")
            .unwrap();

        // A fresh guard is armed with no veth, NAT rule, or port forwards
        // recorded, so only the reservation release runs on drop.
        let reservation = (alloc_dir.path().to_path_buf(), "rollback".to_string());
        drop(SetupRollback::new(
            "veth-test".to_string(),
            "10.0.42.0/24".to_string(),
            Some(reservation),
        ));

        let reservation_file = alloc_dir.path().join("rollback.ip");
        assert!(
            !reservation_file.exists(),
            "rollback must release reserved IP files on setup failure"
        );
    }

    /// Both chains get a rule, and the OUTPUT rule matches local destinations.
    #[test]
    fn test_port_forward_rules_include_output_chain_for_local_host_clients() {
        let forward = PortForward {
            host_ip: None,
            host_port: 8080,
            container_port: 80,
            protocol: crate::network::config::Protocol::Tcp,
        };
        let container_ip = "10.0.42.2";

        let prerouting =
            BridgeNetwork::port_forward_rule_args("-A", "PREROUTING", container_ip, &forward);
        let output = BridgeNetwork::port_forward_rule_args("-A", "OUTPUT", container_ip, &forward);

        assert!(prerouting.iter().any(|arg| arg == "PREROUTING"));
        assert!(output.iter().any(|arg| arg == "OUTPUT"));

        let targets_local = output
            .windows(2)
            .any(|pair| pair[0] == "--dst-type" && pair[1] == "LOCAL");
        assert!(
            targets_local,
            "OUTPUT rule must target local-destination traffic"
        );
    }

    /// A configured host IP must appear as `-d <ip>` in the rule for each chain.
    #[test]
    fn test_port_forward_rules_include_host_ip_when_configured() {
        let forward = PortForward {
            host_ip: Some(std::net::Ipv4Addr::LOCALHOST),
            host_port: 4173,
            container_port: 4173,
            protocol: crate::network::config::Protocol::Tcp,
        };
        let container_ip = "10.0.42.2";

        let prerouting =
            BridgeNetwork::port_forward_rule_args("-A", "PREROUTING", container_ip, &forward);
        let output = BridgeNetwork::port_forward_rule_args("-A", "OUTPUT", container_ip, &forward);

        for args in [&prerouting, &output] {
            let restricted_to_host_ip = args
                .windows(2)
                .any(|pair| pair[0] == "-d" && pair[1] == "127.0.0.1");
            assert!(
                restricted_to_host_ip,
                "port forward must restrict DNAT rules to the configured host IP"
            );
        }
    }
}