Skip to main content

nucleus/network/
bridge.rs

1use super::{egress, netlink, netns};
2use crate::error::{NucleusError, Result, StateTransition};
3use crate::network::config::{BridgeConfig, EgressPolicy, PortForward};
4use crate::network::NetworkState;
5use serde::{Deserialize, Serialize};
6use std::fs::OpenOptions;
7use std::net::Ipv4Addr;
8use std::os::fd::FromRawFd;
9use std::os::unix::fs::FileTypeExt;
10use std::os::unix::fs::OpenOptionsExt;
11use std::os::unix::io::AsRawFd;
12use std::process::Command;
13use tracing::{debug, info, warn};
14
15/// Bridge network manager
16pub struct BridgeNetwork {
17    config: BridgeConfig,
18    container_ip: String,
19    veth_host: String,
20    container_id: String,
21    ip_forward_ref_acquired: bool,
22    state: NetworkState,
23}
24
/// procfs file holding the `net.ipv4.ip_forward` setting.
const IP_FORWARD_SYSCTL_PATH: &str = "/proc/sys/net/ipv4/ip_forward";
/// File name, inside the IP allocation directory, of the lock guarding the
/// `ip_forward` refcount.
const IP_FORWARD_LOCK_FILE: &str = ".ip_forward.lock";
/// File name, inside the IP allocation directory, of the persisted
/// `ip_forward` refcount state.
const IP_FORWARD_STATE_FILE: &str = ".ip_forward.state";
28
29#[derive(Debug, Clone, Serialize, Deserialize)]
30struct IpForwardRefState {
31    refcount: u64,
32    original_value: String,
33}
34
35impl BridgeNetwork {
36    fn open_dev_urandom() -> Result<std::fs::File> {
37        let file = OpenOptions::new()
38            .read(true)
39            .custom_flags(libc::O_NOFOLLOW | libc::O_CLOEXEC)
40            .open("/dev/urandom")
41            .map_err(|e| {
42                NucleusError::NetworkError(format!("Failed to open /dev/urandom: {}", e))
43            })?;
44
45        let metadata = file.metadata().map_err(|e| {
46            NucleusError::NetworkError(format!("Failed to stat /dev/urandom: {}", e))
47        })?;
48        if !metadata.file_type().is_char_device() {
49            return Err(NucleusError::NetworkError(
50                "/dev/urandom is not a character device".to_string(),
51            ));
52        }
53
54        Ok(file)
55    }
56
57    /// Set up bridge networking for a container
58    ///
59    /// Creates bridge, veth pair, assigns IPs, enables NAT.
60    /// Must be called from the parent process after fork (needs host netns).
61    ///
62    /// State transitions: Unconfigured -> Configuring -> Active
63    pub fn setup(pid: u32, config: &BridgeConfig) -> Result<Self> {
64        Self::setup_for(pid, config, &format!("{:x}", pid))
65    }
66
67    /// Set up bridge networking with an explicit container ID for IP tracking.
68    pub fn setup_with_id(pid: u32, config: &BridgeConfig, container_id: &str) -> Result<Self> {
69        Self::setup_for(pid, config, container_id)
70    }
71
72    fn setup_for(pid: u32, config: &BridgeConfig, container_id: &str) -> Result<Self> {
73        // Validate all network parameters before using them in shell commands
74        config.validate()?;
75
76        let mut net_state = NetworkState::Unconfigured;
77        net_state = net_state.transition(NetworkState::Configuring)?;
78
79        let alloc_dir = Self::ip_alloc_dir();
80        let container_ip = Self::reserve_ip_in_dir(
81            &alloc_dir,
82            container_id,
83            &config.subnet,
84            config.container_ip.as_deref(),
85        )?;
86        let prefix = Self::subnet_prefix(&config.subnet);
87
88        // Linux interface names max 15 chars; truncate if needed
89        let veth_host_full = format!("veth-{:x}", pid);
90        let veth_cont_full = format!("vethc-{:x}", pid);
91        let veth_host = veth_host_full[..veth_host_full.len().min(15)].to_string();
92        let veth_container = veth_cont_full[..veth_cont_full.len().min(15)].to_string();
93        let mut rollback = SetupRollback::new(
94            veth_host.clone(),
95            config.subnet.clone(),
96            Some((alloc_dir.clone(), container_id.to_string())),
97        );
98
99        // 1. Create bridge if it doesn't exist
100        Self::ensure_bridge_for(&config.bridge_name, &config.subnet)?;
101
102        // 2. Create veth pair
103        netlink::create_veth(&veth_host, &veth_container)?;
104        rollback.veth_created = true;
105
106        // 3. Attach host end to bridge
107        netlink::set_link_master(&veth_host, &config.bridge_name)?;
108        netlink::set_link_up(&veth_host)?;
109
110        // 4. Move container end to container's network namespace
111        netlink::set_link_netns(&veth_container, pid)?;
112
113        // 5. Configure container interface (inside container netns via setns).
114        // Capture the process start time from /proc to detect PID recycling
115        // between the caller passing the PID and our netns operations.
116        let start_ticks = Self::read_pid_start_ticks(pid);
117        if start_ticks == 0 {
118            drop(rollback);
119            return Err(NucleusError::NetworkError(format!(
120                "Cannot read start_ticks for PID {} – process may have exited",
121                pid
122            )));
123        }
124
125        let container_addr: Ipv4Addr = container_ip.parse().map_err(|e| {
126            NucleusError::NetworkError(format!("invalid container IP '{}': {}", container_ip, e))
127        })?;
128        {
129            let vc = veth_container.clone();
130            netns::in_netns(pid, move || {
131                netlink::add_addr(&vc, container_addr, prefix)?;
132                netlink::set_link_up(&vc)?;
133                netlink::set_link_up("lo")?;
134                Ok(())
135            })?;
136        }
137
138        // Verify PID was not recycled during netns operations
139        let current_ticks = Self::read_pid_start_ticks(pid);
140        if current_ticks != start_ticks {
141            drop(rollback);
142            return Err(NucleusError::NetworkError(format!(
143                "PID {} was recycled during network setup (start_ticks changed: {} -> {})",
144                pid, start_ticks, current_ticks
145            )));
146        }
147
148        // 6. Set default route in container
149        let gateway = Self::gateway_from_subnet(&config.subnet);
150        let gateway_addr: Ipv4Addr = gateway.parse().map_err(|e| {
151            NucleusError::NetworkError(format!("invalid gateway IP '{}': {}", gateway, e))
152        })?;
153        netns::in_netns(pid, move || netlink::add_default_route(gateway_addr))?;
154
155        // 7. Enable NAT (masquerade) on the host
156        Self::run_cmd(
157            "iptables",
158            &[
159                "-t",
160                "nat",
161                "-A",
162                "POSTROUTING",
163                "-s",
164                &config.subnet,
165                "-j",
166                "MASQUERADE",
167            ],
168        )?;
169        rollback.nat_added = true;
170
171        // 8. Enable IP forwarding using a cross-container refcount so one
172        // container cannot disable forwarding while another bridge is still active.
173        Self::acquire_ip_forward_ref()?;
174        rollback.ip_forward_ref_acquired = true;
175
176        // 9. Set up port forwarding rules
177        for pf in &config.port_forwards {
178            Self::setup_port_forward_for(&container_ip, pf)?;
179            rollback
180                .port_forwards
181                .push((container_ip.clone(), pf.clone()));
182        }
183
184        net_state = net_state.transition(NetworkState::Active)?;
185
186        info!(
187            "Bridge network configured: {} -> {} (IP: {})",
188            veth_host, veth_container, container_ip
189        );
190        let ip_forward_ref_acquired = rollback.ip_forward_ref_acquired;
191        rollback.disarm();
192
193        Ok(Self {
194            config: config.clone(),
195            container_ip,
196            veth_host,
197            container_id: container_id.to_string(),
198            ip_forward_ref_acquired,
199            state: net_state,
200        })
201    }
202
203    /// Apply egress policy rules inside the container's network namespace.
204    ///
205    /// Uses iptables OUTPUT chain to restrict outbound connections.
206    /// Must be called after bridge setup while the container netns is reachable.
207    pub fn apply_egress_policy(&self, pid: u32, policy: &EgressPolicy) -> Result<()> {
208        egress::apply_egress_policy(pid, &self.config.dns, policy, false)
209    }
210
211    /// Clean up bridge networking
212    ///
213    /// State transition: Active -> Cleaned
214    pub fn cleanup(mut self) -> Result<()> {
215        self.state = self.state.transition(NetworkState::Cleaned)?;
216
217        // Release the IP allocation
218        Self::release_allocated_ip(&self.container_id);
219
220        // Remove port forwarding rules
221        for pf in &self.config.port_forwards {
222            if let Err(e) = self.cleanup_port_forward(pf) {
223                warn!("Failed to cleanup port forward: {}", e);
224            }
225        }
226
227        // Remove NAT rule
228        let _ = Self::run_cmd(
229            "iptables",
230            &[
231                "-t",
232                "nat",
233                "-D",
234                "POSTROUTING",
235                "-s",
236                &self.config.subnet,
237                "-j",
238                "MASQUERADE",
239            ],
240        );
241
242        // Delete veth pair (deleting one end removes both)
243        let _ = netlink::del_link(&self.veth_host);
244
245        if self.ip_forward_ref_acquired {
246            if let Err(e) = Self::release_ip_forward_ref() {
247                warn!("Failed to release ip_forward refcount: {}", e);
248            } else {
249                self.ip_forward_ref_acquired = false;
250            }
251        }
252
253        info!("Bridge network cleaned up");
254        Ok(())
255    }
256
257    /// Best-effort cleanup for use in Drop. Performs the same teardown as
258    /// `cleanup()` but ignores all errors and skips the state transition
259    /// (which requires ownership).
260    fn cleanup_best_effort(&mut self) {
261        if self.state == NetworkState::Cleaned {
262            return;
263        }
264
265        Self::release_allocated_ip(&self.container_id);
266
267        for pf in &self.config.port_forwards {
268            let _ = self.cleanup_port_forward(pf);
269        }
270
271        let _ = Self::run_cmd(
272            "iptables",
273            &[
274                "-t",
275                "nat",
276                "-D",
277                "POSTROUTING",
278                "-s",
279                &self.config.subnet,
280                "-j",
281                "MASQUERADE",
282            ],
283        );
284
285        let _ = netlink::del_link(&self.veth_host);
286
287        if self.ip_forward_ref_acquired {
288            let _ = Self::release_ip_forward_ref();
289            self.ip_forward_ref_acquired = false;
290        }
291
292        self.state = NetworkState::Cleaned;
293        debug!("Bridge network cleaned up (best-effort via drop)");
294    }
295
296    /// Detect and remove orphaned iptables rules from previous Nucleus runs.
297    ///
298    /// Checks for stale MASQUERADE rules referencing the nucleus subnet that
299    /// have no corresponding running container. Prevents gradual degradation
300    /// of network isolation from accumulated orphaned rules.
301    pub fn cleanup_orphaned_rules(subnet: &str) {
302        // List NAT rules and look for nucleus-related MASQUERADE entries
303        let iptables = match Self::resolve_bin("iptables") {
304            Ok(path) => path,
305            Err(e) => {
306                debug!("Cannot resolve iptables for orphaned rule cleanup: {}", e);
307                return;
308            }
309        };
310        let output = match Command::new(&iptables)
311            .args(["-t", "nat", "-L", "POSTROUTING", "-n"])
312            .output()
313        {
314            Ok(o) => o,
315            Err(e) => {
316                debug!("Cannot check iptables for orphaned rules: {}", e);
317                return;
318            }
319        };
320
321        let stdout = String::from_utf8_lossy(&output.stdout);
322        let mut orphaned_count = 0u32;
323        for line in stdout.lines() {
324            if line.contains("MASQUERADE") && line.contains(subnet) {
325                // Try to remove it; if it fails, it may be actively used
326                let _ = Self::run_cmd(
327                    "iptables",
328                    &[
329                        "-t",
330                        "nat",
331                        "-D",
332                        "POSTROUTING",
333                        "-s",
334                        subnet,
335                        "-j",
336                        "MASQUERADE",
337                    ],
338                );
339                orphaned_count += 1;
340            }
341        }
342
343        if orphaned_count > 0 {
344            info!(
345                "Cleaned up {} orphaned iptables MASQUERADE rule(s) for subnet {}",
346                orphaned_count, subnet
347            );
348        }
349    }
350
351    fn ensure_bridge_for(bridge_name: &str, subnet: &str) -> Result<()> {
352        if netlink::link_exists(bridge_name) {
353            return Ok(());
354        }
355
356        netlink::create_bridge(bridge_name)?;
357
358        let gateway = Self::gateway_from_subnet(subnet);
359        let gateway_addr: Ipv4Addr = gateway.parse().map_err(|e| {
360            NucleusError::NetworkError(format!("invalid bridge gateway '{}': {}", gateway, e))
361        })?;
362        netlink::add_addr(bridge_name, gateway_addr, Self::subnet_prefix(subnet))?;
363        netlink::set_link_up(bridge_name)?;
364
365        info!("Created bridge {}", bridge_name);
366        Ok(())
367    }
368
369    fn setup_port_forward_for(container_ip: &str, pf: &PortForward) -> Result<()> {
370        for chain in ["PREROUTING", "OUTPUT"] {
371            let args = Self::port_forward_rule_args("-A", chain, container_ip, pf);
372            Self::run_cmd_owned("iptables", &args)?;
373        }
374
375        let host_ip = pf
376            .host_ip
377            .map(|ip| ip.to_string())
378            .unwrap_or_else(|| "0.0.0.0".to_string());
379        info!(
380            "Port forward: {}:{} -> {}:{}/{}",
381            host_ip, pf.host_port, container_ip, pf.container_port, pf.protocol
382        );
383        Ok(())
384    }
385
386    fn cleanup_port_forward(&self, pf: &PortForward) -> Result<()> {
387        for chain in ["OUTPUT", "PREROUTING"] {
388            let args = Self::port_forward_rule_args("-D", chain, &self.container_ip, pf);
389            Self::run_cmd_owned("iptables", &args)?;
390        }
391        Ok(())
392    }
393
394    /// Allocate a container IP from the subnet using /dev/urandom.
395    ///
396    /// Checks both host-visible interfaces (via `ip addr`) and IPs assigned to
397    /// other Nucleus containers (via state files) to avoid duplicates. Container
398    /// IPs inside network namespaces are invisible to `ip addr show` on the host.
399    fn allocate_ip_with_reserved(
400        subnet: &str,
401        reserved: &std::collections::HashSet<String>,
402    ) -> Result<String> {
403        let base = subnet.split('/').next().unwrap_or("10.0.42.0");
404        let parts: Vec<&str> = base.split('.').collect();
405        if parts.len() != 4 {
406            return Ok("10.0.42.2".to_string());
407        }
408
409        // Use rejection sampling to avoid modulo bias.
410        // Range is 2..=254 (253 values). We reject random bytes >= 253 to
411        // ensure uniform distribution, then add 2 to shift into the valid range.
412        // Open /dev/urandom once and read all randomness in a single batch.
413        // 128 bytes gives ~125 valid candidates (byte < 253), making exhaustion
414        // in a populated subnet far less likely than the previous 32-byte buffer.
415        let mut rand_buf = [0u8; 128];
416        let mut urandom = Self::open_dev_urandom()?;
417        std::io::Read::read_exact(&mut urandom, &mut rand_buf).map_err(|e| {
418            NucleusError::NetworkError(format!("Failed to read /dev/urandom: {}", e))
419        })?;
420        for &byte in &rand_buf {
421            // Rejection sampling: discard values that would cause modulo bias
422            if byte >= 253 {
423                continue;
424            }
425            let offset = byte as u32 + 2;
426            let candidate = format!("{}.{}.{}.{}", parts[0], parts[1], parts[2], offset);
427            if reserved.contains(&candidate) {
428                continue;
429            }
430            if !Self::is_ip_in_use(&candidate)? {
431                // Lock is released when lock_file is dropped
432                return Ok(candidate);
433            }
434        }
435
436        Err(NucleusError::NetworkError(format!(
437            "Failed to allocate free IP in subnet {}",
438            subnet
439        )))
440    }
441
442    fn reserve_ip_in_dir(
443        alloc_dir: &std::path::Path,
444        container_id: &str,
445        subnet: &str,
446        requested_ip: Option<&str>,
447    ) -> Result<String> {
448        Self::ensure_alloc_dir(alloc_dir)?;
449        let lock_path = alloc_dir.join(".lock");
450        let lock_file = std::fs::OpenOptions::new()
451            .create(true)
452            .write(true)
453            .truncate(false)
454            .open(&lock_path)
455            .map_err(|e| {
456                NucleusError::NetworkError(format!("Failed to open IP alloc lock: {}", e))
457            })?;
458        // SAFETY: lock_file is a valid open fd. LOCK_EX is a blocking exclusive
459        // lock that is released when the fd is closed (end of scope).
460        let lock_ret = unsafe { libc::flock(lock_file.as_raw_fd(), libc::LOCK_EX) };
461        if lock_ret != 0 {
462            return Err(NucleusError::NetworkError(format!(
463                "Failed to acquire IP alloc lock: {}",
464                std::io::Error::last_os_error()
465            )));
466        }
467
468        let reserved = Self::collect_reserved_ips_in_dir(alloc_dir);
469        let ip = match requested_ip {
470            Some(ip) => {
471                if reserved.contains(ip) || Self::is_ip_in_use(ip)? {
472                    return Err(NucleusError::NetworkError(format!(
473                        "Requested container IP {} is already in use",
474                        ip
475                    )));
476                }
477                ip.to_string()
478            }
479            None => Self::allocate_ip_with_reserved(subnet, &reserved)?,
480        };
481
482        Self::record_allocated_ip_in_dir(alloc_dir, container_id, &ip)?;
483        Ok(ip)
484    }
485
/// Gather the addresses recorded in the `*.ip` files of `alloc_dir`.
///
/// Each file's content is trimmed; blank content is skipped. An unreadable
/// directory, unreadable entries and non-UTF-8 file names contribute
/// nothing, so the result may be empty but the call never fails.
fn collect_reserved_ips_in_dir(
    alloc_dir: &std::path::Path,
) -> std::collections::HashSet<String> {
    let Ok(listing) = std::fs::read_dir(alloc_dir) else {
        return std::collections::HashSet::new();
    };

    listing
        .flatten()
        .filter(|entry| {
            entry
                .file_name()
                .to_str()
                .map_or(false, |name| name.ends_with(".ip"))
        })
        .filter_map(|entry| std::fs::read_to_string(entry.path()).ok())
        .map(|raw| raw.trim().to_string())
        .filter(|ip| !ip.is_empty())
        .collect()
}
507
508    /// Persist the allocated IP for this container so other containers can see it.
509    fn record_allocated_ip_in_dir(
510        alloc_dir: &std::path::Path,
511        container_id: &str,
512        ip: &str,
513    ) -> Result<()> {
514        Self::ensure_alloc_dir(alloc_dir)?;
515        let path = alloc_dir.join(format!("{}.ip", container_id));
516        std::fs::write(&path, ip).map_err(|e| {
517            NucleusError::NetworkError(format!("Failed to record IP allocation: {}", e))
518        })?;
519        Ok(())
520    }
521
522    /// Remove the persisted IP allocation for a container.
523    fn release_allocated_ip(container_id: &str) {
524        let alloc_dir = Self::ip_alloc_dir();
525        Self::release_allocated_ip_in_dir(&alloc_dir, container_id);
526    }
527
/// Delete the reservation file `<container_id>.ip` from `alloc_dir`.
///
/// Best-effort: a missing file or a failed removal is ignored. An ID that
/// would resolve to anything other than a single plain file name (path
/// separators, `..`, absolute paths) is ignored too, so the call can never
/// delete a file outside `alloc_dir`.
fn release_allocated_ip_in_dir(alloc_dir: &std::path::Path, container_id: &str) {
    let file_name = format!("{}.ip", container_id);
    let mut name_parts = std::path::Path::new(&file_name).components();
    let is_plain_name = matches!(
        (name_parts.next(), name_parts.next()),
        (Some(std::path::Component::Normal(_)), None)
    );
    if !is_plain_name {
        return;
    }
    let _ = std::fs::remove_file(alloc_dir.join(file_name));
}
532
533    /// Create the IP allocation directory with restrictive permissions (0700)
534    /// and reject symlinked paths to prevent symlink attacks.
535    fn ensure_alloc_dir(alloc_dir: &std::path::Path) -> Result<()> {
536        // L11: Check for symlinks BEFORE creating directories to avoid TOCTOU.
537        // If the path already exists, verify it's not a symlink.
538        if alloc_dir.exists() {
539            if let Ok(meta) = std::fs::symlink_metadata(alloc_dir) {
540                if meta.file_type().is_symlink() {
541                    return Err(NucleusError::NetworkError(format!(
542                        "IP alloc dir {:?} is a symlink, refusing to use",
543                        alloc_dir
544                    )));
545                }
546            }
547        }
548        // Also check parent directory for symlinks
549        if let Some(parent) = alloc_dir.parent() {
550            if let Ok(meta) = std::fs::symlink_metadata(parent) {
551                if meta.file_type().is_symlink() {
552                    return Err(NucleusError::NetworkError(format!(
553                        "IP alloc dir parent {:?} is a symlink, refusing to use",
554                        parent
555                    )));
556                }
557            }
558        }
559
560        std::fs::create_dir_all(alloc_dir).map_err(|e| {
561            NucleusError::NetworkError(format!("Failed to create IP alloc dir: {}", e))
562        })?;
563
564        // Restrict permissions to owner-only atomically after creation
565        use std::os::unix::fs::PermissionsExt;
566        let perms = std::fs::Permissions::from_mode(0o700);
567        std::fs::set_permissions(alloc_dir, perms).map_err(|e| {
568            NucleusError::NetworkError(format!(
569                "Failed to set permissions on IP alloc dir {:?}: {}",
570                alloc_dir, e
571            ))
572        })?;
573
574        // Re-verify no symlink replacement after permissions were set
575        if let Ok(meta) = std::fs::symlink_metadata(alloc_dir) {
576            if meta.file_type().is_symlink() {
577                return Err(NucleusError::NetworkError(format!(
578                    "IP alloc dir {:?} was replaced with a symlink during setup",
579                    alloc_dir
580                )));
581            }
582        }
583        Ok(())
584    }
585
586    fn ip_alloc_dir() -> std::path::PathBuf {
587        if nix::unistd::Uid::effective().is_root() {
588            std::path::PathBuf::from("/var/run/nucleus/ip-alloc")
589        } else {
590            dirs::runtime_dir()
591                .map(|d| d.join("nucleus/ip-alloc"))
592                .or_else(|| dirs::data_local_dir().map(|d| d.join("nucleus/ip-alloc")))
593                .unwrap_or_else(|| {
594                    dirs::home_dir()
595                        .map(|h| h.join(".nucleus/ip-alloc"))
596                        .unwrap_or_else(|| std::path::PathBuf::from("/var/run/nucleus/ip-alloc"))
597                })
598        }
599    }
600
601    fn ip_forward_lock_path(alloc_dir: &std::path::Path) -> std::path::PathBuf {
602        alloc_dir.join(IP_FORWARD_LOCK_FILE)
603    }
604
605    fn ip_forward_state_path(alloc_dir: &std::path::Path) -> std::path::PathBuf {
606        alloc_dir.join(IP_FORWARD_STATE_FILE)
607    }
608
609    fn read_ip_forward_value(sysctl_path: &std::path::Path) -> Result<String> {
610        std::fs::read_to_string(sysctl_path)
611            .map(|value| value.trim().to_string())
612            .map_err(|e| {
613                NucleusError::NetworkError(format!(
614                    "Failed to read {}: {}",
615                    sysctl_path.display(),
616                    e
617                ))
618            })
619    }
620
621    fn write_ip_forward_value(sysctl_path: &std::path::Path, value: &str) -> Result<()> {
622        std::fs::write(sysctl_path, value).map_err(|e| {
623            NucleusError::NetworkError(format!(
624                "Failed to write {} to {}: {}",
625                value,
626                sysctl_path.display(),
627                e
628            ))
629        })
630    }
631
632    fn load_ip_forward_state(alloc_dir: &std::path::Path) -> Result<Option<IpForwardRefState>> {
633        let state_path = Self::ip_forward_state_path(alloc_dir);
634        match std::fs::read_to_string(&state_path) {
635            Ok(content) => serde_json::from_str(&content).map(Some).map_err(|e| {
636                NucleusError::NetworkError(format!(
637                    "Failed to parse ip_forward refcount state {:?}: {}",
638                    state_path, e
639                ))
640            }),
641            Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(None),
642            Err(e) => Err(NucleusError::NetworkError(format!(
643                "Failed to read ip_forward refcount state {:?}: {}",
644                state_path, e
645            ))),
646        }
647    }
648
649    fn store_ip_forward_state(
650        alloc_dir: &std::path::Path,
651        state: &IpForwardRefState,
652    ) -> Result<()> {
653        let state_path = Self::ip_forward_state_path(alloc_dir);
654        let encoded = serde_json::to_vec(state).map_err(|e| {
655            NucleusError::NetworkError(format!(
656                "Failed to serialize ip_forward refcount state {:?}: {}",
657                state_path, e
658            ))
659        })?;
660        std::fs::write(&state_path, encoded).map_err(|e| {
661            NucleusError::NetworkError(format!(
662                "Failed to persist ip_forward refcount state {:?}: {}",
663                state_path, e
664            ))
665        })
666    }
667
668    fn remove_ip_forward_state(alloc_dir: &std::path::Path) -> Result<()> {
669        let state_path = Self::ip_forward_state_path(alloc_dir);
670        match std::fs::remove_file(&state_path) {
671            Ok(()) => Ok(()),
672            Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(()),
673            Err(e) => Err(NucleusError::NetworkError(format!(
674                "Failed to remove ip_forward refcount state {:?}: {}",
675                state_path, e
676            ))),
677        }
678    }
679
680    fn acquire_ip_forward_ref() -> Result<()> {
681        let alloc_dir = Self::ip_alloc_dir();
682        Self::acquire_ip_forward_ref_in_dir(
683            &alloc_dir,
684            std::path::Path::new(IP_FORWARD_SYSCTL_PATH),
685        )
686    }
687
688    fn acquire_ip_forward_ref_in_dir(
689        alloc_dir: &std::path::Path,
690        sysctl_path: &std::path::Path,
691    ) -> Result<()> {
692        Self::ensure_alloc_dir(alloc_dir)?;
693        let lock_path = Self::ip_forward_lock_path(alloc_dir);
694        let lock_file = std::fs::OpenOptions::new()
695            .create(true)
696            .write(true)
697            .truncate(false)
698            .open(&lock_path)
699            .map_err(|e| {
700                NucleusError::NetworkError(format!(
701                    "Failed to open ip_forward lock {:?}: {}",
702                    lock_path, e
703                ))
704            })?;
705        let lock_ret = unsafe { libc::flock(lock_file.as_raw_fd(), libc::LOCK_EX) };
706        if lock_ret != 0 {
707            return Err(NucleusError::NetworkError(format!(
708                "Failed to acquire ip_forward lock: {}",
709                std::io::Error::last_os_error()
710            )));
711        }
712
713        let mut state = match Self::load_ip_forward_state(alloc_dir)? {
714            Some(state) => state,
715            None => {
716                let original_value = Self::read_ip_forward_value(sysctl_path)?;
717                let state = IpForwardRefState {
718                    refcount: 0,
719                    original_value,
720                };
721                Self::store_ip_forward_state(alloc_dir, &state)?;
722                state
723            }
724        };
725
726        if state.refcount == 0 {
727            Self::write_ip_forward_value(sysctl_path, "1")?;
728        }
729        state.refcount = state.refcount.checked_add(1).ok_or_else(|| {
730            NucleusError::NetworkError("ip_forward refcount overflow".to_string())
731        })?;
732        Self::store_ip_forward_state(alloc_dir, &state)
733    }
734
735    fn release_ip_forward_ref() -> Result<()> {
736        let alloc_dir = Self::ip_alloc_dir();
737        Self::release_ip_forward_ref_in_dir(
738            &alloc_dir,
739            std::path::Path::new(IP_FORWARD_SYSCTL_PATH),
740        )
741    }
742
743    fn release_ip_forward_ref_in_dir(
744        alloc_dir: &std::path::Path,
745        sysctl_path: &std::path::Path,
746    ) -> Result<()> {
747        if !alloc_dir.exists() {
748            return Ok(());
749        }
750        let lock_path = Self::ip_forward_lock_path(alloc_dir);
751        let lock_file = std::fs::OpenOptions::new()
752            .create(true)
753            .write(true)
754            .truncate(false)
755            .open(&lock_path)
756            .map_err(|e| {
757                NucleusError::NetworkError(format!(
758                    "Failed to open ip_forward lock {:?}: {}",
759                    lock_path, e
760                ))
761            })?;
762        let lock_ret = unsafe { libc::flock(lock_file.as_raw_fd(), libc::LOCK_EX) };
763        if lock_ret != 0 {
764            return Err(NucleusError::NetworkError(format!(
765                "Failed to acquire ip_forward lock: {}",
766                std::io::Error::last_os_error()
767            )));
768        }
769
770        let Some(mut state) = Self::load_ip_forward_state(alloc_dir)? else {
771            return Ok(());
772        };
773
774        if state.refcount == 0 {
775            return Self::remove_ip_forward_state(alloc_dir);
776        }
777
778        state.refcount -= 1;
779        if state.refcount == 0 {
780            Self::write_ip_forward_value(sysctl_path, &state.original_value)?;
781            Self::remove_ip_forward_state(alloc_dir)?;
782            info!("Restored net.ipv4.ip_forward to {}", state.original_value);
783        } else {
784            Self::store_ip_forward_state(alloc_dir, &state)?;
785        }
786
787        Ok(())
788    }
789
/// Read the start time (field 22) from `/proc/<pid>/stat`, used to detect
/// PID recycling.
///
/// Returns 0 if the file cannot be read or the field cannot be parsed.
/// Malformed or truncated content also yields 0 rather than a panic.
fn read_pid_start_ticks(pid: u32) -> u64 {
    let stat_path = format!("/proc/{}/stat", pid);
    let Ok(content) = std::fs::read_to_string(&stat_path) else {
        return 0;
    };

    // The comm field (2) may itself contain spaces or parentheses, so fields
    // are counted from the last ')'. The checked slice keeps a file that
    // ends right at that ')' from panicking.
    content
        .rfind(')')
        .and_then(|close| content.get(close + 1..))
        // Field 3 (state) has index 0 after the ')', so field 22 has index 19.
        .and_then(|tail| tail.split_whitespace().nth(19))
        .and_then(|field| field.parse().ok())
        .unwrap_or(0)
}
807
/// Derive the gateway address for `subnet`: the subnet's base address with
/// its last octet replaced by `1`.
///
/// Anything after the first `/` is ignored. When the base does not consist
/// of exactly four dot-separated parts, `10.0.42.1` is returned. The parts
/// are not checked for being numeric.
fn gateway_from_subnet(subnet: &str) -> String {
    let network = match subnet.find('/') {
        Some(slash) => &subnet[..slash],
        None => subnet,
    };
    let octets: Vec<&str> = network.split('.').collect();
    match octets.as_slice() {
        [first, second, third, _] => format!("{}.{}.{}.1", first, second, third),
        _ => "10.0.42.1".to_string(),
    }
}
818
819    fn subnet_prefix(subnet: &str) -> u8 {
820        subnet
821            .split_once('/')
822            .and_then(|(_, p)| p.parse::<u8>().ok())
823            .filter(|p| *p <= 32)
824            .unwrap_or(24)
825    }
826
827    /// Resolve a system binary to a validated absolute path.
828    ///
829    /// Searches known sysadmin paths first while validating ownership and
830    /// permissions before use. When running as root, the process PATH is never
831    /// consulted because bridge networking invokes privileged helpers and must
832    /// not trust an attacker-influenced environment. This also avoids depending
833    /// on a separate `which` binary in service managers that set a narrow PATH.
834    /// Returns an error if no valid binary is found.
835    pub(crate) fn resolve_bin(name: &str) -> Result<String> {
836        let search_dirs: &[&str] = match name {
837            "iptables" => &[
838                "/usr/sbin/iptables",
839                "/sbin/iptables",
840                "/usr/bin/iptables",
841                "/run/current-system/sw/bin/iptables",
842            ],
843            "slirp4netns" => &[
844                "/usr/bin/slirp4netns",
845                "/bin/slirp4netns",
846                "/run/current-system/sw/bin/slirp4netns",
847            ],
848            _ => &[],
849        };
850
851        for path in search_dirs {
852            let p = std::path::Path::new(path);
853            if p.exists() {
854                Self::validate_network_binary(p, name)?;
855                let resolved = std::fs::canonicalize(p).map_err(|e| {
856                    NucleusError::NetworkError(format!(
857                        "Cannot canonicalize {} at {:?}: {}",
858                        name, p, e
859                    ))
860                })?;
861                return Ok(resolved.to_string_lossy().into_owned());
862            }
863        }
864
865        if nix::unistd::Uid::effective().is_root() {
866            return Err(NucleusError::NetworkError(format!(
867                "Required binary '{}' not found in trusted system paths",
868                name
869            )));
870        }
871
872        if let Some(path_var) = std::env::var_os("PATH") {
873            for dir in std::env::split_paths(&path_var) {
874                let candidate = dir.join(name);
875                if candidate.exists() {
876                    Self::validate_network_binary(&candidate, name)?;
877                    let resolved = std::fs::canonicalize(&candidate).map_err(|e| {
878                        NucleusError::NetworkError(format!(
879                            "Cannot canonicalize {} at {:?}: {}",
880                            name, candidate, e
881                        ))
882                    })?;
883                    return Ok(resolved.to_string_lossy().into_owned());
884                }
885            }
886        }
887
888        Err(NucleusError::NetworkError(format!(
889            "Required binary '{}' not found or failed validation",
890            name
891        )))
892    }
893
894    /// Validate a network binary's ownership and permissions.
895    /// Rejects binaries that are group/world-writable or not owned by root/euid,
896    /// except for immutable Nix store artifacts.
897    fn validate_network_binary(path: &std::path::Path, name: &str) -> Result<()> {
898        use std::os::unix::fs::MetadataExt;
899
900        let resolved = std::fs::canonicalize(path).unwrap_or_else(|_| path.to_path_buf());
901        let meta = std::fs::metadata(&resolved)
902            .map_err(|e| NucleusError::NetworkError(format!("Cannot stat {}: {}", name, e)))?;
903        let mode = meta.mode();
904        if mode & 0o111 == 0 {
905            return Err(NucleusError::NetworkError(format!(
906                "Binary '{}' at {:?} is not executable, refusing to execute",
907                name, resolved
908            )));
909        }
910        if mode & 0o022 != 0 {
911            return Err(NucleusError::NetworkError(format!(
912                "Binary '{}' at {:?} is writable by group/others (mode {:o}), refusing to execute",
913                name, resolved, mode
914            )));
915        }
916        let owner = meta.uid();
917        let euid = nix::unistd::Uid::effective().as_raw();
918        if owner != 0 && owner != euid && !Self::is_trusted_store_network_binary(&resolved, mode) {
919            return Err(NucleusError::NetworkError(format!(
920                "Binary '{}' at {:?} owned by UID {} (expected root or euid {}), refusing to execute",
921                name, resolved, owner, euid
922            )));
923        }
924        Ok(())
925    }
926
927    fn is_trusted_store_network_binary(path: &std::path::Path, mode: u32) -> bool {
928        use std::os::unix::fs::MetadataExt;
929        if !path.starts_with("/nix/store") {
930            return false;
931        }
932        if mode & 0o200 != 0 {
933            return false;
934        }
935        if let Some(parent) = path.parent() {
936            if let Ok(parent_meta) = std::fs::metadata(parent) {
937                return parent_meta.mode() & 0o222 == 0;
938            }
939        }
940        false
941    }
942
943    fn run_cmd(program: &str, args: &[&str]) -> Result<()> {
944        let resolved = Self::resolve_bin(program)?;
945        let output = Command::new(&resolved).args(args).output().map_err(|e| {
946            NucleusError::NetworkError(format!("Failed to run {} {:?}: {}", resolved, args, e))
947        })?;
948
949        if !output.status.success() {
950            let stderr = String::from_utf8_lossy(&output.stderr);
951            return Err(NucleusError::NetworkError(format!(
952                "{} {:?} failed: {}",
953                program, args, stderr
954            )));
955        }
956
957        Ok(())
958    }
959
960    fn run_cmd_owned(program: &str, args: &[String]) -> Result<()> {
961        let refs: Vec<&str> = args.iter().map(String::as_str).collect();
962        Self::run_cmd(program, &refs)
963    }
964
965    fn port_forward_rule_args(
966        operation: &str,
967        chain: &str,
968        container_ip: &str,
969        pf: &PortForward,
970    ) -> Vec<String> {
971        let mut args = vec![
972            "-t".to_string(),
973            "nat".to_string(),
974            operation.to_string(),
975            chain.to_string(),
976            "-p".to_string(),
977            pf.protocol.as_str().to_string(),
978        ];
979
980        if chain == "OUTPUT" {
981            args.extend([
982                "-m".to_string(),
983                "addrtype".to_string(),
984                "--dst-type".to_string(),
985                "LOCAL".to_string(),
986            ]);
987        }
988
989        if let Some(host_ip) = pf.host_ip {
990            args.extend(["-d".to_string(), host_ip.to_string()]);
991        }
992
993        args.extend([
994            "--dport".to_string(),
995            pf.host_port.to_string(),
996            "-j".to_string(),
997            "DNAT".to_string(),
998            "--to-destination".to_string(),
999            format!("{}:{}", container_ip, pf.container_port),
1000        ]);
1001
1002        args
1003    }
1004
1005    fn is_ip_in_use(ip: &str) -> Result<bool> {
1006        let addr: Ipv4Addr = ip
1007            .parse()
1008            .map_err(|e| NucleusError::NetworkError(format!("invalid IP '{}': {}", ip, e)))?;
1009        netlink::is_addr_in_use(&addr)
1010    }
1011
1012    /// Write resolv.conf inside container (for writable /etc, e.g. agent mode)
1013    pub fn write_resolv_conf(root: &std::path::Path, dns: &[String]) -> Result<()> {
1014        let resolv_path = root.join("etc/resolv.conf");
1015        let content: String = dns
1016            .iter()
1017            .map(|server| format!("nameserver {}\n", server))
1018            .collect();
1019        std::fs::write(&resolv_path, content).map_err(|e| {
1020            NucleusError::NetworkError(format!("Failed to write resolv.conf: {}", e))
1021        })?;
1022        Ok(())
1023    }
1024
1025    /// Bind-mount a resolv.conf over a read-only /etc (for production rootfs mode).
1026    ///
1027    /// Creates a memfd-backed resolv.conf and bind-mounts it over
1028    /// /etc/resolv.conf so it works even when the rootfs /etc is read-only.
1029    /// The memfd is cleaned up when the container exits.
1030    pub fn bind_mount_resolv_conf(root: &std::path::Path, dns: &[String]) -> Result<()> {
1031        use nix::mount::{mount, MsFlags};
1032
1033        let content: String = dns
1034            .iter()
1035            .map(|server| format!("nameserver {}\n", server))
1036            .collect();
1037
1038        // Create a memfd-backed file to avoid leaving staging files on disk
1039        let memfd_name = std::ffi::CString::new("nucleus-resolv").map_err(|e| {
1040            NucleusError::NetworkError(format!("Failed to create memfd name: {}", e))
1041        })?;
1042        // SAFETY: memfd_name is a valid NUL-terminated CString. memfd_create
1043        // returns a new fd or -1 on error; we check for error below.
1044        let raw_fd = unsafe { libc::memfd_create(memfd_name.as_ptr(), 0) };
1045        if raw_fd < 0 {
1046            // Fallback to staging file if memfd_create is unavailable
1047            return Self::bind_mount_resolv_conf_staging(root, dns);
1048        }
1049        // SAFETY: raw_fd is a valid, newly-created fd from memfd_create.
1050        // OwnedFd takes ownership and will close it exactly once on drop,
1051        // preventing double-close on any error path.
1052        let memfd = unsafe { std::os::fd::OwnedFd::from_raw_fd(raw_fd) };
1053
1054        // Write content to memfd using File I/O to handle partial writes correctly.
1055        use std::io::Write as _;
1056        let mut memfd_file = std::fs::File::from(memfd);
1057        if memfd_file.write_all(content.as_bytes()).is_err() {
1058            // memfd_file dropped here, closing the fd automatically
1059            return Self::bind_mount_resolv_conf_staging(root, dns);
1060        }
1061        // Re-extract the OwnedFd for the proc path below
1062        use std::os::fd::IntoRawFd;
1063        let memfd = {
1064            let raw = memfd_file.into_raw_fd();
1065            // SAFETY: raw is the valid fd we just extracted from the File.
1066            unsafe { std::os::fd::OwnedFd::from_raw_fd(raw) }
1067        };
1068
1069        // Ensure the mount target exists
1070        let target = root.join("etc/resolv.conf");
1071        if !target.exists() {
1072            let _ = std::fs::write(&target, "");
1073        }
1074
1075        // Bind mount the memfd over the read-only resolv.conf
1076        let memfd_path = format!("/proc/self/fd/{}", memfd.as_raw_fd());
1077        if let Err(e) = mount(
1078            Some(memfd_path.as_str()),
1079            &target,
1080            None::<&str>,
1081            MsFlags::MS_BIND,
1082            None::<&str>,
1083        ) {
1084            return Err(NucleusError::NetworkError(format!(
1085                "Failed to bind mount memfd-backed resolv.conf: {}",
1086                e
1087            )));
1088        }
1089        Self::harden_resolv_conf_bind(&target)?;
1090
1091        // memfd dropped here – the mount holds a kernel reference to the file,
1092        // so it survives the fd close.
1093
1094        info!("Bind-mounted resolv.conf for bridge networking (rootfs mode, memfd)");
1095        Ok(())
1096    }
1097
1098    /// Fallback: bind-mount a staging resolv.conf file.
1099    fn bind_mount_resolv_conf_staging(root: &std::path::Path, dns: &[String]) -> Result<()> {
1100        use nix::mount::{mount, MsFlags};
1101
1102        let content: String = dns
1103            .iter()
1104            .map(|server| format!("nameserver {}\n", server))
1105            .collect();
1106
1107        let staging = Self::create_resolv_conf_staging_file(root, content.as_bytes())?;
1108
1109        // Ensure the mount target exists
1110        let target = root.join("etc/resolv.conf");
1111        if !target.exists() {
1112            let _ = std::fs::write(&target, "");
1113        }
1114
1115        // Bind mount the staging file over the read-only resolv.conf
1116        mount(
1117            Some(staging.path()),
1118            &target,
1119            None::<&str>,
1120            MsFlags::MS_BIND,
1121            None::<&str>,
1122        )
1123        .map_err(|e| {
1124            NucleusError::NetworkError(format!("Failed to bind mount resolv.conf: {}", e))
1125        })?;
1126        Self::harden_resolv_conf_bind(&target)?;
1127
1128        // The bind mount holds a reference to the inode. Dropping the temporary
1129        // file unlinks the staging path so DNS server info is not left on disk.
1130
1131        info!("Bind-mounted resolv.conf for bridge networking (rootfs mode, staging)");
1132        Ok(())
1133    }
1134
1135    fn create_resolv_conf_staging_file(
1136        root: &std::path::Path,
1137        content: &[u8],
1138    ) -> Result<tempfile::NamedTempFile> {
1139        use std::io::Write as _;
1140
1141        let staging_dir = root.parent().ok_or_else(|| {
1142            NucleusError::NetworkError(format!(
1143                "Container root {:?} has no parent for resolv.conf staging",
1144                root
1145            ))
1146        })?;
1147
1148        let mut staging = tempfile::Builder::new()
1149            .prefix(".resolv.conf.nucleus.")
1150            .tempfile_in(staging_dir)
1151            .map_err(|e| {
1152                NucleusError::NetworkError(format!(
1153                    "Failed to create temporary resolv.conf staging file under {:?}: {}",
1154                    staging_dir, e
1155                ))
1156            })?;
1157
1158        staging.as_file_mut().write_all(content).map_err(|e| {
1159            NucleusError::NetworkError(format!(
1160                "Failed to write temporary resolv.conf staging file {:?}: {}",
1161                staging.path(),
1162                e
1163            ))
1164        })?;
1165
1166        Ok(staging)
1167    }
1168
1169    fn harden_resolv_conf_bind(target: &std::path::Path) -> Result<()> {
1170        use nix::mount::{mount, MsFlags};
1171
1172        mount(
1173            None::<&str>,
1174            target,
1175            None::<&str>,
1176            MsFlags::MS_REMOUNT
1177                | MsFlags::MS_BIND
1178                | MsFlags::MS_RDONLY
1179                | MsFlags::MS_NOSUID
1180                | MsFlags::MS_NODEV
1181                | MsFlags::MS_NOEXEC,
1182            None::<&str>,
1183        )
1184        .map_err(|e| {
1185            NucleusError::NetworkError(format!(
1186                "Failed to remount resolv.conf with hardened flags at {:?}: {}",
1187                target, e
1188            ))
1189        })
1190    }
1191}
1192
1193impl Drop for BridgeNetwork {
1194    fn drop(&mut self) {
1195        self.cleanup_best_effort();
1196    }
1197}
1198
/// Guard that undoes partially-completed bridge setup when dropped.
///
/// While `armed`, dropping the guard removes the host state that the flags
/// and lists below record as created. Call `disarm` once setup has succeeded
/// so the drop becomes a no-op.
struct SetupRollback {
    /// Host-side veth interface name; the link is deleted on rollback when
    /// `veth_created` is set.
    veth_host: String,
    /// Subnet used as the `-s` match of the MASQUERADE rule removed on
    /// rollback when `nat_added` is set.
    subnet: String,
    /// Whether the veth link was created and must be deleted on rollback.
    veth_created: bool,
    /// Whether the POSTROUTING MASQUERADE rule was added and must be removed.
    nat_added: bool,
    /// `(container_ip, forward)` pairs whose DNAT rules are removed, in
    /// reverse order, from both the OUTPUT and PREROUTING chains.
    port_forwards: Vec<(String, PortForward)>,
    /// Whether an ip_forward reference was taken and must be released.
    ip_forward_ref_acquired: bool,
    /// `(allocation directory, container id)` of the IP reservation to
    /// release on rollback, if any.
    reserved_ip: Option<(std::path::PathBuf, String)>,
    /// When false, dropping the guard does nothing.
    armed: bool,
}
1209
1210impl SetupRollback {
1211    fn new(
1212        veth_host: String,
1213        subnet: String,
1214        reserved_ip: Option<(std::path::PathBuf, String)>,
1215    ) -> Self {
1216        Self {
1217            veth_host,
1218            subnet,
1219            veth_created: false,
1220            nat_added: false,
1221            port_forwards: Vec::new(),
1222            ip_forward_ref_acquired: false,
1223            reserved_ip,
1224            armed: true,
1225        }
1226    }
1227
1228    fn disarm(&mut self) {
1229        self.armed = false;
1230    }
1231}
1232
1233impl Drop for SetupRollback {
1234    fn drop(&mut self) {
1235        if !self.armed {
1236            return;
1237        }
1238
1239        for (container_ip, pf) in self.port_forwards.iter().rev() {
1240            for chain in ["OUTPUT", "PREROUTING"] {
1241                let args = BridgeNetwork::port_forward_rule_args("-D", chain, container_ip, pf);
1242                if let Err(e) = BridgeNetwork::run_cmd_owned("iptables", &args) {
1243                    warn!(
1244                        "Rollback: failed to remove iptables {} rule for {}: {}",
1245                        chain, container_ip, e
1246                    );
1247                }
1248            }
1249        }
1250
1251        if self.nat_added {
1252            if let Err(e) = BridgeNetwork::run_cmd(
1253                "iptables",
1254                &[
1255                    "-t",
1256                    "nat",
1257                    "-D",
1258                    "POSTROUTING",
1259                    "-s",
1260                    &self.subnet,
1261                    "-j",
1262                    "MASQUERADE",
1263                ],
1264            ) {
1265                warn!("Rollback: failed to remove NAT rule: {}", e);
1266            }
1267        }
1268
1269        if self.veth_created {
1270            if let Err(e) = netlink::del_link(&self.veth_host) {
1271                warn!("Rollback: failed to delete veth {}: {}", self.veth_host, e);
1272            }
1273        }
1274
1275        if self.ip_forward_ref_acquired {
1276            if let Err(e) = BridgeNetwork::release_ip_forward_ref() {
1277                warn!("Rollback: failed to release ip_forward refcount: {}", e);
1278            }
1279        }
1280
1281        if let Some((alloc_dir, container_id)) = &self.reserved_ip {
1282            BridgeNetwork::release_allocated_ip_in_dir(alloc_dir, container_id);
1283        }
1284    }
1285}
1286
#[cfg(test)]
mod tests {
    use super::*;

    /// H-5: accepted random bytes (0..253) map onto host offsets 2..=254, and
    /// bytes 253..=255 lie in the rejected range (no modulo bias).
    #[test]
    fn test_ip_allocation_rejection_sampling_range() {
        let valid_offsets = 2u32..=254;
        (0u8..253)
            .map(|byte| u32::from(byte) + 2)
            .for_each(|offset| {
                assert!(
                    valid_offsets.contains(&offset),
                    "offset {} out of range",
                    offset
                );
            });

        // Values 253, 254, 255 must be rejected
        assert!([253u8, 254, 255].iter().all(|&byte| byte >= 253));
    }

    /// A requested IP that is already recorded for another container must be refused.
    #[test]
    fn test_reserve_ip_blocks_duplicate_requested_address() {
        let alloc_dir = tempfile::tempdir().unwrap();
        let taken_ip = "10.0.42.2";
        BridgeNetwork::record_allocated_ip_in_dir(alloc_dir.path(), "one", taken_ip).unwrap();

        let outcome = BridgeNetwork::reserve_ip_in_dir(
            alloc_dir.path(),
            "two",
            "10.0.42.0/24",
            Some(taken_ip),
        );
        let message = outcome.unwrap_err().to_string();
        assert!(
            message.contains("already in use"),
            "second reservation of the same IP must fail"
        );
    }

    /// Dropping an armed rollback guard releases the IP reservation file.
    #[test]
    fn test_setup_rollback_releases_reserved_ip() {
        let alloc_dir = tempfile::tempdir().unwrap();
        BridgeNetwork::record_allocated_ip_in_dir(alloc_dir.path(), "rollback", "10.0.42.3")
            .unwrap();

        // `new` yields an armed guard with no veth, NAT, port-forward or
        // ip_forward state recorded, so only the reservation is rolled back.
        let guard = SetupRollback::new(
            "veth-test".to_string(),
            "10.0.42.0/24".to_string(),
            Some((alloc_dir.path().to_path_buf(), "rollback".to_string())),
        );
        drop(guard);

        let reservation = alloc_dir.path().join("rollback.ip");
        assert!(
            !reservation.exists(),
            "rollback must release reserved IP files on setup failure"
        );
    }

    /// The staging file holds the given content and lives outside the container root.
    #[test]
    fn test_resolv_conf_staging_file_is_outside_container_root() {
        const EXPECTED: &str = "nameserver 203.0.113.53\n";

        let sandbox = tempfile::tempdir().unwrap();
        let root = sandbox.path().join("root");
        std::fs::create_dir_all(root.join("tmp")).unwrap();

        let staged =
            BridgeNetwork::create_resolv_conf_staging_file(&root, EXPECTED.as_bytes()).unwrap();

        let written = std::fs::read_to_string(staged.path()).unwrap();
        assert_eq!(written, EXPECTED);
        assert!(
            !staged.path().starts_with(&root),
            "staging file must not be created under the container root"
        );
    }

    /// A symlink planted in the container's /tmp must not be written through
    /// when resolv.conf setup fails.
    #[test]
    fn test_bind_mount_resolv_conf_does_not_overwrite_root_tmp_symlink_on_failure() {
        use std::os::unix::fs::symlink;

        const SENTINEL: &str = "ORIGINAL_HOST_CONTENT\n";

        let sandbox = tempfile::tempdir().unwrap();
        let root = sandbox.path().join("root");
        std::fs::create_dir_all(root.join("tmp")).unwrap();

        let victim = sandbox.path().join("host_victim_file");
        std::fs::write(&victim, SENTINEL).unwrap();
        symlink(&victim, root.join("tmp/.resolv.conf.nucleus")).unwrap();

        let outcome = BridgeNetwork::bind_mount_resolv_conf(&root, &["203.0.113.53".to_string()]);

        assert!(
            outcome.is_err(),
            "test root intentionally lacks /etc so bind mount setup must fail"
        );
        assert_eq!(
            std::fs::read_to_string(&victim).unwrap(),
            SENTINEL,
            "resolv.conf setup must not write through attacker-controlled /tmp symlinks"
        );
    }

    /// ip_forward stays enabled until the last reference is released, then
    /// the original value is restored and the state file removed.
    #[test]
    fn test_ip_forward_refcount_restores_original_only_after_last_release() {
        let dir = tempfile::tempdir().unwrap();
        let sysctl = dir.path().join("ip_forward");
        std::fs::write(&sysctl, "0").unwrap();
        let current = || std::fs::read_to_string(&sysctl).unwrap();

        for _ in 0..2 {
            BridgeNetwork::acquire_ip_forward_ref_in_dir(dir.path(), &sysctl).unwrap();
        }
        assert_eq!(current(), "1");

        BridgeNetwork::release_ip_forward_ref_in_dir(dir.path(), &sysctl).unwrap();
        assert_eq!(current(), "1");

        BridgeNetwork::release_ip_forward_ref_in_dir(dir.path(), &sysctl).unwrap();
        assert_eq!(current(), "0");
        assert!(
            !dir.path().join(IP_FORWARD_STATE_FILE).exists(),
            "state file must be removed when the last bridge releases ip_forward"
        );
    }

    /// Rules are generated for both chains, and the OUTPUT rule is limited to
    /// locally-destined traffic.
    #[test]
    fn test_port_forward_rules_include_output_chain_for_local_host_clients() {
        let pf = PortForward {
            host_ip: None,
            host_port: 8080,
            container_port: 80,
            protocol: crate::network::config::Protocol::Tcp,
        };
        let build = |chain: &str| BridgeNetwork::port_forward_rule_args("-A", chain, "10.0.42.2", &pf);

        let prerouting = build("PREROUTING");
        let output = build("OUTPUT");

        assert!(prerouting.contains(&"PREROUTING".to_string()));
        assert!(output.contains(&"OUTPUT".to_string()));

        let targets_local = output
            .windows(2)
            .any(|pair| pair[0] == "--dst-type" && pair[1] == "LOCAL");
        assert!(
            targets_local,
            "OUTPUT rule must target local-destination traffic"
        );
    }

    /// A configured host IP appears as a `-d` match in the rules for both chains.
    #[test]
    fn test_port_forward_rules_include_host_ip_when_configured() {
        let pf = PortForward {
            host_ip: Some(std::net::Ipv4Addr::new(127, 0, 0, 1)),
            host_port: 4173,
            container_port: 4173,
            protocol: crate::network::config::Protocol::Tcp,
        };

        for chain in ["PREROUTING", "OUTPUT"] {
            let rule = BridgeNetwork::port_forward_rule_args("-A", chain, "10.0.42.2", &pf);
            let restricted = rule
                .windows(2)
                .any(|pair| pair[0] == "-d" && pair[1] == "127.0.0.1");
            assert!(
                restricted,
                "port forward must restrict DNAT rules to the configured host IP"
            );
        }
    }
}