Skip to main content

nucleus/network/
bridge.rs

1use super::{egress, netlink, netns};
2use crate::error::{NucleusError, Result, StateTransition};
3use crate::network::config::{BridgeConfig, EgressPolicy, PortForward};
4use crate::network::NetworkState;
5use serde::{Deserialize, Serialize};
6use std::fs::OpenOptions;
7use std::net::Ipv4Addr;
8use std::os::fd::FromRawFd;
9use std::os::unix::fs::FileTypeExt;
10use std::os::unix::fs::OpenOptionsExt;
11use std::os::unix::io::AsRawFd;
12use std::process::Command;
13use tracing::{debug, info, warn};
14
/// Bridge network manager
///
/// Produced by the `setup*` constructors once bridge networking for one
/// container has been configured; it keeps the values teardown needs.
pub struct BridgeNetwork {
    /// Bridge configuration, cloned from the caller's config during setup.
    config: BridgeConfig,
    /// IPv4 address reserved for the container, in textual form.
    container_ip: String,
    /// Name of the host-side veth interface; cleanup deletes this link.
    veth_host: String,
    /// Identifier used to name the persisted IP allocation record.
    container_id: String,
    /// True while this instance holds a reference on the shared
    /// `ip_forward` refcount and still has to release it.
    ip_forward_ref_acquired: bool,
    /// Lifecycle state of this network.
    state: NetworkState,
}
24
/// procfs file that is read and written to control IPv4 forwarding.
const IP_FORWARD_SYSCTL_PATH: &str = "/proc/sys/net/ipv4/ip_forward";
/// File name, inside the IP allocation directory, of the lock taken while
/// the forwarding refcount is updated.
const IP_FORWARD_LOCK_FILE: &str = ".ip_forward.lock";
/// File name, inside the IP allocation directory, of the JSON-encoded
/// forwarding refcount state.
const IP_FORWARD_STATE_FILE: &str = ".ip_forward.state";
28
/// Persisted bookkeeping for the shared `ip_forward` setting.
///
/// Serialized as JSON into the state file in the IP allocation directory.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct IpForwardRefState {
    /// Number of outstanding references on forwarding being enabled.
    refcount: u64,
    /// Sysctl value read when the state was first created; written back
    /// when the refcount drops to zero.
    original_value: String,
}
34
35impl BridgeNetwork {
36    fn open_dev_urandom() -> Result<std::fs::File> {
37        let file = OpenOptions::new()
38            .read(true)
39            .custom_flags(libc::O_NOFOLLOW | libc::O_CLOEXEC)
40            .open("/dev/urandom")
41            .map_err(|e| {
42                NucleusError::NetworkError(format!("Failed to open /dev/urandom: {}", e))
43            })?;
44
45        let metadata = file.metadata().map_err(|e| {
46            NucleusError::NetworkError(format!("Failed to stat /dev/urandom: {}", e))
47        })?;
48        if !metadata.file_type().is_char_device() {
49            return Err(NucleusError::NetworkError(
50                "/dev/urandom is not a character device".to_string(),
51            ));
52        }
53
54        Ok(file)
55    }
56
    /// Set up bridge networking for a container
    ///
    /// Creates bridge, veth pair, assigns IPs, enables NAT.
    /// Must be called from the parent process after fork (needs host netns).
    ///
    /// The container ID used for IP-allocation tracking is `pid` rendered as
    /// lower-case hexadecimal; use `setup_with_id` to supply an explicit ID.
    ///
    /// State transitions: Unconfigured -> Configuring -> Active
    pub fn setup(pid: u32, config: &BridgeConfig) -> Result<Self> {
        Self::setup_for(pid, config, &format!("{:x}", pid))
    }
66
    /// Set up bridge networking with an explicit container ID for IP tracking.
    ///
    /// Behaves like `setup`, except that `container_id` (instead of the
    /// hex-formatted PID) names the persisted IP allocation record.
    pub fn setup_with_id(pid: u32, config: &BridgeConfig, container_id: &str) -> Result<Self> {
        Self::setup_for(pid, config, container_id)
    }
71
    /// Shared implementation behind `setup` and `setup_with_id`.
    ///
    /// Steps, in order: validate the config, reserve a container IP, create
    /// the bridge if needed, create the veth pair and attach the host end,
    /// move the container end into the netns of `pid`, configure the
    /// container interface and default route, add the MASQUERADE rule, take a
    /// reference on IP forwarding, and install the configured port forwards.
    ///
    /// A `SetupRollback` value is armed as soon as the veth names are known
    /// and disarmed only on success.
    /// NOTE(review): `SetupRollback` is defined outside this view; presumably
    /// dropping it while armed undoes the recorded steps — confirm.
    ///
    /// # Errors
    /// Returns the first error from any step, including when the start time
    /// of `pid` cannot be read or changes during setup.
    fn setup_for(pid: u32, config: &BridgeConfig, container_id: &str) -> Result<Self> {
        // Validate all network parameters before using them in shell commands
        config.validate()?;

        let mut net_state = NetworkState::Unconfigured;
        net_state = net_state.transition(NetworkState::Configuring)?;

        // Reserve the container IP (explicitly requested or auto-allocated)
        // and persist the reservation under `container_id`.
        let alloc_dir = Self::ip_alloc_dir();
        let container_ip = Self::reserve_ip_in_dir(
            &alloc_dir,
            container_id,
            &config.subnet,
            config.container_ip.as_deref(),
        )?;
        let prefix = Self::subnet_prefix(&config.subnet);

        // Linux interface names max 15 chars; truncate if needed
        let veth_host_full = format!("veth-{:x}", pid);
        let veth_cont_full = format!("vethc-{:x}", pid);
        let veth_host = veth_host_full[..veth_host_full.len().min(15)].to_string();
        let veth_container = veth_cont_full[..veth_cont_full.len().min(15)].to_string();
        // From here on the rollback value records which steps have completed.
        let mut rollback = SetupRollback::new(
            veth_host.clone(),
            config.subnet.clone(),
            Some((alloc_dir.clone(), container_id.to_string())),
        );

        // 1. Create bridge if it doesn't exist
        Self::ensure_bridge_for(&config.bridge_name, &config.subnet)?;

        // 2. Create veth pair
        netlink::create_veth(&veth_host, &veth_container)?;
        rollback.veth_created = true;

        // 3. Attach host end to bridge
        netlink::set_link_master(&veth_host, &config.bridge_name)?;
        netlink::set_link_up(&veth_host)?;

        // 4. Move container end to container's network namespace
        netlink::set_link_netns(&veth_container, pid)?;

        // 5. Configure container interface (inside container netns via setns).
        // Capture the process start time from /proc to detect PID recycling
        // between the caller passing the PID and our netns operations.
        let start_ticks = Self::read_pid_start_ticks(pid);
        if start_ticks == 0 {
            // 0 is the "could not read" sentinel of read_pid_start_ticks.
            drop(rollback);
            return Err(NucleusError::NetworkError(format!(
                "Cannot read start_ticks for PID {} – process may have exited",
                pid
            )));
        }

        let container_addr: Ipv4Addr = container_ip.parse().map_err(|e| {
            NucleusError::NetworkError(format!("invalid container IP '{}': {}", container_ip, e))
        })?;
        {
            // The closure is `move`, so it gets its own copy of the name.
            let vc = veth_container.clone();
            netns::in_netns(pid, move || {
                netlink::add_addr(&vc, container_addr, prefix)?;
                netlink::set_link_up(&vc)?;
                netlink::set_link_up("lo")?;
                Ok(())
            })?;
        }

        // Verify PID was not recycled during netns operations
        let current_ticks = Self::read_pid_start_ticks(pid);
        if current_ticks != start_ticks {
            drop(rollback);
            return Err(NucleusError::NetworkError(format!(
                "PID {} was recycled during network setup (start_ticks changed: {} -> {})",
                pid, start_ticks, current_ticks
            )));
        }

        // 6. Set default route in container
        let gateway = Self::gateway_from_subnet(&config.subnet);
        let gateway_addr: Ipv4Addr = gateway.parse().map_err(|e| {
            NucleusError::NetworkError(format!("invalid gateway IP '{}': {}", gateway, e))
        })?;
        netns::in_netns(pid, move || netlink::add_default_route(gateway_addr))?;

        // 7. Enable NAT (masquerade) on the host
        Self::run_cmd(
            "iptables",
            &[
                "-t",
                "nat",
                "-A",
                "POSTROUTING",
                "-s",
                &config.subnet,
                "-j",
                "MASQUERADE",
            ],
        )?;
        rollback.nat_added = true;

        // 8. Enable IP forwarding using a cross-container refcount so one
        // container cannot disable forwarding while another bridge is still active.
        Self::acquire_ip_forward_ref()?;
        rollback.ip_forward_ref_acquired = true;

        // 9. Set up port forwarding rules
        for pf in &config.port_forwards {
            Self::setup_port_forward_for(&container_ip, pf)?;
            // Recorded only after the forward was installed successfully.
            rollback
                .port_forwards
                .push((container_ip.clone(), pf.clone()));
        }

        net_state = net_state.transition(NetworkState::Active)?;

        info!(
            "Bridge network configured: {} -> {} (IP: {})",
            veth_host, veth_container, container_ip
        );
        // Read the flag before disarming; the returned value takes over the
        // responsibility for releasing the forwarding reference.
        let ip_forward_ref_acquired = rollback.ip_forward_ref_acquired;
        rollback.disarm();

        Ok(Self {
            config: config.clone(),
            container_ip,
            veth_host,
            container_id: container_id.to_string(),
            ip_forward_ref_acquired,
            state: net_state,
        })
    }
202
    /// Apply egress policy rules inside the container's network namespace.
    ///
    /// Uses iptables OUTPUT chain to restrict outbound connections.
    /// Must be called after bridge setup while the container netns is reachable.
    ///
    /// Delegates to `egress::apply_egress_policy`, passing this bridge's
    /// `config.dns` value along with `pid` and `policy`.
    /// NOTE(review): the meaning of the trailing `false` argument is defined
    /// by `egress::apply_egress_policy` and is not visible here — confirm.
    pub fn apply_egress_policy(&self, pid: u32, policy: &EgressPolicy) -> Result<()> {
        egress::apply_egress_policy(pid, &self.config.dns, policy, false)
    }
210
211    /// Clean up bridge networking
212    ///
213    /// State transition: Active -> Cleaned
214    pub fn cleanup(mut self) -> Result<()> {
215        self.state = self.state.transition(NetworkState::Cleaned)?;
216
217        // Release the IP allocation
218        Self::release_allocated_ip(&self.container_id);
219
220        // Remove port forwarding rules
221        for pf in &self.config.port_forwards {
222            if let Err(e) = self.cleanup_port_forward(pf) {
223                warn!("Failed to cleanup port forward: {}", e);
224            }
225        }
226
227        // Remove NAT rule
228        let _ = Self::run_cmd(
229            "iptables",
230            &[
231                "-t",
232                "nat",
233                "-D",
234                "POSTROUTING",
235                "-s",
236                &self.config.subnet,
237                "-j",
238                "MASQUERADE",
239            ],
240        );
241
242        // Delete veth pair (deleting one end removes both)
243        let _ = netlink::del_link(&self.veth_host);
244
245        if self.ip_forward_ref_acquired {
246            if let Err(e) = Self::release_ip_forward_ref() {
247                warn!("Failed to release ip_forward refcount: {}", e);
248            } else {
249                self.ip_forward_ref_acquired = false;
250            }
251        }
252
253        info!("Bridge network cleaned up");
254        Ok(())
255    }
256
257    /// Best-effort cleanup for use in Drop. Performs the same teardown as
258    /// `cleanup()` but ignores all errors and skips the state transition
259    /// (which requires ownership).
260    fn cleanup_best_effort(&mut self) {
261        if self.state == NetworkState::Cleaned {
262            return;
263        }
264
265        Self::release_allocated_ip(&self.container_id);
266
267        for pf in &self.config.port_forwards {
268            let _ = self.cleanup_port_forward(pf);
269        }
270
271        let _ = Self::run_cmd(
272            "iptables",
273            &[
274                "-t",
275                "nat",
276                "-D",
277                "POSTROUTING",
278                "-s",
279                &self.config.subnet,
280                "-j",
281                "MASQUERADE",
282            ],
283        );
284
285        let _ = netlink::del_link(&self.veth_host);
286
287        if self.ip_forward_ref_acquired {
288            let _ = Self::release_ip_forward_ref();
289            self.ip_forward_ref_acquired = false;
290        }
291
292        self.state = NetworkState::Cleaned;
293        debug!("Bridge network cleaned up (best-effort via drop)");
294    }
295
296    /// Detect and remove orphaned iptables rules from previous Nucleus runs.
297    ///
298    /// Checks for stale MASQUERADE rules referencing the nucleus subnet that
299    /// have no corresponding running container. Prevents gradual degradation
300    /// of network isolation from accumulated orphaned rules.
301    pub fn cleanup_orphaned_rules(subnet: &str) {
302        // List NAT rules and look for nucleus-related MASQUERADE entries
303        let output = match Command::new("iptables")
304            .args(["-t", "nat", "-L", "POSTROUTING", "-n"])
305            .output()
306        {
307            Ok(o) => o,
308            Err(e) => {
309                debug!("Cannot check iptables for orphaned rules: {}", e);
310                return;
311            }
312        };
313
314        let stdout = String::from_utf8_lossy(&output.stdout);
315        let mut orphaned_count = 0u32;
316        for line in stdout.lines() {
317            if line.contains("MASQUERADE") && line.contains(subnet) {
318                // Try to remove it; if it fails, it may be actively used
319                let _ = Self::run_cmd(
320                    "iptables",
321                    &[
322                        "-t",
323                        "nat",
324                        "-D",
325                        "POSTROUTING",
326                        "-s",
327                        subnet,
328                        "-j",
329                        "MASQUERADE",
330                    ],
331                );
332                orphaned_count += 1;
333            }
334        }
335
336        if orphaned_count > 0 {
337            info!(
338                "Cleaned up {} orphaned iptables MASQUERADE rule(s) for subnet {}",
339                orphaned_count, subnet
340            );
341        }
342    }
343
344    fn ensure_bridge_for(bridge_name: &str, subnet: &str) -> Result<()> {
345        if netlink::link_exists(bridge_name) {
346            return Ok(());
347        }
348
349        netlink::create_bridge(bridge_name)?;
350
351        let gateway = Self::gateway_from_subnet(subnet);
352        let gateway_addr: Ipv4Addr = gateway.parse().map_err(|e| {
353            NucleusError::NetworkError(format!("invalid bridge gateway '{}': {}", gateway, e))
354        })?;
355        netlink::add_addr(bridge_name, gateway_addr, Self::subnet_prefix(subnet))?;
356        netlink::set_link_up(bridge_name)?;
357
358        info!("Created bridge {}", bridge_name);
359        Ok(())
360    }
361
362    fn setup_port_forward_for(container_ip: &str, pf: &PortForward) -> Result<()> {
363        for chain in ["PREROUTING", "OUTPUT"] {
364            let args = Self::port_forward_rule_args("-A", chain, container_ip, pf);
365            Self::run_cmd_owned("iptables", &args)?;
366        }
367
368        let host_ip = pf
369            .host_ip
370            .map(|ip| ip.to_string())
371            .unwrap_or_else(|| "0.0.0.0".to_string());
372        info!(
373            "Port forward: {}:{} -> {}:{}/{}",
374            host_ip, pf.host_port, container_ip, pf.container_port, pf.protocol
375        );
376        Ok(())
377    }
378
379    fn cleanup_port_forward(&self, pf: &PortForward) -> Result<()> {
380        for chain in ["OUTPUT", "PREROUTING"] {
381            let args = Self::port_forward_rule_args("-D", chain, &self.container_ip, pf);
382            Self::run_cmd_owned("iptables", &args)?;
383        }
384        Ok(())
385    }
386
387    /// Allocate a container IP from the subnet using /dev/urandom.
388    ///
389    /// Checks both host-visible interfaces (via `ip addr`) and IPs assigned to
390    /// other Nucleus containers (via state files) to avoid duplicates. Container
391    /// IPs inside network namespaces are invisible to `ip addr show` on the host.
392    fn allocate_ip_with_reserved(
393        subnet: &str,
394        reserved: &std::collections::HashSet<String>,
395    ) -> Result<String> {
396        let base = subnet.split('/').next().unwrap_or("10.0.42.0");
397        let parts: Vec<&str> = base.split('.').collect();
398        if parts.len() != 4 {
399            return Ok("10.0.42.2".to_string());
400        }
401
402        // Use rejection sampling to avoid modulo bias.
403        // Range is 2..=254 (253 values). We reject random bytes >= 253 to
404        // ensure uniform distribution, then add 2 to shift into the valid range.
405        // Open /dev/urandom once and read all randomness in a single batch.
406        // 128 bytes gives ~125 valid candidates (byte < 253), making exhaustion
407        // in a populated subnet far less likely than the previous 32-byte buffer.
408        let mut rand_buf = [0u8; 128];
409        let mut urandom = Self::open_dev_urandom()?;
410        std::io::Read::read_exact(&mut urandom, &mut rand_buf).map_err(|e| {
411            NucleusError::NetworkError(format!("Failed to read /dev/urandom: {}", e))
412        })?;
413        for &byte in &rand_buf {
414            // Rejection sampling: discard values that would cause modulo bias
415            if byte >= 253 {
416                continue;
417            }
418            let offset = byte as u32 + 2;
419            let candidate = format!("{}.{}.{}.{}", parts[0], parts[1], parts[2], offset);
420            if reserved.contains(&candidate) {
421                continue;
422            }
423            if !Self::is_ip_in_use(&candidate)? {
424                // Lock is released when lock_file is dropped
425                return Ok(candidate);
426            }
427        }
428
429        Err(NucleusError::NetworkError(format!(
430            "Failed to allocate free IP in subnet {}",
431            subnet
432        )))
433    }
434
435    fn reserve_ip_in_dir(
436        alloc_dir: &std::path::Path,
437        container_id: &str,
438        subnet: &str,
439        requested_ip: Option<&str>,
440    ) -> Result<String> {
441        Self::ensure_alloc_dir(alloc_dir)?;
442        let lock_path = alloc_dir.join(".lock");
443        let lock_file = std::fs::OpenOptions::new()
444            .create(true)
445            .write(true)
446            .truncate(false)
447            .open(&lock_path)
448            .map_err(|e| {
449                NucleusError::NetworkError(format!("Failed to open IP alloc lock: {}", e))
450            })?;
451        // SAFETY: lock_file is a valid open fd. LOCK_EX is a blocking exclusive
452        // lock that is released when the fd is closed (end of scope).
453        let lock_ret = unsafe { libc::flock(lock_file.as_raw_fd(), libc::LOCK_EX) };
454        if lock_ret != 0 {
455            return Err(NucleusError::NetworkError(format!(
456                "Failed to acquire IP alloc lock: {}",
457                std::io::Error::last_os_error()
458            )));
459        }
460
461        let reserved = Self::collect_reserved_ips_in_dir(alloc_dir);
462        let ip = match requested_ip {
463            Some(ip) => {
464                if reserved.contains(ip) || Self::is_ip_in_use(ip)? {
465                    return Err(NucleusError::NetworkError(format!(
466                        "Requested container IP {} is already in use",
467                        ip
468                    )));
469                }
470                ip.to_string()
471            }
472            None => Self::allocate_ip_with_reserved(subnet, &reserved)?,
473        };
474
475        Self::record_allocated_ip_in_dir(alloc_dir, container_id, &ip)?;
476        Ok(ip)
477    }
478
479    /// Scan the Nucleus IP allocation directory for IPs already assigned.
480    fn collect_reserved_ips_in_dir(
481        alloc_dir: &std::path::Path,
482    ) -> std::collections::HashSet<String> {
483        let mut ips = std::collections::HashSet::new();
484        if let Ok(entries) = std::fs::read_dir(alloc_dir) {
485            for entry in entries.flatten() {
486                if let Some(name) = entry.file_name().to_str() {
487                    if name.ends_with(".ip") {
488                        if let Ok(ip) = std::fs::read_to_string(entry.path()) {
489                            let ip = ip.trim().to_string();
490                            if !ip.is_empty() {
491                                ips.insert(ip);
492                            }
493                        }
494                    }
495                }
496            }
497        }
498        ips
499    }
500
501    /// Persist the allocated IP for this container so other containers can see it.
502    fn record_allocated_ip_in_dir(
503        alloc_dir: &std::path::Path,
504        container_id: &str,
505        ip: &str,
506    ) -> Result<()> {
507        Self::ensure_alloc_dir(alloc_dir)?;
508        let path = alloc_dir.join(format!("{}.ip", container_id));
509        std::fs::write(&path, ip).map_err(|e| {
510            NucleusError::NetworkError(format!("Failed to record IP allocation: {}", e))
511        })?;
512        Ok(())
513    }
514
    /// Remove the persisted IP allocation for a container.
    ///
    /// Resolves the allocation directory via `ip_alloc_dir` and delegates to
    /// `release_allocated_ip_in_dir`, which ignores removal failures.
    fn release_allocated_ip(container_id: &str) {
        let alloc_dir = Self::ip_alloc_dir();
        Self::release_allocated_ip_in_dir(&alloc_dir, container_id);
    }
520
521    fn release_allocated_ip_in_dir(alloc_dir: &std::path::Path, container_id: &str) {
522        let path = alloc_dir.join(format!("{}.ip", container_id));
523        let _ = std::fs::remove_file(path);
524    }
525
526    /// Create the IP allocation directory with restrictive permissions (0700)
527    /// and reject symlinked paths to prevent symlink attacks.
528    fn ensure_alloc_dir(alloc_dir: &std::path::Path) -> Result<()> {
529        // L11: Check for symlinks BEFORE creating directories to avoid TOCTOU.
530        // If the path already exists, verify it's not a symlink.
531        if alloc_dir.exists() {
532            if let Ok(meta) = std::fs::symlink_metadata(alloc_dir) {
533                if meta.file_type().is_symlink() {
534                    return Err(NucleusError::NetworkError(format!(
535                        "IP alloc dir {:?} is a symlink, refusing to use",
536                        alloc_dir
537                    )));
538                }
539            }
540        }
541        // Also check parent directory for symlinks
542        if let Some(parent) = alloc_dir.parent() {
543            if let Ok(meta) = std::fs::symlink_metadata(parent) {
544                if meta.file_type().is_symlink() {
545                    return Err(NucleusError::NetworkError(format!(
546                        "IP alloc dir parent {:?} is a symlink, refusing to use",
547                        parent
548                    )));
549                }
550            }
551        }
552
553        std::fs::create_dir_all(alloc_dir).map_err(|e| {
554            NucleusError::NetworkError(format!("Failed to create IP alloc dir: {}", e))
555        })?;
556
557        // Restrict permissions to owner-only atomically after creation
558        use std::os::unix::fs::PermissionsExt;
559        let perms = std::fs::Permissions::from_mode(0o700);
560        std::fs::set_permissions(alloc_dir, perms).map_err(|e| {
561            NucleusError::NetworkError(format!(
562                "Failed to set permissions on IP alloc dir {:?}: {}",
563                alloc_dir, e
564            ))
565        })?;
566
567        // Re-verify no symlink replacement after permissions were set
568        if let Ok(meta) = std::fs::symlink_metadata(alloc_dir) {
569            if meta.file_type().is_symlink() {
570                return Err(NucleusError::NetworkError(format!(
571                    "IP alloc dir {:?} was replaced with a symlink during setup",
572                    alloc_dir
573                )));
574            }
575        }
576        Ok(())
577    }
578
579    fn ip_alloc_dir() -> std::path::PathBuf {
580        if nix::unistd::Uid::effective().is_root() {
581            std::path::PathBuf::from("/var/run/nucleus/ip-alloc")
582        } else {
583            dirs::runtime_dir()
584                .map(|d| d.join("nucleus/ip-alloc"))
585                .or_else(|| dirs::data_local_dir().map(|d| d.join("nucleus/ip-alloc")))
586                .unwrap_or_else(|| {
587                    dirs::home_dir()
588                        .map(|h| h.join(".nucleus/ip-alloc"))
589                        .unwrap_or_else(|| std::path::PathBuf::from("/var/run/nucleus/ip-alloc"))
590                })
591        }
592    }
593
    /// Path of the `ip_forward` lock file inside `alloc_dir`.
    fn ip_forward_lock_path(alloc_dir: &std::path::Path) -> std::path::PathBuf {
        alloc_dir.join(IP_FORWARD_LOCK_FILE)
    }
597
    /// Path of the `ip_forward` refcount state file inside `alloc_dir`.
    fn ip_forward_state_path(alloc_dir: &std::path::Path) -> std::path::PathBuf {
        alloc_dir.join(IP_FORWARD_STATE_FILE)
    }
601
602    fn read_ip_forward_value(sysctl_path: &std::path::Path) -> Result<String> {
603        std::fs::read_to_string(sysctl_path)
604            .map(|value| value.trim().to_string())
605            .map_err(|e| {
606                NucleusError::NetworkError(format!(
607                    "Failed to read {}: {}",
608                    sysctl_path.display(),
609                    e
610                ))
611            })
612    }
613
614    fn write_ip_forward_value(sysctl_path: &std::path::Path, value: &str) -> Result<()> {
615        std::fs::write(sysctl_path, value).map_err(|e| {
616            NucleusError::NetworkError(format!(
617                "Failed to write {} to {}: {}",
618                value,
619                sysctl_path.display(),
620                e
621            ))
622        })
623    }
624
625    fn load_ip_forward_state(alloc_dir: &std::path::Path) -> Result<Option<IpForwardRefState>> {
626        let state_path = Self::ip_forward_state_path(alloc_dir);
627        match std::fs::read_to_string(&state_path) {
628            Ok(content) => serde_json::from_str(&content).map(Some).map_err(|e| {
629                NucleusError::NetworkError(format!(
630                    "Failed to parse ip_forward refcount state {:?}: {}",
631                    state_path, e
632                ))
633            }),
634            Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(None),
635            Err(e) => Err(NucleusError::NetworkError(format!(
636                "Failed to read ip_forward refcount state {:?}: {}",
637                state_path, e
638            ))),
639        }
640    }
641
642    fn store_ip_forward_state(
643        alloc_dir: &std::path::Path,
644        state: &IpForwardRefState,
645    ) -> Result<()> {
646        let state_path = Self::ip_forward_state_path(alloc_dir);
647        let encoded = serde_json::to_vec(state).map_err(|e| {
648            NucleusError::NetworkError(format!(
649                "Failed to serialize ip_forward refcount state {:?}: {}",
650                state_path, e
651            ))
652        })?;
653        std::fs::write(&state_path, encoded).map_err(|e| {
654            NucleusError::NetworkError(format!(
655                "Failed to persist ip_forward refcount state {:?}: {}",
656                state_path, e
657            ))
658        })
659    }
660
661    fn remove_ip_forward_state(alloc_dir: &std::path::Path) -> Result<()> {
662        let state_path = Self::ip_forward_state_path(alloc_dir);
663        match std::fs::remove_file(&state_path) {
664            Ok(()) => Ok(()),
665            Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(()),
666            Err(e) => Err(NucleusError::NetworkError(format!(
667                "Failed to remove ip_forward refcount state {:?}: {}",
668                state_path, e
669            ))),
670        }
671    }
672
    /// Take one reference on IPv4 forwarding using the default allocation
    /// directory and the real sysctl path.
    ///
    /// Thin wrapper around `acquire_ip_forward_ref_in_dir`.
    fn acquire_ip_forward_ref() -> Result<()> {
        let alloc_dir = Self::ip_alloc_dir();
        Self::acquire_ip_forward_ref_in_dir(
            &alloc_dir,
            std::path::Path::new(IP_FORWARD_SYSCTL_PATH),
        )
    }
680
    /// Take one reference on IPv4 forwarding, enabling it if necessary.
    ///
    /// Under an exclusive `flock` on the lock file in `alloc_dir`:
    /// - if no state file exists, the current value at `sysctl_path` is saved
    ///   as `original_value` with a refcount of 0;
    /// - if the refcount is 0, `"1"` is written to `sysctl_path`;
    /// - the refcount is incremented and the state is persisted.
    ///
    /// # Errors
    /// Returns `NucleusError::NetworkError` when the directory, lock, state
    /// file or sysctl cannot be accessed, or when the refcount would overflow.
    fn acquire_ip_forward_ref_in_dir(
        alloc_dir: &std::path::Path,
        sysctl_path: &std::path::Path,
    ) -> Result<()> {
        Self::ensure_alloc_dir(alloc_dir)?;
        let lock_path = Self::ip_forward_lock_path(alloc_dir);
        // `lock_file` stays open until the function returns; closing it is
        // what releases the lock.
        let lock_file = std::fs::OpenOptions::new()
            .create(true)
            .write(true)
            .truncate(false)
            .open(&lock_path)
            .map_err(|e| {
                NucleusError::NetworkError(format!(
                    "Failed to open ip_forward lock {:?}: {}",
                    lock_path, e
                ))
            })?;
        // SAFETY: lock_file is a valid open fd for the duration of the call;
        // flock does not access any memory owned by this process.
        let lock_ret = unsafe { libc::flock(lock_file.as_raw_fd(), libc::LOCK_EX) };
        if lock_ret != 0 {
            return Err(NucleusError::NetworkError(format!(
                "Failed to acquire ip_forward lock: {}",
                std::io::Error::last_os_error()
            )));
        }

        let mut state = match Self::load_ip_forward_state(alloc_dir)? {
            Some(state) => state,
            None => {
                // First reference: remember the value to restore later and
                // persist it before touching the sysctl.
                let original_value = Self::read_ip_forward_value(sysctl_path)?;
                let state = IpForwardRefState {
                    refcount: 0,
                    original_value,
                };
                Self::store_ip_forward_state(alloc_dir, &state)?;
                state
            }
        };

        // Only the 0 -> 1 transition needs to write the sysctl.
        if state.refcount == 0 {
            Self::write_ip_forward_value(sysctl_path, "1")?;
        }
        state.refcount = state.refcount.checked_add(1).ok_or_else(|| {
            NucleusError::NetworkError("ip_forward refcount overflow".to_string())
        })?;
        Self::store_ip_forward_state(alloc_dir, &state)
    }
727
    /// Drop one reference on IPv4 forwarding using the default allocation
    /// directory and the real sysctl path.
    ///
    /// Thin wrapper around `release_ip_forward_ref_in_dir`.
    fn release_ip_forward_ref() -> Result<()> {
        let alloc_dir = Self::ip_alloc_dir();
        Self::release_ip_forward_ref_in_dir(
            &alloc_dir,
            std::path::Path::new(IP_FORWARD_SYSCTL_PATH),
        )
    }
735
    /// Drop one reference on IPv4 forwarding, restoring the saved value when
    /// the last reference goes away.
    ///
    /// Returns `Ok(())` without doing anything when `alloc_dir` or the state
    /// file does not exist. Otherwise, under an exclusive `flock` on the lock
    /// file:
    /// - a stored refcount of 0 just removes the state file;
    /// - the refcount is decremented; at 0 the saved `original_value` is
    ///   written back to `sysctl_path` and the state file is removed,
    ///   otherwise the decremented state is persisted.
    ///
    /// # Errors
    /// Returns `NucleusError::NetworkError` when the lock, state file or
    /// sysctl cannot be accessed.
    fn release_ip_forward_ref_in_dir(
        alloc_dir: &std::path::Path,
        sysctl_path: &std::path::Path,
    ) -> Result<()> {
        if !alloc_dir.exists() {
            return Ok(());
        }
        let lock_path = Self::ip_forward_lock_path(alloc_dir);
        // `lock_file` stays open until the function returns; closing it is
        // what releases the lock.
        let lock_file = std::fs::OpenOptions::new()
            .create(true)
            .write(true)
            .truncate(false)
            .open(&lock_path)
            .map_err(|e| {
                NucleusError::NetworkError(format!(
                    "Failed to open ip_forward lock {:?}: {}",
                    lock_path, e
                ))
            })?;
        // SAFETY: lock_file is a valid open fd for the duration of the call;
        // flock does not access any memory owned by this process.
        let lock_ret = unsafe { libc::flock(lock_file.as_raw_fd(), libc::LOCK_EX) };
        if lock_ret != 0 {
            return Err(NucleusError::NetworkError(format!(
                "Failed to acquire ip_forward lock: {}",
                std::io::Error::last_os_error()
            )));
        }

        let Some(mut state) = Self::load_ip_forward_state(alloc_dir)? else {
            return Ok(());
        };

        // Guard against underflow: a zero refcount on disk is treated as a
        // leftover and simply removed.
        if state.refcount == 0 {
            return Self::remove_ip_forward_state(alloc_dir);
        }

        state.refcount -= 1;
        if state.refcount == 0 {
            // Last reference gone: put the sysctl back to what it was.
            Self::write_ip_forward_value(sysctl_path, &state.original_value)?;
            Self::remove_ip_forward_state(alloc_dir)?;
            info!("Restored net.ipv4.ip_forward to {}", state.original_value);
        } else {
            Self::store_ip_forward_state(alloc_dir, &state)?;
        }

        Ok(())
    }
782
783    /// Read the start time (field 22) from /proc/<pid>/stat to detect PID recycling.
784    /// Returns 0 if the process does not exist or the field cannot be parsed.
785    fn read_pid_start_ticks(pid: u32) -> u64 {
786        let stat_path = format!("/proc/{}/stat", pid);
787        if let Ok(content) = std::fs::read_to_string(&stat_path) {
788            // Field 22 is starttime. The comm field (2) may contain spaces/parens,
789            // so find the last ')' and count fields from there.
790            if let Some(after_comm) = content.rfind(')') {
791                return content[after_comm + 2..]
792                    .split_whitespace()
793                    .nth(19) // field 22 is 20th after the ')' + state field
794                    .and_then(|s| s.parse().ok())
795                    .unwrap_or(0);
796            }
797        }
798        0
799    }
800
801    /// Get gateway IP from subnet (first usable address)
802    fn gateway_from_subnet(subnet: &str) -> String {
803        let base = subnet.split('/').next().unwrap_or("10.0.42.0");
804        let parts: Vec<&str> = base.split('.').collect();
805        if parts.len() == 4 {
806            format!("{}.{}.{}.1", parts[0], parts[1], parts[2])
807        } else {
808            "10.0.42.1".to_string()
809        }
810    }
811
812    fn subnet_prefix(subnet: &str) -> u8 {
813        subnet
814            .split_once('/')
815            .and_then(|(_, p)| p.parse::<u8>().ok())
816            .filter(|p| *p <= 32)
817            .unwrap_or(24)
818    }
819
820    /// Resolve a system binary to a validated absolute path.
821    ///
822    /// Searches known sysadmin paths first, then the process PATH, while
823    /// validating ownership and permissions before use. This avoids depending
824    /// on a separate `which` binary in service managers that set a narrow PATH.
825    /// Returns an error if no valid binary is found.
826    pub(crate) fn resolve_bin(name: &str) -> Result<String> {
827        let search_dirs: &[&str] = match name {
828            "iptables" => &["/usr/sbin/iptables", "/sbin/iptables", "/usr/bin/iptables"],
829            "slirp4netns" => &[
830                "/usr/bin/slirp4netns",
831                "/bin/slirp4netns",
832                "/run/current-system/sw/bin/slirp4netns",
833            ],
834            _ => &[],
835        };
836
837        for path in search_dirs {
838            let p = std::path::Path::new(path);
839            if p.exists() {
840                Self::validate_network_binary(p, name)?;
841                return Ok(path.to_string());
842            }
843        }
844
845        if let Some(path_var) = std::env::var_os("PATH") {
846            for dir in std::env::split_paths(&path_var) {
847                let candidate = dir.join(name);
848                if candidate.exists() {
849                    Self::validate_network_binary(&candidate, name)?;
850                    return Ok(candidate.to_string_lossy().into_owned());
851                }
852            }
853        }
854
855        Err(NucleusError::NetworkError(format!(
856            "Required binary '{}' not found or failed validation",
857            name
858        )))
859    }
860
861    /// Validate a network binary's ownership and permissions.
862    /// Rejects binaries that are group/world-writable or not owned by root/euid,
863    /// except for immutable Nix store artifacts.
864    fn validate_network_binary(path: &std::path::Path, name: &str) -> Result<()> {
865        use std::os::unix::fs::MetadataExt;
866
867        let resolved = std::fs::canonicalize(path).unwrap_or_else(|_| path.to_path_buf());
868        let meta = std::fs::metadata(&resolved)
869            .map_err(|e| NucleusError::NetworkError(format!("Cannot stat {}: {}", name, e)))?;
870        let mode = meta.mode();
871        if mode & 0o022 != 0 {
872            return Err(NucleusError::NetworkError(format!(
873                "Binary '{}' at {:?} is writable by group/others (mode {:o}), refusing to execute",
874                name, resolved, mode
875            )));
876        }
877        let owner = meta.uid();
878        let euid = nix::unistd::Uid::effective().as_raw();
879        if owner != 0 && owner != euid && !Self::is_trusted_store_network_binary(&resolved, mode) {
880            return Err(NucleusError::NetworkError(format!(
881                "Binary '{}' at {:?} owned by UID {} (expected root or euid {}), refusing to execute",
882                name, resolved, owner, euid
883            )));
884        }
885        Ok(())
886    }
887
888    fn is_trusted_store_network_binary(path: &std::path::Path, mode: u32) -> bool {
889        use std::os::unix::fs::MetadataExt;
890        if !path.starts_with("/nix/store") {
891            return false;
892        }
893        if mode & 0o200 != 0 {
894            return false;
895        }
896        if let Some(parent) = path.parent() {
897            if let Ok(parent_meta) = std::fs::metadata(parent) {
898                return parent_meta.mode() & 0o222 == 0;
899            }
900        }
901        false
902    }
903
904    fn run_cmd(program: &str, args: &[&str]) -> Result<()> {
905        let resolved = Self::resolve_bin(program)?;
906        let output = Command::new(&resolved).args(args).output().map_err(|e| {
907            NucleusError::NetworkError(format!("Failed to run {} {:?}: {}", resolved, args, e))
908        })?;
909
910        if !output.status.success() {
911            let stderr = String::from_utf8_lossy(&output.stderr);
912            return Err(NucleusError::NetworkError(format!(
913                "{} {:?} failed: {}",
914                program, args, stderr
915            )));
916        }
917
918        Ok(())
919    }
920
921    fn run_cmd_owned(program: &str, args: &[String]) -> Result<()> {
922        let refs: Vec<&str> = args.iter().map(String::as_str).collect();
923        Self::run_cmd(program, &refs)
924    }
925
926    fn port_forward_rule_args(
927        operation: &str,
928        chain: &str,
929        container_ip: &str,
930        pf: &PortForward,
931    ) -> Vec<String> {
932        let mut args = vec![
933            "-t".to_string(),
934            "nat".to_string(),
935            operation.to_string(),
936            chain.to_string(),
937            "-p".to_string(),
938            pf.protocol.as_str().to_string(),
939        ];
940
941        if chain == "OUTPUT" {
942            args.extend([
943                "-m".to_string(),
944                "addrtype".to_string(),
945                "--dst-type".to_string(),
946                "LOCAL".to_string(),
947            ]);
948        }
949
950        if let Some(host_ip) = pf.host_ip {
951            args.extend(["-d".to_string(), host_ip.to_string()]);
952        }
953
954        args.extend([
955            "--dport".to_string(),
956            pf.host_port.to_string(),
957            "-j".to_string(),
958            "DNAT".to_string(),
959            "--to-destination".to_string(),
960            format!("{}:{}", container_ip, pf.container_port),
961        ]);
962
963        args
964    }
965
966    fn is_ip_in_use(ip: &str) -> Result<bool> {
967        let addr: Ipv4Addr = ip
968            .parse()
969            .map_err(|e| NucleusError::NetworkError(format!("invalid IP '{}': {}", ip, e)))?;
970        netlink::is_addr_in_use(&addr)
971    }
972
973    /// Write resolv.conf inside container (for writable /etc, e.g. agent mode)
974    pub fn write_resolv_conf(root: &std::path::Path, dns: &[String]) -> Result<()> {
975        let resolv_path = root.join("etc/resolv.conf");
976        let content: String = dns
977            .iter()
978            .map(|server| format!("nameserver {}\n", server))
979            .collect();
980        std::fs::write(&resolv_path, content).map_err(|e| {
981            NucleusError::NetworkError(format!("Failed to write resolv.conf: {}", e))
982        })?;
983        Ok(())
984    }
985
986    /// Bind-mount a resolv.conf over a read-only /etc (for production rootfs mode).
987    ///
988    /// Creates a memfd-backed resolv.conf and bind-mounts it over
989    /// /etc/resolv.conf so it works even when the rootfs /etc is read-only.
990    /// The memfd is cleaned up when the container exits.
991    pub fn bind_mount_resolv_conf(root: &std::path::Path, dns: &[String]) -> Result<()> {
992        use nix::mount::{mount, MsFlags};
993
994        let content: String = dns
995            .iter()
996            .map(|server| format!("nameserver {}\n", server))
997            .collect();
998
999        // Create a memfd-backed file to avoid leaving staging files on disk
1000        let memfd_name = std::ffi::CString::new("nucleus-resolv").map_err(|e| {
1001            NucleusError::NetworkError(format!("Failed to create memfd name: {}", e))
1002        })?;
1003        // SAFETY: memfd_name is a valid NUL-terminated CString. memfd_create
1004        // returns a new fd or -1 on error; we check for error below.
1005        let raw_fd = unsafe { libc::memfd_create(memfd_name.as_ptr(), 0) };
1006        if raw_fd < 0 {
1007            // Fallback to staging file if memfd_create is unavailable
1008            return Self::bind_mount_resolv_conf_staging(root, dns);
1009        }
1010        // SAFETY: raw_fd is a valid, newly-created fd from memfd_create.
1011        // OwnedFd takes ownership and will close it exactly once on drop,
1012        // preventing double-close on any error path.
1013        let memfd = unsafe { std::os::fd::OwnedFd::from_raw_fd(raw_fd) };
1014
1015        // Write content to memfd using File I/O to handle partial writes correctly.
1016        use std::io::Write as _;
1017        let mut memfd_file = std::fs::File::from(memfd);
1018        if memfd_file.write_all(content.as_bytes()).is_err() {
1019            // memfd_file dropped here, closing the fd automatically
1020            return Self::bind_mount_resolv_conf_staging(root, dns);
1021        }
1022        // Re-extract the OwnedFd for the proc path below
1023        use std::os::fd::IntoRawFd;
1024        let memfd = {
1025            let raw = memfd_file.into_raw_fd();
1026            // SAFETY: raw is the valid fd we just extracted from the File.
1027            unsafe { std::os::fd::OwnedFd::from_raw_fd(raw) }
1028        };
1029
1030        // Ensure the mount target exists
1031        let target = root.join("etc/resolv.conf");
1032        if !target.exists() {
1033            let _ = std::fs::write(&target, "");
1034        }
1035
1036        // Bind mount the memfd over the read-only resolv.conf
1037        let memfd_path = format!("/proc/self/fd/{}", memfd.as_raw_fd());
1038        if let Err(e) = mount(
1039            Some(memfd_path.as_str()),
1040            &target,
1041            None::<&str>,
1042            MsFlags::MS_BIND,
1043            None::<&str>,
1044        ) {
1045            return Err(NucleusError::NetworkError(format!(
1046                "Failed to bind mount memfd-backed resolv.conf: {}",
1047                e
1048            )));
1049        }
1050        Self::harden_resolv_conf_bind(&target)?;
1051
1052        // memfd dropped here – the mount holds a kernel reference to the file,
1053        // so it survives the fd close.
1054
1055        info!("Bind-mounted resolv.conf for bridge networking (rootfs mode, memfd)");
1056        Ok(())
1057    }
1058
1059    /// Fallback: bind-mount a staging resolv.conf file.
1060    fn bind_mount_resolv_conf_staging(root: &std::path::Path, dns: &[String]) -> Result<()> {
1061        use nix::mount::{mount, MsFlags};
1062
1063        let content: String = dns
1064            .iter()
1065            .map(|server| format!("nameserver {}\n", server))
1066            .collect();
1067
1068        let staging = Self::create_resolv_conf_staging_file(root, content.as_bytes())?;
1069
1070        // Ensure the mount target exists
1071        let target = root.join("etc/resolv.conf");
1072        if !target.exists() {
1073            let _ = std::fs::write(&target, "");
1074        }
1075
1076        // Bind mount the staging file over the read-only resolv.conf
1077        mount(
1078            Some(staging.path()),
1079            &target,
1080            None::<&str>,
1081            MsFlags::MS_BIND,
1082            None::<&str>,
1083        )
1084        .map_err(|e| {
1085            NucleusError::NetworkError(format!("Failed to bind mount resolv.conf: {}", e))
1086        })?;
1087        Self::harden_resolv_conf_bind(&target)?;
1088
1089        // The bind mount holds a reference to the inode. Dropping the temporary
1090        // file unlinks the staging path so DNS server info is not left on disk.
1091
1092        info!("Bind-mounted resolv.conf for bridge networking (rootfs mode, staging)");
1093        Ok(())
1094    }
1095
1096    fn create_resolv_conf_staging_file(
1097        root: &std::path::Path,
1098        content: &[u8],
1099    ) -> Result<tempfile::NamedTempFile> {
1100        use std::io::Write as _;
1101
1102        let staging_dir = root.parent().ok_or_else(|| {
1103            NucleusError::NetworkError(format!(
1104                "Container root {:?} has no parent for resolv.conf staging",
1105                root
1106            ))
1107        })?;
1108
1109        let mut staging = tempfile::Builder::new()
1110            .prefix(".resolv.conf.nucleus.")
1111            .tempfile_in(staging_dir)
1112            .map_err(|e| {
1113                NucleusError::NetworkError(format!(
1114                    "Failed to create temporary resolv.conf staging file under {:?}: {}",
1115                    staging_dir, e
1116                ))
1117            })?;
1118
1119        staging.as_file_mut().write_all(content).map_err(|e| {
1120            NucleusError::NetworkError(format!(
1121                "Failed to write temporary resolv.conf staging file {:?}: {}",
1122                staging.path(),
1123                e
1124            ))
1125        })?;
1126
1127        Ok(staging)
1128    }
1129
1130    fn harden_resolv_conf_bind(target: &std::path::Path) -> Result<()> {
1131        use nix::mount::{mount, MsFlags};
1132
1133        mount(
1134            None::<&str>,
1135            target,
1136            None::<&str>,
1137            MsFlags::MS_REMOUNT
1138                | MsFlags::MS_BIND
1139                | MsFlags::MS_RDONLY
1140                | MsFlags::MS_NOSUID
1141                | MsFlags::MS_NODEV
1142                | MsFlags::MS_NOEXEC,
1143            None::<&str>,
1144        )
1145        .map_err(|e| {
1146            NucleusError::NetworkError(format!(
1147                "Failed to remount resolv.conf with hardened flags at {:?}: {}",
1148                target, e
1149            ))
1150        })
1151    }
1152}
1153
/// Runs cleanup automatically when a `BridgeNetwork` goes out of scope.
impl Drop for BridgeNetwork {
    /// Delegates to `cleanup_best_effort`, which is defined outside this
    /// section of the file. NOTE(review): the name suggests failures are
    /// tolerated rather than propagated — confirm against its definition.
    fn drop(&mut self) {
        self.cleanup_best_effort();
    }
}
1159
/// Drop guard that undoes partially completed bridge setup.
///
/// While `armed`, dropping the guard removes whatever the flags below say was
/// created. Calling `disarm` turns the drop into a no-op.
struct SetupRollback {
    /// Name of the host-side veth link, deleted on rollback when `veth_created` is set.
    veth_host: String,
    /// Subnet used as the `-s` source match of the MASQUERADE rule removed on rollback.
    subnet: String,
    /// Whether the veth link named by `veth_host` should be deleted on rollback.
    veth_created: bool,
    /// Whether the POSTROUTING MASQUERADE rule should be removed on rollback.
    nat_added: bool,
    /// `(container_ip, forward)` pairs whose DNAT rules are removed on rollback,
    /// in reverse order.
    port_forwards: Vec<(String, PortForward)>,
    /// Whether an ip_forward reference should be released on rollback.
    ip_forward_ref_acquired: bool,
    /// `(allocation directory, container id)` of a reserved IP to release on rollback.
    reserved_ip: Option<(std::path::PathBuf, String)>,
    /// When `false`, dropping the guard does nothing.
    armed: bool,
}
1170
1171impl SetupRollback {
1172    fn new(
1173        veth_host: String,
1174        subnet: String,
1175        reserved_ip: Option<(std::path::PathBuf, String)>,
1176    ) -> Self {
1177        Self {
1178            veth_host,
1179            subnet,
1180            veth_created: false,
1181            nat_added: false,
1182            port_forwards: Vec::new(),
1183            ip_forward_ref_acquired: false,
1184            reserved_ip,
1185            armed: true,
1186        }
1187    }
1188
1189    fn disarm(&mut self) {
1190        self.armed = false;
1191    }
1192}
1193
1194impl Drop for SetupRollback {
1195    fn drop(&mut self) {
1196        if !self.armed {
1197            return;
1198        }
1199
1200        for (container_ip, pf) in self.port_forwards.iter().rev() {
1201            for chain in ["OUTPUT", "PREROUTING"] {
1202                let args = BridgeNetwork::port_forward_rule_args("-D", chain, container_ip, pf);
1203                if let Err(e) = BridgeNetwork::run_cmd_owned("iptables", &args) {
1204                    warn!(
1205                        "Rollback: failed to remove iptables {} rule for {}: {}",
1206                        chain, container_ip, e
1207                    );
1208                }
1209            }
1210        }
1211
1212        if self.nat_added {
1213            if let Err(e) = BridgeNetwork::run_cmd(
1214                "iptables",
1215                &[
1216                    "-t",
1217                    "nat",
1218                    "-D",
1219                    "POSTROUTING",
1220                    "-s",
1221                    &self.subnet,
1222                    "-j",
1223                    "MASQUERADE",
1224                ],
1225            ) {
1226                warn!("Rollback: failed to remove NAT rule: {}", e);
1227            }
1228        }
1229
1230        if self.veth_created {
1231            if let Err(e) = netlink::del_link(&self.veth_host) {
1232                warn!("Rollback: failed to delete veth {}: {}", self.veth_host, e);
1233            }
1234        }
1235
1236        if self.ip_forward_ref_acquired {
1237            if let Err(e) = BridgeNetwork::release_ip_forward_ref() {
1238                warn!("Rollback: failed to release ip_forward refcount: {}", e);
1239            }
1240        }
1241
1242        if let Some((alloc_dir, container_id)) = &self.reserved_ip {
1243            BridgeNetwork::release_allocated_ip_in_dir(alloc_dir, container_id);
1244        }
1245    }
1246}
1247
/// Unit tests for the bridge networking helpers.
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks the arithmetic behind the 2..=254 host-offset range.
    ///
    /// NOTE(review): this test only exercises literals and local arithmetic;
    /// it never calls the production allocation code, so it cannot fail if
    /// that code regresses.
    #[test]
    fn test_ip_allocation_rejection_sampling_range() {
        // H-5: Verify that rejection sampling produces values in 2..=254
        // and that values >= 253 are rejected (no modulo bias).
        for byte in 0u8..253 {
            let offset = byte as u32 + 2;
            assert!(
                (2..=254).contains(&offset),
                "offset {} out of range",
                offset
            );
        }
        // Values 253, 254, 255 must be rejected
        for byte in [253u8, 254, 255] {
            assert!(byte >= 253);
        }
    }

    /// Once an IP is recorded for one container, explicitly requesting the
    /// same IP for another container must fail with an "already in use" error.
    #[test]
    fn test_reserve_ip_blocks_duplicate_requested_address() {
        let temp = tempfile::tempdir().unwrap();
        BridgeNetwork::record_allocated_ip_in_dir(temp.path(), "one", "10.0.42.2").unwrap();

        let err =
            BridgeNetwork::reserve_ip_in_dir(temp.path(), "two", "10.0.42.0/24", Some("10.0.42.2"))
                .unwrap_err();
        assert!(
            err.to_string().contains("already in use"),
            "second reservation of the same IP must fail"
        );
    }

    /// Dropping an armed `SetupRollback` must remove the reservation file
    /// (`<container id>.ip`) of its reserved IP.
    #[test]
    fn test_setup_rollback_releases_reserved_ip() {
        let temp = tempfile::tempdir().unwrap();
        BridgeNetwork::record_allocated_ip_in_dir(temp.path(), "rollback", "10.0.42.3").unwrap();

        // All "created" flags are false so the drop performs no iptables or
        // netlink work; only the IP release runs.
        let rollback = SetupRollback {
            veth_host: "veth-test".to_string(),
            subnet: "10.0.42.0/24".to_string(),
            veth_created: false,
            nat_added: false,
            port_forwards: Vec::new(),
            ip_forward_ref_acquired: false,
            reserved_ip: Some((temp.path().to_path_buf(), "rollback".to_string())),
            armed: true,
        };

        drop(rollback);

        assert!(
            !temp.path().join("rollback.ip").exists(),
            "rollback must release reserved IP files on setup failure"
        );
    }

    /// The staging resolv.conf must hold the given content and must live
    /// outside the container root.
    #[test]
    fn test_resolv_conf_staging_file_is_outside_container_root() {
        let temp = tempfile::tempdir().unwrap();
        let root = temp.path().join("root");
        std::fs::create_dir_all(root.join("tmp")).unwrap();

        let staging =
            BridgeNetwork::create_resolv_conf_staging_file(&root, b"nameserver 203.0.113.53\n")
                .unwrap();

        assert_eq!(
            std::fs::read_to_string(staging.path()).unwrap(),
            "nameserver 203.0.113.53\n"
        );
        assert!(
            !staging.path().starts_with(&root),
            "staging file must not be created under the container root"
        );
    }

    /// A symlink planted at `<root>/tmp/.resolv.conf.nucleus` must not be
    /// written through: the file it points to keeps its content even though
    /// the bind-mount setup itself fails.
    #[test]
    fn test_bind_mount_resolv_conf_does_not_overwrite_root_tmp_symlink_on_failure() {
        use std::os::unix::fs::symlink;

        let temp = tempfile::tempdir().unwrap();
        let root = temp.path().join("root");
        std::fs::create_dir_all(root.join("tmp")).unwrap();

        let victim = temp.path().join("host_victim_file");
        std::fs::write(&victim, "ORIGINAL_HOST_CONTENT\n").unwrap();
        symlink(&victim, root.join("tmp/.resolv.conf.nucleus")).unwrap();

        let dns = vec!["203.0.113.53".to_string()];
        let result = BridgeNetwork::bind_mount_resolv_conf(&root, &dns);

        assert!(
            result.is_err(),
            "test root intentionally lacks /etc so bind mount setup must fail"
        );
        assert_eq!(
            std::fs::read_to_string(&victim).unwrap(),
            "ORIGINAL_HOST_CONTENT\n",
            "resolv.conf setup must not write through attacker-controlled /tmp symlinks"
        );
    }

    /// With two references held, the sysctl value stays "1" after the first
    /// release and returns to the original "0" only after the second, at
    /// which point the state file is removed. A temp file stands in for the
    /// real sysctl path.
    #[test]
    fn test_ip_forward_refcount_restores_original_only_after_last_release() {
        let temp = tempfile::tempdir().unwrap();
        let sysctl = temp.path().join("ip_forward");
        std::fs::write(&sysctl, "0").unwrap();

        BridgeNetwork::acquire_ip_forward_ref_in_dir(temp.path(), &sysctl).unwrap();
        BridgeNetwork::acquire_ip_forward_ref_in_dir(temp.path(), &sysctl).unwrap();
        assert_eq!(std::fs::read_to_string(&sysctl).unwrap(), "1");

        BridgeNetwork::release_ip_forward_ref_in_dir(temp.path(), &sysctl).unwrap();
        assert_eq!(std::fs::read_to_string(&sysctl).unwrap(), "1");

        BridgeNetwork::release_ip_forward_ref_in_dir(temp.path(), &sysctl).unwrap();
        assert_eq!(std::fs::read_to_string(&sysctl).unwrap(), "0");
        assert!(
            !temp.path().join(IP_FORWARD_STATE_FILE).exists(),
            "state file must be removed when the last bridge releases ip_forward"
        );
    }

    /// Rule arguments name the requested chain, and the `OUTPUT` variant
    /// carries the `--dst-type LOCAL` match.
    #[test]
    fn test_port_forward_rules_include_output_chain_for_local_host_clients() {
        let pf = PortForward {
            host_ip: None,
            host_port: 8080,
            container_port: 80,
            protocol: crate::network::config::Protocol::Tcp,
        };

        let prerouting =
            BridgeNetwork::port_forward_rule_args("-A", "PREROUTING", "10.0.42.2", &pf);
        let output = BridgeNetwork::port_forward_rule_args("-A", "OUTPUT", "10.0.42.2", &pf);

        assert!(prerouting.iter().any(|arg| arg == "PREROUTING"));
        assert!(output.iter().any(|arg| arg == "OUTPUT"));
        assert!(
            output
                .windows(2)
                .any(|pair| pair[0] == "--dst-type" && pair[1] == "LOCAL"),
            "OUTPUT rule must target local-destination traffic"
        );
    }

    /// When a host IP is configured, both the `PREROUTING` and the `OUTPUT`
    /// rule must restrict matching with `-d <host ip>`.
    #[test]
    fn test_port_forward_rules_include_host_ip_when_configured() {
        let pf = PortForward {
            host_ip: Some(std::net::Ipv4Addr::new(127, 0, 0, 1)),
            host_port: 4173,
            container_port: 4173,
            protocol: crate::network::config::Protocol::Tcp,
        };

        let prerouting =
            BridgeNetwork::port_forward_rule_args("-A", "PREROUTING", "10.0.42.2", &pf);
        let output = BridgeNetwork::port_forward_rule_args("-A", "OUTPUT", "10.0.42.2", &pf);

        for args in [&prerouting, &output] {
            assert!(
                args.windows(2)
                    .any(|pair| pair[0] == "-d" && pair[1] == "127.0.0.1"),
                "port forward must restrict DNAT rules to the configured host IP"
            );
        }
    }
}