Skip to main content

nucleus/network/
bridge.rs

1use super::{netlink, netns};
2use crate::error::{NucleusError, Result, StateTransition};
3use crate::network::config::{BridgeConfig, EgressPolicy, PortForward};
4use crate::network::NetworkState;
5use std::fs::OpenOptions;
6use std::net::Ipv4Addr;
7use std::os::fd::FromRawFd;
8use std::os::unix::fs::FileTypeExt;
9use std::os::unix::fs::OpenOptionsExt;
10use std::os::unix::io::AsRawFd;
11use std::process::Command;
12use tracing::{debug, info, warn};
13
/// Bridge network manager
///
/// Holds the host-side state of one container's bridge attachment so that
/// it can be torn down again later.
pub struct BridgeNetwork {
    /// Copy of the bridge configuration that was used during setup.
    config: BridgeConfig,
    /// IPv4 address reserved for the container, in dotted-quad text form.
    container_ip: String,
    /// Name of the host-side veth interface; deleted during cleanup.
    veth_host: String,
    /// Identifier under which the IP reservation file is stored.
    container_id: String,
    /// Trimmed content of `/proc/sys/net/ipv4/ip_forward` as read before
    /// setup enabled forwarding; `None` when it could not be read.
    prev_ip_forward: Option<String>,
    /// Current lifecycle state of this network.
    state: NetworkState,
}
23
24impl BridgeNetwork {
25    fn open_dev_urandom() -> Result<std::fs::File> {
26        let file = OpenOptions::new()
27            .read(true)
28            .custom_flags(libc::O_NOFOLLOW | libc::O_CLOEXEC)
29            .open("/dev/urandom")
30            .map_err(|e| {
31                NucleusError::NetworkError(format!("Failed to open /dev/urandom: {}", e))
32            })?;
33
34        let metadata = file.metadata().map_err(|e| {
35            NucleusError::NetworkError(format!("Failed to stat /dev/urandom: {}", e))
36        })?;
37        if !metadata.file_type().is_char_device() {
38            return Err(NucleusError::NetworkError(
39                "/dev/urandom is not a character device".to_string(),
40            ));
41        }
42
43        Ok(file)
44    }
45
46    /// Set up bridge networking for a container
47    ///
48    /// Creates bridge, veth pair, assigns IPs, enables NAT.
49    /// Must be called from the parent process after fork (needs host netns).
50    ///
51    /// State transitions: Unconfigured -> Configuring -> Active
52    pub fn setup(pid: u32, config: &BridgeConfig) -> Result<Self> {
53        Self::setup_for(pid, config, &format!("{:x}", pid))
54    }
55
    /// Set up bridge networking with an explicit container ID for IP tracking.
    ///
    /// Identical to `setup`, except that the IP reservation is recorded under
    /// the caller-supplied `container_id` instead of one derived from `pid`.
    pub fn setup_with_id(pid: u32, config: &BridgeConfig, container_id: &str) -> Result<Self> {
        Self::setup_for(pid, config, container_id)
    }
60
61    fn setup_for(pid: u32, config: &BridgeConfig, container_id: &str) -> Result<Self> {
62        // Validate all network parameters before using them in shell commands
63        config.validate()?;
64
65        let mut net_state = NetworkState::Unconfigured;
66        net_state = net_state.transition(NetworkState::Configuring)?;
67
68        let alloc_dir = Self::ip_alloc_dir();
69        let container_ip = Self::reserve_ip_in_dir(
70            &alloc_dir,
71            container_id,
72            &config.subnet,
73            config.container_ip.as_deref(),
74        )?;
75        let prefix = Self::subnet_prefix(&config.subnet);
76
77        // Linux interface names max 15 chars; truncate if needed
78        let veth_host_full = format!("veth-{:x}", pid);
79        let veth_cont_full = format!("vethc-{:x}", pid);
80        let veth_host = veth_host_full[..veth_host_full.len().min(15)].to_string();
81        let veth_container = veth_cont_full[..veth_cont_full.len().min(15)].to_string();
82        let mut rollback = SetupRollback::new(
83            veth_host.clone(),
84            config.subnet.clone(),
85            Some((alloc_dir.clone(), container_id.to_string())),
86        );
87
88        // 1. Create bridge if it doesn't exist
89        Self::ensure_bridge_for(&config.bridge_name, &config.subnet)?;
90
91        // 2. Create veth pair
92        netlink::create_veth(&veth_host, &veth_container)?;
93        rollback.veth_created = true;
94
95        // 3. Attach host end to bridge
96        netlink::set_link_master(&veth_host, &config.bridge_name)?;
97        netlink::set_link_up(&veth_host)?;
98
99        // 4. Move container end to container's network namespace
100        netlink::set_link_netns(&veth_container, pid)?;
101
102        // 5. Configure container interface (inside container netns via setns).
103        // Capture the process start time from /proc to detect PID recycling
104        // between the caller passing the PID and our netns operations.
105        let start_ticks = Self::read_pid_start_ticks(pid);
106        if start_ticks == 0 {
107            drop(rollback);
108            return Err(NucleusError::NetworkError(format!(
109                "Cannot read start_ticks for PID {} — process may have exited",
110                pid
111            )));
112        }
113
114        let container_addr: Ipv4Addr = container_ip.parse().map_err(|e| {
115            NucleusError::NetworkError(format!("invalid container IP '{}': {}", container_ip, e))
116        })?;
117        {
118            let vc = veth_container.clone();
119            netns::in_netns(pid, move || {
120                netlink::add_addr(&vc, container_addr, prefix)?;
121                netlink::set_link_up(&vc)?;
122                netlink::set_link_up("lo")?;
123                Ok(())
124            })?;
125        }
126
127        // Verify PID was not recycled during netns operations
128        let current_ticks = Self::read_pid_start_ticks(pid);
129        if current_ticks != start_ticks {
130            drop(rollback);
131            return Err(NucleusError::NetworkError(format!(
132                "PID {} was recycled during network setup (start_ticks changed: {} -> {})",
133                pid, start_ticks, current_ticks
134            )));
135        }
136
137        // 6. Set default route in container
138        let gateway = Self::gateway_from_subnet(&config.subnet);
139        let gateway_addr: Ipv4Addr = gateway.parse().map_err(|e| {
140            NucleusError::NetworkError(format!("invalid gateway IP '{}': {}", gateway, e))
141        })?;
142        netns::in_netns(pid, move || netlink::add_default_route(gateway_addr))?;
143
144        // 7. Enable NAT (masquerade) on the host
145        Self::run_cmd(
146            "iptables",
147            &[
148                "-t",
149                "nat",
150                "-A",
151                "POSTROUTING",
152                "-s",
153                &config.subnet,
154                "-j",
155                "MASQUERADE",
156            ],
157        )?;
158        rollback.nat_added = true;
159
160        // 8. Enable IP forwarding (save previous value for restore on cleanup)
161        let prev_ip_forward = match std::fs::read_to_string("/proc/sys/net/ipv4/ip_forward") {
162            Ok(v) => Some(v.trim().to_string()),
163            Err(e) => {
164                warn!(
165                    "Could not read ip_forward state (will not restore on cleanup): {}",
166                    e
167                );
168                None
169            }
170        };
171        rollback.prev_ip_forward = prev_ip_forward;
172        std::fs::write("/proc/sys/net/ipv4/ip_forward", "1").map_err(|e| {
173            NucleusError::NetworkError(format!("Failed to enable IP forwarding: {}", e))
174        })?;
175
176        // 9. Set up port forwarding rules
177        for pf in &config.port_forwards {
178            Self::setup_port_forward_for(&container_ip, pf)?;
179            rollback
180                .port_forwards
181                .push((container_ip.clone(), pf.clone()));
182        }
183
184        net_state = net_state.transition(NetworkState::Active)?;
185
186        info!(
187            "Bridge network configured: {} -> {} (IP: {})",
188            veth_host, veth_container, container_ip
189        );
190        let prev_ip_forward = rollback.prev_ip_forward.clone();
191        rollback.disarm();
192
193        Ok(Self {
194            config: config.clone(),
195            container_ip,
196            veth_host,
197            container_id: container_id.to_string(),
198            prev_ip_forward,
199            state: net_state,
200        })
201    }
202
203    /// Apply egress policy rules inside the container's network namespace.
204    ///
205    /// Uses iptables OUTPUT chain to restrict outbound connections.
206    /// Must be called after bridge setup while the container netns is reachable.
207    pub fn apply_egress_policy(&self, pid: u32, policy: &EgressPolicy) -> Result<()> {
208        // Validate egress CIDRs before passing to iptables
209        for cidr in &policy.allowed_cidrs {
210            crate::network::config::validate_egress_cidr(cidr)
211                .map_err(|e| NucleusError::NetworkError(format!("Invalid egress CIDR: {}", e)))?;
212        }
213
214        let ipt = Self::resolve_bin("iptables")?;
215
216        // M15: Set DROP policy BEFORE flushing rules to avoid a window where
217        // all egress is unrestricted. The order is: DROP -> flush -> add rules.
218        netns::exec_in_netns(pid, &ipt, &["-P", "OUTPUT", "DROP"])?;
219        // Flush any existing OUTPUT rules to prevent duplication on repeated calls
220        netns::exec_in_netns(pid, &ipt, &["-F", "OUTPUT"])?;
221
222        // Default policy: drop all OUTPUT (except established/related and loopback)
223        netns::exec_in_netns(pid, &ipt, &["-A", "OUTPUT", "-o", "lo", "-j", "ACCEPT"])?;
224
225        netns::exec_in_netns(
226            pid,
227            &ipt,
228            &[
229                "-A",
230                "OUTPUT",
231                "-m",
232                "conntrack",
233                "--ctstate",
234                "ESTABLISHED,RELATED",
235                "-j",
236                "ACCEPT",
237            ],
238        )?;
239
240        // Allow DNS to configured resolvers (only when policy permits it)
241        if policy.allow_dns {
242            for dns in &self.config.dns {
243                netns::exec_in_netns(
244                    pid,
245                    &ipt,
246                    &[
247                        "-A", "OUTPUT", "-p", "udp", "-d", dns, "--dport", "53", "-j", "ACCEPT",
248                    ],
249                )?;
250                netns::exec_in_netns(
251                    pid,
252                    &ipt,
253                    &[
254                        "-A", "OUTPUT", "-p", "tcp", "-d", dns, "--dport", "53", "-j", "ACCEPT",
255                    ],
256                )?;
257            }
258        }
259
260        // Allow traffic to each permitted CIDR
261        for cidr in &policy.allowed_cidrs {
262            if policy.allowed_tcp_ports.is_empty() && policy.allowed_udp_ports.is_empty() {
263                netns::exec_in_netns(pid, &ipt, &["-A", "OUTPUT", "-d", cidr, "-j", "ACCEPT"])?;
264            } else {
265                for port in &policy.allowed_tcp_ports {
266                    let port_s = port.to_string();
267                    netns::exec_in_netns(
268                        pid,
269                        &ipt,
270                        &[
271                            "-A", "OUTPUT", "-p", "tcp", "-d", cidr, "--dport", &port_s, "-j",
272                            "ACCEPT",
273                        ],
274                    )?;
275                }
276                for port in &policy.allowed_udp_ports {
277                    let port_s = port.to_string();
278                    netns::exec_in_netns(
279                        pid,
280                        &ipt,
281                        &[
282                            "-A", "OUTPUT", "-p", "udp", "-d", cidr, "--dport", &port_s, "-j",
283                            "ACCEPT",
284                        ],
285                    )?;
286                }
287            }
288        }
289
290        // Log denied packets (rate-limited)
291        if policy.log_denied {
292            netns::exec_in_netns(
293                pid,
294                &ipt,
295                &[
296                    "-A",
297                    "OUTPUT",
298                    "-m",
299                    "limit",
300                    "--limit",
301                    "5/min",
302                    "-j",
303                    "LOG",
304                    "--log-prefix",
305                    "nucleus-egress-denied: ",
306                ],
307            )?;
308        }
309
310        // Drop everything else
311        netns::exec_in_netns(pid, &ipt, &["-P", "OUTPUT", "DROP"])?;
312
313        info!(
314            "Egress policy applied: {} allowed CIDRs",
315            policy.allowed_cidrs.len()
316        );
317        debug!("Egress policy details: {:?}", policy);
318
319        Ok(())
320    }
321
322    /// Clean up bridge networking
323    ///
324    /// State transition: Active -> Cleaned
325    pub fn cleanup(mut self) -> Result<()> {
326        self.state = self.state.transition(NetworkState::Cleaned)?;
327
328        // Release the IP allocation
329        Self::release_allocated_ip(&self.container_id);
330
331        // Remove port forwarding rules
332        for pf in &self.config.port_forwards {
333            if let Err(e) = self.cleanup_port_forward(pf) {
334                warn!("Failed to cleanup port forward: {}", e);
335            }
336        }
337
338        // Remove NAT rule
339        let _ = Self::run_cmd(
340            "iptables",
341            &[
342                "-t",
343                "nat",
344                "-D",
345                "POSTROUTING",
346                "-s",
347                &self.config.subnet,
348                "-j",
349                "MASQUERADE",
350            ],
351        );
352
353        // Delete veth pair (deleting one end removes both)
354        let _ = netlink::del_link(&self.veth_host);
355
356        // Restore previous ip_forward state if we changed it
357        if let Some(ref prev) = self.prev_ip_forward {
358            if prev == "0" {
359                if let Err(e) = std::fs::write("/proc/sys/net/ipv4/ip_forward", "0") {
360                    warn!("Failed to restore ip_forward to 0: {}", e);
361                } else {
362                    info!("Restored net.ipv4.ip_forward to 0");
363                }
364            }
365        }
366
367        info!("Bridge network cleaned up");
368        Ok(())
369    }
370
371    /// Best-effort cleanup for use in Drop. Performs the same teardown as
372    /// `cleanup()` but ignores all errors and skips the state transition
373    /// (which requires ownership).
374    fn cleanup_best_effort(&mut self) {
375        if self.state == NetworkState::Cleaned {
376            return;
377        }
378
379        Self::release_allocated_ip(&self.container_id);
380
381        for pf in &self.config.port_forwards {
382            let _ = self.cleanup_port_forward(pf);
383        }
384
385        let _ = Self::run_cmd(
386            "iptables",
387            &[
388                "-t",
389                "nat",
390                "-D",
391                "POSTROUTING",
392                "-s",
393                &self.config.subnet,
394                "-j",
395                "MASQUERADE",
396            ],
397        );
398
399        let _ = netlink::del_link(&self.veth_host);
400
401        if let Some(ref prev) = self.prev_ip_forward {
402            if prev == "0" {
403                let _ = std::fs::write("/proc/sys/net/ipv4/ip_forward", "0");
404            }
405        }
406
407        self.state = NetworkState::Cleaned;
408        debug!("Bridge network cleaned up (best-effort via drop)");
409    }
410
411    /// Detect and remove orphaned iptables rules from previous Nucleus runs.
412    ///
413    /// Checks for stale MASQUERADE rules referencing the nucleus subnet that
414    /// have no corresponding running container. Prevents gradual degradation
415    /// of network isolation from accumulated orphaned rules.
416    pub fn cleanup_orphaned_rules(subnet: &str) {
417        // List NAT rules and look for nucleus-related MASQUERADE entries
418        let output = match Command::new("iptables")
419            .args(["-t", "nat", "-L", "POSTROUTING", "-n"])
420            .output()
421        {
422            Ok(o) => o,
423            Err(e) => {
424                debug!("Cannot check iptables for orphaned rules: {}", e);
425                return;
426            }
427        };
428
429        let stdout = String::from_utf8_lossy(&output.stdout);
430        let mut orphaned_count = 0u32;
431        for line in stdout.lines() {
432            if line.contains("MASQUERADE") && line.contains(subnet) {
433                // Try to remove it; if it fails, it may be actively used
434                let _ = Self::run_cmd(
435                    "iptables",
436                    &[
437                        "-t",
438                        "nat",
439                        "-D",
440                        "POSTROUTING",
441                        "-s",
442                        subnet,
443                        "-j",
444                        "MASQUERADE",
445                    ],
446                );
447                orphaned_count += 1;
448            }
449        }
450
451        if orphaned_count > 0 {
452            info!(
453                "Cleaned up {} orphaned iptables MASQUERADE rule(s) for subnet {}",
454                orphaned_count, subnet
455            );
456        }
457    }
458
459    fn ensure_bridge_for(bridge_name: &str, subnet: &str) -> Result<()> {
460        if netlink::link_exists(bridge_name) {
461            return Ok(());
462        }
463
464        netlink::create_bridge(bridge_name)?;
465
466        let gateway = Self::gateway_from_subnet(subnet);
467        let gateway_addr: Ipv4Addr = gateway.parse().map_err(|e| {
468            NucleusError::NetworkError(format!("invalid bridge gateway '{}': {}", gateway, e))
469        })?;
470        netlink::add_addr(bridge_name, gateway_addr, Self::subnet_prefix(subnet))?;
471        netlink::set_link_up(bridge_name)?;
472
473        info!("Created bridge {}", bridge_name);
474        Ok(())
475    }
476
477    fn setup_port_forward_for(container_ip: &str, pf: &PortForward) -> Result<()> {
478        for chain in ["PREROUTING", "OUTPUT"] {
479            let args = Self::port_forward_rule_args("-A", chain, container_ip, pf);
480            Self::run_cmd_owned("iptables", &args)?;
481        }
482
483        let host_ip = pf
484            .host_ip
485            .map(|ip| ip.to_string())
486            .unwrap_or_else(|| "0.0.0.0".to_string());
487        info!(
488            "Port forward: {}:{} -> {}:{}/{}",
489            host_ip, pf.host_port, container_ip, pf.container_port, pf.protocol
490        );
491        Ok(())
492    }
493
494    fn cleanup_port_forward(&self, pf: &PortForward) -> Result<()> {
495        for chain in ["OUTPUT", "PREROUTING"] {
496            let args = Self::port_forward_rule_args("-D", chain, &self.container_ip, pf);
497            Self::run_cmd_owned("iptables", &args)?;
498        }
499        Ok(())
500    }
501
502    /// Allocate a container IP from the subnet using /dev/urandom.
503    ///
504    /// Checks both host-visible interfaces (via `ip addr`) and IPs assigned to
505    /// other Nucleus containers (via state files) to avoid duplicates. Container
506    /// IPs inside network namespaces are invisible to `ip addr show` on the host.
507    fn allocate_ip_with_reserved(
508        subnet: &str,
509        reserved: &std::collections::HashSet<String>,
510    ) -> Result<String> {
511        let base = subnet.split('/').next().unwrap_or("10.0.42.0");
512        let parts: Vec<&str> = base.split('.').collect();
513        if parts.len() != 4 {
514            return Ok("10.0.42.2".to_string());
515        }
516
517        // Use rejection sampling to avoid modulo bias.
518        // Range is 2..=254 (253 values). We reject random bytes >= 253 to
519        // ensure uniform distribution, then add 2 to shift into the valid range.
520        // Open /dev/urandom once and read all randomness in a single batch.
521        // 128 bytes gives ~125 valid candidates (byte < 253), making exhaustion
522        // in a populated subnet far less likely than the previous 32-byte buffer.
523        let mut rand_buf = [0u8; 128];
524        let mut urandom = Self::open_dev_urandom()?;
525        std::io::Read::read_exact(&mut urandom, &mut rand_buf).map_err(|e| {
526            NucleusError::NetworkError(format!("Failed to read /dev/urandom: {}", e))
527        })?;
528        for &byte in &rand_buf {
529            // Rejection sampling: discard values that would cause modulo bias
530            if byte >= 253 {
531                continue;
532            }
533            let offset = byte as u32 + 2;
534            let candidate = format!("{}.{}.{}.{}", parts[0], parts[1], parts[2], offset);
535            if reserved.contains(&candidate) {
536                continue;
537            }
538            if !Self::is_ip_in_use(&candidate)? {
539                // Lock is released when lock_file is dropped
540                return Ok(candidate);
541            }
542        }
543
544        Err(NucleusError::NetworkError(format!(
545            "Failed to allocate free IP in subnet {}",
546            subnet
547        )))
548    }
549
550    fn reserve_ip_in_dir(
551        alloc_dir: &std::path::Path,
552        container_id: &str,
553        subnet: &str,
554        requested_ip: Option<&str>,
555    ) -> Result<String> {
556        Self::ensure_alloc_dir(alloc_dir)?;
557        let lock_path = alloc_dir.join(".lock");
558        let lock_file = std::fs::OpenOptions::new()
559            .create(true)
560            .write(true)
561            .truncate(false)
562            .open(&lock_path)
563            .map_err(|e| {
564                NucleusError::NetworkError(format!("Failed to open IP alloc lock: {}", e))
565            })?;
566        // SAFETY: lock_file is a valid open fd. LOCK_EX is a blocking exclusive
567        // lock that is released when the fd is closed (end of scope).
568        let lock_ret = unsafe { libc::flock(lock_file.as_raw_fd(), libc::LOCK_EX) };
569        if lock_ret != 0 {
570            return Err(NucleusError::NetworkError(format!(
571                "Failed to acquire IP alloc lock: {}",
572                std::io::Error::last_os_error()
573            )));
574        }
575
576        let reserved = Self::collect_reserved_ips_in_dir(alloc_dir);
577        let ip = match requested_ip {
578            Some(ip) => {
579                if reserved.contains(ip) || Self::is_ip_in_use(ip)? {
580                    return Err(NucleusError::NetworkError(format!(
581                        "Requested container IP {} is already in use",
582                        ip
583                    )));
584                }
585                ip.to_string()
586            }
587            None => Self::allocate_ip_with_reserved(subnet, &reserved)?,
588        };
589
590        Self::record_allocated_ip_in_dir(alloc_dir, container_id, &ip)?;
591        Ok(ip)
592    }
593
594    /// Scan the Nucleus IP allocation directory for IPs already assigned.
595    fn collect_reserved_ips_in_dir(
596        alloc_dir: &std::path::Path,
597    ) -> std::collections::HashSet<String> {
598        let mut ips = std::collections::HashSet::new();
599        if let Ok(entries) = std::fs::read_dir(alloc_dir) {
600            for entry in entries.flatten() {
601                if let Some(name) = entry.file_name().to_str() {
602                    if name.ends_with(".ip") {
603                        if let Ok(ip) = std::fs::read_to_string(entry.path()) {
604                            let ip = ip.trim().to_string();
605                            if !ip.is_empty() {
606                                ips.insert(ip);
607                            }
608                        }
609                    }
610                }
611            }
612        }
613        ips
614    }
615
616    /// Persist the allocated IP for this container so other containers can see it.
617    fn record_allocated_ip_in_dir(
618        alloc_dir: &std::path::Path,
619        container_id: &str,
620        ip: &str,
621    ) -> Result<()> {
622        Self::ensure_alloc_dir(alloc_dir)?;
623        let path = alloc_dir.join(format!("{}.ip", container_id));
624        std::fs::write(&path, ip).map_err(|e| {
625            NucleusError::NetworkError(format!("Failed to record IP allocation: {}", e))
626        })?;
627        Ok(())
628    }
629
630    /// Remove the persisted IP allocation for a container.
631    fn release_allocated_ip(container_id: &str) {
632        let alloc_dir = Self::ip_alloc_dir();
633        Self::release_allocated_ip_in_dir(&alloc_dir, container_id);
634    }
635
636    fn release_allocated_ip_in_dir(alloc_dir: &std::path::Path, container_id: &str) {
637        let path = alloc_dir.join(format!("{}.ip", container_id));
638        let _ = std::fs::remove_file(path);
639    }
640
641    /// Create the IP allocation directory with restrictive permissions (0700)
642    /// and reject symlinked paths to prevent symlink attacks.
643    fn ensure_alloc_dir(alloc_dir: &std::path::Path) -> Result<()> {
644        // L11: Check for symlinks BEFORE creating directories to avoid TOCTOU.
645        // If the path already exists, verify it's not a symlink.
646        if alloc_dir.exists() {
647            if let Ok(meta) = std::fs::symlink_metadata(alloc_dir) {
648                if meta.file_type().is_symlink() {
649                    return Err(NucleusError::NetworkError(format!(
650                        "IP alloc dir {:?} is a symlink, refusing to use",
651                        alloc_dir
652                    )));
653                }
654            }
655        }
656        // Also check parent directory for symlinks
657        if let Some(parent) = alloc_dir.parent() {
658            if let Ok(meta) = std::fs::symlink_metadata(parent) {
659                if meta.file_type().is_symlink() {
660                    return Err(NucleusError::NetworkError(format!(
661                        "IP alloc dir parent {:?} is a symlink, refusing to use",
662                        parent
663                    )));
664                }
665            }
666        }
667
668        std::fs::create_dir_all(alloc_dir).map_err(|e| {
669            NucleusError::NetworkError(format!("Failed to create IP alloc dir: {}", e))
670        })?;
671
672        // Restrict permissions to owner-only atomically after creation
673        use std::os::unix::fs::PermissionsExt;
674        let perms = std::fs::Permissions::from_mode(0o700);
675        std::fs::set_permissions(alloc_dir, perms).map_err(|e| {
676            NucleusError::NetworkError(format!(
677                "Failed to set permissions on IP alloc dir {:?}: {}",
678                alloc_dir, e
679            ))
680        })?;
681
682        // Re-verify no symlink replacement after permissions were set
683        if let Ok(meta) = std::fs::symlink_metadata(alloc_dir) {
684            if meta.file_type().is_symlink() {
685                return Err(NucleusError::NetworkError(format!(
686                    "IP alloc dir {:?} was replaced with a symlink during setup",
687                    alloc_dir
688                )));
689            }
690        }
691        Ok(())
692    }
693
694    fn ip_alloc_dir() -> std::path::PathBuf {
695        if nix::unistd::Uid::effective().is_root() {
696            std::path::PathBuf::from("/var/run/nucleus/ip-alloc")
697        } else {
698            dirs::runtime_dir()
699                .map(|d| d.join("nucleus/ip-alloc"))
700                .or_else(|| dirs::data_local_dir().map(|d| d.join("nucleus/ip-alloc")))
701                .unwrap_or_else(|| {
702                    dirs::home_dir()
703                        .map(|h| h.join(".nucleus/ip-alloc"))
704                        .unwrap_or_else(|| std::path::PathBuf::from("/var/run/nucleus/ip-alloc"))
705                })
706        }
707    }
708
709    /// Read the start time (field 22) from /proc/<pid>/stat to detect PID recycling.
710    /// Returns 0 if the process does not exist or the field cannot be parsed.
711    fn read_pid_start_ticks(pid: u32) -> u64 {
712        let stat_path = format!("/proc/{}/stat", pid);
713        if let Ok(content) = std::fs::read_to_string(&stat_path) {
714            // Field 22 is starttime. The comm field (2) may contain spaces/parens,
715            // so find the last ')' and count fields from there.
716            if let Some(after_comm) = content.rfind(')') {
717                return content[after_comm + 2..]
718                    .split_whitespace()
719                    .nth(19) // field 22 is 20th after the ')' + state field
720                    .and_then(|s| s.parse().ok())
721                    .unwrap_or(0);
722            }
723        }
724        0
725    }
726
727    /// Get gateway IP from subnet (first usable address)
728    fn gateway_from_subnet(subnet: &str) -> String {
729        let base = subnet.split('/').next().unwrap_or("10.0.42.0");
730        let parts: Vec<&str> = base.split('.').collect();
731        if parts.len() == 4 {
732            format!("{}.{}.{}.1", parts[0], parts[1], parts[2])
733        } else {
734            "10.0.42.1".to_string()
735        }
736    }
737
738    fn subnet_prefix(subnet: &str) -> u8 {
739        subnet
740            .split_once('/')
741            .and_then(|(_, p)| p.parse::<u8>().ok())
742            .filter(|p| *p <= 32)
743            .unwrap_or(24)
744    }
745
746    /// Resolve a system binary to a validated absolute path.
747    ///
748    /// When running as root, searches known sysadmin paths and validates
749    /// ownership and permissions before use. When unprivileged, uses
750    /// `which`-style PATH resolution but still validates the result.
751    /// Returns an error if no valid binary is found.
752    fn resolve_bin(name: &str) -> Result<String> {
753        let search_dirs: &[&str] = match name {
754            "iptables" => &["/usr/sbin/iptables", "/sbin/iptables", "/usr/bin/iptables"],
755            _ => &[],
756        };
757
758        for path in search_dirs {
759            let p = std::path::Path::new(path);
760            if p.exists() {
761                Self::validate_network_binary(p, name)?;
762                return Ok(path.to_string());
763            }
764        }
765
766        // Fallback: resolve via PATH, but validate the result
767        if let Ok(output) = Command::new("which").arg(name).output() {
768            if output.status.success() {
769                let resolved = String::from_utf8_lossy(&output.stdout).trim().to_string();
770                if !resolved.is_empty() {
771                    let p = std::path::Path::new(&resolved);
772                    Self::validate_network_binary(p, name)?;
773                    return Ok(resolved);
774                }
775            }
776        }
777
778        Err(NucleusError::NetworkError(format!(
779            "Required binary '{}' not found or failed validation",
780            name
781        )))
782    }
783
784    /// Validate a network binary's ownership and permissions.
785    /// Rejects binaries that are group/world-writable or not owned by root/euid.
786    fn validate_network_binary(path: &std::path::Path, name: &str) -> Result<()> {
787        use std::os::unix::fs::MetadataExt;
788
789        let meta = std::fs::metadata(path)
790            .map_err(|e| NucleusError::NetworkError(format!("Cannot stat {}: {}", name, e)))?;
791        let mode = meta.mode();
792        if mode & 0o022 != 0 {
793            return Err(NucleusError::NetworkError(format!(
794                "Binary '{}' at {:?} is writable by group/others (mode {:o}), refusing to execute",
795                name, path, mode
796            )));
797        }
798        let owner = meta.uid();
799        let euid = nix::unistd::Uid::effective().as_raw();
800        if owner != 0 && owner != euid {
801            return Err(NucleusError::NetworkError(format!(
802                "Binary '{}' at {:?} owned by UID {} (expected root or euid {}), refusing to execute",
803                name, path, owner, euid
804            )));
805        }
806        Ok(())
807    }
808
809    fn run_cmd(program: &str, args: &[&str]) -> Result<()> {
810        let resolved = Self::resolve_bin(program)?;
811        let output = Command::new(&resolved).args(args).output().map_err(|e| {
812            NucleusError::NetworkError(format!("Failed to run {} {:?}: {}", resolved, args, e))
813        })?;
814
815        if !output.status.success() {
816            let stderr = String::from_utf8_lossy(&output.stderr);
817            return Err(NucleusError::NetworkError(format!(
818                "{} {:?} failed: {}",
819                program, args, stderr
820            )));
821        }
822
823        Ok(())
824    }
825
826    fn run_cmd_owned(program: &str, args: &[String]) -> Result<()> {
827        let refs: Vec<&str> = args.iter().map(String::as_str).collect();
828        Self::run_cmd(program, &refs)
829    }
830
831    fn port_forward_rule_args(
832        operation: &str,
833        chain: &str,
834        container_ip: &str,
835        pf: &PortForward,
836    ) -> Vec<String> {
837        let mut args = vec![
838            "-t".to_string(),
839            "nat".to_string(),
840            operation.to_string(),
841            chain.to_string(),
842            "-p".to_string(),
843            pf.protocol.as_str().to_string(),
844        ];
845
846        if chain == "OUTPUT" {
847            args.extend([
848                "-m".to_string(),
849                "addrtype".to_string(),
850                "--dst-type".to_string(),
851                "LOCAL".to_string(),
852            ]);
853        }
854
855        if let Some(host_ip) = pf.host_ip {
856            args.extend(["-d".to_string(), host_ip.to_string()]);
857        }
858
859        args.extend([
860            "--dport".to_string(),
861            pf.host_port.to_string(),
862            "-j".to_string(),
863            "DNAT".to_string(),
864            "--to-destination".to_string(),
865            format!("{}:{}", container_ip, pf.container_port),
866        ]);
867
868        args
869    }
870
871    fn is_ip_in_use(ip: &str) -> Result<bool> {
872        let addr: Ipv4Addr = ip
873            .parse()
874            .map_err(|e| NucleusError::NetworkError(format!("invalid IP '{}': {}", ip, e)))?;
875        netlink::is_addr_in_use(&addr)
876    }
877
878    /// Write resolv.conf inside container (for writable /etc, e.g. agent mode)
879    pub fn write_resolv_conf(root: &std::path::Path, dns: &[String]) -> Result<()> {
880        let resolv_path = root.join("etc/resolv.conf");
881        let content: String = dns
882            .iter()
883            .map(|server| format!("nameserver {}\n", server))
884            .collect();
885        std::fs::write(&resolv_path, content).map_err(|e| {
886            NucleusError::NetworkError(format!("Failed to write resolv.conf: {}", e))
887        })?;
888        Ok(())
889    }
890
891    /// Bind-mount a resolv.conf over a read-only /etc (for production rootfs mode).
892    ///
893    /// Creates a memfd-backed resolv.conf and bind-mounts it over
894    /// /etc/resolv.conf so it works even when the rootfs /etc is read-only.
895    /// The memfd is cleaned up when the container exits.
896    pub fn bind_mount_resolv_conf(root: &std::path::Path, dns: &[String]) -> Result<()> {
897        use nix::mount::{mount, MsFlags};
898
899        let content: String = dns
900            .iter()
901            .map(|server| format!("nameserver {}\n", server))
902            .collect();
903
904        // Create a memfd-backed file to avoid leaving staging files on disk
905        let memfd_name = std::ffi::CString::new("nucleus-resolv").map_err(|e| {
906            NucleusError::NetworkError(format!("Failed to create memfd name: {}", e))
907        })?;
908        // SAFETY: memfd_name is a valid NUL-terminated CString. memfd_create
909        // returns a new fd or -1 on error; we check for error below.
910        let raw_fd = unsafe { libc::memfd_create(memfd_name.as_ptr(), 0) };
911        if raw_fd < 0 {
912            // Fallback to staging file if memfd_create is unavailable
913            return Self::bind_mount_resolv_conf_staging(root, dns);
914        }
915        // SAFETY: raw_fd is a valid, newly-created fd from memfd_create.
916        // OwnedFd takes ownership and will close it exactly once on drop,
917        // preventing double-close on any error path.
918        let memfd = unsafe { std::os::fd::OwnedFd::from_raw_fd(raw_fd) };
919
920        // Write content to memfd
921        // SAFETY: memfd is a valid open fd. content is a valid byte buffer
922        // with correct length. write() may return -1 on error.
923        let write_result = unsafe {
924            libc::write(
925                memfd.as_raw_fd(),
926                content.as_ptr() as *const libc::c_void,
927                content.len(),
928            )
929        };
930        if write_result < 0 {
931            // memfd dropped here, closing the fd automatically
932            return Self::bind_mount_resolv_conf_staging(root, dns);
933        }
934
935        // Ensure the mount target exists
936        let target = root.join("etc/resolv.conf");
937        if !target.exists() {
938            let _ = std::fs::write(&target, "");
939        }
940
941        // Bind mount the memfd over the read-only resolv.conf
942        let memfd_path = format!("/proc/self/fd/{}", memfd.as_raw_fd());
943        mount(
944            Some(memfd_path.as_str()),
945            &target,
946            None::<&str>,
947            MsFlags::MS_BIND,
948            None::<&str>,
949        )
950        .map_err(|e| {
951            // memfd dropped here via the returned Err, closing the fd automatically
952            NucleusError::NetworkError(format!("Failed to bind mount resolv.conf: {}", e))
953        })?;
954
955        // memfd dropped here — the mount holds a kernel reference to the file,
956        // so it survives the fd close.
957
958        info!("Bind-mounted resolv.conf for bridge networking (rootfs mode, memfd)");
959        Ok(())
960    }
961
962    /// Fallback: bind-mount a staging resolv.conf file.
963    fn bind_mount_resolv_conf_staging(root: &std::path::Path, dns: &[String]) -> Result<()> {
964        use nix::mount::{mount, MsFlags};
965
966        let content: String = dns
967            .iter()
968            .map(|server| format!("nameserver {}\n", server))
969            .collect();
970
971        // Write to a staging file outside /etc
972        let staging = root.join("tmp/.resolv.conf.nucleus");
973        if let Some(parent) = staging.parent() {
974            std::fs::create_dir_all(parent).map_err(|e| {
975                NucleusError::NetworkError(format!(
976                    "Failed to create resolv.conf staging parent: {}",
977                    e
978                ))
979            })?;
980        }
981        std::fs::write(&staging, content).map_err(|e| {
982            NucleusError::NetworkError(format!("Failed to write staging resolv.conf: {}", e))
983        })?;
984
985        // Ensure the mount target exists
986        let target = root.join("etc/resolv.conf");
987        if !target.exists() {
988            let _ = std::fs::write(&target, "");
989        }
990
991        // Bind mount the staging file over the read-only resolv.conf
992        mount(
993            Some(staging.as_path()),
994            &target,
995            None::<&str>,
996            MsFlags::MS_BIND,
997            None::<&str>,
998        )
999        .map_err(|e| {
1000            NucleusError::NetworkError(format!("Failed to bind mount resolv.conf: {}", e))
1001        })?;
1002
1003        // The bind mount holds a reference to the inode, so we can safely
1004        // unlink the staging path to avoid leaking DNS server info on disk.
1005        if let Err(e) = std::fs::remove_file(&staging) {
1006            warn!("Failed to remove staging resolv.conf {:?}: {}", staging, e);
1007        }
1008
1009        info!("Bind-mounted resolv.conf for bridge networking (rootfs mode, staging)");
1010        Ok(())
1011    }
1012}
1013
1014impl Drop for BridgeNetwork {
1015    fn drop(&mut self) {
1016        self.cleanup_best_effort();
1017    }
1018}
1019
1020struct SetupRollback {
1021    veth_host: String,
1022    subnet: String,
1023    veth_created: bool,
1024    nat_added: bool,
1025    port_forwards: Vec<(String, PortForward)>,
1026    prev_ip_forward: Option<String>,
1027    reserved_ip: Option<(std::path::PathBuf, String)>,
1028    armed: bool,
1029}
1030
1031impl SetupRollback {
1032    fn new(
1033        veth_host: String,
1034        subnet: String,
1035        reserved_ip: Option<(std::path::PathBuf, String)>,
1036    ) -> Self {
1037        Self {
1038            veth_host,
1039            subnet,
1040            veth_created: false,
1041            nat_added: false,
1042            port_forwards: Vec::new(),
1043            prev_ip_forward: None,
1044            reserved_ip,
1045            armed: true,
1046        }
1047    }
1048
1049    fn disarm(&mut self) {
1050        self.armed = false;
1051    }
1052}
1053
1054impl Drop for SetupRollback {
1055    fn drop(&mut self) {
1056        if !self.armed {
1057            return;
1058        }
1059
1060        for (container_ip, pf) in self.port_forwards.iter().rev() {
1061            for chain in ["OUTPUT", "PREROUTING"] {
1062                let args = BridgeNetwork::port_forward_rule_args("-D", chain, container_ip, pf);
1063                if let Err(e) = BridgeNetwork::run_cmd_owned("iptables", &args) {
1064                    warn!(
1065                        "Rollback: failed to remove iptables {} rule for {}: {}",
1066                        chain, container_ip, e
1067                    );
1068                }
1069            }
1070        }
1071
1072        if self.nat_added {
1073            if let Err(e) = BridgeNetwork::run_cmd(
1074                "iptables",
1075                &[
1076                    "-t",
1077                    "nat",
1078                    "-D",
1079                    "POSTROUTING",
1080                    "-s",
1081                    &self.subnet,
1082                    "-j",
1083                    "MASQUERADE",
1084                ],
1085            ) {
1086                warn!("Rollback: failed to remove NAT rule: {}", e);
1087            }
1088        }
1089
1090        if self.veth_created {
1091            if let Err(e) = netlink::del_link(&self.veth_host) {
1092                warn!("Rollback: failed to delete veth {}: {}", self.veth_host, e);
1093            }
1094        }
1095
1096        if let Some((alloc_dir, container_id)) = &self.reserved_ip {
1097            BridgeNetwork::release_allocated_ip_in_dir(alloc_dir, container_id);
1098        }
1099    }
1100}
1101
#[cfg(test)]
mod tests {
    use super::*;

    /// True when `args` contains `first` immediately followed by `second`.
    fn has_adjacent(args: &[String], first: &str, second: &str) -> bool {
        args.windows(2)
            .any(|pair| pair[0] == first && pair[1] == second)
    }

    #[test]
    fn test_ip_allocation_rejection_sampling_range() {
        // H-5: Verify that rejection sampling produces values in 2..=254
        // and that values >= 253 are rejected (no modulo bias).
        // NOTE(review): this checks the arithmetic only; it does not call the
        // allocation code itself.
        (0u8..253).map(|byte| byte as u32 + 2).for_each(|offset| {
            assert!(
                (2..=254).contains(&offset),
                "offset {} out of range",
                offset
            );
        });

        // Values 253, 254, 255 must be rejected
        [253u8, 254, 255].iter().copied().for_each(|byte| {
            assert!(byte >= 253);
        });
    }

    #[test]
    fn test_reserve_ip_blocks_duplicate_requested_address() {
        let alloc_dir = tempfile::tempdir().unwrap();
        BridgeNetwork::record_allocated_ip_in_dir(alloc_dir.path(), "one", "10.0.42.2").unwrap();

        // A second container asking for the same address must be refused.
        let second_attempt = BridgeNetwork::reserve_ip_in_dir(
            alloc_dir.path(),
            "two",
            "10.0.42.0/24",
            Some("10.0.42.2"),
        );
        let err = second_attempt.unwrap_err();

        assert!(
            err.to_string().contains("already in use"),
            "second reservation of the same IP must fail"
        );
    }

    #[test]
    fn test_setup_rollback_releases_reserved_ip() {
        let alloc_dir = tempfile::tempdir().unwrap();
        BridgeNetwork::record_allocated_ip_in_dir(alloc_dir.path(), "rollback", "10.0.42.3")
            .unwrap();

        {
            // A freshly built guard is armed with nothing created, so leaving
            // this scope should only release the reservation.
            let _armed_guard = SetupRollback::new(
                "veth-test".to_string(),
                "10.0.42.0/24".to_string(),
                Some((alloc_dir.path().to_path_buf(), "rollback".to_string())),
            );
        }

        let reservation_file = alloc_dir.path().join("rollback.ip");
        assert!(
            !reservation_file.exists(),
            "rollback must release reserved IP files on setup failure"
        );
    }

    #[test]
    fn test_port_forward_rules_include_output_chain_for_local_host_clients() {
        let pf = PortForward {
            host_ip: None,
            host_port: 8080,
            container_port: 80,
            protocol: crate::network::config::Protocol::Tcp,
        };

        let prerouting_args =
            BridgeNetwork::port_forward_rule_args("-A", "PREROUTING", "10.0.42.2", &pf);
        let output_args = BridgeNetwork::port_forward_rule_args("-A", "OUTPUT", "10.0.42.2", &pf);

        assert!(prerouting_args.iter().any(|arg| arg == "PREROUTING"));
        assert!(output_args.iter().any(|arg| arg == "OUTPUT"));
        assert!(
            has_adjacent(&output_args, "--dst-type", "LOCAL"),
            "OUTPUT rule must target local-destination traffic"
        );
    }

    #[test]
    fn test_port_forward_rules_include_host_ip_when_configured() {
        let pf = PortForward {
            host_ip: Some(std::net::Ipv4Addr::new(127, 0, 0, 1)),
            host_port: 4173,
            container_port: 4173,
            protocol: crate::network::config::Protocol::Tcp,
        };

        for chain in ["PREROUTING", "OUTPUT"] {
            let args = BridgeNetwork::port_forward_rule_args("-A", chain, "10.0.42.2", &pf);
            assert!(
                has_adjacent(&args, "-d", "127.0.0.1"),
                "port forward must restrict DNAT rules to the configured host IP"
            );
        }
    }
}