Skip to main content

nucleus/network/
bridge.rs

1use crate::error::{NucleusError, Result, StateTransition};
2use crate::network::config::{BridgeConfig, EgressPolicy, PortForward};
3use crate::network::NetworkState;
4use std::os::fd::FromRawFd;
5use std::os::unix::io::AsRawFd;
6use std::process::Command;
7use tracing::{debug, info, warn};
8
/// Bridge network manager.
///
/// Represents the host-side networking created for one container by
/// `setup` / `setup_with_id`, and carries what `cleanup` needs to undo it.
pub struct BridgeNetwork {
    /// Copy of the bridge configuration the network was set up with.
    config: BridgeConfig,
    /// IPv4 address reserved for the container.
    container_ip: String,
    /// Name of the host-side end of the veth pair.
    veth_host: String,
    /// Identifier under which the IP reservation is recorded on disk.
    container_id: String,
    /// Contents of `/proc/sys/net/ipv4/ip_forward` read before forwarding was
    /// enabled; `None` when the previous value could not be read.
    prev_ip_forward: Option<String>,
    /// Current position in the network lifecycle state machine.
    state: NetworkState,
}
18
19impl BridgeNetwork {
20    /// Set up bridge networking for a container
21    ///
22    /// Creates bridge, veth pair, assigns IPs, enables NAT.
23    /// Must be called from the parent process after fork (needs host netns).
24    ///
25    /// State transitions: Unconfigured -> Configuring -> Active
26    pub fn setup(pid: u32, config: &BridgeConfig) -> Result<Self> {
27        Self::setup_for(pid, config, &format!("{:x}", pid))
28    }
29
30    /// Set up bridge networking with an explicit container ID for IP tracking.
31    pub fn setup_with_id(pid: u32, config: &BridgeConfig, container_id: &str) -> Result<Self> {
32        Self::setup_for(pid, config, container_id)
33    }
34
35    fn setup_for(pid: u32, config: &BridgeConfig, container_id: &str) -> Result<Self> {
36        // Validate all network parameters before using them in shell commands
37        config.validate()?;
38
39        let mut net_state = NetworkState::Unconfigured;
40        net_state = net_state.transition(NetworkState::Configuring)?;
41
42        let alloc_dir = Self::ip_alloc_dir();
43        let container_ip = Self::reserve_ip_in_dir(
44            &alloc_dir,
45            container_id,
46            &config.subnet,
47            config.container_ip.as_deref(),
48        )?;
49        let prefix = Self::subnet_prefix(&config.subnet);
50
51        // Linux interface names max 15 chars; truncate if needed
52        let veth_host_full = format!("veth-{:x}", pid);
53        let veth_cont_full = format!("vethc-{:x}", pid);
54        let veth_host = veth_host_full[..veth_host_full.len().min(15)].to_string();
55        let veth_container = veth_cont_full[..veth_cont_full.len().min(15)].to_string();
56        let mut rollback = SetupRollback::new(
57            veth_host.clone(),
58            config.subnet.clone(),
59            Some((alloc_dir.clone(), container_id.to_string())),
60        );
61
62        // 1. Create bridge if it doesn't exist
63        Self::ensure_bridge_for(&config.bridge_name, &config.subnet)?;
64
65        // 2. Create veth pair
66        Self::run_cmd(
67            "ip",
68            &[
69                "link",
70                "add",
71                &veth_host,
72                "type",
73                "veth",
74                "peer",
75                "name",
76                &veth_container,
77            ],
78        )?;
79        rollback.veth_created = true;
80
81        // 3. Attach host end to bridge
82        Self::run_cmd(
83            "ip",
84            &["link", "set", &veth_host, "master", &config.bridge_name],
85        )?;
86        Self::run_cmd("ip", &["link", "set", &veth_host, "up"])?;
87
88        // 4. Move container end to container's network namespace
89        Self::run_cmd(
90            "ip",
91            &["link", "set", &veth_container, "netns", &pid.to_string()],
92        )?;
93
94        // 5. Configure container interface (inside container netns via nsenter).
95        // Capture the process start time from /proc to detect PID recycling
96        // between the caller passing the PID and our nsenter invocations.
97        let pid_str = pid.to_string();
98        let start_ticks = Self::read_pid_start_ticks(pid);
99        if start_ticks == 0 {
100            drop(rollback);
101            return Err(NucleusError::NetworkError(format!(
102                "Cannot read start_ticks for PID {} — process may have exited",
103                pid
104            )));
105        }
106
107        Self::run_cmd(
108            "nsenter",
109            &[
110                "-t",
111                &pid_str,
112                "-n",
113                "ip",
114                "addr",
115                "add",
116                &format!("{}/{}", container_ip, prefix),
117                "dev",
118                &veth_container,
119            ],
120        )?;
121        Self::run_cmd(
122            "nsenter",
123            &[
124                "-t",
125                &pid_str,
126                "-n",
127                "ip",
128                "link",
129                "set",
130                &veth_container,
131                "up",
132            ],
133        )?;
134        Self::run_cmd(
135            "nsenter",
136            &["-t", &pid_str, "-n", "ip", "link", "set", "lo", "up"],
137        )?;
138
139        // Verify PID was not recycled during nsenter operations
140        let current_ticks = Self::read_pid_start_ticks(pid);
141        if current_ticks != start_ticks {
142            drop(rollback);
143            return Err(NucleusError::NetworkError(format!(
144                "PID {} was recycled during network setup (start_ticks changed: {} -> {})",
145                pid, start_ticks, current_ticks
146            )));
147        }
148
149        // 6. Set default route in container
150        let gateway = Self::gateway_from_subnet(&config.subnet);
151        Self::run_cmd(
152            "nsenter",
153            &[
154                "-t", &pid_str, "-n", "ip", "route", "add", "default", "via", &gateway,
155            ],
156        )?;
157
158        // 7. Enable NAT (masquerade) on the host
159        Self::run_cmd(
160            "iptables",
161            &[
162                "-t",
163                "nat",
164                "-A",
165                "POSTROUTING",
166                "-s",
167                &config.subnet,
168                "-j",
169                "MASQUERADE",
170            ],
171        )?;
172        rollback.nat_added = true;
173
174        // 8. Enable IP forwarding (save previous value for restore on cleanup)
175        let prev_ip_forward = match std::fs::read_to_string("/proc/sys/net/ipv4/ip_forward") {
176            Ok(v) => Some(v.trim().to_string()),
177            Err(e) => {
178                warn!(
179                    "Could not read ip_forward state (will not restore on cleanup): {}",
180                    e
181                );
182                None
183            }
184        };
185        rollback.prev_ip_forward = prev_ip_forward;
186        std::fs::write("/proc/sys/net/ipv4/ip_forward", "1").map_err(|e| {
187            NucleusError::NetworkError(format!("Failed to enable IP forwarding: {}", e))
188        })?;
189
190        // 9. Set up port forwarding rules
191        for pf in &config.port_forwards {
192            Self::setup_port_forward_for(&container_ip, pf)?;
193            rollback
194                .port_forwards
195                .push((container_ip.clone(), pf.clone()));
196        }
197
198        net_state = net_state.transition(NetworkState::Active)?;
199
200        info!(
201            "Bridge network configured: {} -> {} (IP: {})",
202            veth_host, veth_container, container_ip
203        );
204        let prev_ip_forward = rollback.prev_ip_forward.clone();
205        rollback.disarm();
206
207        Ok(Self {
208            config: config.clone(),
209            container_ip,
210            veth_host,
211            container_id: container_id.to_string(),
212            prev_ip_forward,
213            state: net_state,
214        })
215    }
216
217    /// Apply egress policy rules inside the container's network namespace.
218    ///
219    /// Uses iptables OUTPUT chain to restrict outbound connections.
220    /// Must be called after bridge setup while the container netns is reachable.
221    pub fn apply_egress_policy(&self, pid: u32, policy: &EgressPolicy) -> Result<()> {
222        // Validate egress CIDRs before passing to iptables
223        for cidr in &policy.allowed_cidrs {
224            crate::network::config::validate_egress_cidr(cidr)
225                .map_err(|e| NucleusError::NetworkError(format!("Invalid egress CIDR: {}", e)))?;
226        }
227
228        let pid_str = pid.to_string();
229
230        // M15: Set DROP policy BEFORE flushing rules to avoid a window where
231        // all egress is unrestricted. The order is: DROP -> flush -> add rules.
232        Self::run_cmd(
233            "nsenter",
234            &["-t", &pid_str, "-n", "iptables", "-P", "OUTPUT", "DROP"],
235        )?;
236        // Flush any existing OUTPUT rules to prevent duplication on repeated calls
237        Self::run_cmd(
238            "nsenter",
239            &["-t", &pid_str, "-n", "iptables", "-F", "OUTPUT"],
240        )?;
241
242        // Default policy: drop all OUTPUT (except established/related and loopback)
243        Self::run_cmd(
244            "nsenter",
245            &[
246                "-t", &pid_str, "-n", "iptables", "-A", "OUTPUT", "-o", "lo", "-j", "ACCEPT",
247            ],
248        )?;
249
250        Self::run_cmd(
251            "nsenter",
252            &[
253                "-t",
254                &pid_str,
255                "-n",
256                "iptables",
257                "-A",
258                "OUTPUT",
259                "-m",
260                "conntrack",
261                "--ctstate",
262                "ESTABLISHED,RELATED",
263                "-j",
264                "ACCEPT",
265            ],
266        )?;
267
268        // Allow DNS to configured resolvers (only when policy permits it)
269        if policy.allow_dns {
270            for dns in &self.config.dns {
271                Self::run_cmd(
272                    "nsenter",
273                    &[
274                        "-t", &pid_str, "-n", "iptables", "-A", "OUTPUT", "-p", "udp", "-d", dns,
275                        "--dport", "53", "-j", "ACCEPT",
276                    ],
277                )?;
278                Self::run_cmd(
279                    "nsenter",
280                    &[
281                        "-t", &pid_str, "-n", "iptables", "-A", "OUTPUT", "-p", "tcp", "-d", dns,
282                        "--dport", "53", "-j", "ACCEPT",
283                    ],
284                )?;
285            }
286        }
287
288        // Allow traffic to each permitted CIDR
289        for cidr in &policy.allowed_cidrs {
290            if policy.allowed_tcp_ports.is_empty() && policy.allowed_udp_ports.is_empty() {
291                // Allow all ports to this CIDR
292                Self::run_cmd(
293                    "nsenter",
294                    &[
295                        "-t", &pid_str, "-n", "iptables", "-A", "OUTPUT", "-d", cidr, "-j",
296                        "ACCEPT",
297                    ],
298                )?;
299            } else {
300                for port in &policy.allowed_tcp_ports {
301                    Self::run_cmd(
302                        "nsenter",
303                        &[
304                            "-t",
305                            &pid_str,
306                            "-n",
307                            "iptables",
308                            "-A",
309                            "OUTPUT",
310                            "-p",
311                            "tcp",
312                            "-d",
313                            cidr,
314                            "--dport",
315                            &port.to_string(),
316                            "-j",
317                            "ACCEPT",
318                        ],
319                    )?;
320                }
321                for port in &policy.allowed_udp_ports {
322                    Self::run_cmd(
323                        "nsenter",
324                        &[
325                            "-t",
326                            &pid_str,
327                            "-n",
328                            "iptables",
329                            "-A",
330                            "OUTPUT",
331                            "-p",
332                            "udp",
333                            "-d",
334                            cidr,
335                            "--dport",
336                            &port.to_string(),
337                            "-j",
338                            "ACCEPT",
339                        ],
340                    )?;
341                }
342            }
343        }
344
345        // Log denied packets (rate-limited)
346        if policy.log_denied {
347            Self::run_cmd(
348                "nsenter",
349                &[
350                    "-t",
351                    &pid_str,
352                    "-n",
353                    "iptables",
354                    "-A",
355                    "OUTPUT",
356                    "-m",
357                    "limit",
358                    "--limit",
359                    "5/min",
360                    "-j",
361                    "LOG",
362                    "--log-prefix",
363                    "nucleus-egress-denied: ",
364                ],
365            )?;
366        }
367
368        // Drop everything else
369        Self::run_cmd(
370            "nsenter",
371            &["-t", &pid_str, "-n", "iptables", "-P", "OUTPUT", "DROP"],
372        )?;
373
374        info!(
375            "Egress policy applied: {} allowed CIDRs",
376            policy.allowed_cidrs.len()
377        );
378        debug!("Egress policy details: {:?}", policy);
379
380        Ok(())
381    }
382
383    /// Clean up bridge networking
384    ///
385    /// State transition: Active -> Cleaned
386    pub fn cleanup(mut self) -> Result<()> {
387        self.state = self.state.transition(NetworkState::Cleaned)?;
388
389        // Release the IP allocation
390        Self::release_allocated_ip(&self.container_id);
391
392        // Remove port forwarding rules
393        for pf in &self.config.port_forwards {
394            if let Err(e) = self.cleanup_port_forward(pf) {
395                warn!("Failed to cleanup port forward: {}", e);
396            }
397        }
398
399        // Remove NAT rule
400        let _ = Self::run_cmd(
401            "iptables",
402            &[
403                "-t",
404                "nat",
405                "-D",
406                "POSTROUTING",
407                "-s",
408                &self.config.subnet,
409                "-j",
410                "MASQUERADE",
411            ],
412        );
413
414        // Delete veth pair (deleting one end removes both)
415        let _ = Self::run_cmd("ip", &["link", "del", &self.veth_host]);
416
417        // Restore previous ip_forward state if we changed it
418        if let Some(ref prev) = self.prev_ip_forward {
419            if prev == "0" {
420                if let Err(e) = std::fs::write("/proc/sys/net/ipv4/ip_forward", "0") {
421                    warn!("Failed to restore ip_forward to 0: {}", e);
422                } else {
423                    info!("Restored net.ipv4.ip_forward to 0");
424                }
425            }
426        }
427
428        info!("Bridge network cleaned up");
429        Ok(())
430    }
431
432    /// Best-effort cleanup for use in Drop. Performs the same teardown as
433    /// `cleanup()` but ignores all errors and skips the state transition
434    /// (which requires ownership).
435    fn cleanup_best_effort(&mut self) {
436        if self.state == NetworkState::Cleaned {
437            return;
438        }
439
440        Self::release_allocated_ip(&self.container_id);
441
442        for pf in &self.config.port_forwards {
443            let _ = self.cleanup_port_forward(pf);
444        }
445
446        let _ = Self::run_cmd(
447            "iptables",
448            &[
449                "-t",
450                "nat",
451                "-D",
452                "POSTROUTING",
453                "-s",
454                &self.config.subnet,
455                "-j",
456                "MASQUERADE",
457            ],
458        );
459
460        let _ = Self::run_cmd("ip", &["link", "del", &self.veth_host]);
461
462        if let Some(ref prev) = self.prev_ip_forward {
463            if prev == "0" {
464                let _ = std::fs::write("/proc/sys/net/ipv4/ip_forward", "0");
465            }
466        }
467
468        self.state = NetworkState::Cleaned;
469        debug!("Bridge network cleaned up (best-effort via drop)");
470    }
471
472    /// Detect and remove orphaned iptables rules from previous Nucleus runs.
473    ///
474    /// Checks for stale MASQUERADE rules referencing the nucleus subnet that
475    /// have no corresponding running container. Prevents gradual degradation
476    /// of network isolation from accumulated orphaned rules.
477    pub fn cleanup_orphaned_rules(subnet: &str) {
478        // List NAT rules and look for nucleus-related MASQUERADE entries
479        let output = match Command::new("iptables")
480            .args(["-t", "nat", "-L", "POSTROUTING", "-n"])
481            .output()
482        {
483            Ok(o) => o,
484            Err(e) => {
485                debug!("Cannot check iptables for orphaned rules: {}", e);
486                return;
487            }
488        };
489
490        let stdout = String::from_utf8_lossy(&output.stdout);
491        let mut orphaned_count = 0u32;
492        for line in stdout.lines() {
493            if line.contains("MASQUERADE") && line.contains(subnet) {
494                // Try to remove it; if it fails, it may be actively used
495                let _ = Self::run_cmd(
496                    "iptables",
497                    &[
498                        "-t",
499                        "nat",
500                        "-D",
501                        "POSTROUTING",
502                        "-s",
503                        subnet,
504                        "-j",
505                        "MASQUERADE",
506                    ],
507                );
508                orphaned_count += 1;
509            }
510        }
511
512        if orphaned_count > 0 {
513            info!(
514                "Cleaned up {} orphaned iptables MASQUERADE rule(s) for subnet {}",
515                orphaned_count, subnet
516            );
517        }
518    }
519
520    fn ensure_bridge_for(bridge_name: &str, subnet: &str) -> Result<()> {
521        // Check if bridge exists
522        if Self::run_cmd("ip", &["link", "show", bridge_name]).is_ok() {
523            return Ok(());
524        }
525
526        // Create bridge
527        Self::run_cmd(
528            "ip",
529            &["link", "add", "name", bridge_name, "type", "bridge"],
530        )?;
531
532        let gateway = Self::gateway_from_subnet(subnet);
533        Self::run_cmd(
534            "ip",
535            &[
536                "addr",
537                "add",
538                &format!("{}/{}", gateway, Self::subnet_prefix(subnet)),
539                "dev",
540                bridge_name,
541            ],
542        )?;
543        Self::run_cmd("ip", &["link", "set", bridge_name, "up"])?;
544
545        info!("Created bridge {}", bridge_name);
546        Ok(())
547    }
548
549    fn setup_port_forward_for(container_ip: &str, pf: &PortForward) -> Result<()> {
550        for chain in ["PREROUTING", "OUTPUT"] {
551            let args = Self::port_forward_rule_args("-A", chain, container_ip, pf);
552            Self::run_cmd_owned("iptables", &args)?;
553        }
554
555        let host_ip = pf
556            .host_ip
557            .map(|ip| ip.to_string())
558            .unwrap_or_else(|| "0.0.0.0".to_string());
559        info!(
560            "Port forward: {}:{} -> {}:{}/{}",
561            host_ip, pf.host_port, container_ip, pf.container_port, pf.protocol
562        );
563        Ok(())
564    }
565
566    fn cleanup_port_forward(&self, pf: &PortForward) -> Result<()> {
567        for chain in ["OUTPUT", "PREROUTING"] {
568            let args = Self::port_forward_rule_args("-D", chain, &self.container_ip, pf);
569            Self::run_cmd_owned("iptables", &args)?;
570        }
571        Ok(())
572    }
573
574    /// Allocate a container IP from the subnet using /dev/urandom.
575    ///
576    /// Checks both host-visible interfaces (via `ip addr`) and IPs assigned to
577    /// other Nucleus containers (via state files) to avoid duplicates. Container
578    /// IPs inside network namespaces are invisible to `ip addr show` on the host.
579    fn allocate_ip_with_reserved(
580        subnet: &str,
581        reserved: &std::collections::HashSet<String>,
582    ) -> Result<String> {
583        let base = subnet.split('/').next().unwrap_or("10.0.42.0");
584        let parts: Vec<&str> = base.split('.').collect();
585        if parts.len() != 4 {
586            return Ok("10.0.42.2".to_string());
587        }
588
589        // Use rejection sampling to avoid modulo bias.
590        // Range is 2..=254 (253 values). We reject random bytes >= 253 to
591        // ensure uniform distribution, then add 2 to shift into the valid range.
592        // Open /dev/urandom once and read all randomness in a single batch.
593        // 128 bytes gives ~125 valid candidates (byte < 253), making exhaustion
594        // in a populated subnet far less likely than the previous 32-byte buffer.
595        let mut rand_buf = [0u8; 128];
596        std::fs::File::open("/dev/urandom")
597            .and_then(|mut f| std::io::Read::read_exact(&mut f, &mut rand_buf))
598            .map_err(|e| {
599                NucleusError::NetworkError(format!("Failed to read /dev/urandom: {}", e))
600            })?;
601        for &byte in &rand_buf {
602            // Rejection sampling: discard values that would cause modulo bias
603            if byte >= 253 {
604                continue;
605            }
606            let offset = byte as u32 + 2;
607            let candidate = format!("{}.{}.{}.{}", parts[0], parts[1], parts[2], offset);
608            if reserved.contains(&candidate) {
609                continue;
610            }
611            if !Self::is_ip_in_use(&candidate)? {
612                // Lock is released when lock_file is dropped
613                return Ok(candidate);
614            }
615        }
616
617        Err(NucleusError::NetworkError(format!(
618            "Failed to allocate free IP in subnet {}",
619            subnet
620        )))
621    }
622
623    fn reserve_ip_in_dir(
624        alloc_dir: &std::path::Path,
625        container_id: &str,
626        subnet: &str,
627        requested_ip: Option<&str>,
628    ) -> Result<String> {
629        Self::ensure_alloc_dir(alloc_dir)?;
630        let lock_path = alloc_dir.join(".lock");
631        let lock_file = std::fs::OpenOptions::new()
632            .create(true)
633            .write(true)
634            .truncate(false)
635            .open(&lock_path)
636            .map_err(|e| {
637                NucleusError::NetworkError(format!("Failed to open IP alloc lock: {}", e))
638            })?;
639        // SAFETY: lock_file is a valid open fd. LOCK_EX is a blocking exclusive
640        // lock that is released when the fd is closed (end of scope).
641        let lock_ret = unsafe { libc::flock(lock_file.as_raw_fd(), libc::LOCK_EX) };
642        if lock_ret != 0 {
643            return Err(NucleusError::NetworkError(format!(
644                "Failed to acquire IP alloc lock: {}",
645                std::io::Error::last_os_error()
646            )));
647        }
648
649        let reserved = Self::collect_reserved_ips_in_dir(alloc_dir);
650        let ip = match requested_ip {
651            Some(ip) => {
652                if reserved.contains(ip) || Self::is_ip_in_use(ip)? {
653                    return Err(NucleusError::NetworkError(format!(
654                        "Requested container IP {} is already in use",
655                        ip
656                    )));
657                }
658                ip.to_string()
659            }
660            None => Self::allocate_ip_with_reserved(subnet, &reserved)?,
661        };
662
663        Self::record_allocated_ip_in_dir(alloc_dir, container_id, &ip)?;
664        Ok(ip)
665    }
666
667    /// Scan the Nucleus IP allocation directory for IPs already assigned.
668    fn collect_reserved_ips_in_dir(
669        alloc_dir: &std::path::Path,
670    ) -> std::collections::HashSet<String> {
671        let mut ips = std::collections::HashSet::new();
672        if let Ok(entries) = std::fs::read_dir(alloc_dir) {
673            for entry in entries.flatten() {
674                if let Some(name) = entry.file_name().to_str() {
675                    if name.ends_with(".ip") {
676                        if let Ok(ip) = std::fs::read_to_string(entry.path()) {
677                            let ip = ip.trim().to_string();
678                            if !ip.is_empty() {
679                                ips.insert(ip);
680                            }
681                        }
682                    }
683                }
684            }
685        }
686        ips
687    }
688
689    /// Persist the allocated IP for this container so other containers can see it.
690    fn record_allocated_ip_in_dir(
691        alloc_dir: &std::path::Path,
692        container_id: &str,
693        ip: &str,
694    ) -> Result<()> {
695        Self::ensure_alloc_dir(alloc_dir)?;
696        let path = alloc_dir.join(format!("{}.ip", container_id));
697        std::fs::write(&path, ip).map_err(|e| {
698            NucleusError::NetworkError(format!("Failed to record IP allocation: {}", e))
699        })?;
700        Ok(())
701    }
702
703    /// Remove the persisted IP allocation for a container.
704    fn release_allocated_ip(container_id: &str) {
705        let alloc_dir = Self::ip_alloc_dir();
706        Self::release_allocated_ip_in_dir(&alloc_dir, container_id);
707    }
708
709    fn release_allocated_ip_in_dir(alloc_dir: &std::path::Path, container_id: &str) {
710        let path = alloc_dir.join(format!("{}.ip", container_id));
711        let _ = std::fs::remove_file(path);
712    }
713
714    /// Create the IP allocation directory with restrictive permissions (0700)
715    /// and reject symlinked paths to prevent symlink attacks.
716    fn ensure_alloc_dir(alloc_dir: &std::path::Path) -> Result<()> {
717        // L11: Check for symlinks BEFORE creating directories to avoid TOCTOU.
718        // If the path already exists, verify it's not a symlink.
719        if alloc_dir.exists() {
720            if let Ok(meta) = std::fs::symlink_metadata(alloc_dir) {
721                if meta.file_type().is_symlink() {
722                    return Err(NucleusError::NetworkError(format!(
723                        "IP alloc dir {:?} is a symlink, refusing to use",
724                        alloc_dir
725                    )));
726                }
727            }
728        }
729        // Also check parent directory for symlinks
730        if let Some(parent) = alloc_dir.parent() {
731            if let Ok(meta) = std::fs::symlink_metadata(parent) {
732                if meta.file_type().is_symlink() {
733                    return Err(NucleusError::NetworkError(format!(
734                        "IP alloc dir parent {:?} is a symlink, refusing to use",
735                        parent
736                    )));
737                }
738            }
739        }
740
741        std::fs::create_dir_all(alloc_dir).map_err(|e| {
742            NucleusError::NetworkError(format!("Failed to create IP alloc dir: {}", e))
743        })?;
744
745        // Restrict permissions to owner-only atomically after creation
746        use std::os::unix::fs::PermissionsExt;
747        let perms = std::fs::Permissions::from_mode(0o700);
748        std::fs::set_permissions(alloc_dir, perms).map_err(|e| {
749            NucleusError::NetworkError(format!(
750                "Failed to set permissions on IP alloc dir {:?}: {}",
751                alloc_dir, e
752            ))
753        })?;
754
755        // Re-verify no symlink replacement after permissions were set
756        if let Ok(meta) = std::fs::symlink_metadata(alloc_dir) {
757            if meta.file_type().is_symlink() {
758                return Err(NucleusError::NetworkError(format!(
759                    "IP alloc dir {:?} was replaced with a symlink during setup",
760                    alloc_dir
761                )));
762            }
763        }
764        Ok(())
765    }
766
767    fn ip_alloc_dir() -> std::path::PathBuf {
768        if nix::unistd::Uid::effective().is_root() {
769            std::path::PathBuf::from("/var/run/nucleus/ip-alloc")
770        } else {
771            dirs::runtime_dir()
772                .map(|d| d.join("nucleus/ip-alloc"))
773                .or_else(|| dirs::data_local_dir().map(|d| d.join("nucleus/ip-alloc")))
774                .unwrap_or_else(|| {
775                    dirs::home_dir()
776                        .map(|h| h.join(".nucleus/ip-alloc"))
777                        .unwrap_or_else(|| std::path::PathBuf::from("/var/run/nucleus/ip-alloc"))
778                })
779        }
780    }
781
782    /// Read the start time (field 22) from /proc/<pid>/stat to detect PID recycling.
783    /// Returns 0 if the process does not exist or the field cannot be parsed.
784    fn read_pid_start_ticks(pid: u32) -> u64 {
785        let stat_path = format!("/proc/{}/stat", pid);
786        if let Ok(content) = std::fs::read_to_string(&stat_path) {
787            // Field 22 is starttime. The comm field (2) may contain spaces/parens,
788            // so find the last ')' and count fields from there.
789            if let Some(after_comm) = content.rfind(')') {
790                return content[after_comm + 2..]
791                    .split_whitespace()
792                    .nth(19) // field 22 is 20th after the ')' + state field
793                    .and_then(|s| s.parse().ok())
794                    .unwrap_or(0);
795            }
796        }
797        0
798    }
799
800    /// Get gateway IP from subnet (first usable address)
801    fn gateway_from_subnet(subnet: &str) -> String {
802        let base = subnet.split('/').next().unwrap_or("10.0.42.0");
803        let parts: Vec<&str> = base.split('.').collect();
804        if parts.len() == 4 {
805            format!("{}.{}.{}.1", parts[0], parts[1], parts[2])
806        } else {
807            "10.0.42.1".to_string()
808        }
809    }
810
811    fn subnet_prefix(subnet: &str) -> u8 {
812        subnet
813            .split_once('/')
814            .and_then(|(_, p)| p.parse::<u8>().ok())
815            .filter(|p| *p <= 32)
816            .unwrap_or(24)
817    }
818
819    /// Resolve a system binary to a validated absolute path.
820    ///
821    /// When running as root, searches known sysadmin paths and validates
822    /// ownership and permissions before use. When unprivileged, uses
823    /// `which`-style PATH resolution but still validates the result.
824    /// Returns an error if no valid binary is found.
825    fn resolve_bin(name: &str) -> Result<String> {
826        let search_dirs: &[&str] = match name {
827            "ip" => &["/usr/sbin/ip", "/sbin/ip", "/usr/bin/ip"],
828            "iptables" => &["/usr/sbin/iptables", "/sbin/iptables", "/usr/bin/iptables"],
829            "nsenter" => &["/usr/bin/nsenter", "/usr/sbin/nsenter", "/bin/nsenter"],
830            _ => &[],
831        };
832
833        for path in search_dirs {
834            let p = std::path::Path::new(path);
835            if p.exists() {
836                Self::validate_network_binary(p, name)?;
837                return Ok(path.to_string());
838            }
839        }
840
841        // Fallback: resolve via PATH, but validate the result
842        if let Ok(output) = Command::new("which").arg(name).output() {
843            if output.status.success() {
844                let resolved = String::from_utf8_lossy(&output.stdout).trim().to_string();
845                if !resolved.is_empty() {
846                    let p = std::path::Path::new(&resolved);
847                    Self::validate_network_binary(p, name)?;
848                    return Ok(resolved);
849                }
850            }
851        }
852
853        Err(NucleusError::NetworkError(format!(
854            "Required binary '{}' not found or failed validation",
855            name
856        )))
857    }
858
859    /// Validate a network binary's ownership and permissions.
860    /// Rejects binaries that are group/world-writable or not owned by root/euid.
861    fn validate_network_binary(path: &std::path::Path, name: &str) -> Result<()> {
862        use std::os::unix::fs::MetadataExt;
863
864        let meta = std::fs::metadata(path)
865            .map_err(|e| NucleusError::NetworkError(format!("Cannot stat {}: {}", name, e)))?;
866        let mode = meta.mode();
867        if mode & 0o022 != 0 {
868            return Err(NucleusError::NetworkError(format!(
869                "Binary '{}' at {:?} is writable by group/others (mode {:o}), refusing to execute",
870                name, path, mode
871            )));
872        }
873        let owner = meta.uid();
874        let euid = nix::unistd::Uid::effective().as_raw();
875        if owner != 0 && owner != euid {
876            return Err(NucleusError::NetworkError(format!(
877                "Binary '{}' at {:?} owned by UID {} (expected root or euid {}), refusing to execute",
878                name, path, owner, euid
879            )));
880        }
881        Ok(())
882    }
883
884    fn run_cmd(program: &str, args: &[&str]) -> Result<()> {
885        let resolved = Self::resolve_bin(program)?;
886        let output = Command::new(&resolved).args(args).output().map_err(|e| {
887            NucleusError::NetworkError(format!("Failed to run {} {:?}: {}", resolved, args, e))
888        })?;
889
890        if !output.status.success() {
891            let stderr = String::from_utf8_lossy(&output.stderr);
892            return Err(NucleusError::NetworkError(format!(
893                "{} {:?} failed: {}",
894                program, args, stderr
895            )));
896        }
897
898        Ok(())
899    }
900
901    fn run_cmd_owned(program: &str, args: &[String]) -> Result<()> {
902        let refs: Vec<&str> = args.iter().map(String::as_str).collect();
903        Self::run_cmd(program, &refs)
904    }
905
906    fn port_forward_rule_args(
907        operation: &str,
908        chain: &str,
909        container_ip: &str,
910        pf: &PortForward,
911    ) -> Vec<String> {
912        let mut args = vec![
913            "-t".to_string(),
914            "nat".to_string(),
915            operation.to_string(),
916            chain.to_string(),
917            "-p".to_string(),
918            pf.protocol.as_str().to_string(),
919        ];
920
921        if chain == "OUTPUT" {
922            args.extend([
923                "-m".to_string(),
924                "addrtype".to_string(),
925                "--dst-type".to_string(),
926                "LOCAL".to_string(),
927            ]);
928        }
929
930        if let Some(host_ip) = pf.host_ip {
931            args.extend(["-d".to_string(), host_ip.to_string()]);
932        }
933
934        args.extend([
935            "--dport".to_string(),
936            pf.host_port.to_string(),
937            "-j".to_string(),
938            "DNAT".to_string(),
939            "--to-destination".to_string(),
940            format!("{}:{}", container_ip, pf.container_port),
941        ]);
942
943        args
944    }
945
946    fn is_ip_in_use(ip: &str) -> Result<bool> {
947        let ip_bin = Self::resolve_bin("ip")?;
948        let output = Command::new(&ip_bin)
949            .args(["-4", "addr", "show"])
950            .output()
951            .map_err(|e| {
952                NucleusError::NetworkError(format!("Failed to inspect host IPs: {}", e))
953            })?;
954
955        if !output.status.success() {
956            let stderr = String::from_utf8_lossy(&output.stderr);
957            return Err(NucleusError::NetworkError(format!(
958                "ip -4 addr show failed: {}",
959                stderr.trim()
960            )));
961        }
962
963        let stdout = String::from_utf8_lossy(&output.stdout);
964        Ok(stdout.contains(&format!(" {}/", ip)))
965    }
966
967    /// Write resolv.conf inside container (for writable /etc, e.g. agent mode)
968    pub fn write_resolv_conf(root: &std::path::Path, dns: &[String]) -> Result<()> {
969        let resolv_path = root.join("etc/resolv.conf");
970        let content: String = dns
971            .iter()
972            .map(|server| format!("nameserver {}\n", server))
973            .collect();
974        std::fs::write(&resolv_path, content).map_err(|e| {
975            NucleusError::NetworkError(format!("Failed to write resolv.conf: {}", e))
976        })?;
977        Ok(())
978    }
979
980    /// Bind-mount a resolv.conf over a read-only /etc (for production rootfs mode).
981    ///
982    /// Creates a memfd-backed resolv.conf and bind-mounts it over
983    /// /etc/resolv.conf so it works even when the rootfs /etc is read-only.
984    /// The memfd is cleaned up when the container exits.
985    pub fn bind_mount_resolv_conf(root: &std::path::Path, dns: &[String]) -> Result<()> {
986        use nix::mount::{mount, MsFlags};
987
988        let content: String = dns
989            .iter()
990            .map(|server| format!("nameserver {}\n", server))
991            .collect();
992
993        // Create a memfd-backed file to avoid leaving staging files on disk
994        let memfd_name = std::ffi::CString::new("nucleus-resolv").map_err(|e| {
995            NucleusError::NetworkError(format!("Failed to create memfd name: {}", e))
996        })?;
997        // SAFETY: memfd_name is a valid NUL-terminated CString. memfd_create
998        // returns a new fd or -1 on error; we check for error below.
999        let raw_fd = unsafe { libc::memfd_create(memfd_name.as_ptr(), 0) };
1000        if raw_fd < 0 {
1001            // Fallback to staging file if memfd_create is unavailable
1002            return Self::bind_mount_resolv_conf_staging(root, dns);
1003        }
1004        // SAFETY: raw_fd is a valid, newly-created fd from memfd_create.
1005        // OwnedFd takes ownership and will close it exactly once on drop,
1006        // preventing double-close on any error path.
1007        let memfd = unsafe { std::os::fd::OwnedFd::from_raw_fd(raw_fd) };
1008
1009        // Write content to memfd
1010        // SAFETY: memfd is a valid open fd. content is a valid byte buffer
1011        // with correct length. write() may return -1 on error.
1012        let write_result = unsafe {
1013            libc::write(
1014                memfd.as_raw_fd(),
1015                content.as_ptr() as *const libc::c_void,
1016                content.len(),
1017            )
1018        };
1019        if write_result < 0 {
1020            // memfd dropped here, closing the fd automatically
1021            return Self::bind_mount_resolv_conf_staging(root, dns);
1022        }
1023
1024        // Ensure the mount target exists
1025        let target = root.join("etc/resolv.conf");
1026        if !target.exists() {
1027            let _ = std::fs::write(&target, "");
1028        }
1029
1030        // Bind mount the memfd over the read-only resolv.conf
1031        let memfd_path = format!("/proc/self/fd/{}", memfd.as_raw_fd());
1032        mount(
1033            Some(memfd_path.as_str()),
1034            &target,
1035            None::<&str>,
1036            MsFlags::MS_BIND,
1037            None::<&str>,
1038        )
1039        .map_err(|e| {
1040            // memfd dropped here via the returned Err, closing the fd automatically
1041            NucleusError::NetworkError(format!("Failed to bind mount resolv.conf: {}", e))
1042        })?;
1043
1044        // memfd dropped here — the mount holds a kernel reference to the file,
1045        // so it survives the fd close.
1046
1047        info!("Bind-mounted resolv.conf for bridge networking (rootfs mode, memfd)");
1048        Ok(())
1049    }
1050
1051    /// Fallback: bind-mount a staging resolv.conf file.
1052    fn bind_mount_resolv_conf_staging(root: &std::path::Path, dns: &[String]) -> Result<()> {
1053        use nix::mount::{mount, MsFlags};
1054
1055        let content: String = dns
1056            .iter()
1057            .map(|server| format!("nameserver {}\n", server))
1058            .collect();
1059
1060        // Write to a staging file outside /etc
1061        let staging = root.join("tmp/.resolv.conf.nucleus");
1062        if let Some(parent) = staging.parent() {
1063            std::fs::create_dir_all(parent).map_err(|e| {
1064                NucleusError::NetworkError(format!(
1065                    "Failed to create resolv.conf staging parent: {}",
1066                    e
1067                ))
1068            })?;
1069        }
1070        std::fs::write(&staging, content).map_err(|e| {
1071            NucleusError::NetworkError(format!("Failed to write staging resolv.conf: {}", e))
1072        })?;
1073
1074        // Ensure the mount target exists
1075        let target = root.join("etc/resolv.conf");
1076        if !target.exists() {
1077            let _ = std::fs::write(&target, "");
1078        }
1079
1080        // Bind mount the staging file over the read-only resolv.conf
1081        mount(
1082            Some(staging.as_path()),
1083            &target,
1084            None::<&str>,
1085            MsFlags::MS_BIND,
1086            None::<&str>,
1087        )
1088        .map_err(|e| {
1089            NucleusError::NetworkError(format!("Failed to bind mount resolv.conf: {}", e))
1090        })?;
1091
1092        // The bind mount holds a reference to the inode, so we can safely
1093        // unlink the staging path to avoid leaking DNS server info on disk.
1094        if let Err(e) = std::fs::remove_file(&staging) {
1095            warn!("Failed to remove staging resolv.conf {:?}: {}", staging, e);
1096        }
1097
1098        info!("Bind-mounted resolv.conf for bridge networking (rootfs mode, staging)");
1099        Ok(())
1100    }
1101}
1102
1103impl Drop for BridgeNetwork {
1104    fn drop(&mut self) {
1105        self.cleanup_best_effort();
1106    }
1107}
1108
1109struct SetupRollback {
1110    veth_host: String,
1111    subnet: String,
1112    veth_created: bool,
1113    nat_added: bool,
1114    port_forwards: Vec<(String, PortForward)>,
1115    prev_ip_forward: Option<String>,
1116    reserved_ip: Option<(std::path::PathBuf, String)>,
1117    armed: bool,
1118}
1119
1120impl SetupRollback {
1121    fn new(
1122        veth_host: String,
1123        subnet: String,
1124        reserved_ip: Option<(std::path::PathBuf, String)>,
1125    ) -> Self {
1126        Self {
1127            veth_host,
1128            subnet,
1129            veth_created: false,
1130            nat_added: false,
1131            port_forwards: Vec::new(),
1132            prev_ip_forward: None,
1133            reserved_ip,
1134            armed: true,
1135        }
1136    }
1137
1138    fn disarm(&mut self) {
1139        self.armed = false;
1140    }
1141}
1142
1143impl Drop for SetupRollback {
1144    fn drop(&mut self) {
1145        if !self.armed {
1146            return;
1147        }
1148
1149        for (container_ip, pf) in self.port_forwards.iter().rev() {
1150            for chain in ["OUTPUT", "PREROUTING"] {
1151                let args = BridgeNetwork::port_forward_rule_args("-D", chain, container_ip, pf);
1152                if let Err(e) = BridgeNetwork::run_cmd_owned("iptables", &args) {
1153                    warn!(
1154                        "Rollback: failed to remove iptables {} rule for {}: {}",
1155                        chain, container_ip, e
1156                    );
1157                }
1158            }
1159        }
1160
1161        if self.nat_added {
1162            if let Err(e) = BridgeNetwork::run_cmd(
1163                "iptables",
1164                &[
1165                    "-t",
1166                    "nat",
1167                    "-D",
1168                    "POSTROUTING",
1169                    "-s",
1170                    &self.subnet,
1171                    "-j",
1172                    "MASQUERADE",
1173                ],
1174            ) {
1175                warn!("Rollback: failed to remove NAT rule: {}", e);
1176            }
1177        }
1178
1179        if self.veth_created {
1180            if let Err(e) = BridgeNetwork::run_cmd("ip", &["link", "del", &self.veth_host]) {
1181                warn!("Rollback: failed to delete veth {}: {}", self.veth_host, e);
1182            }
1183        }
1184
1185        if let Some((alloc_dir, container_id)) = &self.reserved_ip {
1186            BridgeNetwork::release_allocated_ip_in_dir(alloc_dir, container_id);
1187        }
1188    }
1189}
1190
#[cfg(test)]
mod tests {
    use super::*;
    use crate::network::config::Protocol;

    /// True when `flag` is immediately followed by `value` in `args`.
    fn has_adjacent(args: &[String], flag: &str, value: &str) -> bool {
        args.windows(2)
            .any(|pair| pair[0] == flag && pair[1] == value)
    }

    #[test]
    fn test_ip_allocation_rejection_sampling_range() {
        // H-5: every accepted random byte (0..253) must map to a host
        // offset in 2..=254 (no modulo bias).
        for offset in (0u8..253).map(|byte| byte as u32 + 2) {
            assert!(
                (2..=254).contains(&offset),
                "offset {} out of range",
                offset
            );
        }
        // Values 253, 254, 255 must be rejected
        for byte in 253u8..=255 {
            assert!(byte >= 253);
        }
    }

    #[test]
    fn test_reserve_ip_blocks_duplicate_requested_address() {
        let alloc_dir = tempfile::tempdir().unwrap();
        BridgeNetwork::record_allocated_ip_in_dir(alloc_dir.path(), "one", "10.0.42.2").unwrap();

        // A second container asking for the same address must be refused.
        let second = BridgeNetwork::reserve_ip_in_dir(
            alloc_dir.path(),
            "two",
            "10.0.42.0/24",
            Some("10.0.42.2"),
        );
        let err = second.unwrap_err();
        assert!(
            err.to_string().contains("already in use"),
            "second reservation of the same IP must fail"
        );
    }

    #[test]
    fn test_setup_rollback_releases_reserved_ip() {
        let alloc_dir = tempfile::tempdir().unwrap();
        BridgeNetwork::record_allocated_ip_in_dir(alloc_dir.path(), "rollback", "10.0.42.3")
            .unwrap();

        {
            // A freshly built guard is armed and has no veth, NAT rule or
            // port forwards recorded, so dropping it only releases the IP.
            let _guard = SetupRollback::new(
                "veth-test".to_string(),
                "10.0.42.0/24".to_string(),
                Some((alloc_dir.path().to_path_buf(), "rollback".to_string())),
            );
        }

        assert!(
            !alloc_dir.path().join("rollback.ip").exists(),
            "rollback must release reserved IP files on setup failure"
        );
    }

    #[test]
    fn test_port_forward_rules_include_output_chain_for_local_host_clients() {
        let pf = PortForward {
            host_ip: None,
            host_port: 8080,
            container_port: 80,
            protocol: Protocol::Tcp,
        };

        let prerouting =
            BridgeNetwork::port_forward_rule_args("-A", "PREROUTING", "10.0.42.2", &pf);
        let output = BridgeNetwork::port_forward_rule_args("-A", "OUTPUT", "10.0.42.2", &pf);

        assert!(prerouting.iter().any(|arg| arg == "PREROUTING"));
        assert!(output.iter().any(|arg| arg == "OUTPUT"));
        assert!(
            has_adjacent(&output, "--dst-type", "LOCAL"),
            "OUTPUT rule must target local-destination traffic"
        );
    }

    #[test]
    fn test_port_forward_rules_include_host_ip_when_configured() {
        let pf = PortForward {
            host_ip: Some(std::net::Ipv4Addr::new(127, 0, 0, 1)),
            host_port: 4173,
            container_port: 4173,
            protocol: Protocol::Tcp,
        };

        for chain in ["PREROUTING", "OUTPUT"] {
            let args = BridgeNetwork::port_forward_rule_args("-A", chain, "10.0.42.2", &pf);
            assert!(
                has_adjacent(&args, "-d", "127.0.0.1"),
                "port forward must restrict DNAT rules to the configured host IP"
            );
        }
    }
}