Skip to main content

nucleus/network/
bridge.rs

1use crate::error::{NucleusError, Result, StateTransition};
2use crate::network::config::{BridgeConfig, EgressPolicy, PortForward};
3use crate::network::NetworkState;
4use std::process::Command;
5use tracing::{debug, info, warn};
6
7/// Bridge network manager
8pub struct BridgeNetwork {
9    config: BridgeConfig,
10    container_ip: String,
11    veth_host: String,
12    container_id: String,
13    prev_ip_forward: Option<String>,
14    state: NetworkState,
15}
16
17impl BridgeNetwork {
18    /// Set up bridge networking for a container
19    ///
20    /// Creates bridge, veth pair, assigns IPs, enables NAT.
21    /// Must be called from the parent process after fork (needs host netns).
22    ///
23    /// State transitions: Unconfigured -> Configuring -> Active
24    pub fn setup(pid: u32, config: &BridgeConfig) -> Result<Self> {
25        Self::setup_for(pid, config, &format!("{:x}", pid))
26    }
27
28    /// Set up bridge networking with an explicit container ID for IP tracking.
29    pub fn setup_with_id(pid: u32, config: &BridgeConfig, container_id: &str) -> Result<Self> {
30        Self::setup_for(pid, config, container_id)
31    }
32
33    fn setup_for(pid: u32, config: &BridgeConfig, container_id: &str) -> Result<Self> {
34        // Validate all network parameters before using them in shell commands
35        config.validate()?;
36
37        let mut net_state = NetworkState::Unconfigured;
38        net_state = net_state.transition(NetworkState::Configuring)?;
39
40        let alloc_dir = Self::ip_alloc_dir();
41        let container_ip = Self::reserve_ip_in_dir(
42            &alloc_dir,
43            container_id,
44            &config.subnet,
45            config.container_ip.as_deref(),
46        )?;
47        let prefix = Self::subnet_prefix(&config.subnet);
48
49        // Linux interface names max 15 chars; truncate if needed
50        let veth_host_full = format!("veth-{:x}", pid);
51        let veth_cont_full = format!("vethc-{:x}", pid);
52        let veth_host = veth_host_full[..veth_host_full.len().min(15)].to_string();
53        let veth_container = veth_cont_full[..veth_cont_full.len().min(15)].to_string();
54        let mut rollback = SetupRollback::new(
55            veth_host.clone(),
56            config.subnet.clone(),
57            Some((alloc_dir.clone(), container_id.to_string())),
58        );
59
60        // 1. Create bridge if it doesn't exist
61        Self::ensure_bridge_for(&config.bridge_name, &config.subnet)?;
62
63        // 2. Create veth pair
64        Self::run_cmd(
65            "ip",
66            &[
67                "link",
68                "add",
69                &veth_host,
70                "type",
71                "veth",
72                "peer",
73                "name",
74                &veth_container,
75            ],
76        )?;
77        rollback.veth_created = true;
78
79        // 3. Attach host end to bridge
80        Self::run_cmd(
81            "ip",
82            &["link", "set", &veth_host, "master", &config.bridge_name],
83        )?;
84        Self::run_cmd("ip", &["link", "set", &veth_host, "up"])?;
85
86        // 4. Move container end to container's network namespace
87        Self::run_cmd(
88            "ip",
89            &["link", "set", &veth_container, "netns", &pid.to_string()],
90        )?;
91
92        // 5. Configure container interface (inside container netns via nsenter)
93        let pid_str = pid.to_string();
94        Self::run_cmd(
95            "nsenter",
96            &[
97                "-t",
98                &pid_str,
99                "-n",
100                "ip",
101                "addr",
102                "add",
103                &format!("{}/{}", container_ip, prefix),
104                "dev",
105                &veth_container,
106            ],
107        )?;
108        Self::run_cmd(
109            "nsenter",
110            &[
111                "-t",
112                &pid_str,
113                "-n",
114                "ip",
115                "link",
116                "set",
117                &veth_container,
118                "up",
119            ],
120        )?;
121        Self::run_cmd(
122            "nsenter",
123            &["-t", &pid_str, "-n", "ip", "link", "set", "lo", "up"],
124        )?;
125
126        // 6. Set default route in container
127        let gateway = Self::gateway_from_subnet(&config.subnet);
128        Self::run_cmd(
129            "nsenter",
130            &[
131                "-t", &pid_str, "-n", "ip", "route", "add", "default", "via", &gateway,
132            ],
133        )?;
134
135        // 7. Enable NAT (masquerade) on the host
136        Self::run_cmd(
137            "iptables",
138            &[
139                "-t",
140                "nat",
141                "-A",
142                "POSTROUTING",
143                "-s",
144                &config.subnet,
145                "-j",
146                "MASQUERADE",
147            ],
148        )?;
149        rollback.nat_added = true;
150
151        // 8. Enable IP forwarding (save previous value for restore on cleanup)
152        let prev_ip_forward = std::fs::read_to_string("/proc/sys/net/ipv4/ip_forward")
153            .unwrap_or_default()
154            .trim()
155            .to_string();
156        rollback.prev_ip_forward = Some(prev_ip_forward);
157        std::fs::write("/proc/sys/net/ipv4/ip_forward", "1").map_err(|e| {
158            NucleusError::NetworkError(format!("Failed to enable IP forwarding: {}", e))
159        })?;
160
161        // 9. Set up port forwarding rules
162        for pf in &config.port_forwards {
163            Self::setup_port_forward_for(&container_ip, pf)?;
164            rollback
165                .port_forwards
166                .push((container_ip.clone(), pf.clone()));
167        }
168
169        net_state = net_state.transition(NetworkState::Active)?;
170
171        info!(
172            "Bridge network configured: {} -> {} (IP: {})",
173            veth_host, veth_container, container_ip
174        );
175        let prev_ip_forward = rollback.prev_ip_forward.clone();
176        rollback.disarm();
177
178        Ok(Self {
179            config: config.clone(),
180            container_ip,
181            veth_host,
182            container_id: container_id.to_string(),
183            prev_ip_forward,
184            state: net_state,
185        })
186    }
187
188    /// Apply egress policy rules inside the container's network namespace.
189    ///
190    /// Uses iptables OUTPUT chain to restrict outbound connections.
191    /// Must be called after bridge setup while the container netns is reachable.
192    pub fn apply_egress_policy(&self, pid: u32, policy: &EgressPolicy) -> Result<()> {
193        // Validate egress CIDRs before passing to iptables
194        for cidr in &policy.allowed_cidrs {
195            crate::network::config::validate_egress_cidr(cidr)
196                .map_err(|e| NucleusError::NetworkError(format!("Invalid egress CIDR: {}", e)))?;
197        }
198
199        let pid_str = pid.to_string();
200
201        // Flush any existing OUTPUT rules to prevent duplication on repeated calls
202        Self::run_cmd(
203            "nsenter",
204            &["-t", &pid_str, "-n", "iptables", "-F", "OUTPUT"],
205        )?;
206        // Reset OUTPUT policy to ACCEPT before rebuilding rules
207        Self::run_cmd(
208            "nsenter",
209            &["-t", &pid_str, "-n", "iptables", "-P", "OUTPUT", "ACCEPT"],
210        )?;
211
212        // Default policy: drop all OUTPUT (except established/related and loopback)
213        Self::run_cmd(
214            "nsenter",
215            &[
216                "-t", &pid_str, "-n", "iptables", "-A", "OUTPUT", "-o", "lo", "-j", "ACCEPT",
217            ],
218        )?;
219
220        Self::run_cmd(
221            "nsenter",
222            &[
223                "-t",
224                &pid_str,
225                "-n",
226                "iptables",
227                "-A",
228                "OUTPUT",
229                "-m",
230                "conntrack",
231                "--ctstate",
232                "ESTABLISHED,RELATED",
233                "-j",
234                "ACCEPT",
235            ],
236        )?;
237
238        // Allow DNS to configured resolvers (only when policy permits it)
239        if policy.allow_dns {
240            for dns in &self.config.dns {
241                Self::run_cmd(
242                    "nsenter",
243                    &[
244                        "-t", &pid_str, "-n", "iptables", "-A", "OUTPUT", "-p", "udp", "-d", dns,
245                        "--dport", "53", "-j", "ACCEPT",
246                    ],
247                )?;
248                Self::run_cmd(
249                    "nsenter",
250                    &[
251                        "-t", &pid_str, "-n", "iptables", "-A", "OUTPUT", "-p", "tcp", "-d", dns,
252                        "--dport", "53", "-j", "ACCEPT",
253                    ],
254                )?;
255            }
256        }
257
258        // Allow traffic to each permitted CIDR
259        for cidr in &policy.allowed_cidrs {
260            if policy.allowed_tcp_ports.is_empty() && policy.allowed_udp_ports.is_empty() {
261                // Allow all ports to this CIDR
262                Self::run_cmd(
263                    "nsenter",
264                    &[
265                        "-t", &pid_str, "-n", "iptables", "-A", "OUTPUT", "-d", cidr, "-j",
266                        "ACCEPT",
267                    ],
268                )?;
269            } else {
270                for port in &policy.allowed_tcp_ports {
271                    Self::run_cmd(
272                        "nsenter",
273                        &[
274                            "-t",
275                            &pid_str,
276                            "-n",
277                            "iptables",
278                            "-A",
279                            "OUTPUT",
280                            "-p",
281                            "tcp",
282                            "-d",
283                            cidr,
284                            "--dport",
285                            &port.to_string(),
286                            "-j",
287                            "ACCEPT",
288                        ],
289                    )?;
290                }
291                for port in &policy.allowed_udp_ports {
292                    Self::run_cmd(
293                        "nsenter",
294                        &[
295                            "-t",
296                            &pid_str,
297                            "-n",
298                            "iptables",
299                            "-A",
300                            "OUTPUT",
301                            "-p",
302                            "udp",
303                            "-d",
304                            cidr,
305                            "--dport",
306                            &port.to_string(),
307                            "-j",
308                            "ACCEPT",
309                        ],
310                    )?;
311                }
312            }
313        }
314
315        // Log denied packets (rate-limited)
316        if policy.log_denied {
317            Self::run_cmd(
318                "nsenter",
319                &[
320                    "-t",
321                    &pid_str,
322                    "-n",
323                    "iptables",
324                    "-A",
325                    "OUTPUT",
326                    "-m",
327                    "limit",
328                    "--limit",
329                    "5/min",
330                    "-j",
331                    "LOG",
332                    "--log-prefix",
333                    "nucleus-egress-denied: ",
334                ],
335            )?;
336        }
337
338        // Drop everything else
339        Self::run_cmd(
340            "nsenter",
341            &["-t", &pid_str, "-n", "iptables", "-P", "OUTPUT", "DROP"],
342        )?;
343
344        info!(
345            "Egress policy applied: {} allowed CIDRs",
346            policy.allowed_cidrs.len()
347        );
348        debug!("Egress policy details: {:?}", policy);
349
350        Ok(())
351    }
352
353    /// Clean up bridge networking
354    ///
355    /// State transition: Active -> Cleaned
356    pub fn cleanup(mut self) -> Result<()> {
357        self.state = self.state.transition(NetworkState::Cleaned)?;
358
359        // Release the IP allocation
360        Self::release_allocated_ip(&self.container_id);
361
362        // Remove port forwarding rules
363        for pf in &self.config.port_forwards {
364            if let Err(e) = self.cleanup_port_forward(pf) {
365                warn!("Failed to cleanup port forward: {}", e);
366            }
367        }
368
369        // Remove NAT rule
370        let _ = Self::run_cmd(
371            "iptables",
372            &[
373                "-t",
374                "nat",
375                "-D",
376                "POSTROUTING",
377                "-s",
378                &self.config.subnet,
379                "-j",
380                "MASQUERADE",
381            ],
382        );
383
384        // Delete veth pair (deleting one end removes both)
385        let _ = Self::run_cmd("ip", &["link", "del", &self.veth_host]);
386
387        // Restore previous ip_forward state if we changed it
388        if let Some(ref prev) = self.prev_ip_forward {
389            if prev == "0" {
390                if let Err(e) = std::fs::write("/proc/sys/net/ipv4/ip_forward", "0") {
391                    warn!("Failed to restore ip_forward to 0: {}", e);
392                } else {
393                    info!("Restored net.ipv4.ip_forward to 0");
394                }
395            }
396        }
397
398        info!("Bridge network cleaned up");
399        Ok(())
400    }
401
402    /// Best-effort cleanup for use in Drop. Performs the same teardown as
403    /// `cleanup()` but ignores all errors and skips the state transition
404    /// (which requires ownership).
405    fn cleanup_best_effort(&mut self) {
406        if self.state == NetworkState::Cleaned {
407            return;
408        }
409
410        Self::release_allocated_ip(&self.container_id);
411
412        for pf in &self.config.port_forwards {
413            let _ = self.cleanup_port_forward(pf);
414        }
415
416        let _ = Self::run_cmd(
417            "iptables",
418            &[
419                "-t",
420                "nat",
421                "-D",
422                "POSTROUTING",
423                "-s",
424                &self.config.subnet,
425                "-j",
426                "MASQUERADE",
427            ],
428        );
429
430        let _ = Self::run_cmd("ip", &["link", "del", &self.veth_host]);
431
432        if let Some(ref prev) = self.prev_ip_forward {
433            if prev == "0" {
434                let _ = std::fs::write("/proc/sys/net/ipv4/ip_forward", "0");
435            }
436        }
437
438        self.state = NetworkState::Cleaned;
439        debug!("Bridge network cleaned up (best-effort via drop)");
440    }
441
442    /// Detect and remove orphaned iptables rules from previous Nucleus runs.
443    ///
444    /// Checks for stale MASQUERADE rules referencing the nucleus subnet that
445    /// have no corresponding running container. Prevents gradual degradation
446    /// of network isolation from accumulated orphaned rules.
447    pub fn cleanup_orphaned_rules(subnet: &str) {
448        // List NAT rules and look for nucleus-related MASQUERADE entries
449        let output = match Command::new("iptables")
450            .args(["-t", "nat", "-L", "POSTROUTING", "-n"])
451            .output()
452        {
453            Ok(o) => o,
454            Err(e) => {
455                debug!("Cannot check iptables for orphaned rules: {}", e);
456                return;
457            }
458        };
459
460        let stdout = String::from_utf8_lossy(&output.stdout);
461        let mut orphaned_count = 0u32;
462        for line in stdout.lines() {
463            if line.contains("MASQUERADE") && line.contains(subnet) {
464                // Try to remove it; if it fails, it may be actively used
465                let _ = Self::run_cmd(
466                    "iptables",
467                    &[
468                        "-t",
469                        "nat",
470                        "-D",
471                        "POSTROUTING",
472                        "-s",
473                        subnet,
474                        "-j",
475                        "MASQUERADE",
476                    ],
477                );
478                orphaned_count += 1;
479            }
480        }
481
482        if orphaned_count > 0 {
483            info!(
484                "Cleaned up {} orphaned iptables MASQUERADE rule(s) for subnet {}",
485                orphaned_count, subnet
486            );
487        }
488    }
489
490    fn ensure_bridge_for(bridge_name: &str, subnet: &str) -> Result<()> {
491        // Check if bridge exists
492        if Self::run_cmd("ip", &["link", "show", bridge_name]).is_ok() {
493            return Ok(());
494        }
495
496        // Create bridge
497        Self::run_cmd(
498            "ip",
499            &["link", "add", "name", bridge_name, "type", "bridge"],
500        )?;
501
502        let gateway = Self::gateway_from_subnet(subnet);
503        Self::run_cmd(
504            "ip",
505            &[
506                "addr",
507                "add",
508                &format!("{}/{}", gateway, Self::subnet_prefix(subnet)),
509                "dev",
510                bridge_name,
511            ],
512        )?;
513        Self::run_cmd("ip", &["link", "set", bridge_name, "up"])?;
514
515        info!("Created bridge {}", bridge_name);
516        Ok(())
517    }
518
519    fn setup_port_forward_for(container_ip: &str, pf: &PortForward) -> Result<()> {
520        for chain in ["PREROUTING", "OUTPUT"] {
521            let args = Self::port_forward_rule_args("-A", chain, container_ip, pf);
522            Self::run_cmd_owned("iptables", &args)?;
523        }
524
525        let host_ip = pf
526            .host_ip
527            .map(|ip| ip.to_string())
528            .unwrap_or_else(|| "0.0.0.0".to_string());
529        info!(
530            "Port forward: {}:{} -> {}:{}/{}",
531            host_ip, pf.host_port, container_ip, pf.container_port, pf.protocol
532        );
533        Ok(())
534    }
535
536    fn cleanup_port_forward(&self, pf: &PortForward) -> Result<()> {
537        for chain in ["OUTPUT", "PREROUTING"] {
538            let args = Self::port_forward_rule_args("-D", chain, &self.container_ip, pf);
539            Self::run_cmd_owned("iptables", &args)?;
540        }
541        Ok(())
542    }
543
544    /// Allocate a container IP from the subnet using /dev/urandom.
545    ///
546    /// Checks both host-visible interfaces (via `ip addr`) and IPs assigned to
547    /// other Nucleus containers (via state files) to avoid duplicates. Container
548    /// IPs inside network namespaces are invisible to `ip addr show` on the host.
549    fn allocate_ip_with_reserved(
550        subnet: &str,
551        reserved: &std::collections::HashSet<String>,
552    ) -> Result<String> {
553        let base = subnet.split('/').next().unwrap_or("10.0.42.0");
554        let parts: Vec<&str> = base.split('.').collect();
555        if parts.len() != 4 {
556            return Ok("10.0.42.2".to_string());
557        }
558
559        // Use rejection sampling to avoid modulo bias.
560        // Range is 2..=254 (253 values). We reject random bytes >= 253 to
561        // ensure uniform distribution, then add 2 to shift into the valid range.
562        // Open /dev/urandom once and read all randomness in a single batch.
563        // 128 bytes gives ~125 valid candidates (byte < 253), making exhaustion
564        // in a populated subnet far less likely than the previous 32-byte buffer.
565        let mut rand_buf = [0u8; 128];
566        std::fs::File::open("/dev/urandom")
567            .and_then(|mut f| std::io::Read::read_exact(&mut f, &mut rand_buf))
568            .map_err(|e| {
569                NucleusError::NetworkError(format!("Failed to read /dev/urandom: {}", e))
570            })?;
571        for &byte in &rand_buf {
572            // Rejection sampling: discard values that would cause modulo bias
573            if byte >= 253 {
574                continue;
575            }
576            let offset = byte as u32 + 2;
577            let candidate = format!("{}.{}.{}.{}", parts[0], parts[1], parts[2], offset);
578            if reserved.contains(&candidate) {
579                continue;
580            }
581            if !Self::is_ip_in_use(&candidate)? {
582                // Lock is released when lock_file is dropped
583                return Ok(candidate);
584            }
585        }
586
587        Err(NucleusError::NetworkError(format!(
588            "Failed to allocate free IP in subnet {}",
589            subnet
590        )))
591    }
592
593    fn reserve_ip_in_dir(
594        alloc_dir: &std::path::Path,
595        container_id: &str,
596        subnet: &str,
597        requested_ip: Option<&str>,
598    ) -> Result<String> {
599        std::fs::create_dir_all(alloc_dir).map_err(|e| {
600            NucleusError::NetworkError(format!("Failed to create IP alloc dir: {}", e))
601        })?;
602        let lock_path = alloc_dir.join(".lock");
603        let lock_file = std::fs::OpenOptions::new()
604            .create(true)
605            .write(true)
606            .truncate(false)
607            .open(&lock_path)
608            .map_err(|e| {
609                NucleusError::NetworkError(format!("Failed to open IP alloc lock: {}", e))
610            })?;
611        use std::os::unix::io::AsRawFd;
612        let lock_ret = unsafe { libc::flock(lock_file.as_raw_fd(), libc::LOCK_EX) };
613        if lock_ret != 0 {
614            return Err(NucleusError::NetworkError(format!(
615                "Failed to acquire IP alloc lock: {}",
616                std::io::Error::last_os_error()
617            )));
618        }
619
620        let reserved = Self::collect_reserved_ips_in_dir(alloc_dir);
621        let ip = match requested_ip {
622            Some(ip) => {
623                if reserved.contains(ip) || Self::is_ip_in_use(ip)? {
624                    return Err(NucleusError::NetworkError(format!(
625                        "Requested container IP {} is already in use",
626                        ip
627                    )));
628                }
629                ip.to_string()
630            }
631            None => Self::allocate_ip_with_reserved(subnet, &reserved)?,
632        };
633
634        Self::record_allocated_ip_in_dir(alloc_dir, container_id, &ip)?;
635        Ok(ip)
636    }
637
638    /// Scan the Nucleus IP allocation directory for IPs already assigned.
639    fn collect_reserved_ips_in_dir(
640        alloc_dir: &std::path::Path,
641    ) -> std::collections::HashSet<String> {
642        let mut ips = std::collections::HashSet::new();
643        if let Ok(entries) = std::fs::read_dir(alloc_dir) {
644            for entry in entries.flatten() {
645                if let Some(name) = entry.file_name().to_str() {
646                    if name.ends_with(".ip") {
647                        if let Ok(ip) = std::fs::read_to_string(entry.path()) {
648                            let ip = ip.trim().to_string();
649                            if !ip.is_empty() {
650                                ips.insert(ip);
651                            }
652                        }
653                    }
654                }
655            }
656        }
657        ips
658    }
659
660    /// Persist the allocated IP for this container so other containers can see it.
661    fn record_allocated_ip_in_dir(
662        alloc_dir: &std::path::Path,
663        container_id: &str,
664        ip: &str,
665    ) -> Result<()> {
666        std::fs::create_dir_all(alloc_dir).map_err(|e| {
667            NucleusError::NetworkError(format!("Failed to create IP alloc dir: {}", e))
668        })?;
669        let path = alloc_dir.join(format!("{}.ip", container_id));
670        std::fs::write(&path, ip).map_err(|e| {
671            NucleusError::NetworkError(format!("Failed to record IP allocation: {}", e))
672        })?;
673        Ok(())
674    }
675
676    /// Remove the persisted IP allocation for a container.
677    fn release_allocated_ip(container_id: &str) {
678        let alloc_dir = Self::ip_alloc_dir();
679        Self::release_allocated_ip_in_dir(&alloc_dir, container_id);
680    }
681
682    fn release_allocated_ip_in_dir(alloc_dir: &std::path::Path, container_id: &str) {
683        let path = alloc_dir.join(format!("{}.ip", container_id));
684        let _ = std::fs::remove_file(path);
685    }
686
687    fn ip_alloc_dir() -> std::path::PathBuf {
688        if nix::unistd::Uid::effective().is_root() {
689            std::path::PathBuf::from("/var/run/nucleus/ip-alloc")
690        } else {
691            dirs::runtime_dir()
692                .map(|d| d.join("nucleus/ip-alloc"))
693                .or_else(|| dirs::data_local_dir().map(|d| d.join("nucleus/ip-alloc")))
694                .unwrap_or_else(|| {
695                    dirs::home_dir()
696                        .map(|h| h.join(".nucleus/ip-alloc"))
697                        .unwrap_or_else(|| std::path::PathBuf::from("/var/run/nucleus/ip-alloc"))
698                })
699        }
700    }
701
702    /// Get gateway IP from subnet (first usable address)
703    fn gateway_from_subnet(subnet: &str) -> String {
704        let base = subnet.split('/').next().unwrap_or("10.0.42.0");
705        let parts: Vec<&str> = base.split('.').collect();
706        if parts.len() == 4 {
707            format!("{}.{}.{}.1", parts[0], parts[1], parts[2])
708        } else {
709            "10.0.42.1".to_string()
710        }
711    }
712
713    fn subnet_prefix(subnet: &str) -> u8 {
714        subnet
715            .split_once('/')
716            .and_then(|(_, p)| p.parse::<u8>().ok())
717            .filter(|p| *p <= 32)
718            .unwrap_or(24)
719    }
720
721    /// Resolve a system binary to an absolute path when running as root.
722    /// When unprivileged, falls back to bare name (PATH-based resolution).
723    fn resolve_bin(name: &str) -> String {
724        if nix::unistd::Uid::effective().is_root() {
725            let search_dirs: &[&str] = match name {
726                "ip" => &["/usr/sbin/ip", "/sbin/ip", "/usr/bin/ip"],
727                "iptables" => &["/usr/sbin/iptables", "/sbin/iptables", "/usr/bin/iptables"],
728                "nsenter" => &["/usr/bin/nsenter", "/usr/sbin/nsenter", "/bin/nsenter"],
729                _ => &[],
730            };
731            for path in search_dirs {
732                if std::path::Path::new(path).exists() {
733                    return path.to_string();
734                }
735            }
736        }
737        name.to_string()
738    }
739
740    fn run_cmd(program: &str, args: &[&str]) -> Result<()> {
741        let resolved = Self::resolve_bin(program);
742        let output = Command::new(&resolved).args(args).output().map_err(|e| {
743            NucleusError::NetworkError(format!("Failed to run {} {:?}: {}", resolved, args, e))
744        })?;
745
746        if !output.status.success() {
747            let stderr = String::from_utf8_lossy(&output.stderr);
748            return Err(NucleusError::NetworkError(format!(
749                "{} {:?} failed: {}",
750                program, args, stderr
751            )));
752        }
753
754        Ok(())
755    }
756
757    fn run_cmd_owned(program: &str, args: &[String]) -> Result<()> {
758        let refs: Vec<&str> = args.iter().map(String::as_str).collect();
759        Self::run_cmd(program, &refs)
760    }
761
762    fn port_forward_rule_args(
763        operation: &str,
764        chain: &str,
765        container_ip: &str,
766        pf: &PortForward,
767    ) -> Vec<String> {
768        let mut args = vec![
769            "-t".to_string(),
770            "nat".to_string(),
771            operation.to_string(),
772            chain.to_string(),
773            "-p".to_string(),
774            pf.protocol.as_str().to_string(),
775        ];
776
777        if chain == "OUTPUT" {
778            args.extend([
779                "-m".to_string(),
780                "addrtype".to_string(),
781                "--dst-type".to_string(),
782                "LOCAL".to_string(),
783            ]);
784        }
785
786        if let Some(host_ip) = pf.host_ip {
787            args.extend(["-d".to_string(), host_ip.to_string()]);
788        }
789
790        args.extend([
791            "--dport".to_string(),
792            pf.host_port.to_string(),
793            "-j".to_string(),
794            "DNAT".to_string(),
795            "--to-destination".to_string(),
796            format!("{}:{}", container_ip, pf.container_port),
797        ]);
798
799        args
800    }
801
802    fn is_ip_in_use(ip: &str) -> Result<bool> {
803        let ip_bin = Self::resolve_bin("ip");
804        let output = Command::new(&ip_bin)
805            .args(["-4", "addr", "show"])
806            .output()
807            .map_err(|e| {
808                NucleusError::NetworkError(format!("Failed to inspect host IPs: {}", e))
809            })?;
810
811        if !output.status.success() {
812            let stderr = String::from_utf8_lossy(&output.stderr);
813            return Err(NucleusError::NetworkError(format!(
814                "ip -4 addr show failed: {}",
815                stderr.trim()
816            )));
817        }
818
819        let stdout = String::from_utf8_lossy(&output.stdout);
820        Ok(stdout.contains(&format!(" {}/", ip)))
821    }
822
823    /// Write resolv.conf inside container (for writable /etc, e.g. agent mode)
824    pub fn write_resolv_conf(root: &std::path::Path, dns: &[String]) -> Result<()> {
825        let resolv_path = root.join("etc/resolv.conf");
826        let content: String = dns
827            .iter()
828            .map(|server| format!("nameserver {}\n", server))
829            .collect();
830        std::fs::write(&resolv_path, content).map_err(|e| {
831            NucleusError::NetworkError(format!("Failed to write resolv.conf: {}", e))
832        })?;
833        Ok(())
834    }
835
836    /// Bind-mount a resolv.conf over a read-only /etc (for production rootfs mode).
837    ///
838    /// Creates a memfd-backed resolv.conf and bind-mounts it over
839    /// /etc/resolv.conf so it works even when the rootfs /etc is read-only.
840    /// The memfd is cleaned up when the container exits.
841    pub fn bind_mount_resolv_conf(root: &std::path::Path, dns: &[String]) -> Result<()> {
842        use nix::mount::{mount, MsFlags};
843
844        let content: String = dns
845            .iter()
846            .map(|server| format!("nameserver {}\n", server))
847            .collect();
848
849        // Create a memfd-backed file to avoid leaving staging files on disk
850        let memfd_name = std::ffi::CString::new("nucleus-resolv").map_err(|e| {
851            NucleusError::NetworkError(format!("Failed to create memfd name: {}", e))
852        })?;
853        let memfd_fd = unsafe { libc::memfd_create(memfd_name.as_ptr(), 0) };
854        if memfd_fd < 0 {
855            // Fallback to staging file if memfd_create is unavailable
856            return Self::bind_mount_resolv_conf_staging(root, dns);
857        }
858
859        // Write content to memfd
860        let write_result = unsafe {
861            libc::write(
862                memfd_fd,
863                content.as_ptr() as *const libc::c_void,
864                content.len(),
865            )
866        };
867        if write_result < 0 {
868            unsafe { libc::close(memfd_fd) };
869            return Self::bind_mount_resolv_conf_staging(root, dns);
870        }
871
872        // Ensure the mount target exists
873        let target = root.join("etc/resolv.conf");
874        if !target.exists() {
875            let _ = std::fs::write(&target, "");
876        }
877
878        // Bind mount the memfd over the read-only resolv.conf
879        let memfd_path = format!("/proc/self/fd/{}", memfd_fd);
880        mount(
881            Some(memfd_path.as_str()),
882            &target,
883            None::<&str>,
884            MsFlags::MS_BIND,
885            None::<&str>,
886        )
887        .map_err(|e| {
888            unsafe { libc::close(memfd_fd) };
889            NucleusError::NetworkError(format!("Failed to bind mount resolv.conf: {}", e))
890        })?;
891
892        // Close the fd — the mount keeps the file alive
893        unsafe { libc::close(memfd_fd) };
894
895        info!("Bind-mounted resolv.conf for bridge networking (rootfs mode, memfd)");
896        Ok(())
897    }
898
899    /// Fallback: bind-mount a staging resolv.conf file.
900    fn bind_mount_resolv_conf_staging(root: &std::path::Path, dns: &[String]) -> Result<()> {
901        use nix::mount::{mount, MsFlags};
902
903        let content: String = dns
904            .iter()
905            .map(|server| format!("nameserver {}\n", server))
906            .collect();
907
908        // Write to a staging file outside /etc
909        let staging = root.join("tmp/.resolv.conf.nucleus");
910        if let Some(parent) = staging.parent() {
911            std::fs::create_dir_all(parent).map_err(|e| {
912                NucleusError::NetworkError(format!(
913                    "Failed to create resolv.conf staging parent: {}",
914                    e
915                ))
916            })?;
917        }
918        std::fs::write(&staging, content).map_err(|e| {
919            NucleusError::NetworkError(format!("Failed to write staging resolv.conf: {}", e))
920        })?;
921
922        // Ensure the mount target exists
923        let target = root.join("etc/resolv.conf");
924        if !target.exists() {
925            let _ = std::fs::write(&target, "");
926        }
927
928        // Bind mount the staging file over the read-only resolv.conf
929        mount(
930            Some(staging.as_path()),
931            &target,
932            None::<&str>,
933            MsFlags::MS_BIND,
934            None::<&str>,
935        )
936        .map_err(|e| {
937            NucleusError::NetworkError(format!("Failed to bind mount resolv.conf: {}", e))
938        })?;
939
940        // The bind mount holds a reference to the inode, so we can safely
941        // unlink the staging path to avoid leaking DNS server info on disk.
942        if let Err(e) = std::fs::remove_file(&staging) {
943            warn!("Failed to remove staging resolv.conf {:?}: {}", staging, e);
944        }
945
946        info!("Bind-mounted resolv.conf for bridge networking (rootfs mode, staging)");
947        Ok(())
948    }
949}
950
951impl Drop for BridgeNetwork {
952    fn drop(&mut self) {
953        self.cleanup_best_effort();
954    }
955}
956
/// Drop guard that undoes partially applied bridge setup.
///
/// While `armed`, dropping the guard removes the recorded port-forward rules,
/// the NAT rule, the host veth, and the reserved IP (see the `Drop` impl).
struct SetupRollback {
    /// Host-side veth interface name; deleted with `ip link del` on rollback
    /// when `veth_created` is set.
    veth_host: String,
    /// Subnet used as the `-s` match of the MASQUERADE rule removed on rollback.
    subnet: String,
    /// Set once the veth exists, so rollback knows to delete it.
    veth_created: bool,
    /// Set once the NAT MASQUERADE rule was added, so rollback knows to remove it.
    nat_added: bool,
    /// `(container_ip, port_forward)` pairs whose iptables rules are removed on
    /// rollback, newest first.
    port_forwards: Vec<(String, PortForward)>,
    /// NOTE(review): not read by the `Drop` impl; presumably the ip_forward
    /// value seen before setup changed it — confirm against `setup_for`.
    prev_ip_forward: Option<String>,
    /// `(allocation directory, container id)` of the IP reservation released
    /// on rollback, if any.
    reserved_ip: Option<(std::path::PathBuf, String)>,
    /// When `false`, dropping the guard does nothing.
    armed: bool,
}
967
968impl SetupRollback {
969    fn new(
970        veth_host: String,
971        subnet: String,
972        reserved_ip: Option<(std::path::PathBuf, String)>,
973    ) -> Self {
974        Self {
975            veth_host,
976            subnet,
977            veth_created: false,
978            nat_added: false,
979            port_forwards: Vec::new(),
980            prev_ip_forward: None,
981            reserved_ip,
982            armed: true,
983        }
984    }
985
986    fn disarm(&mut self) {
987        self.armed = false;
988    }
989}
990
991impl Drop for SetupRollback {
992    fn drop(&mut self) {
993        if !self.armed {
994            return;
995        }
996
997        for (container_ip, pf) in self.port_forwards.iter().rev() {
998            for chain in ["OUTPUT", "PREROUTING"] {
999                let args = BridgeNetwork::port_forward_rule_args("-D", chain, container_ip, pf);
1000                if let Err(e) = BridgeNetwork::run_cmd_owned("iptables", &args) {
1001                    warn!(
1002                        "Rollback: failed to remove iptables {} rule for {}: {}",
1003                        chain, container_ip, e
1004                    );
1005                }
1006            }
1007        }
1008
1009        if self.nat_added {
1010            if let Err(e) = BridgeNetwork::run_cmd(
1011                "iptables",
1012                &[
1013                    "-t",
1014                    "nat",
1015                    "-D",
1016                    "POSTROUTING",
1017                    "-s",
1018                    &self.subnet,
1019                    "-j",
1020                    "MASQUERADE",
1021                ],
1022            ) {
1023                warn!("Rollback: failed to remove NAT rule: {}", e);
1024            }
1025        }
1026
1027        if self.veth_created {
1028            if let Err(e) = BridgeNetwork::run_cmd("ip", &["link", "del", &self.veth_host]) {
1029                warn!("Rollback: failed to delete veth {}: {}", self.veth_host, e);
1030            }
1031        }
1032
1033        if let Some((alloc_dir, container_id)) = &self.reserved_ip {
1034            BridgeNetwork::release_allocated_ip_in_dir(alloc_dir, container_id);
1035        }
1036    }
1037}
1038
#[cfg(test)]
mod tests {
    use super::*;

    /// H-5: offsets derived from accepted bytes stay within 2..=254, and the
    /// bytes 253..=255 fall in the rejected range (no modulo bias).
    ///
    /// NOTE(review): this only checks arithmetic on literals; it does not call
    /// the allocator itself.
    #[test]
    fn test_ip_allocation_rejection_sampling_range() {
        (0u8..253).map(|byte| byte as u32 + 2).for_each(|offset| {
            assert!(
                (2..=254).contains(&offset),
                "offset {} out of range",
                offset
            );
        });

        // Values 253, 254, 255 must be rejected
        for byte in [253u8, 254, 255] {
            assert!(byte >= 253);
        }
    }

    /// Requesting an address already recorded for another container must fail.
    #[test]
    fn test_reserve_ip_blocks_duplicate_requested_address() {
        let alloc_dir = tempfile::tempdir().unwrap();
        BridgeNetwork::record_allocated_ip_in_dir(alloc_dir.path(), "one", "10.0.42.2").unwrap();

        let second = BridgeNetwork::reserve_ip_in_dir(
            alloc_dir.path(),
            "two",
            "10.0.42.0/24",
            Some("10.0.42.2"),
        );
        let message = second.unwrap_err().to_string();
        assert!(
            message.contains("already in use"),
            "second reservation of the same IP must fail"
        );
    }

    /// Dropping an armed rollback guard releases the IP reservation it tracks.
    #[test]
    fn test_setup_rollback_releases_reserved_ip() {
        let alloc_dir = tempfile::tempdir().unwrap();
        BridgeNetwork::record_allocated_ip_in_dir(alloc_dir.path(), "rollback", "10.0.42.3")
            .unwrap();

        // `new` yields an armed guard with no veth, NAT rule or port forwards
        // recorded, so only the reservation is rolled back.
        let guard = SetupRollback::new(
            "veth-test".to_string(),
            "10.0.42.0/24".to_string(),
            Some((alloc_dir.path().to_path_buf(), "rollback".to_string())),
        );
        drop(guard);

        let reservation = alloc_dir.path().join("rollback.ip");
        assert!(
            !reservation.exists(),
            "rollback must release reserved IP files on setup failure"
        );
    }

    /// Both chains get a rule, and the OUTPUT rule matches local destinations.
    #[test]
    fn test_port_forward_rules_include_output_chain_for_local_host_clients() {
        let forward = PortForward {
            host_ip: None,
            host_port: 8080,
            container_port: 80,
            protocol: crate::network::config::Protocol::Tcp,
        };

        let prerouting =
            BridgeNetwork::port_forward_rule_args("-A", "PREROUTING", "10.0.42.2", &forward);
        let output = BridgeNetwork::port_forward_rule_args("-A", "OUTPUT", "10.0.42.2", &forward);

        assert!(prerouting.iter().any(|arg| arg == "PREROUTING"));
        assert!(output.iter().any(|arg| arg == "OUTPUT"));

        let targets_local = output
            .windows(2)
            .any(|pair| pair[0] == "--dst-type" && pair[1] == "LOCAL");
        assert!(
            targets_local,
            "OUTPUT rule must target local-destination traffic"
        );
    }

    /// A configured host IP must appear as the `-d` match in both chains.
    #[test]
    fn test_port_forward_rules_include_host_ip_when_configured() {
        let forward = PortForward {
            host_ip: Some(std::net::Ipv4Addr::new(127, 0, 0, 1)),
            host_port: 4173,
            container_port: 4173,
            protocol: crate::network::config::Protocol::Tcp,
        };

        let prerouting =
            BridgeNetwork::port_forward_rule_args("-A", "PREROUTING", "10.0.42.2", &forward);
        let output = BridgeNetwork::port_forward_rule_args("-A", "OUTPUT", "10.0.42.2", &forward);

        for args in [&prerouting, &output] {
            let restricted_to_host_ip = args
                .windows(2)
                .any(|pair| pair[0] == "-d" && pair[1] == "127.0.0.1");
            assert!(
                restricted_to_host_ip,
                "port forward must restrict DNAT rules to the configured host IP"
            );
        }
    }
}