1use super::{egress, netlink, netns};
2use crate::error::{NucleusError, Result, StateTransition};
3use crate::network::config::{BridgeConfig, EgressPolicy, PortForward};
4use crate::network::NetworkState;
5use serde::{Deserialize, Serialize};
6use std::fs::OpenOptions;
7use std::net::Ipv4Addr;
8use std::os::fd::FromRawFd;
9use std::os::unix::fs::FileTypeExt;
10use std::os::unix::fs::OpenOptionsExt;
11use std::os::unix::io::AsRawFd;
12use std::process::Command;
13use tracing::{debug, info, warn};
14
/// Host-side handle for one container's bridge networking.
///
/// Instances are produced by `setup` / `setup_with_id`. Calling `cleanup`
/// tears the network down explicitly; otherwise the `Drop` impl performs the
/// same teardown on a best-effort basis.
pub struct BridgeNetwork {
    /// Bridge configuration this network was built from (cloned during setup).
    config: BridgeConfig,
    /// IPv4 address reserved for the container, kept in dotted-quad text form.
    container_ip: String,
    /// Name of the host-side veth interface; deleted during cleanup.
    veth_host: String,
    /// Identifier used to name the `<id>.ip` reservation file.
    container_id: String,
    /// True while this instance holds a reference in the ip_forward refcount.
    ip_forward_ref_acquired: bool,
    /// Lifecycle state; once `Cleaned`, drop-time cleanup is skipped.
    state: NetworkState,
}
24
/// Procfs file that is read and written to toggle IPv4 forwarding.
const IP_FORWARD_SYSCTL_PATH: &str = "/proc/sys/net/ipv4/ip_forward";
/// File name (inside the IP allocation directory) of the `flock` target that
/// serializes ip_forward refcount updates.
const IP_FORWARD_LOCK_FILE: &str = ".ip_forward.lock";
/// File name (inside the IP allocation directory) of the JSON-encoded
/// `IpForwardRefState`.
const IP_FORWARD_STATE_FILE: &str = ".ip_forward.state";
28
/// Persisted bookkeeping for the shared `ip_forward` sysctl.
///
/// Serialized as JSON into the state file next to the IP reservations.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct IpForwardRefState {
    /// Number of outstanding references to "forwarding enabled".
    refcount: u64,
    /// Sysctl value captured before the first reference was taken; written
    /// back when the refcount drops to zero.
    original_value: String,
}
34
35impl BridgeNetwork {
36 fn open_dev_urandom() -> Result<std::fs::File> {
37 let file = OpenOptions::new()
38 .read(true)
39 .custom_flags(libc::O_NOFOLLOW | libc::O_CLOEXEC)
40 .open("/dev/urandom")
41 .map_err(|e| {
42 NucleusError::NetworkError(format!("Failed to open /dev/urandom: {}", e))
43 })?;
44
45 let metadata = file.metadata().map_err(|e| {
46 NucleusError::NetworkError(format!("Failed to stat /dev/urandom: {}", e))
47 })?;
48 if !metadata.file_type().is_char_device() {
49 return Err(NucleusError::NetworkError(
50 "/dev/urandom is not a character device".to_string(),
51 ));
52 }
53
54 Ok(file)
55 }
56
57 pub fn setup(pid: u32, config: &BridgeConfig) -> Result<Self> {
64 Self::setup_for(pid, config, &format!("{:x}", pid))
65 }
66
/// Sets up bridge networking for the process `pid` under an explicit
/// `container_id` (used to name the IP reservation file).
///
/// # Errors
/// Propagates any error from `setup_for`.
pub fn setup_with_id(pid: u32, config: &BridgeConfig, container_id: &str) -> Result<Self> {
    Self::setup_for(pid, config, container_id)
}
71
/// Performs the full bridge-network setup for the process `pid`.
///
/// Steps, in order: validate the config, reserve a container IP, create the
/// bridge if needed, create a veth pair and move one end into the target's
/// network namespace, address and route it, add a MASQUERADE rule, take an
/// ip_forward reference, and install port forwards.
///
/// A `SetupRollback` guard records each completed step. Any early return
/// (via `?` or the explicit `drop(rollback)` paths) undoes the recorded
/// steps; the guard is disarmed only on success.
///
/// # Errors
/// Returns the first error hit by any step, including a detected change of
/// the target PID's start time while setup was in progress.
fn setup_for(pid: u32, config: &BridgeConfig, container_id: &str) -> Result<Self> {
    config.validate()?;

    let mut net_state = NetworkState::Unconfigured;
    net_state = net_state.transition(NetworkState::Configuring)?;

    // Reserve the address first; nothing else exists yet that would need
    // rolling back if this fails.
    let alloc_dir = Self::ip_alloc_dir();
    let container_ip = Self::reserve_ip_in_dir(
        &alloc_dir,
        container_id,
        &config.subnet,
        config.container_ip.as_deref(),
    )?;
    let prefix = Self::subnet_prefix(&config.subnet);

    // Interface names are cut to at most 15 bytes.
    // NOTE(review): presumably the kernel interface-name limit — confirm.
    let veth_host_full = format!("veth-{:x}", pid);
    let veth_cont_full = format!("vethc-{:x}", pid);
    let veth_host = veth_host_full[..veth_host_full.len().min(15)].to_string();
    let veth_container = veth_cont_full[..veth_cont_full.len().min(15)].to_string();
    // From here on, dropping `rollback` while armed releases the reserved IP
    // and undoes every step flagged below.
    let mut rollback = SetupRollback::new(
        veth_host.clone(),
        config.subnet.clone(),
        Some((alloc_dir.clone(), container_id.to_string())),
    );

    Self::ensure_bridge_for(&config.bridge_name, &config.subnet)?;

    netlink::create_veth(&veth_host, &veth_container)?;
    rollback.veth_created = true;

    // Host end: attach to the bridge and bring it up.
    netlink::set_link_master(&veth_host, &config.bridge_name)?;
    netlink::set_link_up(&veth_host)?;

    // Container end: move into the network namespace of `pid`.
    netlink::set_link_netns(&veth_container, pid)?;

    // Record the process start time so PID reuse can be detected after the
    // namespace work below. A value of 0 means the stat file was unreadable.
    let start_ticks = Self::read_pid_start_ticks(pid);
    if start_ticks == 0 {
        drop(rollback);
        return Err(NucleusError::NetworkError(format!(
            "Cannot read start_ticks for PID {} – process may have exited",
            pid
        )));
    }

    let container_addr: Ipv4Addr = container_ip.parse().map_err(|e| {
        NucleusError::NetworkError(format!("invalid container IP '{}': {}", container_ip, e))
    })?;
    {
        // Inside the target namespace: assign the address and bring up the
        // veth end and loopback.
        let vc = veth_container.clone();
        netns::in_netns(pid, move || {
            netlink::add_addr(&vc, container_addr, prefix)?;
            netlink::set_link_up(&vc)?;
            netlink::set_link_up("lo")?;
            Ok(())
        })?;
    }

    // If the start time changed, the PID now refers to a different process.
    let current_ticks = Self::read_pid_start_ticks(pid);
    if current_ticks != start_ticks {
        drop(rollback);
        return Err(NucleusError::NetworkError(format!(
            "PID {} was recycled during network setup (start_ticks changed: {} -> {})",
            pid, start_ticks, current_ticks
        )));
    }

    // Default route inside the namespace points at the bridge gateway.
    let gateway = Self::gateway_from_subnet(&config.subnet);
    let gateway_addr: Ipv4Addr = gateway.parse().map_err(|e| {
        NucleusError::NetworkError(format!("invalid gateway IP '{}': {}", gateway, e))
    })?;
    netns::in_netns(pid, move || netlink::add_default_route(gateway_addr))?;

    // Source-NAT traffic leaving the subnet.
    Self::run_cmd(
        "iptables",
        &[
            "-t",
            "nat",
            "-A",
            "POSTROUTING",
            "-s",
            &config.subnet,
            "-j",
            "MASQUERADE",
        ],
    )?;
    rollback.nat_added = true;

    Self::acquire_ip_forward_ref()?;
    rollback.ip_forward_ref_acquired = true;

    // Each forward is registered for rollback only after it was installed.
    for pf in &config.port_forwards {
        Self::setup_port_forward_for(&container_ip, pf)?;
        rollback
            .port_forwards
            .push((container_ip.clone(), pf.clone()));
    }

    net_state = net_state.transition(NetworkState::Active)?;

    info!(
        "Bridge network configured: {} -> {} (IP: {})",
        veth_host, veth_container, container_ip
    );
    // Success: ownership of the resources moves to the returned value, so
    // the guard must not undo them.
    let ip_forward_ref_acquired = rollback.ip_forward_ref_acquired;
    rollback.disarm();

    Ok(Self {
        config: config.clone(),
        container_ip,
        veth_host,
        container_id: container_id.to_string(),
        ip_forward_ref_acquired,
        state: net_state,
    })
}
202
/// Applies `policy` to the network namespace of `pid`, passing this
/// network's configured DNS servers along.
///
/// Delegates to `egress::apply_egress_policy`; the trailing `false` is that
/// function's fourth argument.
/// NOTE(review): the meaning of that flag is not visible here — confirm
/// against the `egress` module.
pub fn apply_egress_policy(&self, pid: u32, policy: &EgressPolicy) -> Result<()> {
    egress::apply_egress_policy(pid, &self.config.dns, policy, false)
}
210
211 pub fn cleanup(mut self) -> Result<()> {
215 self.state = self.state.transition(NetworkState::Cleaned)?;
216
217 Self::release_allocated_ip(&self.container_id);
219
220 for pf in &self.config.port_forwards {
222 if let Err(e) = self.cleanup_port_forward(pf) {
223 warn!("Failed to cleanup port forward: {}", e);
224 }
225 }
226
227 let _ = Self::run_cmd(
229 "iptables",
230 &[
231 "-t",
232 "nat",
233 "-D",
234 "POSTROUTING",
235 "-s",
236 &self.config.subnet,
237 "-j",
238 "MASQUERADE",
239 ],
240 );
241
242 let _ = netlink::del_link(&self.veth_host);
244
245 if self.ip_forward_ref_acquired {
246 if let Err(e) = Self::release_ip_forward_ref() {
247 warn!("Failed to release ip_forward refcount: {}", e);
248 } else {
249 self.ip_forward_ref_acquired = false;
250 }
251 }
252
253 info!("Bridge network cleaned up");
254 Ok(())
255 }
256
257 fn cleanup_best_effort(&mut self) {
261 if self.state == NetworkState::Cleaned {
262 return;
263 }
264
265 Self::release_allocated_ip(&self.container_id);
266
267 for pf in &self.config.port_forwards {
268 let _ = self.cleanup_port_forward(pf);
269 }
270
271 let _ = Self::run_cmd(
272 "iptables",
273 &[
274 "-t",
275 "nat",
276 "-D",
277 "POSTROUTING",
278 "-s",
279 &self.config.subnet,
280 "-j",
281 "MASQUERADE",
282 ],
283 );
284
285 let _ = netlink::del_link(&self.veth_host);
286
287 if self.ip_forward_ref_acquired {
288 let _ = Self::release_ip_forward_ref();
289 self.ip_forward_ref_acquired = false;
290 }
291
292 self.state = NetworkState::Cleaned;
293 debug!("Bridge network cleaned up (best-effort via drop)");
294 }
295
296 pub fn cleanup_orphaned_rules(subnet: &str) {
302 let iptables = match Self::resolve_bin("iptables") {
304 Ok(path) => path,
305 Err(e) => {
306 debug!("Cannot resolve iptables for orphaned rule cleanup: {}", e);
307 return;
308 }
309 };
310 let output = match Command::new(&iptables)
311 .args(["-t", "nat", "-L", "POSTROUTING", "-n"])
312 .output()
313 {
314 Ok(o) => o,
315 Err(e) => {
316 debug!("Cannot check iptables for orphaned rules: {}", e);
317 return;
318 }
319 };
320
321 let stdout = String::from_utf8_lossy(&output.stdout);
322 let mut orphaned_count = 0u32;
323 for line in stdout.lines() {
324 if line.contains("MASQUERADE") && line.contains(subnet) {
325 let _ = Self::run_cmd(
327 "iptables",
328 &[
329 "-t",
330 "nat",
331 "-D",
332 "POSTROUTING",
333 "-s",
334 subnet,
335 "-j",
336 "MASQUERADE",
337 ],
338 );
339 orphaned_count += 1;
340 }
341 }
342
343 if orphaned_count > 0 {
344 info!(
345 "Cleaned up {} orphaned iptables MASQUERADE rule(s) for subnet {}",
346 orphaned_count, subnet
347 );
348 }
349 }
350
351 fn ensure_bridge_for(bridge_name: &str, subnet: &str) -> Result<()> {
352 if netlink::link_exists(bridge_name) {
353 return Ok(());
354 }
355
356 netlink::create_bridge(bridge_name)?;
357
358 let gateway = Self::gateway_from_subnet(subnet);
359 let gateway_addr: Ipv4Addr = gateway.parse().map_err(|e| {
360 NucleusError::NetworkError(format!("invalid bridge gateway '{}': {}", gateway, e))
361 })?;
362 netlink::add_addr(bridge_name, gateway_addr, Self::subnet_prefix(subnet))?;
363 netlink::set_link_up(bridge_name)?;
364
365 info!("Created bridge {}", bridge_name);
366 Ok(())
367 }
368
369 fn setup_port_forward_for(container_ip: &str, pf: &PortForward) -> Result<()> {
370 for chain in ["PREROUTING", "OUTPUT"] {
371 let args = Self::port_forward_rule_args("-A", chain, container_ip, pf);
372 Self::run_cmd_owned("iptables", &args)?;
373 }
374
375 let host_ip = pf
376 .host_ip
377 .map(|ip| ip.to_string())
378 .unwrap_or_else(|| "0.0.0.0".to_string());
379 info!(
380 "Port forward: {}:{} -> {}:{}/{}",
381 host_ip, pf.host_port, container_ip, pf.container_port, pf.protocol
382 );
383 Ok(())
384 }
385
386 fn cleanup_port_forward(&self, pf: &PortForward) -> Result<()> {
387 for chain in ["OUTPUT", "PREROUTING"] {
388 let args = Self::port_forward_rule_args("-D", chain, &self.container_ip, pf);
389 Self::run_cmd_owned("iptables", &args)?;
390 }
391 Ok(())
392 }
393
394 fn allocate_ip_with_reserved(
400 subnet: &str,
401 reserved: &std::collections::HashSet<String>,
402 ) -> Result<String> {
403 let base = subnet.split('/').next().unwrap_or("10.0.42.0");
404 let parts: Vec<&str> = base.split('.').collect();
405 if parts.len() != 4 {
406 return Ok("10.0.42.2".to_string());
407 }
408
409 let mut rand_buf = [0u8; 128];
416 let mut urandom = Self::open_dev_urandom()?;
417 std::io::Read::read_exact(&mut urandom, &mut rand_buf).map_err(|e| {
418 NucleusError::NetworkError(format!("Failed to read /dev/urandom: {}", e))
419 })?;
420 for &byte in &rand_buf {
421 if byte >= 253 {
423 continue;
424 }
425 let offset = byte as u32 + 2;
426 let candidate = format!("{}.{}.{}.{}", parts[0], parts[1], parts[2], offset);
427 if reserved.contains(&candidate) {
428 continue;
429 }
430 if !Self::is_ip_in_use(&candidate)? {
431 return Ok(candidate);
433 }
434 }
435
436 Err(NucleusError::NetworkError(format!(
437 "Failed to allocate free IP in subnet {}",
438 subnet
439 )))
440 }
441
442 fn reserve_ip_in_dir(
443 alloc_dir: &std::path::Path,
444 container_id: &str,
445 subnet: &str,
446 requested_ip: Option<&str>,
447 ) -> Result<String> {
448 Self::ensure_alloc_dir(alloc_dir)?;
449 let lock_path = alloc_dir.join(".lock");
450 let lock_file = std::fs::OpenOptions::new()
451 .create(true)
452 .write(true)
453 .truncate(false)
454 .open(&lock_path)
455 .map_err(|e| {
456 NucleusError::NetworkError(format!("Failed to open IP alloc lock: {}", e))
457 })?;
458 let lock_ret = unsafe { libc::flock(lock_file.as_raw_fd(), libc::LOCK_EX) };
461 if lock_ret != 0 {
462 return Err(NucleusError::NetworkError(format!(
463 "Failed to acquire IP alloc lock: {}",
464 std::io::Error::last_os_error()
465 )));
466 }
467
468 let reserved = Self::collect_reserved_ips_in_dir(alloc_dir);
469 let ip = match requested_ip {
470 Some(ip) => {
471 if reserved.contains(ip) || Self::is_ip_in_use(ip)? {
472 return Err(NucleusError::NetworkError(format!(
473 "Requested container IP {} is already in use",
474 ip
475 )));
476 }
477 ip.to_string()
478 }
479 None => Self::allocate_ip_with_reserved(subnet, &reserved)?,
480 };
481
482 Self::record_allocated_ip_in_dir(alloc_dir, container_id, &ip)?;
483 Ok(ip)
484 }
485
/// Collects the addresses stored in every `*.ip` file directly inside
/// `alloc_dir`.
///
/// File contents are trimmed; empty results, unreadable files, entries whose
/// name is not valid UTF-8, and an unreadable directory are all silently
/// skipped.
fn collect_reserved_ips_in_dir(
    alloc_dir: &std::path::Path,
) -> std::collections::HashSet<String> {
    let entries = match std::fs::read_dir(alloc_dir) {
        Ok(listing) => listing,
        Err(_) => return std::collections::HashSet::new(),
    };

    entries
        .flatten()
        .filter(|entry| {
            entry
                .file_name()
                .to_str()
                .map_or(false, |name| name.ends_with(".ip"))
        })
        .filter_map(|entry| std::fs::read_to_string(entry.path()).ok())
        .map(|raw| raw.trim().to_string())
        .filter(|ip| !ip.is_empty())
        .collect()
}
507
508 fn record_allocated_ip_in_dir(
510 alloc_dir: &std::path::Path,
511 container_id: &str,
512 ip: &str,
513 ) -> Result<()> {
514 Self::ensure_alloc_dir(alloc_dir)?;
515 let path = alloc_dir.join(format!("{}.ip", container_id));
516 std::fs::write(&path, ip).map_err(|e| {
517 NucleusError::NetworkError(format!("Failed to record IP allocation: {}", e))
518 })?;
519 Ok(())
520 }
521
522 fn release_allocated_ip(container_id: &str) {
524 let alloc_dir = Self::ip_alloc_dir();
525 Self::release_allocated_ip_in_dir(&alloc_dir, container_id);
526 }
527
/// Deletes `<alloc_dir>/<container_id>.ip`. A missing file or any other
/// removal error is ignored.
fn release_allocated_ip_in_dir(alloc_dir: &std::path::Path, container_id: &str) {
    let mut reservation = alloc_dir.to_path_buf();
    reservation.push(format!("{}.ip", container_id));
    let _ = std::fs::remove_file(reservation);
}
532
533 fn ensure_alloc_dir(alloc_dir: &std::path::Path) -> Result<()> {
536 if alloc_dir.exists() {
539 if let Ok(meta) = std::fs::symlink_metadata(alloc_dir) {
540 if meta.file_type().is_symlink() {
541 return Err(NucleusError::NetworkError(format!(
542 "IP alloc dir {:?} is a symlink, refusing to use",
543 alloc_dir
544 )));
545 }
546 }
547 }
548 if let Some(parent) = alloc_dir.parent() {
550 if let Ok(meta) = std::fs::symlink_metadata(parent) {
551 if meta.file_type().is_symlink() {
552 return Err(NucleusError::NetworkError(format!(
553 "IP alloc dir parent {:?} is a symlink, refusing to use",
554 parent
555 )));
556 }
557 }
558 }
559
560 std::fs::create_dir_all(alloc_dir).map_err(|e| {
561 NucleusError::NetworkError(format!("Failed to create IP alloc dir: {}", e))
562 })?;
563
564 use std::os::unix::fs::PermissionsExt;
566 let perms = std::fs::Permissions::from_mode(0o700);
567 std::fs::set_permissions(alloc_dir, perms).map_err(|e| {
568 NucleusError::NetworkError(format!(
569 "Failed to set permissions on IP alloc dir {:?}: {}",
570 alloc_dir, e
571 ))
572 })?;
573
574 if let Ok(meta) = std::fs::symlink_metadata(alloc_dir) {
576 if meta.file_type().is_symlink() {
577 return Err(NucleusError::NetworkError(format!(
578 "IP alloc dir {:?} was replaced with a symlink during setup",
579 alloc_dir
580 )));
581 }
582 }
583 Ok(())
584 }
585
586 fn ip_alloc_dir() -> std::path::PathBuf {
587 if nix::unistd::Uid::effective().is_root() {
588 std::path::PathBuf::from("/var/run/nucleus/ip-alloc")
589 } else {
590 dirs::runtime_dir()
591 .map(|d| d.join("nucleus/ip-alloc"))
592 .or_else(|| dirs::data_local_dir().map(|d| d.join("nucleus/ip-alloc")))
593 .unwrap_or_else(|| {
594 dirs::home_dir()
595 .map(|h| h.join(".nucleus/ip-alloc"))
596 .unwrap_or_else(|| std::path::PathBuf::from("/var/run/nucleus/ip-alloc"))
597 })
598 }
599 }
600
601 fn ip_forward_lock_path(alloc_dir: &std::path::Path) -> std::path::PathBuf {
602 alloc_dir.join(IP_FORWARD_LOCK_FILE)
603 }
604
605 fn ip_forward_state_path(alloc_dir: &std::path::Path) -> std::path::PathBuf {
606 alloc_dir.join(IP_FORWARD_STATE_FILE)
607 }
608
609 fn read_ip_forward_value(sysctl_path: &std::path::Path) -> Result<String> {
610 std::fs::read_to_string(sysctl_path)
611 .map(|value| value.trim().to_string())
612 .map_err(|e| {
613 NucleusError::NetworkError(format!(
614 "Failed to read {}: {}",
615 sysctl_path.display(),
616 e
617 ))
618 })
619 }
620
621 fn write_ip_forward_value(sysctl_path: &std::path::Path, value: &str) -> Result<()> {
622 std::fs::write(sysctl_path, value).map_err(|e| {
623 NucleusError::NetworkError(format!(
624 "Failed to write {} to {}: {}",
625 value,
626 sysctl_path.display(),
627 e
628 ))
629 })
630 }
631
632 fn load_ip_forward_state(alloc_dir: &std::path::Path) -> Result<Option<IpForwardRefState>> {
633 let state_path = Self::ip_forward_state_path(alloc_dir);
634 match std::fs::read_to_string(&state_path) {
635 Ok(content) => serde_json::from_str(&content).map(Some).map_err(|e| {
636 NucleusError::NetworkError(format!(
637 "Failed to parse ip_forward refcount state {:?}: {}",
638 state_path, e
639 ))
640 }),
641 Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(None),
642 Err(e) => Err(NucleusError::NetworkError(format!(
643 "Failed to read ip_forward refcount state {:?}: {}",
644 state_path, e
645 ))),
646 }
647 }
648
649 fn store_ip_forward_state(
650 alloc_dir: &std::path::Path,
651 state: &IpForwardRefState,
652 ) -> Result<()> {
653 let state_path = Self::ip_forward_state_path(alloc_dir);
654 let encoded = serde_json::to_vec(state).map_err(|e| {
655 NucleusError::NetworkError(format!(
656 "Failed to serialize ip_forward refcount state {:?}: {}",
657 state_path, e
658 ))
659 })?;
660 std::fs::write(&state_path, encoded).map_err(|e| {
661 NucleusError::NetworkError(format!(
662 "Failed to persist ip_forward refcount state {:?}: {}",
663 state_path, e
664 ))
665 })
666 }
667
668 fn remove_ip_forward_state(alloc_dir: &std::path::Path) -> Result<()> {
669 let state_path = Self::ip_forward_state_path(alloc_dir);
670 match std::fs::remove_file(&state_path) {
671 Ok(()) => Ok(()),
672 Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(()),
673 Err(e) => Err(NucleusError::NetworkError(format!(
674 "Failed to remove ip_forward refcount state {:?}: {}",
675 state_path, e
676 ))),
677 }
678 }
679
680 fn acquire_ip_forward_ref() -> Result<()> {
681 let alloc_dir = Self::ip_alloc_dir();
682 Self::acquire_ip_forward_ref_in_dir(
683 &alloc_dir,
684 std::path::Path::new(IP_FORWARD_SYSCTL_PATH),
685 )
686 }
687
/// Takes one reference on "IPv4 forwarding enabled", with state kept in
/// `alloc_dir` and the sysctl located at `sysctl_path`.
///
/// Runs under an exclusive `flock` on the ip_forward lock file, held until
/// `lock_file` is dropped at function exit. If no state file exists, the
/// current sysctl value is captured as `original_value` and stored with a
/// refcount of 0. Whenever the loaded refcount is 0, `"1"` is written to
/// the sysctl. The refcount is then incremented and persisted.
///
/// # Errors
/// Fails if the directory, lock, state file, or sysctl cannot be accessed,
/// or if the refcount would overflow.
fn acquire_ip_forward_ref_in_dir(
    alloc_dir: &std::path::Path,
    sysctl_path: &std::path::Path,
) -> Result<()> {
    Self::ensure_alloc_dir(alloc_dir)?;
    let lock_path = Self::ip_forward_lock_path(alloc_dir);
    let lock_file = std::fs::OpenOptions::new()
        .create(true)
        .write(true)
        .truncate(false)
        .open(&lock_path)
        .map_err(|e| {
            NucleusError::NetworkError(format!(
                "Failed to open ip_forward lock {:?}: {}",
                lock_path, e
            ))
        })?;
    // SAFETY: `lock_file` is open for the whole call, so its raw descriptor
    // is valid here.
    let lock_ret = unsafe { libc::flock(lock_file.as_raw_fd(), libc::LOCK_EX) };
    if lock_ret != 0 {
        return Err(NucleusError::NetworkError(format!(
            "Failed to acquire ip_forward lock: {}",
            std::io::Error::last_os_error()
        )));
    }

    let mut state = match Self::load_ip_forward_state(alloc_dir)? {
        Some(state) => state,
        None => {
            // First reference: remember the value to restore later and
            // persist it before the sysctl is changed.
            let original_value = Self::read_ip_forward_value(sysctl_path)?;
            let state = IpForwardRefState {
                refcount: 0,
                original_value,
            };
            Self::store_ip_forward_state(alloc_dir, &state)?;
            state
        }
    };

    // Only the 0 -> 1 transition touches the sysctl.
    if state.refcount == 0 {
        Self::write_ip_forward_value(sysctl_path, "1")?;
    }
    state.refcount = state.refcount.checked_add(1).ok_or_else(|| {
        NucleusError::NetworkError("ip_forward refcount overflow".to_string())
    })?;
    Self::store_ip_forward_state(alloc_dir, &state)
}
734
735 fn release_ip_forward_ref() -> Result<()> {
736 let alloc_dir = Self::ip_alloc_dir();
737 Self::release_ip_forward_ref_in_dir(
738 &alloc_dir,
739 std::path::Path::new(IP_FORWARD_SYSCTL_PATH),
740 )
741 }
742
/// Drops one reference on "IPv4 forwarding enabled", with state kept in
/// `alloc_dir` and the sysctl located at `sysctl_path`.
///
/// Returns `Ok(())` without doing anything if `alloc_dir` does not exist or
/// no state file is present. Otherwise, under an exclusive `flock` on the
/// ip_forward lock file, the refcount is decremented. When it reaches zero
/// the captured original value is written back to the sysctl and the state
/// file is removed; otherwise the decremented state is persisted.
///
/// # Errors
/// Fails if the lock, state file, or sysctl cannot be accessed.
fn release_ip_forward_ref_in_dir(
    alloc_dir: &std::path::Path,
    sysctl_path: &std::path::Path,
) -> Result<()> {
    if !alloc_dir.exists() {
        return Ok(());
    }
    let lock_path = Self::ip_forward_lock_path(alloc_dir);
    let lock_file = std::fs::OpenOptions::new()
        .create(true)
        .write(true)
        .truncate(false)
        .open(&lock_path)
        .map_err(|e| {
            NucleusError::NetworkError(format!(
                "Failed to open ip_forward lock {:?}: {}",
                lock_path, e
            ))
        })?;
    // SAFETY: `lock_file` is open for the whole call, so its raw descriptor
    // is valid here.
    let lock_ret = unsafe { libc::flock(lock_file.as_raw_fd(), libc::LOCK_EX) };
    if lock_ret != 0 {
        return Err(NucleusError::NetworkError(format!(
            "Failed to acquire ip_forward lock: {}",
            std::io::Error::last_os_error()
        )));
    }

    let Some(mut state) = Self::load_ip_forward_state(alloc_dir)? else {
        return Ok(());
    };

    // A stored refcount of 0 has nothing to release; just drop the file
    // (the sysctl is not touched on this path).
    if state.refcount == 0 {
        return Self::remove_ip_forward_state(alloc_dir);
    }

    state.refcount -= 1;
    if state.refcount == 0 {
        // Last reference gone: restore the value captured on first acquire.
        Self::write_ip_forward_value(sysctl_path, &state.original_value)?;
        Self::remove_ip_forward_state(alloc_dir)?;
        info!("Restored net.ipv4.ip_forward to {}", state.original_value);
    } else {
        Self::store_ip_forward_state(alloc_dir, &state)?;
    }

    Ok(())
}
789
/// Returns the start time of process `pid` as recorded in
/// `/proc/<pid>/stat`, or 0 if it cannot be determined.
///
/// The command name field is enclosed in parentheses and may itself contain
/// spaces or parentheses, so parsing starts after the *last* `)`. From
/// there the start time is the 20th whitespace-separated token (index 19).
///
/// Never panics: a missing file, truncated content, or an unparsable field
/// all yield 0.
fn read_pid_start_ticks(pid: u32) -> u64 {
    let content = match std::fs::read_to_string(format!("/proc/{}/stat", pid)) {
        Ok(text) => text,
        Err(_) => return 0,
    };

    content
        .rfind(')')
        // `get` instead of slicing: no panic if nothing follows the ')'.
        .and_then(|close| content.get(close + 1..))
        .and_then(|rest| rest.split_whitespace().nth(19))
        .and_then(|ticks| ticks.parse::<u64>().ok())
        .unwrap_or(0)
}
807
/// Derives the gateway address for `subnet` by replacing the last octet of
/// its network address with `1`.
///
/// Anything after the first `/` is ignored. If the address part does not
/// consist of exactly four dot-separated components, `10.0.42.1` is
/// returned. The components themselves are not validated.
fn gateway_from_subnet(subnet: &str) -> String {
    let network = subnet.split_once('/').map_or(subnet, |(addr, _)| addr);
    let octets: Vec<&str> = network.split('.').collect();
    match octets.as_slice() {
        [first, second, third, _] => format!("{}.{}.{}.1", first, second, third),
        _ => "10.0.42.1".to_string(),
    }
}
818
/// Extracts the prefix length from a `addr/len` subnet string.
///
/// Returns 24 when there is no `/`, when the part after the first `/` is
/// not a `u8`, or when it is greater than 32.
fn subnet_prefix(subnet: &str) -> u8 {
    match subnet.split_once('/') {
        Some((_, bits)) => match bits.parse::<u8>() {
            Ok(len) if len <= 32 => len,
            _ => 24,
        },
        None => 24,
    }
}
826
827 pub(crate) fn resolve_bin(name: &str) -> Result<String> {
836 let search_dirs: &[&str] = match name {
837 "iptables" => &[
838 "/usr/sbin/iptables",
839 "/sbin/iptables",
840 "/usr/bin/iptables",
841 "/run/current-system/sw/bin/iptables",
842 ],
843 "slirp4netns" => &[
844 "/usr/bin/slirp4netns",
845 "/bin/slirp4netns",
846 "/run/current-system/sw/bin/slirp4netns",
847 ],
848 _ => &[],
849 };
850
851 for path in search_dirs {
852 let p = std::path::Path::new(path);
853 if p.exists() {
854 Self::validate_network_binary(p, name)?;
855 let resolved = std::fs::canonicalize(p).map_err(|e| {
856 NucleusError::NetworkError(format!(
857 "Cannot canonicalize {} at {:?}: {}",
858 name, p, e
859 ))
860 })?;
861 return Ok(resolved.to_string_lossy().into_owned());
862 }
863 }
864
865 if nix::unistd::Uid::effective().is_root() {
866 return Err(NucleusError::NetworkError(format!(
867 "Required binary '{}' not found in trusted system paths",
868 name
869 )));
870 }
871
872 if let Some(path_var) = std::env::var_os("PATH") {
873 for dir in std::env::split_paths(&path_var) {
874 let candidate = dir.join(name);
875 if candidate.exists() {
876 Self::validate_network_binary(&candidate, name)?;
877 let resolved = std::fs::canonicalize(&candidate).map_err(|e| {
878 NucleusError::NetworkError(format!(
879 "Cannot canonicalize {} at {:?}: {}",
880 name, candidate, e
881 ))
882 })?;
883 return Ok(resolved.to_string_lossy().into_owned());
884 }
885 }
886 }
887
888 Err(NucleusError::NetworkError(format!(
889 "Required binary '{}' not found or failed validation",
890 name
891 )))
892 }
893
894 fn validate_network_binary(path: &std::path::Path, name: &str) -> Result<()> {
898 use std::os::unix::fs::MetadataExt;
899
900 let resolved = std::fs::canonicalize(path).unwrap_or_else(|_| path.to_path_buf());
901 let meta = std::fs::metadata(&resolved)
902 .map_err(|e| NucleusError::NetworkError(format!("Cannot stat {}: {}", name, e)))?;
903 let mode = meta.mode();
904 if mode & 0o111 == 0 {
905 return Err(NucleusError::NetworkError(format!(
906 "Binary '{}' at {:?} is not executable, refusing to execute",
907 name, resolved
908 )));
909 }
910 if mode & 0o022 != 0 {
911 return Err(NucleusError::NetworkError(format!(
912 "Binary '{}' at {:?} is writable by group/others (mode {:o}), refusing to execute",
913 name, resolved, mode
914 )));
915 }
916 let owner = meta.uid();
917 let euid = nix::unistd::Uid::effective().as_raw();
918 if owner != 0 && owner != euid && !Self::is_trusted_store_network_binary(&resolved, mode) {
919 return Err(NucleusError::NetworkError(format!(
920 "Binary '{}' at {:?} owned by UID {} (expected root or euid {}), refusing to execute",
921 name, resolved, owner, euid
922 )));
923 }
924 Ok(())
925 }
926
/// Reports whether `path` counts as an immutable binary in `/nix/store`.
///
/// True only if the path lies under `/nix/store`, `mode` has no owner-write
/// bit, and the parent directory can be stat'ed and has no write bit set
/// for anyone.
fn is_trusted_store_network_binary(path: &std::path::Path, mode: u32) -> bool {
    use std::os::unix::fs::MetadataExt;

    let in_store = path.starts_with("/nix/store");
    let owner_writable = mode & 0o200 != 0;
    if !in_store || owner_writable {
        return false;
    }

    path.parent()
        .and_then(|dir| std::fs::metadata(dir).ok())
        .map_or(false, |dir_meta| dir_meta.mode() & 0o222 == 0)
}
942
943 fn run_cmd(program: &str, args: &[&str]) -> Result<()> {
944 let resolved = Self::resolve_bin(program)?;
945 let output = Command::new(&resolved).args(args).output().map_err(|e| {
946 NucleusError::NetworkError(format!("Failed to run {} {:?}: {}", resolved, args, e))
947 })?;
948
949 if !output.status.success() {
950 let stderr = String::from_utf8_lossy(&output.stderr);
951 return Err(NucleusError::NetworkError(format!(
952 "{} {:?} failed: {}",
953 program, args, stderr
954 )));
955 }
956
957 Ok(())
958 }
959
960 fn run_cmd_owned(program: &str, args: &[String]) -> Result<()> {
961 let refs: Vec<&str> = args.iter().map(String::as_str).collect();
962 Self::run_cmd(program, &refs)
963 }
964
965 fn port_forward_rule_args(
966 operation: &str,
967 chain: &str,
968 container_ip: &str,
969 pf: &PortForward,
970 ) -> Vec<String> {
971 let mut args = vec![
972 "-t".to_string(),
973 "nat".to_string(),
974 operation.to_string(),
975 chain.to_string(),
976 "-p".to_string(),
977 pf.protocol.as_str().to_string(),
978 ];
979
980 if chain == "OUTPUT" {
981 args.extend([
982 "-m".to_string(),
983 "addrtype".to_string(),
984 "--dst-type".to_string(),
985 "LOCAL".to_string(),
986 ]);
987 }
988
989 if let Some(host_ip) = pf.host_ip {
990 args.extend(["-d".to_string(), host_ip.to_string()]);
991 }
992
993 args.extend([
994 "--dport".to_string(),
995 pf.host_port.to_string(),
996 "-j".to_string(),
997 "DNAT".to_string(),
998 "--to-destination".to_string(),
999 format!("{}:{}", container_ip, pf.container_port),
1000 ]);
1001
1002 args
1003 }
1004
1005 fn is_ip_in_use(ip: &str) -> Result<bool> {
1006 let addr: Ipv4Addr = ip
1007 .parse()
1008 .map_err(|e| NucleusError::NetworkError(format!("invalid IP '{}': {}", ip, e)))?;
1009 netlink::is_addr_in_use(&addr)
1010 }
1011
1012 pub fn write_resolv_conf(root: &std::path::Path, dns: &[String]) -> Result<()> {
1014 let resolv_path = root.join("etc/resolv.conf");
1015 let content: String = dns
1016 .iter()
1017 .map(|server| format!("nameserver {}\n", server))
1018 .collect();
1019 std::fs::write(&resolv_path, content).map_err(|e| {
1020 NucleusError::NetworkError(format!("Failed to write resolv.conf: {}", e))
1021 })?;
1022 Ok(())
1023 }
1024
/// Provides `<root>/etc/resolv.conf` by bind-mounting an in-memory file
/// that contains one `nameserver` line per entry in `dns`.
///
/// The content is written to a memfd, which is then bind-mounted over the
/// target via its `/proc/self/fd/<n>` path and remounted with hardened
/// flags. If the memfd cannot be created or written, the function falls
/// back to `bind_mount_resolv_conf_staging`.
///
/// # Errors
/// Fails if the bind mount or the hardening remount fails, or with whatever
/// the staging fallback returns.
pub fn bind_mount_resolv_conf(root: &std::path::Path, dns: &[String]) -> Result<()> {
    use nix::mount::{mount, MsFlags};

    let content: String = dns
        .iter()
        .map(|server| format!("nameserver {}\n", server))
        .collect();

    let memfd_name = std::ffi::CString::new("nucleus-resolv").map_err(|e| {
        NucleusError::NetworkError(format!("Failed to create memfd name: {}", e))
    })?;
    // SAFETY: `memfd_name` is a valid NUL-terminated string that outlives
    // the call. The flags argument is 0.
    let raw_fd = unsafe { libc::memfd_create(memfd_name.as_ptr(), 0) };
    if raw_fd < 0 {
        // memfd unavailable: use a temporary file on disk instead.
        return Self::bind_mount_resolv_conf_staging(root, dns);
    }
    // SAFETY: `raw_fd` was just returned by a successful `memfd_create` and
    // is not owned by anything else.
    let memfd = unsafe { std::os::fd::OwnedFd::from_raw_fd(raw_fd) };

    use std::io::Write as _;
    let mut memfd_file = std::fs::File::from(memfd);
    if memfd_file.write_all(content.as_bytes()).is_err() {
        return Self::bind_mount_resolv_conf_staging(root, dns);
    }
    use std::os::fd::IntoRawFd;
    // Turn the `File` back into an `OwnedFd`; it stays open until the end
    // of this function.
    let memfd = {
        let raw = memfd_file.into_raw_fd();
        // SAFETY: `raw` was just released by `into_raw_fd`, so this is the
        // sole owner of the descriptor.
        unsafe { std::os::fd::OwnedFd::from_raw_fd(raw) }
    };

    let target = root.join("etc/resolv.conf");
    // A bind mount needs an existing mount point; creation errors are
    // ignored here and would surface in the mount call below.
    if !target.exists() {
        let _ = std::fs::write(&target, "");
    }

    let memfd_path = format!("/proc/self/fd/{}", memfd.as_raw_fd());
    if let Err(e) = mount(
        Some(memfd_path.as_str()),
        &target,
        None::<&str>,
        MsFlags::MS_BIND,
        None::<&str>,
    ) {
        return Err(NucleusError::NetworkError(format!(
            "Failed to bind mount memfd-backed resolv.conf: {}",
            e
        )));
    }
    Self::harden_resolv_conf_bind(&target)?;

    // NOTE(review): `memfd` is closed when this function returns; presumably
    // the bind mount keeps the file alive — confirm.
    info!("Bind-mounted resolv.conf for bridge networking (rootfs mode, memfd)");
    Ok(())
}
1097
1098 fn bind_mount_resolv_conf_staging(root: &std::path::Path, dns: &[String]) -> Result<()> {
1100 use nix::mount::{mount, MsFlags};
1101
1102 let content: String = dns
1103 .iter()
1104 .map(|server| format!("nameserver {}\n", server))
1105 .collect();
1106
1107 let staging = Self::create_resolv_conf_staging_file(root, content.as_bytes())?;
1108
1109 let target = root.join("etc/resolv.conf");
1111 if !target.exists() {
1112 let _ = std::fs::write(&target, "");
1113 }
1114
1115 mount(
1117 Some(staging.path()),
1118 &target,
1119 None::<&str>,
1120 MsFlags::MS_BIND,
1121 None::<&str>,
1122 )
1123 .map_err(|e| {
1124 NucleusError::NetworkError(format!("Failed to bind mount resolv.conf: {}", e))
1125 })?;
1126 Self::harden_resolv_conf_bind(&target)?;
1127
1128 info!("Bind-mounted resolv.conf for bridge networking (rootfs mode, staging)");
1132 Ok(())
1133 }
1134
1135 fn create_resolv_conf_staging_file(
1136 root: &std::path::Path,
1137 content: &[u8],
1138 ) -> Result<tempfile::NamedTempFile> {
1139 use std::io::Write as _;
1140
1141 let staging_dir = root.parent().ok_or_else(|| {
1142 NucleusError::NetworkError(format!(
1143 "Container root {:?} has no parent for resolv.conf staging",
1144 root
1145 ))
1146 })?;
1147
1148 let mut staging = tempfile::Builder::new()
1149 .prefix(".resolv.conf.nucleus.")
1150 .tempfile_in(staging_dir)
1151 .map_err(|e| {
1152 NucleusError::NetworkError(format!(
1153 "Failed to create temporary resolv.conf staging file under {:?}: {}",
1154 staging_dir, e
1155 ))
1156 })?;
1157
1158 staging.as_file_mut().write_all(content).map_err(|e| {
1159 NucleusError::NetworkError(format!(
1160 "Failed to write temporary resolv.conf staging file {:?}: {}",
1161 staging.path(),
1162 e
1163 ))
1164 })?;
1165
1166 Ok(staging)
1167 }
1168
1169 fn harden_resolv_conf_bind(target: &std::path::Path) -> Result<()> {
1170 use nix::mount::{mount, MsFlags};
1171
1172 mount(
1173 None::<&str>,
1174 target,
1175 None::<&str>,
1176 MsFlags::MS_REMOUNT
1177 | MsFlags::MS_BIND
1178 | MsFlags::MS_RDONLY
1179 | MsFlags::MS_NOSUID
1180 | MsFlags::MS_NODEV
1181 | MsFlags::MS_NOEXEC,
1182 None::<&str>,
1183 )
1184 .map_err(|e| {
1185 NucleusError::NetworkError(format!(
1186 "Failed to remount resolv.conf with hardened flags at {:?}: {}",
1187 target, e
1188 ))
1189 })
1190 }
1191}
1192
/// Safety net: a `BridgeNetwork` that is dropped without an explicit
/// `cleanup` still gets torn down, with all failures ignored.
impl Drop for BridgeNetwork {
    fn drop(&mut self) {
        // No-op when the state is already `Cleaned`.
        self.cleanup_best_effort();
    }
}
1198
/// Drop guard that undoes a partially completed network setup.
///
/// The flags record which steps have completed; while `armed`, dropping the
/// guard reverses exactly those steps.
struct SetupRollback {
    /// Host-side veth name to delete if `veth_created` is set.
    veth_host: String,
    /// Subnet whose MASQUERADE rule is removed if `nat_added` is set.
    subnet: String,
    /// Set once the veth pair has been created.
    veth_created: bool,
    /// Set once the MASQUERADE rule has been added.
    nat_added: bool,
    /// Port forwards installed so far, as `(container_ip, forward)` pairs.
    port_forwards: Vec<(String, PortForward)>,
    /// Set once an ip_forward reference has been taken.
    ip_forward_ref_acquired: bool,
    /// `(alloc_dir, container_id)` of the IP reservation to release, if any.
    reserved_ip: Option<(std::path::PathBuf, String)>,
    /// While true, dropping the guard performs the rollback.
    armed: bool,
}
1209
1210impl SetupRollback {
1211 fn new(
1212 veth_host: String,
1213 subnet: String,
1214 reserved_ip: Option<(std::path::PathBuf, String)>,
1215 ) -> Self {
1216 Self {
1217 veth_host,
1218 subnet,
1219 veth_created: false,
1220 nat_added: false,
1221 port_forwards: Vec::new(),
1222 ip_forward_ref_acquired: false,
1223 reserved_ip,
1224 armed: true,
1225 }
1226 }
1227
1228 fn disarm(&mut self) {
1229 self.armed = false;
1230 }
1231}
1232
1233impl Drop for SetupRollback {
1234 fn drop(&mut self) {
1235 if !self.armed {
1236 return;
1237 }
1238
1239 for (container_ip, pf) in self.port_forwards.iter().rev() {
1240 for chain in ["OUTPUT", "PREROUTING"] {
1241 let args = BridgeNetwork::port_forward_rule_args("-D", chain, container_ip, pf);
1242 if let Err(e) = BridgeNetwork::run_cmd_owned("iptables", &args) {
1243 warn!(
1244 "Rollback: failed to remove iptables {} rule for {}: {}",
1245 chain, container_ip, e
1246 );
1247 }
1248 }
1249 }
1250
1251 if self.nat_added {
1252 if let Err(e) = BridgeNetwork::run_cmd(
1253 "iptables",
1254 &[
1255 "-t",
1256 "nat",
1257 "-D",
1258 "POSTROUTING",
1259 "-s",
1260 &self.subnet,
1261 "-j",
1262 "MASQUERADE",
1263 ],
1264 ) {
1265 warn!("Rollback: failed to remove NAT rule: {}", e);
1266 }
1267 }
1268
1269 if self.veth_created {
1270 if let Err(e) = netlink::del_link(&self.veth_host) {
1271 warn!("Rollback: failed to delete veth {}: {}", self.veth_host, e);
1272 }
1273 }
1274
1275 if self.ip_forward_ref_acquired {
1276 if let Err(e) = BridgeNetwork::release_ip_forward_ref() {
1277 warn!("Rollback: failed to release ip_forward refcount: {}", e);
1278 }
1279 }
1280
1281 if let Some((alloc_dir, container_id)) = &self.reserved_ip {
1282 BridgeNetwork::release_allocated_ip_in_dir(alloc_dir, container_id);
1283 }
1284 }
1285}
1286
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins the arithmetic behind the byte-to-host-offset mapping: bytes
    /// `0..=252` shifted by two land in `2..=254`.
    ///
    /// NOTE(review): presumably bytes `253..=255` are the values the
    /// allocator rejects; this test never calls the allocator, so confirm
    /// against it.
    #[test]
    fn test_ip_allocation_rejection_sampling_range() {
        let valid_offsets = 2u32..=254;
        for offset in (0u8..253).map(|byte| u32::from(byte) + 2) {
            assert!(
                valid_offsets.contains(&offset),
                "offset {} out of range",
                offset
            );
        }
        for byte in 253u8..=u8::MAX {
            assert!(byte >= 253);
        }
    }

    /// An explicitly requested address that another container already
    /// recorded must be refused.
    #[test]
    fn test_reserve_ip_blocks_duplicate_requested_address() {
        let alloc_dir = tempfile::tempdir().unwrap();
        let taken_ip = "10.0.42.2";
        BridgeNetwork::record_allocated_ip_in_dir(alloc_dir.path(), "one", taken_ip).unwrap();

        let outcome = BridgeNetwork::reserve_ip_in_dir(
            alloc_dir.path(),
            "two",
            "10.0.42.0/24",
            Some(taken_ip),
        );
        let message = outcome.unwrap_err().to_string();

        assert!(
            message.contains("already in use"),
            "second reservation of the same IP must fail"
        );
    }

    /// Dropping an armed guard must remove the reservation file it tracks.
    #[test]
    fn test_setup_rollback_releases_reserved_ip() {
        let alloc_dir = tempfile::tempdir().unwrap();
        BridgeNetwork::record_allocated_ip_in_dir(alloc_dir.path(), "rollback", "10.0.42.3")
            .unwrap();

        // `new` yields an armed guard with no completed steps, so the only
        // thing left to undo is the reservation.
        let reservation = Some((alloc_dir.path().to_path_buf(), "rollback".to_string()));
        drop(SetupRollback::new(
            "veth-test".to_string(),
            "10.0.42.0/24".to_string(),
            reservation,
        ));

        assert!(
            !alloc_dir.path().join("rollback.ip").exists(),
            "rollback must release reserved IP files on setup failure"
        );
    }

    /// The staging file must hold the requested contents and live outside
    /// the container root.
    #[test]
    fn test_resolv_conf_staging_file_is_outside_container_root() {
        let sandbox = tempfile::tempdir().unwrap();
        let container_root = sandbox.path().join("root");
        std::fs::create_dir_all(container_root.join("tmp")).unwrap();

        let staged = BridgeNetwork::create_resolv_conf_staging_file(
            &container_root,
            b"nameserver 203.0.113.53\n",
        )
        .unwrap();

        let written = std::fs::read_to_string(staged.path()).unwrap();
        assert_eq!(written, "nameserver 203.0.113.53\n");

        let inside_root = staged.path().starts_with(&container_root);
        assert!(
            !inside_root,
            "staging file must not be created under the container root"
        );
    }

    /// A symlink planted at `tmp/.resolv.conf.nucleus` inside the root must
    /// not cause the file it points to to be overwritten.
    #[test]
    fn test_bind_mount_resolv_conf_does_not_overwrite_root_tmp_symlink_on_failure() {
        let sandbox = tempfile::tempdir().unwrap();
        let container_root = sandbox.path().join("root");
        std::fs::create_dir_all(container_root.join("tmp")).unwrap();

        // File outside the root that the planted symlink points at.
        let host_file = sandbox.path().join("host_victim_file");
        std::fs::write(&host_file, "ORIGINAL_HOST_CONTENT\n").unwrap();
        std::os::unix::fs::symlink(&host_file, container_root.join("tmp/.resolv.conf.nucleus"))
            .unwrap();

        let nameservers = vec![String::from("203.0.113.53")];
        let outcome = BridgeNetwork::bind_mount_resolv_conf(&container_root, &nameservers);

        assert!(
            outcome.is_err(),
            "test root intentionally lacks /etc so bind mount setup must fail"
        );
        assert_eq!(
            std::fs::read_to_string(&host_file).unwrap(),
            "ORIGINAL_HOST_CONTENT\n",
            "resolv.conf setup must not write through attacker-controlled /tmp symlinks"
        );
    }

    /// Two acquisitions keep the value at `1` until both are released; the
    /// last release restores `0` and removes the state file.
    #[test]
    fn test_ip_forward_refcount_restores_original_only_after_last_release() {
        let state_dir = tempfile::tempdir().unwrap();
        let fake_sysctl = state_dir.path().join("ip_forward");
        std::fs::write(&fake_sysctl, "0").unwrap();
        let current_value = || std::fs::read_to_string(&fake_sysctl).unwrap();

        for _ in 0..2 {
            BridgeNetwork::acquire_ip_forward_ref_in_dir(state_dir.path(), &fake_sysctl).unwrap();
        }
        assert_eq!(current_value(), "1");

        // First release: one reference is still outstanding.
        BridgeNetwork::release_ip_forward_ref_in_dir(state_dir.path(), &fake_sysctl).unwrap();
        assert_eq!(current_value(), "1");

        // Final release: the original value comes back.
        BridgeNetwork::release_ip_forward_ref_in_dir(state_dir.path(), &fake_sysctl).unwrap();
        assert_eq!(current_value(), "0");
        assert!(
            !state_dir.path().join(IP_FORWARD_STATE_FILE).exists(),
            "state file must be removed when the last bridge releases ip_forward"
        );
    }

    /// Each chain's rule names its chain, and the `OUTPUT` rule carries
    /// `--dst-type LOCAL`.
    #[test]
    fn test_port_forward_rules_include_output_chain_for_local_host_clients() {
        let forward = PortForward {
            host_ip: None,
            host_port: 8080,
            container_port: 80,
            protocol: crate::network::config::Protocol::Tcp,
        };

        let prerouting_args =
            BridgeNetwork::port_forward_rule_args("-A", "PREROUTING", "10.0.42.2", &forward);
        let output_args =
            BridgeNetwork::port_forward_rule_args("-A", "OUTPUT", "10.0.42.2", &forward);

        assert!(prerouting_args.iter().any(|token| token == "PREROUTING"));
        assert!(output_args.iter().any(|token| token == "OUTPUT"));

        let targets_local = output_args.windows(2).any(|window| {
            matches!(window, [flag, value] if flag == "--dst-type" && value == "LOCAL")
        });
        assert!(
            targets_local,
            "OUTPUT rule must target local-destination traffic"
        );
    }

    /// A configured host IP must appear as `-d <ip>` in the rules for both chains.
    #[test]
    fn test_port_forward_rules_include_host_ip_when_configured() {
        let forward = PortForward {
            host_ip: Some(std::net::Ipv4Addr::new(127, 0, 0, 1)),
            host_port: 4173,
            container_port: 4173,
            protocol: crate::network::config::Protocol::Tcp,
        };

        let prerouting_args =
            BridgeNetwork::port_forward_rule_args("-A", "PREROUTING", "10.0.42.2", &forward);
        let output_args =
            BridgeNetwork::port_forward_rule_args("-A", "OUTPUT", "10.0.42.2", &forward);

        for rule in [&prerouting_args, &output_args] {
            let restricted_to_host_ip = rule
                .windows(2)
                .any(|window| matches!(window, [flag, value] if flag == "-d" && value == "127.0.0.1"));
            assert!(
                restricted_to_host_ip,
                "port forward must restrict DNAT rules to the configured host IP"
            );
        }
    }
}