1use super::{egress, netlink, netns};
2use crate::error::{NucleusError, Result, StateTransition};
3use crate::network::config::{BridgeConfig, EgressPolicy, PortForward};
4use crate::network::NetworkState;
5use serde::{Deserialize, Serialize};
6use std::fs::OpenOptions;
7use std::net::Ipv4Addr;
8use std::os::fd::FromRawFd;
9use std::os::unix::fs::FileTypeExt;
10use std::os::unix::fs::OpenOptionsExt;
11use std::os::unix::io::AsRawFd;
12use std::process::Command;
13use tracing::{debug, info, warn};
14
/// Handle for one container's attachment to a host bridge.
///
/// Created by the `setup*` constructors. Torn down explicitly with `cleanup`,
/// or on a best-effort basis when the value is dropped without having reached
/// the `Cleaned` state.
pub struct BridgeNetwork {
    /// Bridge configuration (validated before setup) that the resources were created from.
    config: BridgeConfig,
    /// IPv4 address reserved for the container, as dotted-quad text.
    container_ip: String,
    /// Name of the host-side veth interface (truncated to at most 15 bytes).
    veth_host: String,
    /// Identifier used to name the `<id>.ip` reservation file.
    container_id: String,
    /// Whether this instance still holds a reference on the shared ip_forward refcount.
    ip_forward_ref_acquired: bool,
    /// Lifecycle state; once `Cleaned`, the drop-time cleanup is a no-op.
    state: NetworkState,
}
24
/// procfs file that is read and written to toggle IPv4 forwarding.
const IP_FORWARD_SYSCTL_PATH: &str = "/proc/sys/net/ipv4/ip_forward";
/// Name of the `flock` lock file (inside the IP allocation directory) guarding the refcount state.
const IP_FORWARD_LOCK_FILE: &str = ".ip_forward.lock";
/// Name of the JSON file (inside the IP allocation directory) holding the refcount state.
const IP_FORWARD_STATE_FILE: &str = ".ip_forward.state";
28
/// Persisted (JSON) bookkeeping for the shared ip_forward sysctl.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct IpForwardRefState {
    /// Number of outstanding references taken via `acquire_ip_forward_ref*`.
    refcount: u64,
    /// Sysctl value observed when the state was first created; written back
    /// when the refcount drops to zero.
    original_value: String,
}
34
35impl BridgeNetwork {
36 fn open_dev_urandom() -> Result<std::fs::File> {
37 let file = OpenOptions::new()
38 .read(true)
39 .custom_flags(libc::O_NOFOLLOW | libc::O_CLOEXEC)
40 .open("/dev/urandom")
41 .map_err(|e| {
42 NucleusError::NetworkError(format!("Failed to open /dev/urandom: {}", e))
43 })?;
44
45 let metadata = file.metadata().map_err(|e| {
46 NucleusError::NetworkError(format!("Failed to stat /dev/urandom: {}", e))
47 })?;
48 if !metadata.file_type().is_char_device() {
49 return Err(NucleusError::NetworkError(
50 "/dev/urandom is not a character device".to_string(),
51 ));
52 }
53
54 Ok(file)
55 }
56
57 pub fn setup(pid: u32, config: &BridgeConfig) -> Result<Self> {
64 Self::setup_for(pid, config, &format!("{:x}", pid))
65 }
66
    /// Sets up bridge networking for the process `pid`, recording the IP
    /// reservation under the caller-supplied `container_id`.
    pub fn setup_with_id(pid: u32, config: &BridgeConfig, container_id: &str) -> Result<Self> {
        Self::setup_for(pid, config, container_id)
    }
71
    /// Shared implementation behind `setup` and `setup_with_id`.
    ///
    /// Steps, in order: validate the config, reserve a container IP, ensure the
    /// bridge exists, create a veth pair and move one end into the network
    /// namespace of `pid`, configure address/links/default route inside that
    /// namespace, add the MASQUERADE rule, take an ip_forward reference, and
    /// install the configured port forwards.
    ///
    /// A `SetupRollback` guard is armed once the IP is reserved; any early
    /// return drops it, which undoes the steps it has been told about. It is
    /// disarmed only on success.
    ///
    /// # Errors
    /// Propagates the first failure from any step, and additionally fails if
    /// the start time of `pid` cannot be read or changes during setup.
    fn setup_for(pid: u32, config: &BridgeConfig, container_id: &str) -> Result<Self> {
        config.validate()?;

        let mut net_state = NetworkState::Unconfigured;
        net_state = net_state.transition(NetworkState::Configuring)?;

        let alloc_dir = Self::ip_alloc_dir();
        let container_ip = Self::reserve_ip_in_dir(
            &alloc_dir,
            container_id,
            &config.subnet,
            config.container_ip.as_deref(),
        )?;
        let prefix = Self::subnet_prefix(&config.subnet);

        // Interface names are derived from the PID in hex and cut to 15 bytes.
        let veth_host_full = format!("veth-{:x}", pid);
        let veth_cont_full = format!("vethc-{:x}", pid);
        let veth_host = veth_host_full[..veth_host_full.len().min(15)].to_string();
        let veth_container = veth_cont_full[..veth_cont_full.len().min(15)].to_string();
        // From here on, every early return releases the reserved IP (and any
        // later-flagged resources) through the guard's Drop.
        let mut rollback = SetupRollback::new(
            veth_host.clone(),
            config.subnet.clone(),
            Some((alloc_dir.clone(), container_id.to_string())),
        );

        Self::ensure_bridge_for(&config.bridge_name, &config.subnet)?;

        netlink::create_veth(&veth_host, &veth_container)?;
        rollback.veth_created = true;

        netlink::set_link_master(&veth_host, &config.bridge_name)?;
        netlink::set_link_up(&veth_host)?;

        netlink::set_link_netns(&veth_container, pid)?;

        // Snapshot the process start time; 0 is the "could not read" sentinel
        // returned by `read_pid_start_ticks`.
        let start_ticks = Self::read_pid_start_ticks(pid);
        if start_ticks == 0 {
            drop(rollback);
            return Err(NucleusError::NetworkError(format!(
                "Cannot read start_ticks for PID {} – process may have exited",
                pid
            )));
        }

        let container_addr: Ipv4Addr = container_ip.parse().map_err(|e| {
            NucleusError::NetworkError(format!("invalid container IP '{}': {}", container_ip, e))
        })?;
        {
            // The closure is `move`, so it gets its own copy of the name.
            let vc = veth_container.clone();
            netns::in_netns(pid, move || {
                netlink::add_addr(&vc, container_addr, prefix)?;
                netlink::set_link_up(&vc)?;
                netlink::set_link_up("lo")?;
                Ok(())
            })?;
        }

        // Re-read the start time: a different value means `pid` no longer
        // refers to the process that was configured above.
        let current_ticks = Self::read_pid_start_ticks(pid);
        if current_ticks != start_ticks {
            drop(rollback);
            return Err(NucleusError::NetworkError(format!(
                "PID {} was recycled during network setup (start_ticks changed: {} -> {})",
                pid, start_ticks, current_ticks
            )));
        }

        let gateway = Self::gateway_from_subnet(&config.subnet);
        let gateway_addr: Ipv4Addr = gateway.parse().map_err(|e| {
            NucleusError::NetworkError(format!("invalid gateway IP '{}': {}", gateway, e))
        })?;
        netns::in_netns(pid, move || netlink::add_default_route(gateway_addr))?;

        // Source-NAT traffic leaving the container subnet.
        Self::run_cmd(
            "iptables",
            &[
                "-t",
                "nat",
                "-A",
                "POSTROUTING",
                "-s",
                &config.subnet,
                "-j",
                "MASQUERADE",
            ],
        )?;
        rollback.nat_added = true;

        Self::acquire_ip_forward_ref()?;
        rollback.ip_forward_ref_acquired = true;

        for pf in &config.port_forwards {
            Self::setup_port_forward_for(&container_ip, pf)?;
            // Only forwards that were installed successfully are recorded for rollback.
            rollback
                .port_forwards
                .push((container_ip.clone(), pf.clone()));
        }

        net_state = net_state.transition(NetworkState::Active)?;

        info!(
            "Bridge network configured: {} -> {} (IP: {})",
            veth_host, veth_container, container_ip
        );
        let ip_forward_ref_acquired = rollback.ip_forward_ref_acquired;
        // Success: ownership of the resources passes to the returned value.
        rollback.disarm();

        Ok(Self {
            config: config.clone(),
            container_ip,
            veth_host,
            container_id: container_id.to_string(),
            ip_forward_ref_acquired,
            state: net_state,
        })
    }
202
    /// Applies `policy` for the process `pid` by delegating to
    /// `egress::apply_egress_policy`, passing this network's configured DNS
    /// servers and `false` for the final flag.
    // NOTE(review): the meaning of the trailing `false` argument is defined in
    // the `egress` module and is not visible here; confirm before changing it.
    pub fn apply_egress_policy(&self, pid: u32, policy: &EgressPolicy) -> Result<()> {
        egress::apply_egress_policy(pid, &self.config.dns, policy, false)
    }
210
211 pub fn cleanup(mut self) -> Result<()> {
215 self.state = self.state.transition(NetworkState::Cleaned)?;
216
217 Self::release_allocated_ip(&self.container_id);
219
220 for pf in &self.config.port_forwards {
222 if let Err(e) = self.cleanup_port_forward(pf) {
223 warn!("Failed to cleanup port forward: {}", e);
224 }
225 }
226
227 let _ = Self::run_cmd(
229 "iptables",
230 &[
231 "-t",
232 "nat",
233 "-D",
234 "POSTROUTING",
235 "-s",
236 &self.config.subnet,
237 "-j",
238 "MASQUERADE",
239 ],
240 );
241
242 let _ = netlink::del_link(&self.veth_host);
244
245 if self.ip_forward_ref_acquired {
246 if let Err(e) = Self::release_ip_forward_ref() {
247 warn!("Failed to release ip_forward refcount: {}", e);
248 } else {
249 self.ip_forward_ref_acquired = false;
250 }
251 }
252
253 info!("Bridge network cleaned up");
254 Ok(())
255 }
256
    /// Drop-time teardown: performs the same steps as `cleanup` but ignores
    /// every failure and never returns an error.
    ///
    /// Does nothing when the state is already `Cleaned`.
    fn cleanup_best_effort(&mut self) {
        if self.state == NetworkState::Cleaned {
            return;
        }

        Self::release_allocated_ip(&self.container_id);

        for pf in &self.config.port_forwards {
            let _ = self.cleanup_port_forward(pf);
        }

        let _ = Self::run_cmd(
            "iptables",
            &[
                "-t",
                "nat",
                "-D",
                "POSTROUTING",
                "-s",
                &self.config.subnet,
                "-j",
                "MASQUERADE",
            ],
        );

        let _ = netlink::del_link(&self.veth_host);

        if self.ip_forward_ref_acquired {
            let _ = Self::release_ip_forward_ref();
            // Cleared even if the release failed, unlike `cleanup`.
            self.ip_forward_ref_acquired = false;
        }

        // Set directly rather than via `transition`, so this cannot fail.
        self.state = NetworkState::Cleaned;
        debug!("Bridge network cleaned up (best-effort via drop)");
    }
295
296 pub fn cleanup_orphaned_rules(subnet: &str) {
302 let output = match Command::new("iptables")
304 .args(["-t", "nat", "-L", "POSTROUTING", "-n"])
305 .output()
306 {
307 Ok(o) => o,
308 Err(e) => {
309 debug!("Cannot check iptables for orphaned rules: {}", e);
310 return;
311 }
312 };
313
314 let stdout = String::from_utf8_lossy(&output.stdout);
315 let mut orphaned_count = 0u32;
316 for line in stdout.lines() {
317 if line.contains("MASQUERADE") && line.contains(subnet) {
318 let _ = Self::run_cmd(
320 "iptables",
321 &[
322 "-t",
323 "nat",
324 "-D",
325 "POSTROUTING",
326 "-s",
327 subnet,
328 "-j",
329 "MASQUERADE",
330 ],
331 );
332 orphaned_count += 1;
333 }
334 }
335
336 if orphaned_count > 0 {
337 info!(
338 "Cleaned up {} orphaned iptables MASQUERADE rule(s) for subnet {}",
339 orphaned_count, subnet
340 );
341 }
342 }
343
344 fn ensure_bridge_for(bridge_name: &str, subnet: &str) -> Result<()> {
345 if netlink::link_exists(bridge_name) {
346 return Ok(());
347 }
348
349 netlink::create_bridge(bridge_name)?;
350
351 let gateway = Self::gateway_from_subnet(subnet);
352 let gateway_addr: Ipv4Addr = gateway.parse().map_err(|e| {
353 NucleusError::NetworkError(format!("invalid bridge gateway '{}': {}", gateway, e))
354 })?;
355 netlink::add_addr(bridge_name, gateway_addr, Self::subnet_prefix(subnet))?;
356 netlink::set_link_up(bridge_name)?;
357
358 info!("Created bridge {}", bridge_name);
359 Ok(())
360 }
361
362 fn setup_port_forward_for(container_ip: &str, pf: &PortForward) -> Result<()> {
363 for chain in ["PREROUTING", "OUTPUT"] {
364 let args = Self::port_forward_rule_args("-A", chain, container_ip, pf);
365 Self::run_cmd_owned("iptables", &args)?;
366 }
367
368 let host_ip = pf
369 .host_ip
370 .map(|ip| ip.to_string())
371 .unwrap_or_else(|| "0.0.0.0".to_string());
372 info!(
373 "Port forward: {}:{} -> {}:{}/{}",
374 host_ip, pf.host_port, container_ip, pf.container_port, pf.protocol
375 );
376 Ok(())
377 }
378
379 fn cleanup_port_forward(&self, pf: &PortForward) -> Result<()> {
380 for chain in ["OUTPUT", "PREROUTING"] {
381 let args = Self::port_forward_rule_args("-D", chain, &self.container_ip, pf);
382 Self::run_cmd_owned("iptables", &args)?;
383 }
384 Ok(())
385 }
386
387 fn allocate_ip_with_reserved(
393 subnet: &str,
394 reserved: &std::collections::HashSet<String>,
395 ) -> Result<String> {
396 let base = subnet.split('/').next().unwrap_or("10.0.42.0");
397 let parts: Vec<&str> = base.split('.').collect();
398 if parts.len() != 4 {
399 return Ok("10.0.42.2".to_string());
400 }
401
402 let mut rand_buf = [0u8; 128];
409 let mut urandom = Self::open_dev_urandom()?;
410 std::io::Read::read_exact(&mut urandom, &mut rand_buf).map_err(|e| {
411 NucleusError::NetworkError(format!("Failed to read /dev/urandom: {}", e))
412 })?;
413 for &byte in &rand_buf {
414 if byte >= 253 {
416 continue;
417 }
418 let offset = byte as u32 + 2;
419 let candidate = format!("{}.{}.{}.{}", parts[0], parts[1], parts[2], offset);
420 if reserved.contains(&candidate) {
421 continue;
422 }
423 if !Self::is_ip_in_use(&candidate)? {
424 return Ok(candidate);
426 }
427 }
428
429 Err(NucleusError::NetworkError(format!(
430 "Failed to allocate free IP in subnet {}",
431 subnet
432 )))
433 }
434
    /// Reserves an IP for `container_id` and records it as `<container_id>.ip`
    /// inside `alloc_dir`.
    ///
    /// The whole check-then-record sequence runs under an exclusive `flock` on
    /// `alloc_dir/.lock`. With `requested_ip` set, that exact address is used
    /// and must be neither recorded in `alloc_dir` nor reported in use;
    /// otherwise an address is allocated from `subnet`.
    ///
    /// # Errors
    /// Returns `NucleusError::NetworkError` if the directory or lock cannot be
    /// prepared, the requested address is taken, allocation fails, or the
    /// reservation file cannot be written.
    fn reserve_ip_in_dir(
        alloc_dir: &std::path::Path,
        container_id: &str,
        subnet: &str,
        requested_ip: Option<&str>,
    ) -> Result<String> {
        Self::ensure_alloc_dir(alloc_dir)?;
        let lock_path = alloc_dir.join(".lock");
        let lock_file = std::fs::OpenOptions::new()
            .create(true)
            .write(true)
            .truncate(false)
            .open(&lock_path)
            .map_err(|e| {
                NucleusError::NetworkError(format!("Failed to open IP alloc lock: {}", e))
            })?;
        // SAFETY: `lock_file` is open for the duration of the call, so the
        // descriptor passed to `flock` is valid.
        let lock_ret = unsafe { libc::flock(lock_file.as_raw_fd(), libc::LOCK_EX) };
        if lock_ret != 0 {
            return Err(NucleusError::NetworkError(format!(
                "Failed to acquire IP alloc lock: {}",
                std::io::Error::last_os_error()
            )));
        }

        // `lock_file` stays alive until the function returns, covering both
        // the lookup and the recording below.
        let reserved = Self::collect_reserved_ips_in_dir(alloc_dir);
        let ip = match requested_ip {
            Some(ip) => {
                if reserved.contains(ip) || Self::is_ip_in_use(ip)? {
                    return Err(NucleusError::NetworkError(format!(
                        "Requested container IP {} is already in use",
                        ip
                    )));
                }
                ip.to_string()
            }
            None => Self::allocate_ip_with_reserved(subnet, &reserved)?,
        };

        Self::record_allocated_ip_in_dir(alloc_dir, container_id, &ip)?;
        Ok(ip)
    }
478
479 fn collect_reserved_ips_in_dir(
481 alloc_dir: &std::path::Path,
482 ) -> std::collections::HashSet<String> {
483 let mut ips = std::collections::HashSet::new();
484 if let Ok(entries) = std::fs::read_dir(alloc_dir) {
485 for entry in entries.flatten() {
486 if let Some(name) = entry.file_name().to_str() {
487 if name.ends_with(".ip") {
488 if let Ok(ip) = std::fs::read_to_string(entry.path()) {
489 let ip = ip.trim().to_string();
490 if !ip.is_empty() {
491 ips.insert(ip);
492 }
493 }
494 }
495 }
496 }
497 }
498 ips
499 }
500
501 fn record_allocated_ip_in_dir(
503 alloc_dir: &std::path::Path,
504 container_id: &str,
505 ip: &str,
506 ) -> Result<()> {
507 Self::ensure_alloc_dir(alloc_dir)?;
508 let path = alloc_dir.join(format!("{}.ip", container_id));
509 std::fs::write(&path, ip).map_err(|e| {
510 NucleusError::NetworkError(format!("Failed to record IP allocation: {}", e))
511 })?;
512 Ok(())
513 }
514
515 fn release_allocated_ip(container_id: &str) {
517 let alloc_dir = Self::ip_alloc_dir();
518 Self::release_allocated_ip_in_dir(&alloc_dir, container_id);
519 }
520
521 fn release_allocated_ip_in_dir(alloc_dir: &std::path::Path, container_id: &str) {
522 let path = alloc_dir.join(format!("{}.ip", container_id));
523 let _ = std::fs::remove_file(path);
524 }
525
526 fn ensure_alloc_dir(alloc_dir: &std::path::Path) -> Result<()> {
529 if alloc_dir.exists() {
532 if let Ok(meta) = std::fs::symlink_metadata(alloc_dir) {
533 if meta.file_type().is_symlink() {
534 return Err(NucleusError::NetworkError(format!(
535 "IP alloc dir {:?} is a symlink, refusing to use",
536 alloc_dir
537 )));
538 }
539 }
540 }
541 if let Some(parent) = alloc_dir.parent() {
543 if let Ok(meta) = std::fs::symlink_metadata(parent) {
544 if meta.file_type().is_symlink() {
545 return Err(NucleusError::NetworkError(format!(
546 "IP alloc dir parent {:?} is a symlink, refusing to use",
547 parent
548 )));
549 }
550 }
551 }
552
553 std::fs::create_dir_all(alloc_dir).map_err(|e| {
554 NucleusError::NetworkError(format!("Failed to create IP alloc dir: {}", e))
555 })?;
556
557 use std::os::unix::fs::PermissionsExt;
559 let perms = std::fs::Permissions::from_mode(0o700);
560 std::fs::set_permissions(alloc_dir, perms).map_err(|e| {
561 NucleusError::NetworkError(format!(
562 "Failed to set permissions on IP alloc dir {:?}: {}",
563 alloc_dir, e
564 ))
565 })?;
566
567 if let Ok(meta) = std::fs::symlink_metadata(alloc_dir) {
569 if meta.file_type().is_symlink() {
570 return Err(NucleusError::NetworkError(format!(
571 "IP alloc dir {:?} was replaced with a symlink during setup",
572 alloc_dir
573 )));
574 }
575 }
576 Ok(())
577 }
578
579 fn ip_alloc_dir() -> std::path::PathBuf {
580 if nix::unistd::Uid::effective().is_root() {
581 std::path::PathBuf::from("/var/run/nucleus/ip-alloc")
582 } else {
583 dirs::runtime_dir()
584 .map(|d| d.join("nucleus/ip-alloc"))
585 .or_else(|| dirs::data_local_dir().map(|d| d.join("nucleus/ip-alloc")))
586 .unwrap_or_else(|| {
587 dirs::home_dir()
588 .map(|h| h.join(".nucleus/ip-alloc"))
589 .unwrap_or_else(|| std::path::PathBuf::from("/var/run/nucleus/ip-alloc"))
590 })
591 }
592 }
593
    /// Path of the ip_forward lock file inside `alloc_dir`.
    fn ip_forward_lock_path(alloc_dir: &std::path::Path) -> std::path::PathBuf {
        alloc_dir.join(IP_FORWARD_LOCK_FILE)
    }
597
    /// Path of the ip_forward refcount state file inside `alloc_dir`.
    fn ip_forward_state_path(alloc_dir: &std::path::Path) -> std::path::PathBuf {
        alloc_dir.join(IP_FORWARD_STATE_FILE)
    }
601
602 fn read_ip_forward_value(sysctl_path: &std::path::Path) -> Result<String> {
603 std::fs::read_to_string(sysctl_path)
604 .map(|value| value.trim().to_string())
605 .map_err(|e| {
606 NucleusError::NetworkError(format!(
607 "Failed to read {}: {}",
608 sysctl_path.display(),
609 e
610 ))
611 })
612 }
613
614 fn write_ip_forward_value(sysctl_path: &std::path::Path, value: &str) -> Result<()> {
615 std::fs::write(sysctl_path, value).map_err(|e| {
616 NucleusError::NetworkError(format!(
617 "Failed to write {} to {}: {}",
618 value,
619 sysctl_path.display(),
620 e
621 ))
622 })
623 }
624
625 fn load_ip_forward_state(alloc_dir: &std::path::Path) -> Result<Option<IpForwardRefState>> {
626 let state_path = Self::ip_forward_state_path(alloc_dir);
627 match std::fs::read_to_string(&state_path) {
628 Ok(content) => serde_json::from_str(&content).map(Some).map_err(|e| {
629 NucleusError::NetworkError(format!(
630 "Failed to parse ip_forward refcount state {:?}: {}",
631 state_path, e
632 ))
633 }),
634 Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(None),
635 Err(e) => Err(NucleusError::NetworkError(format!(
636 "Failed to read ip_forward refcount state {:?}: {}",
637 state_path, e
638 ))),
639 }
640 }
641
642 fn store_ip_forward_state(
643 alloc_dir: &std::path::Path,
644 state: &IpForwardRefState,
645 ) -> Result<()> {
646 let state_path = Self::ip_forward_state_path(alloc_dir);
647 let encoded = serde_json::to_vec(state).map_err(|e| {
648 NucleusError::NetworkError(format!(
649 "Failed to serialize ip_forward refcount state {:?}: {}",
650 state_path, e
651 ))
652 })?;
653 std::fs::write(&state_path, encoded).map_err(|e| {
654 NucleusError::NetworkError(format!(
655 "Failed to persist ip_forward refcount state {:?}: {}",
656 state_path, e
657 ))
658 })
659 }
660
661 fn remove_ip_forward_state(alloc_dir: &std::path::Path) -> Result<()> {
662 let state_path = Self::ip_forward_state_path(alloc_dir);
663 match std::fs::remove_file(&state_path) {
664 Ok(()) => Ok(()),
665 Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(()),
666 Err(e) => Err(NucleusError::NetworkError(format!(
667 "Failed to remove ip_forward refcount state {:?}: {}",
668 state_path, e
669 ))),
670 }
671 }
672
673 fn acquire_ip_forward_ref() -> Result<()> {
674 let alloc_dir = Self::ip_alloc_dir();
675 Self::acquire_ip_forward_ref_in_dir(
676 &alloc_dir,
677 std::path::Path::new(IP_FORWARD_SYSCTL_PATH),
678 )
679 }
680
    /// Takes one reference on the ip_forward refcount stored under `alloc_dir`.
    ///
    /// Under an exclusive `flock` on the ip_forward lock file: loads the
    /// persisted state (creating it with a zero count and the current sysctl
    /// value when absent), writes `"1"` to `sysctl_path` if the count is zero,
    /// then increments the count and persists it.
    ///
    /// # Errors
    /// Returns `NucleusError::NetworkError` if the directory, lock, state file
    /// or sysctl cannot be accessed, or if the refcount would overflow.
    fn acquire_ip_forward_ref_in_dir(
        alloc_dir: &std::path::Path,
        sysctl_path: &std::path::Path,
    ) -> Result<()> {
        Self::ensure_alloc_dir(alloc_dir)?;
        let lock_path = Self::ip_forward_lock_path(alloc_dir);
        let lock_file = std::fs::OpenOptions::new()
            .create(true)
            .write(true)
            .truncate(false)
            .open(&lock_path)
            .map_err(|e| {
                NucleusError::NetworkError(format!(
                    "Failed to open ip_forward lock {:?}: {}",
                    lock_path, e
                ))
            })?;
        // SAFETY: `lock_file` is open for the duration of the call, so the
        // descriptor passed to `flock` is valid.
        let lock_ret = unsafe { libc::flock(lock_file.as_raw_fd(), libc::LOCK_EX) };
        if lock_ret != 0 {
            return Err(NucleusError::NetworkError(format!(
                "Failed to acquire ip_forward lock: {}",
                std::io::Error::last_os_error()
            )));
        }

        let mut state = match Self::load_ip_forward_state(alloc_dir)? {
            Some(state) => state,
            None => {
                // First user: remember the value to restore on final release.
                let original_value = Self::read_ip_forward_value(sysctl_path)?;
                let state = IpForwardRefState {
                    refcount: 0,
                    original_value,
                };
                Self::store_ip_forward_state(alloc_dir, &state)?;
                state
            }
        };

        // Forwarding is only switched on by the first reference.
        if state.refcount == 0 {
            Self::write_ip_forward_value(sysctl_path, "1")?;
        }
        state.refcount = state.refcount.checked_add(1).ok_or_else(|| {
            NucleusError::NetworkError("ip_forward refcount overflow".to_string())
        })?;
        Self::store_ip_forward_state(alloc_dir, &state)
    }
727
728 fn release_ip_forward_ref() -> Result<()> {
729 let alloc_dir = Self::ip_alloc_dir();
730 Self::release_ip_forward_ref_in_dir(
731 &alloc_dir,
732 std::path::Path::new(IP_FORWARD_SYSCTL_PATH),
733 )
734 }
735
    /// Drops one reference on the ip_forward refcount stored under `alloc_dir`.
    ///
    /// Returns immediately when `alloc_dir` or the state file does not exist.
    /// Otherwise, under an exclusive `flock` on the ip_forward lock file, the
    /// count is decremented; when it reaches zero the recorded original value
    /// is written back to `sysctl_path` and the state file is removed.
    ///
    /// # Errors
    /// Returns `NucleusError::NetworkError` if the lock, state file or sysctl
    /// cannot be accessed.
    fn release_ip_forward_ref_in_dir(
        alloc_dir: &std::path::Path,
        sysctl_path: &std::path::Path,
    ) -> Result<()> {
        if !alloc_dir.exists() {
            return Ok(());
        }
        let lock_path = Self::ip_forward_lock_path(alloc_dir);
        let lock_file = std::fs::OpenOptions::new()
            .create(true)
            .write(true)
            .truncate(false)
            .open(&lock_path)
            .map_err(|e| {
                NucleusError::NetworkError(format!(
                    "Failed to open ip_forward lock {:?}: {}",
                    lock_path, e
                ))
            })?;
        // SAFETY: `lock_file` is open for the duration of the call, so the
        // descriptor passed to `flock` is valid.
        let lock_ret = unsafe { libc::flock(lock_file.as_raw_fd(), libc::LOCK_EX) };
        if lock_ret != 0 {
            return Err(NucleusError::NetworkError(format!(
                "Failed to acquire ip_forward lock: {}",
                std::io::Error::last_os_error()
            )));
        }

        let Some(mut state) = Self::load_ip_forward_state(alloc_dir)? else {
            return Ok(());
        };

        // A persisted zero count holds no references: just discard the file
        // without touching the sysctl.
        if state.refcount == 0 {
            return Self::remove_ip_forward_state(alloc_dir);
        }

        state.refcount -= 1;
        if state.refcount == 0 {
            // Last reference gone: restore what was there before the first acquire.
            Self::write_ip_forward_value(sysctl_path, &state.original_value)?;
            Self::remove_ip_forward_state(alloc_dir)?;
            info!("Restored net.ipv4.ip_forward to {}", state.original_value);
        } else {
            Self::store_ip_forward_state(alloc_dir, &state)?;
        }

        Ok(())
    }
782
783 fn read_pid_start_ticks(pid: u32) -> u64 {
786 let stat_path = format!("/proc/{}/stat", pid);
787 if let Ok(content) = std::fs::read_to_string(&stat_path) {
788 if let Some(after_comm) = content.rfind(')') {
791 return content[after_comm + 2..]
792 .split_whitespace()
793 .nth(19) .and_then(|s| s.parse().ok())
795 .unwrap_or(0);
796 }
797 }
798 0
799 }
800
801 fn gateway_from_subnet(subnet: &str) -> String {
803 let base = subnet.split('/').next().unwrap_or("10.0.42.0");
804 let parts: Vec<&str> = base.split('.').collect();
805 if parts.len() == 4 {
806 format!("{}.{}.{}.1", parts[0], parts[1], parts[2])
807 } else {
808 "10.0.42.1".to_string()
809 }
810 }
811
812 fn subnet_prefix(subnet: &str) -> u8 {
813 subnet
814 .split_once('/')
815 .and_then(|(_, p)| p.parse::<u8>().ok())
816 .filter(|p| *p <= 32)
817 .unwrap_or(24)
818 }
819
820 pub(crate) fn resolve_bin(name: &str) -> Result<String> {
827 let search_dirs: &[&str] = match name {
828 "iptables" => &["/usr/sbin/iptables", "/sbin/iptables", "/usr/bin/iptables"],
829 "slirp4netns" => &[
830 "/usr/bin/slirp4netns",
831 "/bin/slirp4netns",
832 "/run/current-system/sw/bin/slirp4netns",
833 ],
834 _ => &[],
835 };
836
837 for path in search_dirs {
838 let p = std::path::Path::new(path);
839 if p.exists() {
840 Self::validate_network_binary(p, name)?;
841 return Ok(path.to_string());
842 }
843 }
844
845 if let Some(path_var) = std::env::var_os("PATH") {
846 for dir in std::env::split_paths(&path_var) {
847 let candidate = dir.join(name);
848 if candidate.exists() {
849 Self::validate_network_binary(&candidate, name)?;
850 return Ok(candidate.to_string_lossy().into_owned());
851 }
852 }
853 }
854
855 Err(NucleusError::NetworkError(format!(
856 "Required binary '{}' not found or failed validation",
857 name
858 )))
859 }
860
861 fn validate_network_binary(path: &std::path::Path, name: &str) -> Result<()> {
865 use std::os::unix::fs::MetadataExt;
866
867 let resolved = std::fs::canonicalize(path).unwrap_or_else(|_| path.to_path_buf());
868 let meta = std::fs::metadata(&resolved)
869 .map_err(|e| NucleusError::NetworkError(format!("Cannot stat {}: {}", name, e)))?;
870 let mode = meta.mode();
871 if mode & 0o022 != 0 {
872 return Err(NucleusError::NetworkError(format!(
873 "Binary '{}' at {:?} is writable by group/others (mode {:o}), refusing to execute",
874 name, resolved, mode
875 )));
876 }
877 let owner = meta.uid();
878 let euid = nix::unistd::Uid::effective().as_raw();
879 if owner != 0 && owner != euid && !Self::is_trusted_store_network_binary(&resolved, mode) {
880 return Err(NucleusError::NetworkError(format!(
881 "Binary '{}' at {:?} owned by UID {} (expected root or euid {}), refusing to execute",
882 name, resolved, owner, euid
883 )));
884 }
885 Ok(())
886 }
887
888 fn is_trusted_store_network_binary(path: &std::path::Path, mode: u32) -> bool {
889 use std::os::unix::fs::MetadataExt;
890 if !path.starts_with("/nix/store") {
891 return false;
892 }
893 if mode & 0o200 != 0 {
894 return false;
895 }
896 if let Some(parent) = path.parent() {
897 if let Ok(parent_meta) = std::fs::metadata(parent) {
898 return parent_meta.mode() & 0o222 == 0;
899 }
900 }
901 false
902 }
903
904 fn run_cmd(program: &str, args: &[&str]) -> Result<()> {
905 let resolved = Self::resolve_bin(program)?;
906 let output = Command::new(&resolved).args(args).output().map_err(|e| {
907 NucleusError::NetworkError(format!("Failed to run {} {:?}: {}", resolved, args, e))
908 })?;
909
910 if !output.status.success() {
911 let stderr = String::from_utf8_lossy(&output.stderr);
912 return Err(NucleusError::NetworkError(format!(
913 "{} {:?} failed: {}",
914 program, args, stderr
915 )));
916 }
917
918 Ok(())
919 }
920
921 fn run_cmd_owned(program: &str, args: &[String]) -> Result<()> {
922 let refs: Vec<&str> = args.iter().map(String::as_str).collect();
923 Self::run_cmd(program, &refs)
924 }
925
926 fn port_forward_rule_args(
927 operation: &str,
928 chain: &str,
929 container_ip: &str,
930 pf: &PortForward,
931 ) -> Vec<String> {
932 let mut args = vec![
933 "-t".to_string(),
934 "nat".to_string(),
935 operation.to_string(),
936 chain.to_string(),
937 "-p".to_string(),
938 pf.protocol.as_str().to_string(),
939 ];
940
941 if chain == "OUTPUT" {
942 args.extend([
943 "-m".to_string(),
944 "addrtype".to_string(),
945 "--dst-type".to_string(),
946 "LOCAL".to_string(),
947 ]);
948 }
949
950 if let Some(host_ip) = pf.host_ip {
951 args.extend(["-d".to_string(), host_ip.to_string()]);
952 }
953
954 args.extend([
955 "--dport".to_string(),
956 pf.host_port.to_string(),
957 "-j".to_string(),
958 "DNAT".to_string(),
959 "--to-destination".to_string(),
960 format!("{}:{}", container_ip, pf.container_port),
961 ]);
962
963 args
964 }
965
966 fn is_ip_in_use(ip: &str) -> Result<bool> {
967 let addr: Ipv4Addr = ip
968 .parse()
969 .map_err(|e| NucleusError::NetworkError(format!("invalid IP '{}': {}", ip, e)))?;
970 netlink::is_addr_in_use(&addr)
971 }
972
973 pub fn write_resolv_conf(root: &std::path::Path, dns: &[String]) -> Result<()> {
975 let resolv_path = root.join("etc/resolv.conf");
976 let content: String = dns
977 .iter()
978 .map(|server| format!("nameserver {}\n", server))
979 .collect();
980 std::fs::write(&resolv_path, content).map_err(|e| {
981 NucleusError::NetworkError(format!("Failed to write resolv.conf: {}", e))
982 })?;
983 Ok(())
984 }
985
986 pub fn bind_mount_resolv_conf(root: &std::path::Path, dns: &[String]) -> Result<()> {
992 use nix::mount::{mount, MsFlags};
993
994 let content: String = dns
995 .iter()
996 .map(|server| format!("nameserver {}\n", server))
997 .collect();
998
999 let memfd_name = std::ffi::CString::new("nucleus-resolv").map_err(|e| {
1001 NucleusError::NetworkError(format!("Failed to create memfd name: {}", e))
1002 })?;
1003 let raw_fd = unsafe { libc::memfd_create(memfd_name.as_ptr(), 0) };
1006 if raw_fd < 0 {
1007 return Self::bind_mount_resolv_conf_staging(root, dns);
1009 }
1010 let memfd = unsafe { std::os::fd::OwnedFd::from_raw_fd(raw_fd) };
1014
1015 use std::io::Write as _;
1017 let mut memfd_file = std::fs::File::from(memfd);
1018 if memfd_file.write_all(content.as_bytes()).is_err() {
1019 return Self::bind_mount_resolv_conf_staging(root, dns);
1021 }
1022 use std::os::fd::IntoRawFd;
1024 let memfd = {
1025 let raw = memfd_file.into_raw_fd();
1026 unsafe { std::os::fd::OwnedFd::from_raw_fd(raw) }
1028 };
1029
1030 let target = root.join("etc/resolv.conf");
1032 if !target.exists() {
1033 let _ = std::fs::write(&target, "");
1034 }
1035
1036 let memfd_path = format!("/proc/self/fd/{}", memfd.as_raw_fd());
1038 if let Err(e) = mount(
1039 Some(memfd_path.as_str()),
1040 &target,
1041 None::<&str>,
1042 MsFlags::MS_BIND,
1043 None::<&str>,
1044 ) {
1045 return Err(NucleusError::NetworkError(format!(
1046 "Failed to bind mount memfd-backed resolv.conf: {}",
1047 e
1048 )));
1049 }
1050 Self::harden_resolv_conf_bind(&target)?;
1051
1052 info!("Bind-mounted resolv.conf for bridge networking (rootfs mode, memfd)");
1056 Ok(())
1057 }
1058
1059 fn bind_mount_resolv_conf_staging(root: &std::path::Path, dns: &[String]) -> Result<()> {
1061 use nix::mount::{mount, MsFlags};
1062
1063 let content: String = dns
1064 .iter()
1065 .map(|server| format!("nameserver {}\n", server))
1066 .collect();
1067
1068 let staging = Self::create_resolv_conf_staging_file(root, content.as_bytes())?;
1069
1070 let target = root.join("etc/resolv.conf");
1072 if !target.exists() {
1073 let _ = std::fs::write(&target, "");
1074 }
1075
1076 mount(
1078 Some(staging.path()),
1079 &target,
1080 None::<&str>,
1081 MsFlags::MS_BIND,
1082 None::<&str>,
1083 )
1084 .map_err(|e| {
1085 NucleusError::NetworkError(format!("Failed to bind mount resolv.conf: {}", e))
1086 })?;
1087 Self::harden_resolv_conf_bind(&target)?;
1088
1089 info!("Bind-mounted resolv.conf for bridge networking (rootfs mode, staging)");
1093 Ok(())
1094 }
1095
1096 fn create_resolv_conf_staging_file(
1097 root: &std::path::Path,
1098 content: &[u8],
1099 ) -> Result<tempfile::NamedTempFile> {
1100 use std::io::Write as _;
1101
1102 let staging_dir = root.parent().ok_or_else(|| {
1103 NucleusError::NetworkError(format!(
1104 "Container root {:?} has no parent for resolv.conf staging",
1105 root
1106 ))
1107 })?;
1108
1109 let mut staging = tempfile::Builder::new()
1110 .prefix(".resolv.conf.nucleus.")
1111 .tempfile_in(staging_dir)
1112 .map_err(|e| {
1113 NucleusError::NetworkError(format!(
1114 "Failed to create temporary resolv.conf staging file under {:?}: {}",
1115 staging_dir, e
1116 ))
1117 })?;
1118
1119 staging.as_file_mut().write_all(content).map_err(|e| {
1120 NucleusError::NetworkError(format!(
1121 "Failed to write temporary resolv.conf staging file {:?}: {}",
1122 staging.path(),
1123 e
1124 ))
1125 })?;
1126
1127 Ok(staging)
1128 }
1129
1130 fn harden_resolv_conf_bind(target: &std::path::Path) -> Result<()> {
1131 use nix::mount::{mount, MsFlags};
1132
1133 mount(
1134 None::<&str>,
1135 target,
1136 None::<&str>,
1137 MsFlags::MS_REMOUNT
1138 | MsFlags::MS_BIND
1139 | MsFlags::MS_RDONLY
1140 | MsFlags::MS_NOSUID
1141 | MsFlags::MS_NODEV
1142 | MsFlags::MS_NOEXEC,
1143 None::<&str>,
1144 )
1145 .map_err(|e| {
1146 NucleusError::NetworkError(format!(
1147 "Failed to remount resolv.conf with hardened flags at {:?}: {}",
1148 target, e
1149 ))
1150 })
1151 }
1152}
1153
/// Dropping a `BridgeNetwork` runs the best-effort teardown, which is a no-op
/// if the network has already reached the `Cleaned` state.
impl Drop for BridgeNetwork {
    fn drop(&mut self) {
        self.cleanup_best_effort();
    }
}
1159
/// Drop guard used during `BridgeNetwork` setup: while armed, dropping it
/// undoes every step that has been flagged as completed.
struct SetupRollback {
    /// Host-side veth name to delete when `veth_created` is set.
    veth_host: String,
    /// Subnet whose MASQUERADE rule is removed when `nat_added` is set.
    subnet: String,
    /// Set once the veth pair has been created.
    veth_created: bool,
    /// Set once the MASQUERADE rule has been appended.
    nat_added: bool,
    /// `(container_ip, forward)` pairs whose DNAT rules were installed.
    port_forwards: Vec<(String, PortForward)>,
    /// Set once an ip_forward reference has been taken.
    ip_forward_ref_acquired: bool,
    /// `(alloc_dir, container_id)` of the IP reservation to release, if any.
    reserved_ip: Option<(std::path::PathBuf, String)>,
    /// While `true`, `Drop` performs the rollback.
    armed: bool,
}
1170
impl SetupRollback {
    /// Creates an armed guard with no completed steps recorded besides the
    /// optional IP reservation.
    fn new(
        veth_host: String,
        subnet: String,
        reserved_ip: Option<(std::path::PathBuf, String)>,
    ) -> Self {
        Self {
            veth_host,
            subnet,
            veth_created: false,
            nat_added: false,
            port_forwards: Vec::new(),
            ip_forward_ref_acquired: false,
            reserved_ip,
            armed: true,
        }
    }

    /// Turns the guard into a no-op; called once setup has succeeded.
    fn disarm(&mut self) {
        self.armed = false;
    }
}
1193
1194impl Drop for SetupRollback {
1195 fn drop(&mut self) {
1196 if !self.armed {
1197 return;
1198 }
1199
1200 for (container_ip, pf) in self.port_forwards.iter().rev() {
1201 for chain in ["OUTPUT", "PREROUTING"] {
1202 let args = BridgeNetwork::port_forward_rule_args("-D", chain, container_ip, pf);
1203 if let Err(e) = BridgeNetwork::run_cmd_owned("iptables", &args) {
1204 warn!(
1205 "Rollback: failed to remove iptables {} rule for {}: {}",
1206 chain, container_ip, e
1207 );
1208 }
1209 }
1210 }
1211
1212 if self.nat_added {
1213 if let Err(e) = BridgeNetwork::run_cmd(
1214 "iptables",
1215 &[
1216 "-t",
1217 "nat",
1218 "-D",
1219 "POSTROUTING",
1220 "-s",
1221 &self.subnet,
1222 "-j",
1223 "MASQUERADE",
1224 ],
1225 ) {
1226 warn!("Rollback: failed to remove NAT rule: {}", e);
1227 }
1228 }
1229
1230 if self.veth_created {
1231 if let Err(e) = netlink::del_link(&self.veth_host) {
1232 warn!("Rollback: failed to delete veth {}: {}", self.veth_host, e);
1233 }
1234 }
1235
1236 if self.ip_forward_ref_acquired {
1237 if let Err(e) = BridgeNetwork::release_ip_forward_ref() {
1238 warn!("Rollback: failed to release ip_forward refcount: {}", e);
1239 }
1240 }
1241
1242 if let Some((alloc_dir, container_id)) = &self.reserved_ip {
1243 BridgeNetwork::release_allocated_ip_in_dir(alloc_dir, container_id);
1244 }
1245 }
1246}
1247
1248#[cfg(test)]
1249mod tests {
1250 use super::*;
1251
    /// Checks the arithmetic behind rejection sampling: every accepted byte
    /// (0 through 252) maps to a host offset within 2..=254, and the bytes
    /// 253 through 255 fall in the rejected range.
    #[test]
    fn test_ip_allocation_rejection_sampling_range() {
        // Accepted bytes: offset = byte + 2 must stay within the host range.
        for byte in 0u8..253 {
            let offset = byte as u32 + 2;
            assert!(
                (2..=254).contains(&offset),
                "offset {} out of range",
                offset
            );
        }
        // Rejected bytes: these satisfy the `byte >= 253` skip condition.
        for byte in [253u8, 254, 255] {
            assert!(byte >= 253);
        }
    }
1269
1270 #[test]
1271 fn test_reserve_ip_blocks_duplicate_requested_address() {
1272 let temp = tempfile::tempdir().unwrap();
1273 BridgeNetwork::record_allocated_ip_in_dir(temp.path(), "one", "10.0.42.2").unwrap();
1274
1275 let err =
1276 BridgeNetwork::reserve_ip_in_dir(temp.path(), "two", "10.0.42.0/24", Some("10.0.42.2"))
1277 .unwrap_err();
1278 assert!(
1279 err.to_string().contains("already in use"),
1280 "second reservation of the same IP must fail"
1281 );
1282 }
1283
1284 #[test]
1285 fn test_setup_rollback_releases_reserved_ip() {
1286 let temp = tempfile::tempdir().unwrap();
1287 BridgeNetwork::record_allocated_ip_in_dir(temp.path(), "rollback", "10.0.42.3").unwrap();
1288
1289 let rollback = SetupRollback {
1290 veth_host: "veth-test".to_string(),
1291 subnet: "10.0.42.0/24".to_string(),
1292 veth_created: false,
1293 nat_added: false,
1294 port_forwards: Vec::new(),
1295 ip_forward_ref_acquired: false,
1296 reserved_ip: Some((temp.path().to_path_buf(), "rollback".to_string())),
1297 armed: true,
1298 };
1299
1300 drop(rollback);
1301
1302 assert!(
1303 !temp.path().join("rollback.ip").exists(),
1304 "rollback must release reserved IP files on setup failure"
1305 );
1306 }
1307
1308 #[test]
1309 fn test_resolv_conf_staging_file_is_outside_container_root() {
1310 let temp = tempfile::tempdir().unwrap();
1311 let root = temp.path().join("root");
1312 std::fs::create_dir_all(root.join("tmp")).unwrap();
1313
1314 let staging =
1315 BridgeNetwork::create_resolv_conf_staging_file(&root, b"nameserver 203.0.113.53\n")
1316 .unwrap();
1317
1318 assert_eq!(
1319 std::fs::read_to_string(staging.path()).unwrap(),
1320 "nameserver 203.0.113.53\n"
1321 );
1322 assert!(
1323 !staging.path().starts_with(&root),
1324 "staging file must not be created under the container root"
1325 );
1326 }
1327
1328 #[test]
1329 fn test_bind_mount_resolv_conf_does_not_overwrite_root_tmp_symlink_on_failure() {
1330 use std::os::unix::fs::symlink;
1331
1332 let temp = tempfile::tempdir().unwrap();
1333 let root = temp.path().join("root");
1334 std::fs::create_dir_all(root.join("tmp")).unwrap();
1335
1336 let victim = temp.path().join("host_victim_file");
1337 std::fs::write(&victim, "ORIGINAL_HOST_CONTENT\n").unwrap();
1338 symlink(&victim, root.join("tmp/.resolv.conf.nucleus")).unwrap();
1339
1340 let dns = vec!["203.0.113.53".to_string()];
1341 let result = BridgeNetwork::bind_mount_resolv_conf(&root, &dns);
1342
1343 assert!(
1344 result.is_err(),
1345 "test root intentionally lacks /etc so bind mount setup must fail"
1346 );
1347 assert_eq!(
1348 std::fs::read_to_string(&victim).unwrap(),
1349 "ORIGINAL_HOST_CONTENT\n",
1350 "resolv.conf setup must not write through attacker-controlled /tmp symlinks"
1351 );
1352 }
1353
1354 #[test]
1355 fn test_ip_forward_refcount_restores_original_only_after_last_release() {
1356 let temp = tempfile::tempdir().unwrap();
1357 let sysctl = temp.path().join("ip_forward");
1358 std::fs::write(&sysctl, "0").unwrap();
1359
1360 BridgeNetwork::acquire_ip_forward_ref_in_dir(temp.path(), &sysctl).unwrap();
1361 BridgeNetwork::acquire_ip_forward_ref_in_dir(temp.path(), &sysctl).unwrap();
1362 assert_eq!(std::fs::read_to_string(&sysctl).unwrap(), "1");
1363
1364 BridgeNetwork::release_ip_forward_ref_in_dir(temp.path(), &sysctl).unwrap();
1365 assert_eq!(std::fs::read_to_string(&sysctl).unwrap(), "1");
1366
1367 BridgeNetwork::release_ip_forward_ref_in_dir(temp.path(), &sysctl).unwrap();
1368 assert_eq!(std::fs::read_to_string(&sysctl).unwrap(), "0");
1369 assert!(
1370 !temp.path().join(IP_FORWARD_STATE_FILE).exists(),
1371 "state file must be removed when the last bridge releases ip_forward"
1372 );
1373 }
1374
1375 #[test]
1376 fn test_port_forward_rules_include_output_chain_for_local_host_clients() {
1377 let pf = PortForward {
1378 host_ip: None,
1379 host_port: 8080,
1380 container_port: 80,
1381 protocol: crate::network::config::Protocol::Tcp,
1382 };
1383
1384 let prerouting =
1385 BridgeNetwork::port_forward_rule_args("-A", "PREROUTING", "10.0.42.2", &pf);
1386 let output = BridgeNetwork::port_forward_rule_args("-A", "OUTPUT", "10.0.42.2", &pf);
1387
1388 assert!(prerouting.iter().any(|arg| arg == "PREROUTING"));
1389 assert!(output.iter().any(|arg| arg == "OUTPUT"));
1390 assert!(
1391 output
1392 .windows(2)
1393 .any(|pair| pair[0] == "--dst-type" && pair[1] == "LOCAL"),
1394 "OUTPUT rule must target local-destination traffic"
1395 );
1396 }
1397
1398 #[test]
1399 fn test_port_forward_rules_include_host_ip_when_configured() {
1400 let pf = PortForward {
1401 host_ip: Some(std::net::Ipv4Addr::new(127, 0, 0, 1)),
1402 host_port: 4173,
1403 container_port: 4173,
1404 protocol: crate::network::config::Protocol::Tcp,
1405 };
1406
1407 let prerouting =
1408 BridgeNetwork::port_forward_rule_args("-A", "PREROUTING", "10.0.42.2", &pf);
1409 let output = BridgeNetwork::port_forward_rule_args("-A", "OUTPUT", "10.0.42.2", &pf);
1410
1411 for args in [&prerouting, &output] {
1412 assert!(
1413 args.windows(2)
1414 .any(|pair| pair[0] == "-d" && pair[1] == "127.0.0.1"),
1415 "port forward must restrict DNAT rules to the configured host IP"
1416 );
1417 }
1418 }
1419}