Skip to main content

nucleus/network/
userspace.rs

1use super::{
2    egress, BridgeConfig, BridgeNetwork, EgressPolicy, NatBackend, NetworkState, PortForward,
3};
4use crate::error::{NucleusError, Result, StateTransition};
5use nix::fcntl::{fcntl, FcntlArg, FdFlag};
6use serde_json::json;
7use std::io::{Read, Write};
8use std::os::fd::{AsRawFd, OwnedFd};
9use std::os::unix::net::UnixStream;
10use std::os::unix::process::ExitStatusExt;
11use std::path::{Path, PathBuf};
12use std::process::{Child, Command};
13use std::time::{Duration, Instant};
14use tracing::{debug, info, warn};
15
16const SLIRP_TAP_NAME: &str = "tap0";
17
18/// Native bridge-mode driver for the native runtime.
19pub enum BridgeDriver {
20    Kernel(BridgeNetwork),
21    Userspace(UserspaceNetwork),
22}
23
24impl BridgeDriver {
25    pub fn setup_with_id(
26        pid: u32,
27        config: &BridgeConfig,
28        container_id: &str,
29        host_is_root: bool,
30        rootless: bool,
31    ) -> Result<Self> {
32        match config.selected_nat_backend(host_is_root, rootless) {
33            NatBackend::Kernel => Ok(Self::Kernel(BridgeNetwork::setup_with_id(
34                pid,
35                config,
36                container_id,
37            )?)),
38            NatBackend::Userspace => Ok(Self::Userspace(UserspaceNetwork::setup_with_id(
39                pid,
40                config,
41                container_id,
42                host_is_root,
43                rootless,
44            )?)),
45            NatBackend::Auto => Err(NucleusError::NetworkError(
46                "nat backend selection resolved to auto unexpectedly".to_string(),
47            )),
48        }
49    }
50
51    pub fn apply_egress_policy(
52        &self,
53        pid: u32,
54        policy: &EgressPolicy,
55        rootless: bool,
56    ) -> Result<()> {
57        match self {
58            Self::Kernel(net) => net.apply_egress_policy(pid, policy),
59            Self::Userspace(net) => net.apply_egress_policy(pid, policy, rootless),
60        }
61    }
62
63    pub fn cleanup(self) -> Result<()> {
64        match self {
65            Self::Kernel(net) => net.cleanup(),
66            Self::Userspace(net) => net.cleanup(),
67        }
68    }
69}
70
71/// Userspace NAT manager backed by slirp4netns.
72pub struct UserspaceNetwork {
73    config: BridgeConfig,
74    guest_ip: String,
75    container_id: String,
76    api_socket_path: PathBuf,
77    runtime_dir: PathBuf,
78    exit_signal: Option<OwnedFd>,
79    child: Child,
80    state: NetworkState,
81}
82
83impl UserspaceNetwork {
84    pub(crate) fn default_dns_server(subnet: &str) -> Result<String> {
85        Self::dns_ip_from_subnet(subnet)
86    }
87
88    pub fn setup_with_id(
89        pid: u32,
90        config: &BridgeConfig,
91        container_id: &str,
92        host_is_root: bool,
93        rootless: bool,
94    ) -> Result<Self> {
95        config.validate()?;
96
97        let guest_ip = Self::guest_ip_from_subnet(&config.subnet)?;
98        Self::validate_userspace_config(config, &guest_ip)?;
99
100        let mut state = NetworkState::Unconfigured;
101        state = state.transition(NetworkState::Configuring)?;
102
103        let runtime_dir = Self::runtime_dir(container_id);
104        Self::ensure_runtime_dir(&runtime_dir)?;
105        let api_socket_path = runtime_dir.join("slirp4netns.sock");
106
107        let slirp = BridgeNetwork::resolve_bin("slirp4netns")?;
108        // Keep slirp4netns inside the container user namespace whenever the
109        // container is rootless/root-remapped, even when Nucleus itself runs as
110        // host root. slirp4netns processes untrusted guest network traffic, so
111        // omitting --userns-path would run that helper as host root in the
112        // initial user namespace.
113        let needs_userns = Self::should_join_userns(host_is_root, rootless);
114
115        let slirp_path = Path::new(&slirp);
116        let (child, exit_write) = match Self::spawn_slirp(
117            slirp_path,
118            pid,
119            config,
120            needs_userns,
121            &api_socket_path,
122            true,
123        ) {
124            Ok(result) => result,
125            Err(e) => {
126                warn!(
127                    "slirp4netns sandbox failed ({}), retrying without --enable-sandbox",
128                    e
129                );
130                // The sandbox uses pivot_root(2) which can fail in constrained
131                // environments (e.g. QEMU test VMs, nested containers) where
132                // mount propagation or /tmp restrictions prevent the pivot.
133                // Retry without --enable-sandbox while preserving user namespace
134                // confinement for rootless/root-remapped containers.
135                let _ = std::fs::remove_file(&api_socket_path);
136                Self::spawn_slirp(
137                    slirp_path,
138                    pid,
139                    config,
140                    needs_userns,
141                    &api_socket_path,
142                    false,
143                )
144                .map_err(|retry_err| {
145                    let _ = std::fs::remove_dir_all(&runtime_dir);
146                    retry_err
147                })?
148            }
149        };
150
151        let mut network = Self {
152            config: config.clone(),
153            guest_ip: guest_ip.to_string(),
154            container_id: container_id.to_string(),
155            api_socket_path,
156            runtime_dir,
157            exit_signal: Some(exit_write),
158            child,
159            state,
160        };
161
162        if let Err(e) = network.configure_port_forwards() {
163            network.cleanup_best_effort();
164            return Err(e);
165        }
166
167        network.state = network.state.transition(NetworkState::Active)?;
168
169        info!(
170            "Userspace NAT configured via slirp4netns for container {} (guest IP {})",
171            network.container_id, network.guest_ip
172        );
173
174        Ok(network)
175    }
176
177    pub fn apply_egress_policy(
178        &self,
179        pid: u32,
180        policy: &EgressPolicy,
181        rootless: bool,
182    ) -> Result<()> {
183        egress::apply_egress_policy(pid, &self.effective_dns_servers(), policy, rootless)
184    }
185
186    pub fn cleanup(mut self) -> Result<()> {
187        self.state = self.state.transition(NetworkState::Cleaned)?;
188        self.stop_child()?;
189        self.cleanup_runtime_dir();
190        Ok(())
191    }
192
193    fn effective_dns_servers(&self) -> Vec<String> {
194        if self.config.dns.is_empty() {
195            vec![Self::dns_ip_from_subnet(&self.config.subnet)
196                .unwrap_or_else(|_| "10.0.2.3".to_string())]
197        } else {
198            self.config.dns.clone()
199        }
200    }
201
202    fn configure_port_forwards(&mut self) -> Result<()> {
203        for pf in &self.config.port_forwards {
204            self.add_port_forward(pf)?;
205        }
206        Ok(())
207    }
208
209    fn add_port_forward(&self, pf: &PortForward) -> Result<()> {
210        let mut arguments = serde_json::Map::new();
211        arguments.insert("proto".to_string(), json!(pf.protocol.as_str()));
212        arguments.insert("host_port".to_string(), json!(pf.host_port));
213        arguments.insert("guest_port".to_string(), json!(pf.container_port));
214        if let Some(host_ip) = pf.host_ip {
215            arguments.insert("host_addr".to_string(), json!(host_ip.to_string()));
216        }
217
218        let response = Self::api_request(
219            &self.api_socket_path,
220            &json!({
221                "execute": "add_hostfwd",
222                "arguments": arguments,
223            }),
224        )?;
225
226        if let Some(error) = response.get("error") {
227            return Err(NucleusError::NetworkError(format!(
228                "slirp4netns add_hostfwd failed for {}:{}->{}/{}: {}",
229                pf.host_ip
230                    .map(|ip| ip.to_string())
231                    .unwrap_or_else(|| "0.0.0.0".to_string()),
232                pf.host_port,
233                pf.container_port,
234                pf.protocol,
235                error
236            )));
237        }
238
239        debug!(
240            "Configured slirp4netns port forward {}:{} -> {}:{}/{}",
241            pf.host_ip
242                .map(|ip| ip.to_string())
243                .unwrap_or_else(|| "0.0.0.0".to_string()),
244            pf.host_port,
245            self.guest_ip,
246            pf.container_port,
247            pf.protocol
248        );
249        Ok(())
250    }
251
252    fn api_request(socket_path: &Path, request: &serde_json::Value) -> Result<serde_json::Value> {
253        let mut stream = UnixStream::connect(socket_path).map_err(|e| {
254            NucleusError::NetworkError(format!(
255                "connect slirp4netns API socket {:?}: {}",
256                socket_path, e
257            ))
258        })?;
259        let payload = serde_json::to_vec(request).map_err(|e| {
260            NucleusError::NetworkError(format!("serialize slirp4netns API request: {}", e))
261        })?;
262        stream.write_all(&payload).map_err(|e| {
263            NucleusError::NetworkError(format!("write slirp4netns API request: {}", e))
264        })?;
265        stream
266            .shutdown(std::net::Shutdown::Write)
267            .map_err(|e| NucleusError::NetworkError(format!("shutdown slirp4netns API: {}", e)))?;
268
269        let mut buf = Vec::new();
270        stream.read_to_end(&mut buf).map_err(|e| {
271            NucleusError::NetworkError(format!("read slirp4netns API response: {}", e))
272        })?;
273
274        serde_json::from_slice(&buf).map_err(|e| {
275            NucleusError::NetworkError(format!(
276                "parse slirp4netns API response '{}': {}",
277                String::from_utf8_lossy(&buf),
278                e
279            ))
280        })
281    }
282
283    fn wait_until_ready(child: &mut Child, ready_read: OwnedFd) -> Result<()> {
284        let mut ready = std::fs::File::from(ready_read);
285        let mut buf = [0u8; 1];
286        match ready.read_exact(&mut buf) {
287            Ok(()) if buf == [b'1'] => Ok(()),
288            Ok(()) => Err(NucleusError::NetworkError(format!(
289                "slirp4netns ready-fd returned unexpected byte {:?}",
290                buf
291            ))),
292            Err(e) => {
293                if let Ok(Some(status)) = child.try_wait() {
294                    let detail = status
295                        .code()
296                        .map(|code| format!("exit code {}", code))
297                        .or_else(|| status.signal().map(|sig| format!("signal {}", sig)))
298                        .unwrap_or_else(|| "unknown status".to_string());
299                    Err(NucleusError::NetworkError(format!(
300                        "slirp4netns exited before ready: {}",
301                        detail
302                    )))
303                } else {
304                    Err(NucleusError::NetworkError(format!(
305                        "failed waiting for slirp4netns readiness: {}",
306                        e
307                    )))
308                }
309            }
310        }
311    }
312
313    fn stop_child(&mut self) -> Result<()> {
314        self.exit_signal.take();
315
316        let deadline = Instant::now() + Duration::from_secs(2);
317        loop {
318            match self.child.try_wait() {
319                Ok(Some(_)) => break,
320                Ok(None) if Instant::now() < deadline => {
321                    std::thread::sleep(Duration::from_millis(50))
322                }
323                Ok(None) => {
324                    self.child.kill().map_err(|e| {
325                        NucleusError::NetworkError(format!("kill slirp4netns: {}", e))
326                    })?;
327                    let _ = self.child.wait();
328                    break;
329                }
330                Err(e) => {
331                    return Err(NucleusError::NetworkError(format!(
332                        "wait for slirp4netns shutdown: {}",
333                        e
334                    )))
335                }
336            }
337        }
338
339        info!(
340            "Userspace NAT cleaned up for container {}",
341            self.container_id
342        );
343        Ok(())
344    }
345
346    fn cleanup_best_effort(&mut self) {
347        if self.state == NetworkState::Cleaned {
348            return;
349        }
350
351        self.exit_signal.take();
352
353        if let Ok(None) = self.child.try_wait() {
354            let deadline = Instant::now() + Duration::from_secs(1);
355            while Instant::now() < deadline {
356                match self.child.try_wait() {
357                    Ok(Some(_)) => break,
358                    Ok(None) => std::thread::sleep(Duration::from_millis(25)),
359                    Err(_) => break,
360                }
361            }
362
363            if let Ok(None) = self.child.try_wait() {
364                let _ = self.child.kill();
365                let _ = self.child.wait();
366            }
367        }
368
369        self.cleanup_runtime_dir();
370        self.state = NetworkState::Cleaned;
371        debug!(
372            "Userspace NAT cleaned up (best-effort via drop) for container {}",
373            self.container_id
374        );
375    }
376
377    fn cleanup_runtime_dir(&self) {
378        if let Err(e) = std::fs::remove_dir_all(&self.runtime_dir) {
379            if self.runtime_dir.exists() {
380                warn!(
381                    "Failed to remove slirp4netns runtime dir {:?}: {}",
382                    self.runtime_dir, e
383                );
384            }
385        }
386    }
387
388    fn validate_userspace_config(config: &BridgeConfig, guest_ip: &str) -> Result<()> {
389        let prefix = config
390            .subnet
391            .split_once('/')
392            .and_then(|(_, prefix)| prefix.parse::<u8>().ok())
393            .unwrap_or(24);
394        if prefix > 25 {
395            return Err(NucleusError::NetworkError(format!(
396                "Userspace NAT requires a subnet with at least 128 addresses; '{}' is too small",
397                config.subnet
398            )));
399        }
400
401        if let Some(requested_ip) = config.container_ip.as_deref() {
402            if requested_ip != guest_ip {
403                return Err(NucleusError::NetworkError(format!(
404                    "Userspace NAT uses the slirp4netns guest address {}; requested container IP {} is unsupported",
405                    guest_ip, requested_ip
406                )));
407            }
408        }
409
410        Ok(())
411    }
412
413    fn should_join_userns(_host_is_root: bool, rootless: bool) -> bool {
414        rootless
415    }
416
417    fn spawn_slirp(
418        slirp_bin: &Path,
419        pid: u32,
420        config: &BridgeConfig,
421        needs_userns: bool,
422        api_socket_path: &Path,
423        enable_sandbox: bool,
424    ) -> Result<(Child, OwnedFd)> {
425        let (ready_read, ready_write) = nix::unistd::pipe()
426            .map_err(|e| NucleusError::NetworkError(format!("ready pipe: {}", e)))?;
427        let (exit_read, exit_write) = nix::unistd::pipe()
428            .map_err(|e| NucleusError::NetworkError(format!("exit pipe: {}", e)))?;
429        Self::clear_cloexec(&ready_write)?;
430        Self::clear_cloexec(&exit_read)?;
431
432        let args = Self::command_args(
433            pid,
434            config,
435            needs_userns,
436            api_socket_path,
437            ready_write.as_raw_fd(),
438            exit_read.as_raw_fd(),
439            enable_sandbox,
440        );
441
442        let mut child = Command::new(slirp_bin)
443            .args(&args)
444            .spawn()
445            .map_err(|e| NucleusError::NetworkError(format!("spawn slirp4netns: {}", e)))?;
446
447        drop(ready_write);
448        drop(exit_read);
449
450        match Self::wait_until_ready(&mut child, ready_read) {
451            Ok(()) => Ok((child, exit_write)),
452            Err(e) => {
453                let _ = child.kill();
454                let _ = child.wait();
455                Err(e)
456            }
457        }
458    }
459
460    fn command_args(
461        pid: u32,
462        config: &BridgeConfig,
463        join_userns: bool,
464        api_socket_path: &Path,
465        ready_fd: i32,
466        exit_fd: i32,
467        enable_sandbox: bool,
468    ) -> Vec<String> {
469        let mut args = vec![
470            "--configure".to_string(),
471            "--ready-fd".to_string(),
472            ready_fd.to_string(),
473            "--exit-fd".to_string(),
474            exit_fd.to_string(),
475            "--api-socket".to_string(),
476            api_socket_path.display().to_string(),
477            "--cidr".to_string(),
478            config.subnet.clone(),
479            "--disable-host-loopback".to_string(),
480        ];
481
482        if enable_sandbox {
483            args.push("--enable-sandbox".to_string());
484        }
485
486        if !config.dns.is_empty() {
487            args.push("--disable-dns".to_string());
488        }
489
490        if join_userns {
491            args.push("--userns-path".to_string());
492            args.push(format!("/proc/{}/ns/user", pid));
493        }
494
495        args.push(pid.to_string());
496        args.push(SLIRP_TAP_NAME.to_string());
497        args
498    }
499
500    fn runtime_dir(container_id: &str) -> PathBuf {
501        let base = if nix::unistd::Uid::effective().is_root() {
502            PathBuf::from("/run/nucleus/userspace-net")
503        } else {
504            dirs::runtime_dir()
505                .map(|dir| dir.join("nucleus/userspace-net"))
506                .or_else(|| dirs::data_local_dir().map(|dir| dir.join("nucleus/userspace-net")))
507                .unwrap_or_else(|| std::env::temp_dir().join("nucleus-userspace-net"))
508        };
509        base.join(container_id)
510    }
511
512    fn ensure_runtime_dir(path: &Path) -> Result<()> {
513        if let Some(parent) = path.parent() {
514            std::fs::create_dir_all(parent).map_err(|e| {
515                NucleusError::NetworkError(format!(
516                    "create userspace-net parent dir {:?}: {}",
517                    parent, e
518                ))
519            })?;
520        }
521        std::fs::create_dir_all(path).map_err(|e| {
522            NucleusError::NetworkError(format!("create userspace-net dir {:?}: {}", path, e))
523        })?;
524        use std::os::unix::fs::PermissionsExt;
525        std::fs::set_permissions(path, std::fs::Permissions::from_mode(0o700)).map_err(|e| {
526            NucleusError::NetworkError(format!(
527                "secure userspace-net dir permissions for {:?}: {}",
528                path, e
529            ))
530        })?;
531        Ok(())
532    }
533
534    fn clear_cloexec(fd: &OwnedFd) -> Result<()> {
535        let flags = fcntl(fd, FcntlArg::F_GETFD).map_err(|e| {
536            NucleusError::NetworkError(format!("fcntl(F_GETFD) on fd {}: {}", fd.as_raw_fd(), e))
537        })?;
538        let fd_flags = FdFlag::from_bits_truncate(flags);
539        let new_flags = fd_flags & !FdFlag::FD_CLOEXEC;
540        fcntl(fd, FcntlArg::F_SETFD(new_flags)).map_err(|e| {
541            NucleusError::NetworkError(format!("fcntl(F_SETFD) on fd {}: {}", fd.as_raw_fd(), e))
542        })?;
543        Ok(())
544    }
545
546    fn guest_ip_from_subnet(subnet: &str) -> Result<String> {
547        Self::offset_ip_from_subnet(subnet, 100).map(|ip| ip.to_string())
548    }
549
550    fn dns_ip_from_subnet(subnet: &str) -> Result<String> {
551        Self::offset_ip_from_subnet(subnet, 3).map(|ip| ip.to_string())
552    }
553
554    fn offset_ip_from_subnet(subnet: &str, offset: u32) -> Result<std::net::Ipv4Addr> {
555        let (base, prefix) = subnet.split_once('/').ok_or_else(|| {
556            NucleusError::NetworkError(format!("Invalid CIDR (missing /prefix): '{}'", subnet))
557        })?;
558        let prefix = prefix.parse::<u8>().map_err(|e| {
559            NucleusError::NetworkError(format!("Invalid CIDR prefix '{}': {}", subnet, e))
560        })?;
561        let base_ip = base.parse::<std::net::Ipv4Addr>().map_err(|e| {
562            NucleusError::NetworkError(format!("Invalid CIDR base '{}': {}", subnet, e))
563        })?;
564
565        let host_capacity = if prefix == 32 {
566            1u64
567        } else {
568            1u64 << (32 - prefix)
569        };
570        if offset as u64 >= host_capacity {
571            return Err(NucleusError::NetworkError(format!(
572                "CIDR '{}' does not have room for host offset {}",
573                subnet, offset
574            )));
575        }
576
577        let candidate = u32::from(base_ip)
578            .checked_add(offset)
579            .ok_or_else(|| NucleusError::NetworkError(format!("CIDR '{}' overflowed", subnet)))?;
580        Ok(std::net::Ipv4Addr::from(candidate))
581    }
582}
583
584impl Drop for UserspaceNetwork {
585    fn drop(&mut self) {
586        self.cleanup_best_effort();
587    }
588}
589
#[cfg(test)]
mod tests {
    use super::*;
    use crate::network::Protocol;

    /// Returns true when `flag` appears verbatim in `args`.
    fn has_arg(args: &[String], flag: &str) -> bool {
        args.iter().any(|arg| arg == flag)
    }

    /// Automatic backend selection: kernel NAT for a rootful, non-rootless
    /// setup; userspace NAT whenever the container is rootless.
    #[test]
    fn test_auto_nat_backend_prefers_kernel_for_rootful_hosts() {
        let cfg = BridgeConfig::default();
        let cases = [
            (true, false, NatBackend::Kernel),
            (true, true, NatBackend::Userspace),
            (false, true, NatBackend::Userspace),
        ];
        for (host_is_root, rootless, expected) in cases {
            assert_eq!(cfg.selected_nat_backend(host_is_root, rootless), expected);
        }
    }

    /// A /26 has only 64 addresses, so the guest offset of 100 cannot fit.
    #[test]
    fn test_userspace_backend_rejects_too_small_subnets() {
        let cfg = BridgeConfig {
            subnet: "10.0.42.0/26".to_string(),
            ..BridgeConfig::default()
        };

        let guest_ip = UserspaceNetwork::guest_ip_from_subnet(&cfg.subnet).unwrap_err();
        let message = guest_ip.to_string();
        assert!(
            message.contains("does not have room"),
            "unexpected error: {guest_ip}"
        );
    }

    /// A requested container IP that differs from the guest IP is rejected.
    #[test]
    fn test_userspace_backend_rejects_custom_guest_ip() {
        let cfg = BridgeConfig {
            container_ip: Some("10.0.42.2".to_string()),
            ..BridgeConfig::default()
        };

        let message = UserspaceNetwork::validate_userspace_config(&cfg, "10.0.42.100")
            .unwrap_err()
            .to_string();
        assert!(message.contains("requested container IP 10.0.42.2 is unsupported"));
    }

    /// Explicit DNS servers switch off the helper's built-in DNS.
    #[test]
    fn test_slirp_command_args_disable_builtin_dns_when_explicit_dns_is_set() {
        let cfg = BridgeConfig::default().with_dns(vec!["1.1.1.1".to_string()]);
        let socket = Path::new("/tmp/slirp.sock");
        let args = UserspaceNetwork::command_args(4242, &cfg, true, socket, 5, 6, true);

        assert!(has_arg(&args, "--disable-dns"));
        assert!(has_arg(&args, "--userns-path"));
    }

    /// The user-namespace decision follows `rootless` only.
    #[test]
    fn test_slirp_userns_join_is_kept_for_root_remapped_hosts() {
        for host_is_root in [true, false] {
            assert!(UserspaceNetwork::should_join_userns(host_is_root, true));
        }
        for host_is_root in [true, false] {
            assert!(!UserspaceNetwork::should_join_userns(host_is_root, false));
        }
    }

    /// Dropping the sandbox flag must not drop the user-namespace arguments.
    #[test]
    fn test_slirp_command_args_keep_userns_without_sandbox() {
        let cfg = BridgeConfig::default();
        let socket = Path::new("/tmp/slirp.sock");
        let args = UserspaceNetwork::command_args(4242, &cfg, true, socket, 5, 6, false);

        assert!(!has_arg(&args, "--enable-sandbox"));
        assert!(has_arg(&args, "--userns-path"));
        assert!(has_arg(&args, "/proc/4242/ns/user"));
    }

    /// Pins the JSON shape of an `add_hostfwd` request.
    #[test]
    fn test_userspace_port_forward_request_uses_slirp_hostfwd_shape() {
        let pf = PortForward {
            host_ip: Some(std::net::Ipv4Addr::new(127, 0, 0, 1)),
            host_port: 8080,
            container_port: 80,
            protocol: Protocol::Tcp,
        };

        let mut arguments = json!({
            "proto": pf.protocol.as_str(),
            "host_port": pf.host_port,
            "guest_port": pf.container_port,
        });
        if let Some(host_ip) = pf.host_ip {
            arguments["host_addr"] = json!(host_ip.to_string());
        }
        let request = json!({
            "execute": "add_hostfwd",
            "arguments": arguments,
        });

        assert_eq!(request["execute"], "add_hostfwd");
        let sent = &request["arguments"];
        assert_eq!(sent["proto"], "tcp");
        assert_eq!(sent["host_addr"], "127.0.0.1");
        assert_eq!(sent["host_port"], 8080);
        assert_eq!(sent["guest_port"], 80);
    }
}