Skip to main content

nucleus/network/
userspace.rs

1use super::{
2    egress, BridgeConfig, BridgeNetwork, EgressPolicy, NatBackend, NetworkState, PortForward,
3};
4use crate::error::{NucleusError, Result, StateTransition};
5use nix::fcntl::{fcntl, FcntlArg, FdFlag};
6use serde_json::json;
7use std::io::{Read, Write};
8use std::os::fd::{AsRawFd, OwnedFd};
9use std::os::unix::net::UnixStream;
10use std::os::unix::process::ExitStatusExt;
11use std::path::{Path, PathBuf};
12use std::process::{Child, Command};
13use std::time::{Duration, Instant};
14use tracing::{debug, info, warn};
15
/// Name of the TAP interface; passed to slirp4netns as its final positional argument.
const SLIRP_TAP_NAME: &str = "tap0";
17
/// Native bridge-mode driver for the native runtime.
///
/// Wraps whichever NAT implementation was selected for a container so callers
/// can set up, apply egress policy to, and tear down networking through one type.
pub enum BridgeDriver {
    /// NAT provided by [`BridgeNetwork`].
    Kernel(BridgeNetwork),
    /// NAT provided by [`UserspaceNetwork`], which runs slirp4netns.
    Userspace(UserspaceNetwork),
}
23
24impl BridgeDriver {
25    pub fn setup_with_id(
26        pid: u32,
27        config: &BridgeConfig,
28        container_id: &str,
29        host_is_root: bool,
30        rootless: bool,
31    ) -> Result<Self> {
32        match config.selected_nat_backend(host_is_root, rootless) {
33            NatBackend::Kernel => Ok(Self::Kernel(BridgeNetwork::setup_with_id(
34                pid,
35                config,
36                container_id,
37            )?)),
38            NatBackend::Userspace => Ok(Self::Userspace(UserspaceNetwork::setup_with_id(
39                pid,
40                config,
41                container_id,
42                host_is_root,
43                rootless,
44            )?)),
45            NatBackend::Auto => Err(NucleusError::NetworkError(
46                "nat backend selection resolved to auto unexpectedly".to_string(),
47            )),
48        }
49    }
50
51    pub fn apply_egress_policy(
52        &self,
53        pid: u32,
54        policy: &EgressPolicy,
55        rootless: bool,
56    ) -> Result<()> {
57        match self {
58            Self::Kernel(net) => net.apply_egress_policy(pid, policy),
59            Self::Userspace(net) => net.apply_egress_policy(pid, policy, rootless),
60        }
61    }
62
63    pub fn cleanup(self) -> Result<()> {
64        match self {
65            Self::Kernel(net) => net.cleanup(),
66            Self::Userspace(net) => net.cleanup(),
67        }
68    }
69}
70
/// Userspace NAT manager backed by slirp4netns.
///
/// Owns the slirp4netns child process and its per-container runtime directory;
/// both are released by `cleanup` or, failing that, when the value is dropped.
pub struct UserspaceNetwork {
    /// Copy of the bridge configuration the network was set up with.
    config: BridgeConfig,
    /// Guest address derived from the configured subnet (host offset 100).
    guest_ip: String,
    /// Identifier of the container this network belongs to (used in logs).
    container_id: String,
    /// Path of the slirp4netns API socket inside `runtime_dir`.
    api_socket_path: PathBuf,
    /// Per-container directory holding the API socket; removed during cleanup.
    runtime_dir: PathBuf,
    /// Write end of the pipe whose read end is handed to slirp4netns via
    /// `--exit-fd`. It is taken (and thereby closed) at shutdown; presumably
    /// that hang-up is what asks slirp4netns to exit — confirm against its docs.
    exit_signal: Option<OwnedFd>,
    /// Handle to the spawned slirp4netns process.
    child: Child,
    /// Lifecycle state of this network.
    state: NetworkState,
}
82
83impl UserspaceNetwork {
    /// Returns the DNS address used by default for `subnet`: the address at
    /// host offset 3 from the CIDR base, as computed by `dns_ip_from_subnet`.
    ///
    /// # Errors
    ///
    /// Fails when `subnet` is not a valid CIDR or has no room for that offset.
    pub(crate) fn default_dns_server(subnet: &str) -> Result<String> {
        Self::dns_ip_from_subnet(subnet)
    }
87
88    pub fn setup_with_id(
89        pid: u32,
90        config: &BridgeConfig,
91        container_id: &str,
92        host_is_root: bool,
93        rootless: bool,
94    ) -> Result<Self> {
95        config.validate()?;
96
97        let guest_ip = Self::guest_ip_from_subnet(&config.subnet)?;
98        Self::validate_userspace_config(config, &guest_ip)?;
99
100        let mut state = NetworkState::Unconfigured;
101        state = state.transition(NetworkState::Configuring)?;
102
103        let runtime_dir = Self::runtime_dir(container_id);
104        Self::ensure_runtime_dir(&runtime_dir)?;
105        let api_socket_path = runtime_dir.join("slirp4netns.sock");
106
107        let slirp = BridgeNetwork::resolve_bin("slirp4netns")?;
108        // Only join the container's user namespace when the host process is
109        // genuinely unprivileged.  A root-owned process can already access any
110        // network namespace via /proc/{pid}/ns/net.  Entering the container's
111        // user namespace would cause the host root mount to become a *locked
112        // mount* in the new mount namespace slirp4netns creates for its sandbox,
113        // and pivot_root(2) cannot pivot away from a locked mount.
114        let needs_userns = rootless && !host_is_root;
115
116        let slirp_path = Path::new(&slirp);
117        let (child, exit_write) = match Self::spawn_slirp(
118            slirp_path,
119            pid,
120            config,
121            needs_userns,
122            &api_socket_path,
123            true,
124        ) {
125            Ok(result) => result,
126            Err(e) => {
127                warn!(
128                    "slirp4netns sandbox failed ({}), retrying without --enable-sandbox",
129                    e
130                );
131                // The sandbox uses pivot_root(2) which can fail in constrained
132                // environments (e.g. QEMU test VMs, nested containers) where
133                // mount propagation or /tmp restrictions prevent the pivot.
134                // Retry without --enable-sandbox; slirp4netns is still
135                // process-isolated via its network namespace.
136                let _ = std::fs::remove_file(&api_socket_path);
137                Self::spawn_slirp(
138                    slirp_path,
139                    pid,
140                    config,
141                    needs_userns,
142                    &api_socket_path,
143                    false,
144                )
145                .map_err(|retry_err| {
146                    let _ = std::fs::remove_dir_all(&runtime_dir);
147                    retry_err
148                })?
149            }
150        };
151
152        let mut network = Self {
153            config: config.clone(),
154            guest_ip: guest_ip.to_string(),
155            container_id: container_id.to_string(),
156            api_socket_path,
157            runtime_dir,
158            exit_signal: Some(exit_write),
159            child,
160            state,
161        };
162
163        if let Err(e) = network.configure_port_forwards() {
164            network.cleanup_best_effort();
165            return Err(e);
166        }
167
168        network.state = network.state.transition(NetworkState::Active)?;
169
170        info!(
171            "Userspace NAT configured via slirp4netns for container {} (guest IP {})",
172            network.container_id, network.guest_ip
173        );
174
175        Ok(network)
176    }
177
178    pub fn apply_egress_policy(
179        &self,
180        pid: u32,
181        policy: &EgressPolicy,
182        rootless: bool,
183    ) -> Result<()> {
184        egress::apply_egress_policy(pid, &self.effective_dns_servers(), policy, rootless)
185    }
186
187    pub fn cleanup(mut self) -> Result<()> {
188        self.state = self.state.transition(NetworkState::Cleaned)?;
189        self.stop_child()?;
190        self.cleanup_runtime_dir();
191        Ok(())
192    }
193
194    fn effective_dns_servers(&self) -> Vec<String> {
195        if self.config.dns.is_empty() {
196            vec![Self::dns_ip_from_subnet(&self.config.subnet)
197                .unwrap_or_else(|_| "10.0.2.3".to_string())]
198        } else {
199            self.config.dns.clone()
200        }
201    }
202
203    fn configure_port_forwards(&mut self) -> Result<()> {
204        for pf in &self.config.port_forwards {
205            self.add_port_forward(pf)?;
206        }
207        Ok(())
208    }
209
210    fn add_port_forward(&self, pf: &PortForward) -> Result<()> {
211        let mut arguments = serde_json::Map::new();
212        arguments.insert("proto".to_string(), json!(pf.protocol.as_str()));
213        arguments.insert("host_port".to_string(), json!(pf.host_port));
214        arguments.insert("guest_port".to_string(), json!(pf.container_port));
215        if let Some(host_ip) = pf.host_ip {
216            arguments.insert("host_addr".to_string(), json!(host_ip.to_string()));
217        }
218
219        let response = Self::api_request(
220            &self.api_socket_path,
221            &json!({
222                "execute": "add_hostfwd",
223                "arguments": arguments,
224            }),
225        )?;
226
227        if let Some(error) = response.get("error") {
228            return Err(NucleusError::NetworkError(format!(
229                "slirp4netns add_hostfwd failed for {}:{}->{}/{}: {}",
230                pf.host_ip
231                    .map(|ip| ip.to_string())
232                    .unwrap_or_else(|| "0.0.0.0".to_string()),
233                pf.host_port,
234                pf.container_port,
235                pf.protocol,
236                error
237            )));
238        }
239
240        debug!(
241            "Configured slirp4netns port forward {}:{} -> {}:{}/{}",
242            pf.host_ip
243                .map(|ip| ip.to_string())
244                .unwrap_or_else(|| "0.0.0.0".to_string()),
245            pf.host_port,
246            self.guest_ip,
247            pf.container_port,
248            pf.protocol
249        );
250        Ok(())
251    }
252
253    fn api_request(socket_path: &Path, request: &serde_json::Value) -> Result<serde_json::Value> {
254        let mut stream = UnixStream::connect(socket_path).map_err(|e| {
255            NucleusError::NetworkError(format!(
256                "connect slirp4netns API socket {:?}: {}",
257                socket_path, e
258            ))
259        })?;
260        let payload = serde_json::to_vec(request).map_err(|e| {
261            NucleusError::NetworkError(format!("serialize slirp4netns API request: {}", e))
262        })?;
263        stream.write_all(&payload).map_err(|e| {
264            NucleusError::NetworkError(format!("write slirp4netns API request: {}", e))
265        })?;
266        stream
267            .shutdown(std::net::Shutdown::Write)
268            .map_err(|e| NucleusError::NetworkError(format!("shutdown slirp4netns API: {}", e)))?;
269
270        let mut buf = Vec::new();
271        stream.read_to_end(&mut buf).map_err(|e| {
272            NucleusError::NetworkError(format!("read slirp4netns API response: {}", e))
273        })?;
274
275        serde_json::from_slice(&buf).map_err(|e| {
276            NucleusError::NetworkError(format!(
277                "parse slirp4netns API response '{}': {}",
278                String::from_utf8_lossy(&buf),
279                e
280            ))
281        })
282    }
283
284    fn wait_until_ready(child: &mut Child, ready_read: OwnedFd) -> Result<()> {
285        let mut ready = std::fs::File::from(ready_read);
286        let mut buf = [0u8; 1];
287        match ready.read_exact(&mut buf) {
288            Ok(()) if buf == [b'1'] => Ok(()),
289            Ok(()) => Err(NucleusError::NetworkError(format!(
290                "slirp4netns ready-fd returned unexpected byte {:?}",
291                buf
292            ))),
293            Err(e) => {
294                if let Ok(Some(status)) = child.try_wait() {
295                    let detail = status
296                        .code()
297                        .map(|code| format!("exit code {}", code))
298                        .or_else(|| status.signal().map(|sig| format!("signal {}", sig)))
299                        .unwrap_or_else(|| "unknown status".to_string());
300                    Err(NucleusError::NetworkError(format!(
301                        "slirp4netns exited before ready: {}",
302                        detail
303                    )))
304                } else {
305                    Err(NucleusError::NetworkError(format!(
306                        "failed waiting for slirp4netns readiness: {}",
307                        e
308                    )))
309                }
310            }
311        }
312    }
313
314    fn stop_child(&mut self) -> Result<()> {
315        self.exit_signal.take();
316
317        let deadline = Instant::now() + Duration::from_secs(2);
318        loop {
319            match self.child.try_wait() {
320                Ok(Some(_)) => break,
321                Ok(None) if Instant::now() < deadline => {
322                    std::thread::sleep(Duration::from_millis(50))
323                }
324                Ok(None) => {
325                    self.child.kill().map_err(|e| {
326                        NucleusError::NetworkError(format!("kill slirp4netns: {}", e))
327                    })?;
328                    let _ = self.child.wait();
329                    break;
330                }
331                Err(e) => {
332                    return Err(NucleusError::NetworkError(format!(
333                        "wait for slirp4netns shutdown: {}",
334                        e
335                    )))
336                }
337            }
338        }
339
340        info!(
341            "Userspace NAT cleaned up for container {}",
342            self.container_id
343        );
344        Ok(())
345    }
346
347    fn cleanup_best_effort(&mut self) {
348        if self.state == NetworkState::Cleaned {
349            return;
350        }
351
352        self.exit_signal.take();
353
354        if let Ok(None) = self.child.try_wait() {
355            let deadline = Instant::now() + Duration::from_secs(1);
356            while Instant::now() < deadline {
357                match self.child.try_wait() {
358                    Ok(Some(_)) => break,
359                    Ok(None) => std::thread::sleep(Duration::from_millis(25)),
360                    Err(_) => break,
361                }
362            }
363
364            if let Ok(None) = self.child.try_wait() {
365                let _ = self.child.kill();
366                let _ = self.child.wait();
367            }
368        }
369
370        self.cleanup_runtime_dir();
371        self.state = NetworkState::Cleaned;
372        debug!(
373            "Userspace NAT cleaned up (best-effort via drop) for container {}",
374            self.container_id
375        );
376    }
377
378    fn cleanup_runtime_dir(&self) {
379        if let Err(e) = std::fs::remove_dir_all(&self.runtime_dir) {
380            if self.runtime_dir.exists() {
381                warn!(
382                    "Failed to remove slirp4netns runtime dir {:?}: {}",
383                    self.runtime_dir, e
384                );
385            }
386        }
387    }
388
389    fn validate_userspace_config(config: &BridgeConfig, guest_ip: &str) -> Result<()> {
390        let prefix = config
391            .subnet
392            .split_once('/')
393            .and_then(|(_, prefix)| prefix.parse::<u8>().ok())
394            .unwrap_or(24);
395        if prefix > 25 {
396            return Err(NucleusError::NetworkError(format!(
397                "Userspace NAT requires a subnet with at least 128 addresses; '{}' is too small",
398                config.subnet
399            )));
400        }
401
402        if let Some(requested_ip) = config.container_ip.as_deref() {
403            if requested_ip != guest_ip {
404                return Err(NucleusError::NetworkError(format!(
405                    "Userspace NAT uses the slirp4netns guest address {}; requested container IP {} is unsupported",
406                    guest_ip, requested_ip
407                )));
408            }
409        }
410
411        Ok(())
412    }
413
414    fn spawn_slirp(
415        slirp_bin: &Path,
416        pid: u32,
417        config: &BridgeConfig,
418        needs_userns: bool,
419        api_socket_path: &Path,
420        enable_sandbox: bool,
421    ) -> Result<(Child, OwnedFd)> {
422        let (ready_read, ready_write) = nix::unistd::pipe()
423            .map_err(|e| NucleusError::NetworkError(format!("ready pipe: {}", e)))?;
424        let (exit_read, exit_write) = nix::unistd::pipe()
425            .map_err(|e| NucleusError::NetworkError(format!("exit pipe: {}", e)))?;
426        Self::clear_cloexec(&ready_write)?;
427        Self::clear_cloexec(&exit_read)?;
428
429        let args = Self::command_args(
430            pid,
431            config,
432            needs_userns,
433            api_socket_path,
434            ready_write.as_raw_fd(),
435            exit_read.as_raw_fd(),
436            enable_sandbox,
437        );
438
439        let mut child = Command::new(slirp_bin)
440            .args(&args)
441            .spawn()
442            .map_err(|e| NucleusError::NetworkError(format!("spawn slirp4netns: {}", e)))?;
443
444        drop(ready_write);
445        drop(exit_read);
446
447        match Self::wait_until_ready(&mut child, ready_read) {
448            Ok(()) => Ok((child, exit_write)),
449            Err(e) => {
450                let _ = child.kill();
451                let _ = child.wait();
452                Err(e)
453            }
454        }
455    }
456
457    fn command_args(
458        pid: u32,
459        config: &BridgeConfig,
460        join_userns: bool,
461        api_socket_path: &Path,
462        ready_fd: i32,
463        exit_fd: i32,
464        enable_sandbox: bool,
465    ) -> Vec<String> {
466        let mut args = vec![
467            "--configure".to_string(),
468            "--ready-fd".to_string(),
469            ready_fd.to_string(),
470            "--exit-fd".to_string(),
471            exit_fd.to_string(),
472            "--api-socket".to_string(),
473            api_socket_path.display().to_string(),
474            "--cidr".to_string(),
475            config.subnet.clone(),
476            "--disable-host-loopback".to_string(),
477        ];
478
479        if enable_sandbox {
480            args.push("--enable-sandbox".to_string());
481        }
482
483        if !config.dns.is_empty() {
484            args.push("--disable-dns".to_string());
485        }
486
487        if join_userns {
488            args.push("--userns-path".to_string());
489            args.push(format!("/proc/{}/ns/user", pid));
490        }
491
492        args.push(pid.to_string());
493        args.push(SLIRP_TAP_NAME.to_string());
494        args
495    }
496
497    fn runtime_dir(container_id: &str) -> PathBuf {
498        let base = if nix::unistd::Uid::effective().is_root() {
499            PathBuf::from("/run/nucleus/userspace-net")
500        } else {
501            dirs::runtime_dir()
502                .map(|dir| dir.join("nucleus/userspace-net"))
503                .or_else(|| dirs::data_local_dir().map(|dir| dir.join("nucleus/userspace-net")))
504                .unwrap_or_else(|| std::env::temp_dir().join("nucleus-userspace-net"))
505        };
506        base.join(container_id)
507    }
508
509    fn ensure_runtime_dir(path: &Path) -> Result<()> {
510        if let Some(parent) = path.parent() {
511            std::fs::create_dir_all(parent).map_err(|e| {
512                NucleusError::NetworkError(format!(
513                    "create userspace-net parent dir {:?}: {}",
514                    parent, e
515                ))
516            })?;
517        }
518        std::fs::create_dir_all(path).map_err(|e| {
519            NucleusError::NetworkError(format!("create userspace-net dir {:?}: {}", path, e))
520        })?;
521        use std::os::unix::fs::PermissionsExt;
522        std::fs::set_permissions(path, std::fs::Permissions::from_mode(0o700)).map_err(|e| {
523            NucleusError::NetworkError(format!(
524                "secure userspace-net dir permissions for {:?}: {}",
525                path, e
526            ))
527        })?;
528        Ok(())
529    }
530
531    fn clear_cloexec(fd: &OwnedFd) -> Result<()> {
532        let flags = fcntl(fd, FcntlArg::F_GETFD).map_err(|e| {
533            NucleusError::NetworkError(format!("fcntl(F_GETFD) on fd {}: {}", fd.as_raw_fd(), e))
534        })?;
535        let fd_flags = FdFlag::from_bits_truncate(flags);
536        let new_flags = fd_flags & !FdFlag::FD_CLOEXEC;
537        fcntl(fd, FcntlArg::F_SETFD(new_flags)).map_err(|e| {
538            NucleusError::NetworkError(format!("fcntl(F_SETFD) on fd {}: {}", fd.as_raw_fd(), e))
539        })?;
540        Ok(())
541    }
542
543    fn guest_ip_from_subnet(subnet: &str) -> Result<String> {
544        Self::offset_ip_from_subnet(subnet, 100).map(|ip| ip.to_string())
545    }
546
547    fn dns_ip_from_subnet(subnet: &str) -> Result<String> {
548        Self::offset_ip_from_subnet(subnet, 3).map(|ip| ip.to_string())
549    }
550
551    fn offset_ip_from_subnet(subnet: &str, offset: u32) -> Result<std::net::Ipv4Addr> {
552        let (base, prefix) = subnet.split_once('/').ok_or_else(|| {
553            NucleusError::NetworkError(format!("Invalid CIDR (missing /prefix): '{}'", subnet))
554        })?;
555        let prefix = prefix.parse::<u8>().map_err(|e| {
556            NucleusError::NetworkError(format!("Invalid CIDR prefix '{}': {}", subnet, e))
557        })?;
558        let base_ip = base.parse::<std::net::Ipv4Addr>().map_err(|e| {
559            NucleusError::NetworkError(format!("Invalid CIDR base '{}': {}", subnet, e))
560        })?;
561
562        let host_capacity = if prefix == 32 {
563            1u64
564        } else {
565            1u64 << (32 - prefix)
566        };
567        if offset as u64 >= host_capacity {
568            return Err(NucleusError::NetworkError(format!(
569                "CIDR '{}' does not have room for host offset {}",
570                subnet, offset
571            )));
572        }
573
574        let candidate = u32::from(base_ip)
575            .checked_add(offset)
576            .ok_or_else(|| NucleusError::NetworkError(format!("CIDR '{}' overflowed", subnet)))?;
577        Ok(std::net::Ipv4Addr::from(candidate))
578    }
579}
580
/// Ensures the slirp4netns process and runtime directory are released even
/// when `cleanup` was never called or returned early.
impl Drop for UserspaceNetwork {
    fn drop(&mut self) {
        // Does nothing when the state is already `Cleaned`.
        self.cleanup_best_effort();
    }
}
586
#[cfg(test)]
mod tests {
    use super::*;
    use crate::network::Protocol;

    /// With the default configuration only a rootful, non-rootless setup
    /// selects the kernel backend.
    #[test]
    fn test_auto_nat_backend_prefers_kernel_for_rootful_hosts() {
        let cfg = BridgeConfig::default();
        assert_eq!(cfg.selected_nat_backend(true, false), NatBackend::Kernel);
        assert_eq!(cfg.selected_nat_backend(true, true), NatBackend::Userspace);
        assert_eq!(cfg.selected_nat_backend(false, true), NatBackend::Userspace);
    }

    /// A /26 is rejected both when deriving the guest address (offset 100
    /// does not fit) and by the validator's subnet-size guard.
    #[test]
    fn test_userspace_backend_rejects_too_small_subnets() {
        let cfg = BridgeConfig {
            subnet: "10.0.42.0/26".to_string(),
            ..BridgeConfig::default()
        };

        let err = UserspaceNetwork::guest_ip_from_subnet(&cfg.subnet).unwrap_err();
        assert!(
            err.to_string().contains("does not have room"),
            "unexpected error: {err}"
        );

        let err = UserspaceNetwork::validate_userspace_config(&cfg, "10.0.42.100").unwrap_err();
        assert!(
            err.to_string().contains("is too small"),
            "unexpected error: {err}"
        );
    }

    /// A requested container IP other than the slirp4netns guest address is rejected.
    #[test]
    fn test_userspace_backend_rejects_custom_guest_ip() {
        let cfg = BridgeConfig {
            container_ip: Some("10.0.42.2".to_string()),
            ..BridgeConfig::default()
        };

        let err = UserspaceNetwork::validate_userspace_config(&cfg, "10.0.42.100").unwrap_err();
        assert!(err
            .to_string()
            .contains("requested container IP 10.0.42.2 is unsupported"));
    }

    /// Explicit DNS servers turn on `--disable-dns`; joining the user
    /// namespace adds `--userns-path`.
    #[test]
    fn test_slirp_command_args_disable_builtin_dns_when_explicit_dns_is_set() {
        let cfg = BridgeConfig::default().with_dns(vec!["1.1.1.1".to_string()]);
        let args = UserspaceNetwork::command_args(
            4242,
            &cfg,
            true,
            Path::new("/tmp/slirp.sock"),
            5,
            6,
            true,
        );

        assert!(args.iter().any(|arg| arg == "--disable-dns"));
        assert!(args.iter().any(|arg| arg == "--userns-path"));
    }

    /// Pins the JSON shape of an `add_hostfwd` request. The request is
    /// rebuilt here field by field in the same way as `add_port_forward`.
    #[test]
    fn test_userspace_port_forward_request_uses_slirp_hostfwd_shape() {
        let pf = PortForward {
            host_ip: Some(std::net::Ipv4Addr::new(127, 0, 0, 1)),
            host_port: 8080,
            container_port: 80,
            protocol: Protocol::Tcp,
        };

        let mut arguments = serde_json::Map::new();
        arguments.insert("proto".to_string(), json!(pf.protocol.as_str()));
        arguments.insert("host_port".to_string(), json!(pf.host_port));
        arguments.insert("guest_port".to_string(), json!(pf.container_port));
        if let Some(host_ip) = pf.host_ip {
            arguments.insert("host_addr".to_string(), json!(host_ip.to_string()));
        }
        let request = json!({
            "execute": "add_hostfwd",
            "arguments": arguments,
        });

        assert_eq!(request["execute"], "add_hostfwd");
        assert_eq!(request["arguments"]["proto"], "tcp");
        assert_eq!(request["arguments"]["host_addr"], "127.0.0.1");
        assert_eq!(request["arguments"]["host_port"], 8080);
        assert_eq!(request["arguments"]["guest_port"], 80);
    }
}