Skip to main content

microsandbox_network/host/
linux.rs

1//! Linux host backend: TAP device + kernel forwarding/NAT via nftables.
2//!
3//! Creates one TAP device per sandbox, assigns gateway addresses, enables
4//! IP forwarding, and registers the sandbox in shared nftables sets.
5
6use std::{
7    net::{Ipv4Addr, Ipv6Addr},
8    os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd},
9};
10
11use ipnetwork::{Ipv4Network, Ipv6Network};
12
13use super::FrameTransport;
14use crate::{
15    config::InterfaceConfig,
16    ready::{MsbnetReady, MsbnetReadyIpv4, MsbnetReadyIpv6},
17};
18
19//--------------------------------------------------------------------------------------------------
20// Constants
21//--------------------------------------------------------------------------------------------------
22
/// First address of the IPv4 pool `100.96.0.0/11`, as a host-order integer.
const IPV4_POOL_BASE: u32 = u32::from_be_bytes([100, 96, 0, 0]);

/// Leading six bytes (`fd42:6d73:0062`) of the IPv6 pool prefix `fd42:6d73:62::`.
const IPV6_PREFIX: [u8; 6] = [0xfd, 0x42, 0x6d, 0x73, 0x00, 0x62];

/// Prefix shared by all TAP interface names.
const TAP_PREFIX: &str = "msbtap";
31
32/// TUN/TAP ioctl request code.
33const TUNSETIFF: libc::c_ulong = 0x4004_54ca;
34
35/// IFF_TAP — TAP device (layer 2).
36const IFF_TAP: libc::c_short = 0x0002;
37
38/// IFF_NO_PI — no packet information header.
39const IFF_NO_PI: libc::c_short = 0x1000;
40
41//--------------------------------------------------------------------------------------------------
42// Types
43//--------------------------------------------------------------------------------------------------
44
/// TAP-based network backend for Linux.
///
/// Holds the TAP file descriptor and the resolved network parameters.
/// On drop, cleans up nftables set elements and removes the TAP device.
///
/// Implements [`Debug`](std::fmt::Debug) so a link can be logged or embedded
/// in other debuggable types.
#[derive(Debug)]
pub struct TapLink {
    /// TAP device file descriptor.
    pub tap_fd: OwnedFd,

    /// TAP interface name (e.g. `msbtap42`).
    pub ifname: String,

    /// Gateway IPv4 address (assigned to the TAP interface).
    pub gateway_v4: Ipv4Addr,

    /// Guest IPv4 address.
    pub guest_v4: Ipv4Addr,

    /// IPv4 prefix length.
    pub prefix_v4: u8,

    /// Gateway IPv6 address (assigned to the TAP interface).
    pub gateway_v6: Ipv6Addr,

    /// Guest IPv6 address.
    pub guest_v6: Ipv6Addr,

    /// IPv6 prefix length.
    pub prefix_v6: u8,

    /// Interface MTU in bytes.
    pub mtu: u16,

    /// Guest MAC address.
    pub mac: [u8; 6],

    /// Sandbox IPv4 subnet registered in nftables, in CIDR notation.
    pub subnet_v4: String,

    /// Sandbox IPv6 subnet registered in nftables, in CIDR notation.
    pub subnet_v6: String,
}
86
87//--------------------------------------------------------------------------------------------------
88// Methods
89//--------------------------------------------------------------------------------------------------
90
91impl TapLink {
92    /// Creates a new TAP device and configures host networking for the given sandbox slot.
93    ///
94    /// Privileged operations (must run as root):
95    /// 1. Create TAP device
96    /// 2. Assign gateway IPv4/IPv6 addresses to the TAP interface
97    /// 3. Set MTU
98    /// 4. Bring interface up
99    /// 5. Enable IP forwarding (sysctl)
100    /// 6. Ensure shared nftables table/chains/sets exist
101    /// 7. Register this sandbox in the nftables sets
102    pub fn create(slot: u32, interface: &InterfaceConfig) -> std::io::Result<Self> {
103        // The IPv6 pool (fd42:6d73:62::/48) holds 2^16 /64 subnets = 65536 slots.
104        // The IPv4 pool (100.96.0.0/11) is larger (2^19) but must be capped to the
105        // IPv6 limit to keep both address families valid.
106        const MAX_SLOT: u32 = 0xFFFF;
107        if slot > MAX_SLOT {
108            return Err(std::io::Error::other(format!(
109                "sandbox slot {slot} exceeds maximum ({MAX_SLOT})"
110            )));
111        }
112
113        let ifname = format!("{TAP_PREFIX}{slot}");
114        let (gateway_v4, guest_v4, prefix_v4) = interface
115            .ipv4
116            .as_ref()
117            .map(|ipv4| (ipv4.gateway, ipv4.address, ipv4.prefix_len))
118            .unwrap_or_else(|| compute_ipv4_addresses(slot));
119        let (gateway_v6, guest_v6, prefix_v6) = interface
120            .ipv6
121            .as_ref()
122            .map(|ipv6| (ipv6.gateway, ipv6.address, ipv6.prefix_len))
123            .unwrap_or_else(|| compute_ipv6_addresses(slot));
124        let mac = interface.mac.unwrap_or_else(|| compute_mac(slot));
125        let mtu = interface.mtu.unwrap_or(1500u16);
126        if !(68..=1500).contains(&mtu) {
127            return Err(std::io::Error::other(format!(
128                "MTU {mtu} is outside supported range (68–1500)"
129            )));
130        }
131        let subnet_v4 = subnet_v4_cidr(guest_v4, prefix_v4)?;
132        let subnet_v6 = subnet_v6_cidr(guest_v6, prefix_v6)?;
133
134        // 1. Create TAP device.
135        let tap_fd = create_tap_device(&ifname)?;
136
137        // 2-4. Configure TAP interface.
138        run_ip_cmd(&[
139            "addr",
140            "add",
141            &format!("{gateway_v4}/{prefix_v4}"),
142            "dev",
143            &ifname,
144        ])?;
145        run_ip_cmd(&[
146            "addr",
147            "add",
148            &format!("{gateway_v6}/{prefix_v6}"),
149            "dev",
150            &ifname,
151        ])?;
152        run_ip_cmd(&["link", "set", &ifname, "mtu", &mtu.to_string(), "up"])?;
153
154        // 5. Enable IP forwarding.
155        enable_ip_forwarding()?;
156
157        // 6-7. nftables setup.
158        ensure_nftables_shared()?;
159        add_nftables_elements(&ifname, &subnet_v4, &subnet_v6)?;
160
161        Ok(Self {
162            tap_fd,
163            ifname,
164            gateway_v4,
165            guest_v4,
166            prefix_v4,
167            gateway_v6,
168            guest_v6,
169            prefix_v6,
170            mtu,
171            mac,
172            subnet_v4,
173            subnet_v6,
174        })
175    }
176
177    /// Builds the `MsbnetReady` payload from the resolved parameters.
178    pub fn ready_info(&self) -> MsbnetReady {
179        MsbnetReady {
180            pid: std::process::id(),
181            backend: "linux_tap".to_string(),
182            ifname: self.ifname.clone(),
183            guest_iface: "eth0".to_string(),
184            mac: format_mac(&self.mac),
185            mtu: self.mtu,
186            ipv4: Some(MsbnetReadyIpv4 {
187                address: self.guest_v4.to_string(),
188                prefix_len: self.prefix_v4,
189                gateway: self.gateway_v4.to_string(),
190                dns: vec![self.gateway_v4.to_string()],
191            }),
192            ipv6: Some(MsbnetReadyIpv6 {
193                address: self.guest_v6.to_string(),
194                prefix_len: self.prefix_v6,
195                gateway: self.gateway_v6.to_string(),
196                dns: vec![self.gateway_v6.to_string()],
197            }),
198            tls: None,
199        }
200    }
201
202    /// Returns the raw TAP file descriptor.
203    pub fn as_raw_fd(&self) -> RawFd {
204        self.tap_fd.as_raw_fd()
205    }
206
207    /// Removes nftables set elements and deletes the TAP device.
208    pub fn cleanup(&self) {
209        // Best-effort cleanup — ignore errors.
210        let _ = remove_nftables_elements(&self.ifname, &self.subnet_v4, &self.subnet_v6);
211        let _ = run_ip_cmd(&["link", "delete", &self.ifname]);
212    }
213}
214
215impl Drop for TapLink {
216    fn drop(&mut self) {
217        self.cleanup();
218    }
219}
220
221//--------------------------------------------------------------------------------------------------
222// Trait Implementations
223//--------------------------------------------------------------------------------------------------
224
225impl FrameTransport for TapLink {
226    fn ready_fd(&self) -> RawFd {
227        self.tap_fd.as_raw_fd()
228    }
229
230    fn read_frame(&self, buf: &mut [u8]) -> std::io::Result<usize> {
231        let n = unsafe { libc::read(self.tap_fd.as_raw_fd(), buf.as_mut_ptr().cast(), buf.len()) };
232        if n < 0 {
233            Err(std::io::Error::last_os_error())
234        } else {
235            Ok(n as usize)
236        }
237    }
238
239    fn write_frame(&self, buf: &[u8]) -> std::io::Result<()> {
240        let n = unsafe { libc::write(self.tap_fd.as_raw_fd(), buf.as_ptr().cast(), buf.len()) };
241        if n < 0 {
242            Err(std::io::Error::last_os_error())
243        } else {
244            Ok(())
245        }
246    }
247}
248
249//--------------------------------------------------------------------------------------------------
250// Functions: Address Computation
251//--------------------------------------------------------------------------------------------------
252
253/// Computes IPv4 addresses from a slot index.
254///
255/// Each slot gets a `/30` subnet from `100.96.0.0/11`.
256/// Gateway = first usable host, guest = second usable host.
257fn compute_ipv4_addresses(slot: u32) -> (Ipv4Addr, Ipv4Addr, u8) {
258    let subnet_base = IPV4_POOL_BASE + (slot * 4);
259    let gateway = Ipv4Addr::from(subnet_base + 1);
260    let guest = Ipv4Addr::from(subnet_base + 2);
261    (gateway, guest, 30)
262}
263
264/// Computes IPv6 addresses from a slot index.
265///
266/// Each slot gets a `/64` prefix from `fd42:6d73:62::/48`.
267/// Gateway = `::1`, guest = `::2`.
268fn compute_ipv6_addresses(slot: u32) -> (Ipv6Addr, Ipv6Addr, u8) {
269    let prefix = format_ipv6_prefix(slot);
270    let gateway: Ipv6Addr = format!("{prefix}::1").parse().unwrap();
271    let guest: Ipv6Addr = format!("{prefix}::2").parse().unwrap();
272    (gateway, guest, 64)
273}
274
275/// Formats the IPv6 prefix for a given slot.
276fn format_ipv6_prefix(slot: u32) -> String {
277    format!(
278        "{:02x}{:02x}:{:02x}{:02x}:{:02x}{:02x}:{:x}",
279        IPV6_PREFIX[0],
280        IPV6_PREFIX[1],
281        IPV6_PREFIX[2],
282        IPV6_PREFIX[3],
283        IPV6_PREFIX[4],
284        IPV6_PREFIX[5],
285        slot
286    )
287}
288
289fn subnet_v4_cidr(addr: Ipv4Addr, prefix: u8) -> std::io::Result<String> {
290    let network = Ipv4Network::new(addr, prefix).map_err(std::io::Error::other)?;
291    Ok(format!("{}/{}", network.network(), prefix))
292}
293
294fn subnet_v6_cidr(addr: Ipv6Addr, prefix: u8) -> std::io::Result<String> {
295    let network = Ipv6Network::new(addr, prefix).map_err(std::io::Error::other)?;
296    Ok(format!("{}/{}", network.network(), prefix))
297}
298
/// Derives a deterministic MAC address from a slot index.
///
/// The fixed prefix `02:5a:7b` (locally administered, unicast) is followed
/// by the three low-order bytes of `slot` in big-endian order; the slot's
/// most significant byte is dropped.
fn compute_mac(slot: u32) -> [u8; 6] {
    let [_, high, mid, low] = slot.to_be_bytes();
    [0x02, 0x5a, 0x7b, high, mid, low]
}
314
/// Formats a MAC address as six colon-separated, zero-padded lowercase hex
/// octets, e.g. `02:5a:7b:13:01:02`.
fn format_mac(mac: &[u8; 6]) -> String {
    let octets: Vec<String> = mac.iter().map(|octet| format!("{octet:02x}")).collect();
    octets.join(":")
}
322
323//--------------------------------------------------------------------------------------------------
324// Functions: TAP Device
325//--------------------------------------------------------------------------------------------------
326
327/// Creates a TAP device with the given name.
328fn create_tap_device(ifname: &str) -> std::io::Result<OwnedFd> {
329    let fd = unsafe {
330        libc::open(
331            c"/dev/net/tun".as_ptr(),
332            libc::O_RDWR | libc::O_NONBLOCK | libc::O_CLOEXEC,
333        )
334    };
335    if fd < 0 {
336        return Err(std::io::Error::last_os_error());
337    }
338
339    let mut ifr: libc::ifreq = unsafe { std::mem::zeroed() };
340    let name_bytes = ifname.as_bytes();
341    if name_bytes.len() >= libc::IFNAMSIZ {
342        unsafe { libc::close(fd) };
343        return Err(std::io::Error::other("interface name too long"));
344    }
345    unsafe {
346        std::ptr::copy_nonoverlapping(
347            name_bytes.as_ptr(),
348            ifr.ifr_name.as_mut_ptr().cast(),
349            name_bytes.len(),
350        );
351        ifr.ifr_ifru.ifru_flags = IFF_TAP | IFF_NO_PI;
352    }
353
354    if unsafe { libc::ioctl(fd, TUNSETIFF, &ifr) } < 0 {
355        let err = std::io::Error::last_os_error();
356        unsafe { libc::close(fd) };
357        return Err(err);
358    }
359
360    Ok(unsafe { OwnedFd::from_raw_fd(fd) })
361}
362
363//--------------------------------------------------------------------------------------------------
364// Functions: IP Command
365//--------------------------------------------------------------------------------------------------
366
/// Invokes the `ip` tool with `args` and waits for it to finish.
///
/// # Errors
///
/// Returns the spawn error if `ip` cannot be started, or an error carrying
/// the full command line and the captured stderr if it exits unsuccessfully.
fn run_ip_cmd(args: &[&str]) -> std::io::Result<()> {
    let finished = std::process::Command::new("ip").args(args).output()?;
    if finished.status.success() {
        return Ok(());
    }

    let command_line = args.join(" ");
    let stderr = String::from_utf8_lossy(&finished.stderr);
    Err(std::io::Error::other(format!(
        "ip {command_line} failed: {stderr}"
    )))
}
379
380//--------------------------------------------------------------------------------------------------
381// Functions: IP Forwarding
382//--------------------------------------------------------------------------------------------------
383
/// Turns on IPv4 and IPv6 forwarding by writing `1` to the procfs switches.
///
/// The IPv4 switch is written first; the first failing write aborts and its
/// error is returned.
fn enable_ip_forwarding() -> std::io::Result<()> {
    const FORWARDING_SWITCHES: [&str; 2] = [
        "/proc/sys/net/ipv4/ip_forward",
        "/proc/sys/net/ipv6/conf/all/forwarding",
    ];

    FORWARDING_SWITCHES
        .iter()
        .try_for_each(|path| std::fs::write(path, "1"))
}
390
391//--------------------------------------------------------------------------------------------------
392// Functions: nftables
393//--------------------------------------------------------------------------------------------------
394
395/// Ensures the shared `inet msb` table, chains, sets, and rules exist.
396///
397/// Idempotent — checks for existing table before adding rules to avoid
398/// duplicates when multiple msbnet processes start concurrently.
399fn ensure_nftables_shared() -> std::io::Result<()> {
400    // Check if the table already exists. If so, skip rule creation —
401    // `add table/chain/set` are idempotent, but `add rule` is not
402    // (it appends duplicates).
403    let table_exists = std::process::Command::new("nft")
404        .args(["list", "table", "inet", "msb"])
405        .stdout(std::process::Stdio::null())
406        .stderr(std::process::Stdio::null())
407        .status()
408        .map(|s| s.success())
409        .unwrap_or(false);
410
411    if table_exists {
412        return Ok(());
413    }
414
415    // Create table, sets, chains, and rules atomically via nft -f.
416    let script = "\
417        add table inet msb\n\
418        add set inet msb ifaces { type ifname; }\n\
419        add set inet msb nets_v4 { type ipv4_addr; flags interval; }\n\
420        add set inet msb nets_v6 { type ipv6_addr; flags interval; }\n\
421        add chain inet msb forward { type filter hook forward priority 0; policy drop; }\n\
422        add rule inet msb forward iif @ifaces accept\n\
423        add rule inet msb forward oif @ifaces ct state established,related accept\n\
424        add chain inet msb postrouting { type nat hook postrouting priority 100; }\n\
425        add rule inet msb postrouting ip saddr @nets_v4 masquerade\n\
426        add rule inet msb postrouting ip6 saddr @nets_v6 masquerade\n";
427
428    let output = std::process::Command::new("nft")
429        .arg("-f")
430        .arg("-")
431        .stdin(std::process::Stdio::piped())
432        .stdout(std::process::Stdio::null())
433        .stderr(std::process::Stdio::piped())
434        .spawn()
435        .and_then(|mut child| {
436            use std::io::Write;
437            child.stdin.take().unwrap().write_all(script.as_bytes())?;
438            child.wait_with_output()
439        })?;
440
441    if !output.status.success() {
442        let stderr = String::from_utf8_lossy(&output.stderr);
443        // If another process created the table between our check and our
444        // create, the atomic script may partially fail. Re-check existence.
445        let exists_now = std::process::Command::new("nft")
446            .args(["list", "table", "inet", "msb"])
447            .stdout(std::process::Stdio::null())
448            .stderr(std::process::Stdio::null())
449            .status()
450            .map(|s| s.success())
451            .unwrap_or(false);
452
453        if !exists_now {
454            return Err(std::io::Error::other(format!("nft setup failed: {stderr}")));
455        }
456    }
457
458    Ok(())
459}
460
461/// Adds set elements for a sandbox atomically via `nft -f`.
462fn add_nftables_elements(ifname: &str, subnet_v4: &str, subnet_v6: &str) -> std::io::Result<()> {
463    let script = format!(
464        "add element inet msb ifaces {{ \"{ifname}\" }}\n\
465         add element inet msb nets_v4 {{ {subnet_v4} }}\n\
466         add element inet msb nets_v6 {{ {subnet_v6} }}\n"
467    );
468    nft_script(&script)
469}
470
471/// Removes set elements for a sandbox atomically via `nft -f`.
472fn remove_nftables_elements(ifname: &str, subnet_v4: &str, subnet_v6: &str) -> std::io::Result<()> {
473    let script = format!(
474        "delete element inet msb ifaces {{ \"{ifname}\" }}\n\
475         delete element inet msb nets_v4 {{ {subnet_v4} }}\n\
476         delete element inet msb nets_v6 {{ {subnet_v6} }}\n"
477    );
478    nft_script(&script)
479}
480
/// Runs `script` through `nft -f -`, feeding it on the child's stdin.
///
/// The child is always waited for, even when writing the script fails, so
/// it is reaped and its stderr is available for the error message.
///
/// # Errors
///
/// - the spawn or wait error, if `nft` cannot be started or waited for;
/// - `nft script failed: <stderr>` if `nft` exits unsuccessfully (this takes
///   precedence over a write error, since it carries nft's own diagnostics);
/// - the write error, if the script could not be delivered although `nft`
///   exited successfully.
pub(crate) fn nft_script(script: &str) -> std::io::Result<()> {
    use std::io::Write;
    use std::process::{Command, Stdio};

    let mut child = Command::new("nft")
        .arg("-f")
        .arg("-")
        .stdin(Stdio::piped())
        .stdout(Stdio::null())
        .stderr(Stdio::piped())
        .spawn()?;

    // Deliver the script. The stdin handle is dropped at the end of the
    // match arm, which closes the pipe and signals end-of-input to nft.
    let delivered = match child.stdin.take() {
        Some(mut stdin) => stdin.write_all(script.as_bytes()),
        None => Err(std::io::Error::other("nft stdin was not captured")),
    };

    // Wait unconditionally: returning early on a write error would leave
    // the child unreaped and discard whatever it printed to stderr.
    let output = child.wait_with_output()?;
    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        return Err(std::io::Error::other(format!(
            "nft script failed: {stderr}"
        )));
    }

    delivered
}
504
505//--------------------------------------------------------------------------------------------------
506// Tests
507//--------------------------------------------------------------------------------------------------
508
/// Unit tests for the pure address, MAC and subnet helpers.
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses an IPv6 literal used as an expected value.
    fn v6(literal: &str) -> Ipv6Addr {
        literal.parse().unwrap()
    }

    #[test]
    fn test_compute_ipv4_slot_0() {
        let expected = (
            Ipv4Addr::new(100, 96, 0, 1),
            Ipv4Addr::new(100, 96, 0, 2),
            30,
        );
        assert_eq!(compute_ipv4_addresses(0), expected);
    }

    #[test]
    fn test_compute_ipv4_slot_1() {
        let (gateway, guest, _prefix) = compute_ipv4_addresses(1);
        assert_eq!(
            (gateway, guest),
            (Ipv4Addr::new(100, 96, 0, 5), Ipv4Addr::new(100, 96, 0, 6))
        );
    }

    #[test]
    fn test_compute_ipv4_slot_42() {
        let (gateway, guest, _prefix) = compute_ipv4_addresses(42);
        assert_eq!(
            (gateway, guest),
            (
                Ipv4Addr::new(100, 96, 0, 169),
                Ipv4Addr::new(100, 96, 0, 170)
            )
        );
    }

    #[test]
    fn test_compute_ipv6_slot_0() {
        let expected = (v6("fd42:6d73:0062:0::1"), v6("fd42:6d73:0062:0::2"), 64);
        assert_eq!(compute_ipv6_addresses(0), expected);
    }

    #[test]
    fn test_compute_ipv6_slot_42() {
        let (gateway, guest, _prefix) = compute_ipv6_addresses(42);
        assert_eq!(
            (gateway, guest),
            (v6("fd42:6d73:0062:2a::1"), v6("fd42:6d73:0062:2a::2"))
        );
    }

    #[test]
    fn test_compute_mac() {
        assert_eq!(compute_mac(42), [0x02, 0x5a, 0x7b, 0x00, 0x00, 0x2a]);
    }

    #[test]
    fn test_format_mac() {
        let rendered = format_mac(&[0x02, 0x5a, 0x7b, 0x13, 0x01, 0x02]);
        assert_eq!(rendered, "02:5a:7b:13:01:02");
    }

    #[test]
    fn test_subnet_v4_cidr_uses_network_base() {
        let cidr = subnet_v4_cidr(Ipv4Addr::new(100, 96, 0, 2), 30).unwrap();
        assert_eq!(cidr, "100.96.0.0/30");
    }

    #[test]
    fn test_subnet_v6_cidr_uses_network_base() {
        let cidr = subnet_v6_cidr(v6("fd42:6d73:62:2a::2"), 64).unwrap();
        assert_eq!(cidr, "fd42:6d73:62:2a::/64");
    }

    #[test]
    fn test_compute_ipv6_max_slot() {
        // 0xFFFF is the largest slot and must still yield valid addresses.
        let expected = (
            v6("fd42:6d73:0062:ffff::1"),
            v6("fd42:6d73:0062:ffff::2"),
            64,
        );
        assert_eq!(compute_ipv6_addresses(0xFFFF), expected);
    }
}