Skip to main content

sandlock_core/
network.rs

1// Network policy and control handlers — IP allowlist enforcement via seccomp notification.
2//
3// Intercepts connect/sendto/sendmsg syscalls, extracts the destination IP from
4// the child's memory, and checks it against an allowlist of resolved IPs.
5
6use std::collections::{HashMap, HashSet};
7use std::io;
8use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
9use std::os::unix::io::{AsRawFd, RawFd};
10use std::sync::Arc;
11
12use serde::{Deserialize, Serialize};
13
14use crate::error::SandboxError;
15use crate::seccomp::ctx::SupervisorCtx;
16use crate::seccomp::notif::{read_child_mem, write_child_mem, NotifAction};
17use crate::sys::structs::{SeccompNotif, AF_INET, AF_INET6, ECONNREFUSED};
18
/// Upper bound, in bytes, on the payload the supervisor will copy for an
/// on-behalf sendto/sendmsg (64 MiB).
/// Caps how much supervisor memory a single sandboxed send can demand, so
/// the child cannot push the supervisor into an out-of-memory condition.
const MAX_SEND_BUF: usize = 64 * 1024 * 1024;
22
23/// L4 protocol that a `NetAllow` rule applies to.
24///
25/// `Tcp` is the default if a rule has no scheme (the bare `host:port`
26/// form). `Udp` and `Icmp` require an explicit scheme.
27///
28/// `Icmp` is the kernel's unprivileged ping socket
29/// (`SOCK_DGRAM + IPPROTO_ICMP{,V6}`), gated by `ping_group_range` —
30/// destinations are filterable per host. Sandlock does not expose raw
31/// ICMP (`SOCK_RAW + IPPROTO_ICMP`): destination filtering at `sendto`
32/// would lie because raw sockets let the agent craft the IP header,
33/// and packet-crafting capabilities aren't part of the XOA threat
34/// model. Workloads that genuinely need raw ICMP should run outside
35/// sandlock or rely on the host's `ping_group_range` for the dgram
36/// path instead.
37#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
38#[serde(rename_all = "lowercase")]
39pub enum Protocol {
40    Tcp,
41    Udp,
42    Icmp,
43}
44
45impl Protocol {
46    fn parse(s: &str) -> Option<Self> {
47        match s {
48            "tcp" => Some(Protocol::Tcp),
49            "udp" => Some(Protocol::Udp),
50            "icmp" => Some(Protocol::Icmp),
51            _ => None,
52        }
53    }
54}
55
56/// A network endpoint allow rule.
57///
58/// Each rule permits one protocol's traffic to one host (or any IP, for
59/// the `:port` form) on a specific set of ports. Multiple rules are
60/// OR'd: traffic is permitted if any rule matches the protocol, the
61/// destination IP, and the destination port.
62///
63/// ICMP rules carry no port (ICMP has none); their `ports` is empty
64/// and `all_ports` is false.
65#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
66pub struct NetAllow {
67    /// L4 protocol this rule applies to.
68    #[serde(default = "default_protocol_tcp")]
69    pub protocol: Protocol,
70    /// Hostname; `None` means "any IP" (the `:port` form, or `icmp://*`).
71    pub host: Option<String>,
72    /// Permitted ports. Must be non-empty unless `all_ports` is true,
73    /// in which case it must be empty. Always empty for `Protocol::Icmp`.
74    pub ports: Vec<u16>,
75    /// "Any port" wildcard from the `*` token in port position. When
76    /// true, `ports` is empty; the rule permits every TCP/UDP port to
77    /// the host (or to any IP, when `host` is `None`).
78    #[serde(default)]
79    pub all_ports: bool,
80}
81
82fn default_protocol_tcp() -> Protocol {
83    Protocol::Tcp
84}
85
86impl NetAllow {
87    /// Parse a rule spec. Forms:
88    ///
89    /// - `host:port[,port,...]`, `:port`, `*:port`, `host:*`, `:*`, `*:*`
90    ///   — TCP (the default scheme).
91    /// - `tcp://...` — explicit TCP, same suffix grammar as the bare form.
92    /// - `udp://...` — UDP, same suffix grammar as the bare form.
93    /// - `icmp://host` or `icmp://*` — ICMP echo (kernel ping socket).
94    ///   No port field; `icmp://host:80` is rejected.
95    ///
96    /// `*` in port position means "any port" (the all-ports wildcard).
97    /// Mixing `*` with concrete ports (e.g. `host:80,*`) is rejected.
98    pub fn parse(s: &str) -> Result<Self, SandboxError> {
99        // Split off the optional scheme prefix `<proto>://`. If absent,
100        // default to TCP and the rest of the parser is unchanged.
101        let (protocol, rest) = match s.split_once("://") {
102            Some((scheme, body)) => {
103                let proto = Protocol::parse(scheme).ok_or_else(|| {
104                    SandboxError::Invalid(format!(
105                        "--net-allow: unknown scheme `{}://` in `{}` (expected tcp, udp, icmp)",
106                        scheme, s
107                    ))
108                })?;
109                (proto, body)
110            }
111            None => (Protocol::Tcp, s),
112        };
113
114        if protocol == Protocol::Icmp {
115            return Self::parse_icmp(rest, s);
116        }
117
118        let (host_part, port_part) = rest.rsplit_once(':').ok_or_else(|| {
119            SandboxError::Invalid(format!(
120                "--net-allow: expected `host:port` or `:port`, got `{}`",
121                s
122            ))
123        })?;
124        let host = match host_part {
125            "" | "*" => None,
126            h => Some(h.to_string()),
127        };
128
129        // Detect the wildcard token. We split on ',' first so a
130        // single `*` is a clean match — `*,80` is rejected explicitly
131        // below rather than letting `*` parse as port 0.
132        let mut ports = Vec::new();
133        let mut saw_wildcard = false;
134        for p in port_part.split(',') {
135            let p = p.trim();
136            if p == "*" {
137                saw_wildcard = true;
138                continue;
139            }
140            let n: u16 = p.parse().map_err(|_| {
141                SandboxError::Invalid(format!("--net-allow: invalid port `{}` in `{}`", p, s))
142            })?;
143            if n == 0 {
144                return Err(SandboxError::Invalid(format!(
145                    "--net-allow: port 0 is not valid in `{}`",
146                    s
147                )));
148            }
149            ports.push(n);
150        }
151        if saw_wildcard && !ports.is_empty() {
152            return Err(SandboxError::Invalid(format!(
153                "--net-allow: cannot mix `*` with concrete ports in `{}`",
154                s
155            )));
156        }
157        if !saw_wildcard && ports.is_empty() {
158            return Err(SandboxError::Invalid(format!(
159                "--net-allow: at least one port required in `{}`",
160                s
161            )));
162        }
163        Ok(NetAllow {
164            protocol,
165            host,
166            ports,
167            all_ports: saw_wildcard,
168        })
169    }
170
171    /// Parse the body of an `icmp://` rule. Accepts a host or `*` —
172    /// ICMP has no ports, so any `:` separator is rejected.
173    fn parse_icmp(body: &str, full: &str) -> Result<Self, SandboxError> {
174        if body.contains(':') {
175            return Err(SandboxError::Invalid(format!(
176                "--net-allow: icmp rules take no port, got `{}`",
177                full
178            )));
179        }
180        if body.is_empty() {
181            return Err(SandboxError::Invalid(format!(
182                "--net-allow: icmp rule needs a host or `*`, got `{}`",
183                full
184            )));
185        }
186        let host = match body {
187            "*" => None,
188            h => Some(h.to_string()),
189        };
190        Ok(NetAllow {
191            protocol: Protocol::Icmp,
192            host,
193            ports: Vec::new(),
194            all_ports: false,
195        })
196    }
197}
198
199// ============================================================
200// parse_ip_from_sockaddr — parse IP from a sockaddr byte buffer
201// ============================================================
202
203/// Parse IP address from a sockaddr byte buffer.
204/// Returns None for non-IP families (AF_UNIX etc.) — always allowed.
205fn parse_ip_from_sockaddr(bytes: &[u8]) -> Option<IpAddr> {
206    if bytes.len() < 2 {
207        return None;
208    }
209    let family = u16::from_ne_bytes([bytes[0], bytes[1]]) as u32;
210    match family {
211        f if f == AF_INET => {
212            if bytes.len() < 8 {
213                return None;
214            }
215            Some(IpAddr::V4(Ipv4Addr::new(
216                bytes[4], bytes[5], bytes[6], bytes[7],
217            )))
218        }
219        f if f == AF_INET6 => {
220            if bytes.len() < 24 {
221                return None;
222            }
223            let mut addr_bytes = [0u8; 16];
224            addr_bytes.copy_from_slice(&bytes[8..24]);
225            Some(IpAddr::V6(Ipv6Addr::from(addr_bytes)))
226        }
227        _ => None,
228    }
229}
230
231// ============================================================
232// parse_port_from_sockaddr — parse TCP port from sockaddr bytes
233// ============================================================
234
235/// Parse TCP port from a sockaddr byte buffer.
236/// Returns None for non-IP families (AF_UNIX etc.).
237fn parse_port_from_sockaddr(bytes: &[u8]) -> Option<u16> {
238    if bytes.len() < 4 {
239        return None;
240    }
241    let family = u16::from_ne_bytes([bytes[0], bytes[1]]) as u32;
242    match family {
243        f if f == AF_INET || f == AF_INET6 => {
244            Some(u16::from_be_bytes([bytes[2], bytes[3]]))
245        }
246        _ => None,
247    }
248}
249
/// Overwrite the port field (bytes 2..4, big-endian) of a raw sockaddr
/// buffer in place. Buffers shorter than four bytes are left untouched.
fn set_port_in_sockaddr(bytes: &mut [u8], port: u16) {
    if let Some(port_field) = bytes.get_mut(2..4) {
        port_field.copy_from_slice(&port.to_be_bytes());
    }
}
257
258// ============================================================
259// query_socket_protocol — derive the rule Protocol from a fd via getsockopt
260// ============================================================
261
262/// Query `SO_PROTOCOL` on a dup'd socket fd to learn whether to route
263/// the on-behalf check through the TCP, UDP, or ICMP policy.
264///
265/// Returns `None` for protocols sandlock does not gate via `net_allow`
266/// (raw, SCTP, etc.) — the handler treats those as "no rule applies"
267/// which collapses to the default-deny path.
268fn query_socket_protocol(fd: RawFd) -> Option<Protocol> {
269    let mut proto: libc::c_int = 0;
270    let mut len: libc::socklen_t = std::mem::size_of::<libc::c_int>() as libc::socklen_t;
271    let rc = unsafe {
272        libc::getsockopt(
273            fd,
274            libc::SOL_SOCKET,
275            libc::SO_PROTOCOL,
276            &mut proto as *mut _ as *mut libc::c_void,
277            &mut len,
278        )
279    };
280    if rc != 0 {
281        return None;
282    }
283    match proto {
284        libc::IPPROTO_TCP => Some(Protocol::Tcp),
285        libc::IPPROTO_UDP => Some(Protocol::Udp),
286        // IPPROTO_ICMP and IPPROTO_ICMPV6 both route to the ICMP policy
287        // (the policy doesn't distinguish IP versions; the rule's
288        // resolved IP set already covers both via DNS).
289        libc::IPPROTO_ICMP | libc::IPPROTO_ICMPV6 => Some(Protocol::Icmp),
290        _ => None,
291    }
292}
293
294// ============================================================
295// connect_on_behalf — perform connect() on behalf of the child (TOCTOU-safe)
296// ============================================================
297
/// Build the raw sockaddr bytes that point a connect at the HTTP ACL proxy.
///
/// The buffer matches the address family of the child's socket: a
/// `sockaddr_in6` when `is_ipv6` is true (an IPv4 proxy address is then
/// expressed as an IPv4-mapped IPv6 address such as `::ffff:127.0.0.1`),
/// otherwise a `sockaddr_in`.
///
/// Returns `Err(EAFNOSUPPORT)` when the socket is IPv4 but the proxy
/// address is IPv6, which a `sockaddr_in` cannot represent.
fn http_acl_redirect_sockaddr(
    proxy_addr: std::net::SocketAddr,
    is_ipv6: bool,
) -> Result<Vec<u8>, i32> {
    if is_ipv6 {
        // SAFETY: `sockaddr_in6` is a plain C struct for which the
        // all-zero bit pattern is a valid value.
        let mut sa6: libc::sockaddr_in6 = unsafe { std::mem::zeroed() };
        sa6.sin6_family = libc::AF_INET6 as u16;
        sa6.sin6_port = proxy_addr.port().to_be();
        let target = match proxy_addr {
            std::net::SocketAddr::V4(v4) => v4.ip().to_ipv6_mapped(),
            std::net::SocketAddr::V6(v6) => *v6.ip(),
        };
        sa6.sin6_addr.s6_addr = target.octets();
        // SAFETY: `sa6` is a live, fully initialised local; exactly
        // `size_of::<sockaddr_in6>()` bytes of it are viewed and copied
        // out before it goes out of scope.
        let bytes = unsafe {
            std::slice::from_raw_parts(
                &sa6 as *const _ as *const u8,
                std::mem::size_of::<libc::sockaddr_in6>(),
            )
        }
        .to_vec();
        Ok(bytes)
    } else {
        // SAFETY: `sockaddr_in` is a plain C struct for which the
        // all-zero bit pattern is a valid value.
        let mut sa: libc::sockaddr_in = unsafe { std::mem::zeroed() };
        sa.sin_family = libc::AF_INET as u16;
        sa.sin_port = proxy_addr.port().to_be();
        match proxy_addr {
            std::net::SocketAddr::V4(v4) => {
                // The octets are already in network order; keep their
                // in-memory layout unchanged.
                sa.sin_addr.s_addr = u32::from_ne_bytes(v4.ip().octets());
            }
            std::net::SocketAddr::V6(_) => {
                // Proxy always binds to 127.0.0.1
                return Err(libc::EAFNOSUPPORT);
            }
        }
        // SAFETY: `sa` is a live, fully initialised local; exactly
        // `size_of::<sockaddr_in>()` bytes of it are viewed and copied
        // out before it goes out of scope.
        let bytes = unsafe {
            std::slice::from_raw_parts(
                &sa as *const _ as *const u8,
                std::mem::size_of::<libc::sockaddr_in>(),
            )
        }
        .to_vec();
        Ok(bytes)
    }
}

/// Bind `fd` to the wildcard address with port 0 (the kernel picks an
/// ephemeral port) and return the local address reported by
/// `getsockname`, or `None` if `getsockname` fails.
///
/// The bind itself is best-effort: its result is not checked, and the
/// address returned is whatever the socket ends up bound to.
fn bind_wildcard_and_local_addr(fd: RawFd, is_ipv6: bool) -> Option<std::net::SocketAddr> {
    if is_ipv6 {
        // SAFETY: all-zero is a valid `sockaddr_in6` (IN6ADDR_ANY, port 0).
        let mut bind_sa6: libc::sockaddr_in6 = unsafe { std::mem::zeroed() };
        bind_sa6.sin6_family = libc::AF_INET6 as u16;
        // port 0 + IN6ADDR_ANY = kernel picks ephemeral port
        // SAFETY: the pointer and length describe the live local `bind_sa6`.
        let _ = unsafe {
            libc::bind(
                fd,
                &bind_sa6 as *const _ as *const libc::sockaddr,
                std::mem::size_of::<libc::sockaddr_in6>() as libc::socklen_t,
            )
        };
        // SAFETY: all-zero is a valid `sockaddr_in6`.
        let mut local_sa6: libc::sockaddr_in6 = unsafe { std::mem::zeroed() };
        let mut local_len: libc::socklen_t =
            std::mem::size_of::<libc::sockaddr_in6>() as libc::socklen_t;
        // SAFETY: `local_sa6` and `local_len` are live locals and
        // `local_len` holds the size of the output buffer.
        let gs_ret = unsafe {
            libc::getsockname(
                fd,
                &mut local_sa6 as *mut _ as *mut libc::sockaddr,
                &mut local_len,
            )
        };
        if gs_ret != 0 {
            return None;
        }
        let local_port = u16::from_be(local_sa6.sin6_port);
        let local_ip = Ipv6Addr::from(local_sa6.sin6_addr.s6_addr);
        Some(std::net::SocketAddr::V6(std::net::SocketAddrV6::new(
            local_ip, local_port, 0, 0,
        )))
    } else {
        // SAFETY: all-zero is a valid `sockaddr_in` (INADDR_ANY, port 0).
        let mut bind_sa: libc::sockaddr_in = unsafe { std::mem::zeroed() };
        bind_sa.sin_family = libc::AF_INET as u16;
        // port 0 + INADDR_ANY = kernel picks ephemeral port
        // SAFETY: the pointer and length describe the live local `bind_sa`.
        let _ = unsafe {
            libc::bind(
                fd,
                &bind_sa as *const _ as *const libc::sockaddr,
                std::mem::size_of::<libc::sockaddr_in>() as libc::socklen_t,
            )
        };
        // SAFETY: all-zero is a valid `sockaddr_in`.
        let mut local_sa: libc::sockaddr_in = unsafe { std::mem::zeroed() };
        let mut local_len: libc::socklen_t =
            std::mem::size_of::<libc::sockaddr_in>() as libc::socklen_t;
        // SAFETY: `local_sa` and `local_len` are live locals and
        // `local_len` holds the size of the output buffer.
        let gs_ret = unsafe {
            libc::getsockname(
                fd,
                &mut local_sa as *mut _ as *mut libc::sockaddr,
                &mut local_len,
            )
        };
        if gs_ret != 0 {
            return None;
        }
        let local_port = u16::from_be(local_sa.sin_port);
        let local_ip = Ipv4Addr::from(u32::from_be(local_sa.sin_addr.s_addr));
        Some(std::net::SocketAddr::V4(std::net::SocketAddrV4::new(
            local_ip, local_port,
        )))
    }
}

/// Perform connect() on behalf of the child process (TOCTOU-safe).
///
/// 1. Reject an over-long `addr_len`, then copy the sockaddr from child
///    memory (our copy — immune to TOCTOU).
/// 2. Duplicate the child's socket fd, learn its protocol via
///    `SO_PROTOCOL`, and check (IP, port) against the effective policy.
/// 3. Pick the real connect target: the HTTP ACL proxy for intercepted
///    ports, or the remapped real port for loopback destinations.
/// 4. For proxy redirects, record the original destination IP keyed by
///    the socket's local address *before* connecting.
/// 5. connect() in the supervisor with our validated sockaddr.
/// 6. Return the result to the child.
///
/// Returns `NotifAction::Continue` for non-IP address families (AF_UNIX
/// etc.), `ECONNREFUSED` when the policy denies the destination, the
/// socket protocol is not one sandlock gates, or no port could be parsed,
/// `EINVAL` when `addr_len` exceeds `sizeof(struct sockaddr_storage)`,
/// `EIO` when the sockaddr cannot be read, and otherwise the outcome of
/// the supervisor-side `connect()`.
async fn connect_on_behalf(
    notif: &SeccompNotif,
    ctx: &Arc<SupervisorCtx>,
    notif_fd: RawFd,
) -> NotifAction {
    let args = &notif.data.args;
    let sockfd = args[0] as i32;
    let addr_ptr = args[1];
    let addr_len = args[2] as u32;

    // The length is child-controlled. The kernel itself refuses socket
    // addresses larger than sockaddr_storage with EINVAL; doing the same
    // here keeps the child from making the supervisor copy gigabytes.
    if addr_len as usize > std::mem::size_of::<libc::sockaddr_storage>() {
        return NotifAction::Errno(libc::EINVAL);
    }

    // 1. Copy sockaddr from child memory
    let addr_bytes =
        match read_child_mem(notif_fd, notif.id, notif.pid, addr_ptr, addr_len as usize) {
            Ok(b) => b,
            Err(_) => return NotifAction::Errno(libc::EIO),
        };

    let ip = match parse_ip_from_sockaddr(&addr_bytes) {
        Some(ip) => ip,
        // Non-IP family (AF_UNIX etc.) — allow through
        None => return NotifAction::Continue,
    };

    // 2. Check destination against the per-protocol endpoint allowlist.
    // The dup we'd need anyway for the on-behalf connect doubles as
    // our SO_PROTOCOL probe — one pidfd_getfd, one getsockopt. The
    // per-protocol policy is keyed on whether the socket is TCP / UDP
    // / kernel ping (ICMP). Unknown protocol (raw, SCTP, etc.) fails
    // closed: the BPF should have prevented socket creation, so
    // reaching here with one is an unexpected case worth refusing.
    let dest_port = parse_port_from_sockaddr(&addr_bytes);
    let dup_fd = match crate::seccomp::notif::dup_fd_from_pid(notif.pid, sockfd) {
        Ok(fd) => fd,
        Err(e) => return NotifAction::Errno(e.raw_os_error().unwrap_or(libc::EBADF)),
    };
    let protocol = match query_socket_protocol(dup_fd.as_raw_fd()) {
        Some(p) => p,
        None => return NotifAction::Errno(ECONNREFUSED),
    };
    let ns = ctx.network.lock().await;
    let live_policy = {
        let pfs = ctx.policy_fn.lock().await;
        pfs.live_policy.clone()
    };
    let effective = ns.effective_network_policy(notif.pid, protocol, live_policy.as_ref());
    match (effective, dest_port) {
        (crate::seccomp::notif::NetworkPolicy::Unrestricted, _) => {
            // No rules for this protocol's wildcard — Landlock (TCP
            // only) or the protocol's wildcard rule covers it; no
            // additional check here.
        }
        (policy, Some(p)) => {
            // For ICMP rules every per-IP entry is `PortAllow::Any`,
            // so the port arg from the sockaddr (typically 0 or the
            // ICMP id) is functionally ignored — IP is what matters.
            if !policy.allows(ip, p) {
                return NotifAction::Errno(ECONNREFUSED);
            }
        }
        (_, None) => {
            // Couldn't parse port from sockaddr — fail closed.
            return NotifAction::Errno(ECONNREFUSED);
        }
    }

    // Snapshot everything needed from the network state, then release
    // the lock before making any further syscalls.
    let http_acl_addr = ns.http_acl_addr;
    let http_acl_intercept = dest_port.map_or(false, |p| ns.http_acl_ports.contains(&p));
    let http_acl_orig_dest = ns.http_acl_orig_dest.clone();
    let remapped_loopback_port = if ctx.policy.port_remap && ip.is_loopback() {
        dest_port.and_then(|p| ns.port_map.get_real(p))
    } else {
        None
    };

    drop(ns);

    // 3. Determine the actual connect target (redirect HTTP/HTTPS to proxy)
    let is_ipv6 = ip.is_ipv6();
    let redirect_to = if http_acl_intercept { http_acl_addr } else { None };
    let redirected = redirect_to.is_some();
    let mut connect_addr = match redirect_to {
        Some(proxy_addr) => match http_acl_redirect_sockaddr(proxy_addr, is_ipv6) {
            Ok(bytes) => bytes,
            Err(errno) => return NotifAction::Errno(errno),
        },
        None => addr_bytes,
    };
    if !redirected {
        if let Some(real_port) = remapped_loopback_port {
            // The child sees virtual ports via getsockname(); connect
            // still has to target the real bound loopback port.
            set_port_in_sockaddr(&mut connect_addr, real_port);
        }
    }

    // (The supervisor-side dup is the same fd we already created
    // for the SO_PROTOCOL probe above — reuse it rather than
    // pidfd_getfd-ing a second time.)

    // 4. Record original dest IP *before* connect to prevent TOCTOU race:
    //    the proxy may receive the request before we write the mapping if
    //    we do it after connect(). `ip` comes from our own copy of the
    //    sockaddr, so the child cannot change it underneath us.
    if redirected {
        if let Some(ref orig_dest_map) = http_acl_orig_dest {
            // Bind the socket so getsockname() returns the local addr
            // the proxy will see as client_addr.
            if let Some(local_addr) = bind_wildcard_and_local_addr(dup_fd.as_raw_fd(), is_ipv6) {
                if let Ok(mut map) = orig_dest_map.write() {
                    map.insert(local_addr, ip);
                }
            }
        }
    }

    // 5. Perform connect in supervisor with our validated sockaddr.
    // SAFETY: the pointer and the length both come from `connect_addr`,
    // so the kernel never reads past the supervisor's buffer even if the
    // copy from the child came back shorter than `addr_len`.
    let ret = unsafe {
        libc::connect(
            dup_fd.as_raw_fd(),
            connect_addr.as_ptr() as *const libc::sockaddr,
            connect_addr.len() as libc::socklen_t,
        )
    };

    // 6. Return result.
    // On failure, the stale orig_dest entry is harmless: the proxy never
    // sees this connection, and the entry will be cleaned up on the next
    // successful request from the same local address (or on shutdown).
    if ret == 0 {
        NotifAction::ReturnValue(0)
    } else {
        // SAFETY: `__errno_location` returns a valid pointer to this
        // thread's errno, read immediately after the failing call.
        let errno = unsafe { *libc::__errno_location() };
        NotifAction::Errno(errno)
    }
    // dup_fd dropped here, closing supervisor's copy
}
547
548// ============================================================
549// sendto_on_behalf / sendmsg_on_behalf — on-behalf (TOCTOU-safe)
550// ============================================================
551
/// Perform sendto() on behalf of the child process (TOCTOU-safe).
///
/// 1. Copy sockaddr from child memory (our copy — immune to TOCTOU)
/// 2. Duplicate the child's socket fd, learn its protocol, and check
///    (IP, port) against the effective policy on our copy
/// 3. Copy data buffer from child memory (bounded by `MAX_SEND_BUF`)
/// 4. sendto() in supervisor with validated sockaddr + copied data
/// 5. Return byte count or errno
///
/// Only triggers for unconnected sends (addr_ptr != NULL), which is
/// primarily UDP. Connected sockets (addr_ptr == NULL) and non-IP
/// address families use CONTINUE; the supervisor copies nothing for
/// those, so the `MAX_SEND_BUF` limit does not apply to them.
///
/// Error mapping: `EINVAL` when `addr_len` exceeds
/// `sizeof(struct sockaddr_storage)`, `EIO` when child memory cannot be
/// read, `ECONNREFUSED` when the destination is denied, the socket
/// protocol is not one sandlock gates, or no port could be parsed, and
/// `EMSGSIZE` when an on-behalf payload exceeds `MAX_SEND_BUF`.
async fn sendto_on_behalf(
    notif: &SeccompNotif,
    ctx: &Arc<SupervisorCtx>,
    notif_fd: RawFd,
) -> NotifAction {
    let args = &notif.data.args;
    let sockfd = args[0] as i32;
    let buf_ptr = args[1];
    let buf_len = args[2] as usize;
    let flags = args[3] as i32;
    let addr_ptr = args[4];
    let addr_len = args[5] as u32;

    if addr_ptr == 0 {
        return NotifAction::Continue; // connected socket, no addr to check
    }

    // The length is child-controlled. The kernel refuses socket addresses
    // larger than sockaddr_storage with EINVAL; doing the same here keeps
    // the child from making the supervisor copy gigabytes.
    if addr_len as usize > std::mem::size_of::<libc::sockaddr_storage>() {
        return NotifAction::Errno(libc::EINVAL);
    }

    // 1. Copy sockaddr from child memory (small: 16-28 bytes)
    let addr_bytes =
        match read_child_mem(notif_fd, notif.id, notif.pid, addr_ptr, addr_len as usize) {
            Ok(b) => b,
            Err(_) => return NotifAction::Errno(libc::EIO),
        };

    // 2. Check (ip, port) against the per-protocol endpoint allowlist.
    // One pidfd_getfd serves both the SO_PROTOCOL probe and the
    // on-behalf sendto.
    if let Some(ip) = parse_ip_from_sockaddr(&addr_bytes) {
        let dest_port = parse_port_from_sockaddr(&addr_bytes);
        let dup_fd = match crate::seccomp::notif::dup_fd_from_pid(notif.pid, sockfd) {
            Ok(fd) => fd,
            Err(e) => return NotifAction::Errno(e.raw_os_error().unwrap_or(libc::EBADF)),
        };
        let protocol = match query_socket_protocol(dup_fd.as_raw_fd()) {
            Some(p) => p,
            None => return NotifAction::Errno(ECONNREFUSED),
        };
        let ns = ctx.network.lock().await;
        let live_policy = {
            let pfs = ctx.policy_fn.lock().await;
            pfs.live_policy.clone()
        };
        let effective = ns.effective_network_policy(notif.pid, protocol, live_policy.as_ref());
        if !matches!(effective, crate::seccomp::notif::NetworkPolicy::Unrestricted) {
            match dest_port {
                Some(p) if !effective.allows(ip, p) => {
                    return NotifAction::Errno(ECONNREFUSED);
                }
                // Couldn't parse port from sockaddr — fail closed.
                None => return NotifAction::Errno(ECONNREFUSED),
                Some(_) => {}
            }
        }
        drop(ns);

        // 3. Copy data buffer from child memory. The size cap is applied
        // here, on the only path where the supervisor allocates a copy,
        // so large sends that are passed through with CONTINUE are not
        // rejected needlessly.
        if buf_len > MAX_SEND_BUF {
            return NotifAction::Errno(libc::EMSGSIZE);
        }
        let data = match read_child_mem(notif_fd, notif.id, notif.pid, buf_ptr, buf_len) {
            Ok(b) => b,
            Err(_) => return NotifAction::Errno(libc::EIO),
        };

        // (dup_fd from step 2 is reused for the supervisor sendto.)

        // 4. Perform sendto in supervisor with validated sockaddr + copied data.
        // SAFETY: every pointer/length pair comes from a supervisor-owned
        // Vec (`data`, `addr_bytes`), so the kernel never reads past our
        // buffers even if a copy from the child came back short.
        let ret = unsafe {
            libc::sendto(
                dup_fd.as_raw_fd(),
                data.as_ptr() as *const libc::c_void,
                data.len(),
                flags,
                addr_bytes.as_ptr() as *const libc::sockaddr,
                addr_bytes.len() as libc::socklen_t,
            )
        };

        // 5. Return result
        if ret >= 0 {
            NotifAction::ReturnValue(ret as i64)
        } else {
            // SAFETY: `__errno_location` returns a valid pointer to this
            // thread's errno, read immediately after the failing call.
            let errno = unsafe { *libc::__errno_location() };
            NotifAction::Errno(errno)
        }
    } else {
        // Non-IP family (AF_UNIX etc.) — allow through
        NotifAction::Continue
    }
}
652
/// Handle a sendmsg() notification, sending on behalf of the child when
/// the message names an IP destination (TOCTOU-safe).
///
/// The work is split in three:
///
/// 1. `prescan_msghdr` decides whether the call can simply continue
///    (connected socket or non-IP family), must fail, or needs the
///    on-behalf path.
/// 2. The child's socket fd is duplicated and its protocol is probed;
///    a failed dup reports the OS error (EBADF if unknown) and a
///    protocol sandlock does not gate is refused with ECONNREFUSED.
/// 3. `send_msghdr_on_behalf` validates and sends the message; its byte
///    count or errno becomes the syscall result.
async fn sendmsg_on_behalf(
    notif: &SeccompNotif,
    ctx: &Arc<SupervisorCtx>,
    notif_fd: RawFd,
) -> NotifAction {
    let sockfd = notif.data.args[0] as i32;
    let msghdr_ptr = notif.data.args[1];
    let flags = notif.data.args[2] as i32;

    // Decide up front whether the kernel may handle the call itself. An
    // unreadable msghdr is reported as an error rather than continued:
    // continuing would let the kernel re-read child memory and bypass
    // our check.
    match prescan_msghdr(notif, notif_fd, msghdr_ptr) {
        PrescanResult::OnBehalf => {}
        PrescanResult::ContinueWholeCall => return NotifAction::Continue,
        PrescanResult::Errno(errno) => return NotifAction::Errno(errno),
    }

    let dup_fd = match crate::seccomp::notif::dup_fd_from_pid(notif.pid, sockfd) {
        Err(err) => return NotifAction::Errno(err.raw_os_error().unwrap_or(libc::EBADF)),
        Ok(owned) => owned,
    };
    let protocol = if let Some(found) = query_socket_protocol(dup_fd.as_raw_fd()) {
        found
    } else {
        return NotifAction::Errno(ECONNREFUSED);
    };

    let outcome =
        send_msghdr_on_behalf(notif, ctx, notif_fd, &dup_fd, protocol, msghdr_ptr, flags).await;
    outcome.map_or_else(NotifAction::Errno, |sent| NotifAction::ReturnValue(sent as i64))
}
697
698// ============================================================
699// prescan_msghdr / send_msghdr_on_behalf — shared per-message work
700// ============================================================
701
/// Verdict of `prescan_msghdr` for one `struct msghdr`.
#[derive(Copy, Clone)]
enum PrescanResult {
    /// The message names an IP-family destination and every field could
    /// be read — the caller may proceed with `send_msghdr_on_behalf`.
    OnBehalf,
    /// Either `msg_name == NULL` (connected socket) or the destination is
    /// not an IP family (AF_UNIX etc.). The caller should answer with
    /// `NotifAction::Continue` so the kernel runs the syscall in the
    /// child's namespace; AF_UNIX path resolution is the canonical reason
    /// such messages are not sent on behalf.
    ContinueWholeCall,
    /// Child memory could not be read. Carries the errno to report
    /// (EFAULT for an unreadable msghdr, EIO for the sockaddr).
    Errno(i32),
}
717
718/// Probe one `struct msghdr` to decide whether the on-behalf path
719/// applies. Used by both `sendmsg_on_behalf` (one msghdr) and
720/// `sendmmsg_on_behalf` (one per `mmsghdr` entry, before doing any
721/// sends — Continue is a whole-syscall decision).
722fn prescan_msghdr(
723    notif: &SeccompNotif,
724    notif_fd: RawFd,
725    msghdr_ptr: u64,
726) -> PrescanResult {
727    let msghdr_bytes = match read_child_mem(notif_fd, notif.id, notif.pid, msghdr_ptr, 56) {
728        Ok(b) if b.len() >= 56 => b,
729        _ => return PrescanResult::Errno(libc::EFAULT),
730    };
731    let msg_name_ptr = u64::from_ne_bytes(msghdr_bytes[0..8].try_into().unwrap());
732    if msg_name_ptr == 0 {
733        return PrescanResult::ContinueWholeCall;
734    }
735    let msg_namelen = u32::from_ne_bytes(msghdr_bytes[8..12].try_into().unwrap());
736    let addr_bytes = match read_child_mem(notif_fd, notif.id, notif.pid, msg_name_ptr, msg_namelen as usize) {
737        Ok(b) => b,
738        Err(_) => return PrescanResult::Errno(libc::EIO),
739    };
740    if parse_ip_from_sockaddr(&addr_bytes).is_none() {
741        return PrescanResult::ContinueWholeCall;
742    }
743    PrescanResult::OnBehalf
744}
745
/// Validate, materialize, and send one `struct msghdr` on behalf of
/// the child. Caller is responsible for:
///   - dup'ing the child fd (`dup_fd`),
///   - resolving the socket protocol (`protocol`) via
///     `query_socket_protocol` on that dup,
///   - having confirmed via `prescan_msghdr` that `msghdr_ptr` points
///     at an IP-family destination (non-NULL `msg_name`).
///
/// Returns the byte count returned by `sendmsg`, or an errno suitable
/// for `NotifAction::Errno`:
///   - `EFAULT` — the msghdr itself could not be read,
///   - `EAFNOSUPPORT` — the sockaddr no longer parses as an IP address
///     (it changed between the prescan and this call),
///   - `ECONNREFUSED` — destination blocked by policy, or no port could
///     be parsed from the sockaddr while a restrictive policy applies,
///   - `EMSGSIZE` — more than 1024 iovecs, a single iovec larger than
///     `MAX_SEND_BUF`, or a combined payload larger than `MAX_SEND_BUF`,
///   - `EIO` — a sub-buffer (sockaddr / iovec / control) could not be
///     read from the child,
///   - otherwise whatever errno `sendmsg` itself reported.
async fn send_msghdr_on_behalf(
    notif: &SeccompNotif,
    ctx: &Arc<SupervisorCtx>,
    notif_fd: RawFd,
    dup_fd: &std::os::unix::io::OwnedFd,
    protocol: Protocol,
    msghdr_ptr: u64,
    flags: i32,
) -> Result<isize, i32> {
    // Layout of `struct msghdr` / `struct iovec` on 64-bit Linux.
    const MSGHDR_SIZE: usize = 56;
    const IOVEC_SIZE: usize = 16;
    // Same limit the original code clamped to (Linux UIO_MAXIOV).
    const MAX_IOV: usize = 1024;
    // Upper bound on ancillary data copied out of the child.
    const MAX_CONTROL: usize = 4096;

    /// Decode a native-endian u64 field at `off`; callers only pass
    /// offsets that lie inside the length-checked header.
    fn field_u64(raw: &[u8], off: usize) -> u64 {
        u64::from_ne_bytes(raw[off..off + 8].try_into().unwrap())
    }

    let msghdr_bytes = match read_child_mem(notif_fd, notif.id, notif.pid, msghdr_ptr, MSGHDR_SIZE) {
        Ok(b) if b.len() >= MSGHDR_SIZE => b,
        _ => return Err(libc::EFAULT),
    };
    let msg_name_ptr = field_u64(&msghdr_bytes, 0);
    let msg_namelen = u32::from_ne_bytes(msghdr_bytes[8..12].try_into().unwrap());
    let msg_iov_ptr = field_u64(&msghdr_bytes, 16);
    let msg_iovlen = field_u64(&msghdr_bytes, 24);
    let msg_control_ptr = field_u64(&msghdr_bytes, 32);
    let msg_controllen = field_u64(&msghdr_bytes, 40);

    // `msg_namelen` is child-controlled; never read more than the
    // largest possible socket address.
    let name_len = (msg_namelen as usize).min(std::mem::size_of::<libc::sockaddr_storage>());
    let addr_bytes = read_child_mem(notif_fd, notif.id, notif.pid, msg_name_ptr, name_len)
        .map_err(|_| libc::EIO)?;
    let ip = match parse_ip_from_sockaddr(&addr_bytes) {
        Some(ip) => ip,
        // Caller pre-checks via prescan_msghdr; reaching this branch
        // means the sockaddr changed under us between the prescan and
        // here. Fail closed.
        None => return Err(libc::EAFNOSUPPORT),
    };
    let dest_port = parse_port_from_sockaddr(&addr_bytes);

    // Policy check. The guards live in their own scope so both locks
    // are released before any payload is copied out of the child.
    {
        let ns = ctx.network.lock().await;
        let live_policy = {
            let pfs = ctx.policy_fn.lock().await;
            pfs.live_policy.clone()
        };
        let effective = ns.effective_network_policy(notif.pid, protocol, live_policy.as_ref());
        if !matches!(effective, crate::seccomp::notif::NetworkPolicy::Unrestricted) {
            match dest_port {
                Some(p) if effective.allows(ip, p) => {}
                // Denied destination, or no port to check against.
                _ => return Err(ECONNREFUSED),
            }
        }
    }

    // Refuse oversized iovec arrays instead of silently sending only a
    // prefix of the message (which would alter datagram contents).
    let iovlen = usize::try_from(msg_iovlen).unwrap_or(usize::MAX);
    if iovlen > MAX_IOV {
        return Err(libc::EMSGSIZE);
    }
    let iov_bytes = read_child_mem(notif_fd, notif.id, notif.pid, msg_iov_ptr, iovlen * IOVEC_SIZE)
        .map_err(|_| libc::EIO)?;

    let mut data_bufs: Vec<Vec<u8>> = Vec::with_capacity(iovlen);
    // Running payload size: bounding each iovec alone would still let
    // the child make us buffer MAX_IOV * MAX_SEND_BUF bytes.
    let mut total_len: usize = 0;
    for entry in iov_bytes.chunks_exact(IOVEC_SIZE).take(iovlen) {
        let iov_base = u64::from_ne_bytes(entry[..8].try_into().unwrap());
        let iov_len = u64::from_ne_bytes(entry[8..].try_into().unwrap()) as usize;
        if iov_len > MAX_SEND_BUF {
            return Err(libc::EMSGSIZE);
        }
        if iov_base == 0 || iov_len == 0 {
            data_bufs.push(Vec::new());
            continue;
        }
        total_len = total_len.saturating_add(iov_len);
        if total_len > MAX_SEND_BUF {
            return Err(libc::EMSGSIZE);
        }
        let buf = read_child_mem(notif_fd, notif.id, notif.pid, iov_base, iov_len)
            .map_err(|_| libc::EIO)?;
        data_bufs.push(buf);
    }
    let mut local_iovs: Vec<libc::iovec> = data_bufs
        .iter()
        .map(|buf| libc::iovec {
            iov_base: buf.as_ptr() as *mut libc::c_void,
            iov_len: buf.len(),
        })
        .collect();

    // Ancillary data: a failed read is reported as EIO rather than
    // silently sending the message without its control messages.
    let control_buf = if msg_control_ptr != 0 && msg_controllen > 0 {
        let len = (msg_controllen as usize).min(MAX_CONTROL);
        Some(
            read_child_mem(notif_fd, notif.id, notif.pid, msg_control_ptr, len)
                .map_err(|_| libc::EIO)?,
        )
    } else {
        None
    };

    // SAFETY: `libc::msghdr` is a plain C struct of integers and raw
    // pointers, for which the all-zero bit pattern is a valid value.
    let mut msg: libc::msghdr = unsafe { std::mem::zeroed() };
    msg.msg_name = addr_bytes.as_ptr() as *mut libc::c_void;
    msg.msg_namelen = addr_bytes.len() as u32;
    msg.msg_iov = local_iovs.as_mut_ptr();
    msg.msg_iovlen = local_iovs.len();
    if let Some(ref ctrl) = control_buf {
        msg.msg_control = ctrl.as_ptr() as *mut libc::c_void;
        msg.msg_controllen = ctrl.len();
    }

    // SAFETY: every pointer stored in `msg` refers to a buffer owned by
    // this function (`addr_bytes`, `local_iovs` / `data_bufs`,
    // `control_buf`) that stays alive until after the call returns, and
    // `dup_fd` is an open descriptor owned by the caller.
    let ret = unsafe { libc::sendmsg(dup_fd.as_raw_fd(), &msg, flags) };
    if ret >= 0 {
        Ok(ret)
    } else {
        Err(io::Error::last_os_error().raw_os_error().unwrap_or(libc::EIO))
    }
}
863
864// ============================================================
865// sendmmsg_on_behalf — multi-message variant
866// ============================================================
867
/// Byte offset of the `msg_len` field inside `struct mmsghdr` on Linux
/// x86_64 / aarch64: it follows the embedded 56-byte `msghdr`.
const MSG_LEN_OFFSET: usize = 56;
/// Size of one `struct mmsghdr` on Linux x86_64 / aarch64: the 56-byte
/// msghdr, the 4-byte `msg_len`, and 4 bytes of tail padding — 64
/// bytes in total.
const MMSGHDR_SIZE: usize = MSG_LEN_OFFSET + 2 * std::mem::size_of::<u32>();
/// Upper bound on how many messages one sendmmsg call is allowed to
/// process. Linux's UIO_MAXIOV is 1024; the lower value here bounds
/// supervisor work per syscall (every entry costs at least a few
/// read_child_mem hops plus one sendmsg).
const MAX_MMSGHDR_ENTRIES: usize = 1 << 8;
878
/// Perform `sendmmsg()` on behalf of the child. Pre-scans every entry
/// for Continue cases (NULL `msg_name` or non-IP family) — if any
/// entry would Continue, we Continue the whole syscall to match
/// `sendmsg_on_behalf`'s coarse-grained behavior. Otherwise dup the
/// child fd once, query SO_PROTOCOL once, then loop:
/// validate → send → write `msg_len` back to the child's mmsghdr.
///
/// At most `MAX_MMSGHDR_ENTRIES` entries are processed per call; a
/// `vlen` of zero returns `ReturnValue(0)` immediately.
///
/// On partial failure (entry K denied or send fails), returns
/// `ReturnValue(K)` matching the kernel's "messages successfully
/// transmitted" semantics. Returns the errno only when the very first
/// entry fails — otherwise the child sees a positive count and reads
/// per-entry `msg_len` to learn the per-message status.
///
/// Entry addresses are derived from the child-supplied `msgvec`
/// pointer with checked arithmetic; an address that would overflow
/// `u64` is reported as `EFAULT` instead of overflowing in the
/// supervisor.
async fn sendmmsg_on_behalf(
    notif: &SeccompNotif,
    ctx: &Arc<SupervisorCtx>,
    notif_fd: RawFd,
) -> NotifAction {
    let args = &notif.data.args;
    let sockfd = args[0] as i32;
    let msgvec_ptr = args[1];
    let vlen = (args[2] as u32 as usize).min(MAX_MMSGHDR_ENTRIES);
    let flags = args[3] as i32;

    if vlen == 0 {
        return NotifAction::ReturnValue(0);
    }

    // Address of the i-th mmsghdr. `i` is bounded by
    // MAX_MMSGHDR_ENTRIES so the multiplication cannot overflow; the
    // addition can, because `msgvec_ptr` is attacker-controlled.
    let entry_addr = move |i: usize| msgvec_ptr.checked_add((i * MMSGHDR_SIZE) as u64);

    // Pre-scan every entry. If any has a Continue-eligible shape
    // (NULL msg_name or non-IP family), Continue the whole sendmmsg.
    // Mixed-shape sendmmsg calls (some entries on-behalf, others not)
    // aren't supported because Continue is binary at the syscall
    // level.
    for i in 0..vlen {
        let Some(entry_ptr) = entry_addr(i) else {
            return NotifAction::Errno(libc::EFAULT);
        };
        match prescan_msghdr(notif, notif_fd, entry_ptr) {
            PrescanResult::OnBehalf => continue,
            PrescanResult::ContinueWholeCall => return NotifAction::Continue,
            PrescanResult::Errno(e) => return NotifAction::Errno(e),
        }
    }

    let dup_fd = match crate::seccomp::notif::dup_fd_from_pid(notif.pid, sockfd) {
        Ok(fd) => fd,
        Err(e) => return NotifAction::Errno(e.raw_os_error().unwrap_or(libc::EBADF)),
    };
    let protocol = match query_socket_protocol(dup_fd.as_raw_fd()) {
        Some(p) => p,
        None => return NotifAction::Errno(ECONNREFUSED),
    };

    let mut sent: usize = 0;
    let mut first_errno: Option<i32> = None;

    for i in 0..vlen {
        let Some(entry_ptr) = entry_addr(i) else {
            first_errno = Some(libc::EFAULT);
            break;
        };
        match send_msghdr_on_behalf(notif, ctx, notif_fd, &dup_fd, protocol, entry_ptr, flags).await {
            Ok(n) => {
                // Best-effort write-back of the per-message byte count;
                // the message has already been sent, so a failed write
                // does not change the returned count.
                if let Some(len_ptr) = entry_ptr.checked_add(MSG_LEN_OFFSET as u64) {
                    let bytes = (n as u32).to_ne_bytes();
                    let _ = write_child_mem(
                        notif_fd, notif.id, notif.pid,
                        len_ptr,
                        &bytes,
                    );
                }
                sent += 1;
            }
            Err(errno) => {
                first_errno = Some(errno);
                break;
            }
        }
    }

    if sent > 0 {
        NotifAction::ReturnValue(sent as i64)
    } else {
        // Defensive: vlen > 0 + no successes means at least one attempt
        // failed, so first_errno is set. Fall back to ECONNREFUSED
        // rather than panicking on the unwrap if invariants ever drift.
        NotifAction::Errno(first_errno.unwrap_or(ECONNREFUSED))
    }
}
960
961// ============================================================
// handle_net — main handler for connect/sendto/sendmsg/sendmmsg
963// ============================================================
964
965/// Handle network-related notifications (connect, sendto, sendmsg).
966///
967/// All three are handled on-behalf (TOCTOU-safe): the supervisor copies data
968/// from child memory, validates the destination, duplicates the socket via
969/// pidfd_getfd, and performs the syscall itself. The child's memory is never
970/// re-read by the kernel after validation.
971///
972/// Continue safety (issue #27): the on-behalf paths don't return Continue
973/// at all (they return ReturnValue/Errno after performing the syscall in
974/// the supervisor). The Continue cases in this module are:
975///   1. Non-IP families (AF_UNIX etc.) — the IP allowlist doesn't apply;
976///      Landlock IPC scoping is the enforcement boundary.
977///   2. Connected sockets with addr_ptr == 0 — the address was already
978///      validated at connect time, so the kernel re-read of (nothing) is
979///      moot.
980///   3. The fall-through case below — only reachable if the BPF filter
981///      mis-routes a syscall; the kernel handles it normally.
982/// In sendmsg_on_behalf, the msghdr read failure path returns
983/// Errno(EFAULT) rather than Continue: a racing thread that briefly
984/// unmaps the msghdr could otherwise force a fall-through that lets the
985/// kernel execute sendmsg without the allowlist check. Sub-buffer read
986/// failures (sockaddr/iovec/control) already return Errno(EIO) and so
987/// don't bypass the check either.
988pub(crate) async fn handle_net(
989    notif: &SeccompNotif,
990    ctx: &Arc<SupervisorCtx>,
991    notif_fd: RawFd,
992) -> NotifAction {
993    let nr = notif.data.nr as i64;
994
995    if nr == libc::SYS_connect {
996        connect_on_behalf(notif, ctx, notif_fd).await
997    } else if nr == libc::SYS_sendto {
998        sendto_on_behalf(notif, ctx, notif_fd).await
999    } else if nr == libc::SYS_sendmsg {
1000        sendmsg_on_behalf(notif, ctx, notif_fd).await
1001    } else if nr == libc::SYS_sendmmsg {
1002        sendmmsg_on_behalf(notif, ctx, notif_fd).await
1003    } else {
1004        NotifAction::Continue
1005    }
1006}
1007
1008// ============================================================
1009// resolve_net_allow — resolve --net-allow rules to runtime allowlist
1010// ============================================================
1011
/// Resolved form of `Policy::net_allow`, ready for the on-behalf path.
///
/// `Default` yields an allowlist that permits nothing (all sets empty,
/// wildcard flag off).
#[derive(Debug, Clone, Default)]
pub struct ResolvedNetAllow {
    /// Per-IP port rules (each concrete-host entry resolves to one or
    /// more IPs). An IP appearing here with an empty port set means
    /// "all ports for this IP" (from a `host:*` rule).
    pub per_ip: HashMap<IpAddr, HashSet<u16>>,
    /// IPs permitted on every port (from `host:*` rules after host
    /// resolution). The on-behalf path treats these the same as
    /// `PortAllow::Any` — the entry in `per_ip` is kept as a
    /// placeholder for diagnostic / `/etc/hosts` purposes.
    pub per_ip_all_ports: HashSet<IpAddr>,
    /// Ports permitted to any IP (the `:port` form).
    pub any_ip_ports: HashSet<u16>,
    /// Any-host any-port wildcard (`:*` / `*:*`, or `icmp://*`). When
    /// true, the per-protocol policy becomes `Unrestricted` and the
    /// on-behalf check is bypassed for that protocol.
    pub any_ip_all_ports: bool,
}
1030
1031/// Per-protocol resolved allowlists. Each protocol gets its own
1032/// `ResolvedNetAllow`; the on-behalf path picks the right one based on
1033/// the dup'd fd's `SO_PROTOCOL`. `etc_hosts` is shared across all
1034/// protocols (the synthetic file maps every concrete host that appears
1035/// in any rule).
1036pub struct ResolvedNetAllowSet {
1037    pub tcp: ResolvedNetAllow,
1038    pub udp: ResolvedNetAllow,
1039    pub icmp: ResolvedNetAllow,
1040    /// `<ip> <hostname>\n` lines from every concrete-host rule across
1041    /// every protocol, in resolution order. Empty when no concrete-host
1042    /// rules are present. Combined with the loopback base (or, in chroot
1043    /// mode, the image's `/etc/hosts`) by [`compose_virtual_etc_hosts`]
1044    /// to build the synthetic file served to the sandbox.
1045    pub concrete_host_entries: String,
1046}
1047
1048/// Resolve `--net-allow` rules into per-protocol runtime allowlists.
1049///
1050/// Rules are grouped by `Protocol` and each group is resolved
1051/// independently. ICMP rules carry no ports, so the resulting ICMP
1052/// `ResolvedNetAllow` always has empty `any_ip_ports` / per-IP port
1053/// sets — the on-behalf check routes ICMP through the IP-only path
1054/// (PortAllow::Any). A `*` host on ICMP becomes `any_ip_all_ports`,
1055/// which the handler reads as "no destination check."
1056pub async fn resolve_net_allow(
1057    rules: &[NetAllow],
1058) -> io::Result<ResolvedNetAllowSet> {
1059    let per_proto = |target: Protocol| async move {
1060        let mut per_ip: HashMap<IpAddr, HashSet<u16>> = HashMap::new();
1061        let mut per_ip_all_ports: HashSet<IpAddr> = HashSet::new();
1062        let mut any_ip_ports: HashSet<u16> = HashSet::new();
1063        let mut any_ip_all_ports = false;
1064        let mut local_etc_hosts = String::new();
1065
1066        for rule in rules.iter().filter(|r| r.protocol == target) {
1067            match &rule.host {
1068                None => {
1069                    if rule.all_ports || target == Protocol::Icmp {
1070                        // ICMP rules never carry ports, so a wildcard-host
1071                        // ICMP rule (`icmp://*`) means "any destination."
1072                        any_ip_all_ports = true;
1073                    } else {
1074                        for &p in &rule.ports {
1075                            any_ip_ports.insert(p);
1076                        }
1077                    }
1078                }
1079                Some(host) => {
1080                    let addr = format!("{}:0", host);
1081                    let resolved = tokio::net::lookup_host(addr.as_str()).await.map_err(|e| {
1082                        io::Error::new(
1083                            e.kind(),
1084                            format!("failed to resolve host '{}': {}", host, e),
1085                        )
1086                    })?;
1087                    for socket_addr in resolved {
1088                        let ip = socket_addr.ip();
1089                        if rule.all_ports || target == Protocol::Icmp {
1090                            per_ip_all_ports.insert(ip);
1091                            per_ip.entry(ip).or_default();
1092                        } else {
1093                            let entry = per_ip.entry(ip).or_default();
1094                            for &p in &rule.ports {
1095                                entry.insert(p);
1096                            }
1097                        }
1098                        local_etc_hosts.push_str(&format!("{} {}\n", ip, host));
1099                    }
1100                }
1101            }
1102        }
1103
1104        Ok::<_, io::Error>((
1105            ResolvedNetAllow {
1106                per_ip,
1107                per_ip_all_ports,
1108                any_ip_ports,
1109                any_ip_all_ports,
1110            },
1111            local_etc_hosts,
1112        ))
1113    };
1114
1115    let (tcp, tcp_eh) = per_proto(Protocol::Tcp).await?;
1116    let (udp, udp_eh) = per_proto(Protocol::Udp).await?;
1117    let (icmp, icmp_eh) = per_proto(Protocol::Icmp).await?;
1118
1119    let mut concrete_host_entries = String::new();
1120    for chunk in [tcp_eh, udp_eh, icmp_eh] {
1121        concrete_host_entries.push_str(&chunk);
1122    }
1123
1124    Ok(ResolvedNetAllowSet {
1125        tcp,
1126        udp,
1127        icmp,
1128        concrete_host_entries,
1129    })
1130}
1131
/// Build the text of the synthetic `/etc/hosts` served to the sandbox.
///
/// - **No chroot** (`chroot_root` is `None`): the result is the fixed
///   loopback base (`127.0.0.1 localhost\n::1 localhost\n`) followed by
///   `concrete_host_entries`, independent of the host's on-disk file.
/// - **With chroot**: `<chroot>/etc/hosts` is used verbatim as the base
///   (a trailing newline is added if it lacks one). A loopback line is
///   then appended only for each localhost family the image does not
///   already provide — a line counts when its first token is exactly
///   `127.0.0.1` (or `::1`) and a later token, before any `#` comment,
///   is `localhost`. `concrete_host_entries` is appended last.
///
/// When a chroot is given but its `etc/hosts` cannot be read, the
/// result is the same as in the no-chroot case, so the sandbox always
/// gets a usable hosts file.
pub fn compose_virtual_etc_hosts(
    chroot_root: Option<&std::path::Path>,
    concrete_host_entries: &str,
) -> String {
    // Contents of the image's hosts file, if there is a chroot and the
    // file is readable.
    let image: Option<String> = chroot_root
        .and_then(|root| std::fs::read_to_string(root.join("etc").join("hosts")).ok());

    let (has_v4_localhost, has_v6_localhost) = {
        // True when some line of the image maps `localhost` to `addr`.
        let image_maps_localhost_to = |addr: &str| -> bool {
            image.as_deref().map_or(false, |text| {
                text.lines().any(|line| {
                    // hosts(5): everything after `#` is a comment.
                    let mut tokens = line.split('#').next().unwrap_or("").split_whitespace();
                    tokens.next() == Some(addr) && tokens.any(|name| name == "localhost")
                })
            })
        };
        (
            image_maps_localhost_to("127.0.0.1"),
            image_maps_localhost_to("::1"),
        )
    };

    let mut hosts = image.unwrap_or_default();
    if !hosts.is_empty() && !hosts.ends_with('\n') {
        hosts.push('\n');
    }
    if !has_v4_localhost {
        hosts.push_str("127.0.0.1 localhost\n");
    }
    if !has_v6_localhost {
        hosts.push_str("::1 localhost\n");
    }
    hosts.push_str(concrete_host_entries);
    hosts
}
1190
1191// ============================================================
1192// Tests
1193// ============================================================
1194
1195#[cfg(test)]
1196mod tests {
1197    use super::*;
1198
1199    // --- NetAllow::parse tests ---
1200
1201    #[test]
1202    fn netallow_parse_concrete_host_port() {
1203        let r = NetAllow::parse("example.com:443").unwrap();
1204        assert_eq!(r.host.as_deref(), Some("example.com"));
1205        assert_eq!(r.ports, vec![443]);
1206        assert!(!r.all_ports);
1207    }
1208
1209    #[test]
1210    fn netallow_parse_any_host_port() {
1211        let r = NetAllow::parse(":8080").unwrap();
1212        assert_eq!(r.host, None);
1213        assert_eq!(r.ports, vec![8080]);
1214        assert!(!r.all_ports);
1215
1216        let r = NetAllow::parse("*:8080").unwrap();
1217        assert_eq!(r.host, None);
1218        assert_eq!(r.ports, vec![8080]);
1219        assert!(!r.all_ports);
1220    }
1221
1222    #[test]
1223    fn netallow_parse_multiple_ports() {
1224        let r = NetAllow::parse("github.com:22,80,443").unwrap();
1225        assert_eq!(r.host.as_deref(), Some("github.com"));
1226        assert_eq!(r.ports, vec![22, 80, 443]);
1227        assert!(!r.all_ports);
1228    }
1229
1230    #[test]
1231    fn netallow_parse_wildcard_any_host_any_port_colon() {
1232        let r = NetAllow::parse(":*").unwrap();
1233        assert_eq!(r.host, None);
1234        assert!(r.ports.is_empty());
1235        assert!(r.all_ports);
1236    }
1237
1238    #[test]
1239    fn netallow_parse_wildcard_any_host_any_port_star() {
1240        let r = NetAllow::parse("*:*").unwrap();
1241        assert_eq!(r.host, None);
1242        assert!(r.ports.is_empty());
1243        assert!(r.all_ports);
1244    }
1245
1246    #[test]
1247    fn netallow_parse_wildcard_concrete_host_any_port() {
1248        let r = NetAllow::parse("example.com:*").unwrap();
1249        assert_eq!(r.host.as_deref(), Some("example.com"));
1250        assert!(r.ports.is_empty());
1251        assert!(r.all_ports);
1252    }
1253
1254    #[test]
1255    fn netallow_parse_rejects_mixed_wildcard_and_concrete() {
1256        // `host:80,*` and `host:*,80` are both ambiguous: the user
1257        // either meant "any port" (wildcard wins) or "ports 80 plus
1258        // some weird placeholder". Refuse and force a clean spec.
1259        let err = NetAllow::parse("example.com:80,*").unwrap_err();
1260        assert!(format!("{}", err).contains("cannot mix"));
1261        let err = NetAllow::parse("example.com:*,80").unwrap_err();
1262        assert!(format!("{}", err).contains("cannot mix"));
1263    }
1264
1265    #[test]
1266    fn netallow_parse_rejects_port_zero() {
1267        let err = NetAllow::parse("example.com:0").unwrap_err();
1268        assert!(format!("{}", err).contains("port 0"));
1269    }
1270
1271    #[test]
1272    fn netallow_parse_rejects_empty_port() {
1273        let err = NetAllow::parse("example.com:").unwrap_err();
1274        assert!(format!("{}", err).contains("invalid port"));
1275    }
1276
1277    #[test]
1278    fn netallow_parse_rejects_no_colon() {
1279        let err = NetAllow::parse("example.com").unwrap_err();
1280        assert!(format!("{}", err).contains("expected"));
1281    }
1282
1283    #[test]
1284    fn netallow_parse_repeated_wildcard_is_idempotent() {
1285        // `*,*` collapses to a single wildcard — neither token contributes
1286        // a concrete port, so the rule remains "any port".
1287        let r = NetAllow::parse(":*,*").unwrap();
1288        assert!(r.all_ports);
1289        assert!(r.ports.is_empty());
1290    }
1291
1292    // --- Protocol scheme prefix tests ---
1293
1294    #[test]
1295    fn netallow_bare_form_defaults_to_tcp() {
1296        let r = NetAllow::parse("example.com:443").unwrap();
1297        assert_eq!(r.protocol, Protocol::Tcp);
1298    }
1299
1300    #[test]
1301    fn netallow_explicit_tcp_scheme() {
1302        let r = NetAllow::parse("tcp://example.com:443").unwrap();
1303        assert_eq!(r.protocol, Protocol::Tcp);
1304        assert_eq!(r.host.as_deref(), Some("example.com"));
1305        assert_eq!(r.ports, vec![443]);
1306    }
1307
1308    #[test]
1309    fn netallow_udp_scheme_with_host_port() {
1310        let r = NetAllow::parse("udp://1.1.1.1:53").unwrap();
1311        assert_eq!(r.protocol, Protocol::Udp);
1312        assert_eq!(r.host.as_deref(), Some("1.1.1.1"));
1313        assert_eq!(r.ports, vec![53]);
1314    }
1315
1316    #[test]
1317    fn netallow_udp_wildcard_any_anywhere() {
1318        // The "any UDP" gate, equivalent to the old `allow_udp = true`.
1319        let r = NetAllow::parse("udp://*:*").unwrap();
1320        assert_eq!(r.protocol, Protocol::Udp);
1321        assert_eq!(r.host, None);
1322        assert!(r.all_ports);
1323    }
1324
1325    #[test]
1326    fn netallow_icmp_scheme_with_host() {
1327        let r = NetAllow::parse("icmp://github.com").unwrap();
1328        assert_eq!(r.protocol, Protocol::Icmp);
1329        assert_eq!(r.host.as_deref(), Some("github.com"));
1330        assert!(r.ports.is_empty());
1331        assert!(!r.all_ports);
1332    }
1333
1334    #[test]
1335    fn netallow_icmp_wildcard() {
1336        // The "any ICMP echo" gate, equivalent to the old
1337        // `allow_icmp = true` for the SOCK_DGRAM path.
1338        let r = NetAllow::parse("icmp://*").unwrap();
1339        assert_eq!(r.protocol, Protocol::Icmp);
1340        assert_eq!(r.host, None);
1341    }
1342
1343    #[test]
1344    fn netallow_icmp_rejects_port() {
1345        // ICMP has no port — `:port` is meaningless and refused
1346        // explicitly so users can't write a rule that doesn't do what
1347        // they think.
1348        let err = NetAllow::parse("icmp://github.com:80").unwrap_err();
1349        assert!(format!("{}", err).contains("icmp rules take no port"));
1350    }
1351
1352    #[test]
1353    fn netallow_icmp_rejects_empty_body() {
1354        let err = NetAllow::parse("icmp://").unwrap_err();
1355        assert!(format!("{}", err).contains("needs a host or `*`"));
1356    }
1357
1358    #[test]
1359    fn netallow_unknown_scheme_rejected() {
1360        // Including `icmp-raw` — sandlock does not expose raw ICMP, so
1361        // the scheme is unknown rather than a special-case error.
1362        for spec in ["sctp://host:1234", "icmp-raw://*"] {
1363            let err = NetAllow::parse(spec).unwrap_err();
1364            assert!(format!("{}", err).contains("unknown scheme"), "spec: {}", spec);
1365        }
1366    }
1367
1368    #[tokio::test]
1369    async fn test_resolve_net_allow_empty() {
1370        let resolved = resolve_net_allow(&[]).await.unwrap();
1371        assert!(resolved.tcp.per_ip.is_empty());
1372        assert!(resolved.tcp.any_ip_ports.is_empty());
1373        assert!(resolved.udp.per_ip.is_empty());
1374        assert!(resolved.icmp.per_ip.is_empty());
1375        // No concrete-host rules → no resolved-entry lines.
1376        assert!(resolved.concrete_host_entries.is_empty());
1377    }
1378
1379    #[tokio::test]
1380    async fn test_resolve_net_allow_concrete_host() {
1381        let rules = vec![NetAllow {
1382            protocol: Protocol::Tcp,
1383            host: Some("localhost".to_string()),
1384            ports: vec![80, 443],
1385            all_ports: false,
1386        }];
1387        let resolved = resolve_net_allow(&rules).await.unwrap();
1388        // localhost should resolve to at least one loopback addr; only
1389        // the TCP set has entries.
1390        assert!(!resolved.tcp.per_ip.is_empty());
1391        for ports in resolved.tcp.per_ip.values() {
1392            assert!(ports.contains(&80));
1393            assert!(ports.contains(&443));
1394        }
1395        assert!(resolved.udp.per_ip.is_empty());
1396        assert!(resolved.icmp.per_ip.is_empty());
1397        // The resolved entry (`<ip> localhost`) surfaces in concrete_host_entries.
1398        assert!(resolved.concrete_host_entries.contains("127.0.0.1 localhost"));
1399    }
1400
1401    #[tokio::test]
1402    async fn test_resolve_net_allow_any_ip() {
1403        let rules = vec![NetAllow {
1404            protocol: Protocol::Tcp,
1405            host: None,
1406            ports: vec![8080],
1407            all_ports: false,
1408        }];
1409        let resolved = resolve_net_allow(&rules).await.unwrap();
1410        assert!(resolved.tcp.per_ip.is_empty());
1411        assert!(resolved.tcp.any_ip_ports.contains(&8080));
1412        assert!(!resolved.tcp.any_ip_all_ports);
1413        // Any-IP rule has no concrete host, so no resolved-entry line.
1414        assert!(resolved.concrete_host_entries.is_empty());
1415    }
1416
1417    #[tokio::test]
1418    async fn test_resolve_net_allow_any_ip_all_ports() {
1419        // `:*` — fully unrestricted egress, TCP-only.
1420        let rules = vec![NetAllow {
1421            protocol: Protocol::Tcp,
1422            host: None,
1423            ports: vec![],
1424            all_ports: true,
1425        }];
1426        let resolved = resolve_net_allow(&rules).await.unwrap();
1427        assert!(resolved.tcp.any_ip_all_ports);
1428        assert!(resolved.tcp.per_ip.is_empty());
1429        assert!(resolved.tcp.per_ip_all_ports.is_empty());
1430        assert!(resolved.tcp.any_ip_ports.is_empty());
1431        // UDP/ICMP unaffected by a TCP rule.
1432        assert!(!resolved.udp.any_ip_all_ports);
1433        assert!(!resolved.icmp.any_ip_all_ports);
1434    }
1435
1436    #[tokio::test]
1437    async fn test_resolve_net_allow_concrete_host_all_ports() {
1438        // `localhost:*` — every port to localhost only, TCP.
1439        let rules = vec![NetAllow {
1440            protocol: Protocol::Tcp,
1441            host: Some("localhost".to_string()),
1442            ports: vec![],
1443            all_ports: true,
1444        }];
1445        let resolved = resolve_net_allow(&rules).await.unwrap();
1446        assert!(!resolved.tcp.any_ip_all_ports);
1447        assert!(
1448            !resolved.tcp.per_ip_all_ports.is_empty(),
1449            "localhost should resolve to at least one IP marked as any-port"
1450        );
1451        for ip in resolved.tcp.per_ip_all_ports.iter() {
1452            assert!(resolved.tcp.per_ip.contains_key(ip));
1453        }
1454        assert!(resolved.concrete_host_entries.contains("localhost"));
1455    }
1456
1457    #[tokio::test]
1458    async fn test_resolve_net_allow_mixed_wildcard_and_concrete() {
1459        // Wildcard rule alongside concrete: wildcard sets the global
1460        // any-host any-port flag for TCP; concrete rule still resolves
1461        // into per_ip (the runtime layer chooses Unrestricted, ignoring
1462        // the concrete entries).
1463        let rules = vec![
1464            NetAllow {
1465                protocol: Protocol::Tcp,
1466                host: None,
1467                ports: vec![],
1468                all_ports: true,
1469            },
1470            NetAllow {
1471                protocol: Protocol::Tcp,
1472                host: Some("localhost".to_string()),
1473                ports: vec![22],
1474                all_ports: false,
1475            },
1476        ];
1477        let resolved = resolve_net_allow(&rules).await.unwrap();
1478        assert!(resolved.tcp.any_ip_all_ports);
1479        assert!(!resolved.tcp.per_ip.is_empty());
1480    }
1481
1482    // ============================================================
1483    // Per-protocol resolution — UDP / ICMP slices stay isolated
1484    // ============================================================
1485
1486    #[tokio::test]
1487    async fn test_resolve_per_protocol_isolation() {
1488        // A UDP rule should not appear in the TCP set, and vice versa.
1489        // This is the property Phase 2 relies on for protocol routing.
1490        let rules = vec![
1491            NetAllow {
1492                protocol: Protocol::Tcp,
1493                host: Some("localhost".to_string()),
1494                ports: vec![443],
1495                all_ports: false,
1496            },
1497            NetAllow {
1498                protocol: Protocol::Udp,
1499                host: None,
1500                ports: vec![53],
1501                all_ports: false,
1502            },
1503        ];
1504        let resolved = resolve_net_allow(&rules).await.unwrap();
1505        assert!(
1506            !resolved.tcp.per_ip.is_empty(),
1507            "TCP rule should populate tcp set"
1508        );
1509        assert!(
1510            resolved.udp.any_ip_ports.contains(&53),
1511            "UDP rule should populate udp set"
1512        );
1513        // Cross-contamination check: TCP per_ip ports must not contain 53;
1514        // UDP must not contain 443.
1515        for ports in resolved.tcp.per_ip.values() {
1516            assert!(!ports.contains(&53), "UDP port leaked into TCP set");
1517        }
1518        assert!(!resolved.udp.any_ip_ports.contains(&443), "TCP port leaked into UDP set");
1519    }
1520
1521    #[tokio::test]
1522    async fn test_resolve_icmp_no_ports() {
1523        // ICMP rules carry no ports; concrete hosts go into per_ip with
1524        // PortAllow::Any-style empty port set, plus per_ip_all_ports.
1525        let rules = vec![NetAllow {
1526            protocol: Protocol::Icmp,
1527            host: Some("localhost".to_string()),
1528            ports: vec![],
1529            all_ports: false,
1530        }];
1531        let resolved = resolve_net_allow(&rules).await.unwrap();
1532        assert!(
1533            !resolved.icmp.per_ip.is_empty(),
1534            "icmp host should populate per_ip"
1535        );
1536        assert!(
1537            !resolved.icmp.per_ip_all_ports.is_empty(),
1538            "icmp host should mark per_ip_all_ports (no port check)"
1539        );
1540        assert!(resolved.icmp.any_ip_ports.is_empty());
1541        // TCP/UDP unaffected.
1542        assert!(resolved.tcp.per_ip.is_empty());
1543        assert!(resolved.udp.per_ip.is_empty());
1544    }
1545
1546    #[tokio::test]
1547    async fn test_resolve_icmp_wildcard() {
1548        // `icmp://*` — any ICMP destination.
1549        let rules = vec![NetAllow {
1550            protocol: Protocol::Icmp,
1551            host: None,
1552            ports: vec![],
1553            all_ports: false,
1554        }];
1555        let resolved = resolve_net_allow(&rules).await.unwrap();
1556        assert!(resolved.icmp.any_ip_all_ports);
1557        assert!(!resolved.tcp.any_ip_all_ports);
1558    }
1559
1560    // ============================================================
1561    // compose_virtual_etc_hosts — synthetic /etc/hosts assembly
1562    // ============================================================
1563
1564    use std::io::Write;
1565
1566    fn temp_rootfs_with_hosts(name: &str, hosts_content: Option<&str>) -> std::path::PathBuf {
1567        let dir = std::env::temp_dir().join(format!(
1568            "sandlock-test-compose-hosts-{}-{}",
1569            name, std::process::id()
1570        ));
1571        let _ = std::fs::create_dir_all(dir.join("etc"));
1572        if let Some(content) = hosts_content {
1573            let mut f = std::fs::File::create(dir.join("etc").join("hosts")).unwrap();
1574            f.write_all(content.as_bytes()).unwrap();
1575        }
1576        dir
1577    }
1578
1579    #[test]
1580    fn compose_no_chroot_emits_loopback_base() {
1581        // Default path — no chroot, no concrete-host rules → the same
1582        // fixed loopback view we promise every sandbox.
1583        let out = compose_virtual_etc_hosts(None, "");
1584        assert_eq!(out, "127.0.0.1 localhost\n::1 localhost\n");
1585    }
1586
1587    #[test]
1588    fn compose_no_chroot_appends_concrete_entries() {
1589        let out = compose_virtual_etc_hosts(None, "10.0.0.1 api\n");
1590        assert_eq!(out, "127.0.0.1 localhost\n::1 localhost\n10.0.0.1 api\n");
1591    }
1592
1593    #[test]
1594    fn compose_chroot_seeds_from_image_and_injects_missing_loopback() {
1595        // Image ships an entry of its own but no localhost mapping; the
1596        // shim must keep the image's content and inject both loopback
1597        // entries on top so the always-on guarantee still holds.
1598        let rootfs = temp_rootfs_with_hosts(
1599            "no-localhost",
1600            Some("10.0.0.5 myimage.local\n"),
1601        );
1602        let out = compose_virtual_etc_hosts(Some(&rootfs), "");
1603        assert!(out.contains("10.0.0.5 myimage.local"), "image entry missing: {out}");
1604        assert!(out.contains("127.0.0.1 localhost"), "v4 loopback missing: {out}");
1605        assert!(out.contains("::1 localhost"), "v6 loopback missing: {out}");
1606        let _ = std::fs::remove_dir_all(&rootfs);
1607    }
1608
1609    #[test]
1610    fn compose_chroot_does_not_duplicate_existing_loopback() {
1611        // Image already has both loopback entries — don't append duplicates.
1612        let rootfs = temp_rootfs_with_hosts(
1613            "both-localhost",
1614            Some("127.0.0.1 localhost\n::1 localhost\n10.0.0.5 myimage.local\n"),
1615        );
1616        let out = compose_virtual_etc_hosts(Some(&rootfs), "");
1617        assert_eq!(out.matches("127.0.0.1 localhost").count(), 1, "v4 dup'd: {out}");
1618        assert_eq!(out.matches("::1 localhost").count(), 1, "v6 dup'd: {out}");
1619        assert!(out.contains("10.0.0.5 myimage.local"));
1620        let _ = std::fs::remove_dir_all(&rootfs);
1621    }
1622
1623    #[test]
1624    fn compose_chroot_injects_only_missing_family() {
1625        // Image has v4 but no v6 localhost — inject only v6, leave v4 alone.
1626        let rootfs = temp_rootfs_with_hosts(
1627            "only-v4-localhost",
1628            Some("127.0.0.1 localhost myimage\n"),
1629        );
1630        let out = compose_virtual_etc_hosts(Some(&rootfs), "");
1631        assert_eq!(out.matches("127.0.0.1 localhost").count(), 1);
1632        assert!(out.contains("::1 localhost"), "v6 loopback should be injected: {out}");
1633        let _ = std::fs::remove_dir_all(&rootfs);
1634    }
1635
1636    #[test]
1637    fn compose_chroot_missing_file_falls_back_to_loopback() {
1638        // Chroot exists but has no /etc/hosts — fall back to the bare
1639        // loopback base so the sandbox always sees a usable file.
1640        let rootfs = temp_rootfs_with_hosts("no-file", None);
1641        let out = compose_virtual_etc_hosts(Some(&rootfs), "10.0.0.1 api\n");
1642        assert_eq!(out, "127.0.0.1 localhost\n::1 localhost\n10.0.0.1 api\n");
1643        let _ = std::fs::remove_dir_all(&rootfs);
1644    }
1645
1646    #[test]
1647    fn compose_chroot_strips_inline_comments_when_detecting_loopback() {
1648        // hosts(5) treats `#` as a comment-start; the loopback-presence
1649        // check must respect it (otherwise an image line like
1650        // `127.0.0.1 # localhost` would be falsely treated as covering v4).
1651        let rootfs = temp_rootfs_with_hosts(
1652            "with-comments",
1653            Some("127.0.0.1 # localhost is a comment here\n"),
1654        );
1655        let out = compose_virtual_etc_hosts(Some(&rootfs), "");
1656        // Real `127.0.0.1 localhost` line must still be injected.
1657        assert!(
1658            out.lines().any(|l| l.trim() == "127.0.0.1 localhost"),
1659            "v4 loopback should still be injected: {out}"
1660        );
1661        let _ = std::fs::remove_dir_all(&rootfs);
1662    }
1663}