Skip to main content

sandlock_core/
network.rs

1// Network policy and control handlers — IP allowlist enforcement via seccomp notification.
2//
3// Intercepts connect/sendto/sendmsg syscalls, extracts the destination IP from
4// the child's memory, and checks it against an allowlist of resolved IPs.
5
6use std::collections::{HashMap, HashSet};
7use std::io;
8use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
9use std::os::unix::io::{AsRawFd, RawFd};
10use std::sync::Arc;
11
12use serde::{Deserialize, Serialize};
13
14use crate::error::SandboxError;
15use crate::seccomp::ctx::SupervisorCtx;
16use crate::seccomp::notif::{read_child_mem, write_child_mem, NotifAction};
17use crate::sys::structs::{SeccompNotif, AF_INET, AF_INET6, ECONNREFUSED};
18
/// Upper bound, in bytes, on a payload handled by the sendto/sendmsg
/// on-behalf paths (64 MiB). The cap keeps a sandboxed process from
/// pushing the supervisor into an out-of-memory condition.
const MAX_SEND_BUF: usize = 64 * 1024 * 1024;
22
23/// An IPv4 or IPv6 address with a prefix length, used by `--net-deny`
24/// to match destination IPs by exact address (`/32`, `/128`) or by range.
25#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
26pub struct IpCidr {
27    pub addr: IpAddr,
28    pub prefix_len: u8,
29}
30
31impl IpCidr {
32    /// Parse `addr` or `addr/prefix`. A bare address becomes a host route
33    /// (`/32` for IPv4, `/128` for IPv6). Hostnames are rejected: the
34    /// address part must parse as a literal IP.
35    pub fn parse(s: &str) -> Result<Self, SandboxError> {
36        let (addr_str, prefix) = match s.split_once('/') {
37            Some((a, p)) => {
38                let len: u8 = p.parse().map_err(|_| {
39                    SandboxError::Invalid(format!("invalid prefix length in `{}`", s))
40                })?;
41                (a, Some(len))
42            }
43            None => (s, None),
44        };
45        let addr: IpAddr = addr_str.parse().map_err(|_| {
46            SandboxError::Invalid(format!("`{}` is not a valid IP address", s))
47        })?;
48        let max = match addr {
49            IpAddr::V4(_) => 32u8,
50            IpAddr::V6(_) => 128u8,
51        };
52        let prefix_len = prefix.unwrap_or(max);
53        if prefix_len > max {
54            return Err(SandboxError::Invalid(format!(
55                "prefix /{} too large for {} in `{}`",
56                prefix_len,
57                if max == 32 { "IPv4" } else { "IPv6" },
58                s
59            )));
60        }
61        Ok(IpCidr { addr, prefix_len })
62    }
63
64    /// True iff this CIDR is a single host (`/32` IPv4 or `/128` IPv6),
65    /// i.e. it came from a bare IP literal rather than a range.
66    pub fn is_single_host(&self) -> bool {
67        match self.addr {
68            IpAddr::V4(_) => self.prefix_len == 32,
69            IpAddr::V6(_) => self.prefix_len == 128,
70        }
71    }
72
73    /// True iff `ip` falls within this network. Different address
74    /// families never match.
75    pub fn contains(&self, ip: IpAddr) -> bool {
76        match (self.addr, ip) {
77            (IpAddr::V4(net), IpAddr::V4(ip)) => {
78                if self.prefix_len == 0 {
79                    return true;
80                }
81                let mask = u32::MAX << (32 - self.prefix_len);
82                (u32::from(net) & mask) == (u32::from(ip) & mask)
83            }
84            (IpAddr::V6(net), IpAddr::V6(ip)) => {
85                if self.prefix_len == 0 {
86                    return true;
87                }
88                let mask = u128::MAX << (128 - self.prefix_len);
89                (u128::from(net) & mask) == (u128::from(ip) & mask)
90            }
91            _ => false,
92        }
93    }
94}
95
96impl std::fmt::Display for IpCidr {
97    /// A single host renders as the bare address (`1.2.3.4`, `::1`); a
98    /// range keeps its prefix (`10.0.0.0/8`). Inverse of [`IpCidr::parse`].
99    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
100        if self.is_single_host() {
101            write!(f, "{}", self.addr)
102        } else {
103            write!(f, "{}/{}", self.addr, self.prefix_len)
104        }
105    }
106}
107
108/// What a `--net-allow` / `--net-deny` rule targets at the IP layer.
109///
110/// `Cidr` covers both a bare IP literal (stored as a `/32` or `/128`) and
111/// an explicit CIDR range. `Host` is a hostname resolved via DNS at sandbox
112/// start; it is only produced for `--net-allow` (deny rejects hostnames).
113#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
114pub enum NetTarget {
115    /// Any destination IP (the `:port` / `*:port` / `*` form).
116    AnyIp,
117    /// A literal IP or CIDR range. Matched by containment, no DNS.
118    Cidr(IpCidr),
119    /// A hostname, resolved to IPs at sandbox start (allow-only).
120    Host(String),
121}
122
123/// A single `--net-allow` / `--net-deny` rule. Both flags share this
124/// representation and the same grammar; they differ only in whether
125/// hostnames are accepted (`--net-deny` rejects them) and in how the
126/// resolved rule is enforced (allowlist vs denylist).
127#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
128pub struct NetRule {
129    /// L4 protocol this rule applies to.
130    #[serde(default = "default_protocol_tcp")]
131    pub protocol: Protocol,
132    /// What the rule targets at the IP layer.
133    pub target: NetTarget,
134    /// Permitted/denied ports. Empty when `all_ports` is true and always
135    /// empty for `Protocol::Icmp`.
136    pub ports: Vec<u16>,
137    /// "Any port" (bare target with no `:port`, or the `*` port token).
138    #[serde(default)]
139    pub all_ports: bool,
140}
141
142/// `--net-allow` and `--net-deny` rules are the same shape; the aliases
143/// document intent at call sites and field declarations.
144pub type NetAllow = NetRule;
145pub type NetDeny = NetRule;
146
147fn default_protocol_tcp() -> Protocol {
148    Protocol::Tcp
149}
150
151impl NetRule {
152    /// Parse a `--net-allow` spec into a rule. Hostnames are accepted and
153    /// resolved to IPs at sandbox start. Grammar (shared with `--net-deny`):
154    ///
155    /// - `host` / `<ip>` / `<cidr>` / `*` -- all ports (port optional; `*`
156    ///   targets any IP). TCP is the default scheme.
157    /// - `host:<port[,port,...]>` / `<ip>:<port>` / `<cidr>:*` / `:port`.
158    /// - `[<ipv6|ipv6cidr>]:<port>` -- bracketed IPv6 with a port (a bare
159    ///   `addr:port` string is itself a valid IPv6 address, so the port
160    ///   form needs brackets).
161    /// - `tcp://...` / `udp://...` / `icmp://...` schemes (icmp: no port).
162    pub fn parse_allow(spec: &str) -> Result<NetRule, SandboxError> {
163        Self::parse_spec(spec, "--net-allow", true)
164    }
165
166    /// Parse a `--net-deny` spec into a rule. Identical grammar to
167    /// [`parse_allow`](Self::parse_allow), except hostnames are rejected
168    /// (the target must be a literal IP/CIDR or `*`); use `--http-deny`
169    /// for domain blocking.
170    pub fn parse_deny(spec: &str) -> Result<NetDeny, SandboxError> {
171        Self::parse_spec(spec, "--net-deny", false)
172    }
173
174    /// Shared grammar for both flags. `label` selects the error prefix and
175    /// `allow_hosts` whether non-IP targets are accepted (allow) or
176    /// rejected (deny).
177    fn parse_spec(spec: &str, label: &str, allow_hosts: bool) -> Result<NetRule, SandboxError> {
178        let (protocol, rest) = match spec.split_once("://") {
179            Some((scheme, body)) => {
180                let proto = Protocol::parse(scheme).ok_or_else(|| {
181                    SandboxError::Invalid(format!(
182                        "{}: unknown scheme `{}://` in `{}` (expected tcp, udp, icmp)",
183                        label, scheme, spec
184                    ))
185                })?;
186                (proto, body)
187            }
188            None => (Protocol::Tcp, spec),
189        };
190
191        // ICMP carries no port: the whole body is the target.
192        if protocol == Protocol::Icmp {
193            if rest.is_empty() {
194                return Err(SandboxError::Invalid(format!(
195                    "{}: icmp rule needs a host/IP or `*`, got `{}`",
196                    label, spec
197                )));
198            }
199            // Reject an explicit port. IPv6 literals/CIDRs also contain
200            // `:`, so only flag a `:` that isn't part of a valid IP/CIDR.
201            if rest != "*" && IpCidr::parse(rest).is_err() && rest.contains(':') {
202                return Err(SandboxError::Invalid(format!(
203                    "{}: icmp rule takes no port, got `{}`",
204                    label, spec
205                )));
206            }
207            return Ok(NetRule {
208                protocol,
209                target: parse_target(rest, label, allow_hosts)?,
210                ports: Vec::new(),
211                all_ports: true,
212            });
213        }
214
215        // 1. Bracketed IPv6 with a port: `[addr]:ports`.
216        if let Some(stripped) = rest.strip_prefix('[') {
217            let (inside, port_part) = stripped.rsplit_once("]:").ok_or_else(|| {
218                SandboxError::Invalid(format!("{}: malformed bracketed address in `{}`", label, spec))
219            })?;
220            let (ports, all_ports) = parse_ports(port_part, label, spec)?;
221            return Ok(NetRule {
222                protocol,
223                target: NetTarget::Cidr(IpCidr::parse(inside)?),
224                ports,
225                all_ports,
226            });
227        }
228
229        // An empty body must not silently mean "everything"; require an
230        // explicit `*` for the any-IP target.
231        if rest.is_empty() {
232            return Err(SandboxError::Invalid(format!(
233                "{}: empty rule in `{}` (use `*` for any host)",
234                label, spec
235            )));
236        }
237
238        // 2. Whole body is an IP/CIDR with no port -> all ports. Trying
239        //    `IpCidr::parse` first is what makes bare IPv6 (`::1`) and IPv6
240        //    CIDRs (`fc00::/7`) work despite containing colons.
241        if let Ok(cidr) = IpCidr::parse(rest) {
242            return Ok(NetRule {
243                protocol,
244                target: NetTarget::Cidr(cidr),
245                ports: Vec::new(),
246                all_ports: true,
247            });
248        }
249
250        // 3. `target[:ports]` where target is an IP/CIDR, hostname, `*`, or
251        //    empty. The port suffix is optional: a target with no `:port`
252        //    covers all ports, mirroring the bare-target form above.
253        let (host_part, port_part) = match rest.rsplit_once(':') {
254            Some((h, p)) => (h, Some(p)),
255            None => (rest, None),
256        };
257        let target = parse_target(host_part, label, allow_hosts)?;
258        let (ports, all_ports) = match port_part {
259            Some(p) => parse_ports(p, label, spec)?,
260            None => (Vec::new(), true),
261        };
262        Ok(NetRule {
263            protocol,
264            target,
265            ports,
266            all_ports,
267        })
268    }
269}
270
271/// Parse a rule target: `*` / empty -> any IP, an IP/CIDR literal ->
272/// `Cidr`, otherwise a hostname (`Host`) when `allow_hosts`, else an error.
273fn parse_target(s: &str, label: &str, allow_hosts: bool) -> Result<NetTarget, SandboxError> {
274    match s {
275        "" | "*" => Ok(NetTarget::AnyIp),
276        // A `/` signals CIDR intent: parse strictly so a bad prefix is a
277        // clear error rather than being misread as a hostname.
278        _ if s.contains('/') => Ok(NetTarget::Cidr(
279            IpCidr::parse(s).map_err(|e| SandboxError::Invalid(format!("{}: {}", label, e)))?,
280        )),
281        _ => {
282            if let Ok(cidr) = IpCidr::parse(s) {
283                Ok(NetTarget::Cidr(cidr))
284            } else if allow_hosts {
285                Ok(NetTarget::Host(s.to_string()))
286            } else {
287                Err(SandboxError::Invalid(format!(
288                    "{}: `{}` is not an IP or CIDR (hostnames are not allowed; \
289                     use --http-deny for domains)",
290                    label, s
291                )))
292            }
293        }
294    }
295}
296
297/// Parse a port suffix. `*` means all ports; mixing `*` with concrete
298/// ports, port 0, and an empty list are all rejected.
299fn parse_ports(s: &str, label: &str, full: &str) -> Result<(Vec<u16>, bool), SandboxError> {
300    let mut ports = Vec::new();
301    let mut saw_wildcard = false;
302    for p in s.split(',') {
303        let p = p.trim();
304        if p == "*" {
305            saw_wildcard = true;
306            continue;
307        }
308        let n: u16 = p.parse().map_err(|_| {
309            SandboxError::Invalid(format!("{}: invalid port `{}` in `{}`", label, p, full))
310        })?;
311        if n == 0 {
312            return Err(SandboxError::Invalid(format!(
313                "{}: port 0 is not valid in `{}`",
314                label, full
315            )));
316        }
317        ports.push(n);
318    }
319    if saw_wildcard && !ports.is_empty() {
320        return Err(SandboxError::Invalid(format!(
321            "{}: cannot mix `*` with concrete ports in `{}`",
322            label, full
323        )));
324    }
325    if !saw_wildcard && ports.is_empty() {
326        return Err(SandboxError::Invalid(format!(
327            "{}: at least one port required in `{}`",
328            label, full
329        )));
330    }
331    Ok((ports, saw_wildcard))
332}
333
334/// L4 protocol that a `NetAllow` rule applies to.
335///
336/// `Tcp` is the default if a rule has no scheme (the bare `host:port`
337/// form). `Udp` and `Icmp` require an explicit scheme.
338///
339/// `Icmp` is the kernel's unprivileged ping socket
340/// (`SOCK_DGRAM + IPPROTO_ICMP{,V6}`), gated by `ping_group_range` —
341/// destinations are filterable per host. Sandlock does not expose raw
342/// ICMP (`SOCK_RAW + IPPROTO_ICMP`): destination filtering at `sendto`
343/// would lie because raw sockets let the agent craft the IP header,
344/// and packet-crafting capabilities aren't part of the XOA threat
345/// model. Workloads that genuinely need raw ICMP should run outside
346/// sandlock or rely on the host's `ping_group_range` for the dgram
347/// path instead.
348#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
349#[serde(rename_all = "lowercase")]
350pub enum Protocol {
351    Tcp,
352    Udp,
353    Icmp,
354}
355
356impl Protocol {
357    fn parse(s: &str) -> Option<Self> {
358        match s {
359            "tcp" => Some(Protocol::Tcp),
360            "udp" => Some(Protocol::Udp),
361            "icmp" => Some(Protocol::Icmp),
362            _ => None,
363        }
364    }
365}
366
367// ============================================================
368// parse_ip_from_sockaddr — parse IP from a sockaddr byte buffer
369// ============================================================
370
371/// Parse IP address from a sockaddr byte buffer.
372/// Returns None for non-IP families (AF_UNIX etc.) — always allowed.
373fn parse_ip_from_sockaddr(bytes: &[u8]) -> Option<IpAddr> {
374    if bytes.len() < 2 {
375        return None;
376    }
377    let family = u16::from_ne_bytes([bytes[0], bytes[1]]) as u32;
378    match family {
379        f if f == AF_INET => {
380            if bytes.len() < 8 {
381                return None;
382            }
383            Some(IpAddr::V4(Ipv4Addr::new(
384                bytes[4], bytes[5], bytes[6], bytes[7],
385            )))
386        }
387        f if f == AF_INET6 => {
388            if bytes.len() < 24 {
389                return None;
390            }
391            let mut addr_bytes = [0u8; 16];
392            addr_bytes.copy_from_slice(&bytes[8..24]);
393            Some(IpAddr::V6(Ipv6Addr::from(addr_bytes)))
394        }
395        _ => None,
396    }
397}
398
399// ============================================================
// parse_port_from_sockaddr — parse the port field from sockaddr bytes
401// ============================================================
402
403/// Parse TCP port from a sockaddr byte buffer.
404/// Returns None for non-IP families (AF_UNIX etc.).
405fn parse_port_from_sockaddr(bytes: &[u8]) -> Option<u16> {
406    if bytes.len() < 4 {
407        return None;
408    }
409    let family = u16::from_ne_bytes([bytes[0], bytes[1]]) as u32;
410    match family {
411        f if f == AF_INET || f == AF_INET6 => {
412            Some(u16::from_be_bytes([bytes[2], bytes[3]]))
413        }
414        _ => None,
415    }
416}
417
/// Overwrite the port field (bytes 2..4, big-endian) of a sockaddr buffer.
/// A buffer shorter than four bytes is left untouched.
fn set_port_in_sockaddr(bytes: &mut [u8], port: u16) {
    if let Some(field) = bytes.get_mut(2..4) {
        field.copy_from_slice(&port.to_be_bytes());
    }
}
425
426// ============================================================
427// query_socket_protocol — derive the rule Protocol from a fd via getsockopt
428// ============================================================
429
430/// Query `SO_PROTOCOL` on a dup'd socket fd to learn whether to route
431/// the on-behalf check through the TCP, UDP, or ICMP policy.
432///
433/// Returns `None` for protocols sandlock does not gate via `net_allow`
434/// (raw, SCTP, etc.) — the handler treats those as "no rule applies"
435/// which collapses to the default-deny path.
436pub(crate) fn query_socket_protocol(fd: RawFd) -> Option<Protocol> {
437    let mut proto: libc::c_int = 0;
438    let mut len: libc::socklen_t = std::mem::size_of::<libc::c_int>() as libc::socklen_t;
439    let rc = unsafe {
440        libc::getsockopt(
441            fd,
442            libc::SOL_SOCKET,
443            libc::SO_PROTOCOL,
444            &mut proto as *mut _ as *mut libc::c_void,
445            &mut len,
446        )
447    };
448    if rc != 0 {
449        return None;
450    }
451    match proto {
452        libc::IPPROTO_TCP => Some(Protocol::Tcp),
453        libc::IPPROTO_UDP => Some(Protocol::Udp),
454        // IPPROTO_ICMP and IPPROTO_ICMPV6 both route to the ICMP policy
455        // (the policy doesn't distinguish IP versions; the rule's
456        // resolved IP set already covers both via DNS).
457        libc::IPPROTO_ICMP | libc::IPPROTO_ICMPV6 => Some(Protocol::Icmp),
458        _ => None,
459    }
460}
461
462// ============================================================
463// connect_on_behalf — perform connect() on behalf of the child (TOCTOU-safe)
464// ============================================================
465
/// Perform connect() on behalf of the child process (TOCTOU-safe).
///
/// 1. Refuse an oversized `addr_len`, then copy the sockaddr from child
///    memory (our copy — immune to TOCTOU).
/// 2. Non-IP families (AF_UNIX etc.) are handed back to the kernel with
///    `Continue`.
/// 3. Duplicate the child's socket fd, probe `SO_PROTOCOL` on the dup, and
///    check `(ip, port)` against the effective per-protocol policy.
/// 4. Choose the real target: the HTTP ACL proxy for intercepted ports, or
///    the real loopback port when port remapping applies.
/// 5. For redirected connections, bind the socket and record
///    `local addr -> original destination IP` before connecting.
/// 6. connect() in the supervisor with the validated sockaddr and return
///    the result (0 or errno) to the child.
async fn connect_on_behalf(
    notif: &SeccompNotif,
    ctx: &Arc<SupervisorCtx>,
    notif_fd: RawFd,
) -> NotifAction {
    let args = &notif.data.args;
    let sockfd = args[0] as i32;
    let addr_ptr = args[1];
    let addr_len = args[2] as u32;

    // `addr_len` is chosen by the sandboxed process. No socket address is
    // larger than `sockaddr_storage`, so refuse anything bigger up front
    // instead of copying up to 4 GiB out of the child into the supervisor.
    if addr_len as usize > std::mem::size_of::<libc::sockaddr_storage>() {
        return NotifAction::Errno(libc::EINVAL);
    }

    // 1. Copy sockaddr from child memory
    let addr_bytes =
        match read_child_mem(notif_fd, notif.id, notif.pid, addr_ptr, addr_len as usize) {
            Ok(b) => b,
            Err(_) => return NotifAction::Errno(libc::EIO),
        };

    // 2. Non-IP family (AF_UNIX etc.) — allow through
    let ip = match parse_ip_from_sockaddr(&addr_bytes) {
        Some(ip) => ip,
        None => return NotifAction::Continue,
    };

    // 3. Check destination against the per-protocol endpoint allowlist.
    // The dup we'd need anyway for the on-behalf connect doubles as
    // our SO_PROTOCOL probe — one pidfd_getfd, one getsockopt. The
    // per-protocol policy is keyed on whether the socket is TCP / UDP
    // / kernel ping (ICMP). Unknown protocol (raw, SCTP, etc.) fails
    // closed: the BPF should have prevented socket creation, so
    // reaching here with one is an unexpected case worth refusing.
    let dest_port = parse_port_from_sockaddr(&addr_bytes);
    let dup_fd = match crate::seccomp::notif::dup_fd_from_pid(notif.pid, sockfd) {
        Ok(fd) => fd,
        Err(e) => return NotifAction::Errno(e.raw_os_error().unwrap_or(libc::EBADF)),
    };
    let protocol = match query_socket_protocol(dup_fd.as_raw_fd()) {
        Some(p) => p,
        None => return NotifAction::Errno(ECONNREFUSED),
    };
    let ns = ctx.network.lock().await;
    let live_policy = {
        let pfs = ctx.policy_fn.lock().await;
        pfs.live_policy.clone()
    };
    let effective = ns.effective_network_policy(notif.pid, protocol, live_policy.as_ref());
    match (effective, dest_port) {
        (crate::seccomp::notif::NetworkPolicy::Unrestricted, _) => {
            // No rules for this protocol's wildcard — Landlock (TCP
            // only) or the protocol's wildcard rule covers it; no
            // additional check here.
        }
        (policy, Some(p)) => {
            // For ICMP rules every per-IP entry is `PortAllow::Any`,
            // so the port arg from the sockaddr (typically 0 or the
            // ICMP id) is functionally ignored — IP is what matters.
            if !policy.allows(ip, p) {
                return NotifAction::Errno(ECONNREFUSED);
            }
        }
        (_, None) => {
            // Couldn't parse port from sockaddr — fail closed.
            return NotifAction::Errno(ECONNREFUSED);
        }
    }

    // Snapshot everything still needed from the network state, then release
    // the lock before any socket syscalls.
    let http_acl_addr = ns.http_acl_addr;
    let http_acl_intercept = dest_port.map_or(false, |p| ns.http_acl_ports.contains(&p));
    let http_acl_orig_dest = ns.http_acl_orig_dest.clone();
    let remapped_loopback_port = if ctx.policy.port_remap && ip.is_loopback() {
        dest_port.and_then(|p| ns.port_map.get_real(p))
    } else {
        None
    };

    drop(ns);

    // 4. Determine the actual connect target (redirect HTTP/HTTPS to proxy)
    let mut redirected = false;
    let is_ipv6 = ip.is_ipv6();
    let (mut connect_addr, connect_len) = if let Some(proxy_addr) = http_acl_addr {
        if http_acl_intercept {
            redirected = true;
            if is_ipv6 {
                // IPv6 socket: redirect via IPv4-mapped IPv6 address
                // (::ffff:127.0.0.1) so it connects to the IPv4 proxy.
                // SAFETY: `sockaddr_in6` is a plain C struct for which the
                // all-zero bit pattern is a valid value.
                let mut sa6: libc::sockaddr_in6 = unsafe { std::mem::zeroed() };
                sa6.sin6_family = libc::AF_INET6 as u16;
                sa6.sin6_port = proxy_addr.port().to_be();
                // Build ::ffff:127.0.0.1
                let mapped = std::net::Ipv6Addr::from(
                    match proxy_addr {
                        std::net::SocketAddr::V4(v4) => v4.ip().to_ipv6_mapped(),
                        std::net::SocketAddr::V6(v6) => *v6.ip(),
                    }
                );
                sa6.sin6_addr.s6_addr = mapped.octets();
                // SAFETY: `sa6` is a live local and exactly
                // `size_of::<sockaddr_in6>()` bytes are read from it.
                let bytes = unsafe {
                    std::slice::from_raw_parts(
                        &sa6 as *const _ as *const u8,
                        std::mem::size_of::<libc::sockaddr_in6>(),
                    )
                }
                .to_vec();
                (bytes, std::mem::size_of::<libc::sockaddr_in6>() as u32)
            } else {
                // IPv4 socket: redirect directly.
                // SAFETY: `sockaddr_in` is a plain C struct for which the
                // all-zero bit pattern is a valid value.
                let mut sa: libc::sockaddr_in = unsafe { std::mem::zeroed() };
                sa.sin_family = libc::AF_INET as u16;
                sa.sin_port = proxy_addr.port().to_be();
                match proxy_addr {
                    std::net::SocketAddr::V4(v4) => {
                        sa.sin_addr.s_addr = u32::from_ne_bytes(v4.ip().octets());
                    }
                    std::net::SocketAddr::V6(_) => {
                        // Proxy always binds to 127.0.0.1
                        return NotifAction::Errno(libc::EAFNOSUPPORT);
                    }
                }
                // SAFETY: `sa` is a live local and exactly
                // `size_of::<sockaddr_in>()` bytes are read from it.
                let bytes = unsafe {
                    std::slice::from_raw_parts(
                        &sa as *const _ as *const u8,
                        std::mem::size_of::<libc::sockaddr_in>(),
                    )
                }
                .to_vec();
                (bytes, std::mem::size_of::<libc::sockaddr_in>() as u32)
            }
        } else {
            (addr_bytes.clone(), addr_len)
        }
    } else {
        (addr_bytes.clone(), addr_len)
    };
    if !redirected {
        if let Some(real_port) = remapped_loopback_port {
            // The child sees virtual ports via getsockname(); connect
            // still has to target the real bound loopback port.
            set_port_in_sockaddr(&mut connect_addr, real_port);
        }
    }

    // (The supervisor-side dup is the same fd we already created
    // for the SO_PROTOCOL probe above — reuse it rather than
    // pidfd_getfd-ing a second time.)

    // 5. Record original dest IP *before* connect to prevent TOCTOU race:
    //    the proxy may receive the request before we write the mapping if
    //    we do it after connect(). The original IP is `ip`, parsed from
    //    addr_bytes (our immune copy).
    if redirected {
        if let Some(ref orig_dest_map) = http_acl_orig_dest {
            let orig_ip = ip;
            // Bind the socket so getsockname() returns the local addr
            // the proxy will see as client_addr. The bind result is
            // deliberately not checked (best effort); only a successful
            // getsockname() produces a map entry.
            if is_ipv6 {
                // SAFETY: all-zero is a valid `sockaddr_in6`.
                let mut bind_sa6: libc::sockaddr_in6 = unsafe { std::mem::zeroed() };
                bind_sa6.sin6_family = libc::AF_INET6 as u16;
                // port 0 + IN6ADDR_ANY = kernel picks ephemeral port
                // SAFETY: the pointer and length describe the live local
                // `bind_sa6`; `dup_fd` is an open fd owned by this function.
                unsafe {
                    libc::bind(
                        dup_fd.as_raw_fd(),
                        &bind_sa6 as *const _ as *const libc::sockaddr,
                        std::mem::size_of::<libc::sockaddr_in6>() as libc::socklen_t,
                    );
                }
                // SAFETY: all-zero is a valid `sockaddr_in6`.
                let mut local_sa6: libc::sockaddr_in6 = unsafe { std::mem::zeroed() };
                let mut local_len: libc::socklen_t =
                    std::mem::size_of::<libc::sockaddr_in6>() as libc::socklen_t;
                // SAFETY: `local_len` is the exact size of `local_sa6`, so
                // the kernel writes only inside that struct.
                let gs_ret = unsafe {
                    libc::getsockname(
                        dup_fd.as_raw_fd(),
                        &mut local_sa6 as *mut _ as *mut libc::sockaddr,
                        &mut local_len,
                    )
                };
                if gs_ret == 0 {
                    let local_port = u16::from_be(local_sa6.sin6_port);
                    let local_ip = Ipv6Addr::from(local_sa6.sin6_addr.s6_addr);
                    let local_addr = std::net::SocketAddr::V6(
                        std::net::SocketAddrV6::new(local_ip, local_port, 0, 0),
                    );
                    if let Ok(mut map) = orig_dest_map.write() {
                        map.insert(local_addr, orig_ip);
                    }
                }
            } else {
                // SAFETY: all-zero is a valid `sockaddr_in`.
                let mut bind_sa: libc::sockaddr_in = unsafe { std::mem::zeroed() };
                bind_sa.sin_family = libc::AF_INET as u16;
                // port 0 + INADDR_ANY = kernel picks ephemeral port
                // SAFETY: the pointer and length describe the live local
                // `bind_sa`; `dup_fd` is an open fd owned by this function.
                unsafe {
                    libc::bind(
                        dup_fd.as_raw_fd(),
                        &bind_sa as *const _ as *const libc::sockaddr,
                        std::mem::size_of::<libc::sockaddr_in>() as libc::socklen_t,
                    );
                }
                // SAFETY: all-zero is a valid `sockaddr_in`.
                let mut local_sa: libc::sockaddr_in = unsafe { std::mem::zeroed() };
                let mut local_len: libc::socklen_t =
                    std::mem::size_of::<libc::sockaddr_in>() as libc::socklen_t;
                // SAFETY: `local_len` is the exact size of `local_sa`, so
                // the kernel writes only inside that struct.
                let gs_ret = unsafe {
                    libc::getsockname(
                        dup_fd.as_raw_fd(),
                        &mut local_sa as *mut _ as *mut libc::sockaddr,
                        &mut local_len,
                    )
                };
                if gs_ret == 0 {
                    let local_port = u16::from_be(local_sa.sin_port);
                    let local_ip = Ipv4Addr::from(u32::from_be(local_sa.sin_addr.s_addr));
                    let local_addr = std::net::SocketAddr::V4(
                        std::net::SocketAddrV4::new(local_ip, local_port),
                    );
                    if let Ok(mut map) = orig_dest_map.write() {
                        map.insert(local_addr, orig_ip);
                    }
                }
            }
        }
    }

    // 6. Perform connect in supervisor with our validated sockaddr
    // SAFETY: `connect_addr` is a live Vec, and `connect_len` is either the
    // size of the sockaddr struct it was built from or `addr_len`, the
    // length requested when `addr_bytes` was copied.
    let ret = unsafe {
        libc::connect(
            dup_fd.as_raw_fd(),
            connect_addr.as_ptr() as *const libc::sockaddr,
            connect_len as libc::socklen_t,
        )
    };

    // Return result.
    // On failure, the stale orig_dest entry is harmless: the proxy never
    // sees this connection, and the entry will be cleaned up on the next
    // successful request from the same local address (or on shutdown).
    if ret == 0 {
        NotifAction::ReturnValue(0)
    } else {
        // SAFETY: `__errno_location` returns this thread's errno slot; it is
        // read immediately after the failing connect().
        let errno = unsafe { *libc::__errno_location() };
        NotifAction::Errno(errno)
    }
    // dup_fd dropped here, closing supervisor's copy
}
715
716// ============================================================
717// sendto_on_behalf / sendmsg_on_behalf — on-behalf (TOCTOU-safe)
718// ============================================================
719
/// Perform `sendto()` on behalf of the child process (TOCTOU-safe).
///
/// Flow:
/// 1. `addr_ptr == NULL` (connected socket) → `Continue`; there is no
///    destination to validate and nothing is copied.
/// 2. Copy the sockaddr from child memory (our copy — immune to TOCTOU).
///    Lengths above `sizeof(struct sockaddr_storage)` are rejected with
///    `EINVAL`, as the kernel does, so the child cannot make the
///    supervisor allocate an arbitrarily large buffer.
/// 3. Non-IP family (AF_UNIX etc.) → `Continue`.
/// 4. Enforce `MAX_SEND_BUF` on the payload. This is done only on the
///    on-behalf path: the `Continue` paths never copy the payload, so a
///    large `send()` on a connected socket must not be refused here.
/// 5. Duplicate the child's socket, resolve its protocol, and check
///    `(ip, port)` against the effective per-protocol policy.
/// 6. Copy the payload and call `sendto()` in the supervisor with the
///    validated sockaddr copy.
///
/// Returns `ReturnValue(bytes_sent)` on success, otherwise `Errno`:
/// `ECONNREFUSED` for a policy denial, an unparsable port, or an
/// unrecognized socket protocol; `EIO` for child-memory read failures;
/// `EMSGSIZE` for an oversized payload; or the errno reported by
/// `pidfd_getfd` / `sendto`.
async fn sendto_on_behalf(
    notif: &SeccompNotif,
    ctx: &Arc<SupervisorCtx>,
    notif_fd: RawFd,
) -> NotifAction {
    let args = &notif.data.args;
    let sockfd = args[0] as i32;
    let buf_ptr = args[1];
    let buf_len = args[2] as usize;
    let flags = args[3] as i32;
    let addr_ptr = args[4];
    let addr_len = args[5] as u32 as usize;

    if addr_ptr == 0 {
        return NotifAction::Continue; // connected socket, no addr to check
    }

    // The kernel refuses address lengths beyond sockaddr_storage with
    // EINVAL; mirror that instead of copying up to 4 GiB from the child.
    if addr_len > std::mem::size_of::<libc::sockaddr_storage>() {
        return NotifAction::Errno(libc::EINVAL);
    }

    // 1. Copy sockaddr from child memory (small: 16-28 bytes)
    let addr_bytes = match read_child_mem(notif_fd, notif.id, notif.pid, addr_ptr, addr_len) {
        Ok(b) => b,
        Err(_) => return NotifAction::Errno(libc::EIO),
    };

    let Some(ip) = parse_ip_from_sockaddr(&addr_bytes) else {
        // Non-IP family (AF_UNIX etc.) — allow through
        return NotifAction::Continue;
    };
    let dest_port = parse_port_from_sockaddr(&addr_bytes);

    // Only the on-behalf path copies the payload into the supervisor, so
    // this is the only place the size cap needs to apply.
    if buf_len > MAX_SEND_BUF {
        return NotifAction::Errno(libc::EMSGSIZE);
    }

    // 2. Check (ip, port) against the per-protocol endpoint allowlist.
    // One pidfd_getfd serves both the SO_PROTOCOL probe and the
    // on-behalf sendto.
    let dup_fd = match crate::seccomp::notif::dup_fd_from_pid(notif.pid, sockfd) {
        Ok(fd) => fd,
        Err(e) => return NotifAction::Errno(e.raw_os_error().unwrap_or(libc::EBADF)),
    };
    let protocol = match query_socket_protocol(dup_fd.as_raw_fd()) {
        Some(p) => p,
        None => return NotifAction::Errno(ECONNREFUSED),
    };
    {
        // Lock order (network, then policy_fn) is kept as in the other
        // on-behalf handlers; both guards are released at the end of
        // this scope, before any payload is copied.
        let ns = ctx.network.lock().await;
        let live_policy = {
            let pfs = ctx.policy_fn.lock().await;
            pfs.live_policy.clone()
        };
        let effective = ns.effective_network_policy(notif.pid, protocol, live_policy.as_ref());
        if !matches!(effective, crate::seccomp::notif::NetworkPolicy::Unrestricted) {
            match dest_port {
                Some(p) if !effective.allows(ip, p) => {
                    return NotifAction::Errno(ECONNREFUSED);
                }
                // No port could be parsed from the sockaddr: fail closed.
                None => return NotifAction::Errno(ECONNREFUSED),
                Some(_) => {}
            }
        }
    }

    // 3. Copy data buffer from child memory
    let data = match read_child_mem(notif_fd, notif.id, notif.pid, buf_ptr, buf_len) {
        Ok(b) => b,
        Err(_) => return NotifAction::Errno(libc::EIO),
    };

    // 4. Perform sendto in supervisor with validated sockaddr + copied data.
    //
    // MSG_NOSIGNAL: the send executes in the supervisor, so a SIGPIPE
    // would be raised here rather than in the child; the child still
    // observes EPIPE through the returned errno.
    //
    // SAFETY: `data` and `addr_bytes` are live buffers owned by this
    // function; the lengths passed are their actual lengths (not the
    // child-supplied ones), so the kernel never reads past them.
    let ret = unsafe {
        libc::sendto(
            dup_fd.as_raw_fd(),
            data.as_ptr() as *const libc::c_void,
            data.len(),
            flags | libc::MSG_NOSIGNAL,
            addr_bytes.as_ptr() as *const libc::sockaddr,
            addr_bytes.len() as libc::socklen_t,
        )
    };

    // 5. Return result
    if ret >= 0 {
        NotifAction::ReturnValue(ret as i64)
    } else {
        NotifAction::Errno(
            io::Error::last_os_error()
                .raw_os_error()
                .unwrap_or(libc::EIO),
        )
    }
    // dup_fd dropped here, closing supervisor's copy
}
820
/// Handle an intercepted `sendmsg()` by executing it in the supervisor.
///
/// The work is split across two helpers:
/// - `prescan_msghdr` decides whether the call can be handed back to the
///   kernel (`Continue`), must fail outright, or needs the on-behalf path;
/// - `send_msghdr_on_behalf` re-reads the msghdr, applies the network
///   policy, and performs the actual `sendmsg()` on a duplicate of the
///   child's socket.
///
/// An unreadable msghdr yields an errno from the prescan rather than
/// `Continue`, so the kernel is never left to re-read child memory that
/// the supervisor could not inspect.
///
/// Returns `ReturnValue(bytes_sent)` on success, `Continue` for the
/// pass-through shapes, and `Errno` otherwise (`ECONNREFUSED` when the
/// socket protocol cannot be determined).
async fn sendmsg_on_behalf(
    notif: &SeccompNotif,
    ctx: &Arc<SupervisorCtx>,
    notif_fd: RawFd,
) -> NotifAction {
    let sockfd = notif.data.args[0] as i32;
    let msghdr_ptr = notif.data.args[1];
    let flags = notif.data.args[2] as i32;

    // Decide up front whether this message takes the on-behalf path.
    let verdict = prescan_msghdr(notif, notif_fd, msghdr_ptr);
    if let PrescanResult::ContinueWholeCall = verdict {
        return NotifAction::Continue;
    }
    if let PrescanResult::Errno(code) = verdict {
        return NotifAction::Errno(code);
    }

    // One duplicated fd serves both the protocol probe and the send.
    let child_sock = match crate::seccomp::notif::dup_fd_from_pid(notif.pid, sockfd) {
        Err(e) => return NotifAction::Errno(e.raw_os_error().unwrap_or(libc::EBADF)),
        Ok(fd) => fd,
    };
    let Some(protocol) = query_socket_protocol(child_sock.as_raw_fd()) else {
        return NotifAction::Errno(ECONNREFUSED);
    };

    let outcome =
        send_msghdr_on_behalf(notif, ctx, notif_fd, &child_sock, protocol, msghdr_ptr, flags)
            .await;
    match outcome {
        Err(errno) => NotifAction::Errno(errno),
        Ok(sent) => NotifAction::ReturnValue(sent as i64),
    }
}
865
866// ============================================================
867// prescan_msghdr / send_msghdr_on_behalf — shared per-message work
868// ============================================================
869
/// Outcome of inspecting one `struct msghdr` in `prescan_msghdr`.
#[derive(Clone, Copy)]
enum PrescanResult {
    /// The message names an IP-family destination; the caller should
    /// proceed with `send_msghdr_on_behalf`.
    OnBehalf,
    /// The message has no destination (`msg_name == NULL`, i.e. a
    /// connected socket) or names a non-IP family such as AF_UNIX. The
    /// caller should answer with `NotifAction::Continue` so the kernel
    /// runs the syscall in the child's own context; AF_UNIX path
    /// resolution is the main reason such messages are not sent on
    /// behalf.
    ContinueWholeCall,
    /// The message could not be inspected. Carries the errno chosen by
    /// the prescan (`EFAULT` when the msghdr itself is unreadable, `EIO`
    /// when its sockaddr is) for the caller to return unchanged.
    Errno(i32),
}
885
886/// Probe one `struct msghdr` to decide whether the on-behalf path
887/// applies. Used by both `sendmsg_on_behalf` (one msghdr) and
888/// `sendmmsg_on_behalf` (one per `mmsghdr` entry, before doing any
889/// sends — Continue is a whole-syscall decision).
890fn prescan_msghdr(
891    notif: &SeccompNotif,
892    notif_fd: RawFd,
893    msghdr_ptr: u64,
894) -> PrescanResult {
895    let msghdr_bytes = match read_child_mem(notif_fd, notif.id, notif.pid, msghdr_ptr, 56) {
896        Ok(b) if b.len() >= 56 => b,
897        _ => return PrescanResult::Errno(libc::EFAULT),
898    };
899    let msg_name_ptr = u64::from_ne_bytes(msghdr_bytes[0..8].try_into().unwrap());
900    if msg_name_ptr == 0 {
901        return PrescanResult::ContinueWholeCall;
902    }
903    let msg_namelen = u32::from_ne_bytes(msghdr_bytes[8..12].try_into().unwrap());
904    let addr_bytes = match read_child_mem(notif_fd, notif.id, notif.pid, msg_name_ptr, msg_namelen as usize) {
905        Ok(b) => b,
906        Err(_) => return PrescanResult::Errno(libc::EIO),
907    };
908    if parse_ip_from_sockaddr(&addr_bytes).is_none() {
909        return PrescanResult::ContinueWholeCall;
910    }
911    PrescanResult::OnBehalf
912}
913
/// Validate, materialize, and send one `struct msghdr` on behalf of
/// the child. Caller is responsible for:
///   - dup'ing the child fd (`dup_fd`),
///   - resolving the socket protocol (`protocol`) via
///     `query_socket_protocol` on that dup,
///   - having confirmed via `prescan_msghdr` that `msghdr_ptr` points
///     at an IP-family destination (non-NULL `msg_name`).
///
/// The msghdr is re-read here, so every decision is made on the
/// supervisor's own copy; if the child changed it since the prescan the
/// call fails closed.
///
/// Returns the byte count returned by `sendmsg`, or an errno suitable
/// for `NotifAction::Errno`:
///   - `EFAULT`: msghdr unreadable or short;
///   - `EINVAL`: negative `msg_namelen`;
///   - `EAFNOSUPPORT`: sockaddr no longer parses as an IP destination;
///   - `ECONNREFUSED`: destination blocked by policy, or no port could
///     be parsed from the sockaddr;
///   - `EMSGSIZE`: more than 1024 iovecs, a single iovec above
///     `MAX_SEND_BUF`, or a total payload above `MAX_SEND_BUF`;
///   - `ENOBUFS`: control data larger than the supervisor's 4096-byte cap;
///   - `EIO`: sub-buffer read failures (sockaddr / iovec / control);
///   - otherwise the errno reported by `sendmsg` itself.
async fn send_msghdr_on_behalf(
    notif: &SeccompNotif,
    ctx: &Arc<SupervisorCtx>,
    notif_fd: RawFd,
    dup_fd: &std::os::unix::io::OwnedFd,
    protocol: Protocol,
    msghdr_ptr: u64,
    flags: i32,
) -> Result<isize, i32> {
    /// Size of `struct msghdr` on 64-bit Linux.
    const MSGHDR_SIZE: usize = 56;
    /// Size of `struct iovec` on 64-bit Linux (pointer + length).
    const IOVEC_SIZE: usize = 16;
    /// Linux `UIO_MAXIOV`: the kernel fails sendmsg with EMSGSIZE above this.
    const MAX_IOV_ENTRIES: u64 = 1024;
    /// Largest ancillary-data buffer copied into the supervisor.
    const MAX_CONTROL_LEN: u64 = 4096;

    let msghdr_bytes =
        match read_child_mem(notif_fd, notif.id, notif.pid, msghdr_ptr, MSGHDR_SIZE) {
            Ok(b) if b.len() >= MSGHDR_SIZE => b,
            _ => return Err(libc::EFAULT),
        };
    let msg_name_ptr = u64::from_ne_bytes(msghdr_bytes[0..8].try_into().unwrap());
    let msg_namelen = i32::from_ne_bytes(msghdr_bytes[8..12].try_into().unwrap());
    let msg_iov_ptr = u64::from_ne_bytes(msghdr_bytes[16..24].try_into().unwrap());
    let msg_iovlen = u64::from_ne_bytes(msghdr_bytes[24..32].try_into().unwrap());
    let msg_control_ptr = u64::from_ne_bytes(msghdr_bytes[32..40].try_into().unwrap());
    let msg_controllen = u64::from_ne_bytes(msghdr_bytes[40..48].try_into().unwrap());

    // Same bounds the kernel applies to msg_namelen: negative is EINVAL,
    // anything above sockaddr_storage is clamped. Without this the child
    // could request a multi-gigabyte copy.
    if msg_namelen < 0 {
        return Err(libc::EINVAL);
    }
    let name_len = (msg_namelen as usize).min(std::mem::size_of::<libc::sockaddr_storage>());

    let addr_bytes = match read_child_mem(notif_fd, notif.id, notif.pid, msg_name_ptr, name_len) {
        Ok(b) => b,
        Err(_) => return Err(libc::EIO),
    };
    let ip = match parse_ip_from_sockaddr(&addr_bytes) {
        Some(ip) => ip,
        // Caller pre-checks via prescan_msghdr; reaching this branch
        // means the sockaddr changed under us between the prescan and
        // here. Fail closed.
        None => return Err(libc::EAFNOSUPPORT),
    };
    let dest_port = parse_port_from_sockaddr(&addr_bytes);

    {
        // Lock order (network, then policy_fn) matches the other
        // handlers; both guards are released before any payload copy.
        let ns = ctx.network.lock().await;
        let live_policy = {
            let pfs = ctx.policy_fn.lock().await;
            pfs.live_policy.clone()
        };
        let effective = ns.effective_network_policy(notif.pid, protocol, live_policy.as_ref());
        if !matches!(effective, crate::seccomp::notif::NetworkPolicy::Unrestricted) {
            match dest_port {
                Some(p) if !effective.allows(ip, p) => return Err(ECONNREFUSED),
                // No port could be parsed from the sockaddr: fail closed.
                None => return Err(ECONNREFUSED),
                Some(_) => {}
            }
        }
    }

    // No `.await` below this point: the iovec / msghdr values built here
    // hold raw pointers and must not live across a suspension point.

    // Refuse rather than truncate: sending only the first 1024 buffers
    // would silently drop payload. The kernel reports EMSGSIZE here too.
    if msg_iovlen > MAX_IOV_ENTRIES {
        return Err(libc::EMSGSIZE);
    }
    let iovlen = msg_iovlen as usize;
    let iov_array_len = iovlen * IOVEC_SIZE;
    let iov_bytes =
        match read_child_mem(notif_fd, notif.id, notif.pid, msg_iov_ptr, iov_array_len) {
            // A short read would otherwise drop trailing buffers silently.
            Ok(b) if b.len() >= iov_array_len => b,
            _ => return Err(libc::EIO),
        };

    let mut data_bufs: Vec<Vec<u8>> = Vec::with_capacity(iovlen);
    let mut total_len: usize = 0;
    for entry in iov_bytes.chunks_exact(IOVEC_SIZE).take(iovlen) {
        let iov_base = u64::from_ne_bytes(entry[0..8].try_into().unwrap());
        let iov_len = u64::from_ne_bytes(entry[8..16].try_into().unwrap()) as usize;
        if iov_len > MAX_SEND_BUF {
            return Err(libc::EMSGSIZE);
        }
        if iov_base == 0 || iov_len == 0 {
            data_bufs.push(Vec::new());
            continue;
        }
        // The cap is on the whole message, not just each buffer:
        // otherwise 1024 buffers of MAX_SEND_BUF each could be copied
        // into the supervisor in a single call.
        total_len += iov_len;
        if total_len > MAX_SEND_BUF {
            return Err(libc::EMSGSIZE);
        }
        let buf = match read_child_mem(notif_fd, notif.id, notif.pid, iov_base, iov_len) {
            Ok(b) => b,
            Err(_) => return Err(libc::EIO),
        };
        data_bufs.push(buf);
    }
    // Built only after `data_bufs` is complete so the pointers stay valid.
    let mut local_iovs: Vec<libc::iovec> = data_bufs
        .iter()
        .map(|buf| libc::iovec {
            iov_base: buf.as_ptr() as *mut libc::c_void,
            iov_len: buf.len(),
        })
        .collect();

    let control_buf = if msg_control_ptr != 0 && msg_controllen > 0 {
        // Truncated ancillary data would change the meaning of the
        // message, so oversized or unreadable control buffers fail the
        // send instead of being cut short or dropped.
        if msg_controllen > MAX_CONTROL_LEN {
            return Err(libc::ENOBUFS);
        }
        match read_child_mem(
            notif_fd,
            notif.id,
            notif.pid,
            msg_control_ptr,
            msg_controllen as usize,
        ) {
            Ok(b) => Some(b),
            Err(_) => return Err(libc::EIO),
        }
    } else {
        None
    };

    // SAFETY: `msghdr` is a plain C struct of integers and raw pointers,
    // for which the all-zero bit pattern is valid.
    let mut msg: libc::msghdr = unsafe { std::mem::zeroed() };
    msg.msg_name = addr_bytes.as_ptr() as *mut libc::c_void;
    msg.msg_namelen = addr_bytes.len() as libc::socklen_t;
    msg.msg_iov = local_iovs.as_mut_ptr();
    // `as _`: the field's integer type differs between libc targets.
    msg.msg_iovlen = local_iovs.len() as _;
    if let Some(ctrl) = control_buf.as_ref() {
        msg.msg_control = ctrl.as_ptr() as *mut libc::c_void;
        msg.msg_controllen = ctrl.len() as _;
    }

    // MSG_NOSIGNAL: the send executes in the supervisor, so a SIGPIPE
    // would be raised here rather than in the child; the child still
    // observes EPIPE through the returned errno.
    //
    // SAFETY: every pointer in `msg` refers to a buffer owned by this
    // function (`addr_bytes`, `local_iovs`, `data_bufs`, `control_buf`)
    // that outlives the call, and every length is that buffer's real
    // length.
    let ret = unsafe { libc::sendmsg(dup_fd.as_raw_fd(), &msg, flags | libc::MSG_NOSIGNAL) };
    if ret >= 0 {
        Ok(ret)
    } else {
        Err(io::Error::last_os_error()
            .raw_os_error()
            .unwrap_or(libc::EIO))
    }
}
1031
1032// ============================================================
1033// sendmmsg_on_behalf — multi-message variant
1034// ============================================================
1035
/// Byte offset of the `msg_len` field inside `struct mmsghdr` on Linux
/// x86_64 / aarch64: it follows the embedded 56-byte `struct msghdr`.
const MSG_LEN_OFFSET: usize = 56;
/// Size of one `struct mmsghdr` on Linux x86_64 / aarch64: the 56-byte
/// msghdr, the 4-byte `msg_len`, and 4 bytes of tail padding (64 total).
const MMSGHDR_SIZE: usize = MSG_LEN_OFFSET + 4 + 4;
/// Upper bound on the messages handled per sendmmsg call. Linux's
/// UIO_MAXIOV is 1024; a lower value is used here to bound supervisor
/// work per syscall, since every entry costs at least a few
/// read_child_mem hops plus one sendmsg.
const MAX_MMSGHDR_ENTRIES: usize = 256;
1046
/// Perform `sendmmsg()` on behalf of the child.
///
/// Every entry the kernel would process (`vlen`, clamped to UIO_MAXIOV
/// = 1024 as the kernel does) is pre-scanned with `prescan_msghdr`:
///
/// - If **no** entry names an IP destination (all have a NULL
///   `msg_name` or a non-IP family), the syscall is handed back to the
///   kernel with `Continue`.
/// - If **every** entry names an IP destination, the child fd is dup'd
///   once, SO_PROTOCOL is queried once, and up to
///   `MAX_MMSGHDR_ENTRIES` entries are processed in order:
///   validate → send → write `msg_len` back to the child's mmsghdr.
/// - Mixed calls cannot be split, because `Continue` applies to the
///   whole syscall. Continuing would let the kernel send the
///   IP-addressed entries without any policy check, so a mixed call is
///   only continued when the socket's effective policy is
///   `Unrestricted`; otherwise it is refused with `ECONNREFUSED`.
///
/// On partial failure (entry K denied or send fails), returns
/// `ReturnValue(K)` matching the kernel's "messages successfully
/// transmitted" semantics. The errno is returned only when the very
/// first entry fails.
async fn sendmmsg_on_behalf(
    notif: &SeccompNotif,
    ctx: &Arc<SupervisorCtx>,
    notif_fd: RawFd,
) -> NotifAction {
    /// Linux `UIO_MAXIOV`: the kernel clamps sendmmsg's `vlen` to this,
    /// so it is the number of entries a `Continue` would let through.
    const KERNEL_VLEN_CAP: usize = 1024;

    let args = &notif.data.args;
    let sockfd = args[0] as i32;
    let msgvec_ptr = args[1];
    let kernel_vlen = (args[2] as u32 as usize).min(KERNEL_VLEN_CAP);
    let vlen = kernel_vlen.min(MAX_MMSGHDR_ENTRIES);
    let flags = args[3] as i32;

    if vlen == 0 {
        return NotifAction::ReturnValue(0);
    }

    // `msgvec_ptr` is child-controlled; checked arithmetic avoids an
    // overflow panic (debug) or silent wrap-around (release).
    let entry_ptr_at = move |i: usize| msgvec_ptr.checked_add((i * MMSGHDR_SIZE) as u64);

    // Pre-scan every entry the kernel would look at, not just the ones
    // this handler would send: a Continue exposes all of them.
    let mut continue_shaped: usize = 0;
    let mut on_behalf_shaped: usize = 0;
    for i in 0..kernel_vlen {
        let Some(entry_ptr) = entry_ptr_at(i) else {
            return NotifAction::Errno(libc::EFAULT);
        };
        match prescan_msghdr(notif, notif_fd, entry_ptr) {
            PrescanResult::OnBehalf => on_behalf_shaped += 1,
            PrescanResult::ContinueWholeCall => continue_shaped += 1,
            PrescanResult::Errno(e) => return NotifAction::Errno(e),
        }
    }

    // Nothing here targets an IP destination: let the kernel handle it.
    if on_behalf_shaped == 0 {
        return NotifAction::Continue;
    }

    let dup_fd = match crate::seccomp::notif::dup_fd_from_pid(notif.pid, sockfd) {
        Ok(fd) => fd,
        Err(e) => return NotifAction::Errno(e.raw_os_error().unwrap_or(libc::EBADF)),
    };
    let protocol = match query_socket_protocol(dup_fd.as_raw_fd()) {
        Some(p) => p,
        None => return NotifAction::Errno(ECONNREFUSED),
    };

    if continue_shaped > 0 {
        // Mixed shapes. Continuing would bypass the destination check
        // for the IP-addressed entries, which only matters when there is
        // a policy to enforce.
        let unrestricted = {
            let ns = ctx.network.lock().await;
            let live_policy = {
                let pfs = ctx.policy_fn.lock().await;
                pfs.live_policy.clone()
            };
            let effective =
                ns.effective_network_policy(notif.pid, protocol, live_policy.as_ref());
            matches!(effective, crate::seccomp::notif::NetworkPolicy::Unrestricted)
        };
        return if unrestricted {
            NotifAction::Continue
        } else {
            NotifAction::Errno(ECONNREFUSED)
        };
    }

    let mut sent: usize = 0;
    let mut first_errno: Option<i32> = None;

    for i in 0..vlen {
        let Some(entry_ptr) = entry_ptr_at(i) else {
            first_errno = Some(libc::EFAULT);
            break;
        };
        match send_msghdr_on_behalf(notif, ctx, notif_fd, &dup_fd, protocol, entry_ptr, flags)
            .await
        {
            Ok(n) => {
                // Report the per-message byte count back through the
                // child's mmsghdr. Best effort: the message has already
                // been sent, so a failed write-back is not turned into
                // an error.
                let bytes = (n as u32).to_ne_bytes();
                let _ = write_child_mem(
                    notif_fd,
                    notif.id,
                    notif.pid,
                    entry_ptr + MSG_LEN_OFFSET as u64,
                    &bytes,
                );
                sent += 1;
            }
            Err(errno) => {
                first_errno = Some(errno);
                break;
            }
        }
    }

    if sent > 0 {
        NotifAction::ReturnValue(sent as i64)
    } else {
        // Defensive: vlen > 0 + no successes means at least one attempt
        // failed, so first_errno is set. Fall back to ECONNREFUSED
        // rather than panicking on the unwrap if invariants ever drift.
        NotifAction::Errno(first_errno.unwrap_or(ECONNREFUSED))
    }
}
1128
// ============================================================
// handle_net — main handler for connect/sendto/sendmsg/sendmmsg
// ============================================================
1132
1133/// Handle network-related notifications (connect, sendto, sendmsg).
1134///
1135/// All three are handled on-behalf (TOCTOU-safe): the supervisor copies data
1136/// from child memory, validates the destination, duplicates the socket via
1137/// pidfd_getfd, and performs the syscall itself. The child's memory is never
1138/// re-read by the kernel after validation.
1139///
1140/// Continue safety (issue #27): the on-behalf paths don't return Continue
1141/// at all (they return ReturnValue/Errno after performing the syscall in
1142/// the supervisor). The Continue cases in this module are:
1143///   1. Non-IP families (AF_UNIX etc.) — the IP allowlist doesn't apply;
1144///      Landlock IPC scoping is the enforcement boundary.
1145///   2. Connected sockets with addr_ptr == 0 — the address was already
1146///      validated at connect time, so the kernel re-read of (nothing) is
1147///      moot.
1148///   3. The fall-through case below — only reachable if the BPF filter
1149///      mis-routes a syscall; the kernel handles it normally.
1150/// In sendmsg_on_behalf, the msghdr read failure path returns
1151/// Errno(EFAULT) rather than Continue: a racing thread that briefly
1152/// unmaps the msghdr could otherwise force a fall-through that lets the
1153/// kernel execute sendmsg without the allowlist check. Sub-buffer read
1154/// failures (sockaddr/iovec/control) already return Errno(EIO) and so
1155/// don't bypass the check either.
1156pub(crate) async fn handle_net(
1157    notif: &SeccompNotif,
1158    ctx: &Arc<SupervisorCtx>,
1159    notif_fd: RawFd,
1160) -> NotifAction {
1161    let nr = notif.data.nr as i64;
1162
1163    if nr == libc::SYS_connect {
1164        connect_on_behalf(notif, ctx, notif_fd).await
1165    } else if nr == libc::SYS_sendto {
1166        sendto_on_behalf(notif, ctx, notif_fd).await
1167    } else if nr == libc::SYS_sendmsg {
1168        sendmsg_on_behalf(notif, ctx, notif_fd).await
1169    } else if nr == libc::SYS_sendmmsg {
1170        sendmmsg_on_behalf(notif, ctx, notif_fd).await
1171    } else {
1172        NotifAction::Continue
1173    }
1174}
1175
1176// ============================================================
1177// resolve_net_allow — resolve --net-allow rules to runtime allowlist
1178// ============================================================
1179
1180/// Resolved form of `Policy::net_allow`, ready for the on-behalf path.
1181pub struct ResolvedNetAllow {
1182    /// Per-IP port rules (each concrete-host entry resolves to one or
1183    /// more IPs). An IP appearing here with an empty port set means
1184    /// "all ports for this IP" (from a `host:*` rule).
1185    pub per_ip: HashMap<IpAddr, HashSet<u16>>,
1186    /// IPs permitted on every port (from `host:*` rules after host
1187    /// resolution). The on-behalf path treats these the same as
1188    /// `PortAllow::Any` — the entry in `per_ip` is kept as a
1189    /// placeholder for diagnostic / `/etc/hosts` purposes.
1190    pub per_ip_all_ports: HashSet<IpAddr>,
1191    /// IP/CIDR-literal targets, matched by containment with no DNS (an
1192    /// exact IP literal is a `/32` or `/128`). Each carries the ports
1193    /// permitted to that range (`PortAllow::Any` for all-ports rules).
1194    pub cidrs: Vec<(IpCidr, crate::seccomp::notif::PortAllow)>,
1195    /// Ports permitted to any IP (the `:port` form).
1196    pub any_ip_ports: HashSet<u16>,
1197    /// Any-host any-port wildcard (`:*` / `*:*`, or `icmp://*`). When
1198    /// true, the per-protocol policy becomes `Unrestricted` and the
1199    /// on-behalf check is bypassed for that protocol.
1200    pub any_ip_all_ports: bool,
1201}
1202
1203/// Per-protocol resolved allowlists. Each protocol gets its own
1204/// `ResolvedNetAllow`; the on-behalf path picks the right one based on
1205/// the dup'd fd's `SO_PROTOCOL`. `etc_hosts` is shared across all
1206/// protocols (the synthetic file maps every concrete host that appears
1207/// in any rule).
1208pub struct ResolvedNetAllowSet {
1209    pub tcp: ResolvedNetAllow,
1210    pub udp: ResolvedNetAllow,
1211    pub icmp: ResolvedNetAllow,
1212    /// `<ip> <hostname>\n` lines from every concrete-host rule across
1213    /// every protocol, in resolution order. Empty when no concrete-host
1214    /// rules are present. Combined with the loopback base (or, in chroot
1215    /// mode, the image's `/etc/hosts`) by [`compose_virtual_etc_hosts`]
1216    /// to build the synthetic file served to the sandbox.
1217    pub concrete_host_entries: String,
1218}
1219
1220/// Resolve `--net-allow` rules into per-protocol runtime allowlists.
1221///
1222/// Rules are grouped by `Protocol` and each group is resolved
1223/// independently. ICMP rules carry no ports, so the resulting ICMP
1224/// `ResolvedNetAllow` always has empty `any_ip_ports` / per-IP port
1225/// sets — the on-behalf check routes ICMP through the IP-only path
1226/// (PortAllow::Any). A `*` host on ICMP becomes `any_ip_all_ports`,
1227/// which the handler reads as "no destination check."
1228pub async fn resolve_net_allow(
1229    rules: &[NetAllow],
1230) -> io::Result<ResolvedNetAllowSet> {
1231    use crate::seccomp::notif::PortAllow;
1232    let per_proto = |target: Protocol| async move {
1233        let mut per_ip: HashMap<IpAddr, HashSet<u16>> = HashMap::new();
1234        let mut per_ip_all_ports: HashSet<IpAddr> = HashSet::new();
1235        let mut cidrs: Vec<(IpCidr, PortAllow)> = Vec::new();
1236        let mut any_ip_ports: HashSet<u16> = HashSet::new();
1237        let mut any_ip_all_ports = false;
1238        let mut local_etc_hosts = String::new();
1239
1240        for rule in rules.iter().filter(|r| r.protocol == target) {
1241            match &rule.target {
1242                NetTarget::AnyIp => {
1243                    if rule.all_ports || target == Protocol::Icmp {
1244                        // ICMP rules never carry ports, so a wildcard-host
1245                        // ICMP rule (`icmp://*`) means "any destination."
1246                        any_ip_all_ports = true;
1247                    } else {
1248                        for &p in &rule.ports {
1249                            any_ip_ports.insert(p);
1250                        }
1251                    }
1252                }
1253                NetTarget::Cidr(c) => {
1254                    // IP/CIDR literals are matched by containment with no
1255                    // DNS, exactly like `--net-deny` targets.
1256                    let pa = if rule.all_ports || target == Protocol::Icmp {
1257                        PortAllow::Any
1258                    } else {
1259                        PortAllow::Specific(rule.ports.iter().copied().collect())
1260                    };
1261                    cidrs.push((*c, pa));
1262                }
1263                NetTarget::Host(host) => {
1264                    let addr = format!("{}:0", host);
1265                    let resolved = tokio::net::lookup_host(addr.as_str()).await.map_err(|e| {
1266                        io::Error::new(
1267                            e.kind(),
1268                            format!("failed to resolve host '{}': {}", host, e),
1269                        )
1270                    })?;
1271                    for socket_addr in resolved {
1272                        let ip = socket_addr.ip();
1273                        if rule.all_ports || target == Protocol::Icmp {
1274                            per_ip_all_ports.insert(ip);
1275                            per_ip.entry(ip).or_default();
1276                        } else {
1277                            let entry = per_ip.entry(ip).or_default();
1278                            for &p in &rule.ports {
1279                                entry.insert(p);
1280                            }
1281                        }
1282                        local_etc_hosts.push_str(&format!("{} {}\n", ip, host));
1283                    }
1284                }
1285            }
1286        }
1287
1288        Ok::<_, io::Error>((
1289            ResolvedNetAllow {
1290                per_ip,
1291                per_ip_all_ports,
1292                cidrs,
1293                any_ip_ports,
1294                any_ip_all_ports,
1295            },
1296            local_etc_hosts,
1297        ))
1298    };
1299
1300    let (tcp, tcp_eh) = per_proto(Protocol::Tcp).await?;
1301    let (udp, udp_eh) = per_proto(Protocol::Udp).await?;
1302    let (icmp, icmp_eh) = per_proto(Protocol::Icmp).await?;
1303
1304    let mut concrete_host_entries = String::new();
1305    for chunk in [tcp_eh, udp_eh, icmp_eh] {
1306        concrete_host_entries.push_str(&chunk);
1307    }
1308
1309    Ok(ResolvedNetAllowSet {
1310        tcp,
1311        udp,
1312        icmp,
1313        concrete_host_entries,
1314    })
1315}
1316
1317/// Per-protocol resolved deny policies, ready for `NetworkState`.
1318pub struct ResolvedNetDenySet {
1319    pub tcp: crate::seccomp::notif::NetworkPolicy,
1320    pub udp: crate::seccomp::notif::NetworkPolicy,
1321    pub icmp: crate::seccomp::notif::NetworkPolicy,
1322}
1323
1324/// Resolve `--net-deny` rules into per-protocol `DenyList` policies.
1325/// A protocol with no deny rules stays `Unrestricted` (allow-all).
1326pub fn resolve_net_deny(rules: &[NetDeny]) -> ResolvedNetDenySet {
1327    use crate::seccomp::notif::{NetworkPolicy, PortAllow};
1328
1329    let per_proto = |target: Protocol| -> NetworkPolicy {
1330        let mut cidrs: Vec<(IpCidr, PortAllow)> = Vec::new();
1331        let mut any_ip_ports: HashSet<u16> = HashSet::new();
1332        let mut deny_all = false;
1333        let mut saw_rule = false;
1334
1335        for rule in rules.iter().filter(|r| r.protocol == target) {
1336            saw_rule = true;
1337            match &rule.target {
1338                NetTarget::AnyIp => {
1339                    if rule.all_ports || target == Protocol::Icmp {
1340                        deny_all = true;
1341                    } else {
1342                        for &p in &rule.ports {
1343                            any_ip_ports.insert(p);
1344                        }
1345                    }
1346                }
1347                NetTarget::Cidr(c) => {
1348                    let pa = if rule.all_ports || target == Protocol::Icmp {
1349                        PortAllow::Any
1350                    } else {
1351                        PortAllow::Specific(rule.ports.iter().copied().collect())
1352                    };
1353                    cidrs.push((*c, pa));
1354                }
1355                // `--net-deny` rejects hostnames at parse time, so a deny
1356                // rule never carries a `Host` target.
1357                NetTarget::Host(_) => unreachable!("net-deny rejects hostnames"),
1358            }
1359        }
1360
1361        if !saw_rule {
1362            NetworkPolicy::Unrestricted
1363        } else {
1364            NetworkPolicy::DenyList {
1365                cidrs,
1366                any_ip_ports,
1367                deny_all,
1368            }
1369        }
1370    };
1371
1372    ResolvedNetDenySet {
1373        tcp: per_proto(Protocol::Tcp),
1374        udp: per_proto(Protocol::Udp),
1375        icmp: per_proto(Protocol::Icmp),
1376    }
1377}
1378
/// Build the synthetic `/etc/hosts` content served to the sandbox.
///
/// - **Without a chroot** the result is the fixed loopback base
///   (`127.0.0.1 localhost\n::1 localhost\n`) followed by
///   `concrete_host_entries`, independent of the host's own file.
/// - **With a chroot** `<chroot>/etc/hosts` is read and copied verbatim
///   as the base (a trailing newline is added if it lacks one). A
///   loopback line is then added only for each address family whose
///   `localhost` mapping the image does not already contain, and
///   `concrete_host_entries` is appended last.
///
/// An image line counts as covering a family when, after removing any
/// `#` comment, its first field is exactly `127.0.0.1` (or `::1`) and
/// one of the remaining fields is exactly `localhost`.
///
/// If the chroot's hosts file cannot be read for any reason, the result
/// is the same as without a chroot, so the sandbox always gets a usable
/// file.
pub fn compose_virtual_etc_hosts(
    chroot_root: Option<&std::path::Path>,
    concrete_host_entries: &str,
) -> String {
    const V4_LOOPBACK: &str = "127.0.0.1";
    const V6_LOOPBACK: &str = "::1";

    // `None` both when there is no chroot and when its file is unreadable.
    let image = chroot_root
        .and_then(|root| std::fs::read_to_string(root.join("etc").join("hosts")).ok());

    let mut v4_covered = false;
    let mut v6_covered = false;
    let mut hosts = String::new();

    if let Some(image) = image {
        for line in image.lines() {
            // hosts(5): everything after `#` is a comment.
            let active = line.split('#').next().unwrap_or("");
            let mut fields = active.split_whitespace();
            let Some(addr) = fields.next() else { continue };
            if !fields.any(|name| name == "localhost") {
                continue;
            }
            match addr {
                V4_LOOPBACK => v4_covered = true,
                V6_LOOPBACK => v6_covered = true,
                _ => {}
            }
        }
        hosts.push_str(&image);
        if !hosts.is_empty() && !hosts.ends_with('\n') {
            hosts.push('\n');
        }
    }

    // Fill in whichever loopback mappings the base did not provide.
    if !v4_covered {
        hosts.push_str("127.0.0.1 localhost\n");
    }
    if !v6_covered {
        hosts.push_str("::1 localhost\n");
    }
    hosts.push_str(concrete_host_entries);
    hosts
}
1437
1438// ============================================================
1439// Tests
1440// ============================================================
1441
1442#[cfg(test)]
1443mod tests {
1444    use super::*;
1445
1446    // --- NetAllow::parse tests ---
1447
1448    #[test]
1449    fn netallow_parse_concrete_host_port() {
1450        let r = NetRule::parse_allow("example.com:443").unwrap();
1451        assert!(matches!(&r.target, NetTarget::Host(h) if h == "example.com"));
1452        assert_eq!(r.ports, vec![443]);
1453        assert!(!r.all_ports);
1454    }
1455
1456    #[test]
1457    fn netallow_parse_any_host_port() {
1458        let r = NetRule::parse_allow(":8080").unwrap();
1459        assert_eq!(r.target, NetTarget::AnyIp);
1460        assert_eq!(r.ports, vec![8080]);
1461        assert!(!r.all_ports);
1462
1463        let r = NetRule::parse_allow("*:8080").unwrap();
1464        assert_eq!(r.target, NetTarget::AnyIp);
1465        assert_eq!(r.ports, vec![8080]);
1466        assert!(!r.all_ports);
1467    }
1468
1469    #[test]
1470    fn netallow_parse_multiple_ports() {
1471        let r = NetRule::parse_allow("github.com:22,80,443").unwrap();
1472        assert!(matches!(&r.target, NetTarget::Host(h) if h == "github.com"));
1473        assert_eq!(r.ports, vec![22, 80, 443]);
1474        assert!(!r.all_ports);
1475    }
1476
1477    #[test]
1478    fn netallow_parse_wildcard_any_host_any_port_colon() {
1479        let r = NetRule::parse_allow(":*").unwrap();
1480        assert_eq!(r.target, NetTarget::AnyIp);
1481        assert!(r.ports.is_empty());
1482        assert!(r.all_ports);
1483    }
1484
1485    #[test]
1486    fn netallow_parse_wildcard_any_host_any_port_star() {
1487        let r = NetRule::parse_allow("*:*").unwrap();
1488        assert_eq!(r.target, NetTarget::AnyIp);
1489        assert!(r.ports.is_empty());
1490        assert!(r.all_ports);
1491    }
1492
1493    #[test]
1494    fn netallow_parse_wildcard_concrete_host_any_port() {
1495        let r = NetRule::parse_allow("example.com:*").unwrap();
1496        assert!(matches!(&r.target, NetTarget::Host(h) if h == "example.com"));
1497        assert!(r.ports.is_empty());
1498        assert!(r.all_ports);
1499    }
1500
1501    #[test]
1502    fn netallow_parse_rejects_mixed_wildcard_and_concrete() {
1503        // `host:80,*` and `host:*,80` are both ambiguous: the user
1504        // either meant "any port" (wildcard wins) or "ports 80 plus
1505        // some weird placeholder". Refuse and force a clean spec.
1506        let err = NetRule::parse_allow("example.com:80,*").unwrap_err();
1507        assert!(format!("{}", err).contains("cannot mix"));
1508        let err = NetRule::parse_allow("example.com:*,80").unwrap_err();
1509        assert!(format!("{}", err).contains("cannot mix"));
1510    }
1511
1512    #[test]
1513    fn netallow_parse_rejects_port_zero() {
1514        let err = NetRule::parse_allow("example.com:0").unwrap_err();
1515        assert!(format!("{}", err).contains("port 0"));
1516    }
1517
1518    #[test]
1519    fn netallow_parse_rejects_empty_port() {
1520        let err = NetRule::parse_allow("example.com:").unwrap_err();
1521        assert!(format!("{}", err).contains("invalid port"));
1522    }
1523
1524    #[test]
1525    fn netallow_bare_host_is_all_ports() {
1526        // No port suffix means "all ports" (port optional), symmetric
1527        // with the `host:*` form.
1528        let r = NetRule::parse_allow("example.com").unwrap();
1529        assert!(matches!(&r.target, NetTarget::Host(h) if h == "example.com"));
1530        assert!(r.all_ports);
1531        assert!(r.ports.is_empty());
1532    }
1533
1534    #[test]
1535    fn netallow_bare_star_is_any_host_all_ports() {
1536        let r = NetRule::parse_allow("*").unwrap();
1537        assert_eq!(r.target, NetTarget::AnyIp);
1538        assert!(r.all_ports);
1539        assert!(r.ports.is_empty());
1540    }
1541
1542    #[test]
1543    fn netallow_empty_spec_rejected() {
1544        assert!(NetRule::parse_allow("").is_err());
1545        assert!(NetRule::parse_allow("tcp://").is_err());
1546    }
1547
1548    #[test]
1549    fn netallow_cidr_target_with_port() {
1550        // CIDR ranges are now first-class in --net-allow (matched by
1551        // containment, no DNS), symmetric with --net-deny.
1552        let r = NetRule::parse_allow("10.0.0.0/8:80").unwrap();
1553        assert!(matches!(&r.target, NetTarget::Cidr(c) if !c.is_single_host()));
1554        assert_eq!(r.ports, vec![80]);
1555        assert!(!r.all_ports);
1556    }
1557
1558    #[test]
1559    fn netallow_ipv6_literal_and_bracket() {
1560        let lo: std::net::IpAddr = "::1".parse().unwrap();
1561        // Bare IPv6 literal (previously mis-split on its colons).
1562        let r = NetRule::parse_allow("::1").unwrap();
1563        assert!(matches!(&r.target, NetTarget::Cidr(c) if c.addr == lo && c.is_single_host()));
1564        assert!(r.all_ports);
1565        // Bracketed IPv6 with a port.
1566        let r = NetRule::parse_allow("[::1]:443").unwrap();
1567        assert!(matches!(&r.target, NetTarget::Cidr(c) if c.addr == lo && c.is_single_host()));
1568        assert_eq!(r.ports, vec![443]);
1569        // IPv6 CIDR.
1570        let r = NetRule::parse_allow("fc00::/7").unwrap();
1571        assert!(matches!(&r.target, NetTarget::Cidr(c) if !c.is_single_host()));
1572        assert!(r.all_ports);
1573    }
1574
1575    #[tokio::test]
1576    async fn test_resolve_net_allow_cidr_no_dns() {
1577        // A CIDR / IP-literal target resolves into `cidrs` directly, with
1578        // no DNS lookup and no `per_ip` / `/etc/hosts` entry.
1579        let rules = vec![
1580            NetAllow { protocol: Protocol::Tcp, target: NetTarget::Cidr(IpCidr::parse("10.0.0.0/8").unwrap()), ports: vec![80], all_ports: false },
1581            NetAllow { protocol: Protocol::Tcp, target: NetTarget::Cidr(IpCidr::parse("1.2.3.4").unwrap()), ports: vec![], all_ports: true },
1582        ];
1583        let resolved = resolve_net_allow(&rules).await.unwrap();
1584        assert_eq!(resolved.tcp.cidrs.len(), 2);
1585        assert!(resolved.tcp.per_ip.is_empty());
1586        assert!(resolved.concrete_host_entries.is_empty());
1587    }
1588
1589    #[test]
1590    fn netallow_parse_repeated_wildcard_is_idempotent() {
1591        // `*,*` collapses to a single wildcard — neither token contributes
1592        // a concrete port, so the rule remains "any port".
1593        let r = NetRule::parse_allow(":*,*").unwrap();
1594        assert!(r.all_ports);
1595        assert!(r.ports.is_empty());
1596    }
1597
1598    // --- Protocol scheme prefix tests ---
1599
1600    #[test]
1601    fn netallow_bare_form_defaults_to_tcp() {
1602        let r = NetRule::parse_allow("example.com:443").unwrap();
1603        assert_eq!(r.protocol, Protocol::Tcp);
1604    }
1605
1606    #[test]
1607    fn netallow_explicit_tcp_scheme() {
1608        let r = NetRule::parse_allow("tcp://example.com:443").unwrap();
1609        assert_eq!(r.protocol, Protocol::Tcp);
1610        assert!(matches!(&r.target, NetTarget::Host(h) if h == "example.com"));
1611        assert_eq!(r.ports, vec![443]);
1612    }
1613
1614    #[test]
1615    fn netallow_udp_scheme_with_host_port() {
1616        let r = NetRule::parse_allow("udp://1.1.1.1:53").unwrap();
1617        assert_eq!(r.protocol, Protocol::Udp);
1618        // An IP literal becomes a single-host CIDR target (no DNS).
1619        let one: std::net::IpAddr = "1.1.1.1".parse().unwrap();
1620        assert!(matches!(&r.target, NetTarget::Cidr(c) if c.addr == one && c.is_single_host()));
1621        assert_eq!(r.ports, vec![53]);
1622    }
1623
1624    #[test]
1625    fn netallow_udp_wildcard_any_anywhere() {
1626        // The "any UDP" gate, equivalent to the old `allow_udp = true`.
1627        let r = NetRule::parse_allow("udp://*:*").unwrap();
1628        assert_eq!(r.protocol, Protocol::Udp);
1629        assert_eq!(r.target, NetTarget::AnyIp);
1630        assert!(r.all_ports);
1631    }
1632
1633    #[test]
1634    fn netallow_icmp_scheme_with_host() {
1635        let r = NetRule::parse_allow("icmp://github.com").unwrap();
1636        assert_eq!(r.protocol, Protocol::Icmp);
1637        assert!(matches!(&r.target, NetTarget::Host(h) if h == "github.com"));
1638        assert!(r.ports.is_empty());
1639        // ICMP carries no ports, so the rule is "all ports" by convention.
1640        assert!(r.all_ports);
1641    }
1642
1643    #[test]
1644    fn netallow_icmp_wildcard() {
1645        // The "any ICMP echo" gate, equivalent to the old
1646        // `allow_icmp = true` for the SOCK_DGRAM path.
1647        let r = NetRule::parse_allow("icmp://*").unwrap();
1648        assert_eq!(r.protocol, Protocol::Icmp);
1649        assert_eq!(r.target, NetTarget::AnyIp);
1650    }
1651
1652    #[test]
1653    fn netallow_icmp_rejects_port() {
1654        // ICMP has no port — `:port` is meaningless and refused
1655        // explicitly so users can't write a rule that doesn't do what
1656        // they think.
1657        let err = NetRule::parse_allow("icmp://github.com:80").unwrap_err();
1658        assert!(format!("{}", err).contains("icmp rule takes no port"));
1659    }
1660
1661    #[test]
1662    fn netallow_icmp_rejects_empty_body() {
1663        let err = NetRule::parse_allow("icmp://").unwrap_err();
1664        assert!(format!("{}", err).contains("needs a host/IP or `*`"));
1665    }
1666
1667    #[test]
1668    fn netallow_unknown_scheme_rejected() {
1669        // Including `icmp-raw` — sandlock does not expose raw ICMP, so
1670        // the scheme is unknown rather than a special-case error.
1671        for spec in ["sctp://host:1234", "icmp-raw://*"] {
1672            let err = NetRule::parse_allow(spec).unwrap_err();
1673            assert!(format!("{}", err).contains("unknown scheme"), "spec: {}", spec);
1674        }
1675    }
1676
1677    #[tokio::test]
1678    async fn test_resolve_net_allow_empty() {
1679        let resolved = resolve_net_allow(&[]).await.unwrap();
1680        assert!(resolved.tcp.per_ip.is_empty());
1681        assert!(resolved.tcp.any_ip_ports.is_empty());
1682        assert!(resolved.udp.per_ip.is_empty());
1683        assert!(resolved.icmp.per_ip.is_empty());
1684        // No concrete-host rules → no resolved-entry lines.
1685        assert!(resolved.concrete_host_entries.is_empty());
1686    }
1687
1688    #[tokio::test]
1689    async fn test_resolve_net_allow_concrete_host() {
1690        let rules = vec![NetAllow {
1691            protocol: Protocol::Tcp,
1692            target: NetTarget::Host("localhost".to_string()),
1693            ports: vec![80, 443],
1694            all_ports: false,
1695        }];
1696        let resolved = resolve_net_allow(&rules).await.unwrap();
1697        // localhost should resolve to at least one loopback addr; only
1698        // the TCP set has entries.
1699        assert!(!resolved.tcp.per_ip.is_empty());
1700        for ports in resolved.tcp.per_ip.values() {
1701            assert!(ports.contains(&80));
1702            assert!(ports.contains(&443));
1703        }
1704        assert!(resolved.udp.per_ip.is_empty());
1705        assert!(resolved.icmp.per_ip.is_empty());
1706        // The resolved entry (`<ip> localhost`) surfaces in concrete_host_entries.
1707        assert!(resolved.concrete_host_entries.contains("127.0.0.1 localhost"));
1708    }
1709
1710    #[tokio::test]
1711    async fn test_resolve_net_allow_any_ip() {
1712        let rules = vec![NetAllow {
1713            protocol: Protocol::Tcp,
1714            target: NetTarget::AnyIp,
1715            ports: vec![8080],
1716            all_ports: false,
1717        }];
1718        let resolved = resolve_net_allow(&rules).await.unwrap();
1719        assert!(resolved.tcp.per_ip.is_empty());
1720        assert!(resolved.tcp.any_ip_ports.contains(&8080));
1721        assert!(!resolved.tcp.any_ip_all_ports);
1722        // Any-IP rule has no concrete host, so no resolved-entry line.
1723        assert!(resolved.concrete_host_entries.is_empty());
1724    }
1725
1726    #[tokio::test]
1727    async fn test_resolve_net_allow_any_ip_all_ports() {
1728        // `:*` — fully unrestricted egress, TCP-only.
1729        let rules = vec![NetAllow {
1730            protocol: Protocol::Tcp,
1731            target: NetTarget::AnyIp,
1732            ports: vec![],
1733            all_ports: true,
1734        }];
1735        let resolved = resolve_net_allow(&rules).await.unwrap();
1736        assert!(resolved.tcp.any_ip_all_ports);
1737        assert!(resolved.tcp.per_ip.is_empty());
1738        assert!(resolved.tcp.per_ip_all_ports.is_empty());
1739        assert!(resolved.tcp.any_ip_ports.is_empty());
1740        // UDP/ICMP unaffected by a TCP rule.
1741        assert!(!resolved.udp.any_ip_all_ports);
1742        assert!(!resolved.icmp.any_ip_all_ports);
1743    }
1744
1745    #[tokio::test]
1746    async fn test_resolve_net_allow_concrete_host_all_ports() {
1747        // `localhost:*` — every port to localhost only, TCP.
1748        let rules = vec![NetAllow {
1749            protocol: Protocol::Tcp,
1750            target: NetTarget::Host("localhost".to_string()),
1751            ports: vec![],
1752            all_ports: true,
1753        }];
1754        let resolved = resolve_net_allow(&rules).await.unwrap();
1755        assert!(!resolved.tcp.any_ip_all_ports);
1756        assert!(
1757            !resolved.tcp.per_ip_all_ports.is_empty(),
1758            "localhost should resolve to at least one IP marked as any-port"
1759        );
1760        for ip in resolved.tcp.per_ip_all_ports.iter() {
1761            assert!(resolved.tcp.per_ip.contains_key(ip));
1762        }
1763        assert!(resolved.concrete_host_entries.contains("localhost"));
1764    }
1765
1766    #[tokio::test]
1767    async fn test_resolve_net_allow_mixed_wildcard_and_concrete() {
1768        // Wildcard rule alongside concrete: wildcard sets the global
1769        // any-host any-port flag for TCP; concrete rule still resolves
1770        // into per_ip (the runtime layer chooses Unrestricted, ignoring
1771        // the concrete entries).
1772        let rules = vec![
1773            NetAllow {
1774                protocol: Protocol::Tcp,
1775                target: NetTarget::AnyIp,
1776                ports: vec![],
1777                all_ports: true,
1778            },
1779            NetAllow {
1780                protocol: Protocol::Tcp,
1781                target: NetTarget::Host("localhost".to_string()),
1782                ports: vec![22],
1783                all_ports: false,
1784            },
1785        ];
1786        let resolved = resolve_net_allow(&rules).await.unwrap();
1787        assert!(resolved.tcp.any_ip_all_ports);
1788        assert!(!resolved.tcp.per_ip.is_empty());
1789    }
1790
1791    // ============================================================
1792    // Per-protocol resolution — UDP / ICMP slices stay isolated
1793    // ============================================================
1794
1795    #[tokio::test]
1796    async fn test_resolve_per_protocol_isolation() {
1797        // A UDP rule should not appear in the TCP set, and vice versa.
1798        // This is the property Phase 2 relies on for protocol routing.
1799        let rules = vec![
1800            NetAllow {
1801                protocol: Protocol::Tcp,
1802                target: NetTarget::Host("localhost".to_string()),
1803                ports: vec![443],
1804                all_ports: false,
1805            },
1806            NetAllow {
1807                protocol: Protocol::Udp,
1808                target: NetTarget::AnyIp,
1809                ports: vec![53],
1810                all_ports: false,
1811            },
1812        ];
1813        let resolved = resolve_net_allow(&rules).await.unwrap();
1814        assert!(
1815            !resolved.tcp.per_ip.is_empty(),
1816            "TCP rule should populate tcp set"
1817        );
1818        assert!(
1819            resolved.udp.any_ip_ports.contains(&53),
1820            "UDP rule should populate udp set"
1821        );
1822        // Cross-contamination check: TCP per_ip ports must not contain 53;
1823        // UDP must not contain 443.
1824        for ports in resolved.tcp.per_ip.values() {
1825            assert!(!ports.contains(&53), "UDP port leaked into TCP set");
1826        }
1827        assert!(!resolved.udp.any_ip_ports.contains(&443), "TCP port leaked into UDP set");
1828    }
1829
1830    #[tokio::test]
1831    async fn test_resolve_icmp_no_ports() {
1832        // ICMP rules carry no ports; concrete hosts go into per_ip with
1833        // PortAllow::Any-style empty port set, plus per_ip_all_ports.
1834        let rules = vec![NetAllow {
1835            protocol: Protocol::Icmp,
1836            target: NetTarget::Host("localhost".to_string()),
1837            ports: vec![],
1838            all_ports: false,
1839        }];
1840        let resolved = resolve_net_allow(&rules).await.unwrap();
1841        assert!(
1842            !resolved.icmp.per_ip.is_empty(),
1843            "icmp host should populate per_ip"
1844        );
1845        assert!(
1846            !resolved.icmp.per_ip_all_ports.is_empty(),
1847            "icmp host should mark per_ip_all_ports (no port check)"
1848        );
1849        assert!(resolved.icmp.any_ip_ports.is_empty());
1850        // TCP/UDP unaffected.
1851        assert!(resolved.tcp.per_ip.is_empty());
1852        assert!(resolved.udp.per_ip.is_empty());
1853    }
1854
1855    #[tokio::test]
1856    async fn test_resolve_icmp_wildcard() {
1857        // `icmp://*` — any ICMP destination.
1858        let rules = vec![NetAllow {
1859            protocol: Protocol::Icmp,
1860            target: NetTarget::AnyIp,
1861            ports: vec![],
1862            all_ports: false,
1863        }];
1864        let resolved = resolve_net_allow(&rules).await.unwrap();
1865        assert!(resolved.icmp.any_ip_all_ports);
1866        assert!(!resolved.tcp.any_ip_all_ports);
1867    }
1868
1869    // ============================================================
1870    // compose_virtual_etc_hosts — synthetic /etc/hosts assembly
1871    // ============================================================
1872
1873    use std::io::Write;
1874
1875    fn temp_rootfs_with_hosts(name: &str, hosts_content: Option<&str>) -> std::path::PathBuf {
1876        let dir = std::env::temp_dir().join(format!(
1877            "sandlock-test-compose-hosts-{}-{}",
1878            name, std::process::id()
1879        ));
1880        let _ = std::fs::create_dir_all(dir.join("etc"));
1881        if let Some(content) = hosts_content {
1882            let mut f = std::fs::File::create(dir.join("etc").join("hosts")).unwrap();
1883            f.write_all(content.as_bytes()).unwrap();
1884        }
1885        dir
1886    }
1887
1888    #[test]
1889    fn compose_no_chroot_emits_loopback_base() {
1890        // Default path — no chroot, no concrete-host rules → the same
1891        // fixed loopback view we promise every sandbox.
1892        let out = compose_virtual_etc_hosts(None, "");
1893        assert_eq!(out, "127.0.0.1 localhost\n::1 localhost\n");
1894    }
1895
1896    #[test]
1897    fn compose_no_chroot_appends_concrete_entries() {
1898        let out = compose_virtual_etc_hosts(None, "10.0.0.1 api\n");
1899        assert_eq!(out, "127.0.0.1 localhost\n::1 localhost\n10.0.0.1 api\n");
1900    }
1901
1902    #[test]
1903    fn compose_chroot_seeds_from_image_and_injects_missing_loopback() {
1904        // Image ships an entry of its own but no localhost mapping; the
1905        // shim must keep the image's content and inject both loopback
1906        // entries on top so the always-on guarantee still holds.
1907        let rootfs = temp_rootfs_with_hosts(
1908            "no-localhost",
1909            Some("10.0.0.5 myimage.local\n"),
1910        );
1911        let out = compose_virtual_etc_hosts(Some(&rootfs), "");
1912        assert!(out.contains("10.0.0.5 myimage.local"), "image entry missing: {out}");
1913        assert!(out.contains("127.0.0.1 localhost"), "v4 loopback missing: {out}");
1914        assert!(out.contains("::1 localhost"), "v6 loopback missing: {out}");
1915        let _ = std::fs::remove_dir_all(&rootfs);
1916    }
1917
1918    #[test]
1919    fn compose_chroot_does_not_duplicate_existing_loopback() {
1920        // Image already has both loopback entries — don't append duplicates.
1921        let rootfs = temp_rootfs_with_hosts(
1922            "both-localhost",
1923            Some("127.0.0.1 localhost\n::1 localhost\n10.0.0.5 myimage.local\n"),
1924        );
1925        let out = compose_virtual_etc_hosts(Some(&rootfs), "");
1926        assert_eq!(out.matches("127.0.0.1 localhost").count(), 1, "v4 dup'd: {out}");
1927        assert_eq!(out.matches("::1 localhost").count(), 1, "v6 dup'd: {out}");
1928        assert!(out.contains("10.0.0.5 myimage.local"));
1929        let _ = std::fs::remove_dir_all(&rootfs);
1930    }
1931
1932    #[test]
1933    fn compose_chroot_injects_only_missing_family() {
1934        // Image has v4 but no v6 localhost — inject only v6, leave v4 alone.
1935        let rootfs = temp_rootfs_with_hosts(
1936            "only-v4-localhost",
1937            Some("127.0.0.1 localhost myimage\n"),
1938        );
1939        let out = compose_virtual_etc_hosts(Some(&rootfs), "");
1940        assert_eq!(out.matches("127.0.0.1 localhost").count(), 1);
1941        assert!(out.contains("::1 localhost"), "v6 loopback should be injected: {out}");
1942        let _ = std::fs::remove_dir_all(&rootfs);
1943    }
1944
1945    #[test]
1946    fn compose_chroot_missing_file_falls_back_to_loopback() {
1947        // Chroot exists but has no /etc/hosts — fall back to the bare
1948        // loopback base so the sandbox always sees a usable file.
1949        let rootfs = temp_rootfs_with_hosts("no-file", None);
1950        let out = compose_virtual_etc_hosts(Some(&rootfs), "10.0.0.1 api\n");
1951        assert_eq!(out, "127.0.0.1 localhost\n::1 localhost\n10.0.0.1 api\n");
1952        let _ = std::fs::remove_dir_all(&rootfs);
1953    }
1954
1955    #[test]
1956    fn compose_chroot_strips_inline_comments_when_detecting_loopback() {
1957        // hosts(5) treats `#` as a comment-start; the loopback-presence
1958        // check must respect it (otherwise an image line like
1959        // `127.0.0.1 # localhost` would be falsely treated as covering v4).
1960        let rootfs = temp_rootfs_with_hosts(
1961            "with-comments",
1962            Some("127.0.0.1 # localhost is a comment here\n"),
1963        );
1964        let out = compose_virtual_etc_hosts(Some(&rootfs), "");
1965        // Real `127.0.0.1 localhost` line must still be injected.
1966        assert!(
1967            out.lines().any(|l| l.trim() == "127.0.0.1 localhost"),
1968            "v4 loopback should still be injected: {out}"
1969        );
1970        let _ = std::fs::remove_dir_all(&rootfs);
1971    }
1972
1973    // --- IpCidr tests ---
1974
1975    #[test]
1976    fn ipcidr_parse_bare_ipv4_is_host_route() {
1977        let c = IpCidr::parse("1.2.3.4").unwrap();
1978        assert_eq!(c.prefix_len, 32);
1979        assert!(c.contains("1.2.3.4".parse().unwrap()));
1980        assert!(!c.contains("1.2.3.5".parse().unwrap()));
1981    }
1982
1983    #[test]
1984    fn ipcidr_parse_ipv4_range_contains() {
1985        let c = IpCidr::parse("10.0.0.0/8").unwrap();
1986        assert!(c.contains("10.3.7.9".parse().unwrap()));
1987        assert!(!c.contains("11.0.0.1".parse().unwrap()));
1988    }
1989
1990    #[test]
1991    fn ipcidr_parse_ipv6_range_contains() {
1992        let c = IpCidr::parse("fc00::/7").unwrap();
1993        assert!(c.contains("fd00::1".parse().unwrap()));
1994        assert!(!c.contains("2001:db8::1".parse().unwrap()));
1995    }
1996
1997    #[test]
1998    fn ipcidr_zero_prefix_matches_all_same_family() {
1999        let c = IpCidr::parse("0.0.0.0/0").unwrap();
2000        assert!(c.contains("8.8.8.8".parse().unwrap()));
2001        assert!(!c.contains("::1".parse().unwrap())); // family mismatch
2002    }
2003
2004    #[test]
2005    fn ipcidr_rejects_hostname() {
2006        assert!(IpCidr::parse("example.com").is_err());
2007    }
2008
2009    #[test]
2010    fn ipcidr_rejects_oversized_prefix() {
2011        assert!(IpCidr::parse("10.0.0.0/33").is_err());
2012        assert!(IpCidr::parse("fc00::/129").is_err());
2013    }
2014
    // --- NetRule::parse_deny tests ---
2016
2017    #[test]
2018    fn netdeny_bare_cidr_is_all_ports_tcp() {
2019        let rule = NetRule::parse_deny("10.0.0.0/8").unwrap();
2020        assert_eq!(rule.protocol, Protocol::Tcp);
2021        assert!(matches!(rule.target, NetTarget::Cidr(_)));
2022        assert!(rule.all_ports);
2023    }
2024
2025    #[test]
2026    fn netdeny_bare_ip_is_host_route_all_ports() {
2027        let rule = NetRule::parse_deny("169.254.169.254").unwrap();
2028        match &rule.target {
2029            NetTarget::Cidr(c) => assert_eq!(c.prefix_len, 32),
2030            _ => panic!("expected cidr"),
2031        }
2032        assert!(rule.all_ports);
2033    }
2034
2035    #[test]
2036    fn netdeny_cidr_with_port() {
2037        let rule = NetRule::parse_deny("10.0.0.0/8:443").unwrap();
2038        assert_eq!(rule.ports, vec![443]);
2039        assert!(!rule.all_ports);
2040    }
2041
2042    #[test]
2043    fn netdeny_any_ip_port() {
2044        let rule = NetRule::parse_deny(":25").unwrap();
2045        assert!(matches!(rule.target, NetTarget::AnyIp));
2046        assert_eq!(rule.ports, vec![25]);
2047    }
2048
2049    #[test]
2050    fn netdeny_udp_scheme() {
2051        let rule = NetRule::parse_deny("udp://192.168.0.0/16:53").unwrap();
2052        assert_eq!(rule.protocol, Protocol::Udp);
2053        assert_eq!(rule.ports, vec![53]);
2054    }
2055
2056    #[test]
2057    fn netdeny_ipv6_bracket_port() {
2058        let rule = NetRule::parse_deny("[::1]:443").unwrap();
2059        assert_eq!(rule.ports, vec![443]);
2060        match &rule.target {
2061            NetTarget::Cidr(c) => assert_eq!(c.prefix_len, 128),
2062            _ => panic!("expected cidr"),
2063        }
2064    }
2065
2066    #[test]
2067    fn netdeny_rejects_hostname() {
2068        assert!(NetRule::parse_deny("evil.com:443").is_err());
2069        assert!(NetRule::parse_deny("evil.com").is_err());
2070    }
2071
2072    #[test]
2073    fn netdeny_bare_ipv6_address_all_ports() {
2074        let rule = NetRule::parse_deny("::1").unwrap();
2075        assert!(rule.all_ports);
2076        match &rule.target {
2077            NetTarget::Cidr(c) => assert_eq!(c.prefix_len, 128),
2078            _ => panic!("expected cidr"),
2079        }
2080    }
2081
2082    #[test]
2083    fn netdeny_bare_ipv6_cidr_all_ports() {
2084        let rule = NetRule::parse_deny("fc00::/7").unwrap();
2085        assert!(rule.all_ports);
2086        let ula: std::net::IpAddr = "fd00::1".parse().unwrap();
2087        assert!(matches!(&rule.target, NetTarget::Cidr(c) if c.contains(ula)));
2088    }
2089
2090    #[test]
2091    fn netdeny_empty_icmp_body_is_rejected() {
2092        assert!(NetRule::parse_deny("icmp://").is_err());
2093    }
2094
2095    #[test]
2096    fn netdeny_bare_star_is_any_ip_all_ports() {
2097        // `*` with no port is the any-IP, all-ports form (port optional,
2098        // symmetric with a bare IP/CIDR).
2099        let rule = NetRule::parse_deny("*").unwrap();
2100        assert_eq!(rule.protocol, Protocol::Tcp);
2101        assert!(matches!(rule.target, NetTarget::AnyIp));
2102        assert!(rule.all_ports);
2103        assert!(rule.ports.is_empty());
2104    }
2105
2106    #[test]
2107    fn netdeny_udp_bare_star_all_ports() {
2108        let rule = NetRule::parse_deny("udp://*").unwrap();
2109        assert_eq!(rule.protocol, Protocol::Udp);
2110        assert!(matches!(rule.target, NetTarget::AnyIp));
2111        assert!(rule.all_ports);
2112    }
2113
2114    #[test]
2115    fn netdeny_empty_spec_rejected() {
2116        // An empty body must not silently mean "deny everything".
2117        assert!(NetRule::parse_deny("").is_err());
2118        assert!(NetRule::parse_deny("udp://").is_err());
2119    }
2120
2121    // --- resolve_net_deny tests ---
2122
2123    #[test]
2124    fn resolve_net_deny_groups_per_protocol() {
2125        let rule = NetRule::parse_deny("10.0.0.0/8").unwrap();
2126        let set = resolve_net_deny(std::slice::from_ref(&rule));
2127        // TCP policy denies 10.x, UDP/ICMP unaffected (still allow-all).
2128        assert!(!set.tcp.allows("10.0.0.1".parse().unwrap(), 443));
2129        assert!(set.udp.allows("10.0.0.1".parse().unwrap(), 443));
2130    }
2131
2132    #[test]
2133    fn resolve_net_deny_any_ip_port() {
2134        let rule = NetRule::parse_deny(":25").unwrap();
2135        let set = resolve_net_deny(std::slice::from_ref(&rule));
2136        assert!(!set.tcp.allows("8.8.8.8".parse().unwrap(), 25));
2137        assert!(set.tcp.allows("8.8.8.8".parse().unwrap(), 80));
2138    }
2139}