Skip to main content

sandlock_core/seccomp/
notif.rs

1// Seccomp user notification supervisor — async event loop that receives
2// notifications from the kernel, dispatches them to handler functions, and
3// sends responses.
4
5use std::collections::{HashMap, HashSet};
6use std::future::Future;
7use std::io;
8use std::net::IpAddr;
9use std::os::unix::io::{AsRawFd, OwnedFd, RawFd};
10use std::pin::Pin;
11use std::sync::Arc;
12
13use crate::error::NotifError;
14use crate::arch;
15use crate::sys::structs::{
16    SeccompNotif, SeccompNotifAddfd, SeccompNotifResp,
17    SECCOMP_ADDFD_FLAG_SEND, SECCOMP_IOCTL_NOTIF_ADDFD, SECCOMP_IOCTL_NOTIF_ID_VALID, SECCOMP_IOCTL_NOTIF_RECV,
18    SECCOMP_IOCTL_NOTIF_SEND, SECCOMP_IOCTL_NOTIF_SET_FLAGS,
19    SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP, SECCOMP_USER_NOTIF_FLAG_CONTINUE,
20    ENOMEM,
21};
22
23// ============================================================
24// NotifAction — how the supervisor should respond
25// ============================================================
26
/// One-shot hook fired after a successful `InjectFdSendTracked`: it receives
/// the child-side fd number that `SECCOMP_IOCTL_NOTIF_ADDFD` returned.
///
/// The boxed closure sits behind a hand-written `Debug` impl so that
/// `NotifAction` can go on deriving `Debug`.  The closure must be both
/// `Send` and `Sync`, which keeps `&NotifAction` `Send` — required because
/// `NotifAction` is borrowed across `.await` points in the notifier loop.
pub struct OnInjectSuccess(pub Box<dyn FnOnce(i32) + Send + Sync>);

impl OnInjectSuccess {
    /// Box `f` as the success callback.
    pub fn new<F>(f: F) -> Self
    where
        F: FnOnce(i32) + Send + Sync + 'static,
    {
        OnInjectSuccess(Box::new(f))
    }
}

impl std::fmt::Debug for OnInjectSuccess {
    /// Emits a fixed placeholder; the closure itself cannot be inspected.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "OnInjectSuccess(<callback>)")
    }
}
46
47/// A deferred decision: an owned, `'static` future that produces the real
48/// [`NotifAction`] off the supervisor's notification loop.
49///
50/// A handler returns [`NotifAction::Defer`] when computing the response is
51/// slow (a network round-trip, a blocking syscall) and must not stall the
52/// single supervisor task that gates every other trapped syscall.  The
53/// supervisor moves the future onto a worker, lets the loop proceed, and
54/// sends the response (via the still-valid `notif.id`) when the future
55/// resolves.  The future is `'static` because it outlives the borrowed
56/// `HandlerCtx` — capture what you need (`notif` is `Copy`, `notif_fd` is a
57/// `RawFd`) by value rather than borrowing `&self`.
58///
59/// The deferred future need only be `Send` (not `Sync`): the supervisor
60/// moves it onto a worker task and never shares it by reference.  Requiring
61/// `Sync` of user futures would be a leaky bound (it would reject a future
62/// capturing, say, a `Cell`), so it is not required.
63pub struct Deferred(Pin<Box<dyn Future<Output = NotifAction> + Send + 'static>>);
64
65// Safety: `NotifAction` must stay `Sync` so it can live in `Sync` contexts
66// (handler `&self` state, etc.; the `Handler` trait is `Send + Sync`), which
67// requires `Deferred: Sync`.  A `Send`-only future is not `Sync`, but the
68// boxed future is unreachable through a shared `&Deferred`: the field is
69// private, `Debug` touches only a static string, and `run(self)` consumes
70// the value (it is never callable through `&self`).  With no path to poll or
71// read the future via a shared reference, sharing `&Deferred` across threads
72// cannot race, so asserting `Sync` is sound while keeping user futures
73// `Send`-only.
74unsafe impl Sync for Deferred {}
75
76impl std::fmt::Debug for Deferred {
77    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
78        f.write_str("Deferred(<future>)")
79    }
80}
81
82impl Deferred {
83    pub fn new<F: Future<Output = NotifAction> + Send + 'static>(f: F) -> Self {
84        Self(Box::pin(f))
85    }
86
87    /// Drive the deferred future to its terminal action.  Consumes `self`
88    /// because the future is run exactly once, on a worker task.
89    pub async fn run(self) -> NotifAction {
90        self.0.await
91    }
92}
93
/// How the supervisor should respond to a notification.
///
/// Handlers produce one of these; `send_response` turns it into the matching
/// seccomp ioctl(s).
#[derive(Debug)]
pub enum NotifAction {
    /// SECCOMP_USER_NOTIF_FLAG_CONTINUE — let the syscall proceed.
    Continue,
    /// Return -1 with the given errno.  Pass the positive errno value (e.g.
    /// `libc::EIO`); the response helper negates it before sending.
    Errno(i32),
    /// Inject a file descriptor into the child, then continue.
    /// `srcfd` is a plain `RawFd`, so this action does not own or close it.
    InjectFd { srcfd: RawFd, targetfd: i32 },
    /// Inject a file descriptor using SECCOMP_ADDFD_FLAG_SEND (atomically responds).
    /// The child sees the injected fd as the return value of the syscall.
    /// The `OwnedFd` is closed automatically after the ioctl completes.
    /// `newfd_flags` controls flags on the injected fd (e.g. O_CLOEXEC).
    /// If the injection ioctl fails, the syscall is denied with `EACCES`
    /// instead of being allowed to continue.
    InjectFdSend { srcfd: OwnedFd, newfd_flags: u32 },
    /// Like `InjectFdSend`, but also invokes `on_success` with the
    /// child-side fd number that `SECCOMP_IOCTL_NOTIF_ADDFD` returned.
    /// Used when the caller needs to track the exact fd number allocated
    /// in the child (e.g. to key per-fd state without TOCTOU).
    /// `on_success` is not called when the injection fails.
    InjectFdSendTracked {
        srcfd: OwnedFd,
        newfd_flags: u32,
        on_success: OnInjectSuccess,
    },
    /// Synthetic return value (the child sees this as the syscall result).
    ReturnValue(i64),
    /// Don't respond — used for checkpoint/freeze.  `send_response` sends
    /// nothing for this variant and reports success.
    Hold,
    /// Kill the child process group (OOM-kill semantics).
    /// Fields: signal, process group leader pid.
    Kill { sig: i32, pgid: i32 },
    /// Defer the response: run the carried future on a worker task and
    /// send its terminal action later, keyed by `notif.id`.  Non-`Continue`,
    /// so it short-circuits the handler chain — a deferring handler makes a
    /// terminal decision.  See [`Deferred`].
    Defer(Deferred),
}
130
131impl NotifAction {
132    /// Construct a [`NotifAction::Defer`] from a `'static` future.  Ergonomic
133    /// shorthand for `NotifAction::Defer(Deferred::new(fut))`.
134    pub fn defer<F: Future<Output = NotifAction> + Send + 'static>(fut: F) -> Self {
135        NotifAction::Defer(Deferred::new(fut))
136    }
137
138    /// Inject `content` into the child as the syscall's returned fd, backed by
139    /// a sealed (read-only, fixed-size), `O_CLOEXEC` in-memory file.
140    ///
141    /// The fd is created, populated, sealed, and owned end to end by sandlock;
142    /// the caller never sees or closes it. On allocation failure this collapses
143    /// to `Errno(EIO)`, so a handler can return it directly:
144    ///
145    /// ```ignore
146    /// return NotifAction::inject_bytes(&secret);
147    /// ```
148    ///
149    /// For a *writable* injected fd, build one with
150    /// [`content_memfd(content, false)`](content_memfd) and pass it to
151    /// [`NotifAction::InjectFdSend`] yourself.
152    pub fn inject_bytes(content: &[u8]) -> NotifAction {
153        match content_memfd(content, true) {
154            Ok(fd) => NotifAction::InjectFdSend {
155                srcfd: fd,
156                newfd_flags: libc::O_CLOEXEC as u32,
157            },
158            Err(_) => NotifAction::Errno(libc::EIO),
159        }
160    }
161}
162
163/// Create an anonymous in-memory file ("memfd") populated with `content` and
164/// rewound to offset 0, ready to inject as a syscall's returned fd via
165/// [`NotifAction::InjectFdSend`].
166///
167/// When `seal` is true the fd is sealed read-only and fixed-size
168/// (`F_SEAL_SEAL | F_SEAL_WRITE | F_SEAL_GROW | F_SEAL_SHRINK`) so the guest
169/// cannot modify or resize the content it is handed. Sealing is best-effort:
170/// on a kernel without sealing support the fd is still returned, bounded by
171/// the rest of the policy. Pass `false` only when the guest genuinely needs a
172/// writable injected fd.
173///
174/// Most callers want [`NotifAction::inject_bytes`], which wraps this in the
175/// common sealed + `O_CLOEXEC` configuration.
176pub fn content_memfd(content: &[u8], seal: bool) -> io::Result<OwnedFd> {
177    use std::io::{Seek, SeekFrom, Write};
178    use std::os::unix::io::FromRawFd;
179
180    let flags = if seal {
181        (libc::MFD_CLOEXEC | libc::MFD_ALLOW_SEALING) as u32
182    } else {
183        libc::MFD_CLOEXEC as u32
184    };
185    let memfd = crate::sys::syscall::memfd_create("sandlock-content", flags)?;
186
187    // Write the content and rewind. Borrow the raw fd for File I/O without
188    // transferring ownership: `memfd` (the OwnedFd) keeps owning it.
189    {
190        let raw = memfd.as_raw_fd();
191        let mut file = unsafe { std::fs::File::from_raw_fd(raw) };
192        let res = file
193            .write_all(content)
194            .and_then(|()| file.seek(SeekFrom::Start(0)).map(|_| ()));
195        std::mem::forget(file); // don't close `raw`; `memfd` still owns it
196        res?;
197    }
198
199    if seal {
200        // Best-effort: ignore failure on kernels lacking sealing support.
201        let seals =
202            libc::F_SEAL_SEAL | libc::F_SEAL_WRITE | libc::F_SEAL_GROW | libc::F_SEAL_SHRINK;
203        unsafe { libc::fcntl(memfd.as_raw_fd(), libc::F_ADD_SEALS, seals) };
204    }
205
206    Ok(memfd)
207}
208
209/// Collapse a deferred future's resolved action into a sendable terminal
210/// action.  A deferred future that itself resolves to `Defer` is a bug
211/// (no nested deferral); collapse it to `EIO` so the trapped child gets a
212/// definite response instead of wedging forever waiting for one.
213fn finalize_deferred(action: NotifAction) -> NotifAction {
214    match action {
215        NotifAction::Defer(_) => NotifAction::Errno(libc::EIO),
216        other => other,
217    }
218}
219
220// ============================================================
221// NetworkPolicy — network access policy enum
222// ============================================================
223
/// Per-IP port allowlist. `Any` is used by `policy_fn` IP-only
/// overrides (legacy `restrict_network(ips)` API where the user
/// restricts the destination IP set but not ports).
///
/// The same type is reused by [`NetworkPolicy::DenyList`], where the
/// listed ports are the *denied* ones rather than the permitted ones.
#[derive(Debug, Clone)]
pub enum PortAllow {
    /// Any port permitted to this IP.
    Any,
    /// Only these ports permitted to this IP.
    Specific(HashSet<u16>),
}
234
/// Global network policy for the sandbox.
///
/// Evaluated per connection by [`NetworkPolicy::allows`].
#[derive(Debug, Clone)]
pub enum NetworkPolicy {
    /// No IP-level restriction (no `--net-allow` configured and no
    /// `policy_fn` override). The Landlock direct path enforces ports.
    Unrestricted,
    /// Endpoint-level allowlist: a connection is permitted iff the
    /// destination IP and port match at least one entry below.
    AllowList {
        /// Per-IP port rules. From `--net-allow host:ports` after
        /// hostname resolution, or from `policy_fn` overrides.
        per_ip: HashMap<IpAddr, PortAllow>,
        /// (network, allowed-ports) rules from `--net-allow` IP/CIDR
        /// targets, matched by containment with no DNS. `PortAllow::Any`
        /// permits every port to the range.
        cidrs: Vec<(crate::network::IpCidr, PortAllow)>,
        /// Ports permitted for any IP (from `--net-allow :port` /
        /// `*:port`).
        any_ip_ports: HashSet<u16>,
    },
    /// Default-allow denylist: a connection is permitted unless the
    /// destination IP/port matches a deny rule. From `--net-deny`.
    DenyList {
        /// (network, denied-ports) rules. `PortAllow::Any` denies every
        /// port to the network; `Specific` denies only those ports.
        cidrs: Vec<(crate::network::IpCidr, PortAllow)>,
        /// Ports denied for any IP (the `:port` form).
        any_ip_ports: HashSet<u16>,
        /// Deny everything (the `:*` / `*:*` form). Rare; here for
        /// completeness so the form is not silently a no-op.
        /// When set, the other two fields are not consulted.
        deny_all: bool,
    },
}
268
269impl NetworkPolicy {
270    /// True iff a connection to (ip, port) should be permitted.
271    pub fn allows(&self, ip: IpAddr, port: u16) -> bool {
272        match self {
273            NetworkPolicy::Unrestricted => true,
274            NetworkPolicy::AllowList { per_ip, cidrs, any_ip_ports } => {
275                if any_ip_ports.contains(&port) {
276                    return true;
277                }
278                match per_ip.get(&ip) {
279                    Some(PortAllow::Any) => return true,
280                    Some(PortAllow::Specific(s)) if s.contains(&port) => return true,
281                    _ => {}
282                }
283                for (net, allowed) in cidrs {
284                    if net.contains(ip) {
285                        match allowed {
286                            PortAllow::Any => return true,
287                            PortAllow::Specific(s) => {
288                                if s.contains(&port) {
289                                    return true;
290                                }
291                            }
292                        }
293                    }
294                }
295                false
296            }
297            NetworkPolicy::DenyList { cidrs, any_ip_ports, deny_all } => {
298                if *deny_all {
299                    return false;
300                }
301                if any_ip_ports.contains(&port) {
302                    return false;
303                }
304                for (net, denied) in cidrs {
305                    if net.contains(ip) {
306                        match denied {
307                            PortAllow::Any => return false,
308                            PortAllow::Specific(s) => {
309                                if s.contains(&port) {
310                                    return false;
311                                }
312                            }
313                        }
314                    }
315                }
316                true
317            }
318        }
319    }
320}
321
322/// Check if a path-bearing notification targets a denied path.
323///
324/// For two-path syscalls (renameat2, linkat), checks both source and
325/// destination paths — a denied file must not be linked, renamed, or
326/// overwritten.
327///
328/// Each resolved path is checked both as-is (lexical normalization) and
329/// after following symlinks via `canonicalize`.  This prevents bypass via
330/// pre-existing symlinks, relative symlinks, or symlink chains that
331/// ultimately resolve to a denied path.
332pub(crate) fn is_path_denied_for_notif(
333    policy_fn_state: &super::state::PolicyFnState,
334    notif: &SeccompNotif,
335    notif_fd: RawFd,
336) -> bool {
337    if let Some(path) = resolve_path_for_notif(notif, notif_fd) {
338        if is_denied_with_symlink_resolve(policy_fn_state, &path) {
339            return true;
340        }
341    }
342    // For two-path syscalls, also check the second (destination) path.
343    if let Some(path) = resolve_second_path_for_notif(notif, notif_fd) {
344        if is_denied_with_symlink_resolve(policy_fn_state, &path) {
345            return true;
346        }
347    }
348    false
349}
350
351/// Check a path against denied entries, also resolving symlinks.
352///
353/// First checks the lexical path, then `canonicalize`s to follow symlinks
354/// and checks the real path.  This catches pre-existing symlinks, relative
355/// symlinks, and symlink chains that resolve to a denied file.
356fn is_denied_with_symlink_resolve(
357    policy_fn_state: &super::state::PolicyFnState,
358    path: &str,
359) -> bool {
360    // Check the literal (lexically normalized) path first.
361    if policy_fn_state.is_path_denied(path) {
362        return true;
363    }
364    // Follow symlinks and re-check against denied entries.
365    if let Ok(real) = std::fs::canonicalize(path) {
366        if policy_fn_state.is_path_denied(&real.to_string_lossy()) {
367            return true;
368        }
369    }
370    false
371}
372
/// Look up the thread-group leader (the `Tgid:` field) of thread `tid` in
/// `/proc/<tid>/status`.
///
/// Returns `None` when the status file cannot be read or contains no
/// `Tgid:` line with a parseable number.
fn tgid_of(tid: u32) -> Option<u32> {
    let status_path = format!("/proc/{}/status", tid);
    let contents = std::fs::read_to_string(status_path).ok()?;
    for line in contents.lines() {
        let Some(rest) = line.strip_prefix("Tgid:") else {
            continue;
        };
        if let Ok(tgid) = rest.trim().parse() {
            return Some(tgid);
        }
    }
    None
}
380
381/// Duplicate a file descriptor from an arbitrary process (by PID/TID) into the supervisor.
382///
383/// `pidfd_getfd` (Linux 5.6+) needs a pidfd for the owning *process*. All threads
384/// of a process share one fd table, so the process's pidfd dups any thread's fd:
385/// `pidfd_open(pid, 0)` gives it directly when `pid` is a thread-group leader,
386/// otherwise we resolve the leader via `Tgid` in `/proc/<pid>/status` and open
387/// that. The triggering thread is frozen on the seccomp notification, so its
388/// Tgid cannot race with pid reuse. Works on any kernel with `pidfd_getfd`.
389pub(crate) fn dup_fd_from_pid(pid: u32, target_fd: i32) -> io::Result<OwnedFd> {
390    use crate::sys::syscall::{pidfd_getfd, pidfd_open};
391    let pidfd = pidfd_open(pid, 0).or_else(|e| match tgid_of(pid) {
392        Some(tgid) if tgid != pid => pidfd_open(tgid, 0),
393        _ => Err(e),
394    })?;
395    pidfd_getfd(&pidfd, target_fd, 0)
396}
397
398// ============================================================
399// NotifPolicy — policy for the notification supervisor
400// ============================================================
401
/// Policy for the notification supervisor.
///
/// Plain configuration data with all fields public.
/// NOTE(review): several fields below carry no documentation in this file;
/// the hedged notes are inferred from their names and should be confirmed
/// against the code that reads them.
pub struct NotifPolicy {
    // NOTE(review): presumably only meaningful when `has_memory_limit` is
    // set — confirm.
    pub max_memory_bytes: u64,
    // NOTE(review): presumably a cap on processes in the sandbox — confirm
    // how "no limit" is expressed.
    pub max_processes: u32,
    pub has_memory_limit: bool,
    pub has_net_allowlist: bool,
    /// `--net-deny-bind` is active: trap `bind()` and register the on-behalf
    /// handler so denied TCP ports can be refused (independent of the
    /// connect-side `has_net_allowlist`).
    pub has_bind_denylist: bool,
    pub has_random_seed: bool,
    pub has_time_start: bool,
    /// Argv-safety gate: the supervisor must freeze every task that
    /// could mutate argv before any consumer reads it. True when
    /// `policy_fn` is active or when a handler is bound to
    /// execve/execveat (such handlers can call `read_child_mem`).
    /// Also gates ptrace fork-event tracking so `ProcessIndex` is
    /// complete when the freeze enumerates it.
    pub argv_safety_required: bool,
    // NOTE(review): unit (seconds?) and relation to `has_time_start` are not
    // visible here — confirm.
    pub time_offset: i64,
    pub num_cpus: Option<u32>,
    pub port_remap: bool,
    pub cow_enabled: bool,
    pub chroot_root: Option<std::path::PathBuf>,
    /// Virtual paths allowed for reading under chroot (original user-specified paths).
    pub chroot_readable: Vec<std::path::PathBuf>,
    /// Virtual paths allowed for writing under chroot (original user-specified paths).
    pub chroot_writable: Vec<std::path::PathBuf>,
    /// Virtual paths explicitly denied under chroot.
    pub chroot_denied: Vec<std::path::PathBuf>,
    /// Mount mappings: (virtual_path, host_path) pairs.
    pub chroot_mounts: Vec<(std::path::PathBuf, std::path::PathBuf)>,
    pub deterministic_dirs: bool,
    pub virtual_hostname: Option<String>,
    pub has_http_acl: bool,
    /// Synthetic `/etc/hosts` served to the sandbox. Always populated:
    /// `openat("/etc/hosts")` returns a memfd with this content so the
    /// host's on-disk `/etc/hosts` never leaks in. The content is the
    /// loopback base plus any concrete hostnames resolved from `net_allow`.
    pub virtual_etc_hosts: String,
    /// User-declared trust-bundle paths to splice the MITM CA into.
    pub ca_inject_paths: Vec<std::path::PathBuf>,
    /// Active MITM CA public cert (PEM bytes) to inject. `Some` only when
    /// HTTPS MITM is active (BYO or generated).
    pub ca_inject_pem: Option<std::sync::Arc<Vec<u8>>>,
}
448
449// ============================================================
450// Low-level ioctl helpers
451// ============================================================
452
453/// Receive a seccomp notification from the kernel.
454/// ioctl(fd, SECCOMP_IOCTL_NOTIF_RECV, &notif)
455fn recv_notif(fd: RawFd) -> io::Result<SeccompNotif> {
456    let mut notif: SeccompNotif = unsafe { std::mem::zeroed() };
457    let ret = unsafe {
458        libc::ioctl(fd, SECCOMP_IOCTL_NOTIF_RECV as libc::c_ulong, &mut notif as *mut _)
459    };
460    if ret < 0 {
461        Err(io::Error::last_os_error())
462    } else {
463        Ok(notif)
464    }
465}
466
/// Result of a non-blocking probe on the seccomp notif fd.
///
/// Produced by `probe_notif_fd`, which derives it from a zero-timeout
/// `poll(2)` on the fd.
enum NotifFdState {
    /// At least one INIT-state notification is queued. `recv_notif`
    /// will return without blocking.  Takes precedence when `POLLIN` is
    /// reported together with a terminal flag.
    Pending,
    /// No notifications and no terminal flags. Wait for the next
    /// epoll edge before probing again.
    Empty,
    /// `POLLHUP`/`POLLERR`/`POLLNVAL` set, or `poll(2)` itself failed:
    /// filter has been released or the fd is invalid. The supervisor
    /// should exit; subsequent waits would busy-spin because epoll
    /// keeps reporting the fd ready.
    Terminal,
}
481
482/// Non-blocking probe of the seccomp notif fd.
483///
484/// `SECCOMP_IOCTL_NOTIF_RECV` ignores `O_NONBLOCK` and calls
485/// `wait_event_interruptible` unconditionally (kernel/seccomp.c
486/// `seccomp_notify_recv`). So `recv_notif` cannot be invoked
487/// speculatively to detect an empty queue. This helper uses
488/// `poll(timeout=0)` as a non-blocking predictor: if POLLIN is set
489/// the kernel will hand us a notification without blocking; if a
490/// terminal flag is set the fd will keep waking AsyncFd until the
491/// supervisor exits.
492fn probe_notif_fd(fd: RawFd) -> NotifFdState {
493    let mut pfd = libc::pollfd {
494        fd,
495        events: libc::POLLIN,
496        revents: 0,
497    };
498    let r = unsafe { libc::poll(&mut pfd, 1, 0) };
499    if r > 0 && (pfd.revents & libc::POLLIN) != 0 {
500        return NotifFdState::Pending;
501    }
502    if r < 0 || (pfd.revents & (libc::POLLHUP | libc::POLLERR | libc::POLLNVAL)) != 0 {
503        return NotifFdState::Terminal;
504    }
505    NotifFdState::Empty
506}
507
508/// Send a response with SECCOMP_USER_NOTIF_FLAG_CONTINUE.
509fn respond_continue(fd: RawFd, id: u64) -> io::Result<()> {
510    let resp = SeccompNotifResp {
511        id,
512        val: 0,
513        error: 0,
514        flags: SECCOMP_USER_NOTIF_FLAG_CONTINUE,
515    };
516    send_resp_raw(fd, &resp)
517}
518
519/// Send a response that returns -1 with the given errno.
520fn respond_errno(fd: RawFd, id: u64, errno: i32) -> io::Result<()> {
521    let resp = SeccompNotifResp {
522        id,
523        val: 0,
524        error: -errno,
525        flags: 0,
526    };
527    send_resp_raw(fd, &resp)
528}
529
530/// Send a response with a synthetic return value.
531fn respond_value(fd: RawFd, id: u64, val: i64) -> io::Result<()> {
532    let resp = SeccompNotifResp {
533        id,
534        val,
535        error: 0,
536        flags: 0,
537    };
538    send_resp_raw(fd, &resp)
539}
540
541/// Fail-closed response used when fd injection fails.
542///
543/// Denies the syscall with `EACCES` rather than letting it continue: a
544/// `SECCOMP_USER_NOTIF_FLAG_CONTINUE` here would let the child's original
545/// syscall run unmediated against the host path, silently bypassing
546/// chroot/file confinement. (Regression guard: this must never be a CONTINUE
547/// response.)
548fn inject_failure_resp(id: u64) -> SeccompNotifResp {
549    SeccompNotifResp {
550        id,
551        val: 0,
552        error: -libc::EACCES,
553        flags: 0,
554    }
555}
556
557/// Inject a file descriptor into the child process using SECCOMP_ADDFD_FLAG_SEND.
558///
559/// Uses the SEND flag to atomically inject the fd and respond to the syscall.
560/// The ioctl return value is the fd number assigned in the child process.
561/// After this call, no additional SECCOMP_IOCTL_NOTIF_SEND is needed.
562fn inject_fd_and_send(fd: RawFd, id: u64, srcfd: RawFd, newfd_flags: u32) -> io::Result<i32> {
563    let addfd = SeccompNotifAddfd {
564        id,
565        flags: SECCOMP_ADDFD_FLAG_SEND,
566        srcfd: srcfd as u32,
567        newfd: 0,   // ignored when SECCOMP_ADDFD_FLAG_SETFD is not set
568        newfd_flags,
569    };
570    let ret = unsafe {
571        libc::ioctl(fd, SECCOMP_IOCTL_NOTIF_ADDFD as libc::c_ulong, &addfd as *const _)
572    };
573    if ret < 0 {
574        Err(io::Error::last_os_error())
575    } else {
576        Ok(ret as i32)
577    }
578}
579
580/// Inject a file descriptor into the child process (without responding).
581/// ioctl(fd, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd)
582fn inject_fd(fd: RawFd, id: u64, srcfd: RawFd, targetfd: i32) -> io::Result<()> {
583    let addfd = SeccompNotifAddfd {
584        id,
585        flags: 0,
586        srcfd: srcfd as u32,
587        newfd: targetfd as u32,
588        newfd_flags: 0,
589    };
590    let ret = unsafe {
591        libc::ioctl(fd, SECCOMP_IOCTL_NOTIF_ADDFD as libc::c_ulong, &addfd as *const _)
592    };
593    if ret < 0 {
594        Err(io::Error::last_os_error())
595    } else {
596        Ok(())
597    }
598}
599
600/// Raw ioctl to send a notification response.
601fn send_resp_raw(fd: RawFd, resp: &SeccompNotifResp) -> io::Result<()> {
602    let ret = unsafe {
603        libc::ioctl(fd, SECCOMP_IOCTL_NOTIF_SEND as libc::c_ulong, resp as *const _)
604    };
605    if ret < 0 {
606        Err(io::Error::last_os_error())
607    } else {
608        Ok(())
609    }
610}
611
612/// Check whether a notification ID is still valid (TOCTOU guard).
613/// ioctl(fd, SECCOMP_IOCTL_NOTIF_ID_VALID, &id)
614pub(crate) fn id_valid(fd: RawFd, id: u64) -> io::Result<()> {
615    let ret = unsafe {
616        libc::ioctl(fd, SECCOMP_IOCTL_NOTIF_ID_VALID as libc::c_ulong, &id as *const _)
617    };
618    if ret < 0 {
619        Err(io::Error::last_os_error())
620    } else {
621        Ok(())
622    }
623}
624
625/// Try to enable sync wakeup (Linux 6.7+). Ignores errors.
626fn try_set_sync_wakeup(fd: RawFd) {
627    let flags: u64 = SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP as u64;
628    unsafe {
629        libc::ioctl(fd, SECCOMP_IOCTL_NOTIF_SET_FLAGS as libc::c_ulong, &flags as *const _);
630    }
631}
632
633// ============================================================
634// Child memory access helpers
635// ============================================================
636
637/// Read bytes from a child process via process_vm_readv (single syscall).
638fn read_child_mem_vm(pid: u32, addr: u64, len: usize) -> Result<Vec<u8>, NotifError> {
639    let mut buf = vec![0u8; len];
640    let local_iov = libc::iovec {
641        iov_base: buf.as_mut_ptr() as *mut libc::c_void,
642        iov_len: len,
643    };
644    let remote_iov = libc::iovec {
645        iov_base: addr as *mut libc::c_void,
646        iov_len: len,
647    };
648    let ret = unsafe {
649        libc::process_vm_readv(pid as i32, &local_iov, 1, &remote_iov, 1, 0)
650    };
651    if ret < 0 {
652        Err(NotifError::ChildMemoryRead(io::Error::last_os_error()))
653    } else {
654        buf.truncate(ret as usize);
655        Ok(buf)
656    }
657}
658
659/// Write bytes to a child process via process_vm_writev (single syscall).
660fn write_child_mem_vm(pid: u32, addr: u64, data: &[u8]) -> Result<(), NotifError> {
661    let local_iov = libc::iovec {
662        iov_base: data.as_ptr() as *mut libc::c_void,
663        iov_len: data.len(),
664    };
665    let remote_iov = libc::iovec {
666        iov_base: addr as *mut libc::c_void,
667        iov_len: data.len(),
668    };
669    let ret = unsafe {
670        libc::process_vm_writev(pid as i32, &local_iov, 1, &remote_iov, 1, 0)
671    };
672    if ret < 0 {
673        Err(NotifError::ChildMemoryRead(io::Error::last_os_error()))
674    } else if (ret as usize) < data.len() {
675        Err(NotifError::ChildMemoryRead(io::Error::new(
676            io::ErrorKind::WriteZero,
677            format!("short write: {} of {} bytes", ret, data.len()),
678        )))
679    } else {
680        Ok(())
681    }
682}
683
684/// Read bytes from a child process via `process_vm_readv` with TOCTOU validation.
685///
686/// Calls `id_valid` before and after the read to ensure the notification is
687/// still live (kernel did not abort or release the trapped syscall while the
688/// supervisor was reading guest memory).
689///
690/// Public — used by downstream `Handler` implementations to read syscall
691/// arguments that the kernel passes by pointer (paths in `openat`, buffers
692/// in `write`/`writev`, etc.).
693pub fn read_child_mem(
694    notif_fd: RawFd,
695    id: u64,
696    pid: u32,
697    addr: u64,
698    len: usize,
699) -> Result<Vec<u8>, NotifError> {
700    id_valid(notif_fd, id).map_err(NotifError::Ioctl)?;
701    let result = read_child_mem_vm(pid, addr, len)?;
702    id_valid(notif_fd, id).map_err(NotifError::Ioctl)?;
703    Ok(result)
704}
705
706/// Read a NUL-terminated string from child memory without crossing unmapped
707/// page boundaries in a single `process_vm_readv` call.
708///
709/// TOCTOU-safe — internally calls [`read_child_mem`], inheriting the
710/// `id_valid` checks bracketing each `process_vm_readv` call.
711///
712/// Page-aware: reads up to a page boundary at a time and stops at the
713/// first NUL byte, never crossing into unmapped memory.  Returns
714/// `None` for `addr == 0`, `max_len == 0`, a read failure, or a string
715/// that exceeds `max_len` without a NUL.
716///
717/// Public — used by downstream `Handler` implementations that read
718/// path arguments from notifications (`openat`, `unlinkat`, `statx`,
719/// `newfstatat`, etc.).
720pub fn read_child_cstr(
721    notif_fd: RawFd,
722    id: u64,
723    pid: u32,
724    addr: u64,
725    max_len: usize,
726) -> Option<String> {
727    if addr == 0 || max_len == 0 {
728        return None;
729    }
730
731    const PAGE_SIZE: u64 = 4096;
732    let mut result = Vec::with_capacity(max_len.min(256));
733    let mut cur = addr;
734    while result.len() < max_len {
735        let page_remaining = PAGE_SIZE - (cur % PAGE_SIZE);
736        let remaining = max_len - result.len();
737        let to_read = page_remaining.min(remaining as u64) as usize;
738        let bytes = read_child_mem(notif_fd, id, pid, cur, to_read).ok()?;
739        if let Some(nul) = bytes.iter().position(|&b| b == 0) {
740            result.extend_from_slice(&bytes[..nul]);
741            return String::from_utf8(result).ok();
742        }
743        result.extend_from_slice(&bytes);
744        cur += to_read as u64;
745    }
746
747    String::from_utf8(result).ok()
748}
749
750/// Write bytes to a child process via `process_vm_writev` with TOCTOU validation.
751///
752/// Same TOCTOU contract as [`read_child_mem`].  Public for downstream
753/// `Handler` implementations that synthesise syscall results into
754/// guest memory (e.g. fake `getdents64` listings populated from a
755/// virtual directory index, or synthesised `stat` buffers).
756pub fn write_child_mem(
757    notif_fd: RawFd,
758    id: u64,
759    pid: u32,
760    addr: u64,
761    data: &[u8],
762) -> Result<(), NotifError> {
763    id_valid(notif_fd, id).map_err(NotifError::Ioctl)?;
764    write_child_mem_vm(pid, addr, data)?;
765    id_valid(notif_fd, id).map_err(NotifError::Ioctl)?;
766    Ok(())
767}
768
769// ============================================================
770// Response dispatch
771// ============================================================
772
773/// Dispatch a `NotifAction` to the appropriate low-level response function.
774fn send_response(fd: RawFd, id: u64, action: NotifAction) -> io::Result<()> {
775    match action {
776        NotifAction::Continue => respond_continue(fd, id),
777        NotifAction::Errno(errno) => respond_errno(fd, id, errno),
778        NotifAction::InjectFd { srcfd, targetfd } => {
779            inject_fd(fd, id, srcfd, targetfd)?;
780            respond_continue(fd, id)
781        }
782        NotifAction::InjectFdSend { srcfd, newfd_flags } => {
783            // SECCOMP_ADDFD_FLAG_SEND atomically injects the fd and responds.
784            // No separate NOTIF_SEND needed after this.
785            // On failure, deny (fail closed) rather than letting the original
786            // syscall continue unmediated against the host path.
787            // srcfd (OwnedFd) is dropped at end of this arm, closing the fd.
788            match inject_fd_and_send(fd, id, srcfd.as_raw_fd(), newfd_flags) {
789                Ok(_new_fd) => Ok(()),
790                Err(_) => send_resp_raw(fd, &inject_failure_resp(id)),
791            }
792        }
793        NotifAction::InjectFdSendTracked { srcfd, newfd_flags, on_success } => {
794            match inject_fd_and_send(fd, id, srcfd.as_raw_fd(), newfd_flags) {
795                Ok(new_fd) => {
796                    (on_success.0)(new_fd);
797                    Ok(())
798                }
799                Err(_) => send_resp_raw(fd, &inject_failure_resp(id)),
800            }
801        }
802        NotifAction::ReturnValue(val) => respond_value(fd, id, val),
803        NotifAction::Hold => Ok(()), // Don't send a response.
804        NotifAction::Defer(_) => {
805            // Defer is intercepted in `handle_notification` and never reaches
806            // here on the normal path. If it ever does, fail closed with EIO
807            // rather than dropping the future and wedging the child.
808            debug_assert!(false, "Defer reached send_response; should be intercepted earlier");
809            respond_errno(fd, id, libc::EIO)
810        }
811        NotifAction::Kill { sig, pgid } => {
812            // Kill the entire process group, then return ENOMEM so the
813            // seccomp notification is resolved (avoids a kernel warning).
814            unsafe { libc::killpg(pgid, sig) };
815            respond_errno(fd, id, ENOMEM)
816        }
817    }
818}
819
820// ============================================================
821// vDSO re-patching after exec
822// ============================================================
823
824/// Re-patch the vDSO if the base address changed (e.g. after exec replaces it).
825fn maybe_patch_vdso(pid: i32, procfs: &mut super::state::ProcfsState, policy: &NotifPolicy) {
826    let base = match crate::vdso::find_vdso_base(pid) {
827        Ok(addr) => addr,
828        Err(_) => return,
829    };
830    if base == procfs.vdso_patched_addr {
831        return; // already patched this vDSO
832    }
833    let time_offset = if policy.has_time_start { Some(policy.time_offset) } else { None };
834    if crate::vdso::patch(pid, time_offset, policy.has_random_seed).is_ok() {
835        procfs.vdso_patched_addr = base;
836    }
837}
838
839// ============================================================
840// Policy event emission
841// ============================================================
842
843/// Map a syscall number to a human-readable name for the policy callback.
844fn syscall_name(nr: i64) -> &'static str {
845    match nr {
846        n if n == libc::SYS_openat => "openat",
847        n if n == libc::SYS_connect => "connect",
848        n if n == libc::SYS_sendto => "sendto",
849        n if n == libc::SYS_sendmsg => "sendmsg",
850        n if n == libc::SYS_sendmmsg => "sendmmsg",
851        n if n == libc::SYS_bind => "bind",
852        n if n == libc::SYS_clone => "clone",
853        n if n == libc::SYS_clone3 => "clone3",
854        n if Some(n) == arch::sys_vfork() => "vfork",
855        n if Some(n) == arch::sys_fork() => "fork",
856        n if n == libc::SYS_execve => "execve",
857        n if n == libc::SYS_execveat => "execveat",
858        n if n == libc::SYS_mmap => "mmap",
859        n if n == libc::SYS_munmap => "munmap",
860        n if n == libc::SYS_brk => "brk",
861        n if n == libc::SYS_getrandom => "getrandom",
862        n if n == libc::SYS_unlinkat => "unlinkat",
863        n if n == libc::SYS_mkdirat => "mkdirat",
864        _ => "unknown",
865    }
866}
867
868/// Map a syscall number to a high-level category.
869fn syscall_category(nr: i64) -> crate::policy_fn::SyscallCategory {
870    use crate::policy_fn::SyscallCategory;
871    match nr {
872        n if n == libc::SYS_openat || n == libc::SYS_unlinkat
873            || n == libc::SYS_mkdirat || n == libc::SYS_renameat2
874            || n == libc::SYS_symlinkat || n == libc::SYS_linkat
875            || n == libc::SYS_fchmodat || n == libc::SYS_fchownat
876            || n == libc::SYS_truncate || n == libc::SYS_readlinkat
877            || n == libc::SYS_newfstatat || n == libc::SYS_statx
878            || n == libc::SYS_faccessat || n == libc::SYS_getdents64
879            || Some(n) == arch::sys_getdents() => SyscallCategory::File,
880        n if n == libc::SYS_connect || n == libc::SYS_sendto
881            || n == libc::SYS_sendmsg || n == libc::SYS_sendmmsg
882            || n == libc::SYS_bind
883            || n == libc::SYS_getsockname => SyscallCategory::Network,
884        n if n == libc::SYS_clone || n == libc::SYS_clone3
885            || Some(n) == arch::sys_vfork() || Some(n) == arch::sys_fork()
886            || n == libc::SYS_execve || n == libc::SYS_execveat => SyscallCategory::Process,
887        n if n == libc::SYS_mmap || n == libc::SYS_munmap
888            || n == libc::SYS_brk || n == libc::SYS_mremap
889            => SyscallCategory::Memory,
890        _ => SyscallCategory::File, // default
891    }
892}
893
/// Read the parent PID of `pid` from `/proc/{pid}/stat`.
///
/// Returns `None` when the file cannot be read or does not have the expected
/// `pid (comm) state ppid ...` layout.
fn read_ppid(pid: u32) -> Option<u32> {
    // Read raw bytes rather than a `String`: the `comm` field is set by the
    // process itself and need not be valid UTF-8, which would otherwise make
    // the whole read fail.
    let stat = std::fs::read(format!("/proc/{}/stat", pid)).ok()?;

    // `comm` may itself contain ')' or spaces, so anchor on the LAST ')'.
    let close_paren = stat.iter().rposition(|&b| b == b')')?;

    // Everything after the closing paren is whitespace-separated fields.
    // `close_paren + 1 <= stat.len()` always holds, so this slice cannot panic.
    let rest = std::str::from_utf8(&stat[close_paren + 1..]).ok()?;

    // Field 0 = state, field 1 = ppid.
    rest.split_whitespace().nth(1)?.parse().ok()
}
905
906/// Read a NUL-terminated path from child memory (up to 256 bytes).
907fn read_path_for_event(notif: &SeccompNotif, addr: u64, notif_fd: RawFd) -> Option<String> {
908    if addr == 0 { return None; }
909    let bytes = read_child_mem(notif_fd, notif.id, notif.pid, addr, 256).ok()?;
910    let nul = bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len());
911    String::from_utf8(bytes[..nul].to_vec()).ok()
912}
913
/// Lexically normalize `path` without touching the filesystem.
///
/// `.` components are dropped and each `..` cancels the preceding normal
/// component. For absolute paths a `..` at the root is ignored (`/..` is
/// `/`). For relative paths a `..` that has nothing left to cancel is kept,
/// so `../a` stays `../a` instead of silently turning into `a`.
///
/// An empty result is rendered as `/` (absolute) or `.` (relative).
/// Symlinks are not resolved. Non-UTF-8 components are converted lossily.
fn normalize_path(path: &std::path::Path) -> String {
    use std::path::{Component, PathBuf};

    let absolute = path.is_absolute();
    let mut normalized = if absolute {
        PathBuf::from("/")
    } else {
        PathBuf::new()
    };

    for component in path.components() {
        match component {
            // The root was already seeded above; `.` and prefixes add nothing.
            Component::Prefix(_) | Component::RootDir | Component::CurDir => {}
            Component::ParentDir => {
                let cancels_previous = matches!(
                    normalized.components().next_back(),
                    Some(Component::Normal(_))
                );
                if cancels_previous {
                    normalized.pop();
                } else if !absolute {
                    // Nothing to cancel in a relative path: the `..` is
                    // meaningful and must be preserved.
                    normalized.push("..");
                }
                // Absolute path already at the root: `..` is a no-op.
            }
            Component::Normal(part) => normalized.push(part),
        }
    }

    if normalized.as_os_str().is_empty() {
        // Only reachable for relative paths, since absolute ones keep "/".
        ".".into()
    } else {
        normalized.to_string_lossy().into_owned()
    }
}
940
941fn resolve_at_path_for_event(notif: &SeccompNotif, dirfd: i64, path: &str) -> Option<String> {
942    use std::path::Path;
943
944    if Path::new(path).is_absolute() {
945        return Some(normalize_path(Path::new(path)));
946    }
947
948    let dirfd32 = dirfd as i32;
949    let base = if dirfd32 == libc::AT_FDCWD {
950        std::fs::read_link(format!("/proc/{}/cwd", notif.pid)).ok()?
951    } else {
952        std::fs::read_link(format!("/proc/{}/fd/{}", notif.pid, dirfd32)).ok()?
953    };
954
955    Some(normalize_path(&base.join(path)))
956}
957
958fn resolve_path_for_notif(notif: &SeccompNotif, notif_fd: RawFd) -> Option<String> {
959    let nr = notif.data.nr as i64;
960    match nr {
961        n if n == libc::SYS_openat => {
962            // openat(dirfd, pathname, flags, mode)
963            let path = read_path_for_event(notif, notif.data.args[1], notif_fd)?;
964            resolve_at_path_for_event(notif, notif.data.args[0] as i64, &path)
965        }
966        n if Some(n) == arch::sys_open() || n == libc::SYS_execve => {
967            let path = read_path_for_event(notif, notif.data.args[0], notif_fd)?;
968            resolve_at_path_for_event(notif, libc::AT_FDCWD as i64, &path)
969        }
970        n if n == libc::SYS_execveat => {
971            let path = read_path_for_event(notif, notif.data.args[1], notif_fd)?;
972            resolve_at_path_for_event(notif, notif.data.args[0] as i64, &path)
973        }
974        // linkat(olddirfd, oldpath, newdirfd, newpath, flags)
975        // Check the source (old) path — deny if it's a denied file being linked away.
976        n if n == libc::SYS_linkat => {
977            let path = read_path_for_event(notif, notif.data.args[1], notif_fd)?;
978            resolve_at_path_for_event(notif, notif.data.args[0] as i64, &path)
979        }
980        // renameat2(olddirfd, oldpath, newdirfd, newpath, flags)
981        // Check the source (old) path — deny if a denied file is being renamed away.
982        n if n == libc::SYS_renameat2 => {
983            let path = read_path_for_event(notif, notif.data.args[1], notif_fd)?;
984            resolve_at_path_for_event(notif, notif.data.args[0] as i64, &path)
985        }
986        // symlinkat(target, newdirfd, linkpath)
987        // The target string is what the symlink points to; deny if it names a denied path.
988        n if n == libc::SYS_symlinkat => {
989            let target = read_path_for_event(notif, notif.data.args[0], notif_fd)?;
990            // target may be absolute or relative to the process cwd
991            resolve_at_path_for_event(notif, libc::AT_FDCWD as i64, &target)
992        }
993        // link(oldpath, newpath) — legacy, AT_FDCWD implied for both
994        n if Some(n) == arch::sys_link() => {
995            let path = read_path_for_event(notif, notif.data.args[0], notif_fd)?;
996            resolve_at_path_for_event(notif, libc::AT_FDCWD as i64, &path)
997        }
998        // rename(oldpath, newpath) — legacy, AT_FDCWD implied for both
999        n if Some(n) == arch::sys_rename() => {
1000            let path = read_path_for_event(notif, notif.data.args[0], notif_fd)?;
1001            resolve_at_path_for_event(notif, libc::AT_FDCWD as i64, &path)
1002        }
1003        // symlink(target, linkpath) — legacy
1004        n if Some(n) == arch::sys_symlink() => {
1005            let target = read_path_for_event(notif, notif.data.args[0], notif_fd)?;
1006            resolve_at_path_for_event(notif, libc::AT_FDCWD as i64, &target)
1007        }
1008        _ => None,
1009    }
1010}
1011
1012/// Resolve the second (destination) path for two-path syscalls.
1013///
1014/// Returns `None` for syscalls that only have a single path argument.
1015fn resolve_second_path_for_notif(notif: &SeccompNotif, notif_fd: RawFd) -> Option<String> {
1016    let nr = notif.data.nr as i64;
1017    match nr {
1018        // renameat2(olddirfd, oldpath, newdirfd, newpath, flags)
1019        n if n == libc::SYS_renameat2 => {
1020            let path = read_path_for_event(notif, notif.data.args[3], notif_fd)?;
1021            resolve_at_path_for_event(notif, notif.data.args[2] as i64, &path)
1022        }
1023        // linkat(olddirfd, oldpath, newdirfd, newpath, flags)
1024        // Destination of a hardlink to a denied file should also be denied
1025        // (prevents overwriting a denied file via linkat).
1026        n if n == libc::SYS_linkat => {
1027            let path = read_path_for_event(notif, notif.data.args[3], notif_fd)?;
1028            resolve_at_path_for_event(notif, notif.data.args[2] as i64, &path)
1029        }
1030        // rename(oldpath, newpath) — legacy
1031        n if Some(n) == arch::sys_rename() => {
1032            let path = read_path_for_event(notif, notif.data.args[1], notif_fd)?;
1033            resolve_at_path_for_event(notif, libc::AT_FDCWD as i64, &path)
1034        }
1035        // link(oldpath, newpath) — legacy
1036        n if Some(n) == arch::sys_link() => {
1037            let path = read_path_for_event(notif, notif.data.args[1], notif_fd)?;
1038            resolve_at_path_for_event(notif, libc::AT_FDCWD as i64, &path)
1039        }
1040        _ => None,
1041    }
1042}
1043
1044/// Extract IP and port from a sockaddr in child memory.
1045fn read_sockaddr_for_event(notif: &SeccompNotif, addr: u64, len: usize, notif_fd: RawFd)
1046    -> (Option<std::net::IpAddr>, Option<u16>)
1047{
1048    if addr == 0 || len < 4 { return (None, None); }
1049    let bytes = match read_child_mem(notif_fd, notif.id, notif.pid, addr, len.min(128)) {
1050        Ok(b) => b,
1051        Err(_) => return (None, None),
1052    };
1053    if bytes.len() < 4 { return (None, None); }
1054    let family = u16::from_ne_bytes([bytes[0], bytes[1]]);
1055    let port = u16::from_be_bytes([bytes[2], bytes[3]]);
1056    let ip = match family as u32 {
1057        f if f == crate::sys::structs::AF_INET && bytes.len() >= 8 => {
1058            Some(std::net::IpAddr::V4(std::net::Ipv4Addr::new(
1059                bytes[4], bytes[5], bytes[6], bytes[7],
1060            )))
1061        }
1062        f if f == crate::sys::structs::AF_INET6 && bytes.len() >= 24 => {
1063            let mut addr = [0u8; 16];
1064            addr.copy_from_slice(&bytes[8..24]);
1065            Some(std::net::IpAddr::V6(std::net::Ipv6Addr::from(addr)))
1066        }
1067        _ => None,
1068    };
1069    (ip, if port > 0 { Some(port) } else { None })
1070}
1071
1072/// Read argv (NULL-terminated array of char* in child memory) for execve.
1073/// Capped at 64 entries × 256 bytes/entry as a safety bound.
1074fn read_argv_for_event(notif: &SeccompNotif, argv_ptr: u64, notif_fd: RawFd) -> Option<Vec<String>> {
1075    if argv_ptr == 0 { return None; }
1076    let mut args = Vec::new();
1077    let ptr_size = std::mem::size_of::<u64>();
1078
1079    for i in 0..64u64 {
1080        let ptr_addr = argv_ptr + i * ptr_size as u64;
1081        let ptr_bytes = read_child_mem(notif_fd, notif.id, notif.pid, ptr_addr, ptr_size).ok()?;
1082        let str_ptr = u64::from_ne_bytes(ptr_bytes[..8].try_into().ok()?);
1083        if str_ptr == 0 { break; } // NULL terminator
1084
1085        if let Some(s) = read_path_for_event(notif, str_ptr, notif_fd) {
1086            args.push(s);
1087        } else {
1088            break;
1089        }
1090    }
1091
1092    if args.is_empty() { None } else { Some(args) }
1093}
1094
1095/// Resolve a held syscall's policy_fn gate outcome into a verdict.
1096///
1097/// `received` is the verdict the callback sent, or `None` if the gate timed
1098/// out or its channel closed before a decision arrived. A held syscall is one
1099/// whose verdict matters (execve, connect, openat, ...); when no decision
1100/// arrives we fail closed and deny rather than letting the syscall proceed.
1101fn resolve_held_gate(
1102    received: Option<crate::policy_fn::Verdict>,
1103) -> Option<crate::policy_fn::Verdict> {
1104    match received {
1105        Some(v) => Some(v),
1106        None => Some(crate::policy_fn::Verdict::Deny),
1107    }
1108}
1109
1110/// Emit a syscall event to the policy_fn callback thread (if active).
1111/// Returns the callback's verdict for held syscalls.
1112async fn emit_policy_event(
1113    notif: &SeccompNotif,
1114    action: &NotifAction,
1115    policy_fn_state: &Arc<tokio::sync::Mutex<super::state::PolicyFnState>>,
1116    notif_fd: RawFd,
1117) -> Option<crate::policy_fn::Verdict> {
1118    let pfs = policy_fn_state.lock().await;
1119    let tx = match pfs.event_tx.as_ref() {
1120        Some(tx) => tx.clone(),
1121        None => return None,
1122    };
1123    drop(pfs);
1124
1125    let nr = notif.data.nr as i64;
1126    let denied = matches!(action, NotifAction::Errno(_));
1127    let name = syscall_name(nr);
1128    let category = syscall_category(nr);
1129    let parent_pid = read_ppid(notif.pid);
1130
1131    // Extract metadata based on syscall type.
1132    //
1133    // Path strings are deliberately NOT extracted: the kernel re-reads
1134    // user-memory pointers after Continue, so any path-string-based
1135    // decision is racy (issue #27). Path-based access control belongs
1136    // in static Landlock rules.
1137    //
1138    // argv IS extracted for allowed execve/execveat notifications:
1139    // the supervisor freezes every task in the sandbox (siblings +
1140    // peers) before this callback reads argv and keeps that freeze
1141    // through Continue, so the post-Continue re-read sees the same
1142    // memory we read here.
1143    //
1144    // Network fields are TOCTOU-safe because connect/sendto/bind are
1145    // performed on-behalf via pidfd_getfd; the kernel never re-reads
1146    // child memory for those syscalls.
1147    let mut host = None;
1148    let mut port = None;
1149    let mut size = None;
1150    let mut argv = None;
1151
1152    if !denied && (nr == libc::SYS_execve || nr == libc::SYS_execveat) {
1153        // execve(pathname, argv, envp):       args[1] = argv ptr
1154        // execveat(dirfd, pathname, argv, ..): args[2] = argv ptr
1155        let argv_ptr = if nr == libc::SYS_execveat {
1156            notif.data.args[2]
1157        } else {
1158            notif.data.args[1]
1159        };
1160        argv = read_argv_for_event(notif, argv_ptr, notif_fd);
1161    }
1162
1163    if nr == libc::SYS_connect || nr == libc::SYS_sendto || nr == libc::SYS_bind {
1164        // connect(fd, addr, addrlen): args[1]=addr, args[2]=len
1165        let addr_ptr = notif.data.args[1];
1166        let addr_len = notif.data.args[2] as usize;
1167        let (h, p) = read_sockaddr_for_event(notif, addr_ptr, addr_len, notif_fd);
1168        host = h;
1169        port = p;
1170    }
1171
1172    if nr == libc::SYS_mmap {
1173        // mmap(addr, length, ...): args[1] = length
1174        size = Some(notif.data.args[1]);
1175    }
1176
1177    let event = crate::policy_fn::SyscallEvent {
1178        syscall: name.to_string(),
1179        category,
1180        pid: notif.pid,
1181        parent_pid,
1182        host,
1183        port,
1184        size,
1185        argv,
1186        denied,
1187    };
1188
1189    // Hold syscalls where the callback's verdict matters.
1190    // The child is blocked until the callback returns.
1191    let is_held = nr == libc::SYS_execve || nr == libc::SYS_execveat
1192        || nr == libc::SYS_connect || nr == libc::SYS_sendto
1193        || nr == libc::SYS_bind || nr == libc::SYS_openat;
1194
1195    if is_held {
1196        let (gate_tx, gate_rx) = tokio::sync::oneshot::channel();
1197        let _ = tx.send(crate::policy_fn::PolicyEvent {
1198            event,
1199            gate: Some(gate_tx),
1200        });
1201        let received = match tokio::time::timeout(std::time::Duration::from_secs(5), gate_rx).await {
1202            Ok(Ok(verdict)) => Some(verdict),
1203            _ => None, // timeout or channel closed
1204        };
1205        resolve_held_gate(received)
1206    } else {
1207        let _ = tx.send(crate::policy_fn::PolicyEvent {
1208            event,
1209            gate: None,
1210        });
1211        None
1212    }
1213}
1214
1215// ============================================================
1216// Per-notification handler (runs in a spawned task)
1217// ============================================================
1218
// NOTE: the items below (`DEFER_MAX_INFLIGHT`, `DEFER_TIMEOUT`,
// `run_deferred_within`, `spawn_deferred`) support `handle_notification`,
// which processes a single seccomp notification: vDSO re-patch, path denial
// check, dispatch, policy event emission, and response.

/// Maximum number of deferred handler futures running concurrently. Caps
/// the worker fan-out (and any resources those workers hold, e.g. memfds or
/// sockets) so a burst of deferrals cannot exhaust the supervisor process.
const DEFER_MAX_INFLIGHT: usize = 64;

/// Maximum time a deferred handler future may run before the supervisor gives
/// up and fails the trapped syscall closed. Bounds the worst case so a hung
/// future (e.g. a stalled network fetch in a token-injection handler) cannot
/// park the child forever or permanently leak its `DEFER_MAX_INFLIGHT` slot.
const DEFER_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(30);
1231
1232/// Drive a deferred future to its terminal action, bounded by `limit`.
1233///
1234/// On timeout, fail closed with `EIO` so the trapped child gets a definite
1235/// response instead of parking forever; `finalize_deferred` still guards a
1236/// future that resolves to a nested `Defer`.
1237async fn run_deferred_within(deferred: Deferred, limit: std::time::Duration) -> NotifAction {
1238    match tokio::time::timeout(limit, deferred.run()).await {
1239        Ok(action) => finalize_deferred(action),
1240        Err(_) => {
1241            eprintln!(
1242                "sandlock: deferred handler exceeded {:?}; failing syscall with EIO",
1243                limit
1244            );
1245            NotifAction::Errno(libc::EIO)
1246        }
1247    }
1248}
1249
1250/// Spawn a worker task that drives a deferred handler future to its terminal
1251/// action and sends the seccomp response, keyed by `id`. The `permit` is
1252/// held for the worker's lifetime, releasing its `DEFER_MAX_INFLIGHT` slot on
1253/// completion. A stale `id` (child exited mid-defer) makes `send_response`
1254/// a no-op, matching the inline path's "child may have exited" tolerance.
1255fn spawn_deferred(
1256    fd: RawFd,
1257    id: u64,
1258    deferred: Deferred,
1259    permit: tokio::sync::OwnedSemaphorePermit,
1260) {
1261    tokio::spawn(async move {
1262        let _permit = permit; // released when the worker finishes
1263        let action = run_deferred_within(deferred, DEFER_TIMEOUT).await;
1264        let _ = send_response(fd, id, action);
1265    });
1266}
1267
/// Process a single seccomp notification: vDSO re-patch, path denial check,
/// dispatch, policy event emission, and response.
///
/// The steps run in a fixed order:
/// 1. register the notifying pid if it is new, and re-patch its vDSO when
///    the policy uses time or random-seed virtualisation;
/// 2. compute the initial action (dynamic path denial, otherwise dispatch);
/// 3. for a `Continue` on an exec-like syscall with argv safety required,
///    freeze the sandbox (denying with `EPERM` if the freeze fails);
/// 4. emit the policy event and apply any verdict it returns;
/// 5. for a `Continue` on a fork-like syscall that needs tracking, prepare
///    process-creation tracking (denying with `EPERM` on failure);
/// 6. hand a `Defer` action to a worker task, or send the response inline
///    and then finish/abort tracking and release the freeze.
///
/// `fd` is the raw seccomp notification fd. `defer_sem` bounds the number of
/// deferred workers in flight.
async fn handle_notification(
    notif: SeccompNotif,
    ctx: &Arc<super::ctx::SupervisorCtx>,
    dispatch_table: &super::dispatch::DispatchTable,
    fd: RawFd,
    defer_sem: &Arc<tokio::sync::Semaphore>,
) {
    let policy = &ctx.policy;

    // Ensure every pid that produces a notification has per-process
    // supervisor state and an exit watcher. The fork handler runs on
    // the *parent* pid (the child doesn't exist yet at clone-time), so
    // the child gets registered the first time it issues a notified
    // syscall.
    crate::resource::register_child_if_new(ctx, notif.pid as i32).await;

    // Re-patch vDSO if needed (exec replaces it with a fresh copy).
    if policy.has_time_start || policy.has_random_seed {
        let mut pfs = ctx.procfs.lock().await;
        maybe_patch_vdso(notif.pid as i32, &mut pfs, policy);
    }

    // Check dynamic path denials before dispatch. The pre-check only runs
    // for path-taking syscalls and only when no chroot root is configured.
    let mut action = {
        let nr = notif.data.nr as i64;
        let mut path_check_nrs = vec![
            libc::SYS_openat, libc::SYS_execve, libc::SYS_execveat,
            libc::SYS_linkat, libc::SYS_renameat2, libc::SYS_symlinkat,
        ];
        // Legacy syscalls only exist on some architectures; `flatten` skips
        // the ones that are `None` here.
        path_check_nrs.extend([
            arch::sys_open(), arch::sys_link(), arch::sys_rename(), arch::sys_symlink(),
        ].into_iter().flatten());
        let should_precheck_denied = policy.chroot_root.is_none()
            && path_check_nrs.contains(&nr);
        if should_precheck_denied {
            let pfs = ctx.policy_fn.lock().await;
            if is_path_denied_for_notif(&pfs, &notif, fd) {
                NotifAction::Errno(libc::EACCES)
            } else {
                // Release the policy_fn lock before awaiting the dispatch.
                drop(pfs);
                dispatch_table.dispatch(notif, fd).await
            }
        } else {
            dispatch_table.dispatch(notif, fd).await
        }
    };

    let nr = notif.data.nr as i64;
    // Remember whether this Continue was counted as a fork, so the count can
    // be rolled back if the action is later changed to a denial.
    let fork_counted = matches!(action, NotifAction::Continue)
        && crate::resource::fork_counted_on_continue(&notif, fd);

    // TOCTOU-close for execve (issue #27): freeze every sandbox task
    // that could mutate argv before policy_fn reads argv and before the
    // kernel re-reads it after Continue. This covers two writer classes:
    //   1. Sibling threads of the calling tid (same TGID, share mm).
    //   2. Peer processes in other TGIDs that alias argv pages via
    //      MAP_SHARED mappings or share mm via clone(CLONE_VM).
    //
    // The freeze enumerates ProcessIndex. With policy_fn active, that
    // index is complete: fork-like syscalls are traced at creation time
    // below, before new children can run user code.
    //
    // Strict on failure: if we cannot establish the freeze, we cannot
    // safely expose argv or allow execve, so we deny with EPERM.
    let mut exec_freeze = None;
    if matches!(action, NotifAction::Continue)
        && policy.argv_safety_required
        && crate::freeze::requires_freeze_on_continue(nr)
    {
        match crate::freeze::freeze_sandbox_for_execve(
            &ctx.processes,
            notif.pid as i32,
        ) {
            Ok(outcome) => {
                exec_freeze = Some(outcome);
            }
            Err(e) => {
                eprintln!(
                    "sandlock: argv-safety freeze failed for pid {}: {} \
                     — denying execve to preserve TOCTOU invariant",
                    notif.pid, e
                );
                action = NotifAction::Errno(libc::EPERM);
            }
        }
    }

    // Emit event to policy_fn callback if active. For execve, argv is
    // only populated after `exec_freeze` has stopped every possible
    // writer, and those tasks stay stopped until after NOTIF_SEND.
    if let Some(verdict) = emit_policy_event(&notif, &action, &ctx.policy_fn, fd).await {
        use crate::policy_fn::Verdict;
        match verdict {
            Verdict::Deny => { action = NotifAction::Errno(libc::EPERM); }
            Verdict::DenyWith(errno) => { action = NotifAction::Errno(errno); }
            Verdict::Audit => { /* allow, but could log here */ }
            Verdict::Allow => {}
        }
    }

    // The fork was counted on the assumption of a Continue; undo the count
    // if the verdict (or the freeze failure above) turned it into a denial.
    if fork_counted && !matches!(action, NotifAction::Continue) {
        crate::resource::rollback_fork_count(&ctx.resource).await;
    }

    // With policy_fn active, fork-like syscalls are traced for exactly
    // one ptrace event so ProcessIndex becomes complete before the new
    // child can run user code. That closes the race where a peer
    // process could exist without ever having produced a notification.
    let mut creation_trace = None;
    if matches!(action, NotifAction::Continue)
        && crate::resource::requires_process_creation_tracking(&notif, fd, policy)
    {
        match crate::resource::prepare_process_creation_tracking(notif.pid as i32).await {
            Ok(trace) => {
                creation_trace = Some(trace);
            }
            Err(e) => {
                eprintln!(
                    "sandlock: process-creation tracking failed for pid {}: {} \
                     — denying fork-like syscall to preserve argv TOCTOU invariant",
                    notif.pid, e
                );
                if fork_counted {
                    crate::resource::rollback_fork_count(&ctx.resource).await;
                }
                action = NotifAction::Errno(libc::EPERM);
            }
        }
    }

    // Deferred response: run the handler's future on a worker task so the
    // single supervisor loop is not blocked waiting for slow work (a network
    // round-trip, a blocking syscall). The trapped child stays parked in the
    // syscall; the worker sends the real response later, keyed by notif.id.
    //
    // Deferral is refused on syscalls whose Continue path requires the
    // execve argv-safety freeze or fork creation-tracking: sending the
    // response off-loop would skip that TOCTOU-closing work. (When `action`
    // is Defer it is not Continue, so `exec_freeze`/`creation_trace` above
    // are already None — there is nothing to unwind here.)
    if let NotifAction::Defer(deferred) = action {
        if crate::freeze::requires_freeze_on_continue(nr)
            || crate::resource::requires_process_creation_tracking(&notif, fd, policy)
        {
            let _ = send_response(fd, notif.id, NotifAction::Errno(libc::EPERM));
            return;
        }
        match Arc::clone(defer_sem).try_acquire_owned() {
            Ok(permit) => spawn_deferred(fd, notif.id, deferred, permit),
            // Too many deferrals in flight: fail fast with EAGAIN rather than
            // blocking the loop or letting unbounded workers accrete.
            Err(_) => {
                let _ = send_response(fd, notif.id, NotifAction::Errno(libc::EAGAIN));
            }
        }
        return;
    }

    // Record whether a frozen exec is about to be continued; `action` is
    // moved into `send_response`, so this must be captured first.
    let exec_continued = exec_freeze.is_some() && matches!(action, NotifAction::Continue);
    // A send failure is tolerated (the child may have exited between recv
    // and response); the result only steers the cleanup below.
    let send_result = send_response(fd, notif.id, action);

    if let Some(trace) = creation_trace {
        if send_result.is_ok() {
            match crate::resource::finish_process_creation_tracking(ctx, trace).await {
                Ok(true) => {}
                // Tracking finished without confirming a new process: undo
                // the fork count.
                Ok(false) => {
                    crate::resource::rollback_fork_count(&ctx.resource).await;
                }
                Err(e) => {
                    crate::resource::rollback_fork_count(&ctx.resource).await;
                    eprintln!(
                        "sandlock: process-creation tracking completion failed for pid {}: {}",
                        notif.pid, e
                    );
                }
            }
        } else {
            // The response never reached the kernel: undo the fork count
            // and tear the trace down.
            crate::resource::rollback_fork_count(&ctx.resource).await;
            crate::resource::abort_process_creation_tracking(trace).await;
        }
    }

    if let Some(freeze) = exec_freeze {
        if exec_continued && send_result.is_ok() {
            // The exec was continued successfully: only the peers are
            // detached here.
            crate::freeze::detach_peers(&freeze.peer_tids);
        } else {
            // The exec was denied or the response failed: detach everything
            // covered by the freeze.
            crate::freeze::detach_all(&freeze);
        }
    }
}
1459
1460// ============================================================
1461// Main supervisor loop
1462// ============================================================
1463
1464/// Async event loop that processes seccomp notifications.
1465///
1466/// Runs until the notification fd is closed (child exits or filter is removed).
1467///
1468/// `pending_handlers` are user-supplied syscall handlers registered after all
1469/// builtin handlers.  For the default behaviour without any custom handlers
1470/// pass an empty `Vec`.
1471pub async fn supervisor(
1472    notif_fd: OwnedFd,
1473    ctx: Arc<super::ctx::SupervisorCtx>,
1474    pending_handlers: Vec<(i64, std::sync::Arc<dyn super::dispatch::Handler>)>,
1475    startup: tokio::sync::oneshot::Sender<io::Result<()>>,
1476) {
1477    // Register the notif fd with the Tokio IO driver so we can wait for
1478    // readiness via epoll instead of a dedicated blocking thread.
1479    let async_fd = match tokio::io::unix::AsyncFd::with_interest(
1480        notif_fd,
1481        tokio::io::Interest::READABLE,
1482    ) {
1483        Ok(fd) => fd,
1484        Err(err) => {
1485            let _ = startup.send(Err(err));
1486            return;
1487        }
1488    };
1489    let fd = async_fd.get_ref().as_raw_fd();
1490
1491    // Build the dispatch table once at startup.
1492    let dispatch_table = Arc::new(super::dispatch::build_dispatch_table(
1493        &ctx.policy,
1494        &ctx.resource,
1495        &ctx,
1496        pending_handlers,
1497    ));
1498
1499    // Try to enable sync wakeup (Linux 6.7+, ignore error on older kernels).
1500    try_set_sync_wakeup(fd);
1501
1502    // The IO driver has the fd registered; subsequent block_on cycles
1503    // can resume this task and pick up readiness events. Tell the
1504    // caller it is safe to release the child.
1505    let _ = startup.send(Ok(()));
1506
1507    // Periodic sweep as a defensive backstop in case pidfd-based
1508    // lifecycle cleanup misses an entry (e.g. pidfd_open failed for a
1509    // child on an old kernel, or its watcher panicked). At 5 minutes
1510    // this is cheap enough to leave on; the primary cleanup path is
1511    // still per-child pidfd readiness in `spawn_pid_watcher`.
1512    let gc = tokio::spawn(process_index_gc(Arc::clone(&ctx.processes)));
1513
1514    // Bounds the number of in-flight deferred handler futures (see
1515    // `DEFER_MAX_INFLIGHT`). Shared across all notifications this supervisor
1516    // processes.
1517    let defer_sem = Arc::new(tokio::sync::Semaphore::new(DEFER_MAX_INFLIGHT));
1518
1519    // Edge-triggered drain: each `readable().await` returns once per
1520    // epoll edge, then we drain the kernel queue via `probe_notif_fd`
1521    // until empty. The drain is necessary because tokio's AsyncFd is
1522    // edge-triggered and `recv_notif` does not signal "would block",
1523    // so a burst of arrivals between two `readable().await` calls
1524    // would coalesce into a single wake event.
1525    //
1526    // Notifications are processed sequentially (not spawned) to avoid
1527    // mutex contention between concurrent handlers.
1528    'outer: loop {
1529        let mut ready = match async_fd.readable().await {
1530            Ok(r) => r,
1531            Err(_) => break 'outer,
1532        };
1533        ready.clear_ready();
1534        drop(ready);
1535
1536        loop {
1537            match probe_notif_fd(fd) {
1538                NotifFdState::Pending => {
1539                    let notif = match recv_notif(fd) {
1540                        Ok(n) => n,
1541                        Err(e) if e.raw_os_error() == Some(libc::EINTR) => continue,
1542                        Err(_) => break 'outer,
1543                    };
1544                    handle_notification(notif, &ctx, &dispatch_table, fd, &defer_sem).await;
1545                }
1546                NotifFdState::Empty => break,
1547                NotifFdState::Terminal => break 'outer,
1548            }
1549        }
1550    }
1551
1552    gc.abort();
1553}
1554
1555/// Periodic sweep that drops `ProcessIndex` entries for exited PIDs.
1556/// Per-process state hangs off these entries via `Arc`, so dropping
1557/// them releases everything in one step.
1558async fn process_index_gc(processes: Arc<super::state::ProcessIndex>) {
1559    let interval = std::time::Duration::from_secs(300);
1560    loop {
1561        tokio::time::sleep(interval).await;
1562        if processes.len() == 0 {
1563            continue;
1564        }
1565        processes.prune_dead();
1566    }
1567}
1568
1569/// Spawn a per-child task that awaits the pidfd becoming readable
1570/// (process exit) and then runs unified cleanup across every
1571/// per-process supervisor map.
1572///
1573/// The watcher *owns* the pidfd via `AsyncFd<OwnedFd>` — the kernel
1574/// fd stays alive for as long as tokio's IO driver has it registered,
1575/// and is closed exactly once when the watcher task ends. This avoids
1576/// a TOCTOU where dropping the fd from a separate map could let a
1577/// recycled fd be deregistered from epoll.
1578pub(crate) fn spawn_pid_watcher(
1579    ctx: Arc<super::ctx::SupervisorCtx>,
1580    key: super::state::PidKey,
1581    pidfd: std::os::unix::io::OwnedFd,
1582) {
1583    tokio::spawn(async move {
1584        let async_fd = match tokio::io::unix::AsyncFd::with_interest(
1585            pidfd,
1586            tokio::io::Interest::READABLE,
1587        ) {
1588            Ok(f) => f,
1589            Err(_) => {
1590                // AsyncFd registration failed (extremely unusual);
1591                // fall back to immediate cleanup so we don't leak the
1592                // index entry. The OwnedFd we passed in is consumed
1593                // by `with_interest`'s Err return and will close on
1594                // drop here.
1595                cleanup_pid(&ctx, key).await;
1596                return;
1597            }
1598        };
1599        // pidfd becomes readable when the process exits; we don't
1600        // read any data, so `readable()` is just an await point.
1601        let _ = async_fd.readable().await;
1602        cleanup_pid(&ctx, key).await;
1603        // async_fd drops here, closing the pidfd.
1604    });
1605}
1606
1607/// Drop the supervisor's per-process state for `key`. With every
1608/// per-process map living inside `PerProcessState` (owned by
1609/// `ProcessIndex`), this is a single unregister — the entry's `Arc`
1610/// drops here, and remaining clones held by in-flight handlers will
1611/// drop with their tasks, freeing `PerProcessState` automatically.
1612pub(crate) async fn cleanup_pid(ctx: &super::ctx::SupervisorCtx, key: super::state::PidKey) {
1613    ctx.processes.unregister(key);
1614}
1615
1616// ============================================================
1617// Tests
1618// ============================================================
1619
#[cfg(test)]
mod tests {
    use super::*;
    use std::os::unix::io::FromRawFd;

    /// Kernel thread id of the calling thread, obtained via the raw
    /// `gettid` syscall.
    fn gettid() -> u32 {
        (unsafe { libc::syscall(libc::SYS_gettid) }) as u32
    }

    #[test]
    fn inject_failure_response_denies_not_continues() {
        // When fd injection fails, the supervisor must fail closed: deny the
        // syscall instead of letting it continue unmediated against the host
        // path (which would silently bypass chroot/file confinement).
        let resp = inject_failure_resp(123);
        assert_eq!(resp.id, 123);
        assert_eq!(
            resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE,
            0,
            "fd-injection failure must not respond with CONTINUE"
        );
        assert_ne!(resp.error, 0, "fd-injection failure must be a denial");
        assert_eq!(resp.error, -libc::EACCES);
    }

    #[test]
    fn held_gate_no_decision_denies() {
        use crate::policy_fn::Verdict;
        // A held syscall whose policy_fn gate times out or whose channel closes
        // (received == None) must fail closed: deny, not allow the syscall.
        assert!(matches!(resolve_held_gate(None), Some(Verdict::Deny)));
    }

    #[test]
    fn held_gate_passes_through_callback_verdict() {
        use crate::policy_fn::Verdict;
        // A real verdict from the callback is forwarded unchanged.
        assert!(matches!(
            resolve_held_gate(Some(Verdict::Allow)),
            Some(Verdict::Allow)
        ));
        assert!(matches!(
            resolve_held_gate(Some(Verdict::Deny)),
            Some(Verdict::Deny)
        ));
        assert!(matches!(
            resolve_held_gate(Some(Verdict::DenyWith(13))),
            Some(Verdict::DenyWith(13))
        ));
    }

    #[test]
    fn tgid_of_main_thread_is_own_pid() {
        // The thread-group leader's tid *is* the process pid, so resolving
        // the pid as a tid must yield the pid itself. The test harness may
        // run this function on a worker thread, so `gettid()` alone would not
        // necessarily exercise the leader.
        let pid = std::process::id();
        assert_eq!(tgid_of(pid), Some(pid));
        // Whichever thread runs this test belongs to the same process too.
        assert_eq!(tgid_of(gettid()), Some(pid));
    }

    #[test]
    fn tgid_of_worker_thread_resolves_to_process() {
        // A non-leader thread's Tgid is the process pid, not its own tid.
        let (tid_tx, tid_rx) = std::sync::mpsc::channel();
        let (done_tx, done_rx) = std::sync::mpsc::channel::<()>();
        let h = std::thread::spawn(move || {
            tid_tx.send(gettid()).unwrap();
            done_rx.recv().ok(); // stay alive until the test has read /proc
        });
        let worker_tid = tid_rx.recv().unwrap();
        let pid = std::process::id();
        assert_ne!(worker_tid, pid, "worker tid must differ from pid");
        assert_eq!(tgid_of(worker_tid), Some(pid));
        done_tx.send(()).ok();
        h.join().unwrap();
    }

    #[test]
    fn dup_fd_from_pid_handles_worker_thread_fd() {
        use std::os::unix::io::AsRawFd;
        // Open an fd in a non-leader worker thread, then duplicate it by that
        // thread's tid. Exercises the tid->process pidfd resolution end to end
        // (PIDFD_THREAD on >=6.9, the /proc Tgid fallback on older kernels).
        let (info_tx, info_rx) = std::sync::mpsc::channel();
        let (done_tx, done_rx) = std::sync::mpsc::channel::<()>();
        let h = std::thread::spawn(move || {
            let f = std::fs::File::open("/dev/null").unwrap();
            info_tx.send((gettid(), f.as_raw_fd())).unwrap();
            // Keep the file (and the thread) alive until the dup is done.
            done_rx.recv().ok();
            drop(f);
        });
        let (worker_tid, fd) = info_rx.recv().unwrap();
        let dup = dup_fd_from_pid(worker_tid, fd);
        done_tx.send(()).ok();
        h.join().unwrap();
        assert!(dup.is_ok(), "dup_fd_from_pid for a worker-thread fd failed: {:?}", dup.err());
    }

    #[test]
    fn read_child_cstr_returns_none_for_null_addr_or_zero_max_len() {
        // Smoke: addr == 0 short-circuits without touching the child.
        assert!(read_child_cstr(-1, 0, 0, 0, 4096).is_none());
        // max_len == 0 also short-circuits.
        assert!(read_child_cstr(-1, 0, 0, 0xdeadbeef, 0).is_none());
    }

    #[test]
    fn test_notif_action_debug() {
        // Ensure all variants implement Debug.
        let _ = format!("{:?}", NotifAction::Continue);
        let _ = format!("{:?}", NotifAction::Errno(1));
        let _ = format!("{:?}", NotifAction::InjectFd { srcfd: 3, targetfd: 4 });
        // Use a real fd (dup'd from stderr) so OwnedFd can safely close it.
        // `dup` returns -1 on failure, and wrapping -1 in an `OwnedFd`
        // violates its safety contract, so check the result first.
        let raw_fd = unsafe { libc::dup(2) };
        assert!(
            raw_fd >= 0,
            "dup of stderr failed: {}",
            std::io::Error::last_os_error()
        );
        // SAFETY: `raw_fd` is a freshly duplicated, valid fd that nothing
        // else owns.
        let test_fd = unsafe { OwnedFd::from_raw_fd(raw_fd) };
        let _ = format!("{:?}", NotifAction::InjectFdSend { srcfd: test_fd, newfd_flags: 0 });
        let _ = format!("{:?}", NotifAction::ReturnValue(42));
        let _ = format!("{:?}", NotifAction::Hold);
        let _ = format!("{:?}", NotifAction::Kill { sig: 9, pgid: 1 });
        let _ = format!("{:?}", NotifAction::defer(async { NotifAction::Continue }));
    }

    #[tokio::test]
    async fn deferred_future_need_not_be_sync() {
        // A deferred future may capture Send-but-not-Sync state across an
        // await. `Cell` is Send but never Sync; holding it across `.await`
        // makes the future !Sync. Only `Send` is required (the supervisor
        // moves the future to a worker, never shares it by reference).
        use std::cell::Cell;
        let action = NotifAction::defer(async move {
            let counter = Cell::new(0);
            counter.set(counter.get() + 41);
            tokio::task::yield_now().await; // hold the !Sync Cell across await
            NotifAction::ReturnValue(counter.get() + 1)
        });
        let NotifAction::Defer(d) = action else { panic!("expected Defer") };
        assert!(matches!(d.run().await, NotifAction::ReturnValue(42)));
    }

    #[tokio::test]
    async fn deferred_runs_to_its_terminal_action() {
        // A Defer carries a future; running it yields the deferred decision.
        let action = NotifAction::defer(async { NotifAction::ReturnValue(7) });
        let NotifAction::Defer(deferred) = action else {
            panic!("defer() must construct a NotifAction::Defer");
        };
        assert!(matches!(deferred.run().await, NotifAction::ReturnValue(7)));
    }

    #[tokio::test(start_paused = true)]
    async fn deferred_times_out_to_eio() {
        // A deferred future that exceeds its limit must fail closed (EIO) so
        // the trapped child gets a definite response instead of parking
        // forever (and leaking its DEFER_MAX_INFLIGHT slot).
        let slow = Deferred::new(async {
            tokio::time::sleep(std::time::Duration::from_secs(60)).await;
            NotifAction::ReturnValue(7)
        });
        let action = run_deferred_within(slow, std::time::Duration::from_secs(1)).await;
        assert!(matches!(action, NotifAction::Errno(e) if e == libc::EIO));
    }

    #[tokio::test(start_paused = true)]
    async fn deferred_within_limit_passes_through() {
        // A future that resolves within the limit returns its terminal action.
        let fast = Deferred::new(async { NotifAction::ReturnValue(7) });
        let action = run_deferred_within(fast, std::time::Duration::from_secs(1)).await;
        assert!(matches!(action, NotifAction::ReturnValue(7)));
    }

    #[test]
    fn finalize_deferred_collapses_nested_defer_to_eio() {
        // A deferred future that itself resolves to Defer is a bug: collapse
        // to EIO so the trapped child is never wedged waiting for a response.
        let nested = NotifAction::defer(async { NotifAction::Continue });
        assert!(matches!(finalize_deferred(nested), NotifAction::Errno(e) if e == libc::EIO));
        // Non-nested terminal actions pass through unchanged.
        assert!(matches!(finalize_deferred(NotifAction::Continue), NotifAction::Continue));
        assert!(matches!(
            finalize_deferred(NotifAction::ReturnValue(3)),
            NotifAction::ReturnValue(3)
        ));
    }

    #[test]
    fn content_memfd_roundtrips_content() {
        use std::io::Read;
        let fd = content_memfd(b"hello world", true).expect("content_memfd");
        // The fd is rewound to offset 0, so a plain read returns the content.
        let mut f = std::fs::File::from(fd);
        let mut buf = String::new();
        f.read_to_string(&mut buf).unwrap();
        assert_eq!(buf, "hello world");
    }

    #[test]
    fn content_memfd_sealed_applies_write_seal() {
        let fd = content_memfd(b"data", true).expect("content_memfd");
        let seals = unsafe { libc::fcntl(fd.as_raw_fd(), libc::F_GET_SEALS) };
        assert!(seals >= 0, "F_GET_SEALS failed");
        assert!(
            seals & libc::F_SEAL_WRITE != 0,
            "expected F_SEAL_WRITE on a sealed memfd, got {seals:#x}"
        );
    }

    #[test]
    fn content_memfd_unsealed_has_no_write_seal() {
        let fd = content_memfd(b"data", false).expect("content_memfd");
        let seals = unsafe { libc::fcntl(fd.as_raw_fd(), libc::F_GET_SEALS) };
        assert!(seals >= 0, "F_GET_SEALS failed");
        assert_eq!(
            seals & libc::F_SEAL_WRITE,
            0,
            "unsealed memfd must not carry a write seal, got {seals:#x}"
        );
    }

    #[test]
    fn inject_bytes_produces_sealed_cloexec_injectfdsend() {
        use std::io::Read;
        match NotifAction::inject_bytes(b"payload") {
            NotifAction::InjectFdSend { srcfd, newfd_flags } => {
                assert_eq!(newfd_flags, libc::O_CLOEXEC as u32);
                let seals = unsafe { libc::fcntl(srcfd.as_raw_fd(), libc::F_GET_SEALS) };
                assert!(seals & libc::F_SEAL_WRITE != 0, "inject_bytes must seal");
                let mut f = std::fs::File::from(srcfd);
                let mut buf = String::new();
                f.read_to_string(&mut buf).unwrap();
                assert_eq!(buf, "payload");
            }
            other => panic!("expected InjectFdSend, got {other:?}"),
        }
    }

    #[test]
    fn test_network_state_new() {
        // A fresh NetworkState starts unrestricted with no bound ports.
        let ns = super::super::state::NetworkState::new();
        assert!(matches!(ns.tcp_policy, NetworkPolicy::Unrestricted));
        assert!(matches!(ns.udp_policy, NetworkPolicy::Unrestricted));
        assert!(matches!(ns.icmp_policy, NetworkPolicy::Unrestricted));
        assert!(ns.port_map.bound_ports.is_empty());
    }

    #[test]
    fn test_time_random_state_new() {
        // Constructing with (None, None) leaves both fields unset.
        let tr = super::super::state::TimeRandomState::new(None, None);
        assert!(tr.time_offset.is_none());
        assert!(tr.random_state.is_none());
    }

    #[test]
    fn test_resource_state_new() {
        // The constructor stores the two limits and starts with zero usage.
        let rs = super::super::state::ResourceState::new(1024 * 1024, 10);
        assert_eq!(rs.mem_used, 0);
        assert_eq!(rs.max_memory_bytes, 1024 * 1024);
        assert_eq!(rs.max_processes, 10);
        assert!(!rs.hold_forks);
        assert!(rs.held_notif_ids.is_empty());
    }

    #[test]
    fn test_process_vm_readv_self() {
        // Read a known value back out of our own address space.
        let data: u64 = 0xDEADBEEF_CAFEBABE;
        let addr = &data as *const u64 as u64;
        let pid = std::process::id();
        // `expect` (rather than `assert!(is_ok())` + `unwrap()`) so that a
        // failure reports the underlying error.
        let bytes = read_child_mem_vm(pid, addr, 8).expect("read_child_mem_vm on own pid failed");
        let read_val = u64::from_ne_bytes(bytes[..8].try_into().unwrap());
        assert_eq!(read_val, 0xDEADBEEF_CAFEBABE);
    }

    #[test]
    fn test_process_vm_writev_self() {
        // Overwrite a local in our own address space and observe the change.
        let mut data: u64 = 0;
        let addr = &mut data as *mut u64 as u64;
        let pid = std::process::id();
        let payload = 0x1234567890ABCDEFu64.to_ne_bytes();
        // `expect` so that a failure reports the underlying error.
        write_child_mem_vm(pid, addr, &payload).expect("write_child_mem_vm on own pid failed");
        assert_eq!(data, 0x1234567890ABCDEF);
    }

    #[test]
    fn denylist_blocks_matching_cidr_allows_rest() {
        use crate::network::IpCidr;
        let policy = NetworkPolicy::DenyList {
            cidrs: vec![(IpCidr::parse("10.0.0.0/8").unwrap(), PortAllow::Any)],
            any_ip_ports: HashSet::new(),
            deny_all: false,
        };
        assert!(!policy.allows("10.1.2.3".parse().unwrap(), 443)); // denied
        assert!(policy.allows("8.8.8.8".parse().unwrap(), 443));   // allowed
    }

    #[test]
    fn denylist_blocks_any_ip_port() {
        let mut ports = HashSet::new();
        ports.insert(25u16);
        let policy = NetworkPolicy::DenyList {
            cidrs: Vec::new(),
            any_ip_ports: ports,
            deny_all: false,
        };
        assert!(!policy.allows("8.8.8.8".parse().unwrap(), 25)); // denied
        assert!(policy.allows("8.8.8.8".parse().unwrap(), 80));  // allowed
    }

    #[test]
    fn denylist_specific_ports_on_cidr() {
        use crate::network::IpCidr;
        let mut ports = HashSet::new();
        ports.insert(443u16);
        let policy = NetworkPolicy::DenyList {
            cidrs: vec![(IpCidr::parse("1.2.3.4/32").unwrap(), PortAllow::Specific(ports))],
            any_ip_ports: HashSet::new(),
            deny_all: false,
        };
        assert!(!policy.allows("1.2.3.4".parse().unwrap(), 443)); // denied
        assert!(policy.allows("1.2.3.4".parse().unwrap(), 80));   // allowed
    }

    #[test]
    fn allowlist_permits_matching_cidr_only() {
        use crate::network::IpCidr;
        let mut ports = HashSet::new();
        ports.insert(80u16);
        let policy = NetworkPolicy::AllowList {
            per_ip: HashMap::new(),
            cidrs: vec![(IpCidr::parse("10.0.0.0/8").unwrap(), PortAllow::Specific(ports))],
            any_ip_ports: HashSet::new(),
        };
        assert!(policy.allows("10.1.2.3".parse().unwrap(), 80));   // in range, port ok
        assert!(!policy.allows("10.1.2.3".parse().unwrap(), 443)); // in range, wrong port
        assert!(!policy.allows("8.8.8.8".parse().unwrap(), 80));   // out of range
    }

    #[test]
    fn allowlist_cidr_all_ports() {
        use crate::network::IpCidr;
        let policy = NetworkPolicy::AllowList {
            per_ip: HashMap::new(),
            cidrs: vec![(IpCidr::parse("192.168.0.0/16").unwrap(), PortAllow::Any)],
            any_ip_ports: HashSet::new(),
        };
        assert!(policy.allows("192.168.5.5".parse().unwrap(), 9999)); // any port in range
        assert!(!policy.allows("10.0.0.1".parse().unwrap(), 9999));   // out of range
    }
}