Skip to main content

sandlock_core/seccomp/
notif.rs

1// Seccomp user notification supervisor — async event loop that receives
2// notifications from the kernel, dispatches them to handler functions, and
3// sends responses.
4
5use std::collections::{HashMap, HashSet};
6use std::future::Future;
7use std::io;
8use std::net::IpAddr;
9use std::os::unix::io::{AsRawFd, OwnedFd, RawFd};
10use std::pin::Pin;
11use std::sync::Arc;
12
13use crate::error::NotifError;
14use crate::arch;
15use crate::sys::structs::{
16    SeccompNotif, SeccompNotifAddfd, SeccompNotifResp,
17    SECCOMP_ADDFD_FLAG_SEND, SECCOMP_IOCTL_NOTIF_ADDFD, SECCOMP_IOCTL_NOTIF_ID_VALID, SECCOMP_IOCTL_NOTIF_RECV,
18    SECCOMP_IOCTL_NOTIF_SEND, SECCOMP_IOCTL_NOTIF_SET_FLAGS,
19    SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP, SECCOMP_USER_NOTIF_FLAG_CONTINUE,
20    ENOMEM,
21};
22
23// ============================================================
24// NotifAction — how the supervisor should respond
25// ============================================================
26
/// One-shot hook that is handed the child-side fd number returned by
/// `SECCOMP_IOCTL_NOTIF_ADDFD` once an `InjectFdSendTracked` injection has
/// succeeded.
///
/// The closure is boxed behind a newtype so that a hand-written `Debug` impl
/// can be supplied, which lets `NotifAction` keep `#[derive(Debug)]`.  The
/// closure is bounded `Send + Sync` so that `&NotifAction` stays `Send`
/// (the notifier loop borrows `NotifAction` across `.await` points).
pub struct OnInjectSuccess(pub Box<dyn FnOnce(i32) + Send + Sync>);

impl OnInjectSuccess {
    /// Box `f` as the success callback.
    pub fn new<F>(f: F) -> Self
    where
        F: FnOnce(i32) + Send + Sync + 'static,
    {
        OnInjectSuccess(Box::new(f))
    }
}

impl std::fmt::Debug for OnInjectSuccess {
    /// Prints a fixed placeholder; the closure itself is opaque.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "OnInjectSuccess(<callback>)")
    }
}
46
47/// A deferred decision: an owned, `'static` future that produces the real
48/// [`NotifAction`] off the supervisor's notification loop.
49///
50/// A handler returns [`NotifAction::Defer`] when computing the response is
51/// slow (a network round-trip, a blocking syscall) and must not stall the
52/// single supervisor task that gates every other trapped syscall.  The
53/// supervisor moves the future onto a worker, lets the loop proceed, and
54/// sends the response (via the still-valid `notif.id`) when the future
55/// resolves.  The future is `'static` because it outlives the borrowed
56/// `HandlerCtx` — capture what you need (`notif` is `Copy`, `notif_fd` is a
57/// `RawFd`) by value rather than borrowing `&self`.
58///
59/// The deferred future need only be `Send` (not `Sync`): the supervisor
60/// moves it onto a worker task and never shares it by reference.  Requiring
61/// `Sync` of user futures would be a leaky bound (it would reject a future
62/// capturing, say, a `Cell`), so it is not required.
63pub struct Deferred(Pin<Box<dyn Future<Output = NotifAction> + Send + 'static>>);
64
65// Safety: `NotifAction` must stay `Sync` so it can live in `Sync` contexts
66// (handler `&self` state, etc.; the `Handler` trait is `Send + Sync`), which
67// requires `Deferred: Sync`.  A `Send`-only future is not `Sync`, but the
68// boxed future is unreachable through a shared `&Deferred`: the field is
69// private, `Debug` touches only a static string, and `run(self)` consumes
70// the value (it is never callable through `&self`).  With no path to poll or
71// read the future via a shared reference, sharing `&Deferred` across threads
72// cannot race, so asserting `Sync` is sound while keeping user futures
73// `Send`-only.
74unsafe impl Sync for Deferred {}
75
76impl std::fmt::Debug for Deferred {
77    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
78        f.write_str("Deferred(<future>)")
79    }
80}
81
82impl Deferred {
83    pub fn new<F: Future<Output = NotifAction> + Send + 'static>(f: F) -> Self {
84        Self(Box::pin(f))
85    }
86
87    /// Drive the deferred future to its terminal action.  Consumes `self`
88    /// because the future is run exactly once, on a worker task.
89    pub async fn run(self) -> NotifAction {
90        self.0.await
91    }
92}
93
94/// How the supervisor should respond to a notification.
95#[derive(Debug)]
96pub enum NotifAction {
97    /// SECCOMP_USER_NOTIF_FLAG_CONTINUE — let the syscall proceed.
98    Continue,
99    /// Return -1 with the given errno.
100    Errno(i32),
101    /// Inject a file descriptor into the child, then continue.
102    InjectFd { srcfd: RawFd, targetfd: i32 },
103    /// Inject a file descriptor using SECCOMP_ADDFD_FLAG_SEND (atomically responds).
104    /// The child sees the injected fd as the return value of the syscall.
105    /// The `OwnedFd` is closed automatically after the ioctl completes.
106    /// `newfd_flags` controls flags on the injected fd (e.g. O_CLOEXEC).
107    InjectFdSend { srcfd: OwnedFd, newfd_flags: u32 },
108    /// Like `InjectFdSend`, but also invokes `on_success` with the
109    /// child-side fd number that `SECCOMP_IOCTL_NOTIF_ADDFD` returned.
110    /// Used when the caller needs to track the exact fd number allocated
111    /// in the child (e.g. to key per-fd state without TOCTOU).
112    InjectFdSendTracked {
113        srcfd: OwnedFd,
114        newfd_flags: u32,
115        on_success: OnInjectSuccess,
116    },
117    /// Synthetic return value (the child sees this as the syscall result).
118    ReturnValue(i64),
119    /// Don't respond — used for checkpoint/freeze.
120    Hold,
121    /// Kill the child process group (OOM-kill semantics).
122    /// Fields: signal, process group leader pid.
123    Kill { sig: i32, pgid: i32 },
124    /// Defer the response: run the carried future on a worker task and
125    /// send its terminal action later, keyed by `notif.id`.  Non-`Continue`,
126    /// so it short-circuits the handler chain — a deferring handler makes a
127    /// terminal decision.  See [`Deferred`].
128    Defer(Deferred),
129}
130
131impl NotifAction {
132    /// Construct a [`NotifAction::Defer`] from a `'static` future.  Ergonomic
133    /// shorthand for `NotifAction::Defer(Deferred::new(fut))`.
134    pub fn defer<F: Future<Output = NotifAction> + Send + 'static>(fut: F) -> Self {
135        NotifAction::Defer(Deferred::new(fut))
136    }
137
138    /// Inject `content` into the child as the syscall's returned fd, backed by
139    /// a sealed (read-only, fixed-size), `O_CLOEXEC` in-memory file.
140    ///
141    /// The fd is created, populated, sealed, and owned end to end by sandlock;
142    /// the caller never sees or closes it. On allocation failure this collapses
143    /// to `Errno(EIO)`, so a handler can return it directly:
144    ///
145    /// ```ignore
146    /// return NotifAction::inject_bytes(&secret);
147    /// ```
148    ///
149    /// For a *writable* injected fd, build one with
150    /// [`content_memfd(content, false)`](content_memfd) and pass it to
151    /// [`NotifAction::InjectFdSend`] yourself.
152    pub fn inject_bytes(content: &[u8]) -> NotifAction {
153        match content_memfd(content, true) {
154            Ok(fd) => NotifAction::InjectFdSend {
155                srcfd: fd,
156                newfd_flags: libc::O_CLOEXEC as u32,
157            },
158            Err(_) => NotifAction::Errno(libc::EIO),
159        }
160    }
161}
162
163/// Create an anonymous in-memory file ("memfd") populated with `content` and
164/// rewound to offset 0, ready to inject as a syscall's returned fd via
165/// [`NotifAction::InjectFdSend`].
166///
167/// When `seal` is true the fd is sealed read-only and fixed-size
168/// (`F_SEAL_SEAL | F_SEAL_WRITE | F_SEAL_GROW | F_SEAL_SHRINK`) so the guest
169/// cannot modify or resize the content it is handed. Sealing is best-effort:
170/// on a kernel without sealing support the fd is still returned, bounded by
171/// the rest of the policy. Pass `false` only when the guest genuinely needs a
172/// writable injected fd.
173///
174/// Most callers want [`NotifAction::inject_bytes`], which wraps this in the
175/// common sealed + `O_CLOEXEC` configuration.
176pub fn content_memfd(content: &[u8], seal: bool) -> io::Result<OwnedFd> {
177    use std::io::{Seek, SeekFrom, Write};
178    use std::os::unix::io::FromRawFd;
179
180    let flags = if seal {
181        (libc::MFD_CLOEXEC | libc::MFD_ALLOW_SEALING) as u32
182    } else {
183        libc::MFD_CLOEXEC as u32
184    };
185    let memfd = crate::sys::syscall::memfd_create("sandlock-content", flags)?;
186
187    // Write the content and rewind. Borrow the raw fd for File I/O without
188    // transferring ownership: `memfd` (the OwnedFd) keeps owning it.
189    {
190        let raw = memfd.as_raw_fd();
191        let mut file = unsafe { std::fs::File::from_raw_fd(raw) };
192        let res = file
193            .write_all(content)
194            .and_then(|()| file.seek(SeekFrom::Start(0)).map(|_| ()));
195        std::mem::forget(file); // don't close `raw`; `memfd` still owns it
196        res?;
197    }
198
199    if seal {
200        // Best-effort: ignore failure on kernels lacking sealing support.
201        let seals =
202            libc::F_SEAL_SEAL | libc::F_SEAL_WRITE | libc::F_SEAL_GROW | libc::F_SEAL_SHRINK;
203        unsafe { libc::fcntl(memfd.as_raw_fd(), libc::F_ADD_SEALS, seals) };
204    }
205
206    Ok(memfd)
207}
208
209/// Collapse a deferred future's resolved action into a sendable terminal
210/// action.  A deferred future that itself resolves to `Defer` is a bug
211/// (no nested deferral); collapse it to `EIO` so the trapped child gets a
212/// definite response instead of wedging forever waiting for one.
213fn finalize_deferred(action: NotifAction) -> NotifAction {
214    match action {
215        NotifAction::Defer(_) => NotifAction::Errno(libc::EIO),
216        other => other,
217    }
218}
219
220// ============================================================
221// NetworkPolicy — network access policy enum
222// ============================================================
223
/// Which ports are allowed for one particular IP.  `Any` exists for
/// `policy_fn` IP-only overrides (the legacy `restrict_network(ips)` API,
/// where the destination IP set is restricted but ports are not).
#[derive(Debug, Clone)]
pub enum PortAllow {
    /// Every port is permitted to this IP.
    Any,
    /// Only the listed ports are permitted to this IP.
    Specific(HashSet<u16>),
}

/// Sandbox-wide network policy.
#[derive(Debug, Clone)]
pub enum NetworkPolicy {
    /// No IP-level restriction (no `--net-allow` configured and no
    /// `policy_fn` override).  Ports are enforced by the Landlock direct
    /// path instead.
    Unrestricted,
    /// Endpoint allowlist: a connection is permitted iff its destination IP
    /// and port match at least one of the entries below.
    AllowList {
        /// Per-IP port rules, from `--net-allow host:ports` after hostname
        /// resolution or from `policy_fn` overrides.
        per_ip: HashMap<IpAddr, PortAllow>,
        /// Ports permitted regardless of IP (from `--net-allow :port` /
        /// `*:port`).
        any_ip_ports: HashSet<u16>,
    },
}

impl NetworkPolicy {
    /// Whether a connection to `(ip, port)` is permitted.
    ///
    /// `Unrestricted` allows everything.  For an `AllowList`, the port is
    /// first looked up in the any-IP set; failing that, the IP must have a
    /// rule that is either `Any` or lists the port.
    pub fn allows(&self, ip: IpAddr, port: u16) -> bool {
        let (per_ip, any_ip_ports) = match self {
            NetworkPolicy::Unrestricted => return true,
            NetworkPolicy::AllowList { per_ip, any_ip_ports } => (per_ip, any_ip_ports),
        };
        any_ip_ports.contains(&port)
            || per_ip.get(&ip).map_or(false, |rule| match rule {
                PortAllow::Any => true,
                PortAllow::Specific(ports) => ports.contains(&port),
            })
    }
}
271
272/// Check if a path-bearing notification targets a denied path.
273///
274/// For two-path syscalls (renameat2, linkat), checks both source and
275/// destination paths — a denied file must not be linked, renamed, or
276/// overwritten.
277///
278/// Each resolved path is checked both as-is (lexical normalization) and
279/// after following symlinks via `canonicalize`.  This prevents bypass via
280/// pre-existing symlinks, relative symlinks, or symlink chains that
281/// ultimately resolve to a denied path.
282pub(crate) fn is_path_denied_for_notif(
283    policy_fn_state: &super::state::PolicyFnState,
284    notif: &SeccompNotif,
285    notif_fd: RawFd,
286) -> bool {
287    if let Some(path) = resolve_path_for_notif(notif, notif_fd) {
288        if is_denied_with_symlink_resolve(policy_fn_state, &path) {
289            return true;
290        }
291    }
292    // For two-path syscalls, also check the second (destination) path.
293    if let Some(path) = resolve_second_path_for_notif(notif, notif_fd) {
294        if is_denied_with_symlink_resolve(policy_fn_state, &path) {
295            return true;
296        }
297    }
298    false
299}
300
301/// Check a path against denied entries, also resolving symlinks.
302///
303/// First checks the lexical path, then `canonicalize`s to follow symlinks
304/// and checks the real path.  This catches pre-existing symlinks, relative
305/// symlinks, and symlink chains that resolve to a denied file.
306fn is_denied_with_symlink_resolve(
307    policy_fn_state: &super::state::PolicyFnState,
308    path: &str,
309) -> bool {
310    // Check the literal (lexically normalized) path first.
311    if policy_fn_state.is_path_denied(path) {
312        return true;
313    }
314    // Follow symlinks and re-check against denied entries.
315    if let Ok(real) = std::fs::canonicalize(path) {
316        if policy_fn_state.is_path_denied(&real.to_string_lossy()) {
317            return true;
318        }
319    }
320    false
321}
322
/// Look up the thread-group id of `tid` by scanning `/proc/<tid>/status` for
/// a `Tgid:` line.  Returns `None` if the file cannot be read or no such line
/// parses as a number.
fn tgid_of(tid: u32) -> Option<u32> {
    let status_path = format!("/proc/{}/status", tid);
    let status = std::fs::read_to_string(status_path).ok()?;
    for line in status.lines() {
        if let Some(value) = line.strip_prefix("Tgid:") {
            if let Ok(tgid) = value.trim().parse() {
                return Some(tgid);
            }
        }
    }
    None
}
330
331/// Duplicate a file descriptor from an arbitrary process (by PID/TID) into the supervisor.
332///
333/// `pidfd_getfd` (Linux 5.6+) needs a pidfd for the owning *process*. All threads
334/// of a process share one fd table, so the process's pidfd dups any thread's fd:
335/// `pidfd_open(pid, 0)` gives it directly when `pid` is a thread-group leader,
336/// otherwise we resolve the leader via `Tgid` in `/proc/<pid>/status` and open
337/// that. The triggering thread is frozen on the seccomp notification, so its
338/// Tgid cannot race with pid reuse. Works on any kernel with `pidfd_getfd`.
339pub(crate) fn dup_fd_from_pid(pid: u32, target_fd: i32) -> io::Result<OwnedFd> {
340    use crate::sys::syscall::{pidfd_getfd, pidfd_open};
341    let pidfd = pidfd_open(pid, 0).or_else(|e| match tgid_of(pid) {
342        Some(tgid) if tgid != pid => pidfd_open(tgid, 0),
343        _ => Err(e),
344    })?;
345    pidfd_getfd(&pidfd, target_fd, 0)
346}
347
348// ============================================================
349// NotifPolicy — policy for the notification supervisor
350// ============================================================
351
/// Policy for the notification supervisor.
///
/// A plain bag of settings: limits, feature switches, and the chroot /
/// virtualisation configuration the supervisor consults. Only two of these
/// fields are consumed in this part of the file: `maybe_patch_vdso` reads
/// `has_time_start`, `time_offset` and `has_random_seed`.
pub struct NotifPolicy {
    // NOTE(review): presumably only enforced when `has_memory_limit` is
    // set — confirm against the consumer.
    pub max_memory_bytes: u64,
    pub max_processes: u32,
    pub has_memory_limit: bool,
    pub has_net_allowlist: bool,
    /// Forwarded to `vdso::patch` by `maybe_patch_vdso`.
    pub has_random_seed: bool,
    /// When true, `maybe_patch_vdso` passes `Some(time_offset)` to
    /// `vdso::patch`; otherwise it passes `None`.
    pub has_time_start: bool,
    /// Argv-safety gate: the supervisor must freeze every task that
    /// could mutate argv before any consumer reads it. True when
    /// `policy_fn` is active or when a handler is bound to
    /// execve/execveat (such handlers can call `read_child_mem`).
    /// Also gates ptrace fork-event tracking so `ProcessIndex` is
    /// complete when the freeze enumerates it.
    pub argv_safety_required: bool,
    /// Offset handed to `vdso::patch` when `has_time_start` is set.
    // NOTE(review): unit is not visible here — confirm.
    pub time_offset: i64,
    pub num_cpus: Option<u32>,
    pub port_remap: bool,
    pub cow_enabled: bool,
    pub chroot_root: Option<std::path::PathBuf>,
    /// Virtual paths allowed for reading under chroot (original user-specified paths).
    pub chroot_readable: Vec<std::path::PathBuf>,
    /// Virtual paths allowed for writing under chroot (original user-specified paths).
    pub chroot_writable: Vec<std::path::PathBuf>,
    /// Virtual paths explicitly denied under chroot.
    pub chroot_denied: Vec<std::path::PathBuf>,
    /// Mount mappings: (virtual_path, host_path) pairs.
    pub chroot_mounts: Vec<(std::path::PathBuf, std::path::PathBuf)>,
    pub deterministic_dirs: bool,
    pub virtual_hostname: Option<String>,
    pub has_http_acl: bool,
    /// Synthetic `/etc/hosts` served to the sandbox. Always populated:
    /// `openat("/etc/hosts")` returns a memfd with this content so the
    /// host's on-disk `/etc/hosts` never leaks in. The content is the
    /// loopback base plus any concrete hostnames resolved from `net_allow`.
    pub virtual_etc_hosts: String,
}
389
390// ============================================================
391// Low-level ioctl helpers
392// ============================================================
393
394/// Receive a seccomp notification from the kernel.
395/// ioctl(fd, SECCOMP_IOCTL_NOTIF_RECV, &notif)
396fn recv_notif(fd: RawFd) -> io::Result<SeccompNotif> {
397    let mut notif: SeccompNotif = unsafe { std::mem::zeroed() };
398    let ret = unsafe {
399        libc::ioctl(fd, SECCOMP_IOCTL_NOTIF_RECV as libc::c_ulong, &mut notif as *mut _)
400    };
401    if ret < 0 {
402        Err(io::Error::last_os_error())
403    } else {
404        Ok(notif)
405    }
406}
407
408/// Result of a non-blocking probe on the seccomp notif fd.
409enum NotifFdState {
410    /// At least one INIT-state notification is queued. `recv_notif`
411    /// will return without blocking.
412    Pending,
413    /// No notifications and no terminal flags. Wait for the next
414    /// epoll edge before probing again.
415    Empty,
416    /// `POLLHUP`/`POLLERR`/`POLLNVAL` set, or `poll(2)` itself failed:
417    /// filter has been released or the fd is invalid. The supervisor
418    /// should exit; subsequent waits would busy-spin because epoll
419    /// keeps reporting the fd ready.
420    Terminal,
421}
422
423/// Non-blocking probe of the seccomp notif fd.
424///
425/// `SECCOMP_IOCTL_NOTIF_RECV` ignores `O_NONBLOCK` and calls
426/// `wait_event_interruptible` unconditionally (kernel/seccomp.c
427/// `seccomp_notify_recv`). So `recv_notif` cannot be invoked
428/// speculatively to detect an empty queue. This helper uses
429/// `poll(timeout=0)` as a non-blocking predictor: if POLLIN is set
430/// the kernel will hand us a notification without blocking; if a
431/// terminal flag is set the fd will keep waking AsyncFd until the
432/// supervisor exits.
433fn probe_notif_fd(fd: RawFd) -> NotifFdState {
434    let mut pfd = libc::pollfd {
435        fd,
436        events: libc::POLLIN,
437        revents: 0,
438    };
439    let r = unsafe { libc::poll(&mut pfd, 1, 0) };
440    if r > 0 && (pfd.revents & libc::POLLIN) != 0 {
441        return NotifFdState::Pending;
442    }
443    if r < 0 || (pfd.revents & (libc::POLLHUP | libc::POLLERR | libc::POLLNVAL)) != 0 {
444        return NotifFdState::Terminal;
445    }
446    NotifFdState::Empty
447}
448
449/// Send a response with SECCOMP_USER_NOTIF_FLAG_CONTINUE.
450fn respond_continue(fd: RawFd, id: u64) -> io::Result<()> {
451    let resp = SeccompNotifResp {
452        id,
453        val: 0,
454        error: 0,
455        flags: SECCOMP_USER_NOTIF_FLAG_CONTINUE,
456    };
457    send_resp_raw(fd, &resp)
458}
459
460/// Send a response that returns -1 with the given errno.
461fn respond_errno(fd: RawFd, id: u64, errno: i32) -> io::Result<()> {
462    let resp = SeccompNotifResp {
463        id,
464        val: 0,
465        error: -errno,
466        flags: 0,
467    };
468    send_resp_raw(fd, &resp)
469}
470
471/// Send a response with a synthetic return value.
472fn respond_value(fd: RawFd, id: u64, val: i64) -> io::Result<()> {
473    let resp = SeccompNotifResp {
474        id,
475        val,
476        error: 0,
477        flags: 0,
478    };
479    send_resp_raw(fd, &resp)
480}
481
482/// Fail-closed response used when fd injection fails.
483///
484/// Denies the syscall with `EACCES` rather than letting it continue: a
485/// `SECCOMP_USER_NOTIF_FLAG_CONTINUE` here would let the child's original
486/// syscall run unmediated against the host path, silently bypassing
487/// chroot/file confinement. (Regression guard: this must never be a CONTINUE
488/// response.)
489fn inject_failure_resp(id: u64) -> SeccompNotifResp {
490    SeccompNotifResp {
491        id,
492        val: 0,
493        error: -libc::EACCES,
494        flags: 0,
495    }
496}
497
498/// Inject a file descriptor into the child process using SECCOMP_ADDFD_FLAG_SEND.
499///
500/// Uses the SEND flag to atomically inject the fd and respond to the syscall.
501/// The ioctl return value is the fd number assigned in the child process.
502/// After this call, no additional SECCOMP_IOCTL_NOTIF_SEND is needed.
503fn inject_fd_and_send(fd: RawFd, id: u64, srcfd: RawFd, newfd_flags: u32) -> io::Result<i32> {
504    let addfd = SeccompNotifAddfd {
505        id,
506        flags: SECCOMP_ADDFD_FLAG_SEND,
507        srcfd: srcfd as u32,
508        newfd: 0,   // ignored when SECCOMP_ADDFD_FLAG_SETFD is not set
509        newfd_flags,
510    };
511    let ret = unsafe {
512        libc::ioctl(fd, SECCOMP_IOCTL_NOTIF_ADDFD as libc::c_ulong, &addfd as *const _)
513    };
514    if ret < 0 {
515        Err(io::Error::last_os_error())
516    } else {
517        Ok(ret as i32)
518    }
519}
520
521/// Inject a file descriptor into the child process (without responding).
522/// ioctl(fd, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd)
523fn inject_fd(fd: RawFd, id: u64, srcfd: RawFd, targetfd: i32) -> io::Result<()> {
524    let addfd = SeccompNotifAddfd {
525        id,
526        flags: 0,
527        srcfd: srcfd as u32,
528        newfd: targetfd as u32,
529        newfd_flags: 0,
530    };
531    let ret = unsafe {
532        libc::ioctl(fd, SECCOMP_IOCTL_NOTIF_ADDFD as libc::c_ulong, &addfd as *const _)
533    };
534    if ret < 0 {
535        Err(io::Error::last_os_error())
536    } else {
537        Ok(())
538    }
539}
540
541/// Raw ioctl to send a notification response.
542fn send_resp_raw(fd: RawFd, resp: &SeccompNotifResp) -> io::Result<()> {
543    let ret = unsafe {
544        libc::ioctl(fd, SECCOMP_IOCTL_NOTIF_SEND as libc::c_ulong, resp as *const _)
545    };
546    if ret < 0 {
547        Err(io::Error::last_os_error())
548    } else {
549        Ok(())
550    }
551}
552
553/// Check whether a notification ID is still valid (TOCTOU guard).
554/// ioctl(fd, SECCOMP_IOCTL_NOTIF_ID_VALID, &id)
555pub(crate) fn id_valid(fd: RawFd, id: u64) -> io::Result<()> {
556    let ret = unsafe {
557        libc::ioctl(fd, SECCOMP_IOCTL_NOTIF_ID_VALID as libc::c_ulong, &id as *const _)
558    };
559    if ret < 0 {
560        Err(io::Error::last_os_error())
561    } else {
562        Ok(())
563    }
564}
565
566/// Try to enable sync wakeup (Linux 6.7+). Ignores errors.
567fn try_set_sync_wakeup(fd: RawFd) {
568    let flags: u64 = SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP as u64;
569    unsafe {
570        libc::ioctl(fd, SECCOMP_IOCTL_NOTIF_SET_FLAGS as libc::c_ulong, &flags as *const _);
571    }
572}
573
574// ============================================================
575// Child memory access helpers
576// ============================================================
577
578/// Read bytes from a child process via process_vm_readv (single syscall).
579fn read_child_mem_vm(pid: u32, addr: u64, len: usize) -> Result<Vec<u8>, NotifError> {
580    let mut buf = vec![0u8; len];
581    let local_iov = libc::iovec {
582        iov_base: buf.as_mut_ptr() as *mut libc::c_void,
583        iov_len: len,
584    };
585    let remote_iov = libc::iovec {
586        iov_base: addr as *mut libc::c_void,
587        iov_len: len,
588    };
589    let ret = unsafe {
590        libc::process_vm_readv(pid as i32, &local_iov, 1, &remote_iov, 1, 0)
591    };
592    if ret < 0 {
593        Err(NotifError::ChildMemoryRead(io::Error::last_os_error()))
594    } else {
595        buf.truncate(ret as usize);
596        Ok(buf)
597    }
598}
599
600/// Write bytes to a child process via process_vm_writev (single syscall).
601fn write_child_mem_vm(pid: u32, addr: u64, data: &[u8]) -> Result<(), NotifError> {
602    let local_iov = libc::iovec {
603        iov_base: data.as_ptr() as *mut libc::c_void,
604        iov_len: data.len(),
605    };
606    let remote_iov = libc::iovec {
607        iov_base: addr as *mut libc::c_void,
608        iov_len: data.len(),
609    };
610    let ret = unsafe {
611        libc::process_vm_writev(pid as i32, &local_iov, 1, &remote_iov, 1, 0)
612    };
613    if ret < 0 {
614        Err(NotifError::ChildMemoryRead(io::Error::last_os_error()))
615    } else if (ret as usize) < data.len() {
616        Err(NotifError::ChildMemoryRead(io::Error::new(
617            io::ErrorKind::WriteZero,
618            format!("short write: {} of {} bytes", ret, data.len()),
619        )))
620    } else {
621        Ok(())
622    }
623}
624
625/// Read bytes from a child process via `process_vm_readv` with TOCTOU validation.
626///
627/// Calls `id_valid` before and after the read to ensure the notification is
628/// still live (kernel did not abort or release the trapped syscall while the
629/// supervisor was reading guest memory).
630///
631/// Public — used by downstream `Handler` implementations to read syscall
632/// arguments that the kernel passes by pointer (paths in `openat`, buffers
633/// in `write`/`writev`, etc.).
634pub fn read_child_mem(
635    notif_fd: RawFd,
636    id: u64,
637    pid: u32,
638    addr: u64,
639    len: usize,
640) -> Result<Vec<u8>, NotifError> {
641    id_valid(notif_fd, id).map_err(NotifError::Ioctl)?;
642    let result = read_child_mem_vm(pid, addr, len)?;
643    id_valid(notif_fd, id).map_err(NotifError::Ioctl)?;
644    Ok(result)
645}
646
647/// Read a NUL-terminated string from child memory without crossing unmapped
648/// page boundaries in a single `process_vm_readv` call.
649///
650/// TOCTOU-safe — internally calls [`read_child_mem`], inheriting the
651/// `id_valid` checks bracketing each `process_vm_readv` call.
652///
653/// Page-aware: reads up to a page boundary at a time and stops at the
654/// first NUL byte, never crossing into unmapped memory.  Returns
655/// `None` for `addr == 0`, `max_len == 0`, a read failure, or a string
656/// that exceeds `max_len` without a NUL.
657///
658/// Public — used by downstream `Handler` implementations that read
659/// path arguments from notifications (`openat`, `unlinkat`, `statx`,
660/// `newfstatat`, etc.).
661pub fn read_child_cstr(
662    notif_fd: RawFd,
663    id: u64,
664    pid: u32,
665    addr: u64,
666    max_len: usize,
667) -> Option<String> {
668    if addr == 0 || max_len == 0 {
669        return None;
670    }
671
672    const PAGE_SIZE: u64 = 4096;
673    let mut result = Vec::with_capacity(max_len.min(256));
674    let mut cur = addr;
675    while result.len() < max_len {
676        let page_remaining = PAGE_SIZE - (cur % PAGE_SIZE);
677        let remaining = max_len - result.len();
678        let to_read = page_remaining.min(remaining as u64) as usize;
679        let bytes = read_child_mem(notif_fd, id, pid, cur, to_read).ok()?;
680        if let Some(nul) = bytes.iter().position(|&b| b == 0) {
681            result.extend_from_slice(&bytes[..nul]);
682            return String::from_utf8(result).ok();
683        }
684        result.extend_from_slice(&bytes);
685        cur += to_read as u64;
686    }
687
688    String::from_utf8(result).ok()
689}
690
691/// Write bytes to a child process via `process_vm_writev` with TOCTOU validation.
692///
693/// Same TOCTOU contract as [`read_child_mem`].  Public for downstream
694/// `Handler` implementations that synthesise syscall results into
695/// guest memory (e.g. fake `getdents64` listings populated from a
696/// virtual directory index, or synthesised `stat` buffers).
697pub fn write_child_mem(
698    notif_fd: RawFd,
699    id: u64,
700    pid: u32,
701    addr: u64,
702    data: &[u8],
703) -> Result<(), NotifError> {
704    id_valid(notif_fd, id).map_err(NotifError::Ioctl)?;
705    write_child_mem_vm(pid, addr, data)?;
706    id_valid(notif_fd, id).map_err(NotifError::Ioctl)?;
707    Ok(())
708}
709
710// ============================================================
711// Response dispatch
712// ============================================================
713
714/// Dispatch a `NotifAction` to the appropriate low-level response function.
715fn send_response(fd: RawFd, id: u64, action: NotifAction) -> io::Result<()> {
716    match action {
717        NotifAction::Continue => respond_continue(fd, id),
718        NotifAction::Errno(errno) => respond_errno(fd, id, errno),
719        NotifAction::InjectFd { srcfd, targetfd } => {
720            inject_fd(fd, id, srcfd, targetfd)?;
721            respond_continue(fd, id)
722        }
723        NotifAction::InjectFdSend { srcfd, newfd_flags } => {
724            // SECCOMP_ADDFD_FLAG_SEND atomically injects the fd and responds.
725            // No separate NOTIF_SEND needed after this.
726            // On failure, deny (fail closed) rather than letting the original
727            // syscall continue unmediated against the host path.
728            // srcfd (OwnedFd) is dropped at end of this arm, closing the fd.
729            match inject_fd_and_send(fd, id, srcfd.as_raw_fd(), newfd_flags) {
730                Ok(_new_fd) => Ok(()),
731                Err(_) => send_resp_raw(fd, &inject_failure_resp(id)),
732            }
733        }
734        NotifAction::InjectFdSendTracked { srcfd, newfd_flags, on_success } => {
735            match inject_fd_and_send(fd, id, srcfd.as_raw_fd(), newfd_flags) {
736                Ok(new_fd) => {
737                    (on_success.0)(new_fd);
738                    Ok(())
739                }
740                Err(_) => send_resp_raw(fd, &inject_failure_resp(id)),
741            }
742        }
743        NotifAction::ReturnValue(val) => respond_value(fd, id, val),
744        NotifAction::Hold => Ok(()), // Don't send a response.
745        NotifAction::Defer(_) => {
746            // Defer is intercepted in `handle_notification` and never reaches
747            // here on the normal path. If it ever does, fail closed with EIO
748            // rather than dropping the future and wedging the child.
749            debug_assert!(false, "Defer reached send_response; should be intercepted earlier");
750            respond_errno(fd, id, libc::EIO)
751        }
752        NotifAction::Kill { sig, pgid } => {
753            // Kill the entire process group, then return ENOMEM so the
754            // seccomp notification is resolved (avoids a kernel warning).
755            unsafe { libc::killpg(pgid, sig) };
756            respond_errno(fd, id, ENOMEM)
757        }
758    }
759}
760
761// ============================================================
762// vDSO re-patching after exec
763// ============================================================
764
765/// Re-patch the vDSO if the base address changed (e.g. after exec replaces it).
766fn maybe_patch_vdso(pid: i32, procfs: &mut super::state::ProcfsState, policy: &NotifPolicy) {
767    let base = match crate::vdso::find_vdso_base(pid) {
768        Ok(addr) => addr,
769        Err(_) => return,
770    };
771    if base == procfs.vdso_patched_addr {
772        return; // already patched this vDSO
773    }
774    let time_offset = if policy.has_time_start { Some(policy.time_offset) } else { None };
775    if crate::vdso::patch(pid, time_offset, policy.has_random_seed).is_ok() {
776        procfs.vdso_patched_addr = base;
777    }
778}
779
780// ============================================================
781// Policy event emission
782// ============================================================
783
784/// Map a syscall number to a human-readable name for the policy callback.
785fn syscall_name(nr: i64) -> &'static str {
786    match nr {
787        n if n == libc::SYS_openat => "openat",
788        n if n == libc::SYS_connect => "connect",
789        n if n == libc::SYS_sendto => "sendto",
790        n if n == libc::SYS_sendmsg => "sendmsg",
791        n if n == libc::SYS_sendmmsg => "sendmmsg",
792        n if n == libc::SYS_bind => "bind",
793        n if n == libc::SYS_clone => "clone",
794        n if n == libc::SYS_clone3 => "clone3",
795        n if Some(n) == arch::SYS_VFORK => "vfork",
796        n if Some(n) == arch::SYS_FORK => "fork",
797        n if n == libc::SYS_execve => "execve",
798        n if n == libc::SYS_execveat => "execveat",
799        n if n == libc::SYS_mmap => "mmap",
800        n if n == libc::SYS_munmap => "munmap",
801        n if n == libc::SYS_brk => "brk",
802        n if n == libc::SYS_getrandom => "getrandom",
803        n if n == libc::SYS_unlinkat => "unlinkat",
804        n if n == libc::SYS_mkdirat => "mkdirat",
805        _ => "unknown",
806    }
807}
808
809/// Map a syscall number to a high-level category.
810fn syscall_category(nr: i64) -> crate::policy_fn::SyscallCategory {
811    use crate::policy_fn::SyscallCategory;
812    match nr {
813        n if n == libc::SYS_openat || n == libc::SYS_unlinkat
814            || n == libc::SYS_mkdirat || n == libc::SYS_renameat2
815            || n == libc::SYS_symlinkat || n == libc::SYS_linkat
816            || n == libc::SYS_fchmodat || n == libc::SYS_fchownat
817            || n == libc::SYS_truncate || n == libc::SYS_readlinkat
818            || n == libc::SYS_newfstatat || n == libc::SYS_statx
819            || n == libc::SYS_faccessat || n == libc::SYS_getdents64
820            || Some(n) == arch::SYS_GETDENTS => SyscallCategory::File,
821        n if n == libc::SYS_connect || n == libc::SYS_sendto
822            || n == libc::SYS_sendmsg || n == libc::SYS_sendmmsg
823            || n == libc::SYS_bind
824            || n == libc::SYS_getsockname => SyscallCategory::Network,
825        n if n == libc::SYS_clone || n == libc::SYS_clone3
826            || Some(n) == arch::SYS_VFORK || Some(n) == arch::SYS_FORK
827            || n == libc::SYS_execve || n == libc::SYS_execveat => SyscallCategory::Process,
828        n if n == libc::SYS_mmap || n == libc::SYS_munmap
829            || n == libc::SYS_brk || n == libc::SYS_mremap
830            => SyscallCategory::Memory,
831        _ => SyscallCategory::File, // default
832    }
833}
834
/// Read the parent PID of `pid` from `/proc/{pid}/stat`.
///
/// The file has the shape `pid (comm) state ppid ...`. `comm` may itself
/// contain spaces and parentheses, so parsing starts after the *last* `)`.
///
/// Returns `None` when the file cannot be read (e.g. the process is gone) or
/// its content is not in the expected shape; this never panics on truncated
/// or malformed content.
fn read_ppid(pid: u32) -> Option<u32> {
    let stat = std::fs::read_to_string(format!("/proc/{}/stat", pid)).ok()?;
    // Everything after the closing paren of `comm`: " state ppid ...".
    let (_, after_comm) = stat.rsplit_once(')')?;
    // Field 0 = state, field 1 = ppid.
    after_comm.split_whitespace().nth(1)?.parse().ok()
}
846
847/// Read a NUL-terminated path from child memory (up to 256 bytes).
848fn read_path_for_event(notif: &SeccompNotif, addr: u64, notif_fd: RawFd) -> Option<String> {
849    if addr == 0 { return None; }
850    let bytes = read_child_mem(notif_fd, notif.id, notif.pid, addr, 256).ok()?;
851    let nul = bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len());
852    String::from_utf8(bytes[..nul].to_vec()).ok()
853}
854
/// Lexically normalize `path`: drop `.` components and resolve `..` against
/// the preceding component, without touching the filesystem (symlinks are not
/// followed).
///
/// * Absolute paths never escape the root: `/../etc` becomes `/etc`.
/// * Relative paths keep any `..` that cannot be resolved lexically, so
///   `../a` stays `../a` instead of silently turning into `a`.
/// * An empty result is rendered as `/` (absolute) or `.` (relative).
///
/// Non-UTF-8 components are converted lossily.
fn normalize_path(path: &std::path::Path) -> String {
    use std::path::{Component, PathBuf};

    let absolute = path.is_absolute();
    let mut normalized = if absolute {
        PathBuf::from("/")
    } else {
        PathBuf::new()
    };

    for component in path.components() {
        match component {
            Component::Prefix(_) | Component::RootDir | Component::CurDir => {}
            Component::ParentDir => {
                // Only a real name can be cancelled by `..`; the root and an
                // already-kept `..` cannot.
                let ends_in_name = matches!(
                    normalized.components().next_back(),
                    Some(Component::Normal(_))
                );
                if ends_in_name {
                    normalized.pop();
                } else if !absolute {
                    normalized.push("..");
                }
            }
            Component::Normal(part) => normalized.push(part),
        }
    }

    if normalized.as_os_str().is_empty() {
        if absolute { "/".into() } else { ".".into() }
    } else {
        normalized.to_string_lossy().into_owned()
    }
}
881
882fn resolve_at_path_for_event(notif: &SeccompNotif, dirfd: i64, path: &str) -> Option<String> {
883    use std::path::Path;
884
885    if Path::new(path).is_absolute() {
886        return Some(normalize_path(Path::new(path)));
887    }
888
889    let dirfd32 = dirfd as i32;
890    let base = if dirfd32 == libc::AT_FDCWD {
891        std::fs::read_link(format!("/proc/{}/cwd", notif.pid)).ok()?
892    } else {
893        std::fs::read_link(format!("/proc/{}/fd/{}", notif.pid, dirfd32)).ok()?
894    };
895
896    Some(normalize_path(&base.join(path)))
897}
898
899fn resolve_path_for_notif(notif: &SeccompNotif, notif_fd: RawFd) -> Option<String> {
900    let nr = notif.data.nr as i64;
901    match nr {
902        n if n == libc::SYS_openat => {
903            // openat(dirfd, pathname, flags, mode)
904            let path = read_path_for_event(notif, notif.data.args[1], notif_fd)?;
905            resolve_at_path_for_event(notif, notif.data.args[0] as i64, &path)
906        }
907        n if Some(n) == arch::SYS_OPEN || n == libc::SYS_execve => {
908            let path = read_path_for_event(notif, notif.data.args[0], notif_fd)?;
909            resolve_at_path_for_event(notif, libc::AT_FDCWD as i64, &path)
910        }
911        n if n == libc::SYS_execveat => {
912            let path = read_path_for_event(notif, notif.data.args[1], notif_fd)?;
913            resolve_at_path_for_event(notif, notif.data.args[0] as i64, &path)
914        }
915        // linkat(olddirfd, oldpath, newdirfd, newpath, flags)
916        // Check the source (old) path — deny if it's a denied file being linked away.
917        n if n == libc::SYS_linkat => {
918            let path = read_path_for_event(notif, notif.data.args[1], notif_fd)?;
919            resolve_at_path_for_event(notif, notif.data.args[0] as i64, &path)
920        }
921        // renameat2(olddirfd, oldpath, newdirfd, newpath, flags)
922        // Check the source (old) path — deny if a denied file is being renamed away.
923        n if n == libc::SYS_renameat2 => {
924            let path = read_path_for_event(notif, notif.data.args[1], notif_fd)?;
925            resolve_at_path_for_event(notif, notif.data.args[0] as i64, &path)
926        }
927        // symlinkat(target, newdirfd, linkpath)
928        // The target string is what the symlink points to; deny if it names a denied path.
929        n if n == libc::SYS_symlinkat => {
930            let target = read_path_for_event(notif, notif.data.args[0], notif_fd)?;
931            // target may be absolute or relative to the process cwd
932            resolve_at_path_for_event(notif, libc::AT_FDCWD as i64, &target)
933        }
934        // link(oldpath, newpath) — legacy, AT_FDCWD implied for both
935        n if Some(n) == arch::SYS_LINK => {
936            let path = read_path_for_event(notif, notif.data.args[0], notif_fd)?;
937            resolve_at_path_for_event(notif, libc::AT_FDCWD as i64, &path)
938        }
939        // rename(oldpath, newpath) — legacy, AT_FDCWD implied for both
940        n if Some(n) == arch::SYS_RENAME => {
941            let path = read_path_for_event(notif, notif.data.args[0], notif_fd)?;
942            resolve_at_path_for_event(notif, libc::AT_FDCWD as i64, &path)
943        }
944        // symlink(target, linkpath) — legacy
945        n if Some(n) == arch::SYS_SYMLINK => {
946            let target = read_path_for_event(notif, notif.data.args[0], notif_fd)?;
947            resolve_at_path_for_event(notif, libc::AT_FDCWD as i64, &target)
948        }
949        _ => None,
950    }
951}
952
953/// Resolve the second (destination) path for two-path syscalls.
954///
955/// Returns `None` for syscalls that only have a single path argument.
956fn resolve_second_path_for_notif(notif: &SeccompNotif, notif_fd: RawFd) -> Option<String> {
957    let nr = notif.data.nr as i64;
958    match nr {
959        // renameat2(olddirfd, oldpath, newdirfd, newpath, flags)
960        n if n == libc::SYS_renameat2 => {
961            let path = read_path_for_event(notif, notif.data.args[3], notif_fd)?;
962            resolve_at_path_for_event(notif, notif.data.args[2] as i64, &path)
963        }
964        // linkat(olddirfd, oldpath, newdirfd, newpath, flags)
965        // Destination of a hardlink to a denied file should also be denied
966        // (prevents overwriting a denied file via linkat).
967        n if n == libc::SYS_linkat => {
968            let path = read_path_for_event(notif, notif.data.args[3], notif_fd)?;
969            resolve_at_path_for_event(notif, notif.data.args[2] as i64, &path)
970        }
971        // rename(oldpath, newpath) — legacy
972        n if Some(n) == arch::SYS_RENAME => {
973            let path = read_path_for_event(notif, notif.data.args[1], notif_fd)?;
974            resolve_at_path_for_event(notif, libc::AT_FDCWD as i64, &path)
975        }
976        // link(oldpath, newpath) — legacy
977        n if Some(n) == arch::SYS_LINK => {
978            let path = read_path_for_event(notif, notif.data.args[1], notif_fd)?;
979            resolve_at_path_for_event(notif, libc::AT_FDCWD as i64, &path)
980        }
981        _ => None,
982    }
983}
984
985/// Extract IP and port from a sockaddr in child memory.
986fn read_sockaddr_for_event(notif: &SeccompNotif, addr: u64, len: usize, notif_fd: RawFd)
987    -> (Option<std::net::IpAddr>, Option<u16>)
988{
989    if addr == 0 || len < 4 { return (None, None); }
990    let bytes = match read_child_mem(notif_fd, notif.id, notif.pid, addr, len.min(128)) {
991        Ok(b) => b,
992        Err(_) => return (None, None),
993    };
994    if bytes.len() < 4 { return (None, None); }
995    let family = u16::from_ne_bytes([bytes[0], bytes[1]]);
996    let port = u16::from_be_bytes([bytes[2], bytes[3]]);
997    let ip = match family as u32 {
998        f if f == crate::sys::structs::AF_INET && bytes.len() >= 8 => {
999            Some(std::net::IpAddr::V4(std::net::Ipv4Addr::new(
1000                bytes[4], bytes[5], bytes[6], bytes[7],
1001            )))
1002        }
1003        f if f == crate::sys::structs::AF_INET6 && bytes.len() >= 24 => {
1004            let mut addr = [0u8; 16];
1005            addr.copy_from_slice(&bytes[8..24]);
1006            Some(std::net::IpAddr::V6(std::net::Ipv6Addr::from(addr)))
1007        }
1008        _ => None,
1009    };
1010    (ip, if port > 0 { Some(port) } else { None })
1011}
1012
1013/// Read argv (NULL-terminated array of char* in child memory) for execve.
1014/// Capped at 64 entries × 256 bytes/entry as a safety bound.
1015fn read_argv_for_event(notif: &SeccompNotif, argv_ptr: u64, notif_fd: RawFd) -> Option<Vec<String>> {
1016    if argv_ptr == 0 { return None; }
1017    let mut args = Vec::new();
1018    let ptr_size = std::mem::size_of::<u64>();
1019
1020    for i in 0..64u64 {
1021        let ptr_addr = argv_ptr + i * ptr_size as u64;
1022        let ptr_bytes = read_child_mem(notif_fd, notif.id, notif.pid, ptr_addr, ptr_size).ok()?;
1023        let str_ptr = u64::from_ne_bytes(ptr_bytes[..8].try_into().ok()?);
1024        if str_ptr == 0 { break; } // NULL terminator
1025
1026        if let Some(s) = read_path_for_event(notif, str_ptr, notif_fd) {
1027            args.push(s);
1028        } else {
1029            break;
1030        }
1031    }
1032
1033    if args.is_empty() { None } else { Some(args) }
1034}
1035
1036/// Resolve a held syscall's policy_fn gate outcome into a verdict.
1037///
1038/// `received` is the verdict the callback sent, or `None` if the gate timed
1039/// out or its channel closed before a decision arrived. A held syscall is one
1040/// whose verdict matters (execve, connect, openat, ...); when no decision
1041/// arrives we fail closed and deny rather than letting the syscall proceed.
1042fn resolve_held_gate(
1043    received: Option<crate::policy_fn::Verdict>,
1044) -> Option<crate::policy_fn::Verdict> {
1045    match received {
1046        Some(v) => Some(v),
1047        None => Some(crate::policy_fn::Verdict::Deny),
1048    }
1049}
1050
1051/// Emit a syscall event to the policy_fn callback thread (if active).
1052/// Returns the callback's verdict for held syscalls.
1053async fn emit_policy_event(
1054    notif: &SeccompNotif,
1055    action: &NotifAction,
1056    policy_fn_state: &Arc<tokio::sync::Mutex<super::state::PolicyFnState>>,
1057    notif_fd: RawFd,
1058) -> Option<crate::policy_fn::Verdict> {
1059    let pfs = policy_fn_state.lock().await;
1060    let tx = match pfs.event_tx.as_ref() {
1061        Some(tx) => tx.clone(),
1062        None => return None,
1063    };
1064    drop(pfs);
1065
1066    let nr = notif.data.nr as i64;
1067    let denied = matches!(action, NotifAction::Errno(_));
1068    let name = syscall_name(nr);
1069    let category = syscall_category(nr);
1070    let parent_pid = read_ppid(notif.pid);
1071
1072    // Extract metadata based on syscall type.
1073    //
1074    // Path strings are deliberately NOT extracted: the kernel re-reads
1075    // user-memory pointers after Continue, so any path-string-based
1076    // decision is racy (issue #27). Path-based access control belongs
1077    // in static Landlock rules.
1078    //
1079    // argv IS extracted for allowed execve/execveat notifications:
1080    // the supervisor freezes every task in the sandbox (siblings +
1081    // peers) before this callback reads argv and keeps that freeze
1082    // through Continue, so the post-Continue re-read sees the same
1083    // memory we read here.
1084    //
1085    // Network fields are TOCTOU-safe because connect/sendto/bind are
1086    // performed on-behalf via pidfd_getfd; the kernel never re-reads
1087    // child memory for those syscalls.
1088    let mut host = None;
1089    let mut port = None;
1090    let mut size = None;
1091    let mut argv = None;
1092
1093    if !denied && (nr == libc::SYS_execve || nr == libc::SYS_execveat) {
1094        // execve(pathname, argv, envp):       args[1] = argv ptr
1095        // execveat(dirfd, pathname, argv, ..): args[2] = argv ptr
1096        let argv_ptr = if nr == libc::SYS_execveat {
1097            notif.data.args[2]
1098        } else {
1099            notif.data.args[1]
1100        };
1101        argv = read_argv_for_event(notif, argv_ptr, notif_fd);
1102    }
1103
1104    if nr == libc::SYS_connect || nr == libc::SYS_sendto || nr == libc::SYS_bind {
1105        // connect(fd, addr, addrlen): args[1]=addr, args[2]=len
1106        let addr_ptr = notif.data.args[1];
1107        let addr_len = notif.data.args[2] as usize;
1108        let (h, p) = read_sockaddr_for_event(notif, addr_ptr, addr_len, notif_fd);
1109        host = h;
1110        port = p;
1111    }
1112
1113    if nr == libc::SYS_mmap {
1114        // mmap(addr, length, ...): args[1] = length
1115        size = Some(notif.data.args[1]);
1116    }
1117
1118    let event = crate::policy_fn::SyscallEvent {
1119        syscall: name.to_string(),
1120        category,
1121        pid: notif.pid,
1122        parent_pid,
1123        host,
1124        port,
1125        size,
1126        argv,
1127        denied,
1128    };
1129
1130    // Hold syscalls where the callback's verdict matters.
1131    // The child is blocked until the callback returns.
1132    let is_held = nr == libc::SYS_execve || nr == libc::SYS_execveat
1133        || nr == libc::SYS_connect || nr == libc::SYS_sendto
1134        || nr == libc::SYS_bind || nr == libc::SYS_openat;
1135
1136    if is_held {
1137        let (gate_tx, gate_rx) = tokio::sync::oneshot::channel();
1138        let _ = tx.send(crate::policy_fn::PolicyEvent {
1139            event,
1140            gate: Some(gate_tx),
1141        });
1142        let received = match tokio::time::timeout(std::time::Duration::from_secs(5), gate_rx).await {
1143            Ok(Ok(verdict)) => Some(verdict),
1144            _ => None, // timeout or channel closed
1145        };
1146        resolve_held_gate(received)
1147    } else {
1148        let _ = tx.send(crate::policy_fn::PolicyEvent {
1149            event,
1150            gate: None,
1151        });
1152        None
1153    }
1154}
1155
1156// ============================================================
1157// Per-notification handler (runs in a spawned task)
1158// ============================================================
1159
// Section overview: the per-notification handler (`handle_notification`)
// processes a single seccomp notification — vDSO re-patch, path denial check,
// dispatch, policy event emission, and response. The constants and helpers
// below support its deferred-response path.

/// Maximum number of deferred handler futures running concurrently. Caps
/// the worker fan-out (and any resources those workers hold, e.g. memfds or
/// sockets) so a burst of deferrals cannot exhaust the supervisor process.
const DEFER_MAX_INFLIGHT: usize = 64;
1166
/// Maximum time a deferred handler future may run before the supervisor gives
/// up and fails the trapped syscall closed (with `EIO`, see
/// `run_deferred_within`). Bounds the worst case so a hung future (e.g. a
/// stalled network fetch in a token-injection handler) cannot park the child
/// forever or permanently leak its `DEFER_MAX_INFLIGHT` slot.
const DEFER_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(30);
1172
1173/// Drive a deferred future to its terminal action, bounded by `limit`.
1174///
1175/// On timeout, fail closed with `EIO` so the trapped child gets a definite
1176/// response instead of parking forever; `finalize_deferred` still guards a
1177/// future that resolves to a nested `Defer`.
1178async fn run_deferred_within(deferred: Deferred, limit: std::time::Duration) -> NotifAction {
1179    match tokio::time::timeout(limit, deferred.run()).await {
1180        Ok(action) => finalize_deferred(action),
1181        Err(_) => {
1182            eprintln!(
1183                "sandlock: deferred handler exceeded {:?}; failing syscall with EIO",
1184                limit
1185            );
1186            NotifAction::Errno(libc::EIO)
1187        }
1188    }
1189}
1190
1191/// Spawn a worker task that drives a deferred handler future to its terminal
1192/// action and sends the seccomp response, keyed by `id`. The `permit` is
1193/// held for the worker's lifetime, releasing its `DEFER_MAX_INFLIGHT` slot on
1194/// completion. A stale `id` (child exited mid-defer) makes `send_response`
1195/// a no-op, matching the inline path's "child may have exited" tolerance.
1196fn spawn_deferred(
1197    fd: RawFd,
1198    id: u64,
1199    deferred: Deferred,
1200    permit: tokio::sync::OwnedSemaphorePermit,
1201) {
1202    tokio::spawn(async move {
1203        let _permit = permit; // released when the worker finishes
1204        let action = run_deferred_within(deferred, DEFER_TIMEOUT).await;
1205        let _ = send_response(fd, id, action);
1206    });
1207}
1208
/// Process a single seccomp notification: vDSO re-patch, path denial check,
/// dispatch, policy event emission, and response.
///
/// Steps, in order:
/// 1. Register per-process state for `notif.pid` if it is new.
/// 2. Re-patch the vDSO when the policy virtualizes time or randomness.
/// 3. Pre-check dynamic path denials (only without a chroot root), otherwise
///    dispatch to the handler table to obtain the initial action.
/// 4. For a `Continue` on a syscall that needs it, freeze the sandbox so argv
///    cannot change; failure to freeze denies with `EPERM`.
/// 5. Emit the policy event; a `Deny`/`DenyWith` verdict overrides the action.
/// 6. Roll back the fork count if a counted fork no longer continues.
/// 7. Prepare process-creation tracking for fork-like syscalls; failure denies
///    with `EPERM`.
/// 8. A `Defer` action is handed to a worker task (or refused) and returns.
/// 9. Send the response, then finish/abort creation tracking and detach the
///    frozen tasks.
///
/// `fd` is the seccomp notification fd; `defer_sem` bounds in-flight deferred
/// handlers.
async fn handle_notification(
    notif: SeccompNotif,
    ctx: &Arc<super::ctx::SupervisorCtx>,
    dispatch_table: &super::dispatch::DispatchTable,
    fd: RawFd,
    defer_sem: &Arc<tokio::sync::Semaphore>,
) {
    let policy = &ctx.policy;

    // Ensure every pid that produces a notification has per-process
    // supervisor state and an exit watcher. The fork handler runs on
    // the *parent* pid (the child doesn't exist yet at clone-time), so
    // the child gets registered the first time it issues a notified
    // syscall.
    crate::resource::register_child_if_new(ctx, notif.pid as i32).await;

    // Re-patch vDSO if needed (exec replaces it with a fresh copy).
    if policy.has_time_start || policy.has_random_seed {
        let mut pfs = ctx.procfs.lock().await;
        maybe_patch_vdso(notif.pid as i32, &mut pfs, policy);
    }

    // Check dynamic path denials before dispatch
    let mut action = {
        let nr = notif.data.nr as i64;
        let mut path_check_nrs = vec![
            libc::SYS_openat, libc::SYS_execve, libc::SYS_execveat,
            libc::SYS_linkat, libc::SYS_renameat2, libc::SYS_symlinkat,
        ];
        // Legacy syscalls are `Option`s (absent on some architectures);
        // `flatten` keeps only the ones that exist.
        path_check_nrs.extend([
            arch::SYS_OPEN, arch::SYS_LINK, arch::SYS_RENAME, arch::SYS_SYMLINK,
        ].into_iter().flatten());
        let should_precheck_denied = policy.chroot_root.is_none()
            && path_check_nrs.contains(&nr);
        if should_precheck_denied {
            let pfs = ctx.policy_fn.lock().await;
            if is_path_denied_for_notif(&pfs, &notif, fd) {
                NotifAction::Errno(libc::EACCES)
            } else {
                // Release the policy_fn lock before awaiting the dispatcher.
                drop(pfs);
                dispatch_table.dispatch(notif, fd).await
            }
        } else {
            dispatch_table.dispatch(notif, fd).await
        }
    };

    let nr = notif.data.nr as i64;
    let fork_counted = matches!(action, NotifAction::Continue)
        && crate::resource::fork_counted_on_continue(&notif, fd);

    // TOCTOU-close for execve (issue #27): freeze every sandbox task
    // that could mutate argv before policy_fn reads argv and before the
    // kernel re-reads it after Continue. This covers two writer classes:
    //   1. Sibling threads of the calling tid (same TGID, share mm).
    //   2. Peer processes in other TGIDs that alias argv pages via
    //      MAP_SHARED mappings or share mm via clone(CLONE_VM).
    //
    // The freeze enumerates ProcessIndex. With policy_fn active, that
    // index is complete: fork-like syscalls are traced at creation time
    // below, before new children can run user code.
    //
    // Strict on failure: if we cannot establish the freeze, we cannot
    // safely expose argv or allow execve, so we deny with EPERM.
    let mut exec_freeze = None;
    if matches!(action, NotifAction::Continue)
        && policy.argv_safety_required
        && crate::freeze::requires_freeze_on_continue(nr)
    {
        match crate::freeze::freeze_sandbox_for_execve(
            &ctx.processes,
            notif.pid as i32,
        ) {
            Ok(outcome) => {
                exec_freeze = Some(outcome);
            }
            Err(e) => {
                eprintln!(
                    "sandlock: argv-safety freeze failed for pid {}: {} \
                     — denying execve to preserve TOCTOU invariant",
                    notif.pid, e
                );
                action = NotifAction::Errno(libc::EPERM);
            }
        }
    }

    // Emit event to policy_fn callback if active. For execve, argv is
    // only populated after `exec_freeze` has stopped every possible
    // writer, and those tasks stay stopped until after NOTIF_SEND.
    if let Some(verdict) = emit_policy_event(&notif, &action, &ctx.policy_fn, fd).await {
        use crate::policy_fn::Verdict;
        match verdict {
            Verdict::Deny => { action = NotifAction::Errno(libc::EPERM); }
            Verdict::DenyWith(errno) => { action = NotifAction::Errno(errno); }
            Verdict::Audit => { /* allow, but could log here */ }
            Verdict::Allow => {}
        }
    }

    // The fork was counted on the assumption that it continues; undo the
    // count if the policy verdict (or freeze failure) turned it into a denial.
    if fork_counted && !matches!(action, NotifAction::Continue) {
        crate::resource::rollback_fork_count(&ctx.resource).await;
    }

    // With policy_fn active, fork-like syscalls are traced for exactly
    // one ptrace event so ProcessIndex becomes complete before the new
    // child can run user code. That closes the race where a peer
    // process could exist without ever having produced a notification.
    let mut creation_trace = None;
    if matches!(action, NotifAction::Continue)
        && crate::resource::requires_process_creation_tracking(&notif, fd, policy)
    {
        match crate::resource::prepare_process_creation_tracking(notif.pid as i32).await {
            Ok(trace) => {
                creation_trace = Some(trace);
            }
            Err(e) => {
                eprintln!(
                    "sandlock: process-creation tracking failed for pid {}: {} \
                     — denying fork-like syscall to preserve argv TOCTOU invariant",
                    notif.pid, e
                );
                // `action` was still Continue here, so the rollback above did
                // not run; undo the fork count now.
                if fork_counted {
                    crate::resource::rollback_fork_count(&ctx.resource).await;
                }
                action = NotifAction::Errno(libc::EPERM);
            }
        }
    }

    // Deferred response: run the handler's future on a worker task so the
    // single supervisor loop is not blocked waiting for slow work (a network
    // round-trip, a blocking syscall). The trapped child stays parked in the
    // syscall; the worker sends the real response later, keyed by notif.id.
    //
    // Deferral is refused on syscalls whose Continue path requires the
    // execve argv-safety freeze or fork creation-tracking: sending the
    // response off-loop would skip that TOCTOU-closing work. (When `action`
    // is Defer it is not Continue, so `exec_freeze`/`creation_trace` above
    // are already None — there is nothing to unwind here.)
    if let NotifAction::Defer(deferred) = action {
        if crate::freeze::requires_freeze_on_continue(nr)
            || crate::resource::requires_process_creation_tracking(&notif, fd, policy)
        {
            let _ = send_response(fd, notif.id, NotifAction::Errno(libc::EPERM));
            return;
        }
        match Arc::clone(defer_sem).try_acquire_owned() {
            Ok(permit) => spawn_deferred(fd, notif.id, deferred, permit),
            // Too many deferrals in flight: fail fast with EAGAIN rather than
            // blocking the loop or letting unbounded workers accrete.
            Err(_) => {
                let _ = send_response(fd, notif.id, NotifAction::Errno(libc::EAGAIN));
            }
        }
        return;
    }

    // Ignore error — child may have exited between recv and response.
    // `exec_continued` is captured before `action` is moved into the send.
    let exec_continued = exec_freeze.is_some() && matches!(action, NotifAction::Continue);
    let send_result = send_response(fd, notif.id, action);

    if let Some(trace) = creation_trace {
        if send_result.is_ok() {
            match crate::resource::finish_process_creation_tracking(ctx, trace).await {
                Ok(true) => {}
                Ok(false) => {
                    crate::resource::rollback_fork_count(&ctx.resource).await;
                }
                Err(e) => {
                    crate::resource::rollback_fork_count(&ctx.resource).await;
                    eprintln!(
                        "sandlock: process-creation tracking completion failed for pid {}: {}",
                        notif.pid, e
                    );
                }
            }
        } else {
            // The response never reached the kernel: undo the count and tear
            // down the trace that was prepared for this fork.
            crate::resource::rollback_fork_count(&ctx.resource).await;
            crate::resource::abort_process_creation_tracking(trace).await;
        }
    }

    if let Some(freeze) = exec_freeze {
        // After a delivered Continue only the peers are detached here; in
        // every other case everything recorded in the freeze is detached.
        if exec_continued && send_result.is_ok() {
            crate::freeze::detach_peers(&freeze.peer_tids);
        } else {
            crate::freeze::detach_all(&freeze);
        }
    }
}
1400
1401// ============================================================
1402// Main supervisor loop
1403// ============================================================
1404
/// Async event loop that processes seccomp notifications.
///
/// Runs until the notification fd is closed (child exits or filter is removed).
///
/// * `notif_fd` — the seccomp notification fd; ownership moves into the
///   Tokio `AsyncFd` registration for the lifetime of the loop.
/// * `ctx` — shared supervisor state (policy, resources, process index, ...).
/// * `pending_handlers` — user-supplied syscall handlers registered after all
///   builtin handlers.  For the default behaviour without any custom handlers
///   pass an empty `Vec`.
/// * `startup` — receives `Err` if the fd cannot be registered with the IO
///   driver (the function then returns immediately), or `Ok(())` once the
///   supervisor is ready and the caller may release the child.
pub async fn supervisor(
    notif_fd: OwnedFd,
    ctx: Arc<super::ctx::SupervisorCtx>,
    pending_handlers: Vec<(i64, std::sync::Arc<dyn super::dispatch::Handler>)>,
    startup: tokio::sync::oneshot::Sender<io::Result<()>>,
) {
    // Register the notif fd with the Tokio IO driver so we can wait for
    // readiness via epoll instead of a dedicated blocking thread.
    let async_fd = match tokio::io::unix::AsyncFd::with_interest(
        notif_fd,
        tokio::io::Interest::READABLE,
    ) {
        Ok(fd) => fd,
        Err(err) => {
            let _ = startup.send(Err(err));
            return;
        }
    };
    // Raw fd for the ioctl helpers; `async_fd` keeps the fd open.
    let fd = async_fd.get_ref().as_raw_fd();

    // Build the dispatch table once at startup.
    let dispatch_table = Arc::new(super::dispatch::build_dispatch_table(
        &ctx.policy,
        &ctx.resource,
        &ctx,
        pending_handlers,
    ));

    // Try to enable sync wakeup (Linux 6.7+, ignore error on older kernels).
    try_set_sync_wakeup(fd);

    // The IO driver has the fd registered; subsequent block_on cycles
    // can resume this task and pick up readiness events. Tell the
    // caller it is safe to release the child.
    let _ = startup.send(Ok(()));

    // Periodic sweep as a defensive backstop in case pidfd-based
    // lifecycle cleanup misses an entry (e.g. pidfd_open failed for a
    // child on an old kernel, or its watcher panicked). At 5 minutes
    // this is cheap enough to leave on; the primary cleanup path is
    // still per-child pidfd readiness in `spawn_pid_watcher`.
    let gc = tokio::spawn(process_index_gc(Arc::clone(&ctx.processes)));

    // Bounds the number of in-flight deferred handler futures (see
    // `DEFER_MAX_INFLIGHT`). Shared across all notifications this supervisor
    // processes.
    let defer_sem = Arc::new(tokio::sync::Semaphore::new(DEFER_MAX_INFLIGHT));

    // Edge-triggered drain: each `readable().await` returns once per
    // epoll edge, then we drain the kernel queue via `probe_notif_fd`
    // until empty. The drain is necessary because tokio's AsyncFd is
    // edge-triggered and `recv_notif` does not signal "would block",
    // so a burst of arrivals between two `readable().await` calls
    // would coalesce into a single wake event.
    //
    // Notifications are processed sequentially (not spawned) to avoid
    // mutex contention between concurrent handlers.
    'outer: loop {
        let mut ready = match async_fd.readable().await {
            Ok(r) => r,
            Err(_) => break 'outer,
        };
        // Clear readiness up front; the inner loop below drains the queue.
        ready.clear_ready();
        drop(ready);

        loop {
            match probe_notif_fd(fd) {
                NotifFdState::Pending => {
                    let notif = match recv_notif(fd) {
                        Ok(n) => n,
                        // Interrupted: go back to probing the fd.
                        Err(e) if e.raw_os_error() == Some(libc::EINTR) => continue,
                        Err(_) => break 'outer,
                    };
                    handle_notification(notif, &ctx, &dispatch_table, fd, &defer_sem).await;
                }
                NotifFdState::Empty => break,
                NotifFdState::Terminal => break 'outer,
            }
        }
    }

    // The loop is over; stop the periodic sweep task.
    gc.abort();
}
1495
1496/// Periodic sweep that drops `ProcessIndex` entries for exited PIDs.
1497/// Per-process state hangs off these entries via `Arc`, so dropping
1498/// them releases everything in one step.
1499async fn process_index_gc(processes: Arc<super::state::ProcessIndex>) {
1500    let interval = std::time::Duration::from_secs(300);
1501    loop {
1502        tokio::time::sleep(interval).await;
1503        if processes.len() == 0 {
1504            continue;
1505        }
1506        processes.prune_dead();
1507    }
1508}
1509
1510/// Spawn a per-child task that awaits the pidfd becoming readable
1511/// (process exit) and then runs unified cleanup across every
1512/// per-process supervisor map.
1513///
1514/// The watcher *owns* the pidfd via `AsyncFd<OwnedFd>` — the kernel
1515/// fd stays alive for as long as tokio's IO driver has it registered,
1516/// and is closed exactly once when the watcher task ends. This avoids
1517/// a TOCTOU where dropping the fd from a separate map could let a
1518/// recycled fd be deregistered from epoll.
1519pub(crate) fn spawn_pid_watcher(
1520    ctx: Arc<super::ctx::SupervisorCtx>,
1521    key: super::state::PidKey,
1522    pidfd: std::os::unix::io::OwnedFd,
1523) {
1524    tokio::spawn(async move {
1525        let async_fd = match tokio::io::unix::AsyncFd::with_interest(
1526            pidfd,
1527            tokio::io::Interest::READABLE,
1528        ) {
1529            Ok(f) => f,
1530            Err(_) => {
1531                // AsyncFd registration failed (extremely unusual);
1532                // fall back to immediate cleanup so we don't leak the
1533                // index entry. The OwnedFd we passed in is consumed
1534                // by `with_interest`'s Err return and will close on
1535                // drop here.
1536                cleanup_pid(&ctx, key).await;
1537                return;
1538            }
1539        };
1540        // pidfd becomes readable when the process exits; we don't
1541        // read any data, so `readable()` is just an await point.
1542        let _ = async_fd.readable().await;
1543        cleanup_pid(&ctx, key).await;
1544        // async_fd drops here, closing the pidfd.
1545    });
1546}
1547
1548/// Drop the supervisor's per-process state for `key`. With every
1549/// per-process map living inside `PerProcessState` (owned by
1550/// `ProcessIndex`), this is a single unregister — the entry's `Arc`
1551/// drops here, and remaining clones held by in-flight handlers will
1552/// drop with their tasks, freeing `PerProcessState` automatically.
1553pub(crate) async fn cleanup_pid(ctx: &super::ctx::SupervisorCtx, key: super::state::PidKey) {
1554    ctx.processes.unregister(key);
1555}
1556
1557// ============================================================
1558// Tests
1559// ============================================================
1560
#[cfg(test)]
mod tests {
    use super::*;
    use std::os::unix::io::FromRawFd;

    /// Thread id of the calling thread, obtained with the raw `gettid` syscall.
    fn gettid() -> u32 {
        // SAFETY: `SYS_gettid` takes no arguments and passes no pointers to
        // the kernel.
        (unsafe { libc::syscall(libc::SYS_gettid) }) as u32
    }

    #[test]
    fn inject_failure_response_denies_not_continues() {
        // When fd injection fails, the supervisor must fail closed: deny the
        // syscall instead of letting it continue unmediated against the host
        // path (which would silently bypass chroot/file confinement).
        let resp = inject_failure_resp(123);
        assert_eq!(resp.id, 123);
        assert_eq!(
            resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE,
            0,
            "fd-injection failure must not respond with CONTINUE"
        );
        assert_ne!(resp.error, 0, "fd-injection failure must be a denial");
        assert_eq!(resp.error, -libc::EACCES);
    }

    #[test]
    fn held_gate_no_decision_denies() {
        use crate::policy_fn::Verdict;
        // A held syscall whose policy_fn gate times out or whose channel closes
        // (received == None) must fail closed: deny, not allow the syscall.
        assert!(matches!(resolve_held_gate(None), Some(Verdict::Deny)));
    }

    #[test]
    fn held_gate_passes_through_callback_verdict() {
        use crate::policy_fn::Verdict;
        // A real verdict from the callback is forwarded unchanged.
        assert!(matches!(
            resolve_held_gate(Some(Verdict::Allow)),
            Some(Verdict::Allow)
        ));
        assert!(matches!(
            resolve_held_gate(Some(Verdict::Deny)),
            Some(Verdict::Deny)
        ));
        assert!(matches!(
            resolve_held_gate(Some(Verdict::DenyWith(13))),
            Some(Verdict::DenyWith(13))
        ));
    }

    #[test]
    fn tgid_of_main_thread_is_own_pid() {
        // The main thread's tid equals the process pid, and its Tgid is the pid.
        assert_eq!(tgid_of(gettid()), Some(std::process::id()));
    }

    #[test]
    fn tgid_of_worker_thread_resolves_to_process() {
        // A non-leader thread's Tgid is the process pid, not its own tid.
        let (tid_tx, tid_rx) = std::sync::mpsc::channel();
        let (done_tx, done_rx) = std::sync::mpsc::channel::<()>();
        let h = std::thread::spawn(move || {
            tid_tx.send(gettid()).unwrap();
            done_rx.recv().ok(); // stay alive until the test has read /proc
        });
        let worker_tid = tid_rx.recv().unwrap();
        let pid = std::process::id();
        assert_ne!(worker_tid, pid, "worker tid must differ from pid");
        assert_eq!(tgid_of(worker_tid), Some(pid));
        done_tx.send(()).ok();
        h.join().unwrap();
    }

    #[test]
    fn dup_fd_from_pid_handles_worker_thread_fd() {
        use std::os::unix::io::AsRawFd;
        // Open an fd in a non-leader worker thread, then duplicate it by that
        // thread's tid. Exercises the tid->process pidfd resolution end to end
        // (PIDFD_THREAD on >=6.9, the /proc Tgid fallback on older kernels).
        let (info_tx, info_rx) = std::sync::mpsc::channel();
        let (done_tx, done_rx) = std::sync::mpsc::channel::<()>();
        let h = std::thread::spawn(move || {
            let f = std::fs::File::open("/dev/null").unwrap();
            info_tx.send((gettid(), f.as_raw_fd())).unwrap();
            // Keep the file open until the main thread has duplicated it.
            done_rx.recv().ok();
            drop(f);
        });
        let (worker_tid, fd) = info_rx.recv().unwrap();
        let dup = dup_fd_from_pid(worker_tid, fd);
        // Release the worker before asserting so a failure cannot leave it parked.
        done_tx.send(()).ok();
        h.join().unwrap();
        assert!(dup.is_ok(), "dup_fd_from_pid for a worker-thread fd failed: {:?}", dup.err());
    }

    #[test]
    fn read_child_cstr_returns_none_for_null_addr_or_zero_max_len() {
        // Smoke: addr == 0 short-circuits without touching the child.
        assert!(read_child_cstr(-1, 0, 0, 0, 4096).is_none());
        // max_len == 0 also short-circuits.
        assert!(read_child_cstr(-1, 0, 0, 0xdeadbeef, 0).is_none());
    }

    #[test]
    fn test_notif_action_debug() {
        // Ensure all variants implement Debug.
        let _ = format!("{:?}", NotifAction::Continue);
        let _ = format!("{:?}", NotifAction::Errno(1));
        let _ = format!("{:?}", NotifAction::InjectFd { srcfd: 3, targetfd: 4 });
        // Use a real fd (dup'd from stderr) so OwnedFd can safely close it.
        // SAFETY: `dup` takes a plain integer and has no memory-safety
        // preconditions; failure is reported as -1.
        let raw = unsafe { libc::dup(2) };
        // `OwnedFd` must never hold -1, so bail out before wrapping a failed dup.
        assert!(raw >= 0, "dup(2) failed: {}", std::io::Error::last_os_error());
        // SAFETY: `raw` is a freshly duplicated, open descriptor that nothing
        // else owns, so `OwnedFd` may take sole responsibility for closing it.
        let test_fd = unsafe { OwnedFd::from_raw_fd(raw) };
        let _ = format!("{:?}", NotifAction::InjectFdSend { srcfd: test_fd, newfd_flags: 0 });
        let _ = format!("{:?}", NotifAction::ReturnValue(42));
        let _ = format!("{:?}", NotifAction::Hold);
        let _ = format!("{:?}", NotifAction::Kill { sig: 9, pgid: 1 });
        let _ = format!("{:?}", NotifAction::defer(async { NotifAction::Continue }));
    }

    #[tokio::test]
    async fn deferred_future_need_not_be_sync() {
        // A deferred future may capture Send-but-not-Sync state across an
        // await. `Cell` is Send but never Sync; holding it across `.await`
        // makes the future !Sync. Only `Send` is required (the supervisor
        // moves the future to a worker, never shares it by reference).
        use std::cell::Cell;
        let action = NotifAction::defer(async move {
            let counter = Cell::new(0);
            counter.set(counter.get() + 41);
            tokio::task::yield_now().await; // hold the !Sync Cell across await
            NotifAction::ReturnValue(counter.get() + 1)
        });
        let NotifAction::Defer(d) = action else { panic!("expected Defer") };
        assert!(matches!(d.run().await, NotifAction::ReturnValue(42)));
    }

    #[tokio::test]
    async fn deferred_runs_to_its_terminal_action() {
        // A Defer carries a future; running it yields the deferred decision.
        let action = NotifAction::defer(async { NotifAction::ReturnValue(7) });
        let NotifAction::Defer(deferred) = action else {
            panic!("defer() must construct a NotifAction::Defer");
        };
        assert!(matches!(deferred.run().await, NotifAction::ReturnValue(7)));
    }

    #[tokio::test(start_paused = true)]
    async fn deferred_times_out_to_eio() {
        // A deferred future that exceeds its limit must fail closed (EIO) so
        // the trapped child gets a definite response instead of parking
        // forever (and leaking its DEFER_MAX_INFLIGHT slot).
        let slow = Deferred::new(async {
            tokio::time::sleep(std::time::Duration::from_secs(60)).await;
            NotifAction::ReturnValue(7)
        });
        let action = run_deferred_within(slow, std::time::Duration::from_secs(1)).await;
        assert!(matches!(action, NotifAction::Errno(e) if e == libc::EIO));
    }

    #[tokio::test(start_paused = true)]
    async fn deferred_within_limit_passes_through() {
        // A future that resolves within the limit returns its terminal action.
        let fast = Deferred::new(async { NotifAction::ReturnValue(7) });
        let action = run_deferred_within(fast, std::time::Duration::from_secs(1)).await;
        assert!(matches!(action, NotifAction::ReturnValue(7)));
    }

    #[test]
    fn finalize_deferred_collapses_nested_defer_to_eio() {
        // A deferred future that itself resolves to Defer is a bug: collapse
        // to EIO so the trapped child is never wedged waiting for a response.
        let nested = NotifAction::defer(async { NotifAction::Continue });
        assert!(matches!(finalize_deferred(nested), NotifAction::Errno(e) if e == libc::EIO));
        // Non-nested terminal actions pass through unchanged.
        assert!(matches!(finalize_deferred(NotifAction::Continue), NotifAction::Continue));
        assert!(matches!(
            finalize_deferred(NotifAction::ReturnValue(3)),
            NotifAction::ReturnValue(3)
        ));
    }

    #[test]
    fn content_memfd_roundtrips_content() {
        use std::io::Read;
        let fd = content_memfd(b"hello world", true).expect("content_memfd");
        // The fd is rewound to offset 0, so a plain read returns the content.
        let mut f = std::fs::File::from(fd);
        let mut buf = String::new();
        f.read_to_string(&mut buf).unwrap();
        assert_eq!(buf, "hello world");
    }

    #[test]
    fn content_memfd_sealed_applies_write_seal() {
        let fd = content_memfd(b"data", true).expect("content_memfd");
        // SAFETY: `F_GET_SEALS` only queries an fd this test owns and passes
        // no pointers.
        let seals = unsafe { libc::fcntl(fd.as_raw_fd(), libc::F_GET_SEALS) };
        assert!(seals >= 0, "F_GET_SEALS failed");
        assert!(
            seals & libc::F_SEAL_WRITE != 0,
            "expected F_SEAL_WRITE on a sealed memfd, got {seals:#x}"
        );
    }

    #[test]
    fn content_memfd_unsealed_has_no_write_seal() {
        let fd = content_memfd(b"data", false).expect("content_memfd");
        // SAFETY: `F_GET_SEALS` only queries an fd this test owns and passes
        // no pointers.
        let seals = unsafe { libc::fcntl(fd.as_raw_fd(), libc::F_GET_SEALS) };
        assert!(seals >= 0, "F_GET_SEALS failed");
        assert_eq!(
            seals & libc::F_SEAL_WRITE,
            0,
            "unsealed memfd must not carry a write seal, got {seals:#x}"
        );
    }

    #[test]
    fn inject_bytes_produces_sealed_cloexec_injectfdsend() {
        use std::io::Read;
        match NotifAction::inject_bytes(b"payload") {
            NotifAction::InjectFdSend { srcfd, newfd_flags } => {
                assert_eq!(newfd_flags, libc::O_CLOEXEC as u32);
                // SAFETY: `F_GET_SEALS` only queries an fd this test owns and
                // passes no pointers.
                let seals = unsafe { libc::fcntl(srcfd.as_raw_fd(), libc::F_GET_SEALS) };
                // A failed fcntl returns -1, which has every bit set and would
                // make the seal check below pass vacuously.
                assert!(seals >= 0, "F_GET_SEALS failed");
                assert!(seals & libc::F_SEAL_WRITE != 0, "inject_bytes must seal");
                let mut f = std::fs::File::from(srcfd);
                let mut buf = String::new();
                f.read_to_string(&mut buf).unwrap();
                assert_eq!(buf, "payload");
            }
            other => panic!("expected InjectFdSend, got {other:?}"),
        }
    }

    #[test]
    fn test_network_state_new() {
        let ns = super::super::state::NetworkState::new();
        assert!(matches!(ns.tcp_policy, NetworkPolicy::Unrestricted));
        assert!(matches!(ns.udp_policy, NetworkPolicy::Unrestricted));
        assert!(matches!(ns.icmp_policy, NetworkPolicy::Unrestricted));
        assert!(ns.port_map.bound_ports.is_empty());
    }

    #[test]
    fn test_time_random_state_new() {
        let tr = super::super::state::TimeRandomState::new(None, None);
        assert!(tr.time_offset.is_none());
        assert!(tr.random_state.is_none());
    }

    #[test]
    fn test_resource_state_new() {
        let rs = super::super::state::ResourceState::new(1024 * 1024, 10);
        assert_eq!(rs.mem_used, 0);
        assert_eq!(rs.max_memory_bytes, 1024 * 1024);
        assert_eq!(rs.max_processes, 10);
        assert!(!rs.hold_forks);
        assert!(rs.held_notif_ids.is_empty());
    }

    #[test]
    fn test_process_vm_readv_self() {
        // Read a known local back out of our own address space.
        let data: u64 = 0xDEADBEEF_CAFEBABE;
        let addr = &data as *const u64 as u64;
        let pid = std::process::id();
        let bytes = read_child_mem_vm(pid, addr, 8).expect("read_child_mem_vm on self failed");
        let read_val = u64::from_ne_bytes(bytes[..8].try_into().unwrap());
        assert_eq!(read_val, 0xDEADBEEF_CAFEBABE);
    }

    #[test]
    fn test_process_vm_writev_self() {
        // Overwrite a local in our own address space and observe the new value.
        let mut data: u64 = 0;
        let addr = &mut data as *mut u64 as u64;
        let pid = std::process::id();
        let payload = 0x1234567890ABCDEFu64.to_ne_bytes();
        let result = write_child_mem_vm(pid, addr, &payload);
        assert!(result.is_ok(), "write_child_mem_vm on self failed");
        assert_eq!(data, 0x1234567890ABCDEF);
    }
}