Skip to main content

sandlock_core/seccomp/
notif.rs

1// Seccomp user notification supervisor — async event loop that receives
2// notifications from the kernel, dispatches them to handler functions, and
3// sends responses.
4
5use std::collections::HashSet;
6use std::io;
7use std::net::IpAddr;
8use std::os::unix::io::{AsRawFd, FromRawFd, OwnedFd, RawFd};
9use std::sync::Arc;
10
11use crate::error::NotifError;
12use crate::arch;
13use crate::sys::structs::{
14    SeccompNotif, SeccompNotifAddfd, SeccompNotifResp,
15    SECCOMP_ADDFD_FLAG_SEND, SECCOMP_IOCTL_NOTIF_ADDFD, SECCOMP_IOCTL_NOTIF_ID_VALID, SECCOMP_IOCTL_NOTIF_RECV,
16    SECCOMP_IOCTL_NOTIF_SEND, SECCOMP_IOCTL_NOTIF_SET_FLAGS,
17    SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP, SECCOMP_USER_NOTIF_FLAG_CONTINUE,
18    ENOMEM,
19};
20
21// ============================================================
22// NotifAction — how the supervisor should respond
23// ============================================================
24
/// One-shot hook run after a successful `InjectFdSendTracked`.
///
/// The wrapped closure receives the child-side fd number that
/// `SECCOMP_IOCTL_NOTIF_ADDFD` returned.  `Debug` is written by hand (a
/// boxed closure has no `Debug` of its own) so that `NotifAction` can keep
/// deriving it.  The closure must be `Send + Sync` so that `&NotifAction`
/// stays `Send`, which is required because `NotifAction` is borrowed across
/// `.await` points in the notifier loop.
pub struct OnInjectSuccess(pub Box<dyn FnOnce(i32) + Send + Sync>);

impl OnInjectSuccess {
    /// Box `f` into a new callback wrapper.
    pub fn new<F>(f: F) -> Self
    where
        F: FnOnce(i32) + Send + Sync + 'static,
    {
        OnInjectSuccess(Box::new(f))
    }
}

impl std::fmt::Debug for OnInjectSuccess {
    /// Emits a fixed placeholder; the closure itself cannot be formatted.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "OnInjectSuccess(<callback>)")
    }
}
44
/// How the supervisor should respond to a notification.
///
/// Each variant is consumed by `send_response`, which maps it onto the
/// matching seccomp ioctl(s).
#[derive(Debug)]
pub enum NotifAction {
    /// SECCOMP_USER_NOTIF_FLAG_CONTINUE — let the syscall proceed.
    Continue,
    /// Return -1 with the given errno.
    /// The value is given as a positive errno; it is negated when the
    /// response is built.
    Errno(i32),
    /// Inject a file descriptor into the child, then continue.
    /// `srcfd` is a supervisor-side fd; `targetfd` is the fd number requested
    /// in the child.
    InjectFd { srcfd: RawFd, targetfd: i32 },
    /// Inject a file descriptor using SECCOMP_ADDFD_FLAG_SEND (atomically responds).
    /// The child sees the injected fd as the return value of the syscall.
    /// The `OwnedFd` is closed automatically after the ioctl completes.
    /// `newfd_flags` controls flags on the injected fd (e.g. O_CLOEXEC).
    /// If the ioctl fails, the dispatcher falls back to `Continue`.
    InjectFdSend { srcfd: OwnedFd, newfd_flags: u32 },
    /// Like `InjectFdSend`, but also invokes `on_success` with the
    /// child-side fd number that `SECCOMP_IOCTL_NOTIF_ADDFD` returned.
    /// Used when the caller needs to track the exact fd number allocated
    /// in the child (e.g. to key per-fd state without TOCTOU).
    /// `on_success` is not invoked when the ioctl fails.
    InjectFdSendTracked {
        srcfd: OwnedFd,
        newfd_flags: u32,
        on_success: OnInjectSuccess,
    },
    /// Synthetic return value (the child sees this as the syscall result).
    ReturnValue(i64),
    /// Don't respond — used for checkpoint/freeze.
    Hold,
    /// Kill the child process group (OOM-kill semantics).
    /// Fields: signal, process group leader pid.
    /// After signalling, the notification is answered with ENOMEM.
    Kill { sig: i32, pgid: i32 },
}
76
77// ============================================================
78// NetworkPolicy — network access policy enum
79// ============================================================
80
/// Global network policy for the sandbox.
///
/// NOTE(review): enforcement happens outside this part of the file; this
/// type only describes which destination IPs are permitted.
#[derive(Debug, Clone)]
pub enum NetworkPolicy {
    /// All IPs allowed (no net_allow_hosts configured).
    Unrestricted,
    /// Only these IPs are allowed (from resolved net_allow_hosts).
    /// Membership is by exact `IpAddr` equality (it is a `HashSet`).
    AllowList(HashSet<IpAddr>),
}
89
90/// Check if a path-bearing notification targets a denied path.
91///
92/// For two-path syscalls (renameat2, linkat), checks both source and
93/// destination paths — a denied file must not be linked, renamed, or
94/// overwritten.
95///
96/// Each resolved path is checked both as-is (lexical normalization) and
97/// after following symlinks via `canonicalize`.  This prevents bypass via
98/// pre-existing symlinks, relative symlinks, or symlink chains that
99/// ultimately resolve to a denied path.
100pub(crate) fn is_path_denied_for_notif(
101    policy_fn_state: &super::state::PolicyFnState,
102    notif: &SeccompNotif,
103    notif_fd: RawFd,
104) -> bool {
105    if let Some(path) = resolve_path_for_notif(notif, notif_fd) {
106        if is_denied_with_symlink_resolve(policy_fn_state, &path) {
107            return true;
108        }
109    }
110    // For two-path syscalls, also check the second (destination) path.
111    if let Some(path) = resolve_second_path_for_notif(notif, notif_fd) {
112        if is_denied_with_symlink_resolve(policy_fn_state, &path) {
113            return true;
114        }
115    }
116    false
117}
118
119/// Check a path against denied entries, also resolving symlinks.
120///
121/// First checks the lexical path, then `canonicalize`s to follow symlinks
122/// and checks the real path.  This catches pre-existing symlinks, relative
123/// symlinks, and symlink chains that resolve to a denied file.
124fn is_denied_with_symlink_resolve(
125    policy_fn_state: &super::state::PolicyFnState,
126    path: &str,
127) -> bool {
128    // Check the literal (lexically normalized) path first.
129    if policy_fn_state.is_path_denied(path) {
130        return true;
131    }
132    // Follow symlinks and re-check against denied entries.
133    if let Ok(real) = std::fs::canonicalize(path) {
134        if policy_fn_state.is_path_denied(&real.to_string_lossy()) {
135            return true;
136        }
137    }
138    false
139}
140
141/// Duplicate a file descriptor from an arbitrary process (by PID/TID) into the supervisor.
142/// Uses PIDFD_THREAD so pidfd_open works for any thread, not just the group leader.
143pub(crate) fn dup_fd_from_pid(pid: u32, target_fd: i32) -> Result<OwnedFd, io::Error> {
144    const SYS_PIDFD_OPEN: i64 = 434;
145    const SYS_PIDFD_GETFD: i64 = 438;
146    const PIDFD_THREAD: i64 = libc::O_EXCL as i64; // Linux 6.9+
147    let pidfd = unsafe { libc::syscall(SYS_PIDFD_OPEN, pid as i64, PIDFD_THREAD) };
148    if pidfd < 0 {
149        return Err(io::Error::last_os_error());
150    }
151    let pidfd_owned = unsafe { OwnedFd::from_raw_fd(pidfd as i32) };
152    let ret = unsafe {
153        libc::syscall(SYS_PIDFD_GETFD, pidfd_owned.as_raw_fd() as i64, target_fd as i64, 0i64)
154    };
155    if ret < 0 {
156        Err(io::Error::last_os_error())
157    } else {
158        Ok(unsafe { OwnedFd::from_raw_fd(ret as i32) })
159    }
160}
161
162// ============================================================
163// NotifPolicy — policy for the notification supervisor
164// ============================================================
165
/// Policy for the notification supervisor.
///
/// A plain bag of settings read by the notification handlers.  Only a few
/// fields are consumed in this part of the file; the remaining descriptions
/// are hedged where their use is not visible here.
pub struct NotifPolicy {
    /// Memory cap in bytes — presumably only meaningful when
    /// `has_memory_limit` is set; TODO confirm against the handlers.
    pub max_memory_bytes: u64,
    /// Process-count cap — NOTE(review): enforcement is not visible here.
    pub max_processes: u32,
    pub has_memory_limit: bool,
    pub has_net_allowlist: bool,
    /// Forwarded to `vdso::patch` when the vDSO is (re-)patched.
    pub has_random_seed: bool,
    /// When true, `time_offset` is passed to `vdso::patch`; otherwise no
    /// time offset is applied.
    pub has_time_start: bool,
    /// Time offset handed to `vdso::patch` (units not visible here).
    pub time_offset: i64,
    pub num_cpus: Option<u32>,
    pub port_remap: bool,
    pub cow_enabled: bool,
    pub chroot_root: Option<std::path::PathBuf>,
    /// Virtual paths allowed for reading under chroot (original user-specified paths).
    pub chroot_readable: Vec<std::path::PathBuf>,
    /// Virtual paths allowed for writing under chroot (original user-specified paths).
    pub chroot_writable: Vec<std::path::PathBuf>,
    /// Virtual paths explicitly denied under chroot.
    pub chroot_denied: Vec<std::path::PathBuf>,
    /// Mount mappings: (virtual_path, host_path) pairs.
    pub chroot_mounts: Vec<(std::path::PathBuf, std::path::PathBuf)>,
    pub deterministic_dirs: bool,
    pub hostname: Option<String>,
    pub has_http_acl: bool,
    /// Synthetic `/etc/hosts` content for `net_allow_hosts` virtualization.
    /// When set, `openat("/etc/hosts")` returns a memfd with this content
    /// so sandboxed processes can resolve allowed hostnames without DNS.
    pub virtual_etc_hosts: Option<String>,
}
195
196// ============================================================
197// Low-level ioctl helpers
198// ============================================================
199
200/// Receive a seccomp notification from the kernel.
201/// ioctl(fd, SECCOMP_IOCTL_NOTIF_RECV, &notif)
202fn recv_notif(fd: RawFd) -> io::Result<SeccompNotif> {
203    let mut notif: SeccompNotif = unsafe { std::mem::zeroed() };
204    let ret = unsafe {
205        libc::ioctl(fd, SECCOMP_IOCTL_NOTIF_RECV as libc::c_ulong, &mut notif as *mut _)
206    };
207    if ret < 0 {
208        Err(io::Error::last_os_error())
209    } else {
210        Ok(notif)
211    }
212}
213
214/// Send a response with SECCOMP_USER_NOTIF_FLAG_CONTINUE.
215fn respond_continue(fd: RawFd, id: u64) -> io::Result<()> {
216    let resp = SeccompNotifResp {
217        id,
218        val: 0,
219        error: 0,
220        flags: SECCOMP_USER_NOTIF_FLAG_CONTINUE,
221    };
222    send_resp_raw(fd, &resp)
223}
224
225/// Send a response that returns -1 with the given errno.
226fn respond_errno(fd: RawFd, id: u64, errno: i32) -> io::Result<()> {
227    let resp = SeccompNotifResp {
228        id,
229        val: 0,
230        error: -errno,
231        flags: 0,
232    };
233    send_resp_raw(fd, &resp)
234}
235
236/// Send a response with a synthetic return value.
237fn respond_value(fd: RawFd, id: u64, val: i64) -> io::Result<()> {
238    let resp = SeccompNotifResp {
239        id,
240        val,
241        error: 0,
242        flags: 0,
243    };
244    send_resp_raw(fd, &resp)
245}
246
247/// Inject a file descriptor into the child process using SECCOMP_ADDFD_FLAG_SEND.
248///
249/// Uses the SEND flag to atomically inject the fd and respond to the syscall.
250/// The ioctl return value is the fd number assigned in the child process.
251/// After this call, no additional SECCOMP_IOCTL_NOTIF_SEND is needed.
252fn inject_fd_and_send(fd: RawFd, id: u64, srcfd: RawFd, newfd_flags: u32) -> io::Result<i32> {
253    let addfd = SeccompNotifAddfd {
254        id,
255        flags: SECCOMP_ADDFD_FLAG_SEND,
256        srcfd: srcfd as u32,
257        newfd: 0,   // ignored when SECCOMP_ADDFD_FLAG_SETFD is not set
258        newfd_flags,
259    };
260    let ret = unsafe {
261        libc::ioctl(fd, SECCOMP_IOCTL_NOTIF_ADDFD as libc::c_ulong, &addfd as *const _)
262    };
263    if ret < 0 {
264        Err(io::Error::last_os_error())
265    } else {
266        Ok(ret as i32)
267    }
268}
269
270/// Inject a file descriptor into the child process (without responding).
271/// ioctl(fd, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd)
272fn inject_fd(fd: RawFd, id: u64, srcfd: RawFd, targetfd: i32) -> io::Result<()> {
273    let addfd = SeccompNotifAddfd {
274        id,
275        flags: 0,
276        srcfd: srcfd as u32,
277        newfd: targetfd as u32,
278        newfd_flags: 0,
279    };
280    let ret = unsafe {
281        libc::ioctl(fd, SECCOMP_IOCTL_NOTIF_ADDFD as libc::c_ulong, &addfd as *const _)
282    };
283    if ret < 0 {
284        Err(io::Error::last_os_error())
285    } else {
286        Ok(())
287    }
288}
289
290/// Raw ioctl to send a notification response.
291fn send_resp_raw(fd: RawFd, resp: &SeccompNotifResp) -> io::Result<()> {
292    let ret = unsafe {
293        libc::ioctl(fd, SECCOMP_IOCTL_NOTIF_SEND as libc::c_ulong, resp as *const _)
294    };
295    if ret < 0 {
296        Err(io::Error::last_os_error())
297    } else {
298        Ok(())
299    }
300}
301
302/// Check whether a notification ID is still valid (TOCTOU guard).
303/// ioctl(fd, SECCOMP_IOCTL_NOTIF_ID_VALID, &id)
304pub(crate) fn id_valid(fd: RawFd, id: u64) -> io::Result<()> {
305    let ret = unsafe {
306        libc::ioctl(fd, SECCOMP_IOCTL_NOTIF_ID_VALID as libc::c_ulong, &id as *const _)
307    };
308    if ret < 0 {
309        Err(io::Error::last_os_error())
310    } else {
311        Ok(())
312    }
313}
314
315/// Try to enable sync wakeup (Linux 6.7+). Ignores errors.
316fn try_set_sync_wakeup(fd: RawFd) {
317    let flags: u64 = SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP as u64;
318    unsafe {
319        libc::ioctl(fd, SECCOMP_IOCTL_NOTIF_SET_FLAGS as libc::c_ulong, &flags as *const _);
320    }
321}
322
323// ============================================================
324// Child memory access helpers
325// ============================================================
326
327/// Read bytes from a child process via process_vm_readv (single syscall).
328fn read_child_mem_vm(pid: u32, addr: u64, len: usize) -> Result<Vec<u8>, NotifError> {
329    let mut buf = vec![0u8; len];
330    let local_iov = libc::iovec {
331        iov_base: buf.as_mut_ptr() as *mut libc::c_void,
332        iov_len: len,
333    };
334    let remote_iov = libc::iovec {
335        iov_base: addr as *mut libc::c_void,
336        iov_len: len,
337    };
338    let ret = unsafe {
339        libc::process_vm_readv(pid as i32, &local_iov, 1, &remote_iov, 1, 0)
340    };
341    if ret < 0 {
342        Err(NotifError::ChildMemoryRead(io::Error::last_os_error()))
343    } else {
344        buf.truncate(ret as usize);
345        Ok(buf)
346    }
347}
348
349/// Write bytes to a child process via process_vm_writev (single syscall).
350fn write_child_mem_vm(pid: u32, addr: u64, data: &[u8]) -> Result<(), NotifError> {
351    let local_iov = libc::iovec {
352        iov_base: data.as_ptr() as *mut libc::c_void,
353        iov_len: data.len(),
354    };
355    let remote_iov = libc::iovec {
356        iov_base: addr as *mut libc::c_void,
357        iov_len: data.len(),
358    };
359    let ret = unsafe {
360        libc::process_vm_writev(pid as i32, &local_iov, 1, &remote_iov, 1, 0)
361    };
362    if ret < 0 {
363        Err(NotifError::ChildMemoryRead(io::Error::last_os_error()))
364    } else if (ret as usize) < data.len() {
365        Err(NotifError::ChildMemoryRead(io::Error::new(
366            io::ErrorKind::WriteZero,
367            format!("short write: {} of {} bytes", ret, data.len()),
368        )))
369    } else {
370        Ok(())
371    }
372}
373
374/// Read bytes from a child process via process_vm_readv.
375///
376/// Performs TOCTOU validation by calling `id_valid` before and after
377/// the read to ensure the notification is still live.
378pub(crate) fn read_child_mem(
379    notif_fd: RawFd,
380    id: u64,
381    pid: u32,
382    addr: u64,
383    len: usize,
384) -> Result<Vec<u8>, NotifError> {
385    id_valid(notif_fd, id).map_err(NotifError::Ioctl)?;
386    let result = read_child_mem_vm(pid, addr, len)?;
387    id_valid(notif_fd, id).map_err(NotifError::Ioctl)?;
388    Ok(result)
389}
390
391/// Read a NUL-terminated string from child memory without crossing unmapped
392/// page boundaries in a single `process_vm_readv` call.
393pub(crate) fn read_child_cstr(
394    notif_fd: RawFd,
395    id: u64,
396    pid: u32,
397    addr: u64,
398    max_len: usize,
399) -> Option<String> {
400    if addr == 0 || max_len == 0 {
401        return None;
402    }
403
404    const PAGE_SIZE: u64 = 4096;
405    let mut result = Vec::with_capacity(max_len.min(256));
406    let mut cur = addr;
407    while result.len() < max_len {
408        let page_remaining = PAGE_SIZE - (cur % PAGE_SIZE);
409        let remaining = max_len - result.len();
410        let to_read = page_remaining.min(remaining as u64) as usize;
411        let bytes = read_child_mem(notif_fd, id, pid, cur, to_read).ok()?;
412        if let Some(nul) = bytes.iter().position(|&b| b == 0) {
413            result.extend_from_slice(&bytes[..nul]);
414            return String::from_utf8(result).ok();
415        }
416        result.extend_from_slice(&bytes);
417        cur += to_read as u64;
418    }
419
420    String::from_utf8(result).ok()
421}
422
423/// Write bytes to a child process via process_vm_writev.
424///
425/// Performs TOCTOU validation by calling `id_valid` before and after
426/// the write to ensure the notification is still live.
427pub(crate) fn write_child_mem(
428    notif_fd: RawFd,
429    id: u64,
430    pid: u32,
431    addr: u64,
432    data: &[u8],
433) -> Result<(), NotifError> {
434    id_valid(notif_fd, id).map_err(NotifError::Ioctl)?;
435    write_child_mem_vm(pid, addr, data)?;
436    id_valid(notif_fd, id).map_err(NotifError::Ioctl)?;
437    Ok(())
438}
439
440// ============================================================
441// Response dispatch
442// ============================================================
443
444/// Dispatch a `NotifAction` to the appropriate low-level response function.
445fn send_response(fd: RawFd, id: u64, action: NotifAction) -> io::Result<()> {
446    match action {
447        NotifAction::Continue => respond_continue(fd, id),
448        NotifAction::Errno(errno) => respond_errno(fd, id, errno),
449        NotifAction::InjectFd { srcfd, targetfd } => {
450            inject_fd(fd, id, srcfd, targetfd)?;
451            respond_continue(fd, id)
452        }
453        NotifAction::InjectFdSend { srcfd, newfd_flags } => {
454            // SECCOMP_ADDFD_FLAG_SEND atomically injects the fd and responds.
455            // No separate NOTIF_SEND needed after this.
456            // Fall back to Continue if ADDFD_SEND fails (e.g., old kernel).
457            // srcfd (OwnedFd) is dropped at end of this arm, closing the fd.
458            match inject_fd_and_send(fd, id, srcfd.as_raw_fd(), newfd_flags) {
459                Ok(_new_fd) => Ok(()),
460                Err(_) => respond_continue(fd, id),
461            }
462        }
463        NotifAction::InjectFdSendTracked { srcfd, newfd_flags, on_success } => {
464            match inject_fd_and_send(fd, id, srcfd.as_raw_fd(), newfd_flags) {
465                Ok(new_fd) => {
466                    (on_success.0)(new_fd);
467                    Ok(())
468                }
469                Err(_) => respond_continue(fd, id),
470            }
471        }
472        NotifAction::ReturnValue(val) => respond_value(fd, id, val),
473        NotifAction::Hold => Ok(()), // Don't send a response.
474        NotifAction::Kill { sig, pgid } => {
475            // Kill the entire process group, then return ENOMEM so the
476            // seccomp notification is resolved (avoids a kernel warning).
477            unsafe { libc::killpg(pgid, sig) };
478            respond_errno(fd, id, ENOMEM)
479        }
480    }
481}
482
483// ============================================================
484// vDSO re-patching after exec
485// ============================================================
486
487/// Re-patch the vDSO if the base address changed (e.g. after exec replaces it).
488fn maybe_patch_vdso(pid: i32, procfs: &mut super::state::ProcfsState, policy: &NotifPolicy) {
489    let base = match crate::vdso::find_vdso_base(pid) {
490        Ok(addr) => addr,
491        Err(_) => return,
492    };
493    if base == procfs.vdso_patched_addr {
494        return; // already patched this vDSO
495    }
496    let time_offset = if policy.has_time_start { Some(policy.time_offset) } else { None };
497    if crate::vdso::patch(pid, time_offset, policy.has_random_seed).is_ok() {
498        procfs.vdso_patched_addr = base;
499    }
500}
501
502// ============================================================
503// Policy event emission
504// ============================================================
505
506/// Map a syscall number to a human-readable name for the policy callback.
507fn syscall_name(nr: i64) -> &'static str {
508    match nr {
509        n if n == libc::SYS_openat => "openat",
510        n if n == libc::SYS_connect => "connect",
511        n if n == libc::SYS_sendto => "sendto",
512        n if n == libc::SYS_sendmsg => "sendmsg",
513        n if n == libc::SYS_bind => "bind",
514        n if n == libc::SYS_clone => "clone",
515        n if n == libc::SYS_clone3 => "clone3",
516        n if Some(n) == arch::SYS_VFORK => "vfork",
517        n if n == libc::SYS_execve => "execve",
518        n if n == libc::SYS_execveat => "execveat",
519        n if n == libc::SYS_mmap => "mmap",
520        n if n == libc::SYS_munmap => "munmap",
521        n if n == libc::SYS_brk => "brk",
522        n if n == libc::SYS_getrandom => "getrandom",
523        n if n == libc::SYS_unlinkat => "unlinkat",
524        n if n == libc::SYS_mkdirat => "mkdirat",
525        _ => "unknown",
526    }
527}
528
529/// Map a syscall number to a high-level category.
530fn syscall_category(nr: i64) -> crate::policy_fn::SyscallCategory {
531    use crate::policy_fn::SyscallCategory;
532    match nr {
533        n if n == libc::SYS_openat || n == libc::SYS_unlinkat
534            || n == libc::SYS_mkdirat || n == libc::SYS_renameat2
535            || n == libc::SYS_symlinkat || n == libc::SYS_linkat
536            || n == libc::SYS_fchmodat || n == libc::SYS_fchownat
537            || n == libc::SYS_truncate || n == libc::SYS_readlinkat
538            || n == libc::SYS_newfstatat || n == libc::SYS_statx
539            || n == libc::SYS_faccessat || n == libc::SYS_getdents64
540            || Some(n) == arch::SYS_GETDENTS => SyscallCategory::File,
541        n if n == libc::SYS_connect || n == libc::SYS_sendto
542            || n == libc::SYS_sendmsg || n == libc::SYS_bind
543            || n == libc::SYS_getsockname => SyscallCategory::Network,
544        n if n == libc::SYS_clone || n == libc::SYS_clone3
545            || Some(n) == arch::SYS_VFORK || n == libc::SYS_execve
546            || n == libc::SYS_execveat => SyscallCategory::Process,
547        n if n == libc::SYS_mmap || n == libc::SYS_munmap
548            || n == libc::SYS_brk || n == libc::SYS_mremap
549            => SyscallCategory::Memory,
550        _ => SyscallCategory::File, // default
551    }
552}
553
/// Read the parent PID from /proc/{pid}/stat.
///
/// The line has the form `pid (comm) state ppid ...`.  Because `comm` may
/// itself contain spaces and parentheses, parsing starts after the LAST `)`.
///
/// Returns `None` if the file cannot be read or the line is malformed or
/// truncated; no input can make this function panic.
fn read_ppid(pid: u32) -> Option<u32> {
    let stat = std::fs::read_to_string(format!("/proc/{}/stat", pid)).ok()?;
    let close_paren = stat.rfind(')')?;
    // Checked slicing: a line that ends right after ')' yields an empty tail
    // instead of an out-of-range panic.
    let rest = stat.get(close_paren + 1..)?;
    // After the comm field: field 0 = state, field 1 = ppid.
    rest.split_whitespace().nth(1)?.parse().ok()
}
565
566/// Read a NUL-terminated path from child memory (up to 256 bytes).
567fn read_path_for_event(notif: &SeccompNotif, addr: u64, notif_fd: RawFd) -> Option<String> {
568    if addr == 0 { return None; }
569    let bytes = read_child_mem(notif_fd, notif.id, notif.pid, addr, 256).ok()?;
570    let nul = bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len());
571    String::from_utf8(bytes[..nul].to_vec()).ok()
572}
573
/// Lexically normalize `path` without touching the filesystem.
///
/// `.` components and redundant separators are removed and `..` cancels the
/// preceding normal component.  For absolute paths a `..` at the root is
/// dropped (`/..` is `/`); for relative paths a `..` that has nothing left
/// to cancel is kept, so `../a` stays `../a` rather than collapsing to `a`.
/// Symlinks are NOT resolved.
///
/// An empty result is rendered as `/` for absolute input and `.` otherwise.
/// Non-UTF-8 bytes are replaced lossily.
fn normalize_path(path: &std::path::Path) -> String {
    use std::path::{Component, PathBuf};

    let absolute = path.is_absolute();
    let mut normalized = PathBuf::new();
    if absolute {
        normalized.push("/");
    }

    // Number of normal components currently in `normalized` that a following
    // `..` is allowed to cancel.
    let mut depth = 0usize;
    for component in path.components() {
        match component {
            Component::RootDir | Component::CurDir | Component::Prefix(_) => {}
            Component::ParentDir => {
                if depth > 0 {
                    normalized.pop();
                    depth -= 1;
                } else if !absolute {
                    // Nothing to cancel: the path really climbs upwards.
                    normalized.push("..");
                }
            }
            Component::Normal(part) => {
                normalized.push(part);
                depth += 1;
            }
        }
    }

    if normalized.as_os_str().is_empty() {
        if absolute { "/".into() } else { ".".into() }
    } else {
        normalized.to_string_lossy().into_owned()
    }
}
600
601fn resolve_at_path_for_event(notif: &SeccompNotif, dirfd: i64, path: &str) -> Option<String> {
602    use std::path::Path;
603
604    if Path::new(path).is_absolute() {
605        return Some(normalize_path(Path::new(path)));
606    }
607
608    let dirfd32 = dirfd as i32;
609    let base = if dirfd32 == libc::AT_FDCWD {
610        std::fs::read_link(format!("/proc/{}/cwd", notif.pid)).ok()?
611    } else {
612        std::fs::read_link(format!("/proc/{}/fd/{}", notif.pid, dirfd32)).ok()?
613    };
614
615    Some(normalize_path(&base.join(path)))
616}
617
618fn resolve_path_for_notif(notif: &SeccompNotif, notif_fd: RawFd) -> Option<String> {
619    let nr = notif.data.nr as i64;
620    match nr {
621        n if n == libc::SYS_openat => {
622            // openat(dirfd, pathname, flags, mode)
623            let path = read_path_for_event(notif, notif.data.args[1], notif_fd)?;
624            resolve_at_path_for_event(notif, notif.data.args[0] as i64, &path)
625        }
626        n if Some(n) == arch::SYS_OPEN || n == libc::SYS_execve => {
627            let path = read_path_for_event(notif, notif.data.args[0], notif_fd)?;
628            resolve_at_path_for_event(notif, libc::AT_FDCWD as i64, &path)
629        }
630        n if n == libc::SYS_execveat => {
631            let path = read_path_for_event(notif, notif.data.args[1], notif_fd)?;
632            resolve_at_path_for_event(notif, notif.data.args[0] as i64, &path)
633        }
634        // linkat(olddirfd, oldpath, newdirfd, newpath, flags)
635        // Check the source (old) path — deny if it's a denied file being linked away.
636        n if n == libc::SYS_linkat => {
637            let path = read_path_for_event(notif, notif.data.args[1], notif_fd)?;
638            resolve_at_path_for_event(notif, notif.data.args[0] as i64, &path)
639        }
640        // renameat2(olddirfd, oldpath, newdirfd, newpath, flags)
641        // Check the source (old) path — deny if a denied file is being renamed away.
642        n if n == libc::SYS_renameat2 => {
643            let path = read_path_for_event(notif, notif.data.args[1], notif_fd)?;
644            resolve_at_path_for_event(notif, notif.data.args[0] as i64, &path)
645        }
646        // symlinkat(target, newdirfd, linkpath)
647        // The target string is what the symlink points to; deny if it names a denied path.
648        n if n == libc::SYS_symlinkat => {
649            let target = read_path_for_event(notif, notif.data.args[0], notif_fd)?;
650            // target may be absolute or relative to the process cwd
651            resolve_at_path_for_event(notif, libc::AT_FDCWD as i64, &target)
652        }
653        // link(oldpath, newpath) — legacy, AT_FDCWD implied for both
654        n if Some(n) == arch::SYS_LINK => {
655            let path = read_path_for_event(notif, notif.data.args[0], notif_fd)?;
656            resolve_at_path_for_event(notif, libc::AT_FDCWD as i64, &path)
657        }
658        // rename(oldpath, newpath) — legacy, AT_FDCWD implied for both
659        n if Some(n) == arch::SYS_RENAME => {
660            let path = read_path_for_event(notif, notif.data.args[0], notif_fd)?;
661            resolve_at_path_for_event(notif, libc::AT_FDCWD as i64, &path)
662        }
663        // symlink(target, linkpath) — legacy
664        n if Some(n) == arch::SYS_SYMLINK => {
665            let target = read_path_for_event(notif, notif.data.args[0], notif_fd)?;
666            resolve_at_path_for_event(notif, libc::AT_FDCWD as i64, &target)
667        }
668        _ => None,
669    }
670}
671
672/// Resolve the second (destination) path for two-path syscalls.
673///
674/// Returns `None` for syscalls that only have a single path argument.
675fn resolve_second_path_for_notif(notif: &SeccompNotif, notif_fd: RawFd) -> Option<String> {
676    let nr = notif.data.nr as i64;
677    match nr {
678        // renameat2(olddirfd, oldpath, newdirfd, newpath, flags)
679        n if n == libc::SYS_renameat2 => {
680            let path = read_path_for_event(notif, notif.data.args[3], notif_fd)?;
681            resolve_at_path_for_event(notif, notif.data.args[2] as i64, &path)
682        }
683        // linkat(olddirfd, oldpath, newdirfd, newpath, flags)
684        // Destination of a hardlink to a denied file should also be denied
685        // (prevents overwriting a denied file via linkat).
686        n if n == libc::SYS_linkat => {
687            let path = read_path_for_event(notif, notif.data.args[3], notif_fd)?;
688            resolve_at_path_for_event(notif, notif.data.args[2] as i64, &path)
689        }
690        // rename(oldpath, newpath) — legacy
691        n if Some(n) == arch::SYS_RENAME => {
692            let path = read_path_for_event(notif, notif.data.args[1], notif_fd)?;
693            resolve_at_path_for_event(notif, libc::AT_FDCWD as i64, &path)
694        }
695        // link(oldpath, newpath) — legacy
696        n if Some(n) == arch::SYS_LINK => {
697            let path = read_path_for_event(notif, notif.data.args[1], notif_fd)?;
698            resolve_at_path_for_event(notif, libc::AT_FDCWD as i64, &path)
699        }
700        _ => None,
701    }
702}
703
704/// Extract IP and port from a sockaddr in child memory.
705fn read_sockaddr_for_event(notif: &SeccompNotif, addr: u64, len: usize, notif_fd: RawFd)
706    -> (Option<std::net::IpAddr>, Option<u16>)
707{
708    if addr == 0 || len < 4 { return (None, None); }
709    let bytes = match read_child_mem(notif_fd, notif.id, notif.pid, addr, len.min(128)) {
710        Ok(b) => b,
711        Err(_) => return (None, None),
712    };
713    if bytes.len() < 4 { return (None, None); }
714    let family = u16::from_ne_bytes([bytes[0], bytes[1]]);
715    let port = u16::from_be_bytes([bytes[2], bytes[3]]);
716    let ip = match family as u32 {
717        f if f == crate::sys::structs::AF_INET && bytes.len() >= 8 => {
718            Some(std::net::IpAddr::V4(std::net::Ipv4Addr::new(
719                bytes[4], bytes[5], bytes[6], bytes[7],
720            )))
721        }
722        f if f == crate::sys::structs::AF_INET6 && bytes.len() >= 24 => {
723            let mut addr = [0u8; 16];
724            addr.copy_from_slice(&bytes[8..24]);
725            Some(std::net::IpAddr::V6(std::net::Ipv6Addr::from(addr)))
726        }
727        _ => None,
728    };
729    (ip, if port > 0 { Some(port) } else { None })
730}
731
732/// Read argv (NULL-terminated array of char* in child memory) for execve.
733/// Capped at 64 entries × 256 bytes/entry as a safety bound.
734fn read_argv_for_event(notif: &SeccompNotif, argv_ptr: u64, notif_fd: RawFd) -> Option<Vec<String>> {
735    if argv_ptr == 0 { return None; }
736    let mut args = Vec::new();
737    let ptr_size = std::mem::size_of::<u64>();
738
739    for i in 0..64u64 {
740        let ptr_addr = argv_ptr + i * ptr_size as u64;
741        let ptr_bytes = read_child_mem(notif_fd, notif.id, notif.pid, ptr_addr, ptr_size).ok()?;
742        let str_ptr = u64::from_ne_bytes(ptr_bytes[..8].try_into().ok()?);
743        if str_ptr == 0 { break; } // NULL terminator
744
745        if let Some(s) = read_path_for_event(notif, str_ptr, notif_fd) {
746            args.push(s);
747        } else {
748            break;
749        }
750    }
751
752    if args.is_empty() { None } else { Some(args) }
753}
754
755/// Emit a syscall event to the policy_fn callback thread (if active).
756/// Returns the callback's verdict for held syscalls.
757async fn emit_policy_event(
758    notif: &SeccompNotif,
759    action: &NotifAction,
760    policy_fn_state: &Arc<tokio::sync::Mutex<super::state::PolicyFnState>>,
761    notif_fd: RawFd,
762) -> Option<crate::policy_fn::Verdict> {
763    let pfs = policy_fn_state.lock().await;
764    let tx = match pfs.event_tx.as_ref() {
765        Some(tx) => tx.clone(),
766        None => return None,
767    };
768    drop(pfs);
769
770    let nr = notif.data.nr as i64;
771    let denied = matches!(action, NotifAction::Errno(_));
772    let name = syscall_name(nr);
773    let category = syscall_category(nr);
774    let parent_pid = read_ppid(notif.pid);
775
776    // Extract metadata based on syscall type.
777    //
778    // Path strings are deliberately NOT extracted: the kernel re-reads
779    // user-memory pointers after Continue, so any path-string-based
780    // decision is racy (issue #27). Path-based access control belongs
781    // in static Landlock rules.
782    //
783    // argv IS extracted for execve/execveat: the supervisor freezes
784    // sibling threads before returning Continue (sibling_freeze module),
785    // so the post-Continue re-read sees the same memory we read here.
786    //
787    // Network fields are TOCTOU-safe because connect/sendto/bind are
788    // performed on-behalf via pidfd_getfd; the kernel never re-reads
789    // child memory for those syscalls.
790    let mut host = None;
791    let mut port = None;
792    let mut size = None;
793    let mut argv = None;
794
795    if nr == libc::SYS_execve || nr == libc::SYS_execveat {
796        // execve(pathname, argv, envp):       args[1] = argv ptr
797        // execveat(dirfd, pathname, argv, ..): args[2] = argv ptr
798        let argv_ptr = if nr == libc::SYS_execveat {
799            notif.data.args[2]
800        } else {
801            notif.data.args[1]
802        };
803        argv = read_argv_for_event(notif, argv_ptr, notif_fd);
804    }
805
806    if nr == libc::SYS_connect || nr == libc::SYS_sendto || nr == libc::SYS_bind {
807        // connect(fd, addr, addrlen): args[1]=addr, args[2]=len
808        let addr_ptr = notif.data.args[1];
809        let addr_len = notif.data.args[2] as usize;
810        let (h, p) = read_sockaddr_for_event(notif, addr_ptr, addr_len, notif_fd);
811        host = h;
812        port = p;
813    }
814
815    if nr == libc::SYS_mmap {
816        // mmap(addr, length, ...): args[1] = length
817        size = Some(notif.data.args[1]);
818    }
819
820    let event = crate::policy_fn::SyscallEvent {
821        syscall: name.to_string(),
822        category,
823        pid: notif.pid,
824        parent_pid,
825        host,
826        port,
827        size,
828        argv,
829        denied,
830    };
831
832    // Hold syscalls where the callback's verdict matters.
833    // The child is blocked until the callback returns.
834    let is_held = nr == libc::SYS_execve || nr == libc::SYS_execveat
835        || nr == libc::SYS_connect || nr == libc::SYS_sendto
836        || nr == libc::SYS_bind || nr == libc::SYS_openat;
837
838    if is_held {
839        let (gate_tx, gate_rx) = tokio::sync::oneshot::channel();
840        let _ = tx.send(crate::policy_fn::PolicyEvent {
841            event,
842            gate: Some(gate_tx),
843        });
844        match tokio::time::timeout(std::time::Duration::from_secs(5), gate_rx).await {
845            Ok(Ok(verdict)) => Some(verdict),
846            _ => None, // timeout or channel closed — allow
847        }
848    } else {
849        let _ = tx.send(crate::policy_fn::PolicyEvent {
850            event,
851            gate: None,
852        });
853        None
854    }
855}
856
857// ============================================================
// Per-notification handler (awaited sequentially by the supervisor loop)
859// ============================================================
860
861/// Process a single seccomp notification: vDSO re-patch, path denial check,
862/// dispatch, policy event emission, and response.
863async fn handle_notification(
864    notif: SeccompNotif,
865    ctx: &Arc<super::ctx::SupervisorCtx>,
866    dispatch_table: &super::dispatch::DispatchTable,
867    fd: RawFd,
868) {
869    let policy = &ctx.policy;
870
871    // Ensure every pid that produces a notification is tracked in the
872    // ProcessIndex with an exit watcher. The fork handler runs on the
873    // *parent* pid (the child doesn't exist yet at clone-time), so the
874    // child gets registered the first time it issues its own syscall.
875    crate::resource::register_child_if_new(ctx, notif.pid as i32).await;
876
877    // Re-patch vDSO if needed (exec replaces it with a fresh copy).
878    if policy.has_time_start || policy.has_random_seed {
879        let mut pfs = ctx.procfs.lock().await;
880        maybe_patch_vdso(notif.pid as i32, &mut pfs, policy);
881    }
882
883    // Check dynamic path denials before dispatch
884    let mut action = {
885        let nr = notif.data.nr as i64;
886        let mut path_check_nrs = vec![
887            libc::SYS_openat, libc::SYS_execve, libc::SYS_execveat,
888            libc::SYS_linkat, libc::SYS_renameat2, libc::SYS_symlinkat,
889        ];
890        path_check_nrs.extend([
891            arch::SYS_OPEN, arch::SYS_LINK, arch::SYS_RENAME, arch::SYS_SYMLINK,
892        ].into_iter().flatten());
893        let should_precheck_denied = policy.chroot_root.is_none()
894            && path_check_nrs.contains(&nr);
895        if should_precheck_denied {
896            let pfs = ctx.policy_fn.lock().await;
897            if is_path_denied_for_notif(&pfs, &notif, fd) {
898                NotifAction::Errno(libc::EACCES)
899            } else {
900                drop(pfs);
901                dispatch_table.dispatch(notif, ctx, fd).await
902            }
903        } else {
904            dispatch_table.dispatch(notif, ctx, fd).await
905        }
906    };
907
908    // Emit event to policy_fn callback if active
909    if let Some(verdict) = emit_policy_event(&notif, &action, &ctx.policy_fn, fd).await {
910        use crate::policy_fn::Verdict;
911        match verdict {
912            Verdict::Deny => { action = NotifAction::Errno(libc::EPERM); }
913            Verdict::DenyWith(errno) => { action = NotifAction::Errno(errno); }
914            Verdict::Audit => { /* allow, but could log here */ }
915            Verdict::Allow => {}
916        }
917    }
918
919    // TOCTOU-close for execve (issue #27): freeze sibling threads of
920    // the calling tid before the kernel re-reads pathname/argv from
921    // child memory.  Cheap because the kernel's de_thread step in
922    // execve kills the siblings anyway — we're just stopping them
923    // moments earlier, closing the race window for the supervisor's
924    // argv inspection in policy_fn.
925    //
926    // Only relevant when we're sending Continue: a denial response
927    // (Errno) means the kernel never re-reads, so no freeze needed.
928    //
929    // Strict on failure: if we cannot freeze the siblings, we cannot
930    // uphold the argv-safety invariant, so we deny the execve with
931    // EPERM rather than letting it through unprotected.
932    let nr = notif.data.nr as i64;
933    if matches!(action, NotifAction::Continue)
934        && crate::sibling_freeze::requires_freeze_on_continue(nr)
935    {
936        if let Err(e) = crate::sibling_freeze::freeze_siblings_for_execve(notif.pid as i32) {
937            eprintln!(
938                "sandlock: argv-safety freeze failed for pid {}: {} \
939                 — denying execve to preserve TOCTOU invariant",
940                notif.pid, e
941            );
942            action = NotifAction::Errno(libc::EPERM);
943        }
944    }
945
946    // Ignore error — child may have exited between recv and response.
947    let _ = send_response(fd, notif.id, action);
948}
949
950// ============================================================
951// Main supervisor loop
952// ============================================================
953
954/// Async event loop that processes seccomp notifications.
955///
956/// Runs until the notification fd is closed (child exits or filter is removed).
957pub async fn supervisor(
958    notif_fd: OwnedFd,
959    ctx: Arc<super::ctx::SupervisorCtx>,
960) {
961    let fd = notif_fd.as_raw_fd();
962
963    // Build the dispatch table once at startup.
964    let dispatch_table = Arc::new(super::dispatch::build_dispatch_table(&ctx.policy, &ctx.resource));
965
966    // Try to enable sync wakeup (Linux 6.7+, ignore error on older kernels).
967    try_set_sync_wakeup(fd);
968
969    // SECCOMP_IOCTL_NOTIF_RECV blocks regardless of O_NONBLOCK, so we
970    // receive notifications in a blocking thread and send them to the
971    // async handler via a channel.  This guarantees we never miss a
972    // notification — the thread is always blocked in recv_notif ready
973    // for the next one.
974    //
975    // Notifications are processed sequentially (not spawned) to avoid
976    // mutex contention between concurrent handlers.
977    let (tx, mut rx) = tokio::sync::mpsc::unbounded_channel::<SeccompNotif>();
978
979    std::thread::spawn(move || {
980        loop {
981            match recv_notif(fd) {
982                Ok(notif) => {
983                    if tx.send(notif).is_err() {
984                        break; // receiver dropped — supervisor shutting down
985                    }
986                }
987                Err(_) => break, // fd closed — child exited
988            }
989        }
990    });
991
992    // Periodic sweep as a defensive backstop in case pidfd-based
993    // lifecycle cleanup misses an entry (e.g. pidfd_open failed for a
994    // child on an old kernel, or its watcher panicked). At 5 minutes
995    // this is cheap enough to leave on; the primary cleanup path is
996    // still per-child pidfd readiness in `spawn_pid_watcher`.
997    let gc = tokio::spawn(process_index_gc(Arc::clone(&ctx.processes)));
998
999    while let Some(notif) = rx.recv().await {
1000        handle_notification(notif, &ctx, &dispatch_table, fd).await;
1001    }
1002
1003    gc.abort();
1004}
1005
1006/// Periodic sweep that drops `ProcessIndex` entries for exited PIDs.
1007/// Per-process state hangs off these entries via `Arc`, so dropping
1008/// them releases everything in one step.
1009async fn process_index_gc(processes: Arc<super::state::ProcessIndex>) {
1010    let interval = std::time::Duration::from_secs(300);
1011    loop {
1012        tokio::time::sleep(interval).await;
1013        if processes.len() == 0 {
1014            continue;
1015        }
1016        processes.prune_dead();
1017    }
1018}
1019
1020/// Spawn a per-child task that awaits the pidfd becoming readable
1021/// (process exit) and then runs unified cleanup across every
1022/// per-process supervisor map.
1023///
1024/// The watcher *owns* the pidfd via `AsyncFd<OwnedFd>` — the kernel
1025/// fd stays alive for as long as tokio's IO driver has it registered,
1026/// and is closed exactly once when the watcher task ends. This avoids
1027/// a TOCTOU where dropping the fd from a separate map could let a
1028/// recycled fd be deregistered from epoll.
1029pub(crate) fn spawn_pid_watcher(
1030    ctx: Arc<super::ctx::SupervisorCtx>,
1031    key: super::state::PidKey,
1032    pidfd: std::os::unix::io::OwnedFd,
1033) {
1034    tokio::spawn(async move {
1035        let async_fd = match tokio::io::unix::AsyncFd::with_interest(
1036            pidfd,
1037            tokio::io::Interest::READABLE,
1038        ) {
1039            Ok(f) => f,
1040            Err(_) => {
1041                // AsyncFd registration failed (extremely unusual);
1042                // fall back to immediate cleanup so we don't leak the
1043                // index entry. The OwnedFd we passed in is consumed
1044                // by `with_interest`'s Err return and will close on
1045                // drop here.
1046                cleanup_pid(&ctx, key).await;
1047                return;
1048            }
1049        };
1050        // pidfd becomes readable when the process exits; we don't
1051        // read any data, so `readable()` is just an await point.
1052        let _ = async_fd.readable().await;
1053        cleanup_pid(&ctx, key).await;
1054        // async_fd drops here, closing the pidfd.
1055    });
1056}
1057
1058/// Drop the supervisor's per-process state for `key`. With every
1059/// per-process map living inside `PerProcessState` (owned by
1060/// `ProcessIndex`), this is a single unregister — the entry's `Arc`
1061/// drops here, and remaining clones held by in-flight handlers will
1062/// drop with their tasks, freeing `PerProcessState` automatically.
1063pub(crate) async fn cleanup_pid(ctx: &super::ctx::SupervisorCtx, key: super::state::PidKey) {
1064    ctx.processes.unregister(key);
1065}
1066
1067// ============================================================
1068// Tests
1069// ============================================================
1070
#[cfg(test)]
mod tests {
    use super::*;

    /// Every `NotifAction` variant exercised here must format via `Debug`.
    #[test]
    fn test_notif_action_debug() {
        let _ = format!("{:?}", NotifAction::Continue);
        let _ = format!("{:?}", NotifAction::Errno(1));
        let _ = format!("{:?}", NotifAction::InjectFd { srcfd: 3, targetfd: 4 });

        // Use a real fd (dup'd from stderr) so OwnedFd can safely close it.
        // SAFETY: `dup` has no memory-safety preconditions.
        let raw = unsafe { libc::dup(2) };
        // dup returns -1 on failure, which must never reach `from_raw_fd`.
        assert!(raw >= 0, "dup(2) failed: {}", std::io::Error::last_os_error());
        // SAFETY: `raw` is a freshly dup'd, open descriptor that nothing
        // else owns, so OwnedFd may take ownership and close it.
        let test_fd = unsafe { OwnedFd::from_raw_fd(raw) };
        let _ = format!("{:?}", NotifAction::InjectFdSend { srcfd: test_fd, newfd_flags: 0 });

        let _ = format!("{:?}", NotifAction::ReturnValue(42));
        let _ = format!("{:?}", NotifAction::Hold);
        let _ = format!("{:?}", NotifAction::Kill { sig: 9, pgid: 1 });
    }

    /// A fresh `NetworkState` is unrestricted and has no bound ports.
    #[test]
    fn test_network_state_new() {
        let ns = super::super::state::NetworkState::new();
        assert!(matches!(ns.network_policy, NetworkPolicy::Unrestricted));
        assert!(ns.port_map.bound_ports.is_empty());
    }

    /// `TimeRandomState::new(None, None)` leaves both fields unset.
    #[test]
    fn test_time_random_state_new() {
        let tr = super::super::state::TimeRandomState::new(None, None);
        assert!(tr.time_offset.is_none());
        assert!(tr.random_state.is_none());
    }

    /// `ResourceState::new` stores the limits and starts with no usage.
    #[test]
    fn test_resource_state_new() {
        let rs = super::super::state::ResourceState::new(1024 * 1024, 10);
        assert_eq!(rs.mem_used, 0);
        assert_eq!(rs.max_memory_bytes, 1024 * 1024);
        assert_eq!(rs.max_processes, 10);
        assert!(!rs.hold_forks);
        assert!(rs.held_notif_ids.is_empty());
    }

    /// Reading our own memory through `read_child_mem_vm` round-trips a u64.
    #[test]
    fn test_process_vm_readv_self() {
        let data: u64 = 0xDEADBEEF_CAFEBABE;
        let addr = &data as *const u64 as u64;
        let pid = std::process::id();
        let bytes = read_child_mem_vm(pid, addr, 8)
            .expect("reading 8 bytes from our own address space should succeed");
        let read_val = u64::from_ne_bytes(
            bytes[..8].try_into().expect("read should return at least 8 bytes"),
        );
        assert_eq!(read_val, 0xDEADBEEF_CAFEBABE);
    }

    /// Writing into our own memory through `write_child_mem_vm` is visible
    /// in the target variable.
    #[test]
    fn test_process_vm_writev_self() {
        let mut data: u64 = 0;
        let addr = &mut data as *mut u64 as u64;
        let pid = std::process::id();
        let payload = 0x1234567890ABCDEFu64.to_ne_bytes();
        let result = write_child_mem_vm(pid, addr, &payload);
        assert!(result.is_ok());
        assert_eq!(data, 0x1234567890ABCDEF);
    }
}
1135}