Skip to main content

sandlock_core/
context.rs

1// Fork + confinement sequence: child-side Landlock + seccomp application
2// and parent-child pipe synchronization.
3
4use std::ffi::CString;
5use std::io;
6use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd};
7
8use crate::arch;
9use crate::sandbox::{FsIsolation, Sandbox};
10use crate::seccomp::bpf::{self, stmt, jump};
11use crate::sys::structs::{
12    AF_INET, AF_INET6,
13    BPF_ABS, BPF_ALU, BPF_AND, BPF_JEQ, BPF_JSET, BPF_JMP, BPF_K, BPF_LD, BPF_RET, BPF_W,
14    CLONE_NS_FLAGS, DEFAULT_BLOCKLIST_SYSCALLS, EPERM, SYSV_IPC_BLOCKLIST_SYSCALLS,
15    SECCOMP_RET_ALLOW, SECCOMP_RET_ERRNO,
16    SIOCETHTOOL, SIOCGIFADDR, SIOCGIFBRDADDR, SIOCGIFCONF, SIOCGIFDSTADDR,
17    SIOCGIFFLAGS, SIOCGIFHWADDR, SIOCGIFINDEX, SIOCGIFNAME, SIOCGIFNETMASK,
18    SOCK_DGRAM, SOCK_RAW, SOCK_TYPE_MASK, TIOCLINUX, TIOCSTI,
19    PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER,
20    OFFSET_ARGS0_LO, OFFSET_ARGS1_LO, OFFSET_ARGS2_LO, OFFSET_ARGS3_LO, OFFSET_NR,
21    SockFilter,
22};
23
24// ============================================================
25// Pipe pair for parent-child synchronization
26// ============================================================
27
28/// Pipes for parent-child communication after fork().
29pub struct PipePair {
30    /// Parent reads the notif fd number written by the child.
31    pub notif_r: OwnedFd,
32    /// Child writes the notif fd number to the parent.
33    pub notif_w: OwnedFd,
34    /// Child reads the "supervisor ready" signal from the parent.
35    pub ready_r: OwnedFd,
36    /// Parent writes the "supervisor ready" signal to the child.
37    pub ready_w: OwnedFd,
38}
39
40impl PipePair {
41    /// Create two pipe pairs using `pipe2(O_CLOEXEC)`.
42    pub fn new() -> io::Result<Self> {
43        let mut notif_fds = [0i32; 2];
44        let mut ready_fds = [0i32; 2];
45
46        // SAFETY: pipe2 with valid pointers and O_CLOEXEC
47        let ret = unsafe { libc::pipe2(notif_fds.as_mut_ptr(), libc::O_CLOEXEC) };
48        if ret < 0 {
49            return Err(io::Error::last_os_error());
50        }
51
52        let ret = unsafe { libc::pipe2(ready_fds.as_mut_ptr(), libc::O_CLOEXEC) };
53        if ret < 0 {
54            // Close the first pair on failure
55            unsafe {
56                libc::close(notif_fds[0]);
57                libc::close(notif_fds[1]);
58            }
59            return Err(io::Error::last_os_error());
60        }
61
62        // SAFETY: pipe2 returned valid fds
63        Ok(PipePair {
64            notif_r: unsafe { OwnedFd::from_raw_fd(notif_fds[0]) },
65            notif_w: unsafe { OwnedFd::from_raw_fd(notif_fds[1]) },
66            ready_r: unsafe { OwnedFd::from_raw_fd(ready_fds[0]) },
67            ready_w: unsafe { OwnedFd::from_raw_fd(ready_fds[1]) },
68        })
69    }
70}
71
72// ============================================================
73// Pipe I/O helpers
74// ============================================================
75
76/// Write a `u32` as 4 little-endian bytes to a raw fd.
77pub(crate) fn write_u32_fd(fd: RawFd, val: u32) -> io::Result<()> {
78    let buf = val.to_le_bytes();
79    let mut written = 0usize;
80    while written < 4 {
81        let ret = unsafe {
82            libc::write(
83                fd,
84                buf[written..].as_ptr() as *const libc::c_void,
85                4 - written,
86            )
87        };
88        if ret < 0 {
89            return Err(io::Error::last_os_error());
90        }
91        written += ret as usize;
92    }
93    Ok(())
94}
95
96/// Read a `u32` (4 little-endian bytes, blocking) from a raw fd.
97pub(crate) fn read_u32_fd(fd: RawFd) -> io::Result<u32> {
98    let mut buf = [0u8; 4];
99    let mut total = 0usize;
100    while total < 4 {
101        let ret = unsafe {
102            libc::read(
103                fd,
104                buf[total..].as_mut_ptr() as *mut libc::c_void,
105                4 - total,
106            )
107        };
108        if ret < 0 {
109            return Err(io::Error::last_os_error());
110        }
111        if ret == 0 {
112            return Err(io::Error::new(
113                io::ErrorKind::UnexpectedEof,
114                "pipe closed before 4 bytes read",
115            ));
116        }
117        total += ret as usize;
118    }
119    Ok(u32::from_le_bytes(buf))
120}
121
122// ============================================================
123// Syscall name → number mapping
124// ============================================================
125
126/// Map a syscall name to its `libc::SYS_*` number.
127///
128/// Covers all names in `DEFAULT_BLOCKLIST_SYSCALLS` plus extras needed for
129/// notif and arg-filter lists.
130pub fn syscall_name_to_nr(name: &str) -> Option<u32> {
131    let nr: i64 = match name {
132        "mount" => libc::SYS_mount,
133        "umount2" => libc::SYS_umount2,
134        "pivot_root" => libc::SYS_pivot_root,
135        "swapon" => libc::SYS_swapon,
136        "swapoff" => libc::SYS_swapoff,
137        "reboot" => libc::SYS_reboot,
138        "sethostname" => libc::SYS_sethostname,
139        "setdomainname" => libc::SYS_setdomainname,
140        "kexec_load" => libc::SYS_kexec_load,
141        "init_module" => libc::SYS_init_module,
142        "finit_module" => libc::SYS_finit_module,
143        "delete_module" => libc::SYS_delete_module,
144        "unshare" => libc::SYS_unshare,
145        "setns" => libc::SYS_setns,
146        "perf_event_open" => libc::SYS_perf_event_open,
147        "bpf" => libc::SYS_bpf,
148        "userfaultfd" => libc::SYS_userfaultfd,
149        "keyctl" => libc::SYS_keyctl,
150        "add_key" => libc::SYS_add_key,
151        "request_key" => libc::SYS_request_key,
152        "ptrace" => libc::SYS_ptrace,
153        "process_vm_readv" => libc::SYS_process_vm_readv,
154        "process_vm_writev" => libc::SYS_process_vm_writev,
155        "open_by_handle_at" => libc::SYS_open_by_handle_at,
156        "name_to_handle_at" => libc::SYS_name_to_handle_at,
157        "ioperm" => arch::SYS_IOPERM?,
158        "iopl" => arch::SYS_IOPL?,
159        "quotactl" => libc::SYS_quotactl,
160        "acct" => libc::SYS_acct,
161        "lookup_dcookie" => libc::SYS_lookup_dcookie,
162        // nfsservctl was removed in Linux 3.1; no libc constant — skip
163        "personality" => libc::SYS_personality,
164        "io_uring_setup" => libc::SYS_io_uring_setup,
165        "io_uring_enter" => libc::SYS_io_uring_enter,
166        "io_uring_register" => libc::SYS_io_uring_register,
167        // Additional syscalls for notif/arg filters
168        "clone" => libc::SYS_clone,
169        "clone3" => libc::SYS_clone3,
170        "vfork" => arch::SYS_VFORK?,
171        "mmap" => libc::SYS_mmap,
172        "munmap" => libc::SYS_munmap,
173        "brk" => libc::SYS_brk,
174        "mremap" => libc::SYS_mremap,
175        "connect" => libc::SYS_connect,
176        "sendto" => libc::SYS_sendto,
177        "sendmsg" => libc::SYS_sendmsg,
178        "sendmmsg" => libc::SYS_sendmmsg,
179        "ioctl" => libc::SYS_ioctl,
180        "socket" => libc::SYS_socket,
181        "prctl" => libc::SYS_prctl,
182        "getrandom" => libc::SYS_getrandom,
183        "openat" => libc::SYS_openat,
184        "open" => arch::SYS_OPEN?,
185        "getdents64" => libc::SYS_getdents64,
186        "getdents" => arch::SYS_GETDENTS?,
187        "bind" => libc::SYS_bind,
188        "getsockname" => libc::SYS_getsockname,
189        "clock_gettime" => libc::SYS_clock_gettime,
190        "gettimeofday" => libc::SYS_gettimeofday,
191        "time" => arch::SYS_TIME?,
192        "clock_nanosleep" => libc::SYS_clock_nanosleep,
193        "timerfd_settime" => libc::SYS_timerfd_settime,
194        "timer_settime" => libc::SYS_timer_settime,
195        "execve" => libc::SYS_execve,
196        "execveat" => libc::SYS_execveat,
197        // COW filesystem syscalls
198        "unlinkat" => libc::SYS_unlinkat,
199        "mkdirat" => libc::SYS_mkdirat,
200        "renameat2" => libc::SYS_renameat2,
201        "newfstatat" => libc::SYS_newfstatat,
202        "statx" => libc::SYS_statx,
203        "faccessat" => libc::SYS_faccessat,
204        "symlinkat" => libc::SYS_symlinkat,
205        "linkat" => libc::SYS_linkat,
206        "fchmodat" => libc::SYS_fchmodat,
207        "fchownat" => libc::SYS_fchownat,
208        "readlinkat" => libc::SYS_readlinkat,
209        "truncate" => libc::SYS_truncate,
210        "utimensat" => libc::SYS_utimensat,
211        "unlink" => arch::SYS_UNLINK?,
212        "rmdir" => arch::SYS_RMDIR?,
213        "mkdir" => arch::SYS_MKDIR?,
214        "rename" => arch::SYS_RENAME?,
215        "stat" => arch::SYS_STAT?,
216        "lstat" => arch::SYS_LSTAT?,
217        "access" => arch::SYS_ACCESS?,
218        "symlink" => arch::SYS_SYMLINK?,
219        "link" => arch::SYS_LINK?,
220        "chmod" => arch::SYS_CHMOD?,
221        "chown" => arch::SYS_CHOWN?,
222        "lchown" => arch::SYS_LCHOWN?,
223        "readlink" => arch::SYS_READLINK?,
224        "futimesat" => arch::SYS_FUTIMESAT?,
225        "fork" => arch::SYS_FORK?,
226        // SysV IPC (gated by extra_allow_syscalls=["sysv_ipc"]; denied by default)
227        "shmget" => libc::SYS_shmget,
228        "shmat" => libc::SYS_shmat,
229        "shmdt" => libc::SYS_shmdt,
230        "shmctl" => libc::SYS_shmctl,
231        "msgget" => libc::SYS_msgget,
232        "msgsnd" => libc::SYS_msgsnd,
233        "msgrcv" => libc::SYS_msgrcv,
234        "msgctl" => libc::SYS_msgctl,
235        "semget" => libc::SYS_semget,
236        "semop" => libc::SYS_semop,
237        "semctl" => libc::SYS_semctl,
238        "semtimedop" => libc::SYS_semtimedop,
239        _ => return None,
240    };
241    Some(nr as u32)
242}
243
244// ============================================================
245// Sandbox → syscall lists
246// ============================================================
247
248/// Determine which syscalls need `SECCOMP_RET_USER_NOTIF`.
249pub fn notif_syscalls(policy: &Sandbox, sandbox_name: Option<&str>) -> Vec<u32> {
250    let mut nrs = vec![
251        libc::SYS_clone as u32,
252        libc::SYS_clone3 as u32,
253        libc::SYS_wait4 as u32,
254        libc::SYS_waitid as u32,
255    ];
256    arch::push_optional_syscall(&mut nrs, arch::SYS_VFORK);
257    // Bare fork(2) carries none of the namespace/process-limit risk of
258    // clone/clone3 and was historically left out of the BPF filter so
259    // hot fork-loops (COW map-reduce) bypass the supervisor entirely.
260    // It only needs interception when policy_fn is active, so the
261    // supervisor can register the new child via ptrace fork events
262    // before it can run user code (argv-safety invariant).
263    if policy.policy_fn.is_some() {
264        arch::push_optional_syscall(&mut nrs, arch::SYS_FORK);
265    }
266
267    if policy.max_memory.is_some() {
268        nrs.push(libc::SYS_mmap as u32);
269        nrs.push(libc::SYS_munmap as u32);
270        nrs.push(libc::SYS_brk as u32);
271        nrs.push(libc::SYS_mremap as u32);
272        // shmget is in notif only when SysV IPC is allowed. The BPF
273        // layout puts notif JEQs before deny JEQs, so a syscall on
274        // both lists would notify (RET_USER_NOTIF) and silently
275        // bypass the kernel-level deny. When extra_allow_syscalls does not contain "sysv_ipc",
276        // shmget belongs only on the blocklist.
277        if policy.allows_sysv_ipc() {
278            nrs.push(libc::SYS_shmget as u32);
279        }
280    }
281
282    if !policy.net_allow.is_empty()
283        || policy.policy_fn.is_some()
284        || !policy.http_allow.is_empty()
285        || !policy.http_deny.is_empty()
286    {
287        nrs.push(libc::SYS_connect as u32);
288        nrs.push(libc::SYS_sendto as u32);
289        nrs.push(libc::SYS_sendmsg as u32);
290        nrs.push(libc::SYS_sendmmsg as u32);
291        nrs.push(libc::SYS_bind as u32);
292    }
293
294    if policy.random_seed.is_some() {
295        nrs.push(libc::SYS_getrandom as u32);
296        // Also intercept openat so the supervisor can re-patch vDSO after exec.
297        nrs.push(libc::SYS_openat as u32);
298    }
299
300    if policy.time_start.is_some() {
301        nrs.extend_from_slice(&[
302            libc::SYS_clock_nanosleep as u32,
303            libc::SYS_timerfd_settime as u32,
304            libc::SYS_timer_settime as u32,
305        ]);
306        // Also intercept openat so the supervisor gets a notification after exec
307        // and can re-patch the vDSO (exec replaces vDSO with a fresh copy).
308        nrs.push(libc::SYS_openat as u32);
309    }
310
311    // /proc virtualization (always on: PID filtering, sensitive path blocking)
312    nrs.push(libc::SYS_openat as u32);
313    nrs.push(libc::SYS_getdents64 as u32);
314    arch::push_optional_syscall(&mut nrs, arch::SYS_GETDENTS);
315
316    // Netlink virtualization (always on):
317    //   socket, bind, getsockname — swap in a unix socketpair for AF_NETLINK
318    //   recvfrom, recvmsg         — zero msg_name so glibc accepts the reply
319    //                                (kernel only writes sun_family on unix
320    //                                 recvmsg, leaving nl_pid uninitialized)
321    //   close                     — unregister (pid, fd) so reuse doesn't
322    //                                collide with the cookie set
323    // Send traffic flows through the real socketpair untouched.
324    nrs.push(libc::SYS_socket as u32);
325    nrs.push(libc::SYS_bind as u32);
326    nrs.push(libc::SYS_getsockname as u32);
327    nrs.push(libc::SYS_recvfrom as u32);
328    nrs.push(libc::SYS_recvmsg as u32);
329    nrs.push(libc::SYS_close as u32);
330    // Virtualize sched_getaffinity so nproc/sysconf agree with /proc/cpuinfo
331    if policy.num_cpus.is_some() {
332        nrs.push(libc::SYS_sched_getaffinity as u32);
333    }
334    if sandbox_name.is_some() {
335        nrs.push(libc::SYS_uname as u32);
336        nrs.push(libc::SYS_openat as u32);
337    }
338
339    // COW filesystem interception (seccomp-based, unprivileged)
340    if policy.workdir.is_some() && policy.fs_isolation == FsIsolation::None {
341        nrs.extend_from_slice(&[
342            libc::SYS_openat as u32,
343            libc::SYS_execve as u32,
344            libc::SYS_execveat as u32,
345            libc::SYS_unlinkat as u32,
346            libc::SYS_mkdirat as u32,
347            libc::SYS_renameat2 as u32,
348            libc::SYS_symlinkat as u32,
349            libc::SYS_linkat as u32,
350            libc::SYS_fchmodat as u32,
351            libc::SYS_fchownat as u32,
352            libc::SYS_truncate as u32,
353            libc::SYS_utimensat as u32,
354            libc::SYS_newfstatat as u32,
355            libc::SYS_statx as u32,
356            libc::SYS_faccessat as u32,
357            439u32,                       // SYS_faccessat2 — glibc 2.33+ uses this instead of faccessat
358            libc::SYS_readlinkat as u32,
359            libc::SYS_getdents64 as u32,
360            libc::SYS_chdir as u32,
361            libc::SYS_getcwd as u32,
362        ]);
363        for nr in [
364            arch::SYS_OPEN, arch::SYS_UNLINK, arch::SYS_RMDIR, arch::SYS_MKDIR,
365            arch::SYS_RENAME, arch::SYS_SYMLINK, arch::SYS_LINK, arch::SYS_CHMOD,
366            arch::SYS_CHOWN, arch::SYS_LCHOWN, arch::SYS_STAT, arch::SYS_LSTAT,
367            arch::SYS_ACCESS, arch::SYS_READLINK, arch::SYS_GETDENTS,
368        ] {
369            arch::push_optional_syscall(&mut nrs, nr);
370        }
371    }
372
373    // Chroot path interception
374    if policy.chroot.is_some() {
375        nrs.extend_from_slice(&[
376            libc::SYS_openat as u32,
377            libc::SYS_execve as u32,
378            libc::SYS_execveat as u32,
379            libc::SYS_unlinkat as u32,
380            libc::SYS_mkdirat as u32,
381            libc::SYS_renameat2 as u32,
382            libc::SYS_symlinkat as u32,
383            libc::SYS_linkat as u32,
384            libc::SYS_fchmodat as u32,
385            libc::SYS_fchownat as u32,
386            libc::SYS_truncate as u32,
387            libc::SYS_newfstatat as u32,
388            libc::SYS_statx as u32,
389            libc::SYS_faccessat as u32,
390            439u32,                       // SYS_faccessat2 — glibc 2.33+ uses this instead of faccessat
391            libc::SYS_readlinkat as u32,
392            libc::SYS_getdents64 as u32,
393            libc::SYS_chdir as u32,
394            libc::SYS_getcwd as u32,
395            libc::SYS_statfs as u32,
396            libc::SYS_utimensat as u32,
397        ]);
398        for nr in [
399            arch::SYS_OPEN, arch::SYS_STAT, arch::SYS_LSTAT, arch::SYS_ACCESS,
400            arch::SYS_READLINK, arch::SYS_GETDENTS, arch::SYS_UNLINK,
401            arch::SYS_RMDIR, arch::SYS_MKDIR, arch::SYS_RENAME,
402            arch::SYS_SYMLINK, arch::SYS_LINK, arch::SYS_CHMOD,
403            arch::SYS_CHOWN, arch::SYS_LCHOWN,
404        ] {
405            arch::push_optional_syscall(&mut nrs, nr);
406        }
407    }
408
409    // Explicit deny-paths need path-bearing syscalls intercepted.
410    if !policy.fs_denied.is_empty() {
411        nrs.extend_from_slice(&[
412            libc::SYS_openat as u32,
413            libc::SYS_execve as u32,
414            libc::SYS_execveat as u32,
415            libc::SYS_linkat as u32,
416            libc::SYS_renameat2 as u32,
417            libc::SYS_symlinkat as u32,
418        ]);
419        for nr in [arch::SYS_OPEN, arch::SYS_LINK, arch::SYS_RENAME, arch::SYS_SYMLINK] {
420            arch::push_optional_syscall(&mut nrs, nr);
421        }
422    }
423
424    // Dynamic policy callback — intercept key syscalls for event emission.
425    if policy.policy_fn.is_some() {
426        nrs.extend_from_slice(&[
427            libc::SYS_openat as u32,
428            libc::SYS_connect as u32,
429            libc::SYS_sendto as u32,
430            libc::SYS_bind as u32,
431            libc::SYS_execve as u32,
432            libc::SYS_execveat as u32,
433        ]);
434    }
435
436    // Port remapping
437    if policy.port_remap {
438        nrs.extend_from_slice(&[
439            libc::SYS_bind as u32,
440            libc::SYS_getsockname as u32,
441        ]);
442    }
443
444    nrs.sort_unstable();
445    nrs.dedup();
446    nrs
447}
448
449/// Resolve `NO_SUPERVISOR_BLOCKLIST_SYSCALLS` names to numbers, plus
450/// SysV IPC syscalls when `policy.allows_sysv_ipc()` is false.
451pub fn no_supervisor_blocklist_syscall_numbers(policy: &Sandbox) -> Vec<u32> {
452    use crate::sys::structs::NO_SUPERVISOR_BLOCKLIST_SYSCALLS;
453    let mut nrs: Vec<u32> = NO_SUPERVISOR_BLOCKLIST_SYSCALLS
454        .iter()
455        .copied()
456        .chain(policy.extra_deny_syscalls.iter().map(String::as_str))
457        .filter_map(|n| syscall_name_to_nr(n))
458        .collect();
459    if !policy.allows_sysv_ipc() {
460        for name in SYSV_IPC_BLOCKLIST_SYSCALLS {
461            if let Some(nr) = syscall_name_to_nr(name) {
462                if !nrs.contains(&nr) {
463                    nrs.push(nr);
464                }
465            }
466        }
467    }
468    nrs.sort_unstable();
469    nrs.dedup();
470    nrs
471}
472
473/// Resolve the default syscall blocklist plus policy extras to numbers.
474///
475/// SysV IPC syscalls are appended to the resolved blocklist when
476/// `policy.allows_sysv_ipc()` is false.
477pub fn blocklist_syscall_numbers(policy: &Sandbox) -> Vec<u32> {
478    let mut nrs: Vec<u32> = DEFAULT_BLOCKLIST_SYSCALLS
479        .iter()
480        .copied()
481        .chain(policy.extra_deny_syscalls.iter().map(String::as_str))
482        .filter_map(|n| syscall_name_to_nr(n))
483        .collect();
484    if !policy.allows_sysv_ipc() {
485        for name in SYSV_IPC_BLOCKLIST_SYSCALLS {
486            if let Some(nr) = syscall_name_to_nr(name) {
487                if !nrs.contains(&nr) {
488                    nrs.push(nr);
489                }
490            }
491        }
492    }
493    nrs.sort_unstable();
494    nrs.dedup();
495    nrs
496}
497
498/// Build argument-level seccomp filter instructions matching the Python
499/// `_build_arg_filters()` exactly.
500///
501/// Returns a `Vec<SockFilter>` containing self-contained BPF blocks for:
502///   - clone: block namespace creation flags
503///   - ioctl: block TIOCSTI, TIOCLINUX, SIOCGIF*, SIOCETHTOOL
504///   - prctl: block PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER
505///   - socket: block SOCK_RAW/SOCK_DGRAM on AF_INET/AF_INET6 (with type mask)
506pub fn arg_filters(policy: &Sandbox) -> Vec<SockFilter> {
507    let ret_errno = SECCOMP_RET_ERRNO | EPERM as u32;
508    let nr_clone = libc::SYS_clone as u32;
509    let nr_ioctl = libc::SYS_ioctl as u32;
510    let nr_prctl = libc::SYS_prctl as u32;
511    let nr_socket = libc::SYS_socket as u32;
512
513    let mut insns: Vec<SockFilter> = Vec::new();
514
515    // --- clone: block namespace creation flags ---
516    // 5 instructions:
517    //   LD NR
518    //   JEQ clone → +0, skip 3
519    //   LD arg0
520    //   JSET NS_FLAGS → +0, skip 1
521    //   RET ERRNO
522    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
523    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_clone, 0, 3));
524    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
525    insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, CLONE_NS_FLAGS as u32, 0, 1));
526    insns.push(stmt(BPF_RET | BPF_K, ret_errno));
527
528    // --- ioctl: block dangerous commands ---
529    // Block terminal injection (TIOCSTI, TIOCLINUX) and network interface
530    // enumeration ioctls (SIOCGIF*, SIOCETHTOOL) to complement NETLINK_ROUTE
531    // virtualization.
532    // Layout: LD NR, JEQ ioctl (skip 1 + N*2), LD arg1, [JEQ cmd, RET ERRNO] * N
533    let dangerous_ioctls: &[u32] = &[
534        TIOCSTI as u32,
535        TIOCLINUX as u32,
536        SIOCGIFNAME as u32,
537        SIOCGIFCONF as u32,
538        SIOCGIFFLAGS as u32,
539        SIOCGIFADDR as u32,
540        SIOCGIFDSTADDR as u32,
541        SIOCGIFBRDADDR as u32,
542        SIOCGIFNETMASK as u32,
543        SIOCGIFHWADDR as u32,
544        SIOCGIFINDEX as u32,
545        SIOCETHTOOL as u32,
546    ];
547    let n_ioctls = dangerous_ioctls.len();
548    let skip_count = (1 + n_ioctls * 2) as u8;
549    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
550    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_ioctl, 0, skip_count));
551    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
552    for &cmd in dangerous_ioctls {
553        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, cmd, 0, 1));
554        insns.push(stmt(BPF_RET | BPF_K, ret_errno));
555    }
556
557    // --- prctl: block dangerous options ---
558    // Layout: LD NR, JEQ prctl (skip 1 + N*2), LD arg0, [JEQ op, RET ERRNO] * N
559    let dangerous_prctl_ops: &[u32] = &[PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER];
560    let n_ops = dangerous_prctl_ops.len();
561    let skip_count = (1 + n_ops * 2) as u8;
562    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
563    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_prctl, 0, skip_count));
564    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
565    for &op in dangerous_prctl_ops {
566        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, op, 0, 1));
567        insns.push(stmt(BPF_RET | BPF_K, ret_errno));
568    }
569
570    // --- socket: block SOCK_RAW and/or SOCK_DGRAM on AF_INET/AF_INET6 ---
571    //
572    // SOCK_RAW is unconditionally denied. Sandlock does not expose
573    // raw ICMP — packet-crafting capabilities aren't part of the XOA
574    // threat model, and destination filtering at `sendto` can't be
575    // honestly enforced for raw sockets (the agent controls the IP
576    // header). Workloads that need ping should use the kernel ping
577    // socket (SOCK_DGRAM + IPPROTO_ICMP) via an `icmp://...` rule.
578    //
579    // SOCK_DGRAM is denied unless a UDP or ICMP rule exists in
580    // net_allow. The kernel ping socket uses SOCK_DGRAM with
581    // IPPROTO_ICMP, so the same type bit gates both — destination
582    // filtering at sendto (Phase 2) is what separates them per-rule.
583    use crate::sandbox::Protocol;
584    let any_udp_rule = policy.net_allow.iter().any(|r| r.protocol == Protocol::Udp);
585    let any_icmp_rule = policy.net_allow.iter().any(|r| r.protocol == Protocol::Icmp);
586    let mut blocked_types: Vec<u32> = Vec::new();
587    blocked_types.push(SOCK_RAW);
588    if !any_udp_rule && !any_icmp_rule {
589        blocked_types.push(SOCK_DGRAM);
590    }
591
592    if !blocked_types.is_empty() {
593        let n = blocked_types.len();
594        // Instructions after domain checks: 2 (load+AND) + N (JEQs) + 1 (RET)
595        let after_domain = 2 + n + 1;
596        // Total after NR check: 3 (load domain + 2 JEQs) + after_domain
597        let skip_all = (3 + after_domain) as u8;
598
599        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
600        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_socket, 0, skip_all));
601        // Load domain (arg0)
602        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
603        // AF_INET → skip to type check (jump over AF_INET6 check)
604        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET, 1, 0));
605        // AF_INET6 → type check; else skip everything remaining
606        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET6, 0, after_domain as u8));
607        // Load type (arg1) and mask off SOCK_NONBLOCK|SOCK_CLOEXEC
608        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
609        insns.push(stmt(BPF_ALU | BPF_AND | BPF_K, SOCK_TYPE_MASK));
610        // Check each blocked type
611        for (i, &sock_type) in blocked_types.iter().enumerate() {
612            let remaining = n - i - 1;
613            // Match → jump to RET ERRNO (skip 'remaining' JEQs ahead)
614            // No match on last type → skip past RET ERRNO (jf=1)
615            // No match on non-last → check next type (jf=0)
616            let jf: u8 = if remaining == 0 { 1 } else { 0 };
617            insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, sock_type, remaining as u8, jf));
618        }
619        // Deny return (reached by any matching JEQ)
620        insns.push(stmt(BPF_RET | BPF_K, ret_errno));
621    }
622
623    // (raw ICMP carve-out removed — SOCK_RAW is unconditionally denied
624    // by the blocked_types block above. Sandlock does not expose raw
625    // sockets; ping uses the SOCK_DGRAM kernel ping socket via an
626    // `icmp://...` rule, gated by host `ping_group_range`.)
627
628    // --- wait4: skip notification for WNOHANG/WNOWAIT (non-blocking) ---
629    // wait4(pid, status, options, rusage) — options is arg2
630    // 5 instructions:
631    //   LD NR
632    //   JEQ wait4 → +0, skip 3
633    //   LD arg2
634    //   JSET (WNOHANG|WNOWAIT) → +0, skip 1
635    //   RET ALLOW
636    {
637        let nr_wait4 = libc::SYS_wait4 as u32;
638        let wnohang_or_wnowait = (libc::WNOHANG | 0x0100_0000/* WNOWAIT */) as u32;
639        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
640        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_wait4, 0, 3));
641        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS2_LO));
642        insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, wnohang_or_wnowait, 0, 1));
643        insns.push(stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
644    }
645
646    // --- waitid: skip notification for WNOHANG/WNOWAIT (non-blocking) ---
647    // waitid(idtype, id, infop, options, rusage) — options is arg3
648    {
649        let nr_waitid = libc::SYS_waitid as u32;
650        let wnohang_or_wnowait = (libc::WNOHANG | 0x0100_0000/* WNOWAIT */) as u32;
651        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
652        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_waitid, 0, 3));
653        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS3_LO));
654        insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, wnohang_or_wnowait, 0, 1));
655        insns.push(stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
656    }
657
658    insns
659}
660
661// ============================================================
662// Close fds above threshold
663// ============================================================
664
665/// Close all file descriptors above `min_fd`, except those in `keep`.
666fn close_fds_above(min_fd: RawFd, keep: &[RawFd]) {
667    // Read /proc/self/fd to enumerate open fds.
668    // Collect all fd numbers first, then close them after dropping the directory
669    // iterator. This avoids closing the directory fd during iteration.
670    let fds_to_close: Vec<RawFd> = {
671        let dir = match std::fs::read_dir("/proc/self/fd") {
672            Ok(d) => d,
673            Err(_) => return,
674        };
675        dir.flatten()
676            .filter_map(|entry| {
677                entry.file_name().into_string().ok()
678                    .and_then(|name| name.parse::<RawFd>().ok())
679            })
680            .filter(|&fd| fd > min_fd && !keep.contains(&fd))
681            .collect()
682    };
683    // The directory is now closed; safe to close the collected fds.
684    for fd in fds_to_close {
685        unsafe { libc::close(fd) };
686    }
687}
688
689// ============================================================
690// COW filesystem config passed from parent to child
691// ============================================================
692
693// Re-export ChildMountConfig so callers can use the old import path.
694pub(crate) use crate::cow::ChildMountConfig;
695
/// Write uid/gid maps for an unprivileged user namespace.
///
/// `real_uid`/`real_gid` must be captured *before* unshare(CLONE_NEWUSER),
/// since getuid()/getgid() return the overflow id (65534) after unshare.
/// `target_uid`/`target_gid` are the UIDs visible inside the namespace.
///
/// Each map is a single-entry mapping of length 1. The three files are
/// written in a fixed order (uid_map, setgroups, gid_map) and every write is
/// best-effort: failures are ignored.
fn write_id_maps(real_uid: u32, real_gid: u32, target_uid: u32, target_gid: u32) {
    let uid_line = format!("{} {} 1\n", target_uid, real_uid);
    let gid_line = format!("{} {} 1\n", target_gid, real_gid);

    // Order matters here: setgroups is set to "deny" before gid_map is
    // written.
    let steps: [(&str, &str); 3] = [
        ("/proc/self/uid_map", uid_line.as_str()),
        ("/proc/self/setgroups", "deny\n"),
        ("/proc/self/gid_map", gid_line.as_str()),
    ];
    for (path, contents) in steps {
        let _ = std::fs::write(path, contents);
    }
}
705
706/// Write uid/gid maps using the post-unshare overflow uid (65534).
707/// Used by the OverlayFS COW path which maps to root (UID 0) inside.
708fn write_id_maps_overflow() {
709    let uid = unsafe { libc::getuid() };
710    let gid = unsafe { libc::getgid() };
711    write_id_maps(uid, gid, 0, 0);
712}
713
714// ============================================================
715// Child-side confinement (never returns)
716// ============================================================
717
/// Arguments threaded from the parent's `do_spawn` into the child-side
/// `confine_child`.  Packed into a struct because `confine_child` historically
/// grew to seven positional parameters and a struct keeps the call site
/// readable when new flags get added (e.g. `extra_syscalls` for user
/// handlers).  Lifetimes tie everything to the parent's stack frame — the
/// child never outlives the fork point because `confine_child` either execs
/// or exits.
pub(crate) struct ChildSpawnArgs<'a> {
    /// Sandbox configuration consulted throughout confinement: tuning
    /// flags, uid mapping, cwd/chroot, environment, and the input to the
    /// Landlock and seccomp setup.
    pub sandbox: &'a Sandbox,
    /// Command to execute. `cmd[0]` is the program handed to `execvp`;
    /// the whole slice becomes the new process's argv.
    pub cmd: &'a [CString],
    /// Synchronization pipes shared with the parent. The child writes the
    /// notif fd number (or 0 when `nested`) to `notif_w` and then blocks on
    /// `ready_r` until the parent signals readiness.
    pub pipes: &'a PipePair,
    /// OverlayFS copy-on-write configuration. When `Some`, the child
    /// unshares user + mount namespaces and mounts the overlay before
    /// changing directory.
    pub cow_config: Option<&'a ChildMountConfig>,
    /// `true` when spawning from inside an already confined process: the
    /// child installs a deny-only seccomp filter and no notif listener.
    pub nested: bool,
    /// File descriptors above stderr that must survive the close pass
    /// performed right before exec.
    pub keep_fds: &'a [RawFd],
    /// Sandbox instance name. When set, it is also exposed as the
    /// sandbox's virtual hostname.
    pub sandbox_name: Option<&'a str>,
    /// Syscall numbers for which the parent registered user `Handler`s.
    /// Merged into the child's BPF notif list so the kernel actually
    /// raises USER_NOTIF for them.
    pub extra_syscalls: &'a [u32],
    /// PID of the parent process captured before fork. Used to detect
    /// parent death in the child without assuming PID 1 is always init
    /// (incorrect in containers where the entrypoint runs as PID 1).
    pub parent_pid: libc::pid_t,
}
744
745/// Apply irreversible confinement (Landlock + seccomp) then exec the command.
746///
747/// This function **never returns**: it calls `execvp` on success or
748/// `_exit(127)` on any error.
749pub(crate) fn confine_child(args: ChildSpawnArgs<'_>) -> ! {
750    let ChildSpawnArgs {
751        sandbox,
752        cmd,
753        pipes,
754        cow_config,
755        nested,
756        keep_fds,
757        sandbox_name,
758        extra_syscalls,
759        parent_pid,
760    } = args;
761    // Helper: abort child on error. Includes the OS error automatically.
762    macro_rules! fail {
763        ($msg:expr) => {{
764            let err = std::io::Error::last_os_error();
765            let _ = write!(std::io::stderr(), "sandlock child: {}: {}\n", $msg, err);
766            unsafe { libc::_exit(127) };
767        }};
768    }
769
770    use std::io::Write;
771
772    // 1. New process group
773    if unsafe { libc::setpgid(0, 0) } != 0 {
774        fail!("setpgid");
775    }
776
777    // 1b. If stdin is a terminal, become the foreground process group
778    //     so interactive shells can read from the TTY.
779    //     Must ignore SIGTTOU first — a background pgrp calling tcsetpgrp
780    //     gets stopped by SIGTTOU otherwise.
781    if unsafe { libc::isatty(0) } == 1 {
782        unsafe {
783            libc::signal(libc::SIGTTOU, libc::SIG_IGN);
784            libc::tcsetpgrp(0, libc::getpgrp());
785            libc::signal(libc::SIGTTOU, libc::SIG_DFL);
786        }
787    }
788
789    // 2. Die if parent exits
790    if unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) } != 0 {
791        fail!("prctl(PR_SET_PDEATHSIG)");
792    }
793
794    // 3. Check parent didn't die between fork and prctl.
795    // Compare against the actual parent PID captured before fork rather than
796    // hardcoding 1, since containers often run the entrypoint as PID 1 and a
797    // child forked from it legitimately has getppid() == 1.
798    if unsafe { libc::getppid() } != parent_pid {
799        fail!("parent died before confinement");
800    }
801
802    // 4. Optional: disable ASLR
803    if sandbox.no_randomize_memory {
804        const ADDR_NO_RANDOMIZE: libc::c_ulong = 0x0040000;
805        // Read current personality first (0xffffffff = query), then OR in the flag.
806        let current = unsafe { libc::personality(0xffffffff) };
807        if current == -1 {
808            fail!("personality(query)");
809        }
810        if unsafe { libc::personality(current as libc::c_ulong | ADDR_NO_RANDOMIZE) } == -1 {
811            fail!("personality(ADDR_NO_RANDOMIZE)");
812        }
813    }
814
815    // 4b. Optional: CPU core binding
816    if let Some(ref cores) = sandbox.cpu_cores {
817        if !cores.is_empty() {
818            let mut set = unsafe { std::mem::zeroed::<libc::cpu_set_t>() };
819            unsafe { libc::CPU_ZERO(&mut set) };
820            for &core in cores {
821                unsafe { libc::CPU_SET(core as usize, &mut set) };
822            }
823            if unsafe {
824                libc::sched_setaffinity(
825                    0,
826                    std::mem::size_of::<libc::cpu_set_t>(),
827                    &set,
828                )
829            } != 0
830            {
831                fail!("sched_setaffinity");
832            }
833        }
834    }
835
836    // 5. Optional: disable THP
837    if sandbox.no_huge_pages {
838        if unsafe { libc::prctl(libc::PR_SET_THP_DISABLE, 1, 0, 0, 0) } != 0 {
839            fail!("prctl(PR_SET_THP_DISABLE)");
840        }
841    }
842
843    // 5c. Optional: disable core dumps
844    if sandbox.no_coredump {
845        // Set RLIMIT_CORE to 0 — the kernel will not write a core file.
846        // We intentionally do NOT call prctl(PR_SET_DUMPABLE, 0) because
847        // that would break pidfd_getfd which the supervisor needs.
848        // The seccomp filter already blocks the child from calling
849        // prctl(PR_SET_DUMPABLE, ...) so it can't re-enable it.
850        let rlim = libc::rlimit { rlim_cur: 0, rlim_max: 0 };
851        if unsafe { libc::setrlimit(libc::RLIMIT_CORE, &rlim) } != 0 {
852            fail!("setrlimit(RLIMIT_CORE, 0)");
853        }
854    }
855
856    // Capture real uid/gid before any unshare (after unshare they become 65534)
857    let real_uid = unsafe { libc::getuid() };
858    let real_gid = unsafe { libc::getgid() };
859
860    // 5b. User namespace for --uid mapping (when not using OverlayFS COW,
861    //     which sets up its own user namespace)
862    if let Some(target_uid) = sandbox.uid {
863        if cow_config.is_none() {
864            if unsafe { libc::unshare(libc::CLONE_NEWUSER) } != 0 {
865                fail!("unshare(CLONE_NEWUSER)");
866            }
867            write_id_maps(real_uid, real_gid, target_uid, target_uid);
868        }
869    }
870
871    // 5c. User + mount namespace for OverlayFS COW (includes CLONE_NEWUSER)
872    if let Some(ref cow) = cow_config {
873        // unshare user + mount namespaces (unprivileged)
874        if unsafe { libc::unshare(libc::CLONE_NEWUSER | libc::CLONE_NEWNS) } != 0 {
875            fail!("unshare(CLONE_NEWUSER | CLONE_NEWNS)");
876        }
877
878        // Write uid/gid maps using overflow uid (preserves existing COW behavior)
879        write_id_maps_overflow();
880
881        // Mount the overlay filesystem ON TOP of the workdir so the child
882        // sees the merged view at the original path.  The kernel resolves
883        // lowerdir before the covering mount takes effect, so using the
884        // same path as both lowerdir and mount-point is safe inside our
885        // private mount namespace.
886        let lowerdir = cow.lowers.iter()
887            .map(|p| p.display().to_string())
888            .collect::<Vec<_>>()
889            .join(":");
890        let opts = format!(
891            "lowerdir={},upperdir={},workdir={}",
892            lowerdir,
893            cow.upper.display(),
894            cow.work.display(),
895        );
896
897        let mount_cstr = match CString::new(cow.mount_point.to_str().unwrap_or("")) {
898            Ok(c) => c,
899            Err(_) => fail!("invalid overlay mount point path"),
900        };
901        let overlay_cstr = CString::new("overlay").unwrap();
902        let opts_cstr = match CString::new(opts) {
903            Ok(c) => c,
904            Err(_) => fail!("invalid overlay opts"),
905        };
906
907        let ret = unsafe {
908            libc::mount(
909                overlay_cstr.as_ptr(),
910                mount_cstr.as_ptr(),
911                overlay_cstr.as_ptr(),
912                0,
913                opts_cstr.as_ptr() as *const libc::c_void,
914            )
915        };
916        if ret != 0 {
917            fail!("mount overlay");
918        }
919    }
920
921    // 6. Optional: change working directory
922    // cwd controls where the child starts; workdir is only for COW
923    let effective_cwd = if let Some(ref cwd) = sandbox.cwd {
924        if let Some(ref chroot_root) = sandbox.chroot {
925            Some(chroot_root.join(cwd.strip_prefix("/").unwrap_or(cwd)))
926        } else {
927            Some(cwd.clone())
928        }
929    } else if let Some(ref chroot_root) = sandbox.chroot {
930        // Default to chroot root
931        Some(chroot_root.to_path_buf())
932    } else if let Some(ref workdir) = sandbox.workdir {
933        // Default to workdir when set (COW working directory)
934        Some(workdir.clone())
935    } else {
936        None
937    };
938
939    if let Some(ref cwd) = effective_cwd {
940        let c_path = match CString::new(cwd.as_os_str().as_encoded_bytes()) {
941            Ok(c) => c,
942            Err(_) => fail!("invalid cwd path"),
943        };
944        if unsafe { libc::chdir(c_path.as_ptr()) } != 0 {
945            fail!("chdir");
946        }
947    }
948
949    // 7. Set NO_NEW_PRIVS (required for both Landlock and seccomp without CAP_SYS_ADMIN)
950    if unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) } != 0 {
951        fail!("prctl(PR_SET_NO_NEW_PRIVS)");
952    }
953
954    // 8. Apply Landlock confinement (IRREVERSIBLE)
955    if let Err(e) = crate::landlock::confine(sandbox) {
956        fail!(format!("landlock: {}", e));
957    }
958
959    // 9. Assemble and install seccomp filter (IRREVERSIBLE)
960    let deny = blocklist_syscall_numbers(sandbox);
961    let args = arg_filters(sandbox);
962    let mut keep_fd: i32 = -1;
963
964    if nested {
965        // Nested sandbox: deny-only filter (no supervisor — parent handles it).
966        // BPF filters are ANDed by the kernel, so each level can only tighten.
967        let filter = match bpf::assemble_filter(&[], &deny, &args) {
968            Ok(f) => f,
969            Err(e) => fail!(format!("seccomp assemble: {}", e)),
970        };
971        if let Err(e) = bpf::install_deny_filter(&filter) {
972            fail!(format!("seccomp deny filter: {}", e));
973        }
974        // Signal nested mode to parent (fd=0 means no supervisor needed)
975        if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), 0) {
976            fail!(format!("write nested signal: {}", e));
977        }
978    } else {
979        // First-level sandbox: notif + deny filter with NEW_LISTENER.
980        //
981        // Caller-supplied handlers must have their syscalls registered in
982        // the BPF filter, otherwise the kernel never raises a notification for
983        // them and the handler silently never fires.  We merge `extra_syscalls`
984        // into the notif list and dedup so each syscall produces exactly one
985        // JEQ in the assembled program.
986        let mut notif = notif_syscalls(sandbox, sandbox_name);
987        if !extra_syscalls.is_empty() {
988            notif.extend_from_slice(extra_syscalls);
989        }
990        // Argv-safety gate (companion to the policy_fn case in
991        // notif_syscalls): a handler bound to execve/execveat
992        // can call `read_child_mem` to inspect argv, so the supervisor
993        // must register newly forked children before they can run user
994        // code — same invariant policy_fn relies on. Bare fork(2)
995        // therefore needs to be intercepted here too.
996        let exec_extra = extra_syscalls.iter().any(|&n| {
997            n == libc::SYS_execve as u32 || n == libc::SYS_execveat as u32
998        });
999        if exec_extra {
1000            arch::push_optional_syscall(&mut notif, arch::SYS_FORK);
1001        }
1002        notif.sort_unstable();
1003        notif.dedup();
1004        let filter = match bpf::assemble_filter(&notif, &deny, &args) {
1005            Ok(f) => f,
1006            Err(e) => fail!(format!("seccomp assemble: {}", e)),
1007        };
1008        let notif_fd = match bpf::install_filter(&filter) {
1009            Ok(fd) => fd,
1010            Err(e) => fail!(format!("seccomp install: {}", e)),
1011        };
1012        keep_fd = notif_fd.as_raw_fd();
1013        if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), keep_fd as u32) {
1014            fail!(format!("write notif fd: {}", e));
1015        }
1016        std::mem::forget(notif_fd);
1017    }
1018
1019    // Mark this process as confined for in-process nesting detection
1020    crate::process::CONFINED.store(true, std::sync::atomic::Ordering::Relaxed);
1021
1022    // 10. Wait for parent to signal ready
1023    match read_u32_fd(pipes.ready_r.as_raw_fd()) {
1024        Ok(_) => {}
1025        Err(e) => fail!(format!("read ready signal: {}", e)),
1026    }
1027
1028    // 12. Close all fds above stderr (always on for isolation)
1029    let mut fds_to_keep: Vec<RawFd> = keep_fds.to_vec();
1030    if keep_fd >= 0 {
1031        fds_to_keep.push(keep_fd);
1032    }
1033    close_fds_above(2, &fds_to_keep);
1034
1035    // 13. Apply environment
1036    if sandbox.clean_env {
1037        // Clear all env vars first
1038        for (key, _) in std::env::vars_os() {
1039            std::env::remove_var(&key);
1040        }
1041    }
1042    for (key, value) in &sandbox.env {
1043        std::env::set_var(key, value);
1044    }
1045
1046    // 13b. GPU device visibility
1047    if let Some(ref devices) = sandbox.gpu_devices {
1048        if !devices.is_empty() {
1049            let vis = devices.iter().map(|d| d.to_string()).collect::<Vec<_>>().join(",");
1050            std::env::set_var("CUDA_VISIBLE_DEVICES", &vis);
1051            std::env::set_var("ROCR_VISIBLE_DEVICES", &vis);
1052        }
1053        // Empty list = all GPUs visible, don't set env vars
1054    }
1055
1056    // 14. exec
1057    debug_assert!(!cmd.is_empty(), "cmd must not be empty");
1058    let argv_ptrs: Vec<*const libc::c_char> = cmd
1059        .iter()
1060        .map(|s| s.as_ptr())
1061        .chain(std::iter::once(std::ptr::null()))
1062        .collect();
1063
1064    if sandbox.chroot.is_some() {
1065        // With chroot the seccomp handler rewrites the filename to a host path
1066        // (or /proc/self/fd/N).  Pass a separate PATH_MAX buffer as the `file`
1067        // argument so the rewrite does not corrupt argv[0] — which must stay as
1068        // the original command name (e.g. busybox uses argv[0] for applet
1069        // detection).  execvp still handles PATH lookup for bare command names.
1070        let mut exec_path = vec![0u8; libc::PATH_MAX as usize];
1071        let orig = cmd[0].as_bytes_with_nul();
1072        exec_path[..orig.len()].copy_from_slice(orig);
1073
1074        unsafe {
1075            libc::execvp(
1076                exec_path.as_ptr() as *const libc::c_char,
1077                argv_ptrs.as_ptr(),
1078            )
1079        };
1080    } else {
1081        unsafe { libc::execvp(argv_ptrs[0], argv_ptrs.as_ptr()) };
1082    }
1083
1084    // If we get here, exec failed
1085    fail!(format!("execvp '{}'", cmd[0].to_string_lossy()));
1086}
1087
1088// ============================================================
1089// Tests
1090// ============================================================
1091
#[cfg(test)]
mod tests {
    use super::*;

    /// Expands to `true` when `$filters` contains an instruction whose
    /// opcode equals `$code` and whose operand `k` equals `$k`.
    macro_rules! has_insn {
        ($filters:expr, $code:expr, $k:expr) => {
            $filters.iter().any(|f| f.code == $code && f.k == $k)
        };
    }

    /// Send `val` through a fresh pipe with `write_u32_fd` and return what
    /// `read_u32_fd` reads back from the other end.
    fn roundtrip_u32(val: u32) -> u32 {
        let pipes = PipePair::new().expect("pipe creation failed");
        write_u32_fd(pipes.notif_w.as_raw_fd(), val).expect("write failed");
        read_u32_fd(pipes.notif_r.as_raw_fd()).expect("read failed")
    }

    #[test]
    fn test_pipe_pair_creation() {
        let pipes = PipePair::new().expect("pipe creation failed");
        let fds = [
            pipes.notif_r.as_raw_fd(),
            pipes.notif_w.as_raw_fd(),
            pipes.ready_r.as_raw_fd(),
            pipes.ready_w.as_raw_fd(),
        ];
        // Every end must be a valid (non-negative) descriptor ...
        assert!(fds.iter().all(|&fd| fd >= 0));
        // ... and all four descriptors must be distinct.
        let unique: std::collections::HashSet<RawFd> = fds.iter().copied().collect();
        assert_eq!(unique.len(), fds.len());
    }

    #[test]
    fn test_write_read_u32() {
        assert_eq!(roundtrip_u32(42u32), 42u32);
    }

    #[test]
    fn test_write_read_u32_large() {
        assert_eq!(roundtrip_u32(0xDEAD_BEEFu32), 0xDEAD_BEEFu32);
    }

    #[test]
    fn test_notif_syscalls_always_has_clone() {
        let sandbox = Sandbox::builder().build().unwrap();
        let nrs = notif_syscalls(&sandbox, None);
        for nr in [libc::SYS_clone, libc::SYS_clone3] {
            assert!(nrs.contains(&(nr as u32)), "syscall {} not notified", nr);
        }
        if let Some(vfork) = arch::SYS_VFORK {
            assert!(nrs.contains(&(vfork as u32)));
        }
        // Bare fork(2) is intercepted only when policy_fn is active —
        // see notif_syscalls. The default policy has no policy_fn, so
        // fork stays out of the BPF filter and hot fork-loops keep
        // bypassing the supervisor.
        if let Some(fork) = arch::SYS_FORK {
            assert!(!nrs.contains(&(fork as u32)));
        }
    }

    #[test]
    fn test_notif_syscalls_fork_gated_on_policy_fn() {
        let fork = match arch::SYS_FORK {
            Some(nr) => nr,
            // Architecture has no fork(2) syscall: nothing to check.
            None => return,
        };
        let sandbox = Sandbox::builder()
            .policy_fn(|_event, _ctx| crate::policy_fn::Verdict::Allow)
            .build()
            .unwrap();
        assert!(notif_syscalls(&sandbox, None).contains(&(fork as u32)));
    }

    #[test]
    fn test_notif_syscalls_memory() {
        // shmget only appears in notif when SysV IPC is allowed —
        // otherwise it is on the kernel blocklist and notifying would
        // bypass the deny (notif JEQs precede deny JEQs in the BPF
        // layout).
        let sandbox = Sandbox::builder()
            .max_memory(crate::sandbox::ByteSize::mib(256))
            .extra_allow_syscalls(vec!["sysv_ipc".into()])
            .build()
            .unwrap();
        let nrs = notif_syscalls(&sandbox, None);
        let expected = [
            libc::SYS_mmap,
            libc::SYS_munmap,
            libc::SYS_brk,
            libc::SYS_mremap,
            libc::SYS_shmget,
        ];
        for nr in expected {
            assert!(nrs.contains(&(nr as u32)), "syscall {} not notified", nr);
        }
    }

    #[test]
    fn test_notif_syscalls_memory_excludes_shmget_when_sysv_ipc_denied() {
        // With max_memory but allows_sysv_ipc()=false (the default),
        // shmget must NOT be in notif: if it were, the BPF filter
        // would route it to RET_USER_NOTIF before reaching the deny
        // JEQ, silently bypassing the kernel-level deny.
        let sandbox = Sandbox::builder()
            .max_memory(crate::sandbox::ByteSize::mib(256))
            .build()
            .unwrap();
        let nrs = notif_syscalls(&sandbox, None);
        assert!(!nrs.contains(&(libc::SYS_shmget as u32)));
        // Other memory syscalls remain notified — they are not denied.
        for nr in [libc::SYS_mmap, libc::SYS_brk] {
            assert!(nrs.contains(&(nr as u32)), "syscall {} not notified", nr);
        }
    }

    #[test]
    fn test_notif_syscalls_net() {
        let sandbox = Sandbox::builder()
            .net_allow("example.com:443")
            .build()
            .unwrap();
        let nrs = notif_syscalls(&sandbox, None);
        let expected = [
            libc::SYS_connect,
            libc::SYS_sendto,
            libc::SYS_sendmsg,
            libc::SYS_sendmmsg,
        ];
        for nr in expected {
            assert!(nrs.contains(&(nr as u32)), "syscall {} not notified", nr);
        }
    }

    #[test]
    fn test_notif_syscalls_sandbox_name_enables_hostname_virtualization() {
        let sandbox = Sandbox::builder().build().unwrap();
        let nrs = notif_syscalls(&sandbox, Some("api.local"));
        for nr in [libc::SYS_uname, libc::SYS_openat] {
            assert!(nrs.contains(&(nr as u32)), "syscall {} not notified", nr);
        }
    }

    /// SYS_faccessat2 (439) must be in the notification filter for both
    /// chroot and COW modes — glibc 2.33+ uses it instead of faccessat.
    #[test]
    fn test_notif_syscalls_faccessat2() {
        const SYS_FACCESSAT2: u32 = 439;

        let chroot_mode = Sandbox::builder()
            .chroot("/tmp")
            .build()
            .unwrap();
        let cow_mode = Sandbox::builder()
            .workdir("/tmp")
            .build()
            .unwrap();

        let cases = [
            (&chroot_mode, "chroot notif filter must include SYS_faccessat2 (439)"),
            (&cow_mode, "COW notif filter must include SYS_faccessat2 (439)"),
        ];
        for (sandbox, why) in cases {
            let nrs = notif_syscalls(sandbox, None);
            assert!(nrs.contains(&(libc::SYS_faccessat as u32)));
            assert!(nrs.contains(&SYS_FACCESSAT2), "{}", why);
        }
    }

    #[test]
    fn test_blocklist_syscall_numbers_default() {
        let sandbox = Sandbox::builder().build().unwrap();
        let nrs = blocklist_syscall_numbers(&sandbox);
        let expected = [
            // Should contain mount, ptrace, etc.
            libc::SYS_mount,
            libc::SYS_ptrace,
            libc::SYS_bpf,
            // SysV IPC denied by default (no IPC namespace in sandlock)
            libc::SYS_shmget,
            libc::SYS_shmat,
            libc::SYS_msgget,
            libc::SYS_semget,
        ];
        for nr in expected {
            assert!(nrs.contains(&(nr as u32)), "syscall {} not blocked", nr);
        }
        // nfsservctl has no libc constant, so it is skipped
        assert!(!nrs.is_empty());
    }

    #[test]
    fn test_blocklist_syscall_numbers_custom() {
        let sandbox = Sandbox::builder()
            .extra_deny_syscalls(vec!["mount".into(), "ptrace".into()])
            .build()
            .unwrap();
        let nrs = blocklist_syscall_numbers(&sandbox);
        // User-supplied blocklist still gets SysV IPC appended
        // (allows_sysv_ipc() defaults to false).
        for nr in [libc::SYS_mount, libc::SYS_ptrace, libc::SYS_shmget] {
            assert!(nrs.contains(&(nr as u32)), "syscall {} not blocked", nr);
        }
    }

    #[test]
    fn test_blocklist_syscall_numbers_custom_with_sysv_ipc_allowed() {
        let sandbox = Sandbox::builder()
            .extra_deny_syscalls(vec!["mount".into(), "ptrace".into()])
            .extra_allow_syscalls(vec!["sysv_ipc".into()])
            .build()
            .unwrap();
        let nrs = blocklist_syscall_numbers(&sandbox);
        // Default blocklist plus user extras — no SysV IPC append.
        for nr in [libc::SYS_mount, libc::SYS_ptrace, libc::SYS_bpf] {
            assert!(nrs.contains(&(nr as u32)), "syscall {} not blocked", nr);
        }
        assert!(!nrs.contains(&(libc::SYS_shmget as u32)));
    }

    #[test]
    fn test_blocklist_syscall_numbers_default_with_sysv_ipc_allowed() {
        let sandbox = Sandbox::builder()
            .extra_allow_syscalls(vec!["sysv_ipc".into()])
            .build()
            .unwrap();
        let nrs = blocklist_syscall_numbers(&sandbox);
        // Default blocklist still present, but SysV IPC is permitted.
        assert!(nrs.contains(&(libc::SYS_mount as u32)));
        for nr in [libc::SYS_shmget, libc::SYS_msgget, libc::SYS_semget] {
            assert!(!nrs.contains(&(nr as u32)), "syscall {} unexpectedly blocked", nr);
        }
    }

    #[test]
    fn test_no_supervisor_blocklist_includes_sysv_ipc_by_default() {
        let sandbox = Sandbox::builder().build().unwrap();
        let nrs = no_supervisor_blocklist_syscall_numbers(&sandbox);
        for nr in [libc::SYS_shmget, libc::SYS_msgget, libc::SYS_semget] {
            assert!(nrs.contains(&(nr as u32)), "syscall {} not blocked", nr);
        }
    }

    #[test]
    fn test_no_supervisor_blocklist_excludes_sysv_ipc_when_allowed() {
        let sandbox = Sandbox::builder()
            .extra_allow_syscalls(vec!["sysv_ipc".into()])
            .build()
            .unwrap();
        let nrs = no_supervisor_blocklist_syscall_numbers(&sandbox);
        for nr in [libc::SYS_shmget, libc::SYS_msgget, libc::SYS_semget] {
            assert!(!nrs.contains(&(nr as u32)), "syscall {} unexpectedly blocked", nr);
        }
    }

    #[test]
    fn test_arg_filters_has_clone_ioctl_prctl_socket() {
        use crate::sys::structs::{
            BPF_JEQ, BPF_JSET, BPF_JMP, BPF_K,
        };
        let sandbox = Sandbox::builder().build().unwrap();
        let filters = arg_filters(&sandbox);

        // JEQ operands that must appear somewhere in the program.
        let jeq_operands = [
            // clone syscall nr
            libc::SYS_clone as u32,
            // ioctl syscall nr
            libc::SYS_ioctl as u32,
            // TIOCSTI, TIOCLINUX, and SIOCGIF*/SIOCETHTOOL
            TIOCSTI as u32,
            TIOCLINUX as u32,
            SIOCGIFCONF as u32,
            SIOCETHTOOL as u32,
            // prctl syscall nr
            libc::SYS_prctl as u32,
            // PR_SET_DUMPABLE
            PR_SET_DUMPABLE,
        ];
        for k in jeq_operands {
            assert!(
                has_insn!(filters, (BPF_JMP | BPF_JEQ | BPF_K), k),
                "no JEQ instruction for operand {}",
                k
            );
        }

        // Should contain JSET for CLONE_NS_FLAGS
        assert!(has_insn!(filters, (BPF_JMP | BPF_JSET | BPF_K), CLONE_NS_FLAGS as u32));
    }

    #[test]
    fn test_arg_filters_raw_sockets() {
        use crate::sys::structs::{BPF_ALU, BPF_AND, BPF_JEQ, BPF_JMP, BPF_K};
        // Raw sockets are blocked by default — no `icmp-raw://*` rule.
        let sandbox = Sandbox::builder().build().unwrap();
        let filters = arg_filters(&sandbox);
        // Should have AF_INET check
        assert!(has_insn!(filters, (BPF_JMP | BPF_JEQ | BPF_K), AF_INET));
        // Should have AF_INET6 check
        assert!(has_insn!(filters, (BPF_JMP | BPF_JEQ | BPF_K), AF_INET6));
        // Should have ALU AND SOCK_TYPE_MASK
        assert!(has_insn!(filters, (BPF_ALU | BPF_AND | BPF_K), SOCK_TYPE_MASK));
        // Should have JEQ SOCK_RAW
        assert!(has_insn!(filters, (BPF_JMP | BPF_JEQ | BPF_K), SOCK_RAW));
    }

    #[test]
    fn test_arg_filters_udp_denied_by_default() {
        use crate::sys::structs::{BPF_JEQ, BPF_JMP, BPF_K};
        // UDP is denied by default — no `udp://...` rule in net_allow.
        let sandbox = Sandbox::builder().build().unwrap();
        let filters = arg_filters(&sandbox);
        // Should have JEQ SOCK_DGRAM
        assert!(has_insn!(filters, (BPF_JMP | BPF_JEQ | BPF_K), SOCK_DGRAM));
    }

    #[test]
    fn test_syscall_name_to_nr_covers_defaults() {
        // Every name in DEFAULT_BLOCKLIST_SYSCALLS should resolve unless the
        // running architecture does not expose that syscall.
        let expected_unresolved: &[&str] = &[
            "nfsservctl",
            #[cfg(target_arch = "aarch64")]
            "ioperm",
            #[cfg(target_arch = "aarch64")]
            "iopl",
        ];
        let mut skipped = 0;
        for name in DEFAULT_BLOCKLIST_SYSCALLS {
            if syscall_name_to_nr(name).is_some() {
                continue;
            }
            // Unresolved names are only tolerated when explicitly listed.
            assert!(
                expected_unresolved.contains(name),
                "unexpected unresolved syscall: {}",
                name
            );
            skipped += 1;
        }
        // Every expected-unresolved name must actually have been hit.
        assert_eq!(skipped, expected_unresolved.len());
    }
}