Skip to main content

sandlock_core/
context.rs

1// Fork + confinement sequence: child-side Landlock + seccomp application
2// and parent-child pipe synchronization.
3
4use std::ffi::CString;
5use std::io;
6use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd};
7
8use crate::arch;
9use crate::sandbox::Sandbox;
10use crate::seccomp::bpf::{self, stmt, jump};
11use crate::sys::structs::{
12    AF_INET, AF_INET6,
13    BPF_ABS, BPF_ALU, BPF_AND, BPF_JEQ, BPF_JSET, BPF_JMP, BPF_K, BPF_LD, BPF_RET, BPF_W,
14    CLONE_NS_FLAGS, DEFAULT_BLOCKLIST_SYSCALLS, EPERM, SYSV_IPC_BLOCKLIST_SYSCALLS,
15    SECCOMP_RET_ALLOW, SECCOMP_RET_ERRNO,
16    SIOCETHTOOL, SIOCGIFADDR, SIOCGIFBRDADDR, SIOCGIFCONF, SIOCGIFDSTADDR,
17    SIOCGIFFLAGS, SIOCGIFHWADDR, SIOCGIFINDEX, SIOCGIFNAME, SIOCGIFNETMASK,
18    SOCK_DGRAM, SOCK_RAW, SOCK_TYPE_MASK, TIOCLINUX, TIOCSTI,
19    PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER,
20    OFFSET_ARGS0_LO, OFFSET_ARGS1_LO, OFFSET_ARGS2_LO, OFFSET_ARGS3_LO, OFFSET_NR,
21    SockFilter,
22};
23
24// ============================================================
25// Pipe pair for parent-child synchronization
26// ============================================================
27
28/// Pipes for parent-child communication after fork().
29pub struct PipePair {
30    /// Parent reads the notif fd number written by the child.
31    pub notif_r: OwnedFd,
32    /// Child writes the notif fd number to the parent.
33    pub notif_w: OwnedFd,
34    /// Child reads the "supervisor ready" signal from the parent.
35    pub ready_r: OwnedFd,
36    /// Parent writes the "supervisor ready" signal to the child.
37    pub ready_w: OwnedFd,
38}
39
40impl PipePair {
41    /// Create two pipe pairs using `pipe2(O_CLOEXEC)`.
42    pub fn new() -> io::Result<Self> {
43        let mut notif_fds = [0i32; 2];
44        let mut ready_fds = [0i32; 2];
45
46        // SAFETY: pipe2 with valid pointers and O_CLOEXEC
47        let ret = unsafe { libc::pipe2(notif_fds.as_mut_ptr(), libc::O_CLOEXEC) };
48        if ret < 0 {
49            return Err(io::Error::last_os_error());
50        }
51
52        let ret = unsafe { libc::pipe2(ready_fds.as_mut_ptr(), libc::O_CLOEXEC) };
53        if ret < 0 {
54            // Close the first pair on failure
55            unsafe {
56                libc::close(notif_fds[0]);
57                libc::close(notif_fds[1]);
58            }
59            return Err(io::Error::last_os_error());
60        }
61
62        // SAFETY: pipe2 returned valid fds
63        Ok(PipePair {
64            notif_r: unsafe { OwnedFd::from_raw_fd(notif_fds[0]) },
65            notif_w: unsafe { OwnedFd::from_raw_fd(notif_fds[1]) },
66            ready_r: unsafe { OwnedFd::from_raw_fd(ready_fds[0]) },
67            ready_w: unsafe { OwnedFd::from_raw_fd(ready_fds[1]) },
68        })
69    }
70}
71
72// ============================================================
73// Pipe I/O helpers
74// ============================================================
75
76/// Write a `u32` as 4 little-endian bytes to a raw fd.
77pub(crate) fn write_u32_fd(fd: RawFd, val: u32) -> io::Result<()> {
78    let buf = val.to_le_bytes();
79    let mut written = 0usize;
80    while written < 4 {
81        let ret = unsafe {
82            libc::write(
83                fd,
84                buf[written..].as_ptr() as *const libc::c_void,
85                4 - written,
86            )
87        };
88        if ret < 0 {
89            return Err(io::Error::last_os_error());
90        }
91        written += ret as usize;
92    }
93    Ok(())
94}
95
96/// Read a `u32` (4 little-endian bytes, blocking) from a raw fd.
97pub(crate) fn read_u32_fd(fd: RawFd) -> io::Result<u32> {
98    let mut buf = [0u8; 4];
99    let mut total = 0usize;
100    while total < 4 {
101        let ret = unsafe {
102            libc::read(
103                fd,
104                buf[total..].as_mut_ptr() as *mut libc::c_void,
105                4 - total,
106            )
107        };
108        if ret < 0 {
109            return Err(io::Error::last_os_error());
110        }
111        if ret == 0 {
112            return Err(io::Error::new(
113                io::ErrorKind::UnexpectedEof,
114                "pipe closed before 4 bytes read",
115            ));
116        }
117        total += ret as usize;
118    }
119    Ok(u32::from_le_bytes(buf))
120}
121
122// ============================================================
123// Syscall name → number mapping
124// ============================================================
125
126/// Map a syscall name to its `libc::SYS_*` number.
127///
128/// Covers all names in `DEFAULT_BLOCKLIST_SYSCALLS` plus extras needed for
129/// notif and arg-filter lists.
130pub fn syscall_name_to_nr(name: &str) -> Option<u32> {
131    let nr: i64 = match name {
132        "mount" => libc::SYS_mount,
133        "umount2" => libc::SYS_umount2,
134        "pivot_root" => libc::SYS_pivot_root,
135        "swapon" => libc::SYS_swapon,
136        "swapoff" => libc::SYS_swapoff,
137        "reboot" => libc::SYS_reboot,
138        "sethostname" => libc::SYS_sethostname,
139        "setdomainname" => libc::SYS_setdomainname,
140        "kexec_load" => libc::SYS_kexec_load,
141        "init_module" => libc::SYS_init_module,
142        "finit_module" => libc::SYS_finit_module,
143        "delete_module" => libc::SYS_delete_module,
144        "unshare" => libc::SYS_unshare,
145        "setns" => libc::SYS_setns,
146        "perf_event_open" => libc::SYS_perf_event_open,
147        "bpf" => libc::SYS_bpf,
148        "userfaultfd" => libc::SYS_userfaultfd,
149        "keyctl" => libc::SYS_keyctl,
150        "add_key" => libc::SYS_add_key,
151        "request_key" => libc::SYS_request_key,
152        "ptrace" => libc::SYS_ptrace,
153        "process_vm_readv" => libc::SYS_process_vm_readv,
154        "process_vm_writev" => libc::SYS_process_vm_writev,
155        "open_by_handle_at" => libc::SYS_open_by_handle_at,
156        "name_to_handle_at" => libc::SYS_name_to_handle_at,
157        "ioperm" => arch::SYS_IOPERM?,
158        "iopl" => arch::SYS_IOPL?,
159        "quotactl" => libc::SYS_quotactl,
160        "acct" => libc::SYS_acct,
161        "lookup_dcookie" => libc::SYS_lookup_dcookie,
162        // nfsservctl was removed in Linux 3.1; no libc constant — skip
163        "personality" => libc::SYS_personality,
164        "io_uring_setup" => libc::SYS_io_uring_setup,
165        "io_uring_enter" => libc::SYS_io_uring_enter,
166        "io_uring_register" => libc::SYS_io_uring_register,
167        // Additional syscalls for notif/arg filters
168        "clone" => libc::SYS_clone,
169        "clone3" => libc::SYS_clone3,
170        "vfork" => arch::SYS_VFORK?,
171        "mmap" => libc::SYS_mmap,
172        "munmap" => libc::SYS_munmap,
173        "brk" => libc::SYS_brk,
174        "mremap" => libc::SYS_mremap,
175        "connect" => libc::SYS_connect,
176        "sendto" => libc::SYS_sendto,
177        "sendmsg" => libc::SYS_sendmsg,
178        "sendmmsg" => libc::SYS_sendmmsg,
179        "ioctl" => libc::SYS_ioctl,
180        "socket" => libc::SYS_socket,
181        "prctl" => libc::SYS_prctl,
182        "getrandom" => libc::SYS_getrandom,
183        "openat" => libc::SYS_openat,
184        "open" => arch::SYS_OPEN?,
185        "getdents64" => libc::SYS_getdents64,
186        "getdents" => arch::SYS_GETDENTS?,
187        "bind" => libc::SYS_bind,
188        "getsockname" => libc::SYS_getsockname,
189        "clock_gettime" => libc::SYS_clock_gettime,
190        "gettimeofday" => libc::SYS_gettimeofday,
191        "time" => arch::SYS_TIME?,
192        "clock_nanosleep" => libc::SYS_clock_nanosleep,
193        "timerfd_settime" => libc::SYS_timerfd_settime,
194        "timer_settime" => libc::SYS_timer_settime,
195        "execve" => libc::SYS_execve,
196        "execveat" => libc::SYS_execveat,
197        // COW filesystem syscalls
198        "unlinkat" => libc::SYS_unlinkat,
199        "mkdirat" => libc::SYS_mkdirat,
200        "renameat2" => libc::SYS_renameat2,
201        "newfstatat" => libc::SYS_newfstatat,
202        "statx" => libc::SYS_statx,
203        "faccessat" => libc::SYS_faccessat,
204        "symlinkat" => libc::SYS_symlinkat,
205        "linkat" => libc::SYS_linkat,
206        "fchmodat" => libc::SYS_fchmodat,
207        "fchownat" => libc::SYS_fchownat,
208        "readlinkat" => libc::SYS_readlinkat,
209        "truncate" => libc::SYS_truncate,
210        "utimensat" => libc::SYS_utimensat,
211        "unlink" => arch::SYS_UNLINK?,
212        "rmdir" => arch::SYS_RMDIR?,
213        "mkdir" => arch::SYS_MKDIR?,
214        "rename" => arch::SYS_RENAME?,
215        "stat" => arch::SYS_STAT?,
216        "lstat" => arch::SYS_LSTAT?,
217        "access" => arch::SYS_ACCESS?,
218        "symlink" => arch::SYS_SYMLINK?,
219        "link" => arch::SYS_LINK?,
220        "chmod" => arch::SYS_CHMOD?,
221        "chown" => arch::SYS_CHOWN?,
222        "lchown" => arch::SYS_LCHOWN?,
223        "readlink" => arch::SYS_READLINK?,
224        "futimesat" => arch::SYS_FUTIMESAT?,
225        "fork" => arch::SYS_FORK?,
226        // SysV IPC (gated by extra_allow_syscalls=["sysv_ipc"]; denied by default)
227        "shmget" => libc::SYS_shmget,
228        "shmat" => libc::SYS_shmat,
229        "shmdt" => libc::SYS_shmdt,
230        "shmctl" => libc::SYS_shmctl,
231        "msgget" => libc::SYS_msgget,
232        "msgsnd" => libc::SYS_msgsnd,
233        "msgrcv" => libc::SYS_msgrcv,
234        "msgctl" => libc::SYS_msgctl,
235        "semget" => libc::SYS_semget,
236        "semop" => libc::SYS_semop,
237        "semctl" => libc::SYS_semctl,
238        "semtimedop" => libc::SYS_semtimedop,
239        _ => return None,
240    };
241    Some(nr as u32)
242}
243
244// ============================================================
245// Sandbox → syscall lists
246// ============================================================
247
248/// Determine which syscalls need `SECCOMP_RET_USER_NOTIF`.
249pub fn notif_syscalls(policy: &Sandbox, sandbox_name: Option<&str>) -> Vec<u32> {
250    let mut nrs = vec![
251        libc::SYS_clone as u32,
252        libc::SYS_clone3 as u32,
253        libc::SYS_wait4 as u32,
254        libc::SYS_waitid as u32,
255    ];
256    arch::push_optional_syscall(&mut nrs, arch::SYS_VFORK);
257    // Bare fork(2) carries none of the namespace/process-limit risk of
258    // clone/clone3 and was historically left out of the BPF filter so
259    // hot fork-loops (COW map-reduce) bypass the supervisor entirely.
260    // It only needs interception when policy_fn is active, so the
261    // supervisor can register the new child via ptrace fork events
262    // before it can run user code (argv-safety invariant).
263    if policy.policy_fn.is_some() {
264        arch::push_optional_syscall(&mut nrs, arch::SYS_FORK);
265    }
266
267    if policy.max_memory.is_some() {
268        nrs.push(libc::SYS_mmap as u32);
269        nrs.push(libc::SYS_munmap as u32);
270        nrs.push(libc::SYS_brk as u32);
271        nrs.push(libc::SYS_mremap as u32);
272        // shmget is in notif only when SysV IPC is allowed. The BPF
273        // layout puts notif JEQs before deny JEQs, so a syscall on
274        // both lists would notify (RET_USER_NOTIF) and silently
275        // bypass the kernel-level deny. When extra_allow_syscalls does not contain "sysv_ipc",
276        // shmget belongs only on the blocklist.
277        if policy.allows_sysv_ipc() {
278            nrs.push(libc::SYS_shmget as u32);
279        }
280    }
281
282    if !policy.net_allow.is_empty()
283        || policy.policy_fn.is_some()
284        || !policy.http_allow.is_empty()
285        || !policy.http_deny.is_empty()
286    {
287        nrs.push(libc::SYS_connect as u32);
288        nrs.push(libc::SYS_sendto as u32);
289        nrs.push(libc::SYS_sendmsg as u32);
290        nrs.push(libc::SYS_sendmmsg as u32);
291        nrs.push(libc::SYS_bind as u32);
292    }
293
294    if policy.random_seed.is_some() {
295        nrs.push(libc::SYS_getrandom as u32);
296        // Also intercept openat so the supervisor can re-patch vDSO after exec.
297        nrs.push(libc::SYS_openat as u32);
298    }
299
300    if policy.time_start.is_some() {
301        nrs.extend_from_slice(&[
302            libc::SYS_clock_nanosleep as u32,
303            libc::SYS_timerfd_settime as u32,
304            libc::SYS_timer_settime as u32,
305        ]);
306        // Also intercept openat so the supervisor gets a notification after exec
307        // and can re-patch the vDSO (exec replaces vDSO with a fresh copy).
308        nrs.push(libc::SYS_openat as u32);
309    }
310
311    // /proc virtualization + /etc/hosts virtualization (always on).
312    //
313    // `openat` carries the simple `(AT_FDCWD, "/proc/...")` and
314    // `(AT_FDCWD, "/etc/hosts")` spellings; `openat2` is the same shape
315    // on newer libcs; legacy `open(path, ...)` is the same path without a
316    // dirfd. The handlers normalize all three into a single absolute path
317    // check, so we have to put every variant on the notif list — otherwise
318    // a caller that picks `open` or `openat2` slips past virtualization
319    // and reads the real on-disk file.
320    nrs.push(libc::SYS_openat as u32);
321    nrs.push(arch::SYS_OPENAT2 as u32);
322    arch::push_optional_syscall(&mut nrs, arch::SYS_OPEN);
323    nrs.push(libc::SYS_getdents64 as u32);
324    arch::push_optional_syscall(&mut nrs, arch::SYS_GETDENTS);
325
326    // Netlink virtualization (always on):
327    //   socket, bind, getsockname — swap in a unix socketpair for AF_NETLINK
328    //   recvfrom, recvmsg         — zero msg_name so glibc accepts the reply
329    //                                (kernel only writes sun_family on unix
330    //                                 recvmsg, leaving nl_pid uninitialized)
331    //   close                     — unregister (pid, fd) so reuse doesn't
332    //                                collide with the cookie set
333    // Send traffic flows through the real socketpair untouched.
334    nrs.push(libc::SYS_socket as u32);
335    nrs.push(libc::SYS_bind as u32);
336    nrs.push(libc::SYS_getsockname as u32);
337    nrs.push(libc::SYS_recvfrom as u32);
338    nrs.push(libc::SYS_recvmsg as u32);
339    nrs.push(libc::SYS_close as u32);
340    // Virtualize sched_getaffinity so nproc/sysconf agree with /proc/cpuinfo
341    if policy.num_cpus.is_some() {
342        nrs.push(libc::SYS_sched_getaffinity as u32);
343    }
344    if sandbox_name.is_some() {
345        nrs.push(libc::SYS_uname as u32);
346        nrs.push(libc::SYS_openat as u32);
347    }
348
349    // COW filesystem interception (seccomp-based, unprivileged)
350    if policy.workdir.is_some() {
351        nrs.extend_from_slice(&[
352            libc::SYS_openat as u32,
353            libc::SYS_execve as u32,
354            libc::SYS_execveat as u32,
355            libc::SYS_unlinkat as u32,
356            libc::SYS_mkdirat as u32,
357            libc::SYS_renameat2 as u32,
358            libc::SYS_symlinkat as u32,
359            libc::SYS_linkat as u32,
360            libc::SYS_fchmodat as u32,
361            libc::SYS_fchownat as u32,
362            libc::SYS_truncate as u32,
363            libc::SYS_utimensat as u32,
364            libc::SYS_newfstatat as u32,
365            libc::SYS_statx as u32,
366            libc::SYS_faccessat as u32,
367            439u32,                       // SYS_faccessat2 — glibc 2.33+ uses this instead of faccessat
368            libc::SYS_readlinkat as u32,
369            libc::SYS_getdents64 as u32,
370            libc::SYS_chdir as u32,
371            libc::SYS_getcwd as u32,
372        ]);
373        for nr in [
374            arch::SYS_OPEN, arch::SYS_UNLINK, arch::SYS_RMDIR, arch::SYS_MKDIR,
375            arch::SYS_RENAME, arch::SYS_SYMLINK, arch::SYS_LINK, arch::SYS_CHMOD,
376            arch::SYS_CHOWN, arch::SYS_LCHOWN, arch::SYS_STAT, arch::SYS_LSTAT,
377            arch::SYS_ACCESS, arch::SYS_READLINK, arch::SYS_GETDENTS,
378        ] {
379            arch::push_optional_syscall(&mut nrs, nr);
380        }
381    }
382
383    // Chroot path interception
384    if policy.chroot.is_some() {
385        nrs.extend_from_slice(&[
386            libc::SYS_openat as u32,
387            libc::SYS_execve as u32,
388            libc::SYS_execveat as u32,
389            libc::SYS_unlinkat as u32,
390            libc::SYS_mkdirat as u32,
391            libc::SYS_renameat2 as u32,
392            libc::SYS_symlinkat as u32,
393            libc::SYS_linkat as u32,
394            libc::SYS_fchmodat as u32,
395            libc::SYS_fchownat as u32,
396            libc::SYS_truncate as u32,
397            libc::SYS_newfstatat as u32,
398            libc::SYS_statx as u32,
399            libc::SYS_faccessat as u32,
400            439u32,                       // SYS_faccessat2 — glibc 2.33+ uses this instead of faccessat
401            libc::SYS_readlinkat as u32,
402            libc::SYS_getdents64 as u32,
403            libc::SYS_chdir as u32,
404            libc::SYS_getcwd as u32,
405            libc::SYS_statfs as u32,
406            libc::SYS_utimensat as u32,
407        ]);
408        for nr in [
409            arch::SYS_OPEN, arch::SYS_STAT, arch::SYS_LSTAT, arch::SYS_ACCESS,
410            arch::SYS_READLINK, arch::SYS_GETDENTS, arch::SYS_UNLINK,
411            arch::SYS_RMDIR, arch::SYS_MKDIR, arch::SYS_RENAME,
412            arch::SYS_SYMLINK, arch::SYS_LINK, arch::SYS_CHMOD,
413            arch::SYS_CHOWN, arch::SYS_LCHOWN,
414        ] {
415            arch::push_optional_syscall(&mut nrs, nr);
416        }
417    }
418
419    // Explicit deny-paths need path-bearing syscalls intercepted.
420    if !policy.fs_denied.is_empty() {
421        nrs.extend_from_slice(&[
422            libc::SYS_openat as u32,
423            libc::SYS_execve as u32,
424            libc::SYS_execveat as u32,
425            libc::SYS_linkat as u32,
426            libc::SYS_renameat2 as u32,
427            libc::SYS_symlinkat as u32,
428        ]);
429        for nr in [arch::SYS_OPEN, arch::SYS_LINK, arch::SYS_RENAME, arch::SYS_SYMLINK] {
430            arch::push_optional_syscall(&mut nrs, nr);
431        }
432    }
433
434    // Dynamic policy callback — intercept key syscalls for event emission.
435    if policy.policy_fn.is_some() {
436        nrs.extend_from_slice(&[
437            libc::SYS_openat as u32,
438            libc::SYS_connect as u32,
439            libc::SYS_sendto as u32,
440            libc::SYS_bind as u32,
441            libc::SYS_execve as u32,
442            libc::SYS_execveat as u32,
443        ]);
444    }
445
446    // Port remapping
447    if policy.port_remap {
448        nrs.extend_from_slice(&[
449            libc::SYS_bind as u32,
450            libc::SYS_getsockname as u32,
451        ]);
452    }
453
454    nrs.sort_unstable();
455    nrs.dedup();
456    nrs
457}
458
459/// Resolve `NO_SUPERVISOR_BLOCKLIST_SYSCALLS` names to numbers, plus
460/// SysV IPC syscalls when `policy.allows_sysv_ipc()` is false.
461pub fn no_supervisor_blocklist_syscall_numbers(policy: &Sandbox) -> Vec<u32> {
462    use crate::sys::structs::NO_SUPERVISOR_BLOCKLIST_SYSCALLS;
463    let mut nrs: Vec<u32> = NO_SUPERVISOR_BLOCKLIST_SYSCALLS
464        .iter()
465        .copied()
466        .chain(policy.extra_deny_syscalls.iter().map(String::as_str))
467        .filter_map(|n| syscall_name_to_nr(n))
468        .collect();
469    if !policy.allows_sysv_ipc() {
470        for name in SYSV_IPC_BLOCKLIST_SYSCALLS {
471            if let Some(nr) = syscall_name_to_nr(name) {
472                if !nrs.contains(&nr) {
473                    nrs.push(nr);
474                }
475            }
476        }
477    }
478    nrs.sort_unstable();
479    nrs.dedup();
480    nrs
481}
482
483/// Resolve the default syscall blocklist plus policy extras to numbers.
484///
485/// SysV IPC syscalls are appended to the resolved blocklist when
486/// `policy.allows_sysv_ipc()` is false.
487pub fn blocklist_syscall_numbers(policy: &Sandbox) -> Vec<u32> {
488    let mut nrs: Vec<u32> = DEFAULT_BLOCKLIST_SYSCALLS
489        .iter()
490        .copied()
491        .chain(policy.extra_deny_syscalls.iter().map(String::as_str))
492        .filter_map(|n| syscall_name_to_nr(n))
493        .collect();
494    if !policy.allows_sysv_ipc() {
495        for name in SYSV_IPC_BLOCKLIST_SYSCALLS {
496            if let Some(nr) = syscall_name_to_nr(name) {
497                if !nrs.contains(&nr) {
498                    nrs.push(nr);
499                }
500            }
501        }
502    }
503    nrs.sort_unstable();
504    nrs.dedup();
505    nrs
506}
507
508/// Build argument-level seccomp filter instructions matching the Python
509/// `_build_arg_filters()` exactly.
510///
511/// Returns a `Vec<SockFilter>` containing self-contained BPF blocks for:
512///   - clone: block namespace creation flags
513///   - ioctl: block TIOCSTI, TIOCLINUX, SIOCGIF*, SIOCETHTOOL
514///   - prctl: block PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER
515///   - socket: block SOCK_RAW/SOCK_DGRAM on AF_INET/AF_INET6 (with type mask)
516pub fn arg_filters(policy: &Sandbox) -> Vec<SockFilter> {
517    let ret_errno = SECCOMP_RET_ERRNO | EPERM as u32;
518    let nr_clone = libc::SYS_clone as u32;
519    let nr_ioctl = libc::SYS_ioctl as u32;
520    let nr_prctl = libc::SYS_prctl as u32;
521    let nr_socket = libc::SYS_socket as u32;
522
523    let mut insns: Vec<SockFilter> = Vec::new();
524
525    // --- clone: block namespace creation flags ---
526    // 5 instructions:
527    //   LD NR
528    //   JEQ clone → +0, skip 3
529    //   LD arg0
530    //   JSET NS_FLAGS → +0, skip 1
531    //   RET ERRNO
532    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
533    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_clone, 0, 3));
534    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
535    insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, CLONE_NS_FLAGS as u32, 0, 1));
536    insns.push(stmt(BPF_RET | BPF_K, ret_errno));
537
538    // --- ioctl: block dangerous commands ---
539    // Block terminal injection (TIOCSTI, TIOCLINUX) and network interface
540    // enumeration ioctls (SIOCGIF*, SIOCETHTOOL) to complement NETLINK_ROUTE
541    // virtualization.
542    // Layout: LD NR, JEQ ioctl (skip 1 + N*2), LD arg1, [JEQ cmd, RET ERRNO] * N
543    let dangerous_ioctls: &[u32] = &[
544        TIOCSTI as u32,
545        TIOCLINUX as u32,
546        SIOCGIFNAME as u32,
547        SIOCGIFCONF as u32,
548        SIOCGIFFLAGS as u32,
549        SIOCGIFADDR as u32,
550        SIOCGIFDSTADDR as u32,
551        SIOCGIFBRDADDR as u32,
552        SIOCGIFNETMASK as u32,
553        SIOCGIFHWADDR as u32,
554        SIOCGIFINDEX as u32,
555        SIOCETHTOOL as u32,
556    ];
557    let n_ioctls = dangerous_ioctls.len();
558    let skip_count = (1 + n_ioctls * 2) as u8;
559    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
560    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_ioctl, 0, skip_count));
561    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
562    for &cmd in dangerous_ioctls {
563        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, cmd, 0, 1));
564        insns.push(stmt(BPF_RET | BPF_K, ret_errno));
565    }
566
567    // --- prctl: block dangerous options ---
568    // Layout: LD NR, JEQ prctl (skip 1 + N*2), LD arg0, [JEQ op, RET ERRNO] * N
569    let dangerous_prctl_ops: &[u32] = &[PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER];
570    let n_ops = dangerous_prctl_ops.len();
571    let skip_count = (1 + n_ops * 2) as u8;
572    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
573    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_prctl, 0, skip_count));
574    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
575    for &op in dangerous_prctl_ops {
576        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, op, 0, 1));
577        insns.push(stmt(BPF_RET | BPF_K, ret_errno));
578    }
579
580    // --- socket: block SOCK_RAW and/or SOCK_DGRAM on AF_INET/AF_INET6 ---
581    //
582    // SOCK_RAW is unconditionally denied. Sandlock does not expose
583    // raw ICMP — packet-crafting capabilities aren't part of the XOA
584    // threat model, and destination filtering at `sendto` can't be
585    // honestly enforced for raw sockets (the agent controls the IP
586    // header). Workloads that need ping should use the kernel ping
587    // socket (SOCK_DGRAM + IPPROTO_ICMP) via an `icmp://...` rule.
588    //
589    // SOCK_DGRAM is denied unless a UDP or ICMP rule exists in
590    // net_allow. The kernel ping socket uses SOCK_DGRAM with
591    // IPPROTO_ICMP, so the same type bit gates both — destination
592    // filtering at sendto (Phase 2) is what separates them per-rule.
593    use crate::sandbox::Protocol;
594    let any_udp_rule = policy.net_allow.iter().any(|r| r.protocol == Protocol::Udp);
595    let any_icmp_rule = policy.net_allow.iter().any(|r| r.protocol == Protocol::Icmp);
596    let mut blocked_types: Vec<u32> = Vec::new();
597    blocked_types.push(SOCK_RAW);
598    if !any_udp_rule && !any_icmp_rule {
599        blocked_types.push(SOCK_DGRAM);
600    }
601
602    if !blocked_types.is_empty() {
603        let n = blocked_types.len();
604        // Instructions after domain checks: 2 (load+AND) + N (JEQs) + 1 (RET)
605        let after_domain = 2 + n + 1;
606        // Total after NR check: 3 (load domain + 2 JEQs) + after_domain
607        let skip_all = (3 + after_domain) as u8;
608
609        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
610        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_socket, 0, skip_all));
611        // Load domain (arg0)
612        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
613        // AF_INET → skip to type check (jump over AF_INET6 check)
614        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET, 1, 0));
615        // AF_INET6 → type check; else skip everything remaining
616        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET6, 0, after_domain as u8));
617        // Load type (arg1) and mask off SOCK_NONBLOCK|SOCK_CLOEXEC
618        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
619        insns.push(stmt(BPF_ALU | BPF_AND | BPF_K, SOCK_TYPE_MASK));
620        // Check each blocked type
621        for (i, &sock_type) in blocked_types.iter().enumerate() {
622            let remaining = n - i - 1;
623            // Match → jump to RET ERRNO (skip 'remaining' JEQs ahead)
624            // No match on last type → skip past RET ERRNO (jf=1)
625            // No match on non-last → check next type (jf=0)
626            let jf: u8 = if remaining == 0 { 1 } else { 0 };
627            insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, sock_type, remaining as u8, jf));
628        }
629        // Deny return (reached by any matching JEQ)
630        insns.push(stmt(BPF_RET | BPF_K, ret_errno));
631    }
632
633    // (raw ICMP carve-out removed — SOCK_RAW is unconditionally denied
634    // by the blocked_types block above. Sandlock does not expose raw
635    // sockets; ping uses the SOCK_DGRAM kernel ping socket via an
636    // `icmp://...` rule, gated by host `ping_group_range`.)
637
638    // --- wait4: skip notification for WNOHANG/WNOWAIT (non-blocking) ---
639    // wait4(pid, status, options, rusage) — options is arg2
640    // 5 instructions:
641    //   LD NR
642    //   JEQ wait4 → +0, skip 3
643    //   LD arg2
644    //   JSET (WNOHANG|WNOWAIT) → +0, skip 1
645    //   RET ALLOW
646    {
647        let nr_wait4 = libc::SYS_wait4 as u32;
648        let wnohang_or_wnowait = (libc::WNOHANG | 0x0100_0000/* WNOWAIT */) as u32;
649        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
650        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_wait4, 0, 3));
651        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS2_LO));
652        insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, wnohang_or_wnowait, 0, 1));
653        insns.push(stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
654    }
655
656    // --- waitid: skip notification for WNOHANG/WNOWAIT (non-blocking) ---
657    // waitid(idtype, id, infop, options, rusage) — options is arg3
658    {
659        let nr_waitid = libc::SYS_waitid as u32;
660        let wnohang_or_wnowait = (libc::WNOHANG | 0x0100_0000/* WNOWAIT */) as u32;
661        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
662        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_waitid, 0, 3));
663        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS3_LO));
664        insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, wnohang_or_wnowait, 0, 1));
665        insns.push(stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
666    }
667
668    insns
669}
670
671// ============================================================
672// Close fds above threshold
673// ============================================================
674
675/// Close all file descriptors above `min_fd`, except those in `keep`.
676fn close_fds_above(min_fd: RawFd, keep: &[RawFd]) {
677    // Read /proc/self/fd to enumerate open fds.
678    // Collect all fd numbers first, then close them after dropping the directory
679    // iterator. This avoids closing the directory fd during iteration.
680    let fds_to_close: Vec<RawFd> = {
681        let dir = match std::fs::read_dir("/proc/self/fd") {
682            Ok(d) => d,
683            Err(_) => return,
684        };
685        dir.flatten()
686            .filter_map(|entry| {
687                entry.file_name().into_string().ok()
688                    .and_then(|name| name.parse::<RawFd>().ok())
689            })
690            .filter(|&fd| fd > min_fd && !keep.contains(&fd))
691            .collect()
692    };
693    // The directory is now closed; safe to close the collected fds.
694    for fd in fds_to_close {
695        unsafe { libc::close(fd) };
696    }
697}
698
699// ============================================================
700// User-namespace uid/gid mapping helpers
701// ============================================================
702
/// Write the single-entry uid/gid maps for an unprivileged user namespace.
///
/// `real_uid`/`real_gid` must be captured *before* unshare(CLONE_NEWUSER),
/// since getuid()/getgid() return the overflow id (65534) after unshare.
/// `target_uid`/`target_gid` are the ids visible inside the namespace.
///
/// Three files under `/proc/self` are written in order: `uid_map`, then
/// `setgroups` ("deny"), then `gid_map`. Every write is best-effort; errors
/// are discarded.
fn write_id_maps(real_uid: u32, real_gid: u32, target_uid: u32, target_gid: u32) {
    // Each map line reads "<id inside namespace> <id outside> <count>".
    let uid_line = format!("{} {} 1\n", target_uid, real_uid);
    let gid_line = format!("{} {} 1\n", target_gid, real_gid);

    let steps: [(&str, &str); 3] = [
        ("/proc/self/uid_map", uid_line.as_str()),
        ("/proc/self/setgroups", "deny\n"),
        ("/proc/self/gid_map", gid_line.as_str()),
    ];
    for (path, contents) in steps {
        let _ = std::fs::write(path, contents);
    }
}
712
713// ============================================================
714// Child-side confinement (never returns)
715// ============================================================
716
/// Arguments threaded from the parent's `do_spawn` into the child-side
/// `confine_child`.  Packed into a struct because `confine_child` historically
/// grew to seven positional parameters and a struct keeps the call site
/// readable when new flags get added (e.g. `extra_syscalls` for user
/// handlers).  Lifetimes tie everything to the parent's stack frame — the
/// child never outlives the fork point because `confine_child` either execs
/// or exits.
///
/// Every field is a borrow or a plain value; the struct owns nothing.
pub(crate) struct ChildSpawnArgs<'a> {
    /// Sandbox configuration the child applies before exec.
    pub sandbox: &'a Sandbox,
    /// Command to exec, as C strings.
    /// NOTE(review): presumably `argv` with the program name first — confirm
    /// at the exec call site.
    pub cmd: &'a [CString],
    /// Parent-child synchronization pipes (see `PipePair`).
    pub pipes: &'a PipePair,
    /// Skip the user-notification supervisor: child installs a kernel-only
    /// deny filter, parent reads `notif_fd_num = 0` and never starts a
    /// supervisor. Mirrors `Sandbox::no_supervisor`.
    pub no_supervisor: bool,
    /// File descriptors the child should leave open.
    /// NOTE(review): presumably forwarded as the `keep` list of
    /// `close_fds_above` — the use is outside this view; confirm.
    pub keep_fds: &'a [RawFd],
    /// Sandbox instance name. When set, it is also exposed as the
    /// sandbox's virtual hostname.
    pub sandbox_name: Option<&'a str>,
    /// Syscall numbers for which the parent registered user `Handler`s.
    /// Merged into the child's BPF notif list so the kernel actually
    /// raises USER_NOTIF for them.
    pub extra_syscalls: &'a [u32],
    /// PID of the parent process captured before fork. Used to detect
    /// parent death in the child without assuming PID 1 is always init
    /// (incorrect in containers where the entrypoint runs as PID 1).
    pub parent_pid: libc::pid_t,
}
745
746/// Apply irreversible confinement (Landlock + seccomp) then exec the command.
747///
748/// This function **never returns**: it calls `execvp` on success or
749/// `_exit(127)` on any error.
750pub(crate) fn confine_child(args: ChildSpawnArgs<'_>) -> ! {
751    let ChildSpawnArgs {
752        sandbox,
753        cmd,
754        pipes,
755        no_supervisor,
756        keep_fds,
757        sandbox_name,
758        extra_syscalls,
759        parent_pid,
760    } = args;
761    // Helper: abort child on error. Includes the OS error automatically.
762    macro_rules! fail {
763        ($msg:expr) => {{
764            let err = std::io::Error::last_os_error();
765            let _ = write!(std::io::stderr(), "sandlock child: {}: {}\n", $msg, err);
766            unsafe { libc::_exit(127) };
767        }};
768    }
769
770    use std::io::Write;
771
772    // 1. New process group
773    if unsafe { libc::setpgid(0, 0) } != 0 {
774        fail!("setpgid");
775    }
776
777    // 1b. If stdin is a terminal, become the foreground process group
778    //     so interactive shells can read from the TTY.
779    //     Must ignore SIGTTOU first — a background pgrp calling tcsetpgrp
780    //     gets stopped by SIGTTOU otherwise.
781    if unsafe { libc::isatty(0) } == 1 {
782        unsafe {
783            libc::signal(libc::SIGTTOU, libc::SIG_IGN);
784            libc::tcsetpgrp(0, libc::getpgrp());
785            libc::signal(libc::SIGTTOU, libc::SIG_DFL);
786        }
787    }
788
789    // 2. Die if parent exits
790    if unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) } != 0 {
791        fail!("prctl(PR_SET_PDEATHSIG)");
792    }
793
794    // 3. Check parent didn't die between fork and prctl.
795    // Compare against the actual parent PID captured before fork rather than
796    // hardcoding 1, since containers often run the entrypoint as PID 1 and a
797    // child forked from it legitimately has getppid() == 1.
798    if unsafe { libc::getppid() } != parent_pid {
799        fail!("parent died before confinement");
800    }
801
802    // 4. Optional: disable ASLR
803    if sandbox.no_randomize_memory {
804        const ADDR_NO_RANDOMIZE: libc::c_ulong = 0x0040000;
805        // Read current personality first (0xffffffff = query), then OR in the flag.
806        let current = unsafe { libc::personality(0xffffffff) };
807        if current == -1 {
808            fail!("personality(query)");
809        }
810        if unsafe { libc::personality(current as libc::c_ulong | ADDR_NO_RANDOMIZE) } == -1 {
811            fail!("personality(ADDR_NO_RANDOMIZE)");
812        }
813    }
814
815    // 4b. Optional: CPU core binding
816    if let Some(ref cores) = sandbox.cpu_cores {
817        if !cores.is_empty() {
818            let mut set = unsafe { std::mem::zeroed::<libc::cpu_set_t>() };
819            unsafe { libc::CPU_ZERO(&mut set) };
820            for &core in cores {
821                unsafe { libc::CPU_SET(core as usize, &mut set) };
822            }
823            if unsafe {
824                libc::sched_setaffinity(
825                    0,
826                    std::mem::size_of::<libc::cpu_set_t>(),
827                    &set,
828                )
829            } != 0
830            {
831                fail!("sched_setaffinity");
832            }
833        }
834    }
835
836    // 5. Optional: disable THP
837    if sandbox.no_huge_pages {
838        if unsafe { libc::prctl(libc::PR_SET_THP_DISABLE, 1, 0, 0, 0) } != 0 {
839            fail!("prctl(PR_SET_THP_DISABLE)");
840        }
841    }
842
843    // 5c. Optional: disable core dumps
844    if sandbox.no_coredump {
845        // Set RLIMIT_CORE to 0 — the kernel will not write a core file.
846        // We intentionally do NOT call prctl(PR_SET_DUMPABLE, 0) because
847        // that would break pidfd_getfd which the supervisor needs.
848        // The seccomp filter already blocks the child from calling
849        // prctl(PR_SET_DUMPABLE, ...) so it can't re-enable it.
850        let rlim = libc::rlimit { rlim_cur: 0, rlim_max: 0 };
851        if unsafe { libc::setrlimit(libc::RLIMIT_CORE, &rlim) } != 0 {
852            fail!("setrlimit(RLIMIT_CORE, 0)");
853        }
854    }
855
856    // Capture real uid/gid before any unshare (after unshare they become 65534)
857    let real_uid = unsafe { libc::getuid() };
858    let real_gid = unsafe { libc::getgid() };
859
860    // 5b. User namespace for --uid mapping.
861    if let Some(target_uid) = sandbox.uid {
862        if unsafe { libc::unshare(libc::CLONE_NEWUSER) } != 0 {
863            fail!("unshare(CLONE_NEWUSER)");
864        }
865        write_id_maps(real_uid, real_gid, target_uid, target_uid);
866    }
867
868    // 6. Optional: change working directory
869    // cwd controls where the child starts; workdir is only for COW
870    let effective_cwd = if let Some(ref cwd) = sandbox.cwd {
871        if let Some(ref chroot_root) = sandbox.chroot {
872            Some(chroot_root.join(cwd.strip_prefix("/").unwrap_or(cwd)))
873        } else {
874            Some(cwd.clone())
875        }
876    } else if let Some(ref chroot_root) = sandbox.chroot {
877        // Default to chroot root
878        Some(chroot_root.to_path_buf())
879    } else if let Some(ref workdir) = sandbox.workdir {
880        // Default to workdir when set (COW working directory)
881        Some(workdir.clone())
882    } else {
883        None
884    };
885
886    if let Some(ref cwd) = effective_cwd {
887        let c_path = match CString::new(cwd.as_os_str().as_encoded_bytes()) {
888            Ok(c) => c,
889            Err(_) => fail!("invalid cwd path"),
890        };
891        if unsafe { libc::chdir(c_path.as_ptr()) } != 0 {
892            fail!("chdir");
893        }
894    }
895
896    // 7. Set NO_NEW_PRIVS (required for both Landlock and seccomp without CAP_SYS_ADMIN)
897    if unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) } != 0 {
898        fail!("prctl(PR_SET_NO_NEW_PRIVS)");
899    }
900
901    // 8. Apply Landlock confinement (IRREVERSIBLE)
902    if let Err(e) = crate::landlock::confine(sandbox) {
903        fail!(format!("landlock: {}", e));
904    }
905
906    // 9. Assemble and install seccomp filter (IRREVERSIBLE)
907    let args = arg_filters(sandbox);
908    let mut keep_fd: i32 = -1;
909
910    if no_supervisor {
911        // No-supervisor mode: deny-only kernel filter, no NEW_LISTENER.
912        // BPF filters are ANDed by the kernel, so an outer filter (from a
913        // wrapping sandbox) keeps tightening this layer too.
914        //
915        // Uses the relaxed `no_supervisor_blocklist_syscall_numbers` deny
916        // list (which leaves `ptrace`, `unshare`, `process_vm_*`, etc.
917        // alone) so an inner full-supervisor sandlock nested under this
918        // one still has the syscalls its supervisor needs.
919        let deny = no_supervisor_blocklist_syscall_numbers(sandbox);
920        let filter = match bpf::assemble_filter(&[], &deny, &args) {
921            Ok(f) => f,
922            Err(e) => fail!(format!("seccomp assemble: {}", e)),
923        };
924        if let Err(e) = bpf::install_deny_filter(&filter) {
925            fail!(format!("seccomp deny filter: {}", e));
926        }
927        // fd=0 tells the parent there's no supervisor to attach to.
928        if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), 0) {
929            fail!(format!("write no-supervisor signal: {}", e));
930        }
931    } else {
932        let deny = blocklist_syscall_numbers(sandbox);
933        // First-level sandbox: notif + deny filter with NEW_LISTENER.
934        //
935        // Caller-supplied handlers must have their syscalls registered in
936        // the BPF filter, otherwise the kernel never raises a notification for
937        // them and the handler silently never fires.  We merge `extra_syscalls`
938        // into the notif list and dedup so each syscall produces exactly one
939        // JEQ in the assembled program.
940        let mut notif = notif_syscalls(sandbox, sandbox_name);
941        if !extra_syscalls.is_empty() {
942            notif.extend_from_slice(extra_syscalls);
943        }
944        // Argv-safety gate (companion to the policy_fn case in
945        // notif_syscalls): a handler bound to execve/execveat
946        // can call `read_child_mem` to inspect argv, so the supervisor
947        // must register newly forked children before they can run user
948        // code — same invariant policy_fn relies on. Bare fork(2)
949        // therefore needs to be intercepted here too.
950        let exec_extra = extra_syscalls.iter().any(|&n| {
951            n == libc::SYS_execve as u32 || n == libc::SYS_execveat as u32
952        });
953        if exec_extra {
954            arch::push_optional_syscall(&mut notif, arch::SYS_FORK);
955        }
956        notif.sort_unstable();
957        notif.dedup();
958        let filter = match bpf::assemble_filter(&notif, &deny, &args) {
959            Ok(f) => f,
960            Err(e) => fail!(format!("seccomp assemble: {}", e)),
961        };
962        let notif_fd = match bpf::install_filter(&filter) {
963            Ok(fd) => fd,
964            Err(e) => {
965                // EBUSY here means another seccomp filter on this task already
966                // owns the SECCOMP_FILTER_FLAG_NEW_LISTENER slot. The kernel
967                // permits at most one listener per task — to nest, opt this
968                // sandbox out of the supervisor via `Sandbox::no_supervisor`
969                // (or the CLI's `--no-supervisor` flag).
970                if e.raw_os_error() == Some(libc::EBUSY) {
971                    let _ = write!(
972                        std::io::stderr(),
973                        "sandlock child: seccomp install: {} (an outer sandbox already owns the \
974                         seccomp listener; pass --no-supervisor or Sandbox::no_supervisor(true) \
975                         on this sandbox to nest)\n",
976                        e,
977                    );
978                    unsafe { libc::_exit(127) };
979                }
980                fail!(format!("seccomp install: {}", e));
981            }
982        };
983        keep_fd = notif_fd.as_raw_fd();
984        if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), keep_fd as u32) {
985            fail!(format!("write notif fd: {}", e));
986        }
987        std::mem::forget(notif_fd);
988    }
989
990    // 10. Wait for parent to signal ready
991    match read_u32_fd(pipes.ready_r.as_raw_fd()) {
992        Ok(_) => {}
993        Err(e) => fail!(format!("read ready signal: {}", e)),
994    }
995
996    // 12. Close all fds above stderr (always on for isolation)
997    let mut fds_to_keep: Vec<RawFd> = keep_fds.to_vec();
998    if keep_fd >= 0 {
999        fds_to_keep.push(keep_fd);
1000    }
1001    close_fds_above(2, &fds_to_keep);
1002
1003    // 13. Apply environment
1004    if sandbox.clean_env {
1005        // Clear all env vars first
1006        for (key, _) in std::env::vars_os() {
1007            std::env::remove_var(&key);
1008        }
1009    }
1010    for (key, value) in &sandbox.env {
1011        std::env::set_var(key, value);
1012    }
1013
1014    // 13b. GPU device visibility
1015    if let Some(ref devices) = sandbox.gpu_devices {
1016        if !devices.is_empty() {
1017            let vis = devices.iter().map(|d| d.to_string()).collect::<Vec<_>>().join(",");
1018            std::env::set_var("CUDA_VISIBLE_DEVICES", &vis);
1019            std::env::set_var("ROCR_VISIBLE_DEVICES", &vis);
1020        }
1021        // Empty list = all GPUs visible, don't set env vars
1022    }
1023
1024    // 14. exec
1025    debug_assert!(!cmd.is_empty(), "cmd must not be empty");
1026    let argv_ptrs: Vec<*const libc::c_char> = cmd
1027        .iter()
1028        .map(|s| s.as_ptr())
1029        .chain(std::iter::once(std::ptr::null()))
1030        .collect();
1031
1032    if sandbox.chroot.is_some() {
1033        // With chroot the seccomp handler rewrites the filename to a host path
1034        // (or /proc/self/fd/N).  Pass a separate PATH_MAX buffer as the `file`
1035        // argument so the rewrite does not corrupt argv[0] — which must stay as
1036        // the original command name (e.g. busybox uses argv[0] for applet
1037        // detection).  execvp still handles PATH lookup for bare command names.
1038        let mut exec_path = vec![0u8; libc::PATH_MAX as usize];
1039        let orig = cmd[0].as_bytes_with_nul();
1040        exec_path[..orig.len()].copy_from_slice(orig);
1041
1042        unsafe {
1043            libc::execvp(
1044                exec_path.as_ptr() as *const libc::c_char,
1045                argv_ptrs.as_ptr(),
1046            )
1047        };
1048    } else {
1049        unsafe { libc::execvp(argv_ptrs[0], argv_ptrs.as_ptr()) };
1050    }
1051
1052    // If we get here, exec failed
1053    fail!(format!("execvp '{}'", cmd[0].to_string_lossy()));
1054}
1055
1056// ============================================================
1057// Tests
1058// ============================================================
1059
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that every listed syscall number (cast to `u32`) is in `$set`.
    macro_rules! assert_has {
        ($set:expr, $($nr:expr),+ $(,)?) => {
            $( assert!($set.contains(&($nr as u32))); )+
        };
    }

    /// Assert that none of the listed syscall numbers (cast to `u32`) is in `$set`.
    macro_rules! assert_lacks {
        ($set:expr, $($nr:expr),+ $(,)?) => {
            $( assert!(!$set.contains(&($nr as u32))); )+
        };
    }

    /// True when `$prog` contains an instruction with opcode `$code` and
    /// immediate `$k`.
    macro_rules! has_insn {
        ($prog:expr, $code:expr, $k:expr) => {
            $prog.iter().any(|insn| insn.code == ($code) && insn.k == $k)
        };
    }

    /// Push `value` through a fresh notif pipe and return what comes out.
    fn pipe_roundtrip(value: u32) -> u32 {
        let pipes = PipePair::new().expect("pipe creation failed");
        write_u32_fd(pipes.notif_w.as_raw_fd(), value).expect("write failed");
        read_u32_fd(pipes.notif_r.as_raw_fd()).expect("read failed")
    }

    #[test]
    fn test_pipe_pair_creation() {
        let pipes = PipePair::new().expect("pipe creation failed");
        let fds = [
            pipes.notif_r.as_raw_fd(),
            pipes.notif_w.as_raw_fd(),
            pipes.ready_r.as_raw_fd(),
            pipes.ready_w.as_raw_fd(),
        ];
        // Every fd must be valid (non-negative) ...
        for fd in fds {
            assert!(fd >= 0);
        }
        // ... and no two of them may be the same descriptor.
        for (idx, first) in fds.iter().enumerate() {
            for second in &fds[idx + 1..] {
                assert_ne!(first, second);
            }
        }
    }

    #[test]
    fn test_write_read_u32() {
        let sent = 42u32;
        let received = pipe_roundtrip(sent);
        assert_eq!(received, sent);
    }

    #[test]
    fn test_write_read_u32_large() {
        let sent = 0xDEAD_BEEFu32;
        let received = pipe_roundtrip(sent);
        assert_eq!(received, sent);
    }

    #[test]
    fn test_notif_syscalls_always_has_clone() {
        let sandbox = Sandbox::builder().build().unwrap();
        let notif = notif_syscalls(&sandbox, None);
        assert_has!(notif, libc::SYS_clone, libc::SYS_clone3);
        if let Some(vfork) = arch::SYS_VFORK {
            assert_has!(notif, vfork);
        }
        // Bare fork(2) is only intercepted while a policy_fn is installed
        // (see notif_syscalls). The default sandbox has none, so fork is
        // absent from the BPF filter and tight fork loops never reach the
        // supervisor.
        if let Some(fork) = arch::SYS_FORK {
            assert_lacks!(notif, fork);
        }
    }

    #[test]
    fn test_notif_syscalls_fork_gated_on_policy_fn() {
        let Some(fork) = arch::SYS_FORK else { return };
        let sandbox = Sandbox::builder()
            .policy_fn(|_event, _ctx| crate::policy_fn::Verdict::Allow)
            .build()
            .unwrap();
        let notif = notif_syscalls(&sandbox, None);
        assert_has!(notif, fork);
    }

    #[test]
    fn test_notif_syscalls_memory() {
        // shmget is only notified when SysV IPC is allowed. Otherwise it
        // sits on the kernel blocklist, and notifying it would bypass the
        // deny because notif JEQs come before deny JEQs in the BPF layout.
        let sandbox = Sandbox::builder()
            .max_memory(crate::sandbox::ByteSize::mib(256))
            .extra_allow_syscalls(vec!["sysv_ipc".into()])
            .build()
            .unwrap();
        let notif = notif_syscalls(&sandbox, None);
        assert_has!(
            notif,
            libc::SYS_mmap,
            libc::SYS_munmap,
            libc::SYS_brk,
            libc::SYS_mremap,
            libc::SYS_shmget,
        );
    }

    #[test]
    fn test_notif_syscalls_memory_excludes_shmget_when_sysv_ipc_denied() {
        // max_memory set, allows_sysv_ipc() left at its default of false:
        // shmget must stay out of notif, or the filter would return
        // RET_USER_NOTIF before the deny JEQ and silently defeat the
        // kernel-level deny.
        let sandbox = Sandbox::builder()
            .max_memory(crate::sandbox::ByteSize::mib(256))
            .build()
            .unwrap();
        let notif = notif_syscalls(&sandbox, None);
        assert_lacks!(notif, libc::SYS_shmget);
        // The remaining memory syscalls are not denied, so they stay notified.
        assert_has!(notif, libc::SYS_mmap, libc::SYS_brk);
    }

    #[test]
    fn test_notif_syscalls_net() {
        let sandbox = Sandbox::builder()
            .net_allow("example.com:443")
            .build()
            .unwrap();
        let notif = notif_syscalls(&sandbox, None);
        assert_has!(
            notif,
            libc::SYS_connect,
            libc::SYS_sendto,
            libc::SYS_sendmsg,
            libc::SYS_sendmmsg,
        );
    }

    #[test]
    fn test_notif_syscalls_sandbox_name_enables_hostname_virtualization() {
        let sandbox = Sandbox::builder().build().unwrap();
        let notif = notif_syscalls(&sandbox, Some("api.local"));
        assert_has!(notif, libc::SYS_uname, libc::SYS_openat);
    }

    /// SYS_faccessat2 (439) has to be notified in both chroot and COW
    /// modes, because glibc 2.33+ calls it in place of faccessat.
    #[test]
    fn test_notif_syscalls_faccessat2() {
        const SYS_FACCESSAT2: u32 = 439;

        // Chroot mode
        let chrooted = Sandbox::builder()
            .chroot("/tmp")
            .build()
            .unwrap();
        let notif = notif_syscalls(&chrooted, None);
        assert_has!(notif, libc::SYS_faccessat);
        assert!(notif.contains(&SYS_FACCESSAT2),
                "chroot notif filter must include SYS_faccessat2 (439)");

        // COW mode
        let cow = Sandbox::builder()
            .workdir("/tmp")
            .build()
            .unwrap();
        let notif = notif_syscalls(&cow, None);
        assert_has!(notif, libc::SYS_faccessat);
        assert!(notif.contains(&SYS_FACCESSAT2),
                "COW notif filter must include SYS_faccessat2 (439)");
    }

    #[test]
    fn test_blocklist_syscall_numbers_default() {
        let sandbox = Sandbox::builder().build().unwrap();
        let denied = blocklist_syscall_numbers(&sandbox);
        // The usual suspects: mount, ptrace, bpf, ...
        assert_has!(denied, libc::SYS_mount, libc::SYS_ptrace, libc::SYS_bpf);
        // SysV IPC is denied by default (sandlock has no IPC namespace).
        assert_has!(
            denied,
            libc::SYS_shmget,
            libc::SYS_shmat,
            libc::SYS_msgget,
            libc::SYS_semget,
        );
        // nfsservctl has no libc constant and is therefore skipped.
        assert!(!denied.is_empty());
    }

    #[test]
    fn test_blocklist_syscall_numbers_custom() {
        let sandbox = Sandbox::builder()
            .extra_deny_syscalls(vec!["mount".into(), "ptrace".into()])
            .build()
            .unwrap();
        let denied = blocklist_syscall_numbers(&sandbox);
        // A user-supplied blocklist still has SysV IPC appended, since
        // allows_sysv_ipc() defaults to false.
        assert_has!(denied, libc::SYS_mount, libc::SYS_ptrace, libc::SYS_shmget);
    }

    #[test]
    fn test_blocklist_syscall_numbers_custom_with_sysv_ipc_allowed() {
        let sandbox = Sandbox::builder()
            .extra_deny_syscalls(vec!["mount".into(), "ptrace".into()])
            .extra_allow_syscalls(vec!["sysv_ipc".into()])
            .build()
            .unwrap();
        let denied = blocklist_syscall_numbers(&sandbox);
        // Default blocklist plus the user's extras, without the SysV IPC append.
        assert_has!(denied, libc::SYS_mount, libc::SYS_ptrace, libc::SYS_bpf);
        assert_lacks!(denied, libc::SYS_shmget);
    }

    #[test]
    fn test_blocklist_syscall_numbers_default_with_sysv_ipc_allowed() {
        let sandbox = Sandbox::builder()
            .extra_allow_syscalls(vec!["sysv_ipc".into()])
            .build()
            .unwrap();
        let denied = blocklist_syscall_numbers(&sandbox);
        // The default blocklist is intact, but SysV IPC is let through.
        assert_has!(denied, libc::SYS_mount);
        assert_lacks!(denied, libc::SYS_shmget, libc::SYS_msgget, libc::SYS_semget);
    }

    #[test]
    fn test_no_supervisor_blocklist_includes_sysv_ipc_by_default() {
        let sandbox = Sandbox::builder().build().unwrap();
        let denied = no_supervisor_blocklist_syscall_numbers(&sandbox);
        assert_has!(denied, libc::SYS_shmget, libc::SYS_msgget, libc::SYS_semget);
    }

    #[test]
    fn test_no_supervisor_blocklist_excludes_sysv_ipc_when_allowed() {
        let sandbox = Sandbox::builder()
            .extra_allow_syscalls(vec!["sysv_ipc".into()])
            .build()
            .unwrap();
        let denied = no_supervisor_blocklist_syscall_numbers(&sandbox);
        assert_lacks!(denied, libc::SYS_shmget, libc::SYS_msgget, libc::SYS_semget);
    }

    #[test]
    fn test_arg_filters_has_clone_ioctl_prctl_socket() {
        use crate::sys::structs::{
            BPF_JEQ, BPF_JSET, BPF_JMP, BPF_K,
        };
        let sandbox = Sandbox::builder().build().unwrap();
        let program = arg_filters(&sandbox);
        let jeq = BPF_JMP | BPF_JEQ | BPF_K;
        let jset = BPF_JMP | BPF_JSET | BPF_K;

        // JEQ on the clone syscall number
        assert!(has_insn!(program, jeq, libc::SYS_clone as u32));
        // JSET on CLONE_NS_FLAGS
        assert!(has_insn!(program, jset, CLONE_NS_FLAGS as u32));
        // JEQ on the ioctl syscall number
        assert!(has_insn!(program, jeq, libc::SYS_ioctl as u32));
        // JEQ on TIOCSTI, TIOCLINUX, and SIOCGIF*/SIOCETHTOOL
        assert!(has_insn!(program, jeq, TIOCSTI as u32));
        assert!(has_insn!(program, jeq, TIOCLINUX as u32));
        assert!(has_insn!(program, jeq, SIOCGIFCONF as u32));
        assert!(has_insn!(program, jeq, SIOCETHTOOL as u32));
        // JEQ on the prctl syscall number
        assert!(has_insn!(program, jeq, libc::SYS_prctl as u32));
        // JEQ on PR_SET_DUMPABLE
        assert!(has_insn!(program, jeq, PR_SET_DUMPABLE));
    }

    #[test]
    fn test_arg_filters_raw_sockets() {
        use crate::sys::structs::{BPF_ALU, BPF_AND, BPF_JEQ, BPF_JMP, BPF_K};
        // With no `icmp-raw://*` rule, raw sockets are blocked by default.
        let sandbox = Sandbox::builder().build().unwrap();
        let program = arg_filters(&sandbox);
        let jeq = BPF_JMP | BPF_JEQ | BPF_K;

        // AF_INET check
        assert!(has_insn!(program, jeq, AF_INET));
        // AF_INET6 check
        assert!(has_insn!(program, jeq, AF_INET6));
        // ALU AND with SOCK_TYPE_MASK
        assert!(has_insn!(program, BPF_ALU | BPF_AND | BPF_K, SOCK_TYPE_MASK));
        // JEQ on SOCK_RAW
        assert!(has_insn!(program, jeq, SOCK_RAW));
    }

    #[test]
    fn test_arg_filters_udp_denied_by_default() {
        use crate::sys::structs::{BPF_JEQ, BPF_JMP, BPF_K};
        // With no `udp://...` rule in net_allow, UDP is denied by default.
        let sandbox = Sandbox::builder().build().unwrap();
        let program = arg_filters(&sandbox);
        // JEQ on SOCK_DGRAM
        assert!(has_insn!(program, BPF_JMP | BPF_JEQ | BPF_K, SOCK_DGRAM));
    }

    #[test]
    fn test_syscall_name_to_nr_covers_defaults() {
        // Each entry of DEFAULT_BLOCKLIST_SYSCALLS must resolve, except for
        // syscalls the current architecture does not provide.
        let expected_unresolved: &[&str] = &[
            "nfsservctl",
            #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
            "ioperm",
            #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
            "iopl",
        ];
        let mut unresolved = 0;
        for name in DEFAULT_BLOCKLIST_SYSCALLS {
            if syscall_name_to_nr(name).is_some() {
                continue;
            }
            assert!(
                expected_unresolved.contains(name),
                "unexpected unresolved syscall: {}",
                name
            );
            unresolved += 1;
        }
        assert_eq!(unresolved, expected_unresolved.len());
    }
}