Skip to main content

sandlock_core/
context.rs

1// Fork + confinement sequence: child-side Landlock + seccomp application
2// and parent-child pipe synchronization.
3
4use std::ffi::CString;
5use std::io;
6use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd};
7
8use crate::arch;
9use crate::sandbox::{FsIsolation, Sandbox};
10use crate::seccomp::bpf::{self, stmt, jump};
11use crate::sys::structs::{
12    AF_INET, AF_INET6,
13    BPF_ABS, BPF_ALU, BPF_AND, BPF_JEQ, BPF_JSET, BPF_JMP, BPF_K, BPF_LD, BPF_RET, BPF_W,
14    CLONE_NS_FLAGS, DEFAULT_BLOCKLIST_SYSCALLS, EPERM, SYSV_IPC_BLOCKLIST_SYSCALLS,
15    SECCOMP_RET_ALLOW, SECCOMP_RET_ERRNO,
16    SIOCETHTOOL, SIOCGIFADDR, SIOCGIFBRDADDR, SIOCGIFCONF, SIOCGIFDSTADDR,
17    SIOCGIFFLAGS, SIOCGIFHWADDR, SIOCGIFINDEX, SIOCGIFNAME, SIOCGIFNETMASK,
18    SOCK_DGRAM, SOCK_RAW, SOCK_TYPE_MASK, TIOCLINUX, TIOCSTI,
19    PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER,
20    OFFSET_ARGS0_LO, OFFSET_ARGS1_LO, OFFSET_ARGS2_LO, OFFSET_ARGS3_LO, OFFSET_NR,
21    SockFilter,
22};
23
24// ============================================================
25// Pipe pair for parent-child synchronization
26// ============================================================
27
/// Pipes for parent-child communication after fork().
///
/// Holds both ends of two independent pipes, one per direction: the
/// `notif_*` pipe carries data from the child to the parent, the `ready_*`
/// pipe carries the parent's go-ahead to the child. Every end is an
/// [`OwnedFd`], so a descriptor is closed when its field is dropped.
pub struct PipePair {
    /// Parent reads the notif fd number written by the child.
    pub notif_r: OwnedFd,
    /// Child writes the notif fd number to the parent.
    pub notif_w: OwnedFd,
    /// Child reads the "supervisor ready" signal from the parent.
    pub ready_r: OwnedFd,
    /// Parent writes the "supervisor ready" signal to the child.
    pub ready_w: OwnedFd,
}
39
40impl PipePair {
41    /// Create two pipe pairs using `pipe2(O_CLOEXEC)`.
42    pub fn new() -> io::Result<Self> {
43        let mut notif_fds = [0i32; 2];
44        let mut ready_fds = [0i32; 2];
45
46        // SAFETY: pipe2 with valid pointers and O_CLOEXEC
47        let ret = unsafe { libc::pipe2(notif_fds.as_mut_ptr(), libc::O_CLOEXEC) };
48        if ret < 0 {
49            return Err(io::Error::last_os_error());
50        }
51
52        let ret = unsafe { libc::pipe2(ready_fds.as_mut_ptr(), libc::O_CLOEXEC) };
53        if ret < 0 {
54            // Close the first pair on failure
55            unsafe {
56                libc::close(notif_fds[0]);
57                libc::close(notif_fds[1]);
58            }
59            return Err(io::Error::last_os_error());
60        }
61
62        // SAFETY: pipe2 returned valid fds
63        Ok(PipePair {
64            notif_r: unsafe { OwnedFd::from_raw_fd(notif_fds[0]) },
65            notif_w: unsafe { OwnedFd::from_raw_fd(notif_fds[1]) },
66            ready_r: unsafe { OwnedFd::from_raw_fd(ready_fds[0]) },
67            ready_w: unsafe { OwnedFd::from_raw_fd(ready_fds[1]) },
68        })
69    }
70}
71
72// ============================================================
73// Pipe I/O helpers
74// ============================================================
75
76/// Write a `u32` as 4 little-endian bytes to a raw fd.
77pub(crate) fn write_u32_fd(fd: RawFd, val: u32) -> io::Result<()> {
78    let buf = val.to_le_bytes();
79    let mut written = 0usize;
80    while written < 4 {
81        let ret = unsafe {
82            libc::write(
83                fd,
84                buf[written..].as_ptr() as *const libc::c_void,
85                4 - written,
86            )
87        };
88        if ret < 0 {
89            return Err(io::Error::last_os_error());
90        }
91        written += ret as usize;
92    }
93    Ok(())
94}
95
96/// Read a `u32` (4 little-endian bytes, blocking) from a raw fd.
97pub(crate) fn read_u32_fd(fd: RawFd) -> io::Result<u32> {
98    let mut buf = [0u8; 4];
99    let mut total = 0usize;
100    while total < 4 {
101        let ret = unsafe {
102            libc::read(
103                fd,
104                buf[total..].as_mut_ptr() as *mut libc::c_void,
105                4 - total,
106            )
107        };
108        if ret < 0 {
109            return Err(io::Error::last_os_error());
110        }
111        if ret == 0 {
112            return Err(io::Error::new(
113                io::ErrorKind::UnexpectedEof,
114                "pipe closed before 4 bytes read",
115            ));
116        }
117        total += ret as usize;
118    }
119    Ok(u32::from_le_bytes(buf))
120}
121
122// ============================================================
123// Syscall name → number mapping
124// ============================================================
125
/// Map a syscall name to its `libc::SYS_*` number.
///
/// Covers all names in `DEFAULT_BLOCKLIST_SYSCALLS` plus extras needed for
/// notif and arg-filter lists.
///
/// Returns `None` when `name` is not in the table below, and also when the
/// name maps to an `arch::SYS_*` constant that is `None` (the `?` on those
/// arms returns early) — presumably because the syscall does not exist on
/// the target architecture; confirm against the `arch` module.
///
/// Matching is exact and case-sensitive.
pub fn syscall_name_to_nr(name: &str) -> Option<u32> {
    let nr: i64 = match name {
        "mount" => libc::SYS_mount,
        "umount2" => libc::SYS_umount2,
        "pivot_root" => libc::SYS_pivot_root,
        "swapon" => libc::SYS_swapon,
        "swapoff" => libc::SYS_swapoff,
        "reboot" => libc::SYS_reboot,
        "sethostname" => libc::SYS_sethostname,
        "setdomainname" => libc::SYS_setdomainname,
        "kexec_load" => libc::SYS_kexec_load,
        "init_module" => libc::SYS_init_module,
        "finit_module" => libc::SYS_finit_module,
        "delete_module" => libc::SYS_delete_module,
        "unshare" => libc::SYS_unshare,
        "setns" => libc::SYS_setns,
        "perf_event_open" => libc::SYS_perf_event_open,
        "bpf" => libc::SYS_bpf,
        "userfaultfd" => libc::SYS_userfaultfd,
        "keyctl" => libc::SYS_keyctl,
        "add_key" => libc::SYS_add_key,
        "request_key" => libc::SYS_request_key,
        "ptrace" => libc::SYS_ptrace,
        "process_vm_readv" => libc::SYS_process_vm_readv,
        "process_vm_writev" => libc::SYS_process_vm_writev,
        "open_by_handle_at" => libc::SYS_open_by_handle_at,
        "name_to_handle_at" => libc::SYS_name_to_handle_at,
        "ioperm" => arch::SYS_IOPERM?,
        "iopl" => arch::SYS_IOPL?,
        "quotactl" => libc::SYS_quotactl,
        "acct" => libc::SYS_acct,
        "lookup_dcookie" => libc::SYS_lookup_dcookie,
        // nfsservctl was removed in Linux 3.1; no libc constant — skip
        "personality" => libc::SYS_personality,
        "io_uring_setup" => libc::SYS_io_uring_setup,
        "io_uring_enter" => libc::SYS_io_uring_enter,
        "io_uring_register" => libc::SYS_io_uring_register,
        // Additional syscalls for notif/arg filters
        "clone" => libc::SYS_clone,
        "clone3" => libc::SYS_clone3,
        "vfork" => arch::SYS_VFORK?,
        "mmap" => libc::SYS_mmap,
        "munmap" => libc::SYS_munmap,
        "brk" => libc::SYS_brk,
        "mremap" => libc::SYS_mremap,
        "connect" => libc::SYS_connect,
        "sendto" => libc::SYS_sendto,
        "sendmsg" => libc::SYS_sendmsg,
        "sendmmsg" => libc::SYS_sendmmsg,
        "ioctl" => libc::SYS_ioctl,
        "socket" => libc::SYS_socket,
        "prctl" => libc::SYS_prctl,
        "getrandom" => libc::SYS_getrandom,
        "openat" => libc::SYS_openat,
        "open" => arch::SYS_OPEN?,
        "getdents64" => libc::SYS_getdents64,
        "getdents" => arch::SYS_GETDENTS?,
        "bind" => libc::SYS_bind,
        "getsockname" => libc::SYS_getsockname,
        "clock_gettime" => libc::SYS_clock_gettime,
        "gettimeofday" => libc::SYS_gettimeofday,
        "time" => arch::SYS_TIME?,
        "clock_nanosleep" => libc::SYS_clock_nanosleep,
        "timerfd_settime" => libc::SYS_timerfd_settime,
        "timer_settime" => libc::SYS_timer_settime,
        "execve" => libc::SYS_execve,
        "execveat" => libc::SYS_execveat,
        // COW filesystem syscalls
        "unlinkat" => libc::SYS_unlinkat,
        "mkdirat" => libc::SYS_mkdirat,
        "renameat2" => libc::SYS_renameat2,
        "newfstatat" => libc::SYS_newfstatat,
        "statx" => libc::SYS_statx,
        "faccessat" => libc::SYS_faccessat,
        "symlinkat" => libc::SYS_symlinkat,
        "linkat" => libc::SYS_linkat,
        "fchmodat" => libc::SYS_fchmodat,
        "fchownat" => libc::SYS_fchownat,
        "readlinkat" => libc::SYS_readlinkat,
        "truncate" => libc::SYS_truncate,
        "utimensat" => libc::SYS_utimensat,
        // Legacy (non-`*at`) spellings: resolved through `arch`, whose
        // constants are optional.
        "unlink" => arch::SYS_UNLINK?,
        "rmdir" => arch::SYS_RMDIR?,
        "mkdir" => arch::SYS_MKDIR?,
        "rename" => arch::SYS_RENAME?,
        "stat" => arch::SYS_STAT?,
        "lstat" => arch::SYS_LSTAT?,
        "access" => arch::SYS_ACCESS?,
        "symlink" => arch::SYS_SYMLINK?,
        "link" => arch::SYS_LINK?,
        "chmod" => arch::SYS_CHMOD?,
        "chown" => arch::SYS_CHOWN?,
        "lchown" => arch::SYS_LCHOWN?,
        "readlink" => arch::SYS_READLINK?,
        "futimesat" => arch::SYS_FUTIMESAT?,
        "fork" => arch::SYS_FORK?,
        // SysV IPC (gated by extra_allow_syscalls=["sysv_ipc"]; denied by default)
        "shmget" => libc::SYS_shmget,
        "shmat" => libc::SYS_shmat,
        "shmdt" => libc::SYS_shmdt,
        "shmctl" => libc::SYS_shmctl,
        "msgget" => libc::SYS_msgget,
        "msgsnd" => libc::SYS_msgsnd,
        "msgrcv" => libc::SYS_msgrcv,
        "msgctl" => libc::SYS_msgctl,
        "semget" => libc::SYS_semget,
        "semop" => libc::SYS_semop,
        "semctl" => libc::SYS_semctl,
        "semtimedop" => libc::SYS_semtimedop,
        // Unknown name: report "no such syscall" rather than guessing.
        _ => return None,
    };
    // The table is computed as i64 and narrowed to u32 for the BPF filter.
    Some(nr as u32)
}
243
244// ============================================================
245// Sandbox → syscall lists
246// ============================================================
247
248/// Determine which syscalls need `SECCOMP_RET_USER_NOTIF`.
249pub fn notif_syscalls(policy: &Sandbox, sandbox_name: Option<&str>) -> Vec<u32> {
250    let mut nrs = vec![
251        libc::SYS_clone as u32,
252        libc::SYS_clone3 as u32,
253        libc::SYS_wait4 as u32,
254        libc::SYS_waitid as u32,
255    ];
256    arch::push_optional_syscall(&mut nrs, arch::SYS_VFORK);
257    // Bare fork(2) carries none of the namespace/process-limit risk of
258    // clone/clone3 and was historically left out of the BPF filter so
259    // hot fork-loops (COW map-reduce) bypass the supervisor entirely.
260    // It only needs interception when policy_fn is active, so the
261    // supervisor can register the new child via ptrace fork events
262    // before it can run user code (argv-safety invariant).
263    if policy.policy_fn.is_some() {
264        arch::push_optional_syscall(&mut nrs, arch::SYS_FORK);
265    }
266
267    if policy.max_memory.is_some() {
268        nrs.push(libc::SYS_mmap as u32);
269        nrs.push(libc::SYS_munmap as u32);
270        nrs.push(libc::SYS_brk as u32);
271        nrs.push(libc::SYS_mremap as u32);
272        // shmget is in notif only when SysV IPC is allowed. The BPF
273        // layout puts notif JEQs before deny JEQs, so a syscall on
274        // both lists would notify (RET_USER_NOTIF) and silently
275        // bypass the kernel-level deny. When extra_allow_syscalls does not contain "sysv_ipc",
276        // shmget belongs only on the blocklist.
277        if policy.allows_sysv_ipc() {
278            nrs.push(libc::SYS_shmget as u32);
279        }
280    }
281
282    if !policy.net_allow.is_empty()
283        || policy.policy_fn.is_some()
284        || !policy.http_allow.is_empty()
285        || !policy.http_deny.is_empty()
286    {
287        nrs.push(libc::SYS_connect as u32);
288        nrs.push(libc::SYS_sendto as u32);
289        nrs.push(libc::SYS_sendmsg as u32);
290        nrs.push(libc::SYS_sendmmsg as u32);
291        nrs.push(libc::SYS_bind as u32);
292    }
293
294    if policy.random_seed.is_some() {
295        nrs.push(libc::SYS_getrandom as u32);
296        // Also intercept openat so the supervisor can re-patch vDSO after exec.
297        nrs.push(libc::SYS_openat as u32);
298    }
299
300    if policy.time_start.is_some() {
301        nrs.extend_from_slice(&[
302            libc::SYS_clock_nanosleep as u32,
303            libc::SYS_timerfd_settime as u32,
304            libc::SYS_timer_settime as u32,
305        ]);
306        // Also intercept openat so the supervisor gets a notification after exec
307        // and can re-patch the vDSO (exec replaces vDSO with a fresh copy).
308        nrs.push(libc::SYS_openat as u32);
309    }
310
311    // /proc virtualization + /etc/hosts virtualization (always on).
312    //
313    // `openat` carries the simple `(AT_FDCWD, "/proc/...")` and
314    // `(AT_FDCWD, "/etc/hosts")` spellings; `openat2` is the same shape
315    // on newer libcs; legacy `open(path, ...)` is the same path without a
316    // dirfd. The handlers normalize all three into a single absolute path
317    // check, so we have to put every variant on the notif list — otherwise
318    // a caller that picks `open` or `openat2` slips past virtualization
319    // and reads the real on-disk file.
320    nrs.push(libc::SYS_openat as u32);
321    nrs.push(arch::SYS_OPENAT2 as u32);
322    arch::push_optional_syscall(&mut nrs, arch::SYS_OPEN);
323    nrs.push(libc::SYS_getdents64 as u32);
324    arch::push_optional_syscall(&mut nrs, arch::SYS_GETDENTS);
325
326    // Netlink virtualization (always on):
327    //   socket, bind, getsockname — swap in a unix socketpair for AF_NETLINK
328    //   recvfrom, recvmsg         — zero msg_name so glibc accepts the reply
329    //                                (kernel only writes sun_family on unix
330    //                                 recvmsg, leaving nl_pid uninitialized)
331    //   close                     — unregister (pid, fd) so reuse doesn't
332    //                                collide with the cookie set
333    // Send traffic flows through the real socketpair untouched.
334    nrs.push(libc::SYS_socket as u32);
335    nrs.push(libc::SYS_bind as u32);
336    nrs.push(libc::SYS_getsockname as u32);
337    nrs.push(libc::SYS_recvfrom as u32);
338    nrs.push(libc::SYS_recvmsg as u32);
339    nrs.push(libc::SYS_close as u32);
340    // Virtualize sched_getaffinity so nproc/sysconf agree with /proc/cpuinfo
341    if policy.num_cpus.is_some() {
342        nrs.push(libc::SYS_sched_getaffinity as u32);
343    }
344    if sandbox_name.is_some() {
345        nrs.push(libc::SYS_uname as u32);
346        nrs.push(libc::SYS_openat as u32);
347    }
348
349    // COW filesystem interception (seccomp-based, unprivileged)
350    if policy.workdir.is_some() && policy.fs_isolation == FsIsolation::None {
351        nrs.extend_from_slice(&[
352            libc::SYS_openat as u32,
353            libc::SYS_execve as u32,
354            libc::SYS_execveat as u32,
355            libc::SYS_unlinkat as u32,
356            libc::SYS_mkdirat as u32,
357            libc::SYS_renameat2 as u32,
358            libc::SYS_symlinkat as u32,
359            libc::SYS_linkat as u32,
360            libc::SYS_fchmodat as u32,
361            libc::SYS_fchownat as u32,
362            libc::SYS_truncate as u32,
363            libc::SYS_utimensat as u32,
364            libc::SYS_newfstatat as u32,
365            libc::SYS_statx as u32,
366            libc::SYS_faccessat as u32,
367            439u32,                       // SYS_faccessat2 — glibc 2.33+ uses this instead of faccessat
368            libc::SYS_readlinkat as u32,
369            libc::SYS_getdents64 as u32,
370            libc::SYS_chdir as u32,
371            libc::SYS_getcwd as u32,
372        ]);
373        for nr in [
374            arch::SYS_OPEN, arch::SYS_UNLINK, arch::SYS_RMDIR, arch::SYS_MKDIR,
375            arch::SYS_RENAME, arch::SYS_SYMLINK, arch::SYS_LINK, arch::SYS_CHMOD,
376            arch::SYS_CHOWN, arch::SYS_LCHOWN, arch::SYS_STAT, arch::SYS_LSTAT,
377            arch::SYS_ACCESS, arch::SYS_READLINK, arch::SYS_GETDENTS,
378        ] {
379            arch::push_optional_syscall(&mut nrs, nr);
380        }
381    }
382
383    // Chroot path interception
384    if policy.chroot.is_some() {
385        nrs.extend_from_slice(&[
386            libc::SYS_openat as u32,
387            libc::SYS_execve as u32,
388            libc::SYS_execveat as u32,
389            libc::SYS_unlinkat as u32,
390            libc::SYS_mkdirat as u32,
391            libc::SYS_renameat2 as u32,
392            libc::SYS_symlinkat as u32,
393            libc::SYS_linkat as u32,
394            libc::SYS_fchmodat as u32,
395            libc::SYS_fchownat as u32,
396            libc::SYS_truncate as u32,
397            libc::SYS_newfstatat as u32,
398            libc::SYS_statx as u32,
399            libc::SYS_faccessat as u32,
400            439u32,                       // SYS_faccessat2 — glibc 2.33+ uses this instead of faccessat
401            libc::SYS_readlinkat as u32,
402            libc::SYS_getdents64 as u32,
403            libc::SYS_chdir as u32,
404            libc::SYS_getcwd as u32,
405            libc::SYS_statfs as u32,
406            libc::SYS_utimensat as u32,
407        ]);
408        for nr in [
409            arch::SYS_OPEN, arch::SYS_STAT, arch::SYS_LSTAT, arch::SYS_ACCESS,
410            arch::SYS_READLINK, arch::SYS_GETDENTS, arch::SYS_UNLINK,
411            arch::SYS_RMDIR, arch::SYS_MKDIR, arch::SYS_RENAME,
412            arch::SYS_SYMLINK, arch::SYS_LINK, arch::SYS_CHMOD,
413            arch::SYS_CHOWN, arch::SYS_LCHOWN,
414        ] {
415            arch::push_optional_syscall(&mut nrs, nr);
416        }
417    }
418
419    // Explicit deny-paths need path-bearing syscalls intercepted.
420    if !policy.fs_denied.is_empty() {
421        nrs.extend_from_slice(&[
422            libc::SYS_openat as u32,
423            libc::SYS_execve as u32,
424            libc::SYS_execveat as u32,
425            libc::SYS_linkat as u32,
426            libc::SYS_renameat2 as u32,
427            libc::SYS_symlinkat as u32,
428        ]);
429        for nr in [arch::SYS_OPEN, arch::SYS_LINK, arch::SYS_RENAME, arch::SYS_SYMLINK] {
430            arch::push_optional_syscall(&mut nrs, nr);
431        }
432    }
433
434    // Dynamic policy callback — intercept key syscalls for event emission.
435    if policy.policy_fn.is_some() {
436        nrs.extend_from_slice(&[
437            libc::SYS_openat as u32,
438            libc::SYS_connect as u32,
439            libc::SYS_sendto as u32,
440            libc::SYS_bind as u32,
441            libc::SYS_execve as u32,
442            libc::SYS_execveat as u32,
443        ]);
444    }
445
446    // Port remapping
447    if policy.port_remap {
448        nrs.extend_from_slice(&[
449            libc::SYS_bind as u32,
450            libc::SYS_getsockname as u32,
451        ]);
452    }
453
454    nrs.sort_unstable();
455    nrs.dedup();
456    nrs
457}
458
459/// Resolve `NO_SUPERVISOR_BLOCKLIST_SYSCALLS` names to numbers, plus
460/// SysV IPC syscalls when `policy.allows_sysv_ipc()` is false.
461pub fn no_supervisor_blocklist_syscall_numbers(policy: &Sandbox) -> Vec<u32> {
462    use crate::sys::structs::NO_SUPERVISOR_BLOCKLIST_SYSCALLS;
463    let mut nrs: Vec<u32> = NO_SUPERVISOR_BLOCKLIST_SYSCALLS
464        .iter()
465        .copied()
466        .chain(policy.extra_deny_syscalls.iter().map(String::as_str))
467        .filter_map(|n| syscall_name_to_nr(n))
468        .collect();
469    if !policy.allows_sysv_ipc() {
470        for name in SYSV_IPC_BLOCKLIST_SYSCALLS {
471            if let Some(nr) = syscall_name_to_nr(name) {
472                if !nrs.contains(&nr) {
473                    nrs.push(nr);
474                }
475            }
476        }
477    }
478    nrs.sort_unstable();
479    nrs.dedup();
480    nrs
481}
482
483/// Resolve the default syscall blocklist plus policy extras to numbers.
484///
485/// SysV IPC syscalls are appended to the resolved blocklist when
486/// `policy.allows_sysv_ipc()` is false.
487pub fn blocklist_syscall_numbers(policy: &Sandbox) -> Vec<u32> {
488    let mut nrs: Vec<u32> = DEFAULT_BLOCKLIST_SYSCALLS
489        .iter()
490        .copied()
491        .chain(policy.extra_deny_syscalls.iter().map(String::as_str))
492        .filter_map(|n| syscall_name_to_nr(n))
493        .collect();
494    if !policy.allows_sysv_ipc() {
495        for name in SYSV_IPC_BLOCKLIST_SYSCALLS {
496            if let Some(nr) = syscall_name_to_nr(name) {
497                if !nrs.contains(&nr) {
498                    nrs.push(nr);
499                }
500            }
501        }
502    }
503    nrs.sort_unstable();
504    nrs.dedup();
505    nrs
506}
507
/// Build argument-level seccomp filter instructions matching the Python
/// `_build_arg_filters()` exactly.
///
/// Returns a `Vec<SockFilter>` containing self-contained BPF blocks for:
///   - clone: block namespace creation flags
///   - ioctl: block TIOCSTI, TIOCLINUX, SIOCGIF*, SIOCETHTOOL
///   - prctl: block PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER
///   - socket: block SOCK_RAW/SOCK_DGRAM on AF_INET/AF_INET6 (with type mask)
///   - wait4 / waitid: return ALLOW when WNOHANG or WNOWAIT is set
///
/// Every block starts by reloading the syscall number and jumps over its
/// own remaining instructions when the number does not match, so the blocks
/// can be concatenated in any context: a syscall that matches no rule falls
/// off the end of the returned sequence without hitting a `RET`.
///
/// Only the low 32 bits of each inspected argument are loaded
/// (`OFFSET_ARGS*_LO`).
///
/// `policy` influences only the socket block: `SOCK_DGRAM` is left
/// unblocked when `net_allow` contains a UDP or ICMP rule.
///
/// All jump offsets below are relative instruction counts; changing the
/// number or order of instructions inside a block requires recomputing them.
pub fn arg_filters(policy: &Sandbox) -> Vec<SockFilter> {
    // Deny action shared by all blocking rules: fail the syscall with EPERM.
    let ret_errno = SECCOMP_RET_ERRNO | EPERM as u32;
    let nr_clone = libc::SYS_clone as u32;
    let nr_ioctl = libc::SYS_ioctl as u32;
    let nr_prctl = libc::SYS_prctl as u32;
    let nr_socket = libc::SYS_socket as u32;

    let mut insns: Vec<SockFilter> = Vec::new();

    // --- clone: block namespace creation flags ---
    // 5 instructions:
    //   LD NR
    //   JEQ clone → +0, skip 3
    //   LD arg0
    //   JSET NS_FLAGS → +0, skip 1
    //   RET ERRNO
    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_clone, 0, 3));
    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
    insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, CLONE_NS_FLAGS as u32, 0, 1));
    insns.push(stmt(BPF_RET | BPF_K, ret_errno));

    // --- ioctl: block dangerous commands ---
    // Block terminal injection (TIOCSTI, TIOCLINUX) and network interface
    // enumeration ioctls (SIOCGIF*, SIOCETHTOOL) to complement NETLINK_ROUTE
    // virtualization.
    // Layout: LD NR, JEQ ioctl (skip 1 + N*2), LD arg1, [JEQ cmd, RET ERRNO] * N
    let dangerous_ioctls: &[u32] = &[
        TIOCSTI as u32,
        TIOCLINUX as u32,
        SIOCGIFNAME as u32,
        SIOCGIFCONF as u32,
        SIOCGIFFLAGS as u32,
        SIOCGIFADDR as u32,
        SIOCGIFDSTADDR as u32,
        SIOCGIFBRDADDR as u32,
        SIOCGIFNETMASK as u32,
        SIOCGIFHWADDR as u32,
        SIOCGIFINDEX as u32,
        SIOCETHTOOL as u32,
    ];
    let n_ioctls = dangerous_ioctls.len();
    // Jump offsets are u8; with the 12 commands above this is 25.
    let skip_count = (1 + n_ioctls * 2) as u8;
    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_ioctl, 0, skip_count));
    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
    for &cmd in dangerous_ioctls {
        // Match → fall into RET ERRNO; no match → skip it and test the next.
        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, cmd, 0, 1));
        insns.push(stmt(BPF_RET | BPF_K, ret_errno));
    }

    // --- prctl: block dangerous options ---
    // Layout: LD NR, JEQ prctl (skip 1 + N*2), LD arg0, [JEQ op, RET ERRNO] * N
    let dangerous_prctl_ops: &[u32] = &[PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER];
    let n_ops = dangerous_prctl_ops.len();
    let skip_count = (1 + n_ops * 2) as u8;
    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_prctl, 0, skip_count));
    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
    for &op in dangerous_prctl_ops {
        // Same [JEQ, RET ERRNO] pair shape as the ioctl block.
        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, op, 0, 1));
        insns.push(stmt(BPF_RET | BPF_K, ret_errno));
    }

    // --- socket: block SOCK_RAW and/or SOCK_DGRAM on AF_INET/AF_INET6 ---
    //
    // SOCK_RAW is unconditionally denied. Sandlock does not expose
    // raw ICMP — packet-crafting capabilities aren't part of the XOA
    // threat model, and destination filtering at `sendto` can't be
    // honestly enforced for raw sockets (the agent controls the IP
    // header). Workloads that need ping should use the kernel ping
    // socket (SOCK_DGRAM + IPPROTO_ICMP) via an `icmp://...` rule.
    //
    // SOCK_DGRAM is denied unless a UDP or ICMP rule exists in
    // net_allow. The kernel ping socket uses SOCK_DGRAM with
    // IPPROTO_ICMP, so the same type bit gates both — destination
    // filtering at sendto (Phase 2) is what separates them per-rule.
    use crate::sandbox::Protocol;
    let any_udp_rule = policy.net_allow.iter().any(|r| r.protocol == Protocol::Udp);
    let any_icmp_rule = policy.net_allow.iter().any(|r| r.protocol == Protocol::Icmp);
    let mut blocked_types: Vec<u32> = Vec::new();
    blocked_types.push(SOCK_RAW);
    if !any_udp_rule && !any_icmp_rule {
        blocked_types.push(SOCK_DGRAM);
    }

    // Always true at present, because SOCK_RAW is pushed unconditionally
    // above; the guard only matters if that ever becomes conditional.
    if !blocked_types.is_empty() {
        let n = blocked_types.len();
        // Instructions after domain checks: 2 (load+AND) + N (JEQs) + 1 (RET)
        let after_domain = 2 + n + 1;
        // Total after NR check: 3 (load domain + 2 JEQs) + after_domain
        let skip_all = (3 + after_domain) as u8;

        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_socket, 0, skip_all));
        // Load domain (arg0)
        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
        // AF_INET → skip to type check (jump over AF_INET6 check)
        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET, 1, 0));
        // AF_INET6 → type check; else skip everything remaining
        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET6, 0, after_domain as u8));
        // Load type (arg1) and mask off SOCK_NONBLOCK|SOCK_CLOEXEC
        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
        insns.push(stmt(BPF_ALU | BPF_AND | BPF_K, SOCK_TYPE_MASK));
        // Check each blocked type
        for (i, &sock_type) in blocked_types.iter().enumerate() {
            // Number of type JEQs still to come after this one.
            let remaining = n - i - 1;
            // Match → jump to RET ERRNO (skip 'remaining' JEQs ahead)
            // No match on last type → skip past RET ERRNO (jf=1)
            // No match on non-last → check next type (jf=0)
            let jf: u8 = if remaining == 0 { 1 } else { 0 };
            insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, sock_type, remaining as u8, jf));
        }
        // Deny return (reached by any matching JEQ)
        insns.push(stmt(BPF_RET | BPF_K, ret_errno));
    }

    // (raw ICMP carve-out removed — SOCK_RAW is unconditionally denied
    // by the blocked_types block above. Sandlock does not expose raw
    // sockets; ping uses the SOCK_DGRAM kernel ping socket via an
    // `icmp://...` rule, gated by host `ping_group_range`.)

    // --- wait4: skip notification for WNOHANG/WNOWAIT (non-blocking) ---
    // wait4(pid, status, options, rusage) — options is arg2
    // 5 instructions:
    //   LD NR
    //   JEQ wait4 → +0, skip 3
    //   LD arg2
    //   JSET (WNOHANG|WNOWAIT) → +0, skip 1
    //   RET ALLOW
    {
        let nr_wait4 = libc::SYS_wait4 as u32;
        let wnohang_or_wnowait = (libc::WNOHANG | 0x0100_0000/* WNOWAIT */) as u32;
        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_wait4, 0, 3));
        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS2_LO));
        insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, wnohang_or_wnowait, 0, 1));
        insns.push(stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
    }

    // --- waitid: skip notification for WNOHANG/WNOWAIT (non-blocking) ---
    // waitid(idtype, id, infop, options, rusage) — options is arg3
    // Same 5-instruction layout as the wait4 block, reading arg3 instead.
    {
        let nr_waitid = libc::SYS_waitid as u32;
        let wnohang_or_wnowait = (libc::WNOHANG | 0x0100_0000/* WNOWAIT */) as u32;
        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_waitid, 0, 3));
        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS3_LO));
        insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, wnohang_or_wnowait, 0, 1));
        insns.push(stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
    }

    insns
}
670
671// ============================================================
672// Close fds above threshold
673// ============================================================
674
675/// Close all file descriptors above `min_fd`, except those in `keep`.
676fn close_fds_above(min_fd: RawFd, keep: &[RawFd]) {
677    // Read /proc/self/fd to enumerate open fds.
678    // Collect all fd numbers first, then close them after dropping the directory
679    // iterator. This avoids closing the directory fd during iteration.
680    let fds_to_close: Vec<RawFd> = {
681        let dir = match std::fs::read_dir("/proc/self/fd") {
682            Ok(d) => d,
683            Err(_) => return,
684        };
685        dir.flatten()
686            .filter_map(|entry| {
687                entry.file_name().into_string().ok()
688                    .and_then(|name| name.parse::<RawFd>().ok())
689            })
690            .filter(|&fd| fd > min_fd && !keep.contains(&fd))
691            .collect()
692    };
693    // The directory is now closed; safe to close the collected fds.
694    for fd in fds_to_close {
695        unsafe { libc::close(fd) };
696    }
697}
698
699// ============================================================
700// COW filesystem config passed from parent to child
701// ============================================================
702
703// Re-export ChildMountConfig so callers can use the old import path.
704pub(crate) use crate::cow::ChildMountConfig;
705
/// Write uid/gid maps for an unprivileged user namespace.
///
/// `real_uid`/`real_gid` must be captured *before* unshare(CLONE_NEWUSER),
/// since getuid()/getgid() return the overflow id (65534) after unshare.
/// `target_uid`/`target_gid` are the ids visible inside the namespace.
///
/// Each map covers exactly one id. Write failures are ignored; the function
/// always returns normally.
fn write_id_maps(real_uid: u32, real_gid: u32, target_uid: u32, target_gid: u32) {
    let uid_line = format!("{} {} 1\n", target_uid, real_uid);
    let gid_line = format!("{} {} 1\n", target_gid, real_gid);

    // Performed strictly in this order: uid map, then setgroups "deny",
    // then gid map.
    let steps: [(&str, &str); 3] = [
        ("/proc/self/uid_map", uid_line.as_str()),
        ("/proc/self/setgroups", "deny\n"),
        ("/proc/self/gid_map", gid_line.as_str()),
    ];
    for (path, contents) in steps {
        let _ = std::fs::write(path, contents);
    }
}
715
716/// Write uid/gid maps using the post-unshare overflow uid (65534).
717/// Used by the OverlayFS COW path which maps to root (UID 0) inside.
718fn write_id_maps_overflow() {
719    let uid = unsafe { libc::getuid() };
720    let gid = unsafe { libc::getgid() };
721    write_id_maps(uid, gid, 0, 0);
722}
723
724// ============================================================
725// Child-side confinement (never returns)
726// ============================================================
727
/// Arguments threaded from the parent's `do_spawn` into the child-side
/// `confine_child`.  Packed into a struct because `confine_child` historically
/// grew to seven positional parameters and a struct keeps the call site
/// readable when new flags get added (e.g. `extra_syscalls` for user
/// handlers).  Lifetimes tie everything to the parent's stack frame — the
/// child never outlives the fork point because `confine_child` either execs
/// or exits.
pub(crate) struct ChildSpawnArgs<'a> {
    /// Sandbox configuration (policy) for this spawn.
    pub sandbox: &'a Sandbox,
    /// Command to execute once confinement is applied.
    /// NOTE(review): presumably `cmd[0]` is the program and the rest its
    /// arguments — confirm in `confine_child`.
    pub cmd: &'a [CString],
    /// Parent-child synchronization pipes (see [`PipePair`]).
    pub pipes: &'a PipePair,
    /// Optional COW mount configuration for the child; `None` when not used.
    pub cow_config: Option<&'a ChildMountConfig>,
    /// Skip the user-notification supervisor: child installs a kernel-only
    /// deny filter, parent reads `notif_fd_num = 0` and never starts a
    /// supervisor. Mirrors `Sandbox::no_supervisor`.
    pub no_supervisor: bool,
    /// Raw file descriptors associated with this spawn.
    /// NOTE(review): presumably the fds exempted when the child closes
    /// inherited descriptors — confirm in `confine_child`.
    pub keep_fds: &'a [RawFd],
    /// Sandbox instance name. When set, it is also exposed as the
    /// sandbox's virtual hostname.
    pub sandbox_name: Option<&'a str>,
    /// Syscall numbers for which the parent registered user `Handler`s.
    /// Merged into the child's BPF notif list so the kernel actually
    /// raises USER_NOTIF for them.
    pub extra_syscalls: &'a [u32],
    /// PID of the parent process captured before fork. Used to detect
    /// parent death in the child without assuming PID 1 is always init
    /// (incorrect in containers where the entrypoint runs as PID 1).
    pub parent_pid: libc::pid_t,
}
757
758/// Apply irreversible confinement (Landlock + seccomp) then exec the command.
759///
760/// This function **never returns**: it calls `execvp` on success or
761/// `_exit(127)` on any error.
762pub(crate) fn confine_child(args: ChildSpawnArgs<'_>) -> ! {
763    let ChildSpawnArgs {
764        sandbox,
765        cmd,
766        pipes,
767        cow_config,
768        no_supervisor,
769        keep_fds,
770        sandbox_name,
771        extra_syscalls,
772        parent_pid,
773    } = args;
774    // Helper: abort child on error. Includes the OS error automatically.
775    macro_rules! fail {
776        ($msg:expr) => {{
777            let err = std::io::Error::last_os_error();
778            let _ = write!(std::io::stderr(), "sandlock child: {}: {}\n", $msg, err);
779            unsafe { libc::_exit(127) };
780        }};
781    }
782
783    use std::io::Write;
784
785    // 1. New process group
786    if unsafe { libc::setpgid(0, 0) } != 0 {
787        fail!("setpgid");
788    }
789
790    // 1b. If stdin is a terminal, become the foreground process group
791    //     so interactive shells can read from the TTY.
792    //     Must ignore SIGTTOU first — a background pgrp calling tcsetpgrp
793    //     gets stopped by SIGTTOU otherwise.
794    if unsafe { libc::isatty(0) } == 1 {
795        unsafe {
796            libc::signal(libc::SIGTTOU, libc::SIG_IGN);
797            libc::tcsetpgrp(0, libc::getpgrp());
798            libc::signal(libc::SIGTTOU, libc::SIG_DFL);
799        }
800    }
801
802    // 2. Die if parent exits
803    if unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) } != 0 {
804        fail!("prctl(PR_SET_PDEATHSIG)");
805    }
806
807    // 3. Check parent didn't die between fork and prctl.
808    // Compare against the actual parent PID captured before fork rather than
809    // hardcoding 1, since containers often run the entrypoint as PID 1 and a
810    // child forked from it legitimately has getppid() == 1.
811    if unsafe { libc::getppid() } != parent_pid {
812        fail!("parent died before confinement");
813    }
814
815    // 4. Optional: disable ASLR
816    if sandbox.no_randomize_memory {
817        const ADDR_NO_RANDOMIZE: libc::c_ulong = 0x0040000;
818        // Read current personality first (0xffffffff = query), then OR in the flag.
819        let current = unsafe { libc::personality(0xffffffff) };
820        if current == -1 {
821            fail!("personality(query)");
822        }
823        if unsafe { libc::personality(current as libc::c_ulong | ADDR_NO_RANDOMIZE) } == -1 {
824            fail!("personality(ADDR_NO_RANDOMIZE)");
825        }
826    }
827
828    // 4b. Optional: CPU core binding
829    if let Some(ref cores) = sandbox.cpu_cores {
830        if !cores.is_empty() {
831            let mut set = unsafe { std::mem::zeroed::<libc::cpu_set_t>() };
832            unsafe { libc::CPU_ZERO(&mut set) };
833            for &core in cores {
834                unsafe { libc::CPU_SET(core as usize, &mut set) };
835            }
836            if unsafe {
837                libc::sched_setaffinity(
838                    0,
839                    std::mem::size_of::<libc::cpu_set_t>(),
840                    &set,
841                )
842            } != 0
843            {
844                fail!("sched_setaffinity");
845            }
846        }
847    }
848
849    // 5. Optional: disable THP
850    if sandbox.no_huge_pages {
851        if unsafe { libc::prctl(libc::PR_SET_THP_DISABLE, 1, 0, 0, 0) } != 0 {
852            fail!("prctl(PR_SET_THP_DISABLE)");
853        }
854    }
855
856    // 5c. Optional: disable core dumps
857    if sandbox.no_coredump {
858        // Set RLIMIT_CORE to 0 — the kernel will not write a core file.
859        // We intentionally do NOT call prctl(PR_SET_DUMPABLE, 0) because
860        // that would break pidfd_getfd which the supervisor needs.
861        // The seccomp filter already blocks the child from calling
862        // prctl(PR_SET_DUMPABLE, ...) so it can't re-enable it.
863        let rlim = libc::rlimit { rlim_cur: 0, rlim_max: 0 };
864        if unsafe { libc::setrlimit(libc::RLIMIT_CORE, &rlim) } != 0 {
865            fail!("setrlimit(RLIMIT_CORE, 0)");
866        }
867    }
868
869    // Capture real uid/gid before any unshare (after unshare they become 65534)
870    let real_uid = unsafe { libc::getuid() };
871    let real_gid = unsafe { libc::getgid() };
872
873    // 5b. User namespace for --uid mapping (when not using OverlayFS COW,
874    //     which sets up its own user namespace)
875    if let Some(target_uid) = sandbox.uid {
876        if cow_config.is_none() {
877            if unsafe { libc::unshare(libc::CLONE_NEWUSER) } != 0 {
878                fail!("unshare(CLONE_NEWUSER)");
879            }
880            write_id_maps(real_uid, real_gid, target_uid, target_uid);
881        }
882    }
883
884    // 5c. User + mount namespace for OverlayFS COW (includes CLONE_NEWUSER)
885    if let Some(ref cow) = cow_config {
886        // unshare user + mount namespaces (unprivileged)
887        if unsafe { libc::unshare(libc::CLONE_NEWUSER | libc::CLONE_NEWNS) } != 0 {
888            fail!("unshare(CLONE_NEWUSER | CLONE_NEWNS)");
889        }
890
891        // Write uid/gid maps using overflow uid (preserves existing COW behavior)
892        write_id_maps_overflow();
893
894        // Mount the overlay filesystem ON TOP of the workdir so the child
895        // sees the merged view at the original path.  The kernel resolves
896        // lowerdir before the covering mount takes effect, so using the
897        // same path as both lowerdir and mount-point is safe inside our
898        // private mount namespace.
899        let lowerdir = cow.lowers.iter()
900            .map(|p| p.display().to_string())
901            .collect::<Vec<_>>()
902            .join(":");
903        let opts = format!(
904            "lowerdir={},upperdir={},workdir={}",
905            lowerdir,
906            cow.upper.display(),
907            cow.work.display(),
908        );
909
910        let mount_cstr = match CString::new(cow.mount_point.to_str().unwrap_or("")) {
911            Ok(c) => c,
912            Err(_) => fail!("invalid overlay mount point path"),
913        };
914        let overlay_cstr = CString::new("overlay").unwrap();
915        let opts_cstr = match CString::new(opts) {
916            Ok(c) => c,
917            Err(_) => fail!("invalid overlay opts"),
918        };
919
920        let ret = unsafe {
921            libc::mount(
922                overlay_cstr.as_ptr(),
923                mount_cstr.as_ptr(),
924                overlay_cstr.as_ptr(),
925                0,
926                opts_cstr.as_ptr() as *const libc::c_void,
927            )
928        };
929        if ret != 0 {
930            fail!("mount overlay");
931        }
932    }
933
934    // 6. Optional: change working directory
935    // cwd controls where the child starts; workdir is only for COW
936    let effective_cwd = if let Some(ref cwd) = sandbox.cwd {
937        if let Some(ref chroot_root) = sandbox.chroot {
938            Some(chroot_root.join(cwd.strip_prefix("/").unwrap_or(cwd)))
939        } else {
940            Some(cwd.clone())
941        }
942    } else if let Some(ref chroot_root) = sandbox.chroot {
943        // Default to chroot root
944        Some(chroot_root.to_path_buf())
945    } else if let Some(ref workdir) = sandbox.workdir {
946        // Default to workdir when set (COW working directory)
947        Some(workdir.clone())
948    } else {
949        None
950    };
951
952    if let Some(ref cwd) = effective_cwd {
953        let c_path = match CString::new(cwd.as_os_str().as_encoded_bytes()) {
954            Ok(c) => c,
955            Err(_) => fail!("invalid cwd path"),
956        };
957        if unsafe { libc::chdir(c_path.as_ptr()) } != 0 {
958            fail!("chdir");
959        }
960    }
961
962    // 7. Set NO_NEW_PRIVS (required for both Landlock and seccomp without CAP_SYS_ADMIN)
963    if unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) } != 0 {
964        fail!("prctl(PR_SET_NO_NEW_PRIVS)");
965    }
966
967    // 8. Apply Landlock confinement (IRREVERSIBLE)
968    if let Err(e) = crate::landlock::confine(sandbox) {
969        fail!(format!("landlock: {}", e));
970    }
971
972    // 9. Assemble and install seccomp filter (IRREVERSIBLE)
973    let args = arg_filters(sandbox);
974    let mut keep_fd: i32 = -1;
975
976    if no_supervisor {
977        // No-supervisor mode: deny-only kernel filter, no NEW_LISTENER.
978        // BPF filters are ANDed by the kernel, so an outer filter (from a
979        // wrapping sandbox) keeps tightening this layer too.
980        //
981        // Uses the relaxed `no_supervisor_blocklist_syscall_numbers` deny
982        // list (which leaves `ptrace`, `unshare`, `process_vm_*`, etc.
983        // alone) so an inner full-supervisor sandlock nested under this
984        // one still has the syscalls its supervisor needs.
985        let deny = no_supervisor_blocklist_syscall_numbers(sandbox);
986        let filter = match bpf::assemble_filter(&[], &deny, &args) {
987            Ok(f) => f,
988            Err(e) => fail!(format!("seccomp assemble: {}", e)),
989        };
990        if let Err(e) = bpf::install_deny_filter(&filter) {
991            fail!(format!("seccomp deny filter: {}", e));
992        }
993        // fd=0 tells the parent there's no supervisor to attach to.
994        if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), 0) {
995            fail!(format!("write no-supervisor signal: {}", e));
996        }
997    } else {
998        let deny = blocklist_syscall_numbers(sandbox);
999        // First-level sandbox: notif + deny filter with NEW_LISTENER.
1000        //
1001        // Caller-supplied handlers must have their syscalls registered in
1002        // the BPF filter, otherwise the kernel never raises a notification for
1003        // them and the handler silently never fires.  We merge `extra_syscalls`
1004        // into the notif list and dedup so each syscall produces exactly one
1005        // JEQ in the assembled program.
1006        let mut notif = notif_syscalls(sandbox, sandbox_name);
1007        if !extra_syscalls.is_empty() {
1008            notif.extend_from_slice(extra_syscalls);
1009        }
1010        // Argv-safety gate (companion to the policy_fn case in
1011        // notif_syscalls): a handler bound to execve/execveat
1012        // can call `read_child_mem` to inspect argv, so the supervisor
1013        // must register newly forked children before they can run user
1014        // code — same invariant policy_fn relies on. Bare fork(2)
1015        // therefore needs to be intercepted here too.
1016        let exec_extra = extra_syscalls.iter().any(|&n| {
1017            n == libc::SYS_execve as u32 || n == libc::SYS_execveat as u32
1018        });
1019        if exec_extra {
1020            arch::push_optional_syscall(&mut notif, arch::SYS_FORK);
1021        }
1022        notif.sort_unstable();
1023        notif.dedup();
1024        let filter = match bpf::assemble_filter(&notif, &deny, &args) {
1025            Ok(f) => f,
1026            Err(e) => fail!(format!("seccomp assemble: {}", e)),
1027        };
1028        let notif_fd = match bpf::install_filter(&filter) {
1029            Ok(fd) => fd,
1030            Err(e) => {
1031                // EBUSY here means another seccomp filter on this task already
1032                // owns the SECCOMP_FILTER_FLAG_NEW_LISTENER slot. The kernel
1033                // permits at most one listener per task — to nest, opt this
1034                // sandbox out of the supervisor via `Sandbox::no_supervisor`
1035                // (or the CLI's `--no-supervisor` flag).
1036                if e.raw_os_error() == Some(libc::EBUSY) {
1037                    let _ = write!(
1038                        std::io::stderr(),
1039                        "sandlock child: seccomp install: {} (an outer sandbox already owns the \
1040                         seccomp listener; pass --no-supervisor or Sandbox::no_supervisor(true) \
1041                         on this sandbox to nest)\n",
1042                        e,
1043                    );
1044                    unsafe { libc::_exit(127) };
1045                }
1046                fail!(format!("seccomp install: {}", e));
1047            }
1048        };
1049        keep_fd = notif_fd.as_raw_fd();
1050        if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), keep_fd as u32) {
1051            fail!(format!("write notif fd: {}", e));
1052        }
1053        std::mem::forget(notif_fd);
1054    }
1055
1056    // 10. Wait for parent to signal ready
1057    match read_u32_fd(pipes.ready_r.as_raw_fd()) {
1058        Ok(_) => {}
1059        Err(e) => fail!(format!("read ready signal: {}", e)),
1060    }
1061
1062    // 12. Close all fds above stderr (always on for isolation)
1063    let mut fds_to_keep: Vec<RawFd> = keep_fds.to_vec();
1064    if keep_fd >= 0 {
1065        fds_to_keep.push(keep_fd);
1066    }
1067    close_fds_above(2, &fds_to_keep);
1068
1069    // 13. Apply environment
1070    if sandbox.clean_env {
1071        // Clear all env vars first
1072        for (key, _) in std::env::vars_os() {
1073            std::env::remove_var(&key);
1074        }
1075    }
1076    for (key, value) in &sandbox.env {
1077        std::env::set_var(key, value);
1078    }
1079
1080    // 13b. GPU device visibility
1081    if let Some(ref devices) = sandbox.gpu_devices {
1082        if !devices.is_empty() {
1083            let vis = devices.iter().map(|d| d.to_string()).collect::<Vec<_>>().join(",");
1084            std::env::set_var("CUDA_VISIBLE_DEVICES", &vis);
1085            std::env::set_var("ROCR_VISIBLE_DEVICES", &vis);
1086        }
1087        // Empty list = all GPUs visible, don't set env vars
1088    }
1089
1090    // 14. exec
1091    debug_assert!(!cmd.is_empty(), "cmd must not be empty");
1092    let argv_ptrs: Vec<*const libc::c_char> = cmd
1093        .iter()
1094        .map(|s| s.as_ptr())
1095        .chain(std::iter::once(std::ptr::null()))
1096        .collect();
1097
1098    if sandbox.chroot.is_some() {
1099        // With chroot the seccomp handler rewrites the filename to a host path
1100        // (or /proc/self/fd/N).  Pass a separate PATH_MAX buffer as the `file`
1101        // argument so the rewrite does not corrupt argv[0] — which must stay as
1102        // the original command name (e.g. busybox uses argv[0] for applet
1103        // detection).  execvp still handles PATH lookup for bare command names.
1104        let mut exec_path = vec![0u8; libc::PATH_MAX as usize];
1105        let orig = cmd[0].as_bytes_with_nul();
1106        exec_path[..orig.len()].copy_from_slice(orig);
1107
1108        unsafe {
1109            libc::execvp(
1110                exec_path.as_ptr() as *const libc::c_char,
1111                argv_ptrs.as_ptr(),
1112            )
1113        };
1114    } else {
1115        unsafe { libc::execvp(argv_ptrs[0], argv_ptrs.as_ptr()) };
1116    }
1117
1118    // If we get here, exec failed
1119    fail!(format!("execvp '{}'", cmd[0].to_string_lossy()));
1120}
1121
1122// ============================================================
1123// Tests
1124// ============================================================
1125
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that a syscall-number collection contains `$nr` (cast to `u32`).
    macro_rules! assert_has {
        ($nrs:expr, $nr:expr) => {
            assert!($nrs.contains(&($nr as u32)))
        };
    }

    /// Assert that a syscall-number collection does NOT contain `$nr`.
    macro_rules! assert_lacks {
        ($nrs:expr, $nr:expr) => {
            assert!(!$nrs.contains(&($nr as u32)))
        };
    }

    /// Push `val` through a fresh notif pipe and return what comes out.
    fn pipe_roundtrip(val: u32) -> u32 {
        let pipes = PipePair::new().expect("pipe creation failed");
        write_u32_fd(pipes.notif_w.as_raw_fd(), val).expect("write failed");
        read_u32_fd(pipes.notif_r.as_raw_fd()).expect("read failed")
    }

    #[test]
    fn test_pipe_pair_creation() {
        let pipes = PipePair::new().expect("pipe creation failed");
        let fds = [
            pipes.notif_r.as_raw_fd(),
            pipes.notif_w.as_raw_fd(),
            pipes.ready_r.as_raw_fd(),
            pipes.ready_w.as_raw_fd(),
        ];
        // Every descriptor is valid (non-negative)...
        for fd in fds {
            assert!(fd >= 0);
        }
        // ...and no two of the four are the same.
        for (idx, first) in fds.iter().enumerate() {
            for second in &fds[idx + 1..] {
                assert_ne!(first, second);
            }
        }
    }

    #[test]
    fn test_write_read_u32() {
        assert_eq!(pipe_roundtrip(42u32), 42u32);
    }

    #[test]
    fn test_write_read_u32_large() {
        assert_eq!(pipe_roundtrip(0xDEAD_BEEFu32), 0xDEAD_BEEFu32);
    }

    #[test]
    fn test_notif_syscalls_always_has_clone() {
        let sandbox = Sandbox::builder().build().unwrap();
        let nrs = notif_syscalls(&sandbox, None);
        assert_has!(nrs, libc::SYS_clone);
        assert_has!(nrs, libc::SYS_clone3);
        if let Some(vfork) = arch::SYS_VFORK {
            assert_has!(nrs, vfork);
        }
        // Bare fork(2) is only intercepted when a policy_fn is active (see
        // notif_syscalls).  The default policy has none, so fork must stay
        // out of the filter and fork-heavy loops keep bypassing the
        // supervisor.
        if let Some(fork) = arch::SYS_FORK {
            assert_lacks!(nrs, fork);
        }
    }

    #[test]
    fn test_notif_syscalls_fork_gated_on_policy_fn() {
        let Some(fork) = arch::SYS_FORK else { return };
        let sandbox = Sandbox::builder()
            .policy_fn(|_event, _ctx| crate::policy_fn::Verdict::Allow)
            .build()
            .unwrap();
        let nrs = notif_syscalls(&sandbox, None);
        assert_has!(nrs, fork);
    }

    #[test]
    fn test_notif_syscalls_memory() {
        // shmget is only notified when SysV IPC is allowed.  Otherwise it
        // sits on the kernel blocklist, and notifying it would bypass the
        // deny (notif JEQs precede deny JEQs in the BPF layout).
        let sandbox = Sandbox::builder()
            .max_memory(crate::sandbox::ByteSize::mib(256))
            .extra_allow_syscalls(vec!["sysv_ipc".into()])
            .build()
            .unwrap();
        let nrs = notif_syscalls(&sandbox, None);
        assert_has!(nrs, libc::SYS_mmap);
        assert_has!(nrs, libc::SYS_munmap);
        assert_has!(nrs, libc::SYS_brk);
        assert_has!(nrs, libc::SYS_mremap);
        assert_has!(nrs, libc::SYS_shmget);
    }

    #[test]
    fn test_notif_syscalls_memory_excludes_shmget_when_sysv_ipc_denied() {
        // max_memory set but allows_sysv_ipc() == false (the default):
        // shmget must NOT be notified, or the filter would route it to
        // RET_USER_NOTIF before reaching the deny JEQ and silently bypass
        // the kernel-level deny.
        let sandbox = Sandbox::builder()
            .max_memory(crate::sandbox::ByteSize::mib(256))
            .build()
            .unwrap();
        let nrs = notif_syscalls(&sandbox, None);
        assert_lacks!(nrs, libc::SYS_shmget);
        // The remaining memory syscalls are not denied, so they stay notified.
        assert_has!(nrs, libc::SYS_mmap);
        assert_has!(nrs, libc::SYS_brk);
    }

    #[test]
    fn test_notif_syscalls_net() {
        let sandbox = Sandbox::builder()
            .net_allow("example.com:443")
            .build()
            .unwrap();
        let nrs = notif_syscalls(&sandbox, None);
        assert_has!(nrs, libc::SYS_connect);
        assert_has!(nrs, libc::SYS_sendto);
        assert_has!(nrs, libc::SYS_sendmsg);
        assert_has!(nrs, libc::SYS_sendmmsg);
    }

    #[test]
    fn test_notif_syscalls_sandbox_name_enables_hostname_virtualization() {
        let sandbox = Sandbox::builder().build().unwrap();
        let nrs = notif_syscalls(&sandbox, Some("api.local"));
        assert_has!(nrs, libc::SYS_uname);
        assert_has!(nrs, libc::SYS_openat);
    }

    /// SYS_faccessat2 (439) must be in the notification filter for both
    /// chroot and COW modes — glibc 2.33+ uses it instead of faccessat.
    #[test]
    fn test_notif_syscalls_faccessat2() {
        const SYS_FACCESSAT2: u32 = 439;

        // Shared checks for one sandbox configuration.
        let check = |sandbox: &Sandbox, missing_msg: &str| {
            let nrs = notif_syscalls(sandbox, None);
            assert_has!(nrs, libc::SYS_faccessat);
            assert!(nrs.contains(&SYS_FACCESSAT2), "{}", missing_msg);
        };

        // Chroot mode
        let chroot_sandbox = Sandbox::builder()
            .chroot("/tmp")
            .build()
            .unwrap();
        check(
            &chroot_sandbox,
            "chroot notif filter must include SYS_faccessat2 (439)",
        );

        // COW mode
        let cow_sandbox = Sandbox::builder()
            .workdir("/tmp")
            .build()
            .unwrap();
        check(
            &cow_sandbox,
            "COW notif filter must include SYS_faccessat2 (439)",
        );
    }

    #[test]
    fn test_blocklist_syscall_numbers_default() {
        let sandbox = Sandbox::builder().build().unwrap();
        let nrs = blocklist_syscall_numbers(&sandbox);
        // Core entries such as mount, ptrace and bpf.
        assert_has!(nrs, libc::SYS_mount);
        assert_has!(nrs, libc::SYS_ptrace);
        assert_has!(nrs, libc::SYS_bpf);
        // SysV IPC is denied by default (no IPC namespace in sandlock).
        assert_has!(nrs, libc::SYS_shmget);
        assert_has!(nrs, libc::SYS_shmat);
        assert_has!(nrs, libc::SYS_msgget);
        assert_has!(nrs, libc::SYS_semget);
        // nfsservctl has no libc constant, so it is skipped.
        assert!(!nrs.is_empty());
    }

    #[test]
    fn test_blocklist_syscall_numbers_custom() {
        let sandbox = Sandbox::builder()
            .extra_deny_syscalls(vec!["mount".into(), "ptrace".into()])
            .build()
            .unwrap();
        let nrs = blocklist_syscall_numbers(&sandbox);
        // A user-supplied blocklist still has SysV IPC appended
        // (allows_sysv_ipc() defaults to false).
        assert_has!(nrs, libc::SYS_mount);
        assert_has!(nrs, libc::SYS_ptrace);
        assert_has!(nrs, libc::SYS_shmget);
    }

    #[test]
    fn test_blocklist_syscall_numbers_custom_with_sysv_ipc_allowed() {
        let sandbox = Sandbox::builder()
            .extra_deny_syscalls(vec!["mount".into(), "ptrace".into()])
            .extra_allow_syscalls(vec!["sysv_ipc".into()])
            .build()
            .unwrap();
        let nrs = blocklist_syscall_numbers(&sandbox);
        // Default blocklist plus user extras, without the SysV IPC append.
        assert_has!(nrs, libc::SYS_mount);
        assert_has!(nrs, libc::SYS_ptrace);
        assert_has!(nrs, libc::SYS_bpf);
        assert_lacks!(nrs, libc::SYS_shmget);
    }

    #[test]
    fn test_blocklist_syscall_numbers_default_with_sysv_ipc_allowed() {
        let sandbox = Sandbox::builder()
            .extra_allow_syscalls(vec!["sysv_ipc".into()])
            .build()
            .unwrap();
        let nrs = blocklist_syscall_numbers(&sandbox);
        // The default blocklist is still present, but SysV IPC is permitted.
        assert_has!(nrs, libc::SYS_mount);
        assert_lacks!(nrs, libc::SYS_shmget);
        assert_lacks!(nrs, libc::SYS_msgget);
        assert_lacks!(nrs, libc::SYS_semget);
    }

    #[test]
    fn test_no_supervisor_blocklist_includes_sysv_ipc_by_default() {
        let sandbox = Sandbox::builder().build().unwrap();
        let nrs = no_supervisor_blocklist_syscall_numbers(&sandbox);
        assert_has!(nrs, libc::SYS_shmget);
        assert_has!(nrs, libc::SYS_msgget);
        assert_has!(nrs, libc::SYS_semget);
    }

    #[test]
    fn test_no_supervisor_blocklist_excludes_sysv_ipc_when_allowed() {
        let sandbox = Sandbox::builder()
            .extra_allow_syscalls(vec!["sysv_ipc".into()])
            .build()
            .unwrap();
        let nrs = no_supervisor_blocklist_syscall_numbers(&sandbox);
        assert_lacks!(nrs, libc::SYS_shmget);
        assert_lacks!(nrs, libc::SYS_msgget);
        assert_lacks!(nrs, libc::SYS_semget);
    }

    #[test]
    fn test_arg_filters_has_clone_ioctl_prctl_socket() {
        use crate::sys::structs::{
            BPF_JEQ, BPF_JSET, BPF_JMP, BPF_K,
        };
        let sandbox = Sandbox::builder().build().unwrap();
        let filters = arg_filters(&sandbox);

        let jeq_code = BPF_JMP | BPF_JEQ | BPF_K;
        let jset_code = BPF_JMP | BPF_JSET | BPF_K;
        let has_jeq = |k: u32| filters.iter().any(|f| f.code == jeq_code && f.k == k);
        let has_jset = |k: u32| filters.iter().any(|f| f.code == jset_code && f.k == k);

        // clone: JEQ on the syscall nr, JSET on the namespace flags
        assert!(has_jeq(libc::SYS_clone as u32));
        assert!(has_jset(CLONE_NS_FLAGS as u32));
        // ioctl: JEQ on the syscall nr and on each checked request value
        // (TIOCSTI, TIOCLINUX, SIOCGIFCONF, SIOCETHTOOL)
        assert!(has_jeq(libc::SYS_ioctl as u32));
        assert!(has_jeq(TIOCSTI as u32));
        assert!(has_jeq(TIOCLINUX as u32));
        assert!(has_jeq(SIOCGIFCONF as u32));
        assert!(has_jeq(SIOCETHTOOL as u32));
        // prctl: JEQ on the syscall nr and on PR_SET_DUMPABLE
        assert!(has_jeq(libc::SYS_prctl as u32));
        assert!(has_jeq(PR_SET_DUMPABLE));
    }

    #[test]
    fn test_arg_filters_raw_sockets() {
        use crate::sys::structs::{BPF_ALU, BPF_AND, BPF_JEQ, BPF_JMP, BPF_K};
        // Raw sockets are blocked by default — no `icmp-raw://*` rule.
        let sandbox = Sandbox::builder().build().unwrap();
        let filters = arg_filters(&sandbox);

        let jeq_code = BPF_JMP | BPF_JEQ | BPF_K;
        let and_code = BPF_ALU | BPF_AND | BPF_K;
        let has_jeq = |k: u32| filters.iter().any(|f| f.code == jeq_code && f.k == k);

        // Address-family checks
        assert!(has_jeq(AF_INET));
        assert!(has_jeq(AF_INET6));
        // Socket type is masked with SOCK_TYPE_MASK before the comparison
        assert!(filters.iter().any(|f| f.code == and_code && f.k == SOCK_TYPE_MASK));
        // ...and then compared against SOCK_RAW
        assert!(has_jeq(SOCK_RAW));
    }

    #[test]
    fn test_arg_filters_udp_denied_by_default() {
        use crate::sys::structs::{BPF_JEQ, BPF_JMP, BPF_K};
        // UDP is denied by default — no `udp://...` rule in net_allow.
        let sandbox = Sandbox::builder().build().unwrap();
        let filters = arg_filters(&sandbox);
        let jeq_code = BPF_JMP | BPF_JEQ | BPF_K;
        // A JEQ against SOCK_DGRAM must be present.
        assert!(filters.iter().any(|f| f.code == jeq_code && f.k == SOCK_DGRAM));
    }

    #[test]
    fn test_syscall_name_to_nr_covers_defaults() {
        // Every name in DEFAULT_BLOCKLIST_SYSCALLS should resolve unless the
        // running architecture does not expose that syscall.
        let expected_unresolved: &[&str] = &[
            "nfsservctl",
            #[cfg(target_arch = "aarch64")]
            "ioperm",
            #[cfg(target_arch = "aarch64")]
            "iopl",
        ];
        let mut skipped = 0;
        for name in DEFAULT_BLOCKLIST_SYSCALLS {
            if syscall_name_to_nr(name).is_some() {
                continue;
            }
            assert!(
                expected_unresolved.contains(name),
                "unexpected unresolved syscall: {}",
                name
            );
            skipped += 1;
        }
        assert_eq!(skipped, expected_unresolved.len());
    }
}