Skip to main content

sandlock_core/
context.rs

1// Fork + confinement sequence: child-side Landlock + seccomp application
2// and parent-child pipe synchronization.
3
4use std::ffi::CString;
5use std::io;
6use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd};
7
8use syscalls::{Sysno, SysnoSet};
9
10use crate::arch;
11use crate::sandbox::Sandbox;
12use crate::seccomp::bpf::{self, stmt, jump};
13use crate::sys::structs::{
14    AF_INET, AF_INET6,
15    BPF_ABS, BPF_ALU, BPF_AND, BPF_JEQ, BPF_JSET, BPF_JMP, BPF_K, BPF_LD, BPF_RET, BPF_W,
16    CLONE_NS_FLAGS, DEFAULT_BLOCKLIST_SYSCALLS, EPERM, SYSV_IPC_BLOCKLIST_SYSCALLS,
17    SECCOMP_RET_ALLOW, SECCOMP_RET_ERRNO,
18    SIOCETHTOOL, SIOCGIFADDR, SIOCGIFBRDADDR, SIOCGIFCONF, SIOCGIFDSTADDR,
19    SIOCGIFFLAGS, SIOCGIFHWADDR, SIOCGIFINDEX, SIOCGIFNAME, SIOCGIFNETMASK,
20    SOCK_DGRAM, SOCK_RAW, SOCK_TYPE_MASK, TIOCLINUX, TIOCSTI,
21    PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER,
22    OFFSET_ARGS0_LO, OFFSET_ARGS1_LO, OFFSET_ARGS2_LO, OFFSET_ARGS3_LO, OFFSET_NR,
23    SockFilter,
24};
25
26// ============================================================
27// Pipe pair for parent-child synchronization
28// ============================================================
29
30/// Pipes for parent-child communication after fork().
31pub struct PipePair {
32    /// Parent reads the notif fd number written by the child.
33    pub notif_r: OwnedFd,
34    /// Child writes the notif fd number to the parent.
35    pub notif_w: OwnedFd,
36    /// Child reads the "supervisor ready" signal from the parent.
37    pub ready_r: OwnedFd,
38    /// Parent writes the "supervisor ready" signal to the child.
39    pub ready_w: OwnedFd,
40}
41
42impl PipePair {
43    /// Create two pipe pairs using `pipe2(O_CLOEXEC)`.
44    pub fn new() -> io::Result<Self> {
45        let mut notif_fds = [0i32; 2];
46        let mut ready_fds = [0i32; 2];
47
48        // SAFETY: pipe2 with valid pointers and O_CLOEXEC
49        let ret = unsafe { libc::pipe2(notif_fds.as_mut_ptr(), libc::O_CLOEXEC) };
50        if ret < 0 {
51            return Err(io::Error::last_os_error());
52        }
53
54        let ret = unsafe { libc::pipe2(ready_fds.as_mut_ptr(), libc::O_CLOEXEC) };
55        if ret < 0 {
56            // Close the first pair on failure
57            unsafe {
58                libc::close(notif_fds[0]);
59                libc::close(notif_fds[1]);
60            }
61            return Err(io::Error::last_os_error());
62        }
63
64        // SAFETY: pipe2 returned valid fds
65        Ok(PipePair {
66            notif_r: unsafe { OwnedFd::from_raw_fd(notif_fds[0]) },
67            notif_w: unsafe { OwnedFd::from_raw_fd(notif_fds[1]) },
68            ready_r: unsafe { OwnedFd::from_raw_fd(ready_fds[0]) },
69            ready_w: unsafe { OwnedFd::from_raw_fd(ready_fds[1]) },
70        })
71    }
72}
73
74// ============================================================
75// Pipe I/O helpers
76// ============================================================
77
78/// Write a `u32` as 4 little-endian bytes to a raw fd.
79pub(crate) fn write_u32_fd(fd: RawFd, val: u32) -> io::Result<()> {
80    let buf = val.to_le_bytes();
81    let mut written = 0usize;
82    while written < 4 {
83        let ret = unsafe {
84            libc::write(
85                fd,
86                buf[written..].as_ptr() as *const libc::c_void,
87                4 - written,
88            )
89        };
90        if ret < 0 {
91            return Err(io::Error::last_os_error());
92        }
93        written += ret as usize;
94    }
95    Ok(())
96}
97
98/// Read a `u32` (4 little-endian bytes, blocking) from a raw fd.
99pub(crate) fn read_u32_fd(fd: RawFd) -> io::Result<u32> {
100    let mut buf = [0u8; 4];
101    let mut total = 0usize;
102    while total < 4 {
103        let ret = unsafe {
104            libc::read(
105                fd,
106                buf[total..].as_mut_ptr() as *mut libc::c_void,
107                4 - total,
108            )
109        };
110        if ret < 0 {
111            return Err(io::Error::last_os_error());
112        }
113        if ret == 0 {
114            return Err(io::Error::new(
115                io::ErrorKind::UnexpectedEof,
116                "pipe closed before 4 bytes read",
117            ));
118        }
119        total += ret as usize;
120    }
121    Ok(u32::from_le_bytes(buf))
122}
123
124#[cfg(test)]
125use crate::seccomp::syscall::syscall_name_to_nr;
126
127// ============================================================
128// Sandbox → syscall lists
129// ============================================================
130
/// Accumulator for syscall numbers that hands back a sorted, duplicate-free
/// `Vec<u32>` from [`SyscallList::finish`].
///
/// Numbers come in as `i64` and are stored with a plain `as u32` cast.
#[derive(Default)]
struct SyscallList {
    nrs: Vec<u32>,
}

impl SyscallList {
    /// Build a list pre-populated with `syscalls`.
    fn with(syscalls: &[i64]) -> Self {
        Self {
            nrs: syscalls.iter().map(|&nr| nr as u32).collect(),
        }
    }

    /// Append a single syscall number.
    fn push(&mut self, nr: i64) {
        self.nrs.push(nr as u32);
    }

    /// Append every number in `syscalls`.
    fn extend(&mut self, syscalls: &[i64]) {
        for &nr in syscalls {
            self.push(nr);
        }
    }

    /// Append `nr` when it is `Some`; `None` is a no-op.
    fn push_optional(&mut self, nr: Option<i64>) {
        self.nrs.extend(nr.map(|nr| nr as u32));
    }

    /// Consume the list, returning its numbers ascending and deduplicated.
    fn finish(self) -> Vec<u32> {
        let Self { mut nrs } = self;
        nrs.sort_unstable();
        nrs.dedup();
        nrs
    }
}
164
// Process creation and reaping. `notif_syscalls` seeds its list with these
// regardless of policy.
const BASE_NOTIF_SYSCALLS: &[i64] = &[
    libc::SYS_clone,
    libc::SYS_clone3,
    libc::SYS_wait4,
    libc::SYS_waitid,
];

// Address-space syscalls; added by `notif_syscalls` only when
// `policy.max_memory` is set.
const MEMORY_NOTIF_SYSCALLS: &[i64] = &[
    libc::SYS_mmap,
    libc::SYS_munmap,
    libc::SYS_brk,
    libc::SYS_mremap,
];

// Outbound/bind socket syscalls; added by `notif_syscalls` only when
// `needs_network_supervision(policy)` is true.
const NETWORK_POLICY_SYSCALLS: &[i64] = &[
    libc::SYS_connect,
    libc::SYS_sendto,
    libc::SYS_sendmsg,
    libc::SYS_sendmmsg,
    libc::SYS_bind,
];

// Added when `policy.random_seed` is set.
// Also intercept openat so the supervisor can re-patch vDSO after exec.
const RANDOM_NOTIF_SYSCALLS: &[i64] = &[libc::SYS_getrandom, libc::SYS_openat];

// Added when `policy.time_start` is set.
// Also intercept openat so the supervisor gets a notification after exec
// and can re-patch the vDSO (exec replaces vDSO with a fresh copy).
const TIME_NOTIF_SYSCALLS: &[i64] = &[
    libc::SYS_clock_nanosleep,
    libc::SYS_timerfd_settime,
    libc::SYS_timer_settime,
    libc::SYS_openat,
];
198
199// /proc virtualization + /etc/hosts virtualization (always on).
200//
201// `openat` carries the simple `(AT_FDCWD, "/proc/...")` and
202// `(AT_FDCWD, "/etc/hosts")` spellings; `openat2` is the same shape
203// on newer libcs; legacy `open(path, ...)` is the same path without a
204// dirfd. The handlers normalize all three into a single absolute path
205// check, so we have to put every variant on the notif list -- otherwise
206// a caller that picks `open` or `openat2` slips past virtualization
207// and reads the real on-disk file.
208fn procfs_hosts_notif_syscalls() -> Vec<i64> {
209    let mut v = vec![libc::SYS_openat, arch::SYS_OPENAT2, libc::SYS_getdents64];
210    v.extend([arch::sys_open(), arch::sys_getdents()].into_iter().flatten());
211    v
212}
213
// Netlink virtualization (always on; `notif_syscalls` adds this list
// unconditionally):
//   socket, bind, getsockname -- swap in a unix socketpair for AF_NETLINK
//   recvfrom, recvmsg         -- zero msg_name so glibc accepts the reply
//                                (kernel only writes sun_family on unix
//                                 recvmsg, leaving nl_pid uninitialized)
//   close                     -- unregister (pid, fd) so reuse doesn't
//                                collide with the cookie set
// Send traffic flows through the real socketpair untouched.
const NETLINK_NOTIF_SYSCALLS: &[i64] = &[
    libc::SYS_socket,
    libc::SYS_bind,
    libc::SYS_getsockname,
    libc::SYS_recvfrom,
    libc::SYS_recvmsg,
    libc::SYS_close,
];
230
231fn cow_path_syscalls() -> Vec<i64> {
232    let mut v = vec![
233        libc::SYS_openat,
234        libc::SYS_execve,
235        libc::SYS_execveat,
236        libc::SYS_unlinkat,
237        libc::SYS_mkdirat,
238        libc::SYS_renameat2,
239        libc::SYS_symlinkat,
240        libc::SYS_linkat,
241        libc::SYS_fchmodat,
242        libc::SYS_fchownat,
243        libc::SYS_truncate,
244        libc::SYS_utimensat,
245        libc::SYS_newfstatat,
246        libc::SYS_statx,
247        libc::SYS_faccessat,
248        arch::SYS_FACCESSAT2,
249        libc::SYS_readlinkat,
250        libc::SYS_getdents64,
251        libc::SYS_chdir,
252        libc::SYS_getcwd,
253    ];
254    v.extend(
255        [
256            arch::sys_open(),
257            arch::sys_unlink(),
258            arch::sys_rmdir(),
259            arch::sys_mkdir(),
260            arch::sys_rename(),
261            arch::sys_symlink(),
262            arch::sys_link(),
263            arch::sys_chmod(),
264            arch::sys_chown(),
265            arch::sys_lchown(),
266            arch::sys_stat(),
267            arch::sys_lstat(),
268            arch::sys_access(),
269            arch::sys_readlink(),
270            arch::sys_getdents(),
271        ]
272        .into_iter()
273        .flatten(),
274    );
275    v
276}
277
278fn chroot_path_syscalls() -> Vec<i64> {
279    let mut v = vec![
280        libc::SYS_openat,
281        libc::SYS_execve,
282        libc::SYS_execveat,
283        libc::SYS_unlinkat,
284        libc::SYS_mkdirat,
285        libc::SYS_renameat2,
286        libc::SYS_symlinkat,
287        libc::SYS_linkat,
288        libc::SYS_fchmodat,
289        libc::SYS_fchownat,
290        libc::SYS_truncate,
291        libc::SYS_newfstatat,
292        libc::SYS_statx,
293        libc::SYS_faccessat,
294        arch::SYS_FACCESSAT2,
295        libc::SYS_readlinkat,
296        libc::SYS_getdents64,
297        libc::SYS_chdir,
298        libc::SYS_getcwd,
299        libc::SYS_statfs,
300        libc::SYS_utimensat,
301        // xattr family (path-based): must be mediated so that paths under an
302        // fs_mount/chroot resolve to the real backing file rather than the
303        // empty mount point (issue #84). The fd-based f*xattr variants need
304        // no mediation — their fd already points at the resolved file.
305        libc::SYS_getxattr,
306        libc::SYS_lgetxattr,
307        libc::SYS_setxattr,
308        libc::SYS_lsetxattr,
309        libc::SYS_listxattr,
310        libc::SYS_llistxattr,
311        libc::SYS_removexattr,
312        libc::SYS_lremovexattr,
313    ];
314    v.extend(
315        [
316            arch::sys_open(),
317            arch::sys_stat(),
318            arch::sys_lstat(),
319            arch::sys_access(),
320            arch::sys_readlink(),
321            arch::sys_getdents(),
322            arch::sys_unlink(),
323            arch::sys_rmdir(),
324            arch::sys_mkdir(),
325            arch::sys_rename(),
326            arch::sys_symlink(),
327            arch::sys_link(),
328            arch::sys_chmod(),
329            arch::sys_chown(),
330            arch::sys_lchown(),
331        ]
332        .into_iter()
333        .flatten(),
334    );
335    v
336}
337
338fn fs_denied_path_syscalls() -> Vec<i64> {
339    let mut v = vec![
340        libc::SYS_openat,
341        libc::SYS_execve,
342        libc::SYS_execveat,
343        libc::SYS_linkat,
344        libc::SYS_renameat2,
345        libc::SYS_symlinkat,
346    ];
347    v.extend(
348        [
349            arch::sys_open(),
350            arch::sys_link(),
351            arch::sys_rename(),
352            arch::sys_symlink(),
353        ]
354        .into_iter()
355        .flatten(),
356    );
357    v
358}
359
// Syscalls intercepted for event emission when a dynamic policy callback
// (`policy.policy_fn`) is configured.
const POLICY_EVENT_SYSCALLS: &[i64] = &[
    libc::SYS_openat,
    libc::SYS_connect,
    libc::SYS_sendto,
    libc::SYS_bind,
    libc::SYS_execve,
    libc::SYS_execveat,
];

// Syscalls intercepted when `policy.port_remap` is enabled.
const PORT_REMAP_SYSCALLS: &[i64] = &[
    libc::SYS_bind,
    libc::SYS_getsockname,
];
373
374fn needs_network_supervision(policy: &Sandbox) -> bool {
375    !policy.net_allow.is_empty()
376        || !policy.net_deny.is_empty()
377        || !policy.net_deny_bind.is_empty()
378        || policy.policy_fn.is_some()
379        || !policy.http_allow.is_empty()
380        || !policy.http_deny.is_empty()
381}
382
383/// Determine which syscalls need `SECCOMP_RET_USER_NOTIF`.
384pub fn notif_syscalls(policy: &Sandbox, sandbox_name: Option<&str>) -> Vec<u32> {
385    let mut nrs = SyscallList::with(BASE_NOTIF_SYSCALLS);
386    nrs.push_optional(arch::sys_vfork());
387
388    // Bare fork(2) carries none of the namespace/process-limit risk of
389    // clone/clone3 and was historically left out of the BPF filter so
390    // hot fork-loops (COW map-reduce) bypass the supervisor entirely.
391    // It only needs interception when policy_fn is active, so the
392    // supervisor can register the new child via ptrace fork events
393    // before it can run user code (argv-safety invariant).
394    if policy.policy_fn.is_some() {
395        nrs.push_optional(arch::sys_fork());
396    }
397
398    if policy.max_memory.is_some() {
399        nrs.extend(MEMORY_NOTIF_SYSCALLS);
400        // shmget is in notif only when SysV IPC is allowed. The BPF
401        // layout puts notif JEQs before deny JEQs, so a syscall on
402        // both lists would notify (RET_USER_NOTIF) and silently
403        // bypass the kernel-level deny. When extra_allow_syscalls does not contain "sysv_ipc",
404        // shmget belongs only on the blocklist.
405        if policy.allows_sysv_ipc() {
406            nrs.push(libc::SYS_shmget);
407        }
408    }
409
410    if needs_network_supervision(policy) {
411        nrs.extend(NETWORK_POLICY_SYSCALLS);
412    }
413
414    if policy.random_seed.is_some() {
415        nrs.extend(RANDOM_NOTIF_SYSCALLS);
416    }
417
418    if policy.time_start.is_some() {
419        nrs.extend(TIME_NOTIF_SYSCALLS);
420    }
421
422    nrs.extend(&procfs_hosts_notif_syscalls());
423    nrs.extend(NETLINK_NOTIF_SYSCALLS);
424
425    // Virtualize sched_getaffinity so nproc/sysconf agree with /proc/cpuinfo
426    if policy.num_cpus.is_some() {
427        nrs.push(libc::SYS_sched_getaffinity);
428    }
429    if sandbox_name.is_some() {
430        nrs.extend(&[libc::SYS_uname, libc::SYS_openat]);
431    }
432
433    // COW filesystem interception (seccomp-based, unprivileged)
434    if policy.workdir.is_some() {
435        nrs.extend(&cow_path_syscalls());
436    }
437
438    // Chroot path interception
439    if policy.chroot.is_some() {
440        nrs.extend(&chroot_path_syscalls());
441    }
442
443    // Explicit deny-paths need path-bearing syscalls intercepted.
444    if !policy.fs_denied.is_empty() {
445        nrs.extend(&fs_denied_path_syscalls());
446    }
447
448    // Dynamic policy callback — intercept key syscalls for event emission.
449    if policy.policy_fn.is_some() {
450        nrs.extend(POLICY_EVENT_SYSCALLS);
451    }
452
453    // Port remapping
454    if policy.port_remap {
455        nrs.extend(PORT_REMAP_SYSCALLS);
456    }
457
458    nrs.finish()
459}
460
461/// Resolve `base` syscall names plus policy extras (and SysV IPC syscalls when
462/// `policy.allows_sysv_ipc()` is false) to a deduplicated, ascending list of
463/// numbers for the current architecture.
464///
465/// A `SysnoSet` accumulates the membership: it dedups inherently (so SysV IPC
466/// folds in with a plain `insert`) and iterates in ascending syscall order.
467/// Names that do not exist on this architecture resolve to nothing and are
468/// skipped, so the result stays arch-correct.
469fn resolve_blocklist(base: &[&str], policy: &Sandbox) -> Vec<u32> {
470    let mut set: SysnoSet = base
471        .iter()
472        .copied()
473        .chain(policy.extra_deny_syscalls.iter().map(String::as_str))
474        .filter_map(|n| n.parse::<Sysno>().ok())
475        .collect();
476    if !policy.allows_sysv_ipc() {
477        for name in SYSV_IPC_BLOCKLIST_SYSCALLS {
478            if let Ok(sysno) = name.parse::<Sysno>() {
479                set.insert(sysno);
480            }
481        }
482    }
483    set.iter().map(|s| s.id() as u32).collect()
484}
485
486/// Resolve `NO_SUPERVISOR_BLOCKLIST_SYSCALLS` names to numbers, plus
487/// SysV IPC syscalls when `policy.allows_sysv_ipc()` is false.
488pub fn no_supervisor_blocklist_syscall_numbers(policy: &Sandbox) -> Vec<u32> {
489    use crate::sys::structs::NO_SUPERVISOR_BLOCKLIST_SYSCALLS;
490    resolve_blocklist(NO_SUPERVISOR_BLOCKLIST_SYSCALLS, policy)
491}
492
493/// Resolve the default syscall blocklist plus policy extras to numbers.
494///
495/// SysV IPC syscalls are appended to the resolved blocklist when
496/// `policy.allows_sysv_ipc()` is false.
497pub fn blocklist_syscall_numbers(policy: &Sandbox) -> Vec<u32> {
498    resolve_blocklist(DEFAULT_BLOCKLIST_SYSCALLS, policy)
499}
500
/// Build argument-level seccomp filter instructions matching the Python
/// `_build_arg_filters()` exactly.
///
/// Returns a `Vec<SockFilter>` containing self-contained BPF blocks for:
///   - clone: block namespace creation flags
///   - ioctl: block TIOCSTI, TIOCLINUX, SIOCGIF*, SIOCETHTOOL
///   - prctl: block PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER
///   - socket: block SOCK_RAW/SOCK_DGRAM on AF_INET/AF_INET6 (with type mask)
///   - wait4 / waitid: return ALLOW when WNOHANG or WNOWAIT is set
///
/// Each block reloads the syscall number itself and, when nothing matches,
/// jumps to the first instruction after the block, so the blocks are
/// emitted back to back. All jump offsets are hand-computed from the block
/// layout; adding or removing an instruction inside a block requires
/// updating the corresponding skip counts.
///
/// `policy` only influences the socket block: `net_allow` (UDP/ICMP rules)
/// and `net_deny` decide whether SOCK_DGRAM is blocked in addition to
/// SOCK_RAW.
pub fn arg_filters(policy: &Sandbox) -> Vec<SockFilter> {
    // Denied calls fail with EPERM.
    let ret_errno = SECCOMP_RET_ERRNO | EPERM as u32;
    let nr_clone = libc::SYS_clone as u32;
    let nr_ioctl = libc::SYS_ioctl as u32;
    let nr_prctl = libc::SYS_prctl as u32;
    let nr_socket = libc::SYS_socket as u32;

    let mut insns: Vec<SockFilter> = Vec::new();

    // --- clone: block namespace creation flags ---
    // 5 instructions:
    //   LD NR
    //   JEQ clone → +0, skip 3
    //   LD arg0
    //   JSET NS_FLAGS → +0, skip 1
    //   RET ERRNO
    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_clone, 0, 3));
    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
    insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, CLONE_NS_FLAGS as u32, 0, 1));
    insns.push(stmt(BPF_RET | BPF_K, ret_errno));

    // --- ioctl: block dangerous commands ---
    // Block terminal injection (TIOCSTI, TIOCLINUX) and network interface
    // enumeration ioctls (SIOCGIF*, SIOCETHTOOL) to complement NETLINK_ROUTE
    // virtualization.
    // Layout: LD NR, JEQ ioctl (skip 1 + N*2), LD arg1, [JEQ cmd, RET ERRNO] * N
    let dangerous_ioctls: &[u32] = &[
        TIOCSTI as u32,
        TIOCLINUX as u32,
        SIOCGIFNAME as u32,
        SIOCGIFCONF as u32,
        SIOCGIFFLAGS as u32,
        SIOCGIFADDR as u32,
        SIOCGIFDSTADDR as u32,
        SIOCGIFBRDADDR as u32,
        SIOCGIFNETMASK as u32,
        SIOCGIFHWADDR as u32,
        SIOCGIFINDEX as u32,
        SIOCETHTOOL as u32,
    ];
    let n_ioctls = dangerous_ioctls.len();
    // Non-ioctl syscalls jump over the arg load plus every (JEQ, RET) pair.
    let skip_count = (1 + n_ioctls * 2) as u8;
    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_ioctl, 0, skip_count));
    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
    for &cmd in dangerous_ioctls {
        // Match → fall into RET ERRNO; no match → skip it and try the next.
        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, cmd, 0, 1));
        insns.push(stmt(BPF_RET | BPF_K, ret_errno));
    }

    // --- prctl: block dangerous options ---
    // Layout: LD NR, JEQ prctl (skip 1 + N*2), LD arg0, [JEQ op, RET ERRNO] * N
    let dangerous_prctl_ops: &[u32] = &[PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER];
    let n_ops = dangerous_prctl_ops.len();
    let skip_count = (1 + n_ops * 2) as u8;
    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_prctl, 0, skip_count));
    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
    for &op in dangerous_prctl_ops {
        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, op, 0, 1));
        insns.push(stmt(BPF_RET | BPF_K, ret_errno));
    }

    // --- socket: block SOCK_RAW and/or SOCK_DGRAM on AF_INET/AF_INET6 ---
    //
    // SOCK_RAW is unconditionally denied. Sandlock does not expose
    // raw ICMP — packet-crafting capabilities aren't part of the XOA
    // threat model, and destination filtering at `sendto` can't be
    // honestly enforced for raw sockets (the agent controls the IP
    // header). Workloads that need ping should use the kernel ping
    // socket (SOCK_DGRAM + IPPROTO_ICMP) via an `icmp://...` rule.
    //
    // SOCK_DGRAM is denied unless a UDP or ICMP rule exists in
    // net_allow. The kernel ping socket uses SOCK_DGRAM with
    // IPPROTO_ICMP, so the same type bit gates both — destination
    // filtering at sendto (Phase 2) is what separates them per-rule.
    use crate::sandbox::Protocol;
    let any_udp_rule = policy.net_allow.iter().any(|r| r.protocol == Protocol::Udp);
    let any_icmp_rule = policy.net_allow.iter().any(|r| r.protocol == Protocol::Icmp);
    // `--net-deny` is default-allow, so UDP and the kernel ping socket
    // (both SOCK_DGRAM) must be creatable; without this the sandbox
    // could not even do DNS over UDP. Per-destination UDP/ICMP denial
    // is still enforced on the sendto on-behalf path via the DenyList.
    let net_deny_active = !policy.net_deny.is_empty();
    let mut blocked_types: Vec<u32> = Vec::new();
    blocked_types.push(SOCK_RAW);
    if !any_udp_rule && !any_icmp_rule && !net_deny_active {
        blocked_types.push(SOCK_DGRAM);
    }

    // NOTE: SOCK_RAW is always pushed above, so this guard is currently
    // always true; it only protects the layout math below if that changes.
    if !blocked_types.is_empty() {
        let n = blocked_types.len();
        // Instructions after domain checks: 2 (load+AND) + N (JEQs) + 1 (RET)
        let after_domain = 2 + n + 1;
        // Total after NR check: 3 (load domain + 2 JEQs) + after_domain
        let skip_all = (3 + after_domain) as u8;

        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_socket, 0, skip_all));
        // Load domain (arg0)
        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
        // AF_INET → skip to type check (jump over AF_INET6 check)
        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET, 1, 0));
        // AF_INET6 → type check; else skip everything remaining
        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET6, 0, after_domain as u8));
        // Load type (arg1) and mask off SOCK_NONBLOCK|SOCK_CLOEXEC
        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
        insns.push(stmt(BPF_ALU | BPF_AND | BPF_K, SOCK_TYPE_MASK));
        // Check each blocked type
        for (i, &sock_type) in blocked_types.iter().enumerate() {
            let remaining = n - i - 1;
            // Match → jump to RET ERRNO (skip 'remaining' JEQs ahead)
            // No match on last type → skip past RET ERRNO (jf=1)
            // No match on non-last → check next type (jf=0)
            let jf: u8 = if remaining == 0 { 1 } else { 0 };
            insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, sock_type, remaining as u8, jf));
        }
        // Deny return (reached by any matching JEQ)
        insns.push(stmt(BPF_RET | BPF_K, ret_errno));
    }

    // (raw ICMP carve-out removed — SOCK_RAW is unconditionally denied
    // by the blocked_types block above. Sandlock does not expose raw
    // sockets; ping uses the SOCK_DGRAM kernel ping socket via an
    // `icmp://...` rule, gated by host `ping_group_range`.)

    // --- wait4: skip notification for WNOHANG/WNOWAIT (non-blocking) ---
    // wait4(pid, status, options, rusage) — options is arg2
    // 5 instructions:
    //   LD NR
    //   JEQ wait4 → +0, skip 3
    //   LD arg2
    //   JSET (WNOHANG|WNOWAIT) → +0, skip 1
    //   RET ALLOW
    {
        let nr_wait4 = libc::SYS_wait4 as u32;
        let wnohang_or_wnowait = (libc::WNOHANG | 0x0100_0000/* WNOWAIT */) as u32;
        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_wait4, 0, 3));
        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS2_LO));
        insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, wnohang_or_wnowait, 0, 1));
        insns.push(stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
    }

    // --- waitid: skip notification for WNOHANG/WNOWAIT (non-blocking) ---
    // waitid(idtype, id, infop, options, rusage) — options is arg3
    // Same 5-instruction layout as the wait4 block, reading arg3 instead.
    {
        let nr_waitid = libc::SYS_waitid as u32;
        let wnohang_or_wnowait = (libc::WNOHANG | 0x0100_0000/* WNOWAIT */) as u32;
        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_waitid, 0, 3));
        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS3_LO));
        insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, wnohang_or_wnowait, 0, 1));
        insns.push(stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
    }

    insns
}
668
669// ============================================================
670// Close fds above threshold
671// ============================================================
672
673/// Close all file descriptors above `min_fd`, except those in `keep`.
674fn close_fds_above(min_fd: RawFd, keep: &[RawFd]) {
675    // Read /proc/self/fd to enumerate open fds.
676    // Collect all fd numbers first, then close them after dropping the directory
677    // iterator. This avoids closing the directory fd during iteration.
678    let fds_to_close: Vec<RawFd> = {
679        let dir = match std::fs::read_dir("/proc/self/fd") {
680            Ok(d) => d,
681            Err(_) => return,
682        };
683        dir.flatten()
684            .filter_map(|entry| {
685                entry.file_name().into_string().ok()
686                    .and_then(|name| name.parse::<RawFd>().ok())
687            })
688            .filter(|&fd| fd > min_fd && !keep.contains(&fd))
689            .collect()
690    };
691    // The directory is now closed; safe to close the collected fds.
692    for fd in fds_to_close {
693        unsafe { libc::close(fd) };
694    }
695}
696
697// ============================================================
698// User-namespace uid/gid mapping helpers
699// ============================================================
700
/// Write uid/gid maps for an unprivileged user namespace.
///
/// `real_uid`/`real_gid` have to be captured *before*
/// unshare(CLONE_NEWUSER), because getuid()/getgid() return the overflow id
/// (65534) afterwards. `target_uid`/`target_gid` are the ids visible inside
/// the namespace.
///
/// The three files are written in order: `uid_map`, `setgroups`, `gid_map`.
/// Write failures are ignored.
fn write_id_maps(real_uid: u32, real_gid: u32, target_uid: u32, target_gid: u32) {
    let uid_map = format!("{} {} 1\n", target_uid, real_uid);
    let gid_map = format!("{} {} 1\n", target_gid, real_gid);

    let writes = [
        ("/proc/self/uid_map", uid_map.as_str()),
        ("/proc/self/setgroups", "deny\n"),
        ("/proc/self/gid_map", gid_map.as_str()),
    ];
    for &(path, contents) in writes.iter() {
        let _ = std::fs::write(path, contents);
    }
}
710
711// ============================================================
712// Child-side confinement (never returns)
713// ============================================================
714
/// Arguments threaded from the parent's `do_spawn` into the child-side
/// `confine_child`.  Packed into a struct because `confine_child` historically
/// grew to seven positional parameters and a struct keeps the call site
/// readable when new flags get added (e.g. `extra_syscalls` for user
/// handlers).  Lifetimes tie everything to the parent's stack frame — the
/// child never outlives the fork point because `confine_child` either execs
/// or exits.
pub(crate) struct ChildSpawnArgs<'a> {
    /// Sandbox policy the child applies to itself before exec.
    pub sandbox: &'a Sandbox,
    /// Command to run — presumably argv with the program first; its use is
    /// outside this view, confirm against `confine_child`.
    pub cmd: &'a [CString],
    /// Parent-child synchronization pipes (see `PipePair`).
    pub pipes: &'a PipePair,
    /// Skip the user-notification supervisor: child installs a kernel-only
    /// deny filter, parent reads `notif_fd_num = 0` and never starts a
    /// supervisor. Mirrors `Sandbox::no_supervisor`.
    pub no_supervisor: bool,
    /// File descriptors to preserve in the child — presumably the `keep`
    /// list for `close_fds_above`; confirm against `confine_child`.
    pub keep_fds: &'a [RawFd],
    /// Sandbox instance name. When set, it is also exposed as the
    /// sandbox's virtual hostname.
    pub sandbox_name: Option<&'a str>,
    /// Syscall numbers for which the parent registered user `Handler`s.
    /// Merged into the child's BPF notif list so the kernel actually
    /// raises USER_NOTIF for them.
    pub extra_syscalls: &'a [u32],
    /// PID of the parent process captured before fork. Used to detect
    /// parent death in the child without assuming PID 1 is always init
    /// (incorrect in containers where the entrypoint runs as PID 1).
    pub parent_pid: libc::pid_t,
}
743
744/// Apply irreversible confinement (Landlock + seccomp) then exec the command.
745///
746/// This function **never returns**: it calls `execvp` on success or
747/// `_exit(127)` on any error.
748pub(crate) fn confine_child(args: ChildSpawnArgs<'_>) -> ! {
749    let ChildSpawnArgs {
750        sandbox,
751        cmd,
752        pipes,
753        no_supervisor,
754        keep_fds,
755        sandbox_name,
756        extra_syscalls,
757        parent_pid,
758    } = args;
759    // Helper: abort child on error. Includes the OS error automatically.
760    macro_rules! fail {
761        ($msg:expr) => {{
762            let err = std::io::Error::last_os_error();
763            let _ = write!(std::io::stderr(), "sandlock child: {}: {}\n", $msg, err);
764            unsafe { libc::_exit(127) };
765        }};
766    }
767
768    use std::io::Write;
769
770    // 1. New process group
771    if unsafe { libc::setpgid(0, 0) } != 0 {
772        fail!("setpgid");
773    }
774
775    // 1b. If stdin is a terminal, become the foreground process group
776    //     so interactive shells can read from the TTY.
777    //     Must ignore SIGTTOU first — a background pgrp calling tcsetpgrp
778    //     gets stopped by SIGTTOU otherwise.
779    if unsafe { libc::isatty(0) } == 1 {
780        unsafe {
781            libc::signal(libc::SIGTTOU, libc::SIG_IGN);
782            libc::tcsetpgrp(0, libc::getpgrp());
783            libc::signal(libc::SIGTTOU, libc::SIG_DFL);
784        }
785    }
786
787    // 2. Die if parent exits
788    if unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) } != 0 {
789        fail!("prctl(PR_SET_PDEATHSIG)");
790    }
791
792    // 3. Check parent didn't die between fork and prctl.
793    // Compare against the actual parent PID captured before fork rather than
794    // hardcoding 1, since containers often run the entrypoint as PID 1 and a
795    // child forked from it legitimately has getppid() == 1.
796    if unsafe { libc::getppid() } != parent_pid {
797        fail!("parent died before confinement");
798    }
799
800    // 4. Optional: disable ASLR
801    if sandbox.no_randomize_memory {
802        const ADDR_NO_RANDOMIZE: libc::c_ulong = 0x0040000;
803        // Read current personality first (0xffffffff = query), then OR in the flag.
804        let current = unsafe { libc::personality(0xffffffff) };
805        if current == -1 {
806            fail!("personality(query)");
807        }
808        if unsafe { libc::personality(current as libc::c_ulong | ADDR_NO_RANDOMIZE) } == -1 {
809            fail!("personality(ADDR_NO_RANDOMIZE)");
810        }
811    }
812
813    // 4b. Optional: CPU core binding
814    if let Some(ref cores) = sandbox.cpu_cores {
815        if !cores.is_empty() {
816            let mut set = unsafe { std::mem::zeroed::<libc::cpu_set_t>() };
817            unsafe { libc::CPU_ZERO(&mut set) };
818            for &core in cores {
819                unsafe { libc::CPU_SET(core as usize, &mut set) };
820            }
821            if unsafe {
822                libc::sched_setaffinity(
823                    0,
824                    std::mem::size_of::<libc::cpu_set_t>(),
825                    &set,
826                )
827            } != 0
828            {
829                fail!("sched_setaffinity");
830            }
831        }
832    }
833
834    // 5. Optional: disable THP
835    if sandbox.no_huge_pages {
836        if unsafe { libc::prctl(libc::PR_SET_THP_DISABLE, 1, 0, 0, 0) } != 0 {
837            fail!("prctl(PR_SET_THP_DISABLE)");
838        }
839    }
840
841    // 5c. Optional: disable core dumps
842    if sandbox.no_coredump {
843        // Set RLIMIT_CORE to 0 — the kernel will not write a core file.
844        // We intentionally do NOT call prctl(PR_SET_DUMPABLE, 0) because
845        // that would break pidfd_getfd which the supervisor needs.
846        // The seccomp filter already blocks the child from calling
847        // prctl(PR_SET_DUMPABLE, ...) so it can't re-enable it.
848        let rlim = libc::rlimit { rlim_cur: 0, rlim_max: 0 };
849        if unsafe { libc::setrlimit(libc::RLIMIT_CORE, &rlim) } != 0 {
850            fail!("setrlimit(RLIMIT_CORE, 0)");
851        }
852    }
853
854    // Capture real uid/gid before any unshare (after unshare they become 65534)
855    let real_uid = unsafe { libc::getuid() };
856    let real_gid = unsafe { libc::getgid() };
857
858    // 5b. User namespace for --uid mapping.
859    if let Some(target_uid) = sandbox.uid {
860        if unsafe { libc::unshare(libc::CLONE_NEWUSER) } != 0 {
861            fail!("unshare(CLONE_NEWUSER)");
862        }
863        write_id_maps(real_uid, real_gid, target_uid, target_uid);
864    }
865
866    // 6. Optional: change working directory
867    // cwd controls where the child starts; workdir is only for COW
868    let effective_cwd = if let Some(ref cwd) = sandbox.cwd {
869        if let Some(ref chroot_root) = sandbox.chroot {
870            Some(chroot_root.join(cwd.strip_prefix("/").unwrap_or(cwd)))
871        } else {
872            Some(cwd.clone())
873        }
874    } else if let Some(ref chroot_root) = sandbox.chroot {
875        // Default to chroot root
876        Some(chroot_root.to_path_buf())
877    } else if let Some(ref workdir) = sandbox.workdir {
878        // Default to workdir when set (COW working directory)
879        Some(workdir.clone())
880    } else {
881        None
882    };
883
884    if let Some(ref cwd) = effective_cwd {
885        let c_path = match CString::new(cwd.as_os_str().as_encoded_bytes()) {
886            Ok(c) => c,
887            Err(_) => fail!("invalid cwd path"),
888        };
889        if unsafe { libc::chdir(c_path.as_ptr()) } != 0 {
890            fail!("chdir");
891        }
892    }
893
894    // 7. Set NO_NEW_PRIVS (required for both Landlock and seccomp without CAP_SYS_ADMIN)
895    if unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) } != 0 {
896        fail!("prctl(PR_SET_NO_NEW_PRIVS)");
897    }
898
899    // 8. Apply Landlock confinement (IRREVERSIBLE)
900    if let Err(e) = crate::landlock::confine(sandbox) {
901        fail!(format!("landlock: {}", e));
902    }
903
904    // 9. Assemble and install seccomp filter (IRREVERSIBLE)
905    let args = arg_filters(sandbox);
906    let mut keep_fd: i32 = -1;
907
908    if no_supervisor {
909        // No-supervisor mode: deny-only kernel filter, no NEW_LISTENER.
910        // BPF filters are ANDed by the kernel, so an outer filter (from a
911        // wrapping sandbox) keeps tightening this layer too.
912        //
913        // Uses the relaxed `no_supervisor_blocklist_syscall_numbers` deny
914        // list (which leaves `ptrace`, `unshare`, `process_vm_*`, etc.
915        // alone) so an inner full-supervisor sandlock nested under this
916        // one still has the syscalls its supervisor needs.
917        let deny = no_supervisor_blocklist_syscall_numbers(sandbox);
918        let filter = match bpf::assemble_filter(&[], &deny, &args) {
919            Ok(f) => f,
920            Err(e) => fail!(format!("seccomp assemble: {}", e)),
921        };
922        if let Err(e) = bpf::install_deny_filter(&filter) {
923            fail!(format!("seccomp deny filter: {}", e));
924        }
925        // fd=0 tells the parent there's no supervisor to attach to.
926        if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), 0) {
927            fail!(format!("write no-supervisor signal: {}", e));
928        }
929    } else {
930        let deny = blocklist_syscall_numbers(sandbox);
931        // First-level sandbox: notif + deny filter with NEW_LISTENER.
932        //
933        // Caller-supplied handlers must have their syscalls registered in
934        // the BPF filter, otherwise the kernel never raises a notification for
935        // them and the handler silently never fires.  We merge `extra_syscalls`
936        // into the notif list and dedup so each syscall produces exactly one
937        // JEQ in the assembled program.
938        let mut notif = notif_syscalls(sandbox, sandbox_name);
939        if !extra_syscalls.is_empty() {
940            notif.extend_from_slice(extra_syscalls);
941        }
942        // Argv-safety gate (companion to the policy_fn case in
943        // notif_syscalls): a handler bound to execve/execveat
944        // can call `read_child_mem` to inspect argv, so the supervisor
945        // must register newly forked children before they can run user
946        // code — same invariant policy_fn relies on. Bare fork(2)
947        // therefore needs to be intercepted here too.
948        let exec_extra = extra_syscalls.iter().any(|&n| {
949            n == libc::SYS_execve as u32 || n == libc::SYS_execveat as u32
950        });
951        if exec_extra {
952            arch::push_optional_syscall(&mut notif, arch::sys_fork());
953        }
954        notif.sort_unstable();
955        notif.dedup();
956        let filter = match bpf::assemble_filter(&notif, &deny, &args) {
957            Ok(f) => f,
958            Err(e) => fail!(format!("seccomp assemble: {}", e)),
959        };
960        let notif_fd = match bpf::install_filter(&filter) {
961            Ok(fd) => fd,
962            Err(e) => {
963                // EBUSY here means another seccomp filter on this task already
964                // owns the SECCOMP_FILTER_FLAG_NEW_LISTENER slot. The kernel
965                // permits at most one listener per task — to nest, opt this
966                // sandbox out of the supervisor via `Sandbox::no_supervisor`
967                // (or the CLI's `--no-supervisor` flag).
968                if e.raw_os_error() == Some(libc::EBUSY) {
969                    let _ = write!(
970                        std::io::stderr(),
971                        "sandlock child: seccomp install: {} (an outer sandbox already owns the \
972                         seccomp listener; pass --no-supervisor or Sandbox::no_supervisor(true) \
973                         on this sandbox to nest)\n",
974                        e,
975                    );
976                    unsafe { libc::_exit(127) };
977                }
978                fail!(format!("seccomp install: {}", e));
979            }
980        };
981        keep_fd = notif_fd.as_raw_fd();
982        if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), keep_fd as u32) {
983            fail!(format!("write notif fd: {}", e));
984        }
985        std::mem::forget(notif_fd);
986    }
987
988    // 10. Wait for parent to signal ready
989    match read_u32_fd(pipes.ready_r.as_raw_fd()) {
990        Ok(_) => {}
991        Err(e) => fail!(format!("read ready signal: {}", e)),
992    }
993
994    // 12. Close all fds above stderr (always on for isolation)
995    let mut fds_to_keep: Vec<RawFd> = keep_fds.to_vec();
996    if keep_fd >= 0 {
997        fds_to_keep.push(keep_fd);
998    }
999    close_fds_above(2, &fds_to_keep);
1000
1001    // 13. Apply environment
1002    if sandbox.clean_env {
1003        // Clear all env vars first
1004        for (key, _) in std::env::vars_os() {
1005            std::env::remove_var(&key);
1006        }
1007    }
1008    for (key, value) in &sandbox.env {
1009        std::env::set_var(key, value);
1010    }
1011
1012    // 13b. GPU device visibility
1013    if let Some(ref devices) = sandbox.gpu_devices {
1014        if !devices.is_empty() {
1015            let vis = devices.iter().map(|d| d.to_string()).collect::<Vec<_>>().join(",");
1016            std::env::set_var("CUDA_VISIBLE_DEVICES", &vis);
1017            std::env::set_var("ROCR_VISIBLE_DEVICES", &vis);
1018        }
1019        // Empty list = all GPUs visible, don't set env vars
1020    }
1021
1022    // 14. exec
1023    debug_assert!(!cmd.is_empty(), "cmd must not be empty");
1024    let argv_ptrs: Vec<*const libc::c_char> = cmd
1025        .iter()
1026        .map(|s| s.as_ptr())
1027        .chain(std::iter::once(std::ptr::null()))
1028        .collect();
1029
1030    if sandbox.chroot.is_some() {
1031        // With chroot the seccomp handler rewrites the filename to a host path
1032        // (or /proc/self/fd/N).  Pass a separate PATH_MAX buffer as the `file`
1033        // argument so the rewrite does not corrupt argv[0] — which must stay as
1034        // the original command name (e.g. busybox uses argv[0] for applet
1035        // detection).  execvp still handles PATH lookup for bare command names.
1036        let mut exec_path = vec![0u8; libc::PATH_MAX as usize];
1037        let orig = cmd[0].as_bytes_with_nul();
1038        exec_path[..orig.len()].copy_from_slice(orig);
1039
1040        unsafe {
1041            libc::execvp(
1042                exec_path.as_ptr() as *const libc::c_char,
1043                argv_ptrs.as_ptr(),
1044            )
1045        };
1046    } else {
1047        unsafe { libc::execvp(argv_ptrs[0], argv_ptrs.as_ptr()) };
1048    }
1049
1050    // If we get here, exec failed
1051    fail!(format!("execvp '{}'", cmd[0].to_string_lossy()));
1052}
1053
1054// ============================================================
1055// Tests
1056// ============================================================
1057
1058#[cfg(test)]
1059mod tests {
1060    use super::*;
1061
1062    #[test]
1063    fn test_pipe_pair_creation() {
1064        let pipes = PipePair::new().expect("pipe creation failed");
1065        // Verify fds are valid (non-negative)
1066        assert!(pipes.notif_r.as_raw_fd() >= 0);
1067        assert!(pipes.notif_w.as_raw_fd() >= 0);
1068        assert!(pipes.ready_r.as_raw_fd() >= 0);
1069        assert!(pipes.ready_w.as_raw_fd() >= 0);
1070        // All four fds should be distinct
1071        let fds = [
1072            pipes.notif_r.as_raw_fd(),
1073            pipes.notif_w.as_raw_fd(),
1074            pipes.ready_r.as_raw_fd(),
1075            pipes.ready_w.as_raw_fd(),
1076        ];
1077        for i in 0..4 {
1078            for j in (i + 1)..4 {
1079                assert_ne!(fds[i], fds[j]);
1080            }
1081        }
1082    }
1083
1084    #[test]
1085    fn test_write_read_u32() {
1086        let pipes = PipePair::new().expect("pipe creation failed");
1087        let val = 42u32;
1088        write_u32_fd(pipes.notif_w.as_raw_fd(), val).expect("write failed");
1089        let got = read_u32_fd(pipes.notif_r.as_raw_fd()).expect("read failed");
1090        assert_eq!(got, val);
1091    }
1092
1093    #[test]
1094    fn test_write_read_u32_large() {
1095        let pipes = PipePair::new().expect("pipe creation failed");
1096        let val = 0xDEAD_BEEFu32;
1097        write_u32_fd(pipes.notif_w.as_raw_fd(), val).expect("write failed");
1098        let got = read_u32_fd(pipes.notif_r.as_raw_fd()).expect("read failed");
1099        assert_eq!(got, val);
1100    }
1101
1102    #[test]
1103    fn test_notif_syscalls_always_has_clone() {
1104        let policy = Sandbox::builder().build().unwrap();
1105        let nrs = notif_syscalls(&policy, None);
1106        assert!(nrs.contains(&(libc::SYS_clone as u32)));
1107        assert!(nrs.contains(&(libc::SYS_clone3 as u32)));
1108        if let Some(vfork) = arch::sys_vfork() {
1109            assert!(nrs.contains(&(vfork as u32)));
1110        }
1111        // Bare fork(2) is intercepted only when policy_fn is active —
1112        // see notif_syscalls. The default policy has no policy_fn, so
1113        // fork stays out of the BPF filter and hot fork-loops keep
1114        // bypassing the supervisor.
1115        if let Some(fork) = arch::sys_fork() {
1116            assert!(!nrs.contains(&(fork as u32)));
1117        }
1118    }
1119
1120    #[test]
1121    fn test_notif_syscalls_fork_gated_on_policy_fn() {
1122        let Some(fork) = arch::sys_fork() else { return };
1123        let policy = Sandbox::builder()
1124            .policy_fn(|_event, _ctx| crate::policy_fn::Verdict::Allow)
1125            .build()
1126            .unwrap();
1127        let nrs = notif_syscalls(&policy, None);
1128        assert!(nrs.contains(&(fork as u32)));
1129    }
1130
1131    #[test]
1132    fn test_notif_syscalls_memory() {
1133        // shmget only appears in notif when SysV IPC is allowed —
1134        // otherwise it is on the kernel blocklist and notifying would
1135        // bypass the deny (notif JEQs precede deny JEQs in the BPF
1136        // layout).
1137        let policy = Sandbox::builder()
1138            .max_memory(crate::sandbox::ByteSize::mib(256))
1139            .extra_allow_syscalls(vec!["sysv_ipc".into()])
1140            .build()
1141            .unwrap();
1142        let nrs = notif_syscalls(&policy, None);
1143        assert!(nrs.contains(&(libc::SYS_mmap as u32)));
1144        assert!(nrs.contains(&(libc::SYS_munmap as u32)));
1145        assert!(nrs.contains(&(libc::SYS_brk as u32)));
1146        assert!(nrs.contains(&(libc::SYS_mremap as u32)));
1147        assert!(nrs.contains(&(libc::SYS_shmget as u32)));
1148    }
1149
1150    #[test]
1151    fn test_notif_syscalls_memory_excludes_shmget_when_sysv_ipc_denied() {
1152        // With max_memory but allows_sysv_ipc()=false (the default),
1153        // shmget must NOT be in notif: if it were, the BPF filter
1154        // would route it to RET_USER_NOTIF before reaching the deny
1155        // JEQ, silently bypassing the kernel-level deny.
1156        let policy = Sandbox::builder()
1157            .max_memory(crate::sandbox::ByteSize::mib(256))
1158            .build()
1159            .unwrap();
1160        let nrs = notif_syscalls(&policy, None);
1161        assert!(!nrs.contains(&(libc::SYS_shmget as u32)));
1162        // Other memory syscalls remain notified — they are not denied.
1163        assert!(nrs.contains(&(libc::SYS_mmap as u32)));
1164        assert!(nrs.contains(&(libc::SYS_brk as u32)));
1165    }
1166
1167    #[test]
1168    fn test_notif_syscalls_net() {
1169        let policy = Sandbox::builder()
1170            .net_allow("example.com:443")
1171            .build()
1172            .unwrap();
1173        let nrs = notif_syscalls(&policy, None);
1174        assert!(nrs.contains(&(libc::SYS_connect as u32)));
1175        assert!(nrs.contains(&(libc::SYS_sendto as u32)));
1176        assert!(nrs.contains(&(libc::SYS_sendmsg as u32)));
1177        assert!(nrs.contains(&(libc::SYS_sendmmsg as u32)));
1178    }
1179
1180    #[test]
1181    fn test_notif_syscalls_net_deny() {
1182        // --net-deny is default-allow but still needs every connect/sendto
1183        // routed to the on-behalf path so the DenyList can refuse matches.
1184        let policy = Sandbox::builder()
1185            .net_deny("10.0.0.0/8")
1186            .build()
1187            .unwrap();
1188        let nrs = notif_syscalls(&policy, None);
1189        assert!(nrs.contains(&(libc::SYS_connect as u32)));
1190        assert!(nrs.contains(&(libc::SYS_sendto as u32)));
1191    }
1192
1193    #[test]
1194    fn test_notif_syscalls_sandbox_name_enables_hostname_virtualization() {
1195        let policy = Sandbox::builder().build().unwrap();
1196        let nrs = notif_syscalls(&policy, Some("api.local"));
1197        assert!(nrs.contains(&(libc::SYS_uname as u32)));
1198        assert!(nrs.contains(&(libc::SYS_openat as u32)));
1199    }
1200
1201    /// SYS_faccessat2 (439) must be in the notification filter for both
1202    /// chroot and COW modes — glibc 2.33+ uses it instead of faccessat.
1203    #[test]
1204    fn test_notif_syscalls_faccessat2() {
1205        // Chroot mode
1206        let policy = Sandbox::builder()
1207            .chroot("/tmp")
1208            .build()
1209            .unwrap();
1210        let nrs = notif_syscalls(&policy, None);
1211        assert!(nrs.contains(&(libc::SYS_faccessat as u32)));
1212        assert!(nrs.contains(&(arch::SYS_FACCESSAT2 as u32)),
1213                "chroot notif filter must include SYS_faccessat2 (439)");
1214
1215        // COW mode
1216        let policy = Sandbox::builder()
1217            .workdir("/tmp")
1218            .build()
1219            .unwrap();
1220        let nrs = notif_syscalls(&policy, None);
1221        assert!(nrs.contains(&(libc::SYS_faccessat as u32)));
1222        assert!(nrs.contains(&(arch::SYS_FACCESSAT2 as u32)),
1223                "COW notif filter must include SYS_faccessat2 (439)");
1224    }
1225
1226    #[test]
1227    fn test_blocklist_syscall_numbers_default() {
1228        let policy = Sandbox::builder().build().unwrap();
1229        let nrs = blocklist_syscall_numbers(&policy);
1230        // Should contain mount, ptrace, etc.
1231        assert!(nrs.contains(&(libc::SYS_mount as u32)));
1232        assert!(nrs.contains(&(libc::SYS_ptrace as u32)));
1233        assert!(nrs.contains(&(libc::SYS_bpf as u32)));
1234        // SysV IPC denied by default (no IPC namespace in sandlock)
1235        assert!(nrs.contains(&(libc::SYS_shmget as u32)));
1236        assert!(nrs.contains(&(libc::SYS_shmat as u32)));
1237        assert!(nrs.contains(&(libc::SYS_msgget as u32)));
1238        assert!(nrs.contains(&(libc::SYS_semget as u32)));
1239        // nfsservctl has no libc constant, so it is skipped
1240        assert!(!nrs.is_empty());
1241    }
1242
1243    #[test]
1244    fn test_blocklist_syscall_numbers_custom() {
1245        let policy = Sandbox::builder()
1246            .extra_deny_syscalls(vec!["mount".into(), "ptrace".into()])
1247            .build()
1248            .unwrap();
1249        let nrs = blocklist_syscall_numbers(&policy);
1250        // User-supplied blocklist still gets SysV IPC appended
1251        // (allows_sysv_ipc() defaults to false).
1252        assert!(nrs.contains(&(libc::SYS_mount as u32)));
1253        assert!(nrs.contains(&(libc::SYS_ptrace as u32)));
1254        assert!(nrs.contains(&(libc::SYS_shmget as u32)));
1255    }
1256
1257    #[test]
1258    fn test_blocklist_syscall_numbers_custom_with_sysv_ipc_allowed() {
1259        let policy = Sandbox::builder()
1260            .extra_deny_syscalls(vec!["mount".into(), "ptrace".into()])
1261            .extra_allow_syscalls(vec!["sysv_ipc".into()])
1262            .build()
1263            .unwrap();
1264        let nrs = blocklist_syscall_numbers(&policy);
1265        // Default blocklist plus user extras — no SysV IPC append.
1266        assert!(nrs.contains(&(libc::SYS_mount as u32)));
1267        assert!(nrs.contains(&(libc::SYS_ptrace as u32)));
1268        assert!(nrs.contains(&(libc::SYS_bpf as u32)));
1269        assert!(!nrs.contains(&(libc::SYS_shmget as u32)));
1270    }
1271
1272    #[test]
1273    fn test_blocklist_syscall_numbers_default_with_sysv_ipc_allowed() {
1274        let policy = Sandbox::builder()
1275            .extra_allow_syscalls(vec!["sysv_ipc".into()])
1276            .build()
1277            .unwrap();
1278        let nrs = blocklist_syscall_numbers(&policy);
1279        // Default blocklist still present, but SysV IPC is permitted.
1280        assert!(nrs.contains(&(libc::SYS_mount as u32)));
1281        assert!(!nrs.contains(&(libc::SYS_shmget as u32)));
1282        assert!(!nrs.contains(&(libc::SYS_msgget as u32)));
1283        assert!(!nrs.contains(&(libc::SYS_semget as u32)));
1284    }
1285
1286    #[test]
1287    fn test_no_supervisor_blocklist_includes_sysv_ipc_by_default() {
1288        let policy = Sandbox::builder().build().unwrap();
1289        let nrs = no_supervisor_blocklist_syscall_numbers(&policy);
1290        assert!(nrs.contains(&(libc::SYS_shmget as u32)));
1291        assert!(nrs.contains(&(libc::SYS_msgget as u32)));
1292        assert!(nrs.contains(&(libc::SYS_semget as u32)));
1293    }
1294
1295    #[test]
1296    fn test_no_supervisor_blocklist_excludes_sysv_ipc_when_allowed() {
1297        let policy = Sandbox::builder()
1298            .extra_allow_syscalls(vec!["sysv_ipc".into()])
1299            .build()
1300            .unwrap();
1301        let nrs = no_supervisor_blocklist_syscall_numbers(&policy);
1302        assert!(!nrs.contains(&(libc::SYS_shmget as u32)));
1303        assert!(!nrs.contains(&(libc::SYS_msgget as u32)));
1304        assert!(!nrs.contains(&(libc::SYS_semget as u32)));
1305    }
1306
1307    #[test]
1308    fn test_arg_filters_has_clone_ioctl_prctl_socket() {
1309        use crate::sys::structs::{
1310            BPF_JEQ, BPF_JSET, BPF_JMP, BPF_K,
1311        };
1312        let policy = Sandbox::builder().build().unwrap();
1313        let filters = arg_filters(&policy);
1314        // Should contain JEQ for clone syscall nr
1315        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
1316            && f.k == libc::SYS_clone as u32));
1317        // Should contain JSET for CLONE_NS_FLAGS
1318        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JSET | BPF_K)
1319            && f.k == CLONE_NS_FLAGS as u32));
1320        // Should contain JEQ for ioctl syscall nr
1321        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
1322            && f.k == libc::SYS_ioctl as u32));
1323        // Should contain JEQ for TIOCSTI, TIOCLINUX, and SIOCGIF*/SIOCETHTOOL
1324        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
1325            && f.k == TIOCSTI as u32));
1326        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
1327            && f.k == TIOCLINUX as u32));
1328        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
1329            && f.k == SIOCGIFCONF as u32));
1330        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
1331            && f.k == SIOCETHTOOL as u32));
1332        // Should contain JEQ for prctl syscall nr
1333        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
1334            && f.k == libc::SYS_prctl as u32));
1335        // Should contain JEQ for PR_SET_DUMPABLE
1336        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
1337            && f.k == PR_SET_DUMPABLE));
1338    }
1339
1340    #[test]
1341    fn test_arg_filters_raw_sockets() {
1342        use crate::sys::structs::{BPF_ALU, BPF_AND, BPF_JEQ, BPF_JMP, BPF_K};
1343        // Raw sockets are blocked by default — no `icmp-raw://*` rule.
1344        let policy = Sandbox::builder().build().unwrap();
1345        let filters = arg_filters(&policy);
1346        // Should have AF_INET check
1347        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
1348            && f.k == AF_INET));
1349        // Should have AF_INET6 check
1350        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
1351            && f.k == AF_INET6));
1352        // Should have ALU AND SOCK_TYPE_MASK
1353        assert!(filters.iter().any(|f| f.code == (BPF_ALU | BPF_AND | BPF_K)
1354            && f.k == SOCK_TYPE_MASK));
1355        // Should have JEQ SOCK_RAW
1356        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
1357            && f.k == SOCK_RAW));
1358    }
1359
1360    #[test]
1361    fn test_arg_filters_udp_denied_by_default() {
1362        use crate::sys::structs::{BPF_JEQ, BPF_JMP, BPF_K};
1363        // UDP is denied by default — no `udp://...` rule in net_allow.
1364        let policy = Sandbox::builder().build().unwrap();
1365        let filters = arg_filters(&policy);
1366        // Should have JEQ SOCK_DGRAM
1367        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
1368            && f.k == SOCK_DGRAM));
1369    }
1370
1371    #[test]
1372    fn test_syscall_name_to_nr_covers_defaults() {
1373        // Every name in DEFAULT_BLOCKLIST_SYSCALLS should resolve unless the
1374        // running architecture does not expose that syscall.
1375        // `nfsservctl` now resolves: the syscalls crate carries it (kernel
1376        // returns ENOSYS, but the ABI number exists), so it is enforced in the
1377        // blocklist rather than silently dropped. `ioperm`/`iopl` are x86-only.
1378        let expected_unresolved: &[&str] = &[
1379            #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
1380            "ioperm",
1381            #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
1382            "iopl",
1383        ];
1384        let mut skipped = 0;
1385        for name in DEFAULT_BLOCKLIST_SYSCALLS {
1386            match syscall_name_to_nr(name) {
1387                Some(_) => {}
1388                None => {
1389                    assert!(
1390                        expected_unresolved.contains(name),
1391                        "unexpected unresolved syscall: {}",
1392                        name
1393                    );
1394                    skipped += 1;
1395                }
1396            }
1397        }
1398        assert_eq!(skipped, expected_unresolved.len());
1399    }
1400}