Skip to main content

sandlock_core/
context.rs

1// Fork + confinement sequence: child-side Landlock + seccomp application
2// and parent-child pipe synchronization.
3
4use std::ffi::CString;
5use std::io;
6use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd};
7
8use crate::policy::{FsIsolation, Policy};
9use crate::seccomp::bpf::{self, stmt, jump};
10use crate::sys::structs::{
11    AF_INET, AF_INET6, AF_NETLINK,
12    BPF_ABS, BPF_ALU, BPF_AND, BPF_JEQ, BPF_JSET, BPF_JMP, BPF_K, BPF_LD, BPF_RET, BPF_W,
13    CLONE_NS_FLAGS, DEFAULT_DENY_SYSCALLS, EPERM, NETLINK_SOCK_DIAG, SECCOMP_RET_ERRNO,
14    SOCK_DGRAM, SOCK_RAW, SOCK_TYPE_MASK, TIOCLINUX, TIOCSTI,
15    PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER,
16    OFFSET_ARGS0_LO, OFFSET_ARGS1_LO, OFFSET_ARGS2_LO, OFFSET_NR,
17    SockFilter,
18};
19
20// ============================================================
21// Pipe pair for parent-child synchronization
22// ============================================================
23
/// The two unidirectional pipes used for the post-`fork()` handshake
/// between the supervising parent and the sandboxed child.
///
/// Each pipe is stored as its read end and its write end; every end is an
/// [`OwnedFd`], so it is closed when the `PipePair` is dropped.
pub struct PipePair {
    /// Read end of the notif pipe: the parent receives the notif fd number here.
    pub notif_r: OwnedFd,
    /// Write end of the notif pipe: the child sends its notif fd number here.
    pub notif_w: OwnedFd,
    /// Read end of the ready pipe: the child waits here for "supervisor ready".
    pub ready_r: OwnedFd,
    /// Write end of the ready pipe: the parent signals "supervisor ready" here.
    pub ready_w: OwnedFd,
}
35
36impl PipePair {
37    /// Create two pipe pairs using `pipe2(O_CLOEXEC)`.
38    pub fn new() -> io::Result<Self> {
39        let mut notif_fds = [0i32; 2];
40        let mut ready_fds = [0i32; 2];
41
42        // SAFETY: pipe2 with valid pointers and O_CLOEXEC
43        let ret = unsafe { libc::pipe2(notif_fds.as_mut_ptr(), libc::O_CLOEXEC) };
44        if ret < 0 {
45            return Err(io::Error::last_os_error());
46        }
47
48        let ret = unsafe { libc::pipe2(ready_fds.as_mut_ptr(), libc::O_CLOEXEC) };
49        if ret < 0 {
50            // Close the first pair on failure
51            unsafe {
52                libc::close(notif_fds[0]);
53                libc::close(notif_fds[1]);
54            }
55            return Err(io::Error::last_os_error());
56        }
57
58        // SAFETY: pipe2 returned valid fds
59        Ok(PipePair {
60            notif_r: unsafe { OwnedFd::from_raw_fd(notif_fds[0]) },
61            notif_w: unsafe { OwnedFd::from_raw_fd(notif_fds[1]) },
62            ready_r: unsafe { OwnedFd::from_raw_fd(ready_fds[0]) },
63            ready_w: unsafe { OwnedFd::from_raw_fd(ready_fds[1]) },
64        })
65    }
66}
67
68// ============================================================
69// Pipe I/O helpers
70// ============================================================
71
72/// Write a `u32` as 4 little-endian bytes to a raw fd.
73pub(crate) fn write_u32_fd(fd: RawFd, val: u32) -> io::Result<()> {
74    let buf = val.to_le_bytes();
75    let mut written = 0usize;
76    while written < 4 {
77        let ret = unsafe {
78            libc::write(
79                fd,
80                buf[written..].as_ptr() as *const libc::c_void,
81                4 - written,
82            )
83        };
84        if ret < 0 {
85            return Err(io::Error::last_os_error());
86        }
87        written += ret as usize;
88    }
89    Ok(())
90}
91
92/// Read a `u32` (4 little-endian bytes, blocking) from a raw fd.
93pub(crate) fn read_u32_fd(fd: RawFd) -> io::Result<u32> {
94    let mut buf = [0u8; 4];
95    let mut total = 0usize;
96    while total < 4 {
97        let ret = unsafe {
98            libc::read(
99                fd,
100                buf[total..].as_mut_ptr() as *mut libc::c_void,
101                4 - total,
102            )
103        };
104        if ret < 0 {
105            return Err(io::Error::last_os_error());
106        }
107        if ret == 0 {
108            return Err(io::Error::new(
109                io::ErrorKind::UnexpectedEof,
110                "pipe closed before 4 bytes read",
111            ));
112        }
113        total += ret as usize;
114    }
115    Ok(u32::from_le_bytes(buf))
116}
117
118// ============================================================
119// Syscall name → number mapping
120// ============================================================
121
122/// Map a syscall name to its `libc::SYS_*` number.
123///
124/// Covers all names in `DEFAULT_DENY_SYSCALLS` plus extras needed for
125/// notif and arg-filter lists.
126pub fn syscall_name_to_nr(name: &str) -> Option<u32> {
127    let nr: i64 = match name {
128        "mount" => libc::SYS_mount,
129        "umount2" => libc::SYS_umount2,
130        "pivot_root" => libc::SYS_pivot_root,
131        "swapon" => libc::SYS_swapon,
132        "swapoff" => libc::SYS_swapoff,
133        "reboot" => libc::SYS_reboot,
134        "sethostname" => libc::SYS_sethostname,
135        "setdomainname" => libc::SYS_setdomainname,
136        "kexec_load" => libc::SYS_kexec_load,
137        "init_module" => libc::SYS_init_module,
138        "finit_module" => libc::SYS_finit_module,
139        "delete_module" => libc::SYS_delete_module,
140        "unshare" => libc::SYS_unshare,
141        "setns" => libc::SYS_setns,
142        "perf_event_open" => libc::SYS_perf_event_open,
143        "bpf" => libc::SYS_bpf,
144        "userfaultfd" => libc::SYS_userfaultfd,
145        "keyctl" => libc::SYS_keyctl,
146        "add_key" => libc::SYS_add_key,
147        "request_key" => libc::SYS_request_key,
148        "ptrace" => libc::SYS_ptrace,
149        "process_vm_readv" => libc::SYS_process_vm_readv,
150        "process_vm_writev" => libc::SYS_process_vm_writev,
151        "open_by_handle_at" => libc::SYS_open_by_handle_at,
152        "name_to_handle_at" => libc::SYS_name_to_handle_at,
153        "ioperm" => libc::SYS_ioperm,
154        "iopl" => libc::SYS_iopl,
155        "quotactl" => libc::SYS_quotactl,
156        "acct" => libc::SYS_acct,
157        "lookup_dcookie" => libc::SYS_lookup_dcookie,
158        // nfsservctl was removed in Linux 3.1; no libc constant — skip
159        "io_uring_setup" => libc::SYS_io_uring_setup,
160        "io_uring_enter" => libc::SYS_io_uring_enter,
161        "io_uring_register" => libc::SYS_io_uring_register,
162        // Additional syscalls for notif/arg filters
163        "clone" => libc::SYS_clone,
164        "clone3" => libc::SYS_clone3,
165        "vfork" => libc::SYS_vfork,
166        "mmap" => libc::SYS_mmap,
167        "munmap" => libc::SYS_munmap,
168        "brk" => libc::SYS_brk,
169        "mremap" => libc::SYS_mremap,
170        "connect" => libc::SYS_connect,
171        "sendto" => libc::SYS_sendto,
172        "sendmsg" => libc::SYS_sendmsg,
173        "ioctl" => libc::SYS_ioctl,
174        "socket" => libc::SYS_socket,
175        "prctl" => libc::SYS_prctl,
176        "getrandom" => libc::SYS_getrandom,
177        "openat" => libc::SYS_openat,
178        "open" => libc::SYS_open,
179        "getdents64" => libc::SYS_getdents64,
180        "getdents" => libc::SYS_getdents,
181        "bind" => libc::SYS_bind,
182        "getsockname" => libc::SYS_getsockname,
183        "clock_gettime" => libc::SYS_clock_gettime,
184        "gettimeofday" => libc::SYS_gettimeofday,
185        "time" => libc::SYS_time,
186        "clock_nanosleep" => libc::SYS_clock_nanosleep,
187        "timerfd_settime" => libc::SYS_timerfd_settime,
188        "timer_settime" => libc::SYS_timer_settime,
189        "execve" => libc::SYS_execve,
190        "execveat" => libc::SYS_execveat,
191        // COW filesystem syscalls
192        "unlinkat" => libc::SYS_unlinkat,
193        "mkdirat" => libc::SYS_mkdirat,
194        "renameat2" => libc::SYS_renameat2,
195        "newfstatat" => libc::SYS_newfstatat,
196        "statx" => libc::SYS_statx,
197        "faccessat" => libc::SYS_faccessat,
198        "symlinkat" => libc::SYS_symlinkat,
199        "linkat" => libc::SYS_linkat,
200        "fchmodat" => libc::SYS_fchmodat,
201        "fchownat" => libc::SYS_fchownat,
202        "readlinkat" => libc::SYS_readlinkat,
203        "truncate" => libc::SYS_truncate,
204        "utimensat" => libc::SYS_utimensat,
205        "unlink" => libc::SYS_unlink,
206        "rmdir" => libc::SYS_rmdir,
207        "mkdir" => libc::SYS_mkdir,
208        "rename" => libc::SYS_rename,
209        "stat" => libc::SYS_stat,
210        "lstat" => libc::SYS_lstat,
211        "access" => libc::SYS_access,
212        "symlink" => libc::SYS_symlink,
213        "link" => libc::SYS_link,
214        "chmod" => libc::SYS_chmod,
215        "chown" => libc::SYS_chown,
216        "lchown" => libc::SYS_lchown,
217        "readlink" => libc::SYS_readlink,
218        "futimesat" => libc::SYS_futimesat,
219        "fork" => libc::SYS_fork,
220        _ => return None,
221    };
222    Some(nr as u32)
223}
224
225// ============================================================
226// Policy → syscall lists
227// ============================================================
228
229/// Determine which syscalls need `SECCOMP_RET_USER_NOTIF`.
230pub fn notif_syscalls(policy: &Policy) -> Vec<u32> {
231    let mut nrs = vec![
232        libc::SYS_clone as u32,
233        libc::SYS_clone3 as u32,
234        libc::SYS_vfork as u32,
235    ];
236
237    if policy.max_memory.is_some() {
238        nrs.push(libc::SYS_mmap as u32);
239        nrs.push(libc::SYS_munmap as u32);
240        nrs.push(libc::SYS_brk as u32);
241        nrs.push(libc::SYS_mremap as u32);
242        nrs.push(libc::SYS_shmget as u32);
243    }
244
245    if !policy.net_allow_hosts.is_empty()
246        || policy.policy_fn.is_some()
247        || !policy.http_allow.is_empty()
248        || !policy.http_deny.is_empty()
249    {
250        nrs.push(libc::SYS_connect as u32);
251        nrs.push(libc::SYS_sendto as u32);
252        nrs.push(libc::SYS_sendmsg as u32);
253        nrs.push(libc::SYS_bind as u32);
254    }
255
256    if policy.random_seed.is_some() {
257        nrs.push(libc::SYS_getrandom as u32);
258        // Also intercept openat so the supervisor can re-patch vDSO after exec.
259        nrs.push(libc::SYS_openat as u32);
260    }
261
262    if policy.time_start.is_some() {
263        nrs.extend_from_slice(&[
264            libc::SYS_clock_nanosleep as u32,
265            libc::SYS_timerfd_settime as u32,
266            libc::SYS_timer_settime as u32,
267        ]);
268        // Also intercept openat so the supervisor gets a notification after exec
269        // and can re-patch the vDSO (exec replaces vDSO with a fresh copy).
270        nrs.push(libc::SYS_openat as u32);
271    }
272
273    // /proc virtualization (always on: PID filtering, sensitive path blocking)
274    nrs.push(libc::SYS_openat as u32);
275    nrs.extend_from_slice(&[
276        libc::SYS_getdents64 as u32,
277        libc::SYS_getdents as u32,
278    ]);
279    // Virtualize sched_getaffinity so nproc/sysconf agree with /proc/cpuinfo
280    if policy.num_cpus.is_some() {
281        nrs.push(libc::SYS_sched_getaffinity as u32);
282    }
283    if policy.hostname.is_some() {
284        nrs.push(libc::SYS_uname as u32);
285        nrs.push(libc::SYS_openat as u32);
286    }
287
288    // COW filesystem interception (seccomp-based, unprivileged)
289    if policy.workdir.is_some() && policy.fs_isolation == FsIsolation::None {
290        nrs.extend_from_slice(&[
291            libc::SYS_openat as u32,
292            libc::SYS_unlinkat as u32,
293            libc::SYS_mkdirat as u32,
294            libc::SYS_renameat2 as u32,
295            libc::SYS_symlinkat as u32,
296            libc::SYS_linkat as u32,
297            libc::SYS_fchmodat as u32,
298            libc::SYS_fchownat as u32,
299            libc::SYS_truncate as u32,
300            libc::SYS_newfstatat as u32,
301            libc::SYS_statx as u32,
302            libc::SYS_faccessat as u32,
303            439u32,                       // SYS_faccessat2 — glibc 2.33+ uses this instead of faccessat
304            libc::SYS_readlinkat as u32,
305            libc::SYS_getdents64 as u32,
306            libc::SYS_getdents as u32,
307        ]);
308    }
309
310    // Chroot path interception
311    if policy.chroot.is_some() {
312        nrs.extend_from_slice(&[
313            libc::SYS_openat as u32,
314            libc::SYS_open as u32,        // musl uses open(2) instead of openat
315            libc::SYS_execve as u32,
316            libc::SYS_execveat as u32,
317            libc::SYS_unlinkat as u32,
318            libc::SYS_mkdirat as u32,
319            libc::SYS_renameat2 as u32,
320            libc::SYS_symlinkat as u32,
321            libc::SYS_linkat as u32,
322            libc::SYS_fchmodat as u32,
323            libc::SYS_fchownat as u32,
324            libc::SYS_truncate as u32,
325            libc::SYS_newfstatat as u32,
326            libc::SYS_stat as u32,        // musl uses stat(2) instead of newfstatat
327            libc::SYS_lstat as u32,       // musl uses lstat(2) instead of newfstatat
328            libc::SYS_statx as u32,
329            libc::SYS_faccessat as u32,
330            439u32,                       // SYS_faccessat2 — glibc 2.33+ uses this instead of faccessat
331            libc::SYS_access as u32,      // musl uses access(2) instead of faccessat
332            libc::SYS_readlinkat as u32,
333            libc::SYS_readlink as u32,    // musl uses readlink(2) instead of readlinkat
334            libc::SYS_getdents64 as u32,
335            libc::SYS_getdents as u32,
336            libc::SYS_chdir as u32,
337            libc::SYS_getcwd as u32,
338            libc::SYS_statfs as u32,
339            libc::SYS_utimensat as u32,
340            libc::SYS_unlink as u32,      // musl uses unlink(2) instead of unlinkat
341            libc::SYS_rmdir as u32,       // musl uses rmdir(2) instead of unlinkat
342            libc::SYS_mkdir as u32,       // musl uses mkdir(2) instead of mkdirat
343            libc::SYS_rename as u32,      // musl uses rename(2) instead of renameat2
344            libc::SYS_symlink as u32,     // musl uses symlink(2) instead of symlinkat
345            libc::SYS_link as u32,        // musl uses link(2) instead of linkat
346            libc::SYS_chmod as u32,       // musl uses chmod(2) instead of fchmodat
347            libc::SYS_chown as u32,       // musl uses chown(2)/lchown(2) instead of fchownat
348            libc::SYS_lchown as u32,
349        ]);
350    }
351
352    // Explicit deny-paths need path-bearing syscalls intercepted.
353    if !policy.fs_denied.is_empty() {
354        nrs.extend_from_slice(&[
355            libc::SYS_openat as u32,
356            libc::SYS_open as u32,
357            libc::SYS_execve as u32,
358            libc::SYS_execveat as u32,
359        ]);
360    }
361
362    // Dynamic policy callback — intercept key syscalls for event emission.
363    if policy.policy_fn.is_some() {
364        nrs.extend_from_slice(&[
365            libc::SYS_openat as u32,
366            libc::SYS_connect as u32,
367            libc::SYS_sendto as u32,
368            libc::SYS_bind as u32,
369            libc::SYS_execve as u32,
370            libc::SYS_execveat as u32,
371        ]);
372    }
373
374    // Port remapping
375    if policy.port_remap {
376        nrs.extend_from_slice(&[
377            libc::SYS_bind as u32,
378            libc::SYS_getsockname as u32,
379        ]);
380    }
381
382    nrs.sort_unstable();
383    nrs.dedup();
384    nrs
385}
386
387/// Resolve `NO_SUPERVISOR_DENY_SYSCALLS` names to numbers.
388pub fn no_supervisor_deny_syscall_numbers() -> Vec<u32> {
389    use crate::sys::structs::NO_SUPERVISOR_DENY_SYSCALLS;
390    NO_SUPERVISOR_DENY_SYSCALLS
391        .iter()
392        .filter_map(|n| syscall_name_to_nr(n))
393        .collect()
394}
395
396/// Resolve `deny_syscalls` names to numbers.
397///
398/// If both `deny_syscalls` and `allow_syscalls` are `None`, returns the
399/// numbers for `DEFAULT_DENY_SYSCALLS`.
400pub fn deny_syscall_numbers(policy: &Policy) -> Vec<u32> {
401    if let Some(ref names) = policy.deny_syscalls {
402        names
403            .iter()
404            .filter_map(|n| syscall_name_to_nr(n))
405            .collect()
406    } else if policy.allow_syscalls.is_none() {
407        DEFAULT_DENY_SYSCALLS
408            .iter()
409            .filter_map(|n| syscall_name_to_nr(n))
410            .collect()
411    } else {
412        // allow_syscalls is set — no deny list
413        Vec::new()
414    }
415}
416
417/// Build argument-level seccomp filter instructions matching the Python
418/// `_build_arg_filters()` exactly.
419///
420/// Returns a `Vec<SockFilter>` containing self-contained BPF blocks for:
421///   - clone: block namespace creation flags
422///   - ioctl: block TIOCSTI, TIOCLINUX
423///   - prctl: block PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER
424///   - socket: block NETLINK_SOCK_DIAG (with AF_NETLINK domain check)
425///   - socket: block SOCK_RAW/SOCK_DGRAM on AF_INET/AF_INET6 (with type mask)
426pub fn arg_filters(policy: &Policy) -> Vec<SockFilter> {
427    let ret_errno = SECCOMP_RET_ERRNO | EPERM as u32;
428    let nr_clone = libc::SYS_clone as u32;
429    let nr_ioctl = libc::SYS_ioctl as u32;
430    let nr_prctl = libc::SYS_prctl as u32;
431    let nr_socket = libc::SYS_socket as u32;
432
433    let mut insns: Vec<SockFilter> = Vec::new();
434
435    // --- clone: block namespace creation flags ---
436    // 5 instructions:
437    //   LD NR
438    //   JEQ clone → +0, skip 3
439    //   LD arg0
440    //   JSET NS_FLAGS → +0, skip 1
441    //   RET ERRNO
442    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
443    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_clone, 0, 3));
444    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
445    insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, CLONE_NS_FLAGS as u32, 0, 1));
446    insns.push(stmt(BPF_RET | BPF_K, ret_errno));
447
448    // --- ioctl: block dangerous commands (TIOCSTI, TIOCLINUX) ---
449    // Layout: LD NR, JEQ ioctl (skip 1 + N*2), LD arg1, [JEQ cmd, RET ERRNO] * N
450    let dangerous_ioctls: &[u32] = &[TIOCSTI as u32, TIOCLINUX as u32];
451    let n_ioctls = dangerous_ioctls.len();
452    let skip_count = (1 + n_ioctls * 2) as u8;
453    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
454    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_ioctl, 0, skip_count));
455    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
456    for &cmd in dangerous_ioctls {
457        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, cmd, 0, 1));
458        insns.push(stmt(BPF_RET | BPF_K, ret_errno));
459    }
460
461    // --- prctl: block dangerous options ---
462    // Layout: LD NR, JEQ prctl (skip 1 + N*2), LD arg0, [JEQ op, RET ERRNO] * N
463    let dangerous_prctl_ops: &[u32] = &[PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER];
464    let n_ops = dangerous_prctl_ops.len();
465    let skip_count = (1 + n_ops * 2) as u8;
466    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
467    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_prctl, 0, skip_count));
468    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
469    for &op in dangerous_prctl_ops {
470        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, op, 0, 1));
471        insns.push(stmt(BPF_RET | BPF_K, ret_errno));
472    }
473
474    // --- socket: block NETLINK_SOCK_DIAG (only on AF_NETLINK domain) ---
475    // 7 instructions:
476    //   LD NR
477    //   JEQ socket → +0, skip 5
478    //   LD arg0 (domain)
479    //   JEQ AF_NETLINK → +0, skip 3
480    //   LD arg2 (protocol)
481    //   JEQ NETLINK_SOCK_DIAG → +0, skip 1
482    //   RET ERRNO
483    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
484    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_socket, 0, 5));
485    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
486    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_NETLINK, 0, 3));
487    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS2_LO));
488    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, NETLINK_SOCK_DIAG, 0, 1));
489    insns.push(stmt(BPF_RET | BPF_K, ret_errno));
490
491    // --- socket: block SOCK_RAW and/or SOCK_DGRAM on AF_INET/AF_INET6 ---
492    let mut blocked_types: Vec<u32> = Vec::new();
493    if policy.no_raw_sockets {
494        blocked_types.push(SOCK_RAW);
495    }
496    if policy.no_udp {
497        blocked_types.push(SOCK_DGRAM);
498    }
499
500    if !blocked_types.is_empty() {
501        let n = blocked_types.len();
502        // Instructions after domain checks: 2 (load+AND) + N (JEQs) + 1 (RET)
503        let after_domain = 2 + n + 1;
504        // Total after NR check: 3 (load domain + 2 JEQs) + after_domain
505        let skip_all = (3 + after_domain) as u8;
506
507        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
508        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_socket, 0, skip_all));
509        // Load domain (arg0)
510        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
511        // AF_INET → skip to type check (jump over AF_INET6 check)
512        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET, 1, 0));
513        // AF_INET6 → type check; else skip everything remaining
514        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET6, 0, after_domain as u8));
515        // Load type (arg1) and mask off SOCK_NONBLOCK|SOCK_CLOEXEC
516        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
517        insns.push(stmt(BPF_ALU | BPF_AND | BPF_K, SOCK_TYPE_MASK));
518        // Check each blocked type
519        for (i, &sock_type) in blocked_types.iter().enumerate() {
520            let remaining = n - i - 1;
521            // Match → jump to RET ERRNO (skip 'remaining' JEQs ahead)
522            // No match on last type → skip past RET ERRNO (jf=1)
523            // No match on non-last → check next type (jf=0)
524            let jf: u8 = if remaining == 0 { 1 } else { 0 };
525            insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, sock_type, remaining as u8, jf));
526        }
527        // Deny return (reached by any matching JEQ)
528        insns.push(stmt(BPF_RET | BPF_K, ret_errno));
529    }
530
531    insns
532}
533
534// ============================================================
535// Close fds above threshold
536// ============================================================
537
538/// Close all file descriptors above `min_fd`, except those in `keep`.
539fn close_fds_above(min_fd: RawFd, keep: &[RawFd]) {
540    // Read /proc/self/fd to enumerate open fds.
541    // Collect all fd numbers first, then close them after dropping the directory
542    // iterator. This avoids closing the directory fd during iteration.
543    let fds_to_close: Vec<RawFd> = {
544        let dir = match std::fs::read_dir("/proc/self/fd") {
545            Ok(d) => d,
546            Err(_) => return,
547        };
548        dir.flatten()
549            .filter_map(|entry| {
550                entry.file_name().into_string().ok()
551                    .and_then(|name| name.parse::<RawFd>().ok())
552            })
553            .filter(|&fd| fd > min_fd && !keep.contains(&fd))
554            .collect()
555    };
556    // The directory is now closed; safe to close the collected fds.
557    for fd in fds_to_close {
558        unsafe { libc::close(fd) };
559    }
560}
561
562// ============================================================
563// COW filesystem config passed from parent to child
564// ============================================================
565
566// Re-export ChildMountConfig so callers can use the old import path.
567pub(crate) use crate::cow::ChildMountConfig;
568
/// Install single-entry uid and gid mappings for an unprivileged user
/// namespace by writing to `/proc/self/{uid_map,setgroups,gid_map}`.
///
/// `real_uid`/`real_gid` are the ids outside the namespace and must be
/// captured *before* `unshare(CLONE_NEWUSER)`, because `getuid()`/`getgid()`
/// report the overflow id (65534) afterwards. `target_uid`/`target_gid` are
/// the ids the process sees inside the namespace.
///
/// Write failures are ignored.
fn write_id_maps(real_uid: u32, real_gid: u32, target_uid: u32, target_gid: u32) {
    // Order matters: uid_map, then setgroups, then gid_map.
    let steps: [(&str, String); 3] = [
        ("/proc/self/uid_map", format!("{} {} 1\n", target_uid, real_uid)),
        ("/proc/self/setgroups", String::from("deny\n")),
        ("/proc/self/gid_map", format!("{} {} 1\n", target_gid, real_gid)),
    ];
    for (path, contents) in steps.iter() {
        let _ = std::fs::write(path, contents);
    }
}
578
579/// Write uid/gid maps using the post-unshare overflow uid (65534).
580/// Used by the OverlayFS COW path which maps to root (UID 0) inside.
581fn write_id_maps_overflow() {
582    let uid = unsafe { libc::getuid() };
583    let gid = unsafe { libc::getgid() };
584    write_id_maps(uid, gid, 0, 0);
585}
586
587// ============================================================
588// Child-side confinement (never returns)
589// ============================================================
590
591/// Apply irreversible confinement (Landlock + seccomp) then exec the command.
592///
593/// This function **never returns**: it calls `execvp` on success or
594/// `_exit(127)` on any error.
595pub(crate) fn confine_child(policy: &Policy, cmd: &[CString], pipes: &PipePair, cow_config: Option<&ChildMountConfig>, nested: bool) -> ! {
596    // Helper: abort child on error. Includes the OS error automatically.
597    macro_rules! fail {
598        ($msg:expr) => {{
599            let err = std::io::Error::last_os_error();
600            let _ = write!(std::io::stderr(), "sandlock child: {}: {}\n", $msg, err);
601            unsafe { libc::_exit(127) };
602        }};
603    }
604
605    use std::io::Write;
606
607    // 1. New process group
608    if unsafe { libc::setpgid(0, 0) } != 0 {
609        fail!("setpgid");
610    }
611
612    // 1b. If stdin is a terminal, become the foreground process group
613    //     so interactive shells can read from the TTY.
614    //     Must ignore SIGTTOU first — a background pgrp calling tcsetpgrp
615    //     gets stopped by SIGTTOU otherwise.
616    if unsafe { libc::isatty(0) } == 1 {
617        unsafe {
618            libc::signal(libc::SIGTTOU, libc::SIG_IGN);
619            libc::tcsetpgrp(0, libc::getpgrp());
620            libc::signal(libc::SIGTTOU, libc::SIG_DFL);
621        }
622    }
623
624    // 2. Die if parent exits
625    if unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) } != 0 {
626        fail!("prctl(PR_SET_PDEATHSIG)");
627    }
628
629    // 3. Check parent didn't die between fork and prctl
630    if unsafe { libc::getppid() } == 1 {
631        fail!("parent died before confinement");
632    }
633
634    // 4. Optional: disable ASLR
635    if policy.no_randomize_memory {
636        const ADDR_NO_RANDOMIZE: libc::c_ulong = 0x0040000;
637        // Read current personality first (0xffffffff = query), then OR in the flag.
638        let current = unsafe { libc::personality(0xffffffff) };
639        if current == -1 {
640            fail!("personality(query)");
641        }
642        if unsafe { libc::personality(current as libc::c_ulong | ADDR_NO_RANDOMIZE) } == -1 {
643            fail!("personality(ADDR_NO_RANDOMIZE)");
644        }
645    }
646
647    // 4b. Optional: CPU core binding
648    if let Some(ref cores) = policy.cpu_cores {
649        if !cores.is_empty() {
650            let mut set = unsafe { std::mem::zeroed::<libc::cpu_set_t>() };
651            unsafe { libc::CPU_ZERO(&mut set) };
652            for &core in cores {
653                unsafe { libc::CPU_SET(core as usize, &mut set) };
654            }
655            if unsafe {
656                libc::sched_setaffinity(
657                    0,
658                    std::mem::size_of::<libc::cpu_set_t>(),
659                    &set,
660                )
661            } != 0
662            {
663                fail!("sched_setaffinity");
664            }
665        }
666    }
667
668    // 5. Optional: disable THP
669    if policy.no_huge_pages {
670        if unsafe { libc::prctl(libc::PR_SET_THP_DISABLE, 1, 0, 0, 0) } != 0 {
671            fail!("prctl(PR_SET_THP_DISABLE)");
672        }
673    }
674
675    // 5c. Optional: disable core dumps
676    if policy.no_coredump {
677        // Set RLIMIT_CORE to 0 — the kernel will not write a core file.
678        // We intentionally do NOT call prctl(PR_SET_DUMPABLE, 0) because
679        // that would break pidfd_getfd which the supervisor needs.
680        // The seccomp filter already blocks the child from calling
681        // prctl(PR_SET_DUMPABLE, ...) so it can't re-enable it.
682        let rlim = libc::rlimit { rlim_cur: 0, rlim_max: 0 };
683        if unsafe { libc::setrlimit(libc::RLIMIT_CORE, &rlim) } != 0 {
684            fail!("setrlimit(RLIMIT_CORE, 0)");
685        }
686    }
687
688    // Capture real uid/gid before any unshare (after unshare they become 65534)
689    let real_uid = unsafe { libc::getuid() };
690    let real_gid = unsafe { libc::getgid() };
691
692    // 5b. User namespace for --uid mapping (when not using OverlayFS COW,
693    //     which sets up its own user namespace)
694    if let Some(target_uid) = policy.uid {
695        if cow_config.is_none() {
696            if unsafe { libc::unshare(libc::CLONE_NEWUSER) } != 0 {
697                fail!("unshare(CLONE_NEWUSER)");
698            }
699            write_id_maps(real_uid, real_gid, target_uid, target_uid);
700        }
701    }
702
703    // 5c. User + mount namespace for OverlayFS COW (includes CLONE_NEWUSER)
704    if let Some(ref cow) = cow_config {
705        // unshare user + mount namespaces (unprivileged)
706        if unsafe { libc::unshare(libc::CLONE_NEWUSER | libc::CLONE_NEWNS) } != 0 {
707            fail!("unshare(CLONE_NEWUSER | CLONE_NEWNS)");
708        }
709
710        // Write uid/gid maps using overflow uid (preserves existing COW behavior)
711        write_id_maps_overflow();
712
713        // Mount the overlay filesystem ON TOP of the workdir so the child
714        // sees the merged view at the original path.  The kernel resolves
715        // lowerdir before the covering mount takes effect, so using the
716        // same path as both lowerdir and mount-point is safe inside our
717        // private mount namespace.
718        let lowerdir = cow.lowers.iter()
719            .map(|p| p.display().to_string())
720            .collect::<Vec<_>>()
721            .join(":");
722        let opts = format!(
723            "lowerdir={},upperdir={},workdir={}",
724            lowerdir,
725            cow.upper.display(),
726            cow.work.display(),
727        );
728
729        let mount_cstr = match CString::new(cow.mount_point.to_str().unwrap_or("")) {
730            Ok(c) => c,
731            Err(_) => fail!("invalid overlay mount point path"),
732        };
733        let overlay_cstr = CString::new("overlay").unwrap();
734        let opts_cstr = match CString::new(opts) {
735            Ok(c) => c,
736            Err(_) => fail!("invalid overlay opts"),
737        };
738
739        let ret = unsafe {
740            libc::mount(
741                overlay_cstr.as_ptr(),
742                mount_cstr.as_ptr(),
743                overlay_cstr.as_ptr(),
744                0,
745                opts_cstr.as_ptr() as *const libc::c_void,
746            )
747        };
748        if ret != 0 {
749            fail!("mount overlay");
750        }
751    }
752
753    // 6. Optional: change working directory
754    // cwd controls where the child starts; workdir is only for COW
755    let effective_cwd = if let Some(ref cwd) = policy.cwd {
756        if let Some(ref chroot_root) = policy.chroot {
757            Some(chroot_root.join(cwd.strip_prefix("/").unwrap_or(cwd)))
758        } else {
759            Some(cwd.clone())
760        }
761    } else if let Some(ref chroot_root) = policy.chroot {
762        // Default to chroot root
763        Some(chroot_root.to_path_buf())
764    } else {
765        None
766    };
767
768    if let Some(ref cwd) = effective_cwd {
769        let c_path = match CString::new(cwd.as_os_str().as_encoded_bytes()) {
770            Ok(c) => c,
771            Err(_) => fail!("invalid cwd path"),
772        };
773        if unsafe { libc::chdir(c_path.as_ptr()) } != 0 {
774            fail!("chdir");
775        }
776    }
777
778    // 7. Set NO_NEW_PRIVS (required for both Landlock and seccomp without CAP_SYS_ADMIN)
779    if unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) } != 0 {
780        fail!("prctl(PR_SET_NO_NEW_PRIVS)");
781    }
782
783    // 8. Apply Landlock confinement (IRREVERSIBLE)
784    if let Err(e) = crate::landlock::confine(policy) {
785        fail!(format!("landlock: {}", e));
786    }
787
788    // 9. Assemble and install seccomp filter (IRREVERSIBLE)
789    let deny = deny_syscall_numbers(policy);
790    let args = arg_filters(policy);
791    let mut keep_fd: i32 = -1;
792
793    if nested {
794        // Nested sandbox: deny-only filter (no supervisor — parent handles it).
795        // BPF filters are ANDed by the kernel, so each level can only tighten.
796        let filter = bpf::assemble_filter(&[], &deny, &args);
797        if let Err(e) = bpf::install_deny_filter(&filter) {
798            fail!(format!("seccomp deny filter: {}", e));
799        }
800        // Signal nested mode to parent (fd=0 means no supervisor needed)
801        if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), 0) {
802            fail!(format!("write nested signal: {}", e));
803        }
804    } else {
805        // First-level sandbox: notif + deny filter with NEW_LISTENER.
806        let notif = notif_syscalls(policy);
807        let filter = bpf::assemble_filter(&notif, &deny, &args);
808        let notif_fd = match bpf::install_filter(&filter) {
809            Ok(fd) => fd,
810            Err(e) => fail!(format!("seccomp install: {}", e)),
811        };
812        keep_fd = notif_fd.as_raw_fd();
813        if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), keep_fd as u32) {
814            fail!(format!("write notif fd: {}", e));
815        }
816        std::mem::forget(notif_fd);
817    }
818
819    // Mark this process as confined for in-process nesting detection
820    crate::sandbox::CONFINED.store(true, std::sync::atomic::Ordering::Relaxed);
821
822    // 10. Wait for parent to signal ready
823    match read_u32_fd(pipes.ready_r.as_raw_fd()) {
824        Ok(_) => {}
825        Err(e) => fail!(format!("read ready signal: {}", e)),
826    }
827
828    // 12. Close all fds above stderr (always on for isolation)
829    if keep_fd >= 0 {
830        close_fds_above(2, &[keep_fd]);
831    } else {
832        close_fds_above(2, &[]);
833    }
834
835    // 13. Apply environment
836    if policy.clean_env {
837        // Clear all env vars first
838        for (key, _) in std::env::vars_os() {
839            std::env::remove_var(&key);
840        }
841    }
842    for (key, value) in &policy.env {
843        std::env::set_var(key, value);
844    }
845
846    // 13b. GPU device visibility
847    if let Some(ref devices) = policy.gpu_devices {
848        if !devices.is_empty() {
849            let vis = devices.iter().map(|d| d.to_string()).collect::<Vec<_>>().join(",");
850            std::env::set_var("CUDA_VISIBLE_DEVICES", &vis);
851            std::env::set_var("ROCR_VISIBLE_DEVICES", &vis);
852        }
853        // Empty list = all GPUs visible, don't set env vars
854    }
855
856    // 14. exec
857    debug_assert!(!cmd.is_empty(), "cmd must not be empty");
858    let argv_ptrs: Vec<*const libc::c_char> = cmd
859        .iter()
860        .map(|s| s.as_ptr())
861        .chain(std::iter::once(std::ptr::null()))
862        .collect();
863
864    if policy.chroot.is_some() {
865        // With chroot the seccomp handler rewrites the filename to a host path
866        // (or /proc/self/fd/N).  Pass a separate PATH_MAX buffer as the `file`
867        // argument so the rewrite does not corrupt argv[0] — which must stay as
868        // the original command name (e.g. busybox uses argv[0] for applet
869        // detection).  execvp still handles PATH lookup for bare command names.
870        let mut exec_path = vec![0u8; libc::PATH_MAX as usize];
871        let orig = cmd[0].as_bytes_with_nul();
872        exec_path[..orig.len()].copy_from_slice(orig);
873
874        unsafe {
875            libc::execvp(
876                exec_path.as_ptr() as *const libc::c_char,
877                argv_ptrs.as_ptr(),
878            )
879        };
880    } else {
881        unsafe { libc::execvp(argv_ptrs[0], argv_ptrs.as_ptr()) };
882    }
883
884    // If we get here, exec failed
885    fail!(format!("execvp '{}'", cmd[0].to_string_lossy()));
886}
887
888// ============================================================
889// Tests
890// ============================================================
891
#[cfg(test)]
mod tests {
    use super::*;

    /// Sends `value` through the notif pipe of a freshly created `PipePair`
    /// with `write_u32_fd` and returns what `read_u32_fd` reads back.
    fn round_trip_u32(value: u32) -> u32 {
        let pair = PipePair::new().expect("pipe creation failed");
        write_u32_fd(pair.notif_w.as_raw_fd(), value).expect("write failed");
        read_u32_fd(pair.notif_r.as_raw_fd()).expect("read failed")
    }

    /// `PipePair::new` must yield four non-negative, pairwise-distinct fds.
    #[test]
    fn test_pipe_pair_creation() {
        let pair = PipePair::new().expect("pipe creation failed");
        let raw = [
            pair.notif_r.as_raw_fd(),
            pair.notif_w.as_raw_fd(),
            pair.ready_r.as_raw_fd(),
            pair.ready_w.as_raw_fd(),
        ];

        // Every descriptor has to be a valid (non-negative) fd number.
        for fd in raw.iter() {
            assert!(*fd >= 0);
        }

        // Compare each descriptor against all the ones that follow it.
        for (idx, first) in raw.iter().enumerate() {
            for second in raw[idx + 1..].iter() {
                assert_ne!(first, second);
            }
        }
    }

    /// A small value written to the pipe is read back unchanged.
    #[test]
    fn test_write_read_u32() {
        let sent: u32 = 42;
        let received = round_trip_u32(sent);
        assert_eq!(received, sent);
    }

    /// A value with the high bit set is read back unchanged.
    #[test]
    fn test_write_read_u32_large() {
        let sent: u32 = 0xDEAD_BEEF;
        let received = round_trip_u32(sent);
        assert_eq!(received, sent);
    }

    /// The default policy's notification list contains clone, clone3 and vfork.
    #[test]
    fn test_notif_syscalls_always_has_clone() {
        let default_policy = Policy::builder().build().unwrap();
        let trapped = notif_syscalls(&default_policy);
        for nr in [libc::SYS_clone, libc::SYS_clone3, libc::SYS_vfork] {
            assert!(trapped.contains(&(nr as u32)));
        }
    }

    /// With `max_memory` set, the memory-related syscalls are in the
    /// notification list.
    #[test]
    fn test_notif_syscalls_memory() {
        let limited = Policy::builder()
            .max_memory(crate::policy::ByteSize::mib(256))
            .build()
            .unwrap();
        let trapped = notif_syscalls(&limited);
        let expected = [
            libc::SYS_mmap,
            libc::SYS_munmap,
            libc::SYS_brk,
            libc::SYS_mremap,
            libc::SYS_shmget,
        ];
        for nr in expected {
            assert!(trapped.contains(&(nr as u32)));
        }
    }

    /// With an allowed network host, connect/sendto/sendmsg are in the
    /// notification list.
    #[test]
    fn test_notif_syscalls_net() {
        let networked = Policy::builder()
            .net_allow_host("example.com")
            .build()
            .unwrap();
        let trapped = notif_syscalls(&networked);
        for nr in [libc::SYS_connect, libc::SYS_sendto, libc::SYS_sendmsg] {
            assert!(trapped.contains(&(nr as u32)));
        }
    }

    /// Both chroot and COW policies must put SYS_faccessat2 (439) in the
    /// notification list alongside faccessat, because glibc 2.33+ calls
    /// faccessat2 instead of faccessat.
    #[test]
    fn test_notif_syscalls_faccessat2() {
        const SYS_FACCESSAT2: u32 = 439;

        // Shared check; `why` is the panic message for the faccessat2 assertion.
        let expect_both = |policy: &Policy, why: &str| {
            let trapped = notif_syscalls(policy);
            assert!(trapped.contains(&(libc::SYS_faccessat as u32)));
            assert!(trapped.contains(&SYS_FACCESSAT2), "{}", why);
        };

        // Chroot mode
        expect_both(
            &Policy::builder().chroot("/tmp").build().unwrap(),
            "chroot notif filter must include SYS_faccessat2 (439)",
        );

        // COW mode
        expect_both(
            &Policy::builder().workdir("/tmp").build().unwrap(),
            "COW notif filter must include SYS_faccessat2 (439)",
        );
    }

    /// The default deny list is non-empty and includes mount, ptrace and bpf.
    #[test]
    fn test_deny_syscall_numbers_default() {
        let default_policy = Policy::builder().build().unwrap();
        let denied = deny_syscall_numbers(&default_policy);
        for nr in [libc::SYS_mount, libc::SYS_ptrace, libc::SYS_bpf] {
            assert!(denied.contains(&(nr as u32)));
        }
        // Names without a libc constant (nfsservctl) are skipped, so only
        // non-emptiness is checked here rather than an exact length.
        assert!(!denied.is_empty());
    }

    /// A custom deny list resolves to exactly the syscalls that were named.
    #[test]
    fn test_deny_syscall_numbers_custom() {
        let custom = Policy::builder()
            .deny_syscalls(vec!["mount".into(), "ptrace".into()])
            .build()
            .unwrap();
        let denied = deny_syscall_numbers(&custom);
        assert_eq!(denied.len(), 2);
        for nr in [libc::SYS_mount, libc::SYS_ptrace] {
            assert!(denied.contains(&(nr as u32)));
        }
    }

    /// When an allow list is configured the deny list comes back empty.
    #[test]
    fn test_deny_syscall_numbers_empty_when_allow_set() {
        let allow_only = Policy::builder()
            .allow_syscalls(vec!["read".into(), "write".into()])
            .build()
            .unwrap();
        let denied = deny_syscall_numbers(&allow_only);
        assert!(denied.is_empty());
    }

    /// The default argument filters contain the clone, ioctl, prctl and
    /// netlink-socket checks.
    #[test]
    fn test_arg_filters_has_clone_ioctl_prctl_socket() {
        use crate::sys::structs::{
            BPF_JEQ, BPF_JSET, BPF_JMP, BPF_K,
        };
        let default_policy = Policy::builder().build().unwrap();
        let program = arg_filters(&default_policy);

        let jeq_code = BPF_JMP | BPF_JEQ | BPF_K;
        let jset_code = BPF_JMP | BPF_JSET | BPF_K;
        // True when some instruction is a JEQ / JSET against constant `k`.
        let has_jeq = |k: u32| program.iter().any(|insn| insn.code == jeq_code && insn.k == k);
        let has_jset = |k: u32| program.iter().any(|insn| insn.code == jset_code && insn.k == k);

        // clone syscall number, then the namespace-flag bit test
        assert!(has_jeq(libc::SYS_clone as u32));
        assert!(has_jset(CLONE_NS_FLAGS as u32));
        // ioctl syscall number, then the TIOCSTI and TIOCLINUX requests
        assert!(has_jeq(libc::SYS_ioctl as u32));
        assert!(has_jeq(TIOCSTI as u32));
        assert!(has_jeq(TIOCLINUX as u32));
        // prctl syscall number, then PR_SET_DUMPABLE
        assert!(has_jeq(libc::SYS_prctl as u32));
        assert!(has_jeq(PR_SET_DUMPABLE));
        // socket: AF_NETLINK family and NETLINK_SOCK_DIAG protocol
        assert!(has_jeq(AF_NETLINK));
        assert!(has_jeq(NETLINK_SOCK_DIAG));
    }

    /// `no_raw_sockets` adds the AF_INET/AF_INET6 checks, the socket-type
    /// mask and the SOCK_RAW comparison.
    #[test]
    fn test_arg_filters_raw_sockets() {
        use crate::sys::structs::{BPF_ALU, BPF_AND, BPF_JEQ, BPF_JMP, BPF_K};
        let no_raw = Policy::builder().no_raw_sockets(true).build().unwrap();
        let program = arg_filters(&no_raw);

        let jeq_code = BPF_JMP | BPF_JEQ | BPF_K;
        let and_code = BPF_ALU | BPF_AND | BPF_K;
        let has_jeq = |k: u32| program.iter().any(|insn| insn.code == jeq_code && insn.k == k);
        let has_and = |k: u32| program.iter().any(|insn| insn.code == and_code && insn.k == k);

        // Address-family checks
        assert!(has_jeq(AF_INET));
        assert!(has_jeq(AF_INET6));
        // ALU AND with SOCK_TYPE_MASK
        assert!(has_and(SOCK_TYPE_MASK));
        // JEQ against SOCK_RAW
        assert!(has_jeq(SOCK_RAW));
    }

    /// `no_udp` adds a JEQ comparison against SOCK_DGRAM.
    #[test]
    fn test_arg_filters_no_udp() {
        use crate::sys::structs::{BPF_JEQ, BPF_JMP, BPF_K};
        let no_udp = Policy::builder().no_udp(true).build().unwrap();
        let program = arg_filters(&no_udp);

        let jeq_code = BPF_JMP | BPF_JEQ | BPF_K;
        assert!(program
            .iter()
            .any(|insn| insn.code == jeq_code && insn.k == SOCK_DGRAM));
    }

    /// Every name in DEFAULT_DENY_SYSCALLS resolves to a syscall number,
    /// with nfsservctl as the single permitted exception.
    #[test]
    fn test_syscall_name_to_nr_covers_defaults() {
        let mut unresolved = 0;
        for name in DEFAULT_DENY_SYSCALLS {
            if syscall_name_to_nr(name).is_none() {
                assert_eq!(*name, "nfsservctl", "unexpected unresolved syscall: {}", name);
                unresolved += 1;
            }
        }
        assert_eq!(unresolved, 1); // only nfsservctl
    }
}