Skip to main content

sandlock_core/
context.rs

1// Fork + confinement sequence: child-side Landlock + seccomp application
2// and parent-child pipe synchronization.
3
4use std::ffi::CString;
5use std::io;
6use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd};
7
8use crate::arch;
9use crate::policy::{FsIsolation, Policy};
10use crate::seccomp::bpf::{self, stmt, jump};
11use crate::sys::structs::{
12    AF_INET, AF_INET6,
13    BPF_ABS, BPF_ALU, BPF_AND, BPF_JEQ, BPF_JSET, BPF_JMP, BPF_K, BPF_LD, BPF_RET, BPF_W,
14    CLONE_NS_FLAGS, DEFAULT_DENY_SYSCALLS, EPERM,
15    SECCOMP_RET_ALLOW, SECCOMP_RET_ERRNO,
16    SIOCETHTOOL, SIOCGIFADDR, SIOCGIFBRDADDR, SIOCGIFCONF, SIOCGIFDSTADDR,
17    SIOCGIFFLAGS, SIOCGIFHWADDR, SIOCGIFINDEX, SIOCGIFNAME, SIOCGIFNETMASK,
18    SOCK_DGRAM, SOCK_RAW, SOCK_TYPE_MASK, TIOCLINUX, TIOCSTI,
19    PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER,
20    OFFSET_ARGS0_LO, OFFSET_ARGS1_LO, OFFSET_ARGS2_LO, OFFSET_ARGS3_LO, OFFSET_NR,
21    SockFilter,
22};
23
24// ============================================================
25// Pipe pair for parent-child synchronization
26// ============================================================
27
28/// Pipes for parent-child communication after fork().
29pub struct PipePair {
30    /// Parent reads the notif fd number written by the child.
31    pub notif_r: OwnedFd,
32    /// Child writes the notif fd number to the parent.
33    pub notif_w: OwnedFd,
34    /// Child reads the "supervisor ready" signal from the parent.
35    pub ready_r: OwnedFd,
36    /// Parent writes the "supervisor ready" signal to the child.
37    pub ready_w: OwnedFd,
38}
39
40impl PipePair {
41    /// Create two pipe pairs using `pipe2(O_CLOEXEC)`.
42    pub fn new() -> io::Result<Self> {
43        let mut notif_fds = [0i32; 2];
44        let mut ready_fds = [0i32; 2];
45
46        // SAFETY: pipe2 with valid pointers and O_CLOEXEC
47        let ret = unsafe { libc::pipe2(notif_fds.as_mut_ptr(), libc::O_CLOEXEC) };
48        if ret < 0 {
49            return Err(io::Error::last_os_error());
50        }
51
52        let ret = unsafe { libc::pipe2(ready_fds.as_mut_ptr(), libc::O_CLOEXEC) };
53        if ret < 0 {
54            // Close the first pair on failure
55            unsafe {
56                libc::close(notif_fds[0]);
57                libc::close(notif_fds[1]);
58            }
59            return Err(io::Error::last_os_error());
60        }
61
62        // SAFETY: pipe2 returned valid fds
63        Ok(PipePair {
64            notif_r: unsafe { OwnedFd::from_raw_fd(notif_fds[0]) },
65            notif_w: unsafe { OwnedFd::from_raw_fd(notif_fds[1]) },
66            ready_r: unsafe { OwnedFd::from_raw_fd(ready_fds[0]) },
67            ready_w: unsafe { OwnedFd::from_raw_fd(ready_fds[1]) },
68        })
69    }
70}
71
72// ============================================================
73// Pipe I/O helpers
74// ============================================================
75
76/// Write a `u32` as 4 little-endian bytes to a raw fd.
77pub(crate) fn write_u32_fd(fd: RawFd, val: u32) -> io::Result<()> {
78    let buf = val.to_le_bytes();
79    let mut written = 0usize;
80    while written < 4 {
81        let ret = unsafe {
82            libc::write(
83                fd,
84                buf[written..].as_ptr() as *const libc::c_void,
85                4 - written,
86            )
87        };
88        if ret < 0 {
89            return Err(io::Error::last_os_error());
90        }
91        written += ret as usize;
92    }
93    Ok(())
94}
95
96/// Read a `u32` (4 little-endian bytes, blocking) from a raw fd.
97pub(crate) fn read_u32_fd(fd: RawFd) -> io::Result<u32> {
98    let mut buf = [0u8; 4];
99    let mut total = 0usize;
100    while total < 4 {
101        let ret = unsafe {
102            libc::read(
103                fd,
104                buf[total..].as_mut_ptr() as *mut libc::c_void,
105                4 - total,
106            )
107        };
108        if ret < 0 {
109            return Err(io::Error::last_os_error());
110        }
111        if ret == 0 {
112            return Err(io::Error::new(
113                io::ErrorKind::UnexpectedEof,
114                "pipe closed before 4 bytes read",
115            ));
116        }
117        total += ret as usize;
118    }
119    Ok(u32::from_le_bytes(buf))
120}
121
122// ============================================================
123// Syscall name → number mapping
124// ============================================================
125
126/// Map a syscall name to its `libc::SYS_*` number.
127///
128/// Covers all names in `DEFAULT_DENY_SYSCALLS` plus extras needed for
129/// notif and arg-filter lists.
130pub fn syscall_name_to_nr(name: &str) -> Option<u32> {
131    let nr: i64 = match name {
132        "mount" => libc::SYS_mount,
133        "umount2" => libc::SYS_umount2,
134        "pivot_root" => libc::SYS_pivot_root,
135        "swapon" => libc::SYS_swapon,
136        "swapoff" => libc::SYS_swapoff,
137        "reboot" => libc::SYS_reboot,
138        "sethostname" => libc::SYS_sethostname,
139        "setdomainname" => libc::SYS_setdomainname,
140        "kexec_load" => libc::SYS_kexec_load,
141        "init_module" => libc::SYS_init_module,
142        "finit_module" => libc::SYS_finit_module,
143        "delete_module" => libc::SYS_delete_module,
144        "unshare" => libc::SYS_unshare,
145        "setns" => libc::SYS_setns,
146        "perf_event_open" => libc::SYS_perf_event_open,
147        "bpf" => libc::SYS_bpf,
148        "userfaultfd" => libc::SYS_userfaultfd,
149        "keyctl" => libc::SYS_keyctl,
150        "add_key" => libc::SYS_add_key,
151        "request_key" => libc::SYS_request_key,
152        "ptrace" => libc::SYS_ptrace,
153        "process_vm_readv" => libc::SYS_process_vm_readv,
154        "process_vm_writev" => libc::SYS_process_vm_writev,
155        "open_by_handle_at" => libc::SYS_open_by_handle_at,
156        "name_to_handle_at" => libc::SYS_name_to_handle_at,
157        "ioperm" => arch::SYS_IOPERM?,
158        "iopl" => arch::SYS_IOPL?,
159        "quotactl" => libc::SYS_quotactl,
160        "acct" => libc::SYS_acct,
161        "lookup_dcookie" => libc::SYS_lookup_dcookie,
162        // nfsservctl was removed in Linux 3.1; no libc constant — skip
163        "personality" => libc::SYS_personality,
164        "io_uring_setup" => libc::SYS_io_uring_setup,
165        "io_uring_enter" => libc::SYS_io_uring_enter,
166        "io_uring_register" => libc::SYS_io_uring_register,
167        // Additional syscalls for notif/arg filters
168        "clone" => libc::SYS_clone,
169        "clone3" => libc::SYS_clone3,
170        "vfork" => arch::SYS_VFORK?,
171        "mmap" => libc::SYS_mmap,
172        "munmap" => libc::SYS_munmap,
173        "brk" => libc::SYS_brk,
174        "mremap" => libc::SYS_mremap,
175        "connect" => libc::SYS_connect,
176        "sendto" => libc::SYS_sendto,
177        "sendmsg" => libc::SYS_sendmsg,
178        "ioctl" => libc::SYS_ioctl,
179        "socket" => libc::SYS_socket,
180        "prctl" => libc::SYS_prctl,
181        "getrandom" => libc::SYS_getrandom,
182        "openat" => libc::SYS_openat,
183        "open" => arch::SYS_OPEN?,
184        "getdents64" => libc::SYS_getdents64,
185        "getdents" => arch::SYS_GETDENTS?,
186        "bind" => libc::SYS_bind,
187        "getsockname" => libc::SYS_getsockname,
188        "clock_gettime" => libc::SYS_clock_gettime,
189        "gettimeofday" => libc::SYS_gettimeofday,
190        "time" => arch::SYS_TIME?,
191        "clock_nanosleep" => libc::SYS_clock_nanosleep,
192        "timerfd_settime" => libc::SYS_timerfd_settime,
193        "timer_settime" => libc::SYS_timer_settime,
194        "execve" => libc::SYS_execve,
195        "execveat" => libc::SYS_execveat,
196        // COW filesystem syscalls
197        "unlinkat" => libc::SYS_unlinkat,
198        "mkdirat" => libc::SYS_mkdirat,
199        "renameat2" => libc::SYS_renameat2,
200        "newfstatat" => libc::SYS_newfstatat,
201        "statx" => libc::SYS_statx,
202        "faccessat" => libc::SYS_faccessat,
203        "symlinkat" => libc::SYS_symlinkat,
204        "linkat" => libc::SYS_linkat,
205        "fchmodat" => libc::SYS_fchmodat,
206        "fchownat" => libc::SYS_fchownat,
207        "readlinkat" => libc::SYS_readlinkat,
208        "truncate" => libc::SYS_truncate,
209        "utimensat" => libc::SYS_utimensat,
210        "unlink" => arch::SYS_UNLINK?,
211        "rmdir" => arch::SYS_RMDIR?,
212        "mkdir" => arch::SYS_MKDIR?,
213        "rename" => arch::SYS_RENAME?,
214        "stat" => arch::SYS_STAT?,
215        "lstat" => arch::SYS_LSTAT?,
216        "access" => arch::SYS_ACCESS?,
217        "symlink" => arch::SYS_SYMLINK?,
218        "link" => arch::SYS_LINK?,
219        "chmod" => arch::SYS_CHMOD?,
220        "chown" => arch::SYS_CHOWN?,
221        "lchown" => arch::SYS_LCHOWN?,
222        "readlink" => arch::SYS_READLINK?,
223        "futimesat" => arch::SYS_FUTIMESAT?,
224        "fork" => arch::SYS_FORK?,
225        _ => return None,
226    };
227    Some(nr as u32)
228}
229
230// ============================================================
231// Policy → syscall lists
232// ============================================================
233
234/// Determine which syscalls need `SECCOMP_RET_USER_NOTIF`.
235pub fn notif_syscalls(policy: &Policy) -> Vec<u32> {
236    let mut nrs = vec![
237        libc::SYS_clone as u32,
238        libc::SYS_clone3 as u32,
239        libc::SYS_wait4 as u32,
240        libc::SYS_waitid as u32,
241    ];
242    arch::push_optional_syscall(&mut nrs, arch::SYS_VFORK);
243
244    if policy.max_memory.is_some() {
245        nrs.push(libc::SYS_mmap as u32);
246        nrs.push(libc::SYS_munmap as u32);
247        nrs.push(libc::SYS_brk as u32);
248        nrs.push(libc::SYS_mremap as u32);
249        nrs.push(libc::SYS_shmget as u32);
250    }
251
252    if policy.net_allow_hosts.is_some()
253        || policy.policy_fn.is_some()
254        || !policy.http_allow.is_empty()
255        || !policy.http_deny.is_empty()
256    {
257        nrs.push(libc::SYS_connect as u32);
258        nrs.push(libc::SYS_sendto as u32);
259        nrs.push(libc::SYS_sendmsg as u32);
260        nrs.push(libc::SYS_bind as u32);
261    }
262
263    if policy.random_seed.is_some() {
264        nrs.push(libc::SYS_getrandom as u32);
265        // Also intercept openat so the supervisor can re-patch vDSO after exec.
266        nrs.push(libc::SYS_openat as u32);
267    }
268
269    if policy.time_start.is_some() {
270        nrs.extend_from_slice(&[
271            libc::SYS_clock_nanosleep as u32,
272            libc::SYS_timerfd_settime as u32,
273            libc::SYS_timer_settime as u32,
274        ]);
275        // Also intercept openat so the supervisor gets a notification after exec
276        // and can re-patch the vDSO (exec replaces vDSO with a fresh copy).
277        nrs.push(libc::SYS_openat as u32);
278    }
279
280    // /proc virtualization (always on: PID filtering, sensitive path blocking)
281    nrs.push(libc::SYS_openat as u32);
282    nrs.push(libc::SYS_getdents64 as u32);
283    arch::push_optional_syscall(&mut nrs, arch::SYS_GETDENTS);
284
285    // Netlink virtualization (always on):
286    //   socket, bind, getsockname — swap in a unix socketpair for AF_NETLINK
287    //   recvfrom, recvmsg         — zero msg_name so glibc accepts the reply
288    //                                (kernel only writes sun_family on unix
289    //                                 recvmsg, leaving nl_pid uninitialized)
290    //   close                     — unregister (pid, fd) so reuse doesn't
291    //                                collide with the cookie set
292    // Send traffic flows through the real socketpair untouched.
293    nrs.push(libc::SYS_socket as u32);
294    nrs.push(libc::SYS_bind as u32);
295    nrs.push(libc::SYS_getsockname as u32);
296    nrs.push(libc::SYS_recvfrom as u32);
297    nrs.push(libc::SYS_recvmsg as u32);
298    nrs.push(libc::SYS_close as u32);
299    // Virtualize sched_getaffinity so nproc/sysconf agree with /proc/cpuinfo
300    if policy.num_cpus.is_some() {
301        nrs.push(libc::SYS_sched_getaffinity as u32);
302    }
303    if policy.hostname.is_some() {
304        nrs.push(libc::SYS_uname as u32);
305        nrs.push(libc::SYS_openat as u32);
306    }
307
308    // COW filesystem interception (seccomp-based, unprivileged)
309    if policy.workdir.is_some() && policy.fs_isolation == FsIsolation::None {
310        nrs.extend_from_slice(&[
311            libc::SYS_openat as u32,
312            libc::SYS_unlinkat as u32,
313            libc::SYS_mkdirat as u32,
314            libc::SYS_renameat2 as u32,
315            libc::SYS_symlinkat as u32,
316            libc::SYS_linkat as u32,
317            libc::SYS_fchmodat as u32,
318            libc::SYS_fchownat as u32,
319            libc::SYS_truncate as u32,
320            libc::SYS_utimensat as u32,
321            libc::SYS_newfstatat as u32,
322            libc::SYS_statx as u32,
323            libc::SYS_faccessat as u32,
324            439u32,                       // SYS_faccessat2 — glibc 2.33+ uses this instead of faccessat
325            libc::SYS_readlinkat as u32,
326            libc::SYS_getdents64 as u32,
327            libc::SYS_chdir as u32,
328            libc::SYS_getcwd as u32,
329        ]);
330        for nr in [
331            arch::SYS_OPEN, arch::SYS_UNLINK, arch::SYS_RMDIR, arch::SYS_MKDIR,
332            arch::SYS_RENAME, arch::SYS_SYMLINK, arch::SYS_LINK, arch::SYS_CHMOD,
333            arch::SYS_CHOWN, arch::SYS_LCHOWN, arch::SYS_STAT, arch::SYS_LSTAT,
334            arch::SYS_ACCESS, arch::SYS_READLINK, arch::SYS_GETDENTS,
335        ] {
336            arch::push_optional_syscall(&mut nrs, nr);
337        }
338    }
339
340    // Chroot path interception
341    if policy.chroot.is_some() {
342        nrs.extend_from_slice(&[
343            libc::SYS_openat as u32,
344            libc::SYS_execve as u32,
345            libc::SYS_execveat as u32,
346            libc::SYS_unlinkat as u32,
347            libc::SYS_mkdirat as u32,
348            libc::SYS_renameat2 as u32,
349            libc::SYS_symlinkat as u32,
350            libc::SYS_linkat as u32,
351            libc::SYS_fchmodat as u32,
352            libc::SYS_fchownat as u32,
353            libc::SYS_truncate as u32,
354            libc::SYS_newfstatat as u32,
355            libc::SYS_statx as u32,
356            libc::SYS_faccessat as u32,
357            439u32,                       // SYS_faccessat2 — glibc 2.33+ uses this instead of faccessat
358            libc::SYS_readlinkat as u32,
359            libc::SYS_getdents64 as u32,
360            libc::SYS_chdir as u32,
361            libc::SYS_getcwd as u32,
362            libc::SYS_statfs as u32,
363            libc::SYS_utimensat as u32,
364        ]);
365        for nr in [
366            arch::SYS_OPEN, arch::SYS_STAT, arch::SYS_LSTAT, arch::SYS_ACCESS,
367            arch::SYS_READLINK, arch::SYS_GETDENTS, arch::SYS_UNLINK,
368            arch::SYS_RMDIR, arch::SYS_MKDIR, arch::SYS_RENAME,
369            arch::SYS_SYMLINK, arch::SYS_LINK, arch::SYS_CHMOD,
370            arch::SYS_CHOWN, arch::SYS_LCHOWN,
371        ] {
372            arch::push_optional_syscall(&mut nrs, nr);
373        }
374    }
375
376    // Explicit deny-paths need path-bearing syscalls intercepted.
377    if !policy.fs_denied.is_empty() {
378        nrs.extend_from_slice(&[
379            libc::SYS_openat as u32,
380            libc::SYS_execve as u32,
381            libc::SYS_execveat as u32,
382            libc::SYS_linkat as u32,
383            libc::SYS_renameat2 as u32,
384            libc::SYS_symlinkat as u32,
385        ]);
386        for nr in [arch::SYS_OPEN, arch::SYS_LINK, arch::SYS_RENAME, arch::SYS_SYMLINK] {
387            arch::push_optional_syscall(&mut nrs, nr);
388        }
389    }
390
391    // Dynamic policy callback — intercept key syscalls for event emission.
392    if policy.policy_fn.is_some() {
393        nrs.extend_from_slice(&[
394            libc::SYS_openat as u32,
395            libc::SYS_connect as u32,
396            libc::SYS_sendto as u32,
397            libc::SYS_bind as u32,
398            libc::SYS_execve as u32,
399            libc::SYS_execveat as u32,
400        ]);
401    }
402
403    // Port remapping
404    if policy.port_remap {
405        nrs.extend_from_slice(&[
406            libc::SYS_bind as u32,
407            libc::SYS_getsockname as u32,
408        ]);
409    }
410
411    nrs.sort_unstable();
412    nrs.dedup();
413    nrs
414}
415
416/// Resolve `NO_SUPERVISOR_DENY_SYSCALLS` names to numbers.
417pub fn no_supervisor_deny_syscall_numbers() -> Vec<u32> {
418    use crate::sys::structs::NO_SUPERVISOR_DENY_SYSCALLS;
419    NO_SUPERVISOR_DENY_SYSCALLS
420        .iter()
421        .filter_map(|n| syscall_name_to_nr(n))
422        .collect()
423}
424
425/// Resolve `deny_syscalls` names to numbers.
426///
427/// If both `deny_syscalls` and `allow_syscalls` are `None`, returns the
428/// numbers for `DEFAULT_DENY_SYSCALLS`.
429pub fn deny_syscall_numbers(policy: &Policy) -> Vec<u32> {
430    if let Some(ref names) = policy.deny_syscalls {
431        names
432            .iter()
433            .filter_map(|n| syscall_name_to_nr(n))
434            .collect()
435    } else if policy.allow_syscalls.is_none() {
436        DEFAULT_DENY_SYSCALLS
437            .iter()
438            .filter_map(|n| syscall_name_to_nr(n))
439            .collect()
440    } else {
441        // allow_syscalls is set — no deny list
442        Vec::new()
443    }
444}
445
446/// Build argument-level seccomp filter instructions matching the Python
447/// `_build_arg_filters()` exactly.
448///
449/// Returns a `Vec<SockFilter>` containing self-contained BPF blocks for:
450///   - clone: block namespace creation flags
451///   - ioctl: block TIOCSTI, TIOCLINUX, SIOCGIF*, SIOCETHTOOL
452///   - prctl: block PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER
453///   - socket: block SOCK_RAW/SOCK_DGRAM on AF_INET/AF_INET6 (with type mask)
454pub fn arg_filters(policy: &Policy) -> Vec<SockFilter> {
455    let ret_errno = SECCOMP_RET_ERRNO | EPERM as u32;
456    let nr_clone = libc::SYS_clone as u32;
457    let nr_ioctl = libc::SYS_ioctl as u32;
458    let nr_prctl = libc::SYS_prctl as u32;
459    let nr_socket = libc::SYS_socket as u32;
460
461    let mut insns: Vec<SockFilter> = Vec::new();
462
463    // --- clone: block namespace creation flags ---
464    // 5 instructions:
465    //   LD NR
466    //   JEQ clone → +0, skip 3
467    //   LD arg0
468    //   JSET NS_FLAGS → +0, skip 1
469    //   RET ERRNO
470    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
471    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_clone, 0, 3));
472    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
473    insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, CLONE_NS_FLAGS as u32, 0, 1));
474    insns.push(stmt(BPF_RET | BPF_K, ret_errno));
475
476    // --- ioctl: block dangerous commands ---
477    // Block terminal injection (TIOCSTI, TIOCLINUX) and network interface
478    // enumeration ioctls (SIOCGIF*, SIOCETHTOOL) to complement NETLINK_ROUTE
479    // virtualization.
480    // Layout: LD NR, JEQ ioctl (skip 1 + N*2), LD arg1, [JEQ cmd, RET ERRNO] * N
481    let dangerous_ioctls: &[u32] = &[
482        TIOCSTI as u32,
483        TIOCLINUX as u32,
484        SIOCGIFNAME as u32,
485        SIOCGIFCONF as u32,
486        SIOCGIFFLAGS as u32,
487        SIOCGIFADDR as u32,
488        SIOCGIFDSTADDR as u32,
489        SIOCGIFBRDADDR as u32,
490        SIOCGIFNETMASK as u32,
491        SIOCGIFHWADDR as u32,
492        SIOCGIFINDEX as u32,
493        SIOCETHTOOL as u32,
494    ];
495    let n_ioctls = dangerous_ioctls.len();
496    let skip_count = (1 + n_ioctls * 2) as u8;
497    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
498    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_ioctl, 0, skip_count));
499    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
500    for &cmd in dangerous_ioctls {
501        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, cmd, 0, 1));
502        insns.push(stmt(BPF_RET | BPF_K, ret_errno));
503    }
504
505    // --- prctl: block dangerous options ---
506    // Layout: LD NR, JEQ prctl (skip 1 + N*2), LD arg0, [JEQ op, RET ERRNO] * N
507    let dangerous_prctl_ops: &[u32] = &[PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER];
508    let n_ops = dangerous_prctl_ops.len();
509    let skip_count = (1 + n_ops * 2) as u8;
510    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
511    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_prctl, 0, skip_count));
512    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
513    for &op in dangerous_prctl_ops {
514        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, op, 0, 1));
515        insns.push(stmt(BPF_RET | BPF_K, ret_errno));
516    }
517
518    // --- socket: block SOCK_RAW and/or SOCK_DGRAM on AF_INET/AF_INET6 ---
519    let mut blocked_types: Vec<u32> = Vec::new();
520    if policy.no_raw_sockets {
521        blocked_types.push(SOCK_RAW);
522    }
523    if policy.no_udp {
524        blocked_types.push(SOCK_DGRAM);
525    }
526
527    if !blocked_types.is_empty() {
528        let n = blocked_types.len();
529        // Instructions after domain checks: 2 (load+AND) + N (JEQs) + 1 (RET)
530        let after_domain = 2 + n + 1;
531        // Total after NR check: 3 (load domain + 2 JEQs) + after_domain
532        let skip_all = (3 + after_domain) as u8;
533
534        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
535        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_socket, 0, skip_all));
536        // Load domain (arg0)
537        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
538        // AF_INET → skip to type check (jump over AF_INET6 check)
539        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET, 1, 0));
540        // AF_INET6 → type check; else skip everything remaining
541        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET6, 0, after_domain as u8));
542        // Load type (arg1) and mask off SOCK_NONBLOCK|SOCK_CLOEXEC
543        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
544        insns.push(stmt(BPF_ALU | BPF_AND | BPF_K, SOCK_TYPE_MASK));
545        // Check each blocked type
546        for (i, &sock_type) in blocked_types.iter().enumerate() {
547            let remaining = n - i - 1;
548            // Match → jump to RET ERRNO (skip 'remaining' JEQs ahead)
549            // No match on last type → skip past RET ERRNO (jf=1)
550            // No match on non-last → check next type (jf=0)
551            let jf: u8 = if remaining == 0 { 1 } else { 0 };
552            insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, sock_type, remaining as u8, jf));
553        }
554        // Deny return (reached by any matching JEQ)
555        insns.push(stmt(BPF_RET | BPF_K, ret_errno));
556    }
557
558    // --- wait4: skip notification for WNOHANG/WNOWAIT (non-blocking) ---
559    // wait4(pid, status, options, rusage) — options is arg2
560    // 5 instructions:
561    //   LD NR
562    //   JEQ wait4 → +0, skip 3
563    //   LD arg2
564    //   JSET (WNOHANG|WNOWAIT) → +0, skip 1
565    //   RET ALLOW
566    {
567        let nr_wait4 = libc::SYS_wait4 as u32;
568        let wnohang_or_wnowait = (libc::WNOHANG | 0x0100_0000/* WNOWAIT */) as u32;
569        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
570        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_wait4, 0, 3));
571        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS2_LO));
572        insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, wnohang_or_wnowait, 0, 1));
573        insns.push(stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
574    }
575
576    // --- waitid: skip notification for WNOHANG/WNOWAIT (non-blocking) ---
577    // waitid(idtype, id, infop, options, rusage) — options is arg3
578    {
579        let nr_waitid = libc::SYS_waitid as u32;
580        let wnohang_or_wnowait = (libc::WNOHANG | 0x0100_0000/* WNOWAIT */) as u32;
581        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
582        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_waitid, 0, 3));
583        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS3_LO));
584        insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, wnohang_or_wnowait, 0, 1));
585        insns.push(stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
586    }
587
588    insns
589}
590
591// ============================================================
592// Close fds above threshold
593// ============================================================
594
595/// Close all file descriptors above `min_fd`, except those in `keep`.
596fn close_fds_above(min_fd: RawFd, keep: &[RawFd]) {
597    // Read /proc/self/fd to enumerate open fds.
598    // Collect all fd numbers first, then close them after dropping the directory
599    // iterator. This avoids closing the directory fd during iteration.
600    let fds_to_close: Vec<RawFd> = {
601        let dir = match std::fs::read_dir("/proc/self/fd") {
602            Ok(d) => d,
603            Err(_) => return,
604        };
605        dir.flatten()
606            .filter_map(|entry| {
607                entry.file_name().into_string().ok()
608                    .and_then(|name| name.parse::<RawFd>().ok())
609            })
610            .filter(|&fd| fd > min_fd && !keep.contains(&fd))
611            .collect()
612    };
613    // The directory is now closed; safe to close the collected fds.
614    for fd in fds_to_close {
615        unsafe { libc::close(fd) };
616    }
617}
618
619// ============================================================
620// COW filesystem config passed from parent to child
621// ============================================================
622
623// Re-export ChildMountConfig so callers can use the old import path.
624pub(crate) use crate::cow::ChildMountConfig;
625
/// Install single-entry uid/gid maps for an unprivileged user namespace.
///
/// `real_uid`/`real_gid` have to be sampled *before* unshare(CLONE_NEWUSER):
/// afterwards getuid()/getgid() report the overflow id (65534).
/// `target_uid`/`target_gid` are the ids seen inside the namespace.
///
/// Writes happen in the order uid_map → setgroups ("deny") → gid_map, and
/// every write is best-effort: failures are ignored.
fn write_id_maps(real_uid: u32, real_gid: u32, target_uid: u32, target_gid: u32) {
    let uid_line = format!("{} {} 1\n", target_uid, real_uid);
    let gid_line = format!("{} {} 1\n", target_gid, real_gid);

    let steps = [
        ("/proc/self/uid_map", uid_line.as_str()),
        ("/proc/self/setgroups", "deny\n"),
        ("/proc/self/gid_map", gid_line.as_str()),
    ];
    for (path, contents) in steps {
        let _ = std::fs::write(path, contents);
    }
}
635
636/// Write uid/gid maps using the post-unshare overflow uid (65534).
637/// Used by the OverlayFS COW path which maps to root (UID 0) inside.
638fn write_id_maps_overflow() {
639    let uid = unsafe { libc::getuid() };
640    let gid = unsafe { libc::getgid() };
641    write_id_maps(uid, gid, 0, 0);
642}
643
644// ============================================================
645// Child-side confinement (never returns)
646// ============================================================
647
648/// Apply irreversible confinement (Landlock + seccomp) then exec the command.
649///
650/// This function **never returns**: it calls `execvp` on success or
651/// `_exit(127)` on any error.
652pub(crate) fn confine_child(policy: &Policy, cmd: &[CString], pipes: &PipePair, cow_config: Option<&ChildMountConfig>, nested: bool, keep_fds: &[RawFd]) -> ! {
653    // Helper: abort child on error. Includes the OS error automatically.
654    macro_rules! fail {
655        ($msg:expr) => {{
656            let err = std::io::Error::last_os_error();
657            let _ = write!(std::io::stderr(), "sandlock child: {}: {}\n", $msg, err);
658            unsafe { libc::_exit(127) };
659        }};
660    }
661
662    use std::io::Write;
663
664    // 1. New process group
665    if unsafe { libc::setpgid(0, 0) } != 0 {
666        fail!("setpgid");
667    }
668
669    // 1b. If stdin is a terminal, become the foreground process group
670    //     so interactive shells can read from the TTY.
671    //     Must ignore SIGTTOU first — a background pgrp calling tcsetpgrp
672    //     gets stopped by SIGTTOU otherwise.
673    if unsafe { libc::isatty(0) } == 1 {
674        unsafe {
675            libc::signal(libc::SIGTTOU, libc::SIG_IGN);
676            libc::tcsetpgrp(0, libc::getpgrp());
677            libc::signal(libc::SIGTTOU, libc::SIG_DFL);
678        }
679    }
680
681    // 2. Die if parent exits
682    if unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) } != 0 {
683        fail!("prctl(PR_SET_PDEATHSIG)");
684    }
685
686    // 3. Check parent didn't die between fork and prctl
687    if unsafe { libc::getppid() } == 1 {
688        fail!("parent died before confinement");
689    }
690
691    // 4. Optional: disable ASLR
692    if policy.no_randomize_memory {
693        const ADDR_NO_RANDOMIZE: libc::c_ulong = 0x0040000;
694        // Read current personality first (0xffffffff = query), then OR in the flag.
695        let current = unsafe { libc::personality(0xffffffff) };
696        if current == -1 {
697            fail!("personality(query)");
698        }
699        if unsafe { libc::personality(current as libc::c_ulong | ADDR_NO_RANDOMIZE) } == -1 {
700            fail!("personality(ADDR_NO_RANDOMIZE)");
701        }
702    }
703
704    // 4b. Optional: CPU core binding
705    if let Some(ref cores) = policy.cpu_cores {
706        if !cores.is_empty() {
707            let mut set = unsafe { std::mem::zeroed::<libc::cpu_set_t>() };
708            unsafe { libc::CPU_ZERO(&mut set) };
709            for &core in cores {
710                unsafe { libc::CPU_SET(core as usize, &mut set) };
711            }
712            if unsafe {
713                libc::sched_setaffinity(
714                    0,
715                    std::mem::size_of::<libc::cpu_set_t>(),
716                    &set,
717                )
718            } != 0
719            {
720                fail!("sched_setaffinity");
721            }
722        }
723    }
724
725    // 5. Optional: disable THP
726    if policy.no_huge_pages {
727        if unsafe { libc::prctl(libc::PR_SET_THP_DISABLE, 1, 0, 0, 0) } != 0 {
728            fail!("prctl(PR_SET_THP_DISABLE)");
729        }
730    }
731
732    // 5c. Optional: disable core dumps
733    if policy.no_coredump {
734        // Set RLIMIT_CORE to 0 — the kernel will not write a core file.
735        // We intentionally do NOT call prctl(PR_SET_DUMPABLE, 0) because
736        // that would break pidfd_getfd which the supervisor needs.
737        // The seccomp filter already blocks the child from calling
738        // prctl(PR_SET_DUMPABLE, ...) so it can't re-enable it.
739        let rlim = libc::rlimit { rlim_cur: 0, rlim_max: 0 };
740        if unsafe { libc::setrlimit(libc::RLIMIT_CORE, &rlim) } != 0 {
741            fail!("setrlimit(RLIMIT_CORE, 0)");
742        }
743    }
744
745    // Capture real uid/gid before any unshare (after unshare they become 65534)
746    let real_uid = unsafe { libc::getuid() };
747    let real_gid = unsafe { libc::getgid() };
748
749    // 5b. User namespace for --uid mapping (when not using OverlayFS COW,
750    //     which sets up its own user namespace)
751    if let Some(target_uid) = policy.uid {
752        if cow_config.is_none() {
753            if unsafe { libc::unshare(libc::CLONE_NEWUSER) } != 0 {
754                fail!("unshare(CLONE_NEWUSER)");
755            }
756            write_id_maps(real_uid, real_gid, target_uid, target_uid);
757        }
758    }
759
760    // 5c. User + mount namespace for OverlayFS COW (includes CLONE_NEWUSER)
761    if let Some(ref cow) = cow_config {
762        // unshare user + mount namespaces (unprivileged)
763        if unsafe { libc::unshare(libc::CLONE_NEWUSER | libc::CLONE_NEWNS) } != 0 {
764            fail!("unshare(CLONE_NEWUSER | CLONE_NEWNS)");
765        }
766
767        // Write uid/gid maps using overflow uid (preserves existing COW behavior)
768        write_id_maps_overflow();
769
770        // Mount the overlay filesystem ON TOP of the workdir so the child
771        // sees the merged view at the original path.  The kernel resolves
772        // lowerdir before the covering mount takes effect, so using the
773        // same path as both lowerdir and mount-point is safe inside our
774        // private mount namespace.
775        let lowerdir = cow.lowers.iter()
776            .map(|p| p.display().to_string())
777            .collect::<Vec<_>>()
778            .join(":");
779        let opts = format!(
780            "lowerdir={},upperdir={},workdir={}",
781            lowerdir,
782            cow.upper.display(),
783            cow.work.display(),
784        );
785
786        let mount_cstr = match CString::new(cow.mount_point.to_str().unwrap_or("")) {
787            Ok(c) => c,
788            Err(_) => fail!("invalid overlay mount point path"),
789        };
790        let overlay_cstr = CString::new("overlay").unwrap();
791        let opts_cstr = match CString::new(opts) {
792            Ok(c) => c,
793            Err(_) => fail!("invalid overlay opts"),
794        };
795
796        let ret = unsafe {
797            libc::mount(
798                overlay_cstr.as_ptr(),
799                mount_cstr.as_ptr(),
800                overlay_cstr.as_ptr(),
801                0,
802                opts_cstr.as_ptr() as *const libc::c_void,
803            )
804        };
805        if ret != 0 {
806            fail!("mount overlay");
807        }
808    }
809
810    // 6. Optional: change working directory
811    // cwd controls where the child starts; workdir is only for COW
812    let effective_cwd = if let Some(ref cwd) = policy.cwd {
813        if let Some(ref chroot_root) = policy.chroot {
814            Some(chroot_root.join(cwd.strip_prefix("/").unwrap_or(cwd)))
815        } else {
816            Some(cwd.clone())
817        }
818    } else if let Some(ref chroot_root) = policy.chroot {
819        // Default to chroot root
820        Some(chroot_root.to_path_buf())
821    } else if let Some(ref workdir) = policy.workdir {
822        // Default to workdir when set (COW working directory)
823        Some(workdir.clone())
824    } else {
825        None
826    };
827
828    if let Some(ref cwd) = effective_cwd {
829        let c_path = match CString::new(cwd.as_os_str().as_encoded_bytes()) {
830            Ok(c) => c,
831            Err(_) => fail!("invalid cwd path"),
832        };
833        if unsafe { libc::chdir(c_path.as_ptr()) } != 0 {
834            fail!("chdir");
835        }
836    }
837
838    // 7. Set NO_NEW_PRIVS (required for both Landlock and seccomp without CAP_SYS_ADMIN)
839    if unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) } != 0 {
840        fail!("prctl(PR_SET_NO_NEW_PRIVS)");
841    }
842
843    // 8. Apply Landlock confinement (IRREVERSIBLE)
844    if let Err(e) = crate::landlock::confine(policy) {
845        fail!(format!("landlock: {}", e));
846    }
847
848    // 9. Assemble and install seccomp filter (IRREVERSIBLE)
849    let deny = deny_syscall_numbers(policy);
850    let args = arg_filters(policy);
851    let mut keep_fd: i32 = -1;
852
853    if nested {
854        // Nested sandbox: deny-only filter (no supervisor — parent handles it).
855        // BPF filters are ANDed by the kernel, so each level can only tighten.
856        let filter = match bpf::assemble_filter(&[], &deny, &args) {
857            Ok(f) => f,
858            Err(e) => fail!(format!("seccomp assemble: {}", e)),
859        };
860        if let Err(e) = bpf::install_deny_filter(&filter) {
861            fail!(format!("seccomp deny filter: {}", e));
862        }
863        // Signal nested mode to parent (fd=0 means no supervisor needed)
864        if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), 0) {
865            fail!(format!("write nested signal: {}", e));
866        }
867    } else {
868        // First-level sandbox: notif + deny filter with NEW_LISTENER.
869        let notif = notif_syscalls(policy);
870        let filter = match bpf::assemble_filter(&notif, &deny, &args) {
871            Ok(f) => f,
872            Err(e) => fail!(format!("seccomp assemble: {}", e)),
873        };
874        let notif_fd = match bpf::install_filter(&filter) {
875            Ok(fd) => fd,
876            Err(e) => fail!(format!("seccomp install: {}", e)),
877        };
878        keep_fd = notif_fd.as_raw_fd();
879        if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), keep_fd as u32) {
880            fail!(format!("write notif fd: {}", e));
881        }
882        std::mem::forget(notif_fd);
883    }
884
885    // Mark this process as confined for in-process nesting detection
886    crate::sandbox::CONFINED.store(true, std::sync::atomic::Ordering::Relaxed);
887
888    // 10. Wait for parent to signal ready
889    match read_u32_fd(pipes.ready_r.as_raw_fd()) {
890        Ok(_) => {}
891        Err(e) => fail!(format!("read ready signal: {}", e)),
892    }
893
894    // 12. Close all fds above stderr (always on for isolation)
895    let mut fds_to_keep: Vec<RawFd> = keep_fds.to_vec();
896    if keep_fd >= 0 {
897        fds_to_keep.push(keep_fd);
898    }
899    close_fds_above(2, &fds_to_keep);
900
901    // 13. Apply environment
902    if policy.clean_env {
903        // Clear all env vars first
904        for (key, _) in std::env::vars_os() {
905            std::env::remove_var(&key);
906        }
907    }
908    for (key, value) in &policy.env {
909        std::env::set_var(key, value);
910    }
911
912    // 13b. GPU device visibility
913    if let Some(ref devices) = policy.gpu_devices {
914        if !devices.is_empty() {
915            let vis = devices.iter().map(|d| d.to_string()).collect::<Vec<_>>().join(",");
916            std::env::set_var("CUDA_VISIBLE_DEVICES", &vis);
917            std::env::set_var("ROCR_VISIBLE_DEVICES", &vis);
918        }
919        // Empty list = all GPUs visible, don't set env vars
920    }
921
922    // 14. exec
923    debug_assert!(!cmd.is_empty(), "cmd must not be empty");
924    let argv_ptrs: Vec<*const libc::c_char> = cmd
925        .iter()
926        .map(|s| s.as_ptr())
927        .chain(std::iter::once(std::ptr::null()))
928        .collect();
929
930    if policy.chroot.is_some() {
931        // With chroot the seccomp handler rewrites the filename to a host path
932        // (or /proc/self/fd/N).  Pass a separate PATH_MAX buffer as the `file`
933        // argument so the rewrite does not corrupt argv[0] — which must stay as
934        // the original command name (e.g. busybox uses argv[0] for applet
935        // detection).  execvp still handles PATH lookup for bare command names.
936        let mut exec_path = vec![0u8; libc::PATH_MAX as usize];
937        let orig = cmd[0].as_bytes_with_nul();
938        exec_path[..orig.len()].copy_from_slice(orig);
939
940        unsafe {
941            libc::execvp(
942                exec_path.as_ptr() as *const libc::c_char,
943                argv_ptrs.as_ptr(),
944            )
945        };
946    } else {
947        unsafe { libc::execvp(argv_ptrs[0], argv_ptrs.as_ptr()) };
948    }
949
950    // If we get here, exec failed
951    fail!(format!("execvp '{}'", cmd[0].to_string_lossy()));
952}
953
954// ============================================================
955// Tests
956// ============================================================
957
#[cfg(test)]
mod tests {
    use super::*;

    /// Writes `val` into the notif pipe of a fresh [`PipePair`] and returns
    /// whatever is read back from the other end.
    fn roundtrip_u32(val: u32) -> u32 {
        let pipes = PipePair::new().expect("pipe creation failed");
        write_u32_fd(pipes.notif_w.as_raw_fd(), val).expect("write failed");
        read_u32_fd(pipes.notif_r.as_raw_fd()).expect("read failed")
    }

    /// A new pipe pair exposes four valid, pairwise-distinct descriptors.
    #[test]
    fn test_pipe_pair_creation() {
        let pipes = PipePair::new().expect("pipe creation failed");
        let fds = [
            pipes.notif_r.as_raw_fd(),
            pipes.notif_w.as_raw_fd(),
            pipes.ready_r.as_raw_fd(),
            pipes.ready_w.as_raw_fd(),
        ];

        // Every descriptor must be valid (non-negative).
        assert!(fds.iter().all(|&fd| fd >= 0));

        // Compare each descriptor against all the ones that follow it.
        for (idx, fd) in fds.iter().enumerate() {
            for other in &fds[idx + 1..] {
                assert_ne!(fd, other);
            }
        }
    }

    /// A small value survives a write/read round trip through the pipe.
    #[test]
    fn test_write_read_u32() {
        let val = 42u32;
        assert_eq!(roundtrip_u32(val), val);
    }

    /// A value with the high bit set survives the round trip unchanged.
    #[test]
    fn test_write_read_u32_large() {
        let val = 0xDEAD_BEEFu32;
        assert_eq!(roundtrip_u32(val), val);
    }

    /// The default policy's notification list contains the clone family.
    #[test]
    fn test_notif_syscalls_always_has_clone() {
        let default_policy = Policy::builder().build().unwrap();
        let trapped = notif_syscalls(&default_policy);

        for nr in [libc::SYS_clone, libc::SYS_clone3] {
            assert!(trapped.contains(&(nr as u32)));
        }
        // vfork is only checked where the architecture table defines it.
        if let Some(vfork) = arch::SYS_VFORK {
            assert!(trapped.contains(&(vfork as u32)));
        }
    }

    /// Setting a memory limit adds the memory-management syscalls.
    #[test]
    fn test_notif_syscalls_memory() {
        let limited = Policy::builder()
            .max_memory(crate::policy::ByteSize::mib(256))
            .build()
            .unwrap();
        let trapped = notif_syscalls(&limited);

        let expected = [
            libc::SYS_mmap,
            libc::SYS_munmap,
            libc::SYS_brk,
            libc::SYS_mremap,
            libc::SYS_shmget,
        ];
        for nr in expected {
            assert!(trapped.contains(&(nr as u32)));
        }
    }

    /// Allowing a network host adds the connect/send syscalls.
    #[test]
    fn test_notif_syscalls_net() {
        let networked = Policy::builder()
            .net_allow_host("example.com")
            .build()
            .unwrap();
        let trapped = notif_syscalls(&networked);

        for nr in [libc::SYS_connect, libc::SYS_sendto, libc::SYS_sendmsg] {
            assert!(trapped.contains(&(nr as u32)));
        }
    }

    /// SYS_faccessat2 (439) must be in the notification filter for both
    /// chroot and COW modes — glibc 2.33+ uses it instead of faccessat.
    #[test]
    fn test_notif_syscalls_faccessat2() {
        const SYS_FACCESSAT2: u32 = 439;
        let faccessat = libc::SYS_faccessat as u32;

        // Chroot mode
        {
            let chroot_policy = Policy::builder().chroot("/tmp").build().unwrap();
            let trapped = notif_syscalls(&chroot_policy);
            assert!(trapped.contains(&faccessat));
            assert!(
                trapped.contains(&SYS_FACCESSAT2),
                "chroot notif filter must include SYS_faccessat2 (439)"
            );
        }

        // COW mode
        {
            let cow_policy = Policy::builder().workdir("/tmp").build().unwrap();
            let trapped = notif_syscalls(&cow_policy);
            assert!(trapped.contains(&faccessat));
            assert!(
                trapped.contains(&SYS_FACCESSAT2),
                "COW notif filter must include SYS_faccessat2 (439)"
            );
        }
    }

    /// The default deny list resolves to a non-empty set that includes
    /// mount, ptrace and bpf.
    #[test]
    fn test_deny_syscall_numbers_default() {
        let default_policy = Policy::builder().build().unwrap();
        let denied = deny_syscall_numbers(&default_policy);

        // Should contain mount, ptrace, etc.
        for nr in [libc::SYS_mount, libc::SYS_ptrace, libc::SYS_bpf] {
            assert!(denied.contains(&(nr as u32)));
        }
        // nfsservctl has no libc constant, so it is skipped
        assert!(!denied.is_empty());
    }

    /// A custom deny list yields exactly the named syscalls.
    #[test]
    fn test_deny_syscall_numbers_custom() {
        let custom = Policy::builder()
            .deny_syscalls(vec!["mount".into(), "ptrace".into()])
            .build()
            .unwrap();
        let denied = deny_syscall_numbers(&custom);

        assert_eq!(denied.len(), 2);
        for nr in [libc::SYS_mount, libc::SYS_ptrace] {
            assert!(denied.contains(&(nr as u32)));
        }
    }

    /// When an allow list is configured, no deny numbers are produced.
    #[test]
    fn test_deny_syscall_numbers_empty_when_allow_set() {
        let allow_only = Policy::builder()
            .allow_syscalls(vec!["read".into(), "write".into()])
            .build()
            .unwrap();
        assert!(deny_syscall_numbers(&allow_only).is_empty());
    }

    /// The default argument filters reference the clone, ioctl and prctl
    /// syscall numbers together with the argument values they guard.
    #[test]
    fn test_arg_filters_has_clone_ioctl_prctl_socket() {
        use crate::sys::structs::{BPF_JEQ, BPF_JMP, BPF_JSET, BPF_K};

        let default_policy = Policy::builder().build().unwrap();
        let filters = arg_filters(&default_policy);

        let jeq = BPF_JMP | BPF_JEQ | BPF_K;
        let jset = BPF_JMP | BPF_JSET | BPF_K;
        // True when some instruction is a JEQ against the immediate `k`.
        let has_jeq = |k: u32| filters.iter().any(|insn| insn.code == jeq && insn.k == k);

        // Should contain JEQ for clone syscall nr
        assert!(has_jeq(libc::SYS_clone as u32));
        // Should contain JSET for CLONE_NS_FLAGS
        assert!(filters
            .iter()
            .any(|insn| insn.code == jset && insn.k == CLONE_NS_FLAGS as u32));
        // Should contain JEQ for ioctl syscall nr
        assert!(has_jeq(libc::SYS_ioctl as u32));
        // Should contain JEQ for TIOCSTI, TIOCLINUX, and SIOCGIF*/SIOCETHTOOL
        assert!(has_jeq(TIOCSTI as u32));
        assert!(has_jeq(TIOCLINUX as u32));
        assert!(has_jeq(SIOCGIFCONF as u32));
        assert!(has_jeq(SIOCETHTOOL as u32));
        // Should contain JEQ for prctl syscall nr
        assert!(has_jeq(libc::SYS_prctl as u32));
        // Should contain JEQ for PR_SET_DUMPABLE
        assert!(has_jeq(PR_SET_DUMPABLE));
    }

    /// With `no_raw_sockets`, the filters compare the address family and
    /// the masked socket type against SOCK_RAW.
    #[test]
    fn test_arg_filters_raw_sockets() {
        use crate::sys::structs::{BPF_ALU, BPF_AND, BPF_JEQ, BPF_JMP, BPF_K};

        let no_raw = Policy::builder().no_raw_sockets(true).build().unwrap();
        let filters = arg_filters(&no_raw);

        let jeq = BPF_JMP | BPF_JEQ | BPF_K;
        let alu_and = BPF_ALU | BPF_AND | BPF_K;
        // True when some instruction is a JEQ against the immediate `k`.
        let has_jeq = |k: u32| filters.iter().any(|insn| insn.code == jeq && insn.k == k);

        // Should have AF_INET check
        assert!(has_jeq(AF_INET));
        // Should have AF_INET6 check
        assert!(has_jeq(AF_INET6));
        // Should have ALU AND SOCK_TYPE_MASK
        assert!(filters
            .iter()
            .any(|insn| insn.code == alu_and && insn.k == SOCK_TYPE_MASK));
        // Should have JEQ SOCK_RAW
        assert!(has_jeq(SOCK_RAW));
    }

    /// With `no_udp`, the filters contain a comparison against SOCK_DGRAM.
    #[test]
    fn test_arg_filters_no_udp() {
        use crate::sys::structs::{BPF_JEQ, BPF_JMP, BPF_K};

        let no_udp = Policy::builder().no_udp(true).build().unwrap();
        let filters = arg_filters(&no_udp);

        // Should have JEQ SOCK_DGRAM
        let jeq = BPF_JMP | BPF_JEQ | BPF_K;
        assert!(filters
            .iter()
            .any(|insn| insn.code == jeq && insn.k == SOCK_DGRAM));
    }

    /// Every default-deny name resolves to a number, apart from an explicit
    /// list of names that are expected to be unresolvable on this target.
    #[test]
    fn test_syscall_name_to_nr_covers_defaults() {
        // Every name in DEFAULT_DENY_SYSCALLS should resolve unless the
        // running architecture does not expose that syscall.
        let expected_unresolved: &[&str] = &[
            "nfsservctl",
            #[cfg(target_arch = "aarch64")]
            "ioperm",
            #[cfg(target_arch = "aarch64")]
            "iopl",
        ];

        let mut unresolved = 0;
        for name in DEFAULT_DENY_SYSCALLS {
            if syscall_name_to_nr(name).is_some() {
                continue;
            }
            // Anything that fails to resolve must be on the expected list.
            assert!(
                expected_unresolved.contains(name),
                "unexpected unresolved syscall: {}",
                name
            );
            unresolved += 1;
        }
        // All expected names must actually have been encountered.
        assert_eq!(unresolved, expected_unresolved.len());
    }
}