Skip to main content

sandlock_core/
context.rs

1// Fork + confinement sequence: child-side Landlock + seccomp application
2// and parent-child pipe synchronization.
3
4use std::ffi::CString;
5use std::io;
6use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd};
7
8use crate::policy::{FsIsolation, Policy};
9use crate::seccomp::bpf::{self, stmt, jump};
10use crate::sys::structs::{
11    AF_INET, AF_INET6, AF_NETLINK,
12    BPF_ABS, BPF_ALU, BPF_AND, BPF_JEQ, BPF_JSET, BPF_JMP, BPF_K, BPF_LD, BPF_RET, BPF_W,
13    CLONE_NS_FLAGS, DEFAULT_DENY_SYSCALLS, EPERM,
14    SECCOMP_RET_ALLOW, SECCOMP_RET_ERRNO,
15    SOCK_DGRAM, SOCK_RAW, SOCK_TYPE_MASK, TIOCLINUX, TIOCSTI,
16    PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER,
17    OFFSET_ARGS0_LO, OFFSET_ARGS1_LO, OFFSET_ARGS2_LO, OFFSET_ARGS3_LO, OFFSET_NR,
18    SockFilter,
19};
20
21// ============================================================
22// Pipe pair for parent-child synchronization
23// ============================================================
24
/// The two pipes used to synchronize parent and child after `fork()`.
///
/// One pipe carries the notif fd number from the child to the parent
/// (`notif_*`); the other carries the "supervisor ready" signal from the
/// parent to the child (`ready_*`).  Each field owns its descriptor, so the
/// descriptor is closed when the field is dropped.
pub struct PipePair {
    /// Read end of the notif pipe: the parent reads the notif fd number
    /// written by the child.
    pub notif_r: OwnedFd,
    /// Write end of the notif pipe: the child writes the notif fd number
    /// for the parent.
    pub notif_w: OwnedFd,
    /// Read end of the ready pipe: the child reads the "supervisor ready"
    /// signal sent by the parent.
    pub ready_r: OwnedFd,
    /// Write end of the ready pipe: the parent writes the "supervisor ready"
    /// signal for the child.
    pub ready_w: OwnedFd,
}
36
37impl PipePair {
38    /// Create two pipe pairs using `pipe2(O_CLOEXEC)`.
39    pub fn new() -> io::Result<Self> {
40        let mut notif_fds = [0i32; 2];
41        let mut ready_fds = [0i32; 2];
42
43        // SAFETY: pipe2 with valid pointers and O_CLOEXEC
44        let ret = unsafe { libc::pipe2(notif_fds.as_mut_ptr(), libc::O_CLOEXEC) };
45        if ret < 0 {
46            return Err(io::Error::last_os_error());
47        }
48
49        let ret = unsafe { libc::pipe2(ready_fds.as_mut_ptr(), libc::O_CLOEXEC) };
50        if ret < 0 {
51            // Close the first pair on failure
52            unsafe {
53                libc::close(notif_fds[0]);
54                libc::close(notif_fds[1]);
55            }
56            return Err(io::Error::last_os_error());
57        }
58
59        // SAFETY: pipe2 returned valid fds
60        Ok(PipePair {
61            notif_r: unsafe { OwnedFd::from_raw_fd(notif_fds[0]) },
62            notif_w: unsafe { OwnedFd::from_raw_fd(notif_fds[1]) },
63            ready_r: unsafe { OwnedFd::from_raw_fd(ready_fds[0]) },
64            ready_w: unsafe { OwnedFd::from_raw_fd(ready_fds[1]) },
65        })
66    }
67}
68
69// ============================================================
70// Pipe I/O helpers
71// ============================================================
72
73/// Write a `u32` as 4 little-endian bytes to a raw fd.
74pub(crate) fn write_u32_fd(fd: RawFd, val: u32) -> io::Result<()> {
75    let buf = val.to_le_bytes();
76    let mut written = 0usize;
77    while written < 4 {
78        let ret = unsafe {
79            libc::write(
80                fd,
81                buf[written..].as_ptr() as *const libc::c_void,
82                4 - written,
83            )
84        };
85        if ret < 0 {
86            return Err(io::Error::last_os_error());
87        }
88        written += ret as usize;
89    }
90    Ok(())
91}
92
93/// Read a `u32` (4 little-endian bytes, blocking) from a raw fd.
94pub(crate) fn read_u32_fd(fd: RawFd) -> io::Result<u32> {
95    let mut buf = [0u8; 4];
96    let mut total = 0usize;
97    while total < 4 {
98        let ret = unsafe {
99            libc::read(
100                fd,
101                buf[total..].as_mut_ptr() as *mut libc::c_void,
102                4 - total,
103            )
104        };
105        if ret < 0 {
106            return Err(io::Error::last_os_error());
107        }
108        if ret == 0 {
109            return Err(io::Error::new(
110                io::ErrorKind::UnexpectedEof,
111                "pipe closed before 4 bytes read",
112            ));
113        }
114        total += ret as usize;
115    }
116    Ok(u32::from_le_bytes(buf))
117}
118
119// ============================================================
120// Syscall name → number mapping
121// ============================================================
122
123/// Map a syscall name to its `libc::SYS_*` number.
124///
125/// Covers all names in `DEFAULT_DENY_SYSCALLS` plus extras needed for
126/// notif and arg-filter lists.
127pub fn syscall_name_to_nr(name: &str) -> Option<u32> {
128    let nr: i64 = match name {
129        "mount" => libc::SYS_mount,
130        "umount2" => libc::SYS_umount2,
131        "pivot_root" => libc::SYS_pivot_root,
132        "swapon" => libc::SYS_swapon,
133        "swapoff" => libc::SYS_swapoff,
134        "reboot" => libc::SYS_reboot,
135        "sethostname" => libc::SYS_sethostname,
136        "setdomainname" => libc::SYS_setdomainname,
137        "kexec_load" => libc::SYS_kexec_load,
138        "init_module" => libc::SYS_init_module,
139        "finit_module" => libc::SYS_finit_module,
140        "delete_module" => libc::SYS_delete_module,
141        "unshare" => libc::SYS_unshare,
142        "setns" => libc::SYS_setns,
143        "perf_event_open" => libc::SYS_perf_event_open,
144        "bpf" => libc::SYS_bpf,
145        "userfaultfd" => libc::SYS_userfaultfd,
146        "keyctl" => libc::SYS_keyctl,
147        "add_key" => libc::SYS_add_key,
148        "request_key" => libc::SYS_request_key,
149        "ptrace" => libc::SYS_ptrace,
150        "process_vm_readv" => libc::SYS_process_vm_readv,
151        "process_vm_writev" => libc::SYS_process_vm_writev,
152        "open_by_handle_at" => libc::SYS_open_by_handle_at,
153        "name_to_handle_at" => libc::SYS_name_to_handle_at,
154        "ioperm" => libc::SYS_ioperm,
155        "iopl" => libc::SYS_iopl,
156        "quotactl" => libc::SYS_quotactl,
157        "acct" => libc::SYS_acct,
158        "lookup_dcookie" => libc::SYS_lookup_dcookie,
159        // nfsservctl was removed in Linux 3.1; no libc constant — skip
160        "personality" => libc::SYS_personality,
161        "io_uring_setup" => libc::SYS_io_uring_setup,
162        "io_uring_enter" => libc::SYS_io_uring_enter,
163        "io_uring_register" => libc::SYS_io_uring_register,
164        // Additional syscalls for notif/arg filters
165        "clone" => libc::SYS_clone,
166        "clone3" => libc::SYS_clone3,
167        "vfork" => libc::SYS_vfork,
168        "mmap" => libc::SYS_mmap,
169        "munmap" => libc::SYS_munmap,
170        "brk" => libc::SYS_brk,
171        "mremap" => libc::SYS_mremap,
172        "connect" => libc::SYS_connect,
173        "sendto" => libc::SYS_sendto,
174        "sendmsg" => libc::SYS_sendmsg,
175        "ioctl" => libc::SYS_ioctl,
176        "socket" => libc::SYS_socket,
177        "prctl" => libc::SYS_prctl,
178        "getrandom" => libc::SYS_getrandom,
179        "openat" => libc::SYS_openat,
180        "open" => libc::SYS_open,
181        "getdents64" => libc::SYS_getdents64,
182        "getdents" => libc::SYS_getdents,
183        "bind" => libc::SYS_bind,
184        "getsockname" => libc::SYS_getsockname,
185        "clock_gettime" => libc::SYS_clock_gettime,
186        "gettimeofday" => libc::SYS_gettimeofday,
187        "time" => libc::SYS_time,
188        "clock_nanosleep" => libc::SYS_clock_nanosleep,
189        "timerfd_settime" => libc::SYS_timerfd_settime,
190        "timer_settime" => libc::SYS_timer_settime,
191        "execve" => libc::SYS_execve,
192        "execveat" => libc::SYS_execveat,
193        // COW filesystem syscalls
194        "unlinkat" => libc::SYS_unlinkat,
195        "mkdirat" => libc::SYS_mkdirat,
196        "renameat2" => libc::SYS_renameat2,
197        "newfstatat" => libc::SYS_newfstatat,
198        "statx" => libc::SYS_statx,
199        "faccessat" => libc::SYS_faccessat,
200        "symlinkat" => libc::SYS_symlinkat,
201        "linkat" => libc::SYS_linkat,
202        "fchmodat" => libc::SYS_fchmodat,
203        "fchownat" => libc::SYS_fchownat,
204        "readlinkat" => libc::SYS_readlinkat,
205        "truncate" => libc::SYS_truncate,
206        "utimensat" => libc::SYS_utimensat,
207        "unlink" => libc::SYS_unlink,
208        "rmdir" => libc::SYS_rmdir,
209        "mkdir" => libc::SYS_mkdir,
210        "rename" => libc::SYS_rename,
211        "stat" => libc::SYS_stat,
212        "lstat" => libc::SYS_lstat,
213        "access" => libc::SYS_access,
214        "symlink" => libc::SYS_symlink,
215        "link" => libc::SYS_link,
216        "chmod" => libc::SYS_chmod,
217        "chown" => libc::SYS_chown,
218        "lchown" => libc::SYS_lchown,
219        "readlink" => libc::SYS_readlink,
220        "futimesat" => libc::SYS_futimesat,
221        "fork" => libc::SYS_fork,
222        _ => return None,
223    };
224    Some(nr as u32)
225}
226
227// ============================================================
228// Policy → syscall lists
229// ============================================================
230
231/// Determine which syscalls need `SECCOMP_RET_USER_NOTIF`.
232pub fn notif_syscalls(policy: &Policy) -> Vec<u32> {
233    let mut nrs = vec![
234        libc::SYS_clone as u32,
235        libc::SYS_clone3 as u32,
236        libc::SYS_vfork as u32,
237        libc::SYS_wait4 as u32,
238        libc::SYS_waitid as u32,
239    ];
240
241    if policy.max_memory.is_some() {
242        nrs.push(libc::SYS_mmap as u32);
243        nrs.push(libc::SYS_munmap as u32);
244        nrs.push(libc::SYS_brk as u32);
245        nrs.push(libc::SYS_mremap as u32);
246        nrs.push(libc::SYS_shmget as u32);
247    }
248
249    if !policy.net_allow_hosts.is_empty()
250        || policy.policy_fn.is_some()
251        || !policy.http_allow.is_empty()
252        || !policy.http_deny.is_empty()
253    {
254        nrs.push(libc::SYS_connect as u32);
255        nrs.push(libc::SYS_sendto as u32);
256        nrs.push(libc::SYS_sendmsg as u32);
257        nrs.push(libc::SYS_bind as u32);
258    }
259
260    if policy.random_seed.is_some() {
261        nrs.push(libc::SYS_getrandom as u32);
262        // Also intercept openat so the supervisor can re-patch vDSO after exec.
263        nrs.push(libc::SYS_openat as u32);
264    }
265
266    if policy.time_start.is_some() {
267        nrs.extend_from_slice(&[
268            libc::SYS_clock_nanosleep as u32,
269            libc::SYS_timerfd_settime as u32,
270            libc::SYS_timer_settime as u32,
271        ]);
272        // Also intercept openat so the supervisor gets a notification after exec
273        // and can re-patch the vDSO (exec replaces vDSO with a fresh copy).
274        nrs.push(libc::SYS_openat as u32);
275    }
276
277    // /proc virtualization (always on: PID filtering, sensitive path blocking)
278    nrs.push(libc::SYS_openat as u32);
279    nrs.extend_from_slice(&[
280        libc::SYS_getdents64 as u32,
281        libc::SYS_getdents as u32,
282    ]);
283    // Virtualize sched_getaffinity so nproc/sysconf agree with /proc/cpuinfo
284    if policy.num_cpus.is_some() {
285        nrs.push(libc::SYS_sched_getaffinity as u32);
286    }
287    if policy.hostname.is_some() {
288        nrs.push(libc::SYS_uname as u32);
289        nrs.push(libc::SYS_openat as u32);
290    }
291
292    // COW filesystem interception (seccomp-based, unprivileged)
293    if policy.workdir.is_some() && policy.fs_isolation == FsIsolation::None {
294        nrs.extend_from_slice(&[
295            libc::SYS_openat as u32,
296            libc::SYS_open as u32,
297            libc::SYS_unlinkat as u32,
298            libc::SYS_unlink as u32,
299            libc::SYS_rmdir as u32,
300            libc::SYS_mkdirat as u32,
301            libc::SYS_mkdir as u32,
302            libc::SYS_renameat2 as u32,
303            libc::SYS_rename as u32,
304            libc::SYS_symlinkat as u32,
305            libc::SYS_symlink as u32,
306            libc::SYS_linkat as u32,
307            libc::SYS_link as u32,
308            libc::SYS_fchmodat as u32,
309            libc::SYS_chmod as u32,
310            libc::SYS_fchownat as u32,
311            libc::SYS_chown as u32,
312            libc::SYS_lchown as u32,
313            libc::SYS_truncate as u32,
314            libc::SYS_utimensat as u32,
315            libc::SYS_newfstatat as u32,
316            libc::SYS_stat as u32,
317            libc::SYS_lstat as u32,
318            libc::SYS_statx as u32,
319            libc::SYS_faccessat as u32,
320            439u32,                       // SYS_faccessat2 — glibc 2.33+ uses this instead of faccessat
321            libc::SYS_access as u32,
322            libc::SYS_readlinkat as u32,
323            libc::SYS_readlink as u32,
324            libc::SYS_getdents64 as u32,
325            libc::SYS_getdents as u32,
326            libc::SYS_chdir as u32,
327        ]);
328    }
329
330    // Chroot path interception
331    if policy.chroot.is_some() {
332        nrs.extend_from_slice(&[
333            libc::SYS_openat as u32,
334            libc::SYS_open as u32,        // musl uses open(2) instead of openat
335            libc::SYS_execve as u32,
336            libc::SYS_execveat as u32,
337            libc::SYS_unlinkat as u32,
338            libc::SYS_mkdirat as u32,
339            libc::SYS_renameat2 as u32,
340            libc::SYS_symlinkat as u32,
341            libc::SYS_linkat as u32,
342            libc::SYS_fchmodat as u32,
343            libc::SYS_fchownat as u32,
344            libc::SYS_truncate as u32,
345            libc::SYS_newfstatat as u32,
346            libc::SYS_stat as u32,        // musl uses stat(2) instead of newfstatat
347            libc::SYS_lstat as u32,       // musl uses lstat(2) instead of newfstatat
348            libc::SYS_statx as u32,
349            libc::SYS_faccessat as u32,
350            439u32,                       // SYS_faccessat2 — glibc 2.33+ uses this instead of faccessat
351            libc::SYS_access as u32,      // musl uses access(2) instead of faccessat
352            libc::SYS_readlinkat as u32,
353            libc::SYS_readlink as u32,    // musl uses readlink(2) instead of readlinkat
354            libc::SYS_getdents64 as u32,
355            libc::SYS_getdents as u32,
356            libc::SYS_chdir as u32,
357            libc::SYS_getcwd as u32,
358            libc::SYS_statfs as u32,
359            libc::SYS_utimensat as u32,
360            libc::SYS_unlink as u32,      // musl uses unlink(2) instead of unlinkat
361            libc::SYS_rmdir as u32,       // musl uses rmdir(2) instead of unlinkat
362            libc::SYS_mkdir as u32,       // musl uses mkdir(2) instead of mkdirat
363            libc::SYS_rename as u32,      // musl uses rename(2) instead of renameat2
364            libc::SYS_symlink as u32,     // musl uses symlink(2) instead of symlinkat
365            libc::SYS_link as u32,        // musl uses link(2) instead of linkat
366            libc::SYS_chmod as u32,       // musl uses chmod(2) instead of fchmodat
367            libc::SYS_chown as u32,       // musl uses chown(2)/lchown(2) instead of fchownat
368            libc::SYS_lchown as u32,
369        ]);
370    }
371
372    // Explicit deny-paths need path-bearing syscalls intercepted.
373    if !policy.fs_denied.is_empty() {
374        nrs.extend_from_slice(&[
375            libc::SYS_openat as u32,
376            libc::SYS_open as u32,
377            libc::SYS_execve as u32,
378            libc::SYS_execveat as u32,
379            libc::SYS_linkat as u32,
380            libc::SYS_link as u32,
381            libc::SYS_renameat2 as u32,
382            libc::SYS_rename as u32,
383            libc::SYS_symlinkat as u32,
384            libc::SYS_symlink as u32,
385        ]);
386    }
387
388    // Dynamic policy callback — intercept key syscalls for event emission.
389    if policy.policy_fn.is_some() {
390        nrs.extend_from_slice(&[
391            libc::SYS_openat as u32,
392            libc::SYS_connect as u32,
393            libc::SYS_sendto as u32,
394            libc::SYS_bind as u32,
395            libc::SYS_execve as u32,
396            libc::SYS_execveat as u32,
397        ]);
398    }
399
400    // Port remapping
401    if policy.port_remap {
402        nrs.extend_from_slice(&[
403            libc::SYS_bind as u32,
404            libc::SYS_getsockname as u32,
405        ]);
406    }
407
408    nrs.sort_unstable();
409    nrs.dedup();
410    nrs
411}
412
413/// Resolve `NO_SUPERVISOR_DENY_SYSCALLS` names to numbers.
414pub fn no_supervisor_deny_syscall_numbers() -> Vec<u32> {
415    use crate::sys::structs::NO_SUPERVISOR_DENY_SYSCALLS;
416    NO_SUPERVISOR_DENY_SYSCALLS
417        .iter()
418        .filter_map(|n| syscall_name_to_nr(n))
419        .collect()
420}
421
422/// Resolve `deny_syscalls` names to numbers.
423///
424/// If both `deny_syscalls` and `allow_syscalls` are `None`, returns the
425/// numbers for `DEFAULT_DENY_SYSCALLS`.
426pub fn deny_syscall_numbers(policy: &Policy) -> Vec<u32> {
427    if let Some(ref names) = policy.deny_syscalls {
428        names
429            .iter()
430            .filter_map(|n| syscall_name_to_nr(n))
431            .collect()
432    } else if policy.allow_syscalls.is_none() {
433        DEFAULT_DENY_SYSCALLS
434            .iter()
435            .filter_map(|n| syscall_name_to_nr(n))
436            .collect()
437    } else {
438        // allow_syscalls is set — no deny list
439        Vec::new()
440    }
441}
442
443/// Build argument-level seccomp filter instructions matching the Python
444/// `_build_arg_filters()` exactly.
445///
446/// Returns a `Vec<SockFilter>` containing self-contained BPF blocks for:
447///   - clone: block namespace creation flags
448///   - ioctl: block TIOCSTI, TIOCLINUX
449///   - prctl: block PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER
450///   - socket: block all AF_NETLINK sockets (network topology enumeration)
451///   - socket: block SOCK_RAW/SOCK_DGRAM on AF_INET/AF_INET6 (with type mask)
452pub fn arg_filters(policy: &Policy) -> Vec<SockFilter> {
453    let ret_errno = SECCOMP_RET_ERRNO | EPERM as u32;
454    let nr_clone = libc::SYS_clone as u32;
455    let nr_ioctl = libc::SYS_ioctl as u32;
456    let nr_prctl = libc::SYS_prctl as u32;
457    let nr_socket = libc::SYS_socket as u32;
458
459    let mut insns: Vec<SockFilter> = Vec::new();
460
461    // --- clone: block namespace creation flags ---
462    // 5 instructions:
463    //   LD NR
464    //   JEQ clone → +0, skip 3
465    //   LD arg0
466    //   JSET NS_FLAGS → +0, skip 1
467    //   RET ERRNO
468    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
469    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_clone, 0, 3));
470    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
471    insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, CLONE_NS_FLAGS as u32, 0, 1));
472    insns.push(stmt(BPF_RET | BPF_K, ret_errno));
473
474    // --- ioctl: block dangerous commands (TIOCSTI, TIOCLINUX) ---
475    // Layout: LD NR, JEQ ioctl (skip 1 + N*2), LD arg1, [JEQ cmd, RET ERRNO] * N
476    let dangerous_ioctls: &[u32] = &[TIOCSTI as u32, TIOCLINUX as u32];
477    let n_ioctls = dangerous_ioctls.len();
478    let skip_count = (1 + n_ioctls * 2) as u8;
479    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
480    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_ioctl, 0, skip_count));
481    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
482    for &cmd in dangerous_ioctls {
483        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, cmd, 0, 1));
484        insns.push(stmt(BPF_RET | BPF_K, ret_errno));
485    }
486
487    // --- prctl: block dangerous options ---
488    // Layout: LD NR, JEQ prctl (skip 1 + N*2), LD arg0, [JEQ op, RET ERRNO] * N
489    let dangerous_prctl_ops: &[u32] = &[PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER];
490    let n_ops = dangerous_prctl_ops.len();
491    let skip_count = (1 + n_ops * 2) as u8;
492    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
493    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_prctl, 0, skip_count));
494    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
495    for &op in dangerous_prctl_ops {
496        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, op, 0, 1));
497        insns.push(stmt(BPF_RET | BPF_K, ret_errno));
498    }
499
500    // --- socket: block all AF_NETLINK sockets ---
501    // Netlink sockets allow network topology enumeration (interfaces, routes,
502    // ARP, etc.) which leaks host network configuration.  Block the entire
503    // AF_NETLINK family, not just NETLINK_SOCK_DIAG.
504    // 5 instructions:
505    //   LD NR
506    //   JEQ socket → +0, skip 3
507    //   LD arg0 (domain)
508    //   JEQ AF_NETLINK → +0, skip 1
509    //   RET ERRNO
510    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
511    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_socket, 0, 3));
512    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
513    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_NETLINK, 0, 1));
514    insns.push(stmt(BPF_RET | BPF_K, ret_errno));
515
516    // --- socket: block SOCK_RAW and/or SOCK_DGRAM on AF_INET/AF_INET6 ---
517    let mut blocked_types: Vec<u32> = Vec::new();
518    if policy.no_raw_sockets {
519        blocked_types.push(SOCK_RAW);
520    }
521    if policy.no_udp {
522        blocked_types.push(SOCK_DGRAM);
523    }
524
525    if !blocked_types.is_empty() {
526        let n = blocked_types.len();
527        // Instructions after domain checks: 2 (load+AND) + N (JEQs) + 1 (RET)
528        let after_domain = 2 + n + 1;
529        // Total after NR check: 3 (load domain + 2 JEQs) + after_domain
530        let skip_all = (3 + after_domain) as u8;
531
532        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
533        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_socket, 0, skip_all));
534        // Load domain (arg0)
535        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
536        // AF_INET → skip to type check (jump over AF_INET6 check)
537        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET, 1, 0));
538        // AF_INET6 → type check; else skip everything remaining
539        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET6, 0, after_domain as u8));
540        // Load type (arg1) and mask off SOCK_NONBLOCK|SOCK_CLOEXEC
541        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
542        insns.push(stmt(BPF_ALU | BPF_AND | BPF_K, SOCK_TYPE_MASK));
543        // Check each blocked type
544        for (i, &sock_type) in blocked_types.iter().enumerate() {
545            let remaining = n - i - 1;
546            // Match → jump to RET ERRNO (skip 'remaining' JEQs ahead)
547            // No match on last type → skip past RET ERRNO (jf=1)
548            // No match on non-last → check next type (jf=0)
549            let jf: u8 = if remaining == 0 { 1 } else { 0 };
550            insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, sock_type, remaining as u8, jf));
551        }
552        // Deny return (reached by any matching JEQ)
553        insns.push(stmt(BPF_RET | BPF_K, ret_errno));
554    }
555
556    // --- wait4: skip notification for WNOHANG/WNOWAIT (non-blocking) ---
557    // wait4(pid, status, options, rusage) — options is arg2
558    // 5 instructions:
559    //   LD NR
560    //   JEQ wait4 → +0, skip 3
561    //   LD arg2
562    //   JSET (WNOHANG|WNOWAIT) → +0, skip 1
563    //   RET ALLOW
564    {
565        let nr_wait4 = libc::SYS_wait4 as u32;
566        let wnohang_or_wnowait = (libc::WNOHANG | 0x0100_0000/* WNOWAIT */) as u32;
567        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
568        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_wait4, 0, 3));
569        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS2_LO));
570        insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, wnohang_or_wnowait, 0, 1));
571        insns.push(stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
572    }
573
574    // --- waitid: skip notification for WNOHANG/WNOWAIT (non-blocking) ---
575    // waitid(idtype, id, infop, options, rusage) — options is arg3
576    {
577        let nr_waitid = libc::SYS_waitid as u32;
578        let wnohang_or_wnowait = (libc::WNOHANG | 0x0100_0000/* WNOWAIT */) as u32;
579        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
580        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_waitid, 0, 3));
581        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS3_LO));
582        insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, wnohang_or_wnowait, 0, 1));
583        insns.push(stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
584    }
585
586    insns
587}
588
589// ============================================================
590// Close fds above threshold
591// ============================================================
592
593/// Close all file descriptors above `min_fd`, except those in `keep`.
594fn close_fds_above(min_fd: RawFd, keep: &[RawFd]) {
595    // Read /proc/self/fd to enumerate open fds.
596    // Collect all fd numbers first, then close them after dropping the directory
597    // iterator. This avoids closing the directory fd during iteration.
598    let fds_to_close: Vec<RawFd> = {
599        let dir = match std::fs::read_dir("/proc/self/fd") {
600            Ok(d) => d,
601            Err(_) => return,
602        };
603        dir.flatten()
604            .filter_map(|entry| {
605                entry.file_name().into_string().ok()
606                    .and_then(|name| name.parse::<RawFd>().ok())
607            })
608            .filter(|&fd| fd > min_fd && !keep.contains(&fd))
609            .collect()
610    };
611    // The directory is now closed; safe to close the collected fds.
612    for fd in fds_to_close {
613        unsafe { libc::close(fd) };
614    }
615}
616
617// ============================================================
618// COW filesystem config passed from parent to child
619// ============================================================
620
621// Re-export ChildMountConfig so callers can use the old import path.
622pub(crate) use crate::cow::ChildMountConfig;
623
/// Install single-entry uid/gid maps for an unprivileged user namespace.
///
/// `real_uid`/`real_gid` must be captured *before* unshare(CLONE_NEWUSER),
/// since getuid()/getgid() return the overflow id (65534) after unshare.
/// `target_uid`/`target_gid` are the ids visible inside the namespace.
///
/// Three files under `/proc/self` are written, in this order: `uid_map`,
/// `setgroups` (set to `deny`), `gid_map`.  Every write is best-effort: a
/// failure is ignored and the remaining files are still attempted.
// NOTE(review): the order presumably matters — an unprivileged gid_map write
// is expected to require setgroups=deny first; confirm in user_namespaces(7).
fn write_id_maps(real_uid: u32, real_gid: u32, target_uid: u32, target_gid: u32) {
    let uid_line = format!("{} {} 1\n", target_uid, real_uid);
    let gid_line = format!("{} {} 1\n", target_gid, real_gid);

    let steps: [(&str, &str); 3] = [
        ("/proc/self/uid_map", uid_line.as_str()),
        ("/proc/self/setgroups", "deny\n"),
        ("/proc/self/gid_map", gid_line.as_str()),
    ];
    for (path, contents) in steps {
        let _ = std::fs::write(path, contents);
    }
}
633
634/// Write uid/gid maps using the post-unshare overflow uid (65534).
635/// Used by the OverlayFS COW path which maps to root (UID 0) inside.
636fn write_id_maps_overflow() {
637    let uid = unsafe { libc::getuid() };
638    let gid = unsafe { libc::getgid() };
639    write_id_maps(uid, gid, 0, 0);
640}
641
642// ============================================================
643// Child-side confinement (never returns)
644// ============================================================
645
646/// Apply irreversible confinement (Landlock + seccomp) then exec the command.
647///
648/// This function **never returns**: it calls `execvp` on success or
649/// `_exit(127)` on any error.
650pub(crate) fn confine_child(policy: &Policy, cmd: &[CString], pipes: &PipePair, cow_config: Option<&ChildMountConfig>, nested: bool) -> ! {
651    // Helper: abort child on error. Includes the OS error automatically.
652    macro_rules! fail {
653        ($msg:expr) => {{
654            let err = std::io::Error::last_os_error();
655            let _ = write!(std::io::stderr(), "sandlock child: {}: {}\n", $msg, err);
656            unsafe { libc::_exit(127) };
657        }};
658    }
659
660    use std::io::Write;
661
662    // 1. New process group
663    if unsafe { libc::setpgid(0, 0) } != 0 {
664        fail!("setpgid");
665    }
666
667    // 1b. If stdin is a terminal, become the foreground process group
668    //     so interactive shells can read from the TTY.
669    //     Must ignore SIGTTOU first — a background pgrp calling tcsetpgrp
670    //     gets stopped by SIGTTOU otherwise.
671    if unsafe { libc::isatty(0) } == 1 {
672        unsafe {
673            libc::signal(libc::SIGTTOU, libc::SIG_IGN);
674            libc::tcsetpgrp(0, libc::getpgrp());
675            libc::signal(libc::SIGTTOU, libc::SIG_DFL);
676        }
677    }
678
679    // 2. Die if parent exits
680    if unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) } != 0 {
681        fail!("prctl(PR_SET_PDEATHSIG)");
682    }
683
684    // 3. Check parent didn't die between fork and prctl
685    if unsafe { libc::getppid() } == 1 {
686        fail!("parent died before confinement");
687    }
688
689    // 4. Optional: disable ASLR
690    if policy.no_randomize_memory {
691        const ADDR_NO_RANDOMIZE: libc::c_ulong = 0x0040000;
692        // Read current personality first (0xffffffff = query), then OR in the flag.
693        let current = unsafe { libc::personality(0xffffffff) };
694        if current == -1 {
695            fail!("personality(query)");
696        }
697        if unsafe { libc::personality(current as libc::c_ulong | ADDR_NO_RANDOMIZE) } == -1 {
698            fail!("personality(ADDR_NO_RANDOMIZE)");
699        }
700    }
701
702    // 4b. Optional: CPU core binding
703    if let Some(ref cores) = policy.cpu_cores {
704        if !cores.is_empty() {
705            let mut set = unsafe { std::mem::zeroed::<libc::cpu_set_t>() };
706            unsafe { libc::CPU_ZERO(&mut set) };
707            for &core in cores {
708                unsafe { libc::CPU_SET(core as usize, &mut set) };
709            }
710            if unsafe {
711                libc::sched_setaffinity(
712                    0,
713                    std::mem::size_of::<libc::cpu_set_t>(),
714                    &set,
715                )
716            } != 0
717            {
718                fail!("sched_setaffinity");
719            }
720        }
721    }
722
723    // 5. Optional: disable THP
724    if policy.no_huge_pages {
725        if unsafe { libc::prctl(libc::PR_SET_THP_DISABLE, 1, 0, 0, 0) } != 0 {
726            fail!("prctl(PR_SET_THP_DISABLE)");
727        }
728    }
729
730    // 5c. Optional: disable core dumps
731    if policy.no_coredump {
732        // Set RLIMIT_CORE to 0 — the kernel will not write a core file.
733        // We intentionally do NOT call prctl(PR_SET_DUMPABLE, 0) because
734        // that would break pidfd_getfd which the supervisor needs.
735        // The seccomp filter already blocks the child from calling
736        // prctl(PR_SET_DUMPABLE, ...) so it can't re-enable it.
737        let rlim = libc::rlimit { rlim_cur: 0, rlim_max: 0 };
738        if unsafe { libc::setrlimit(libc::RLIMIT_CORE, &rlim) } != 0 {
739            fail!("setrlimit(RLIMIT_CORE, 0)");
740        }
741    }
742
743    // Capture real uid/gid before any unshare (after unshare they become 65534)
744    let real_uid = unsafe { libc::getuid() };
745    let real_gid = unsafe { libc::getgid() };
746
747    // 5b. User namespace for --uid mapping (when not using OverlayFS COW,
748    //     which sets up its own user namespace)
749    if let Some(target_uid) = policy.uid {
750        if cow_config.is_none() {
751            if unsafe { libc::unshare(libc::CLONE_NEWUSER) } != 0 {
752                fail!("unshare(CLONE_NEWUSER)");
753            }
754            write_id_maps(real_uid, real_gid, target_uid, target_uid);
755        }
756    }
757
758    // 5c. User + mount namespace for OverlayFS COW (includes CLONE_NEWUSER)
759    if let Some(ref cow) = cow_config {
760        // unshare user + mount namespaces (unprivileged)
761        if unsafe { libc::unshare(libc::CLONE_NEWUSER | libc::CLONE_NEWNS) } != 0 {
762            fail!("unshare(CLONE_NEWUSER | CLONE_NEWNS)");
763        }
764
765        // Write uid/gid maps using overflow uid (preserves existing COW behavior)
766        write_id_maps_overflow();
767
768        // Mount the overlay filesystem ON TOP of the workdir so the child
769        // sees the merged view at the original path.  The kernel resolves
770        // lowerdir before the covering mount takes effect, so using the
771        // same path as both lowerdir and mount-point is safe inside our
772        // private mount namespace.
773        let lowerdir = cow.lowers.iter()
774            .map(|p| p.display().to_string())
775            .collect::<Vec<_>>()
776            .join(":");
777        let opts = format!(
778            "lowerdir={},upperdir={},workdir={}",
779            lowerdir,
780            cow.upper.display(),
781            cow.work.display(),
782        );
783
784        let mount_cstr = match CString::new(cow.mount_point.to_str().unwrap_or("")) {
785            Ok(c) => c,
786            Err(_) => fail!("invalid overlay mount point path"),
787        };
788        let overlay_cstr = CString::new("overlay").unwrap();
789        let opts_cstr = match CString::new(opts) {
790            Ok(c) => c,
791            Err(_) => fail!("invalid overlay opts"),
792        };
793
794        let ret = unsafe {
795            libc::mount(
796                overlay_cstr.as_ptr(),
797                mount_cstr.as_ptr(),
798                overlay_cstr.as_ptr(),
799                0,
800                opts_cstr.as_ptr() as *const libc::c_void,
801            )
802        };
803        if ret != 0 {
804            fail!("mount overlay");
805        }
806    }
807
808    // 6. Optional: change working directory
809    // cwd controls where the child starts; workdir is only for COW
810    let effective_cwd = if let Some(ref cwd) = policy.cwd {
811        if let Some(ref chroot_root) = policy.chroot {
812            Some(chroot_root.join(cwd.strip_prefix("/").unwrap_or(cwd)))
813        } else {
814            Some(cwd.clone())
815        }
816    } else if let Some(ref chroot_root) = policy.chroot {
817        // Default to chroot root
818        Some(chroot_root.to_path_buf())
819    } else if let Some(ref workdir) = policy.workdir {
820        // Default to workdir when set (COW working directory)
821        Some(workdir.clone())
822    } else {
823        None
824    };
825
826    if let Some(ref cwd) = effective_cwd {
827        let c_path = match CString::new(cwd.as_os_str().as_encoded_bytes()) {
828            Ok(c) => c,
829            Err(_) => fail!("invalid cwd path"),
830        };
831        if unsafe { libc::chdir(c_path.as_ptr()) } != 0 {
832            fail!("chdir");
833        }
834    }
835
836    // 7. Set NO_NEW_PRIVS (required for both Landlock and seccomp without CAP_SYS_ADMIN)
837    if unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) } != 0 {
838        fail!("prctl(PR_SET_NO_NEW_PRIVS)");
839    }
840
841    // 8. Apply Landlock confinement (IRREVERSIBLE)
842    if let Err(e) = crate::landlock::confine(policy) {
843        fail!(format!("landlock: {}", e));
844    }
845
846    // 9. Assemble and install seccomp filter (IRREVERSIBLE)
847    let deny = deny_syscall_numbers(policy);
848    let args = arg_filters(policy);
849    let mut keep_fd: i32 = -1;
850
851    if nested {
852        // Nested sandbox: deny-only filter (no supervisor — parent handles it).
853        // BPF filters are ANDed by the kernel, so each level can only tighten.
854        let filter = bpf::assemble_filter(&[], &deny, &args);
855        if let Err(e) = bpf::install_deny_filter(&filter) {
856            fail!(format!("seccomp deny filter: {}", e));
857        }
858        // Signal nested mode to parent (fd=0 means no supervisor needed)
859        if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), 0) {
860            fail!(format!("write nested signal: {}", e));
861        }
862    } else {
863        // First-level sandbox: notif + deny filter with NEW_LISTENER.
864        let notif = notif_syscalls(policy);
865        let filter = bpf::assemble_filter(&notif, &deny, &args);
866        let notif_fd = match bpf::install_filter(&filter) {
867            Ok(fd) => fd,
868            Err(e) => fail!(format!("seccomp install: {}", e)),
869        };
870        keep_fd = notif_fd.as_raw_fd();
871        if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), keep_fd as u32) {
872            fail!(format!("write notif fd: {}", e));
873        }
874        std::mem::forget(notif_fd);
875    }
876
877    // Mark this process as confined for in-process nesting detection
878    crate::sandbox::CONFINED.store(true, std::sync::atomic::Ordering::Relaxed);
879
880    // 10. Wait for parent to signal ready
881    match read_u32_fd(pipes.ready_r.as_raw_fd()) {
882        Ok(_) => {}
883        Err(e) => fail!(format!("read ready signal: {}", e)),
884    }
885
886    // 12. Close all fds above stderr (always on for isolation)
887    if keep_fd >= 0 {
888        close_fds_above(2, &[keep_fd]);
889    } else {
890        close_fds_above(2, &[]);
891    }
892
893    // 13. Apply environment
894    if policy.clean_env {
895        // Clear all env vars first
896        for (key, _) in std::env::vars_os() {
897            std::env::remove_var(&key);
898        }
899    }
900    for (key, value) in &policy.env {
901        std::env::set_var(key, value);
902    }
903
904    // 13b. GPU device visibility
905    if let Some(ref devices) = policy.gpu_devices {
906        if !devices.is_empty() {
907            let vis = devices.iter().map(|d| d.to_string()).collect::<Vec<_>>().join(",");
908            std::env::set_var("CUDA_VISIBLE_DEVICES", &vis);
909            std::env::set_var("ROCR_VISIBLE_DEVICES", &vis);
910        }
911        // Empty list = all GPUs visible, don't set env vars
912    }
913
914    // 14. exec
915    debug_assert!(!cmd.is_empty(), "cmd must not be empty");
916    let argv_ptrs: Vec<*const libc::c_char> = cmd
917        .iter()
918        .map(|s| s.as_ptr())
919        .chain(std::iter::once(std::ptr::null()))
920        .collect();
921
922    if policy.chroot.is_some() {
923        // With chroot the seccomp handler rewrites the filename to a host path
924        // (or /proc/self/fd/N).  Pass a separate PATH_MAX buffer as the `file`
925        // argument so the rewrite does not corrupt argv[0] — which must stay as
926        // the original command name (e.g. busybox uses argv[0] for applet
927        // detection).  execvp still handles PATH lookup for bare command names.
928        let mut exec_path = vec![0u8; libc::PATH_MAX as usize];
929        let orig = cmd[0].as_bytes_with_nul();
930        exec_path[..orig.len()].copy_from_slice(orig);
931
932        unsafe {
933            libc::execvp(
934                exec_path.as_ptr() as *const libc::c_char,
935                argv_ptrs.as_ptr(),
936            )
937        };
938    } else {
939        unsafe { libc::execvp(argv_ptrs[0], argv_ptrs.as_ptr()) };
940    }
941
942    // If we get here, exec failed
943    fail!(format!("execvp '{}'", cmd[0].to_string_lossy()));
944}
945
946// ============================================================
947// Tests
948// ============================================================
949
#[cfg(test)]
mod tests {
    use super::*;

    /// Opens a fresh [`PipePair`], panicking if pipe creation fails.
    fn open_pipes() -> PipePair {
        PipePair::new().expect("pipe creation failed")
    }

    /// Writes `value` into the notif pipe and returns what is read back
    /// from the other end.
    fn roundtrip_u32(value: u32) -> u32 {
        let pipes = open_pipes();
        write_u32_fd(pipes.notif_w.as_raw_fd(), value).expect("write failed");
        read_u32_fd(pipes.notif_r.as_raw_fd()).expect("read failed")
    }

    /// A new pipe pair exposes four valid, pairwise-distinct descriptors.
    #[test]
    fn test_pipe_pair_creation() {
        let pipes = open_pipes();
        let fds = [
            pipes.notif_r.as_raw_fd(),
            pipes.notif_w.as_raw_fd(),
            pipes.ready_r.as_raw_fd(),
            pipes.ready_w.as_raw_fd(),
        ];

        // Every descriptor must be valid (non-negative).
        for fd in fds {
            assert!(fd >= 0);
        }

        // No two descriptors may alias: compare each one against all the
        // descriptors that follow it.
        for (idx, first) in fds.iter().enumerate() {
            for second in &fds[idx + 1..] {
                assert_ne!(first, second);
            }
        }
    }

    /// A small value survives a write/read round trip through a pipe.
    #[test]
    fn test_write_read_u32() {
        let expected = 42u32;
        assert_eq!(roundtrip_u32(expected), expected);
    }

    /// A value with the high bit set survives the round trip unchanged.
    #[test]
    fn test_write_read_u32_large() {
        let expected = 0xDEAD_BEEFu32;
        assert_eq!(roundtrip_u32(expected), expected);
    }

    /// The process-creation syscalls are in the notif list even for a
    /// default policy.
    #[test]
    fn test_notif_syscalls_always_has_clone() {
        let policy = Policy::builder().build().unwrap();
        let nrs = notif_syscalls(&policy);
        for nr in [libc::SYS_clone, libc::SYS_clone3, libc::SYS_vfork] {
            assert!(nrs.contains(&(nr as u32)));
        }
    }

    /// Setting a memory limit adds the memory-management syscalls to the
    /// notif list.
    #[test]
    fn test_notif_syscalls_memory() {
        let policy = Policy::builder()
            .max_memory(crate::policy::ByteSize::mib(256))
            .build()
            .unwrap();
        let nrs = notif_syscalls(&policy);
        for nr in [
            libc::SYS_mmap,
            libc::SYS_munmap,
            libc::SYS_brk,
            libc::SYS_mremap,
            libc::SYS_shmget,
        ] {
            assert!(nrs.contains(&(nr as u32)));
        }
    }

    /// Allowing a network host adds the connect/send syscalls to the notif
    /// list.
    #[test]
    fn test_notif_syscalls_net() {
        let policy = Policy::builder()
            .net_allow_host("example.com")
            .build()
            .unwrap();
        let nrs = notif_syscalls(&policy);
        for nr in [libc::SYS_connect, libc::SYS_sendto, libc::SYS_sendmsg] {
            assert!(nrs.contains(&(nr as u32)));
        }
    }

    /// SYS_faccessat2 (439) must be in the notification filter for both
    /// chroot and COW modes — glibc 2.33+ uses it instead of faccessat.
    #[test]
    fn test_notif_syscalls_faccessat2() {
        const SYS_FACCESSAT2: u32 = 439;

        // Shared check: both the legacy and the "2" variant must be present.
        let expect_both = |policy: &Policy, missing_msg: &str| {
            let nrs = notif_syscalls(policy);
            assert!(nrs.contains(&(libc::SYS_faccessat as u32)));
            assert!(nrs.contains(&SYS_FACCESSAT2), "{}", missing_msg);
        };

        // Chroot mode
        let chroot_policy = Policy::builder().chroot("/tmp").build().unwrap();
        expect_both(
            &chroot_policy,
            "chroot notif filter must include SYS_faccessat2 (439)",
        );

        // COW mode
        let cow_policy = Policy::builder().workdir("/tmp").build().unwrap();
        expect_both(
            &cow_policy,
            "COW notif filter must include SYS_faccessat2 (439)",
        );
    }

    /// The default deny list contains mount, ptrace and bpf.
    #[test]
    fn test_deny_syscall_numbers_default() {
        let policy = Policy::builder().build().unwrap();
        let nrs = deny_syscall_numbers(&policy);
        // Should contain mount, ptrace, etc.
        for nr in [libc::SYS_mount, libc::SYS_ptrace, libc::SYS_bpf] {
            assert!(nrs.contains(&(nr as u32)));
        }
        // nfsservctl has no libc constant, so it is skipped
        assert!(!nrs.is_empty());
    }

    /// A custom deny list resolves to exactly the syscalls that were named.
    #[test]
    fn test_deny_syscall_numbers_custom() {
        let policy = Policy::builder()
            .deny_syscalls(vec!["mount".into(), "ptrace".into()])
            .build()
            .unwrap();
        let nrs = deny_syscall_numbers(&policy);
        assert_eq!(nrs.len(), 2);
        for nr in [libc::SYS_mount, libc::SYS_ptrace] {
            assert!(nrs.contains(&(nr as u32)));
        }
    }

    /// When an allow list is configured, the deny list is empty.
    #[test]
    fn test_deny_syscall_numbers_empty_when_allow_set() {
        let policy = Policy::builder()
            .allow_syscalls(vec!["read".into(), "write".into()])
            .build()
            .unwrap();
        assert!(deny_syscall_numbers(&policy).is_empty());
    }

    /// The default argument filters include checks for clone, ioctl, prctl
    /// and netlink sockets.
    #[test]
    fn test_arg_filters_has_clone_ioctl_prctl_socket() {
        use crate::sys::structs::{
            BPF_JEQ, BPF_JSET, BPF_JMP, BPF_K,
        };
        let policy = Policy::builder().build().unwrap();
        let filters = arg_filters(&policy);

        let jeq = BPF_JMP | BPF_JEQ | BPF_K;
        let jset = BPF_JMP | BPF_JSET | BPF_K;
        // True when some instruction is a JEQ / JSET against the constant `k`.
        let has_jeq = |k: u32| filters.iter().any(|insn| insn.code == jeq && insn.k == k);
        let has_jset = |k: u32| filters.iter().any(|insn| insn.code == jset && insn.k == k);

        // Should contain JEQ for clone syscall nr
        assert!(has_jeq(libc::SYS_clone as u32));
        // Should contain JSET for CLONE_NS_FLAGS
        assert!(has_jset(CLONE_NS_FLAGS as u32));
        // Should contain JEQ for ioctl syscall nr
        assert!(has_jeq(libc::SYS_ioctl as u32));
        // Should contain JEQ for TIOCSTI and TIOCLINUX
        assert!(has_jeq(TIOCSTI as u32));
        assert!(has_jeq(TIOCLINUX as u32));
        // Should contain JEQ for prctl syscall nr
        assert!(has_jeq(libc::SYS_prctl as u32));
        // Should contain JEQ for PR_SET_DUMPABLE
        assert!(has_jeq(PR_SET_DUMPABLE));
        // Should contain JEQ for socket + AF_NETLINK (all netlink blocked)
        assert!(has_jeq(AF_NETLINK));
    }

    /// With `no_raw_sockets`, the filters test the address family and the
    /// masked socket type against SOCK_RAW.
    #[test]
    fn test_arg_filters_raw_sockets() {
        use crate::sys::structs::{BPF_ALU, BPF_AND, BPF_JEQ, BPF_JMP, BPF_K};
        let policy = Policy::builder().no_raw_sockets(true).build().unwrap();
        let filters = arg_filters(&policy);

        let jeq = BPF_JMP | BPF_JEQ | BPF_K;
        let alu_and = BPF_ALU | BPF_AND | BPF_K;
        let has_jeq = |k: u32| filters.iter().any(|insn| insn.code == jeq && insn.k == k);
        let has_and = |k: u32| filters.iter().any(|insn| insn.code == alu_and && insn.k == k);

        // Should have AF_INET check
        assert!(has_jeq(AF_INET));
        // Should have AF_INET6 check
        assert!(has_jeq(AF_INET6));
        // Should have ALU AND SOCK_TYPE_MASK
        assert!(has_and(SOCK_TYPE_MASK));
        // Should have JEQ SOCK_RAW
        assert!(has_jeq(SOCK_RAW));
    }

    /// With `no_udp`, the filters compare the socket type against SOCK_DGRAM.
    #[test]
    fn test_arg_filters_no_udp() {
        use crate::sys::structs::{BPF_JEQ, BPF_JMP, BPF_K};
        let policy = Policy::builder().no_udp(true).build().unwrap();
        let filters = arg_filters(&policy);

        let jeq = BPF_JMP | BPF_JEQ | BPF_K;
        // Should have JEQ SOCK_DGRAM
        assert!(filters
            .iter()
            .any(|insn| insn.code == jeq && insn.k == SOCK_DGRAM));
    }

    /// Every name in DEFAULT_DENY_SYSCALLS except nfsservctl should resolve
    /// to a syscall number.
    #[test]
    fn test_syscall_name_to_nr_covers_defaults() {
        let mut unresolved = 0;
        for name in DEFAULT_DENY_SYSCALLS {
            if syscall_name_to_nr(name).is_some() {
                continue;
            }
            // The only name allowed to be unresolved is nfsservctl.
            assert_eq!(*name, "nfsservctl", "unexpected unresolved syscall: {}", name);
            unresolved += 1;
        }
        assert_eq!(unresolved, 1); // only nfsservctl
    }
}