Skip to main content

evalbox_sys/
seccomp.rs

1//! Seccomp-BPF syscall filtering.
2//!
3//! Seccomp-BPF allows filtering syscalls using Berkeley Packet Filter (BPF) programs.
4//! This provides a second layer of defense after Landlock - even if a path is accessible,
5//! dangerous syscalls are blocked.
6//!
7//! ## Filter Structure
8//!
9//! The BPF filter runs on every syscall:
10//!
11//! 1. Verify architecture is `x86_64` (kill otherwise)
12//! 2. Load syscall number from `seccomp_data`
//! 3. Reject `clone3` with `ENOSYS` (its flags live in a struct BPF cannot read)
//! 4. For `clone`, inspect flags and block namespace creation
//! 5. For `socket`, inspect domain and type and block `AF_NETLINK` and `SOCK_RAW`
//! 6. For `ioctl`, inspect the command and block `TIOCSTI`, `TIOCSETD`, `TIOCLINUX`
//! 7. Compare other syscalls against whitelist
//! 8. Allow if match found, kill process otherwise
18//!
19//! ## Clone Flag Filtering
20//!
//! The `clone` syscall is allowed unless any of these namespace-creating flags is set:
22//!
23//! - `CLONE_NEWUSER` - User namespace (kernel attack surface)
24//! - `CLONE_NEWNET` - Network namespace (`nf_tables` access)
25//! - `CLONE_NEWNS` - Mount namespace
26//! - `CLONE_NEWPID` - PID namespace
27//! - `CLONE_NEWIPC` - IPC namespace
28//! - `CLONE_NEWUTS` - UTS namespace
29//! - `CLONE_NEWCGROUP` - Cgroup namespace
30//!
//! The `clone3` syscall always fails with `ENOSYS` (so libc can fall back to `clone`)
//! because its flags are passed via a userspace struct pointer that BPF cannot dereference.
33//!
34//! ## Socket Filtering
35//!
36//! The `socket` syscall is filtered to block:
37//!
38//! - `AF_NETLINK` (16) - Access to kernel netlink interfaces (`nf_tables`, CVE-2024-1086)
39//! - `SOCK_RAW` (3) - Raw packet access (can craft arbitrary packets)
40//!
41//! Allowed socket types:
42//! - `AF_UNIX` (1) - Local IPC (Python multiprocessing, etc.)
43//! - `AF_INET`/`AF_INET6` (2, 10) - Network (Landlock controls actual access)
44//!
45//! ## Removed Dangerous Syscalls
46//!
47//! - `memfd_create` + `execveat` - Enables fileless execution (bypass Landlock)
48//! - `setresuid`/`setresgid` - No reason to change UID in sandbox
49//! - `setsid`/`setpgid` - Session manipulation, unnecessary
//! - `ioctl` - Not removed: allowed with argument filtering (TIOCSTI, TIOCSETD, TIOCLINUX blocked)
51//!
52//! ## Security Notes
53//!
54//! - Filter is permanent - cannot be removed once applied
55//! - Requires `PR_SET_NO_NEW_PRIVS` first
//! - Blocked syscall = immediate process termination (SIGSYS), except `clone3`, which fails with `ENOSYS`
//! - `kill`/`tgkill` are safe due to Landlock `SCOPE_SIGNAL` isolation (Landlock ABI v6)
//! - `prctl` allowed; `PR_SET_SECCOMP` can only stack further restrictions and cannot remove this filter
59
60use rustix::io::Errno;
61
62use crate::last_errno;
63
// Seccomp operation code and filter return actions.
const SECCOMP_SET_MODE_FILTER: u32 = 1;
const SECCOMP_RET_KILL_PROCESS: u32 = 0x8000_0000;
const SECCOMP_RET_USER_NOTIF: u32 = 0x7fc0_0000;
const SECCOMP_RET_ALLOW: u32 = 0x7fff_0000;
// Errno action carrying ENOSYS (38), so callers can fall back gracefully
// instead of being killed.
const SECCOMP_RET_ERRNO_ENOSYS: u32 = 0x0005_0000 | 38;

// BPF instruction classes.
const BPF_LD: u16 = 0x00;
const BPF_JMP: u16 = 0x05;
const BPF_RET: u16 = 0x06;

// BPF ld/ldx fields: operand width and addressing mode.
const BPF_W: u16 = 0x00;
const BPF_ABS: u16 = 0x20;

// BPF alu/jmp fields: comparison operator and operand source.
const BPF_JEQ: u16 = 0x10;
const BPF_JSET: u16 = 0x40;
const BPF_K: u16 = 0x00;

// Architecture tag the filters require before looking at the syscall number.
const AUDIT_ARCH_X86_64: u32 = 0xc000_003e;

// Byte offsets into `seccomp_data` (x86_64).
const OFFSET_SYSCALL_NR: u32 = 0;
const OFFSET_ARCH: u32 = 4;
const OFFSET_ARGS_0: u32 = 16; // args[0], lower 32 bits
const OFFSET_ARGS_1: u32 = 24; // args[1], lower 32 bits

// Clone flags that create new namespaces - blocked to prevent sandbox escape.
const CLONE_NEWNS: u32 = 0x0002_0000;
const CLONE_NEWCGROUP: u32 = 0x0200_0000;
const CLONE_NEWUTS: u32 = 0x0400_0000;
const CLONE_NEWIPC: u32 = 0x0800_0000;
const CLONE_NEWUSER: u32 = 0x1000_0000;
const CLONE_NEWPID: u32 = 0x2000_0000;
const CLONE_NEWNET: u32 = 0x4000_0000;

/// Union of every namespace-creating clone flag; a `clone` call with any of
/// these bits set is denied.
const BLOCKED_CLONE_FLAGS: u32 = CLONE_NEWNS
    | CLONE_NEWCGROUP
    | CLONE_NEWUTS
    | CLONE_NEWIPC
    | CLONE_NEWUSER
    | CLONE_NEWPID
    | CLONE_NEWNET;

// Socket arguments that are denied.
const AF_NETLINK: u32 = 16; // Kernel netlink (nf_tables, etc.) - BLOCKED
const SOCK_RAW: u32 = 3; // Raw sockets - BLOCKED

// Dangerous ioctl commands - BLOCKED
// See: https://madaidans-insecurities.github.io/guides/linux-hardening.html
// TIOCSTI: injects terminal input - sandbox escape vector.
const TIOCSTI: u32 = 0x5412;
// TIOCSETD: loads TTY line disciplines - multiple exploits (CVE-2017-2636, etc.).
const TIOCSETD: u32 = 0x5423;
// TIOCLINUX: Linux-specific terminal ops - can inject input on virtual consoles.
const TIOCLINUX: u32 = 0x541C;

/// Upper bound on the number of syscalls a filter builder accepts.
///
/// The builders cast forward-jump distances to `u8`, so the list has to stay
/// small enough that the longest jump still fits.
const MAX_WHITELIST_SIZE: usize = 200;
127
/// A single classic-BPF instruction.
///
/// `#[repr(C)]` keeps the field order and sizes fixed (`u16`, `u8`, `u8`,
/// `u32`) so an array of these can be handed to the kernel through
/// [`SockFprog`].
#[repr(C)]
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub struct SockFilter {
    /// Opcode: instruction class OR'ed with its size/mode/operator bits.
    pub code: u16,
    /// Number of instructions to skip when a conditional jump matches.
    pub jt: u8,
    /// Number of instructions to skip when a conditional jump does not match.
    pub jf: u8,
    /// Immediate operand (load offset, comparison value or return action).
    pub k: u32,
}

impl SockFilter {
    /// Creates a non-branching instruction (`jt` and `jf` are zero).
    #[inline]
    #[must_use]
    pub const fn stmt(code: u16, k: u32) -> Self {
        Self::jump(code, k, 0, 0)
    }

    /// Creates a conditional jump that skips `jt` instructions when the test
    /// succeeds and `jf` instructions when it fails.
    #[inline]
    #[must_use]
    pub const fn jump(code: u16, k: u32, jt: u8, jf: u8) -> Self {
        Self { code, jt, jf, k }
    }
}

/// A BPF program descriptor: instruction count plus a pointer to the first
/// instruction.
///
/// The struct stores a raw pointer and therefore does not keep the
/// instructions alive; the caller must ensure `filter` stays valid for as
/// long as the descriptor is in use.
#[repr(C)]
#[derive(Debug)]
pub struct SockFprog {
    /// Number of instructions reachable through `filter`.
    pub len: u16,
    /// Pointer to the first instruction of the program.
    pub filter: *const SockFilter,
}
160
161/// Syscalls allowed in the sandbox.
162///
163/// ## Special handling (not in this list):
164/// - `clone` - Allowed with flag filtering (blocks `CLONE_NEW`*)
165/// - `clone3` - Returns ENOSYS (glibc falls back to `clone`)
166/// - `socket` - Allowed with domain/type filtering (blocks `AF_NETLINK`, `SOCK_RAW`)
167/// - `ioctl` - Allowed with command filtering (blocks `TIOCSTI`, `TIOCSETD`, `TIOCLINUX`)
168///
169/// ## Removed for security:
170/// - `memfd_create` - With execveat enables fileless execution
171/// - `execveat` - Removed to prevent fileless execution
172/// - `setresuid`/`setresgid` - No need to change UID in sandbox
173/// - `setsid`/`setpgid` - Session manipulation unnecessary
174///
175/// ## Notes:
176/// - `kill`/`tgkill` safe due to Landlock v5 `SCOPE_SIGNAL` isolation
177/// - `prctl` kept for runtime needs (`PR_SET_NAME`, etc.)
178pub const DEFAULT_WHITELIST: &[i64] = &[
179    // === Basic I/O ===
180    libc::SYS_read,
181    libc::SYS_write,
182    libc::SYS_close,
183    libc::SYS_close_range, // Modern fd range closing
184    libc::SYS_fstat,
185    libc::SYS_lseek,
186    libc::SYS_pread64,
187    libc::SYS_pwrite64,
188    libc::SYS_readv,
189    libc::SYS_writev,
190    libc::SYS_preadv,
191    libc::SYS_pwritev,
192    libc::SYS_preadv2,
193    libc::SYS_pwritev2,
194    libc::SYS_dup,
195    libc::SYS_dup2,
196    libc::SYS_dup3,
197    libc::SYS_fcntl,
198    libc::SYS_flock,
199    libc::SYS_fsync,
200    libc::SYS_fdatasync,
201    libc::SYS_ftruncate,
202    libc::SYS_fadvise64,
203    libc::SYS_access,
204    libc::SYS_pipe,
205    libc::SYS_pipe2,
206    libc::SYS_select,
207    libc::SYS_poll,
208    libc::SYS_ppoll,
209    libc::SYS_pselect6,
210    // Efficient file operations (Python/Node use these)
211    libc::SYS_sendfile,
212    libc::SYS_copy_file_range,
213    libc::SYS_splice,
214    libc::SYS_tee,
215    // === Memory ===
216    libc::SYS_mmap,
217    libc::SYS_mprotect,
218    libc::SYS_munmap,
219    libc::SYS_brk,
220    libc::SYS_mremap,
221    libc::SYS_msync,
222    libc::SYS_mincore,
223    libc::SYS_madvise,
224    // memfd_create REMOVED - enables fileless execution with execveat
225    libc::SYS_membarrier,
226    libc::SYS_mlock,
227    libc::SYS_mlock2,
228    libc::SYS_munlock,
229    libc::SYS_mlockall,
230    libc::SYS_munlockall,
231    // === Process info (read-only) ===
232    libc::SYS_getpid,
233    libc::SYS_getppid,
234    libc::SYS_gettid, // Thread ID (used by Python, Go, etc.)
235    libc::SYS_getuid,
236    libc::SYS_getgid,
237    libc::SYS_geteuid,
238    libc::SYS_getegid,
239    libc::SYS_getresuid,
240    libc::SYS_getresgid,
241    // setresuid/setresgid REMOVED - no need to change UID in sandbox
242    libc::SYS_getpgrp,
243    // setpgid/setsid REMOVED - session manipulation unnecessary
244    libc::SYS_getgroups,
245    libc::SYS_getsid,
246    libc::SYS_uname,
247    libc::SYS_getrusage,
248    libc::SYS_times,
249    libc::SYS_sysinfo,
250    // === Time ===
251    libc::SYS_clock_gettime,
252    libc::SYS_clock_getres,
253    libc::SYS_clock_nanosleep,
254    libc::SYS_gettimeofday,
255    libc::SYS_nanosleep,
256    // === Filesystem (Landlock restricts actual access) ===
257    libc::SYS_openat,
258    libc::SYS_open,
259    libc::SYS_creat,
260    libc::SYS_unlink,
261    libc::SYS_unlinkat,
262    libc::SYS_rename,
263    libc::SYS_renameat,
264    libc::SYS_renameat2,
265    libc::SYS_mkdir,
266    libc::SYS_mkdirat,
267    libc::SYS_rmdir,
268    libc::SYS_symlink,
269    libc::SYS_symlinkat,
270    libc::SYS_link,
271    libc::SYS_linkat,
272    libc::SYS_chmod,
273    libc::SYS_fchmod,
274    libc::SYS_fchmodat,
275    libc::SYS_chown,
276    libc::SYS_fchown,
277    libc::SYS_fchownat,
278    libc::SYS_lchown,
279    libc::SYS_utimensat,
280    libc::SYS_faccessat,
281    libc::SYS_faccessat2,
282    libc::SYS_stat,
283    libc::SYS_lstat,
284    libc::SYS_newfstatat,
285    libc::SYS_statfs,
286    libc::SYS_fstatfs,
287    libc::SYS_statx,
288    libc::SYS_getdents,
289    libc::SYS_getdents64,
290    libc::SYS_getcwd,
291    libc::SYS_chdir,
292    libc::SYS_fchdir,
293    libc::SYS_readlink,
294    libc::SYS_readlinkat,
295    // === Signals (safe due to Landlock SCOPE_SIGNAL) ===
296    libc::SYS_rt_sigaction,
297    libc::SYS_rt_sigprocmask,
298    libc::SYS_rt_sigreturn,
299    libc::SYS_rt_sigsuspend,
300    libc::SYS_rt_sigpending,
301    libc::SYS_rt_sigtimedwait,
302    libc::SYS_sigaltstack,
303    libc::SYS_kill,   // Safe: Landlock SCOPE_SIGNAL isolates
304    libc::SYS_tgkill, // Safe: Landlock SCOPE_SIGNAL isolates
305    libc::SYS_tkill,  // Safe: Landlock SCOPE_SIGNAL isolates
306    // === Process control ===
307    libc::SYS_execve,
308    // execveat REMOVED - with memfd_create enables fileless execution
309    libc::SYS_fork,  // Safe: no flags
310    libc::SYS_vfork, // Safe: no flags
311    libc::SYS_exit,
312    libc::SYS_exit_group,
313    libc::SYS_wait4,
314    libc::SYS_waitid,
315    libc::SYS_set_tid_address,
316    libc::SYS_futex,
317    libc::SYS_get_robust_list,
318    libc::SYS_set_robust_list,
319    libc::SYS_sched_yield,
320    libc::SYS_sched_getaffinity, // Go runtime needs
321    libc::SYS_sched_setaffinity, // Go runtime needs
322    libc::SYS_sched_getparam,
323    libc::SYS_sched_setparam,
324    libc::SYS_sched_getscheduler,
325    libc::SYS_sched_get_priority_max,
326    libc::SYS_sched_get_priority_min,
327    libc::SYS_arch_prctl,
328    libc::SYS_prctl, // Kept for PR_SET_NAME, etc. PR_SET_SECCOMP is no-op
329    libc::SYS_getrandom,
330    libc::SYS_prlimit64,
331    libc::SYS_rseq,
332    libc::SYS_ioprio_get,
333    // === Terminal/Device I/O ===
334    // ioctl is handled specially below - blocks TIOCSTI, TIOCSETD, TIOCLINUX
335    // (not in whitelist, filtered like socket)
336    // === Event mechanisms ===
337    libc::SYS_eventfd,
338    libc::SYS_eventfd2,
339    libc::SYS_epoll_create,
340    libc::SYS_epoll_create1,
341    libc::SYS_epoll_ctl,
342    libc::SYS_epoll_wait,
343    libc::SYS_epoll_pwait,
344    libc::SYS_epoll_pwait2,
345    libc::SYS_timerfd_create,
346    libc::SYS_timerfd_settime,
347    libc::SYS_timerfd_gettime,
348    libc::SYS_signalfd,
349    libc::SYS_signalfd4,
350    // === Sockets (filtered separately for domain/type) ===
351    // SYS_socket handled specially - blocks AF_NETLINK, SOCK_RAW
352    libc::SYS_socketpair,
353    libc::SYS_connect,
354    libc::SYS_bind,
355    libc::SYS_listen,
356    libc::SYS_accept,
357    libc::SYS_accept4,
358    libc::SYS_getsockname,
359    libc::SYS_getpeername,
360    libc::SYS_sendto,
361    libc::SYS_recvfrom,
362    libc::SYS_setsockopt,
363    libc::SYS_getsockopt,
364    libc::SYS_shutdown,
365    libc::SYS_sendmsg,
366    libc::SYS_recvmsg,
367    libc::SYS_sendmmsg,
368    libc::SYS_recvmmsg,
369];
370
371/// Builds a BPF filter with clone and socket argument filtering.
372///
373/// ## Filter Layout
374///
375/// ```text
376/// [0-2]   Architecture check (x86_64)
377/// [3]     Load syscall number
378/// [4]     clone3 -> KILL
379/// [5]     clone -> clone_handler
380/// [6]     socket -> socket_handler
381/// [7..N]  Whitelist checks -> ALLOW
382/// [N+1]   RET KILL (default deny)
383/// [N+2]   RET ALLOW
384/// [N+3-6] Clone handler (load flags, JSET, ALLOW/KILL)
385/// [N+7-12] Socket handler (check AF_NETLINK, check SOCK_RAW)
386/// ```
387///
388/// # Panics
389///
390/// Panics if `syscalls.len()` > 200 (BPF jump offsets are u8)
391pub fn build_whitelist_filter(syscalls: &[i64]) -> Vec<SockFilter> {
392    assert!(
393        syscalls.len() <= MAX_WHITELIST_SIZE,
394        "whitelist too large: {} > {} (BPF jump offset overflow)",
395        syscalls.len(),
396        MAX_WHITELIST_SIZE
397    );
398
399    let n = syscalls.len();
400    let mut filter = Vec::with_capacity(n + 20);
401
402    // === Architecture check ===
403    filter.push(SockFilter::stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARCH));
404    filter.push(SockFilter::jump(
405        BPF_JMP | BPF_JEQ | BPF_K,
406        AUDIT_ARCH_X86_64,
407        1,
408        0,
409    ));
410    filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS));
411
412    // === Load syscall number ===
413    filter.push(SockFilter::stmt(
414        BPF_LD | BPF_W | BPF_ABS,
415        OFFSET_SYSCALL_NR,
416    ));
417
418    // === clone3 -> ERRNO(ENOSYS) ===
419    // Return ENOSYS to allow glibc to fall back to clone() syscall.
420    // We can't inspect clone3 args (struct pointer), so we block it but gracefully.
421    // Jump to ERRNO: skip clone + socket + ioctl checks + whitelist + KILL + ALLOW
422    let clone3_errno_offset = (3 + n + 2) as u8;
423    filter.push(SockFilter::jump(
424        BPF_JMP | BPF_JEQ | BPF_K,
425        libc::SYS_clone3 as u32,
426        clone3_errno_offset,
427        0,
428    ));
429
430    // === clone -> clone_handler ===
431    // Jump to clone handler: skip socket + ioctl checks + whitelist + KILL + ALLOW + ERRNO
432    let clone_handler_offset = (2 + n + 3) as u8;
433    filter.push(SockFilter::jump(
434        BPF_JMP | BPF_JEQ | BPF_K,
435        libc::SYS_clone as u32,
436        clone_handler_offset,
437        0,
438    ));
439
440    // === socket -> socket_handler ===
441    // Jump to socket handler: skip ioctl check + whitelist + KILL + ALLOW + ERRNO + clone_handler(4)
442    let socket_handler_offset = (1 + n + 3 + 4) as u8;
443    filter.push(SockFilter::jump(
444        BPF_JMP | BPF_JEQ | BPF_K,
445        libc::SYS_socket as u32,
446        socket_handler_offset,
447        0,
448    ));
449
450    // === ioctl -> ioctl_handler ===
451    // Jump to ioctl handler: skip whitelist + KILL + ALLOW + ERRNO + clone_handler(4) + socket_handler(6)
452    let ioctl_handler_offset = (n + 3 + 4 + 6) as u8;
453    filter.push(SockFilter::jump(
454        BPF_JMP | BPF_JEQ | BPF_K,
455        libc::SYS_ioctl as u32,
456        ioctl_handler_offset,
457        0,
458    ));
459
460    // === Whitelist check ===
461    for (i, &nr) in syscalls.iter().enumerate() {
462        let allow_offset = (n - i) as u8;
463        filter.push(SockFilter::jump(
464            BPF_JMP | BPF_JEQ | BPF_K,
465            nr as u32,
466            allow_offset,
467            0,
468        ));
469    }
470
471    // === Default deny ===
472    filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS));
473
474    // === ALLOW ===
475    filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
476
477    // === ERRNO(ENOSYS) for clone3 ===
478    // This allows glibc to gracefully fall back to clone()
479    filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_ERRNO_ENOSYS));
480
481    // === Clone handler (4 instructions) ===
482    // Load clone flags (args[0])
483    filter.push(SockFilter::stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS_0));
484    // Check blocked flags
485    filter.push(SockFilter::jump(
486        BPF_JMP | BPF_JSET | BPF_K,
487        BLOCKED_CLONE_FLAGS,
488        1,
489        0,
490    ));
491    // No blocked flags -> ALLOW
492    filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
493    // Blocked flags -> KILL
494    filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS));
495
496    // === Socket handler (6 instructions) ===
497    // Load socket domain (args[0])
498    filter.push(SockFilter::stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS_0));
499    // Block AF_NETLINK (domain 16) - access to nf_tables, etc.
500    filter.push(SockFilter::jump(
501        BPF_JMP | BPF_JEQ | BPF_K,
502        AF_NETLINK,
503        3,
504        0,
505    )); // -> KILL
506
507    // Load socket type (args[1])
508    filter.push(SockFilter::stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS_1));
509    // Block SOCK_RAW (type 3) - but need to mask out flags (SOCK_NONBLOCK, etc.)
510    // SOCK_RAW = 3, SOCK_NONBLOCK = 0x800, SOCK_CLOEXEC = 0x80000
511    // We check if (type & 0xF) == SOCK_RAW
512    filter.push(SockFilter::jump(BPF_JMP | BPF_JEQ | BPF_K, SOCK_RAW, 1, 0)); // -> KILL
513
514    // Socket OK -> ALLOW
515    filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
516    // Socket blocked -> KILL
517    filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS));
518
519    // === Ioctl handler (6 instructions) ===
520    // Block dangerous ioctls that can escape sandbox:
521    // - TIOCSTI: inject terminal input (sandbox escape)
522    // - TIOCSETD: load TTY line disciplines (multiple CVEs)
523    // - TIOCLINUX: Linux terminal ops (input injection)
524    // Load ioctl command (args[1])
525    filter.push(SockFilter::stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS_1));
526    // Block TIOCSTI (0x5412) - jt=3 lands on KILL
527    filter.push(SockFilter::jump(BPF_JMP | BPF_JEQ | BPF_K, TIOCSTI, 3, 0));
528    // Block TIOCSETD (0x5423) - jt=2 lands on KILL
529    filter.push(SockFilter::jump(BPF_JMP | BPF_JEQ | BPF_K, TIOCSETD, 2, 0));
530    // Block TIOCLINUX (0x541C) - jt=1 lands on KILL
531    filter.push(SockFilter::jump(BPF_JMP | BPF_JEQ | BPF_K, TIOCLINUX, 1, 0));
532    // Ioctl OK -> ALLOW
533    filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
534    // Ioctl blocked -> KILL
535    filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS));
536
537    filter
538}
539
540/// Applies a seccomp-BPF filter to the current thread.
541///
542/// # Safety
543///
544/// This permanently restricts syscalls for this thread. The filter must be valid.
545///
546/// # Errors
547///
548/// Returns `Errno` if the filter cannot be applied.
549pub unsafe fn seccomp_set_mode_filter(fprog: &SockFprog) -> Result<(), Errno> {
550    let ret = unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
551    if ret != 0 {
552        return Err(last_errno());
553    }
554
555    let ret = unsafe {
556        libc::syscall(
557            libc::SYS_seccomp,
558            SECCOMP_SET_MODE_FILTER,
559            0u32,
560            fprog as *const _,
561        )
562    };
563    if ret != 0 { Err(last_errno()) } else { Ok(()) }
564}
565
566/// Returns true if seccomp is available.
567pub fn seccomp_available() -> bool {
568    unsafe { libc::prctl(libc::PR_GET_SECCOMP, 0, 0, 0, 0) >= 0 }
569}
570
571/// Builds a BPF filter that returns `SECCOMP_RET_USER_NOTIF` for the listed
572/// syscalls and `SECCOMP_RET_ALLOW` for everything else.
573///
574/// This filter is installed *before* the kill filter. The kernel evaluates all
575/// stacked filters and returns the strictest verdict, so:
576/// - Syscall in both ALLOW lists → ALLOW
577/// - Syscall in NOTIFY + ALLOW → NOTIFY (supervisor decides)
578/// - Syscall not in kill filter whitelist → KILL (regardless of notify filter)
579///
580/// # Panics
581///
582/// Panics if `syscalls.len()` > 200 (BPF jump offsets are u8).
583pub fn build_notify_filter(syscalls: &[i64]) -> Vec<SockFilter> {
584    assert!(
585        syscalls.len() <= MAX_WHITELIST_SIZE,
586        "notify syscall list too large: {} > {}",
587        syscalls.len(),
588        MAX_WHITELIST_SIZE
589    );
590
591    let n = syscalls.len();
592    let mut filter = Vec::with_capacity(n + 8);
593
594    // Architecture check
595    filter.push(SockFilter::stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARCH));
596    filter.push(SockFilter::jump(
597        BPF_JMP | BPF_JEQ | BPF_K,
598        AUDIT_ARCH_X86_64,
599        1,
600        0,
601    ));
602    filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
603
604    // Load syscall number
605    filter.push(SockFilter::stmt(
606        BPF_LD | BPF_W | BPF_ABS,
607        OFFSET_SYSCALL_NR,
608    ));
609
610    // Check each syscall → jump to NOTIFY
611    for (i, &nr) in syscalls.iter().enumerate() {
612        let notify_offset = (n - i) as u8; // jump to NOTIFY instruction
613        filter.push(SockFilter::jump(
614            BPF_JMP | BPF_JEQ | BPF_K,
615            nr as u32,
616            notify_offset,
617            0,
618        ));
619    }
620
621    // Default: ALLOW
622    filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
623
624    // NOTIFY
625    filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_USER_NOTIF));
626
627    filter
628}
629
630/// Syscalls that are intercepted by the notify filter for filesystem virtualization.
631pub const NOTIFY_FS_SYSCALLS: &[i64] = &[
632    libc::SYS_openat,
633    libc::SYS_open,
634    libc::SYS_creat,
635    libc::SYS_access,
636    libc::SYS_faccessat,
637    libc::SYS_faccessat2,
638    libc::SYS_stat,
639    libc::SYS_lstat,
640    libc::SYS_newfstatat,
641    libc::SYS_statx,
642    libc::SYS_readlink,
643    libc::SYS_readlinkat,
644];
645
#[cfg(test)]
mod tests {
    use super::*;

    // Opcodes as emitted by the builders.
    const RET: u16 = BPF_RET | BPF_K;
    const LOAD: u16 = BPF_LD | BPF_W | BPF_ABS;
    const JEQ: u16 = BPF_JMP | BPF_JEQ | BPF_K;
    const JSET: u16 = BPF_JMP | BPF_JSET | BPF_K;

    /// Index of the instruction reached when the jump at `idx` is taken.
    fn taken(filter: &[SockFilter], idx: usize) -> usize {
        idx + 1 + usize::from(filter[idx].jt)
    }

    /// Asserts that instruction `idx` is a `RET` with the given verdict.
    fn assert_returns(filter: &[SockFilter], idx: usize, verdict: u32) {
        assert_eq!(filter[idx].code, RET, "instruction {idx} is not a RET");
        assert_eq!(filter[idx].k, verdict, "wrong verdict at instruction {idx}");
    }

    #[test]
    fn filter_structure() {
        let syscalls = &[libc::SYS_read, libc::SYS_write, libc::SYS_exit];
        let filter = build_whitelist_filter(syscalls);
        // 3 (arch) + 1 (load) + 4 (clone3/clone/socket/ioctl) + 3 (whitelist) + 3 (kill/allow/errno)
        // + 4 (clone handler) + 6 (socket handler) + 6 (ioctl handler) = 30
        assert_eq!(filter.len(), 30);
    }

    #[test]
    fn whitelist_entries_jump_to_allow() {
        let filter = build_whitelist_filter(DEFAULT_WHITELIST);
        let n = DEFAULT_WHITELIST.len();
        for (i, &nr) in DEFAULT_WHITELIST.iter().enumerate() {
            let idx = 8 + i;
            assert_eq!(filter[idx].code, JEQ);
            assert_eq!(filter[idx].k, nr as u32);
            assert_returns(&filter, taken(&filter, idx), SECCOMP_RET_ALLOW);
        }
        // Falling through every check hits the default deny.
        assert_returns(&filter, 8 + n, SECCOMP_RET_KILL_PROCESS);
    }

    #[test]
    fn clone3_returns_enosys() {
        let filter = build_whitelist_filter(DEFAULT_WHITELIST);
        let clone3_check = &filter[4];
        assert_eq!(clone3_check.k, libc::SYS_clone3 as u32);
        assert!(clone3_check.jt > 0);
        // clone3 must land on the ERRNO instruction, not on KILL.
        assert_returns(&filter, taken(&filter, 4), SECCOMP_RET_ERRNO_ENOSYS);
    }

    #[test]
    fn clone_has_flag_check() {
        let filter = build_whitelist_filter(DEFAULT_WHITELIST);
        let clone_check = &filter[5];
        assert_eq!(clone_check.k, libc::SYS_clone as u32);
        assert!(clone_check.jt > 0);

        // Handler: load args[0], JSET on the blocked mask, ALLOW, KILL.
        let handler = taken(&filter, 5);
        assert_eq!(filter[handler].code, LOAD);
        assert_eq!(filter[handler].k, OFFSET_ARGS_0);
        assert_eq!(filter[handler + 1].code, JSET);
        assert_eq!(filter[handler + 1].k, BLOCKED_CLONE_FLAGS);
        assert_returns(&filter, handler + 2, SECCOMP_RET_ALLOW);
        assert_returns(
            &filter,
            taken(&filter, handler + 1),
            SECCOMP_RET_KILL_PROCESS,
        );
    }

    #[test]
    fn socket_is_filtered() {
        let filter = build_whitelist_filter(DEFAULT_WHITELIST);
        let socket_check = &filter[6];
        assert_eq!(socket_check.k, libc::SYS_socket as u32);
        assert!(socket_check.jt > 0);

        // Handler starts by loading the domain and denying AF_NETLINK.
        let handler = taken(&filter, 6);
        assert_eq!(filter[handler].code, LOAD);
        assert_eq!(filter[handler].k, OFFSET_ARGS_0);
        assert_eq!(filter[handler + 1].code, JEQ);
        assert_eq!(filter[handler + 1].k, AF_NETLINK);
        assert_returns(
            &filter,
            taken(&filter, handler + 1),
            SECCOMP_RET_KILL_PROCESS,
        );
    }

    #[test]
    fn ioctl_is_filtered() {
        let filter = build_whitelist_filter(DEFAULT_WHITELIST);
        let ioctl_check = &filter[7];
        assert_eq!(ioctl_check.k, libc::SYS_ioctl as u32);
        assert!(ioctl_check.jt > 0);

        // Handler: load args[1], then one deny check per dangerous command.
        let handler = taken(&filter, 7);
        assert_eq!(filter[handler].code, LOAD);
        assert_eq!(filter[handler].k, OFFSET_ARGS_1);
        for (i, cmd) in [TIOCSTI, TIOCSETD, TIOCLINUX].into_iter().enumerate() {
            let idx = handler + 1 + i;
            assert_eq!(filter[idx].code, JEQ);
            assert_eq!(filter[idx].k, cmd);
            assert_returns(&filter, taken(&filter, idx), SECCOMP_RET_KILL_PROCESS);
        }
        assert_returns(&filter, handler + 4, SECCOMP_RET_ALLOW);
    }

    #[test]
    fn blocked_clone_flags_mask() {
        let all_namespace_flags = [
            CLONE_NEWUSER,
            CLONE_NEWNET,
            CLONE_NEWNS,
            CLONE_NEWPID,
            CLONE_NEWIPC,
            CLONE_NEWUTS,
            CLONE_NEWCGROUP,
        ];
        for flag in all_namespace_flags {
            assert_ne!(BLOCKED_CLONE_FLAGS & flag, 0, "flag {flag:#x} not blocked");
        }
    }

    #[test]
    fn dangerous_syscalls_removed() {
        // These should NOT be in the whitelist
        assert!(!DEFAULT_WHITELIST.contains(&libc::SYS_clone));
        assert!(!DEFAULT_WHITELIST.contains(&libc::SYS_clone3));
        assert!(!DEFAULT_WHITELIST.contains(&libc::SYS_socket)); // Filtered separately
        assert!(!DEFAULT_WHITELIST.contains(&libc::SYS_ioctl)); // Filtered separately
        assert!(!DEFAULT_WHITELIST.contains(&libc::SYS_memfd_create));
        assert!(!DEFAULT_WHITELIST.contains(&libc::SYS_execveat));
        assert!(!DEFAULT_WHITELIST.contains(&libc::SYS_setresuid));
        assert!(!DEFAULT_WHITELIST.contains(&libc::SYS_setresgid));
        assert!(!DEFAULT_WHITELIST.contains(&libc::SYS_setsid));
        assert!(!DEFAULT_WHITELIST.contains(&libc::SYS_setpgid));
    }

    #[test]
    fn safe_syscalls_present() {
        assert!(DEFAULT_WHITELIST.contains(&libc::SYS_fork));
        assert!(DEFAULT_WHITELIST.contains(&libc::SYS_vfork));
        assert!(DEFAULT_WHITELIST.contains(&libc::SYS_execve));
        assert!(DEFAULT_WHITELIST.contains(&libc::SYS_sendfile));
        assert!(DEFAULT_WHITELIST.contains(&libc::SYS_close_range));
    }

    #[test]
    fn default_whitelist_fits_limit() {
        assert!(DEFAULT_WHITELIST.len() <= MAX_WHITELIST_SIZE);
    }

    #[test]
    #[should_panic(expected = "whitelist too large")]
    fn whitelist_overflow_panics() {
        let huge: Vec<i64> = (0..300_i64).collect();
        build_whitelist_filter(&huge);
    }

    #[test]
    fn notify_filter_structure() {
        let syscalls = &[libc::SYS_openat, libc::SYS_open, libc::SYS_stat];
        let filter = build_notify_filter(syscalls);
        // 3 (arch) + 1 (load) + 3 (checks) + 1 (allow) + 1 (notify) = 9
        assert_eq!(filter.len(), 9);

        // Every check lands on NOTIFY; falling through lands on ALLOW.
        for (i, &nr) in syscalls.iter().enumerate() {
            let idx = 4 + i;
            assert_eq!(filter[idx].k, nr as u32);
            assert_returns(&filter, taken(&filter, idx), SECCOMP_RET_USER_NOTIF);
        }
        assert_returns(&filter, 4 + syscalls.len(), SECCOMP_RET_ALLOW);
    }

    #[test]
    fn notify_fs_syscalls_present() {
        assert!(NOTIFY_FS_SYSCALLS.contains(&libc::SYS_openat));
        assert!(NOTIFY_FS_SYSCALLS.contains(&libc::SYS_open));
        assert!(NOTIFY_FS_SYSCALLS.contains(&libc::SYS_stat));
        assert!(NOTIFY_FS_SYSCALLS.contains(&libc::SYS_readlink));
    }
}
754}