Skip to main content

nucleus/security/
seccomp.rs

1use crate::error::{NucleusError, Result};
2use crate::security::policy::sha256_hex;
3#[cfg(any(
4    target_arch = "x86_64",
5    target_arch = "aarch64",
6    target_arch = "riscv64"
7))]
8use crate::security::syscall_numbers::{SYS_FADVISE64, SYS_SENDFILE};
9use seccompiler::{BpfProgram, SeccompAction, SeccompCondition, SeccompFilter, SeccompRule};
10use std::collections::BTreeMap;
11use std::path::Path;
12use tracing::{debug, info, warn};
13
/// Seccomp filter manager
///
/// Implements syscall whitelisting for the security state machine
/// (NucleusSecurity_Seccomp_SeccompEnforcement.tla)
///
/// The manager remembers whether a filter has already been installed so that
/// repeated apply calls short-circuit instead of installing a second filter.
pub struct SeccompManager {
    /// `true` once a filter has been applied successfully; the apply methods
    /// return `Ok(true)` immediately when this is already set.
    applied: bool,
}
21
/// Bitwise OR of the namespace-creating `CLONE_NEW*` flags.
///
/// Used as the mask of the `clone` rule built in `minimal_filter`: `clone` is
/// only allowed when none of these bits are set in its flags argument.
const DENIED_CLONE_NAMESPACE_FLAGS: u64 = (libc::CLONE_NEWUSER
    | libc::CLONE_NEWNS
    | libc::CLONE_NEWNET
    | libc::CLONE_NEWIPC
    | libc::CLONE_NEWUTS
    | libc::CLONE_NEWPID
    | libc::CLONE_NEWCGROUP
    | libc::CLONE_NEWTIME) as u64;
30
31impl SeccompManager {
    /// Create a manager in the initial state, with no filter applied yet.
    pub fn new() -> Self {
        Self { applied: false }
    }
35
36    fn base_allowed_syscalls() -> Vec<i64> {
37        let mut syscalls = vec![
38            // File I/O
39            libc::SYS_read,
40            libc::SYS_write,
41            libc::SYS_openat,
42            libc::SYS_close,
43            libc::SYS_fstat,
44            libc::SYS_lseek,
45            libc::SYS_fcntl,
46            libc::SYS_readv,
47            libc::SYS_writev,
48            libc::SYS_preadv,
49            libc::SYS_pwritev,
50            libc::SYS_pread64,
51            libc::SYS_pwrite64,
52            libc::SYS_readlinkat,
53            libc::SYS_newfstatat,
54            libc::SYS_statx,
55            libc::SYS_faccessat,
56            libc::SYS_faccessat2,
57            libc::SYS_dup,
58            libc::SYS_dup3,
59            libc::SYS_pipe2,
60            libc::SYS_unlinkat,
61            libc::SYS_renameat,
62            libc::SYS_renameat2,
63            libc::SYS_linkat,
64            libc::SYS_symlinkat,
65            libc::SYS_fchmod,
66            libc::SYS_fchmodat,
67            libc::SYS_truncate,
68            libc::SYS_ftruncate,
69            libc::SYS_fallocate,
70            #[cfg(any(
71                target_arch = "x86_64",
72                target_arch = "aarch64",
73                target_arch = "riscv64"
74            ))]
75            SYS_FADVISE64,
76            libc::SYS_fsync,
77            libc::SYS_fdatasync,
78            libc::SYS_sync_file_range,
79            libc::SYS_flock,
80            libc::SYS_fstatfs,
81            libc::SYS_statfs,
82            #[cfg(any(
83                target_arch = "x86_64",
84                target_arch = "aarch64",
85                target_arch = "riscv64"
86            ))]
87            SYS_SENDFILE,
88            libc::SYS_copy_file_range,
89            libc::SYS_splice,
90            libc::SYS_tee,
91            // Memory management
92            libc::SYS_mmap,
93            libc::SYS_munmap,
94            libc::SYS_brk,
95            libc::SYS_mremap,
96            libc::SYS_madvise,
97            libc::SYS_msync,
98            libc::SYS_mlock,
99            libc::SYS_munlock,
100            libc::SYS_mlock2,
101            // SysV shared memory – used by PostgreSQL, Redis, and many databases
102            // for shared buffer pools. Safe in PID/IPC namespaces (isolated keyspace).
103            libc::SYS_shmget,
104            libc::SYS_shmat,
105            libc::SYS_shmdt,
106            libc::SYS_shmctl,
107            // POSIX semaphores (used by PostgreSQL for lightweight locking)
108            libc::SYS_semget,
109            libc::SYS_semop,
110            libc::SYS_semctl,
111            libc::SYS_semtimedop,
112            // Process management
113            // fork intentionally excluded – modern glibc/musl use clone(), which
114            // has namespace-flag filtering. Removing SYS_fork forces all forks
115            // through the filtered clone path (defense-in-depth against fork bombs
116            // and unfiltered namespace creation).
117            libc::SYS_execve,
118            // execveat is conditionally allowed below (AT_EMPTY_PATH blocked)
119            libc::SYS_wait4,
120            libc::SYS_waitid,
121            libc::SYS_exit,
122            libc::SYS_exit_group,
123            libc::SYS_getpid,
124            libc::SYS_gettid,
125            libc::SYS_getuid,
126            libc::SYS_getgid,
127            libc::SYS_geteuid,
128            libc::SYS_getegid,
129            libc::SYS_getppid,
130            libc::SYS_setsid,
131            libc::SYS_getgroups,
132            // Signals
133            libc::SYS_rt_sigaction,
134            libc::SYS_rt_sigprocmask,
135            libc::SYS_rt_sigreturn,
136            libc::SYS_rt_sigsuspend,
137            libc::SYS_rt_sigtimedwait,
138            libc::SYS_rt_sigpending,
139            libc::SYS_rt_sigqueueinfo,
140            libc::SYS_sigaltstack,
141            libc::SYS_restart_syscall,
142            // L7: kill/tgkill are safe when PID namespace is active (container
143            // can only signal its own processes). If PID namespace creation fails,
144            // the runtime aborts, so this is safe.
145            libc::SYS_kill,
146            libc::SYS_tgkill,
147            // Time and timers
148            libc::SYS_clock_gettime,
149            libc::SYS_clock_getres,
150            libc::SYS_clock_nanosleep,
151            libc::SYS_gettimeofday,
152            libc::SYS_nanosleep,
153            libc::SYS_setitimer,
154            libc::SYS_getitimer,
155            // Directories
156            libc::SYS_getcwd,
157            libc::SYS_chdir,
158            libc::SYS_fchdir,
159            libc::SYS_mkdirat,
160            libc::SYS_getdents64,
161            // Misc
162            libc::SYS_uname,
163            libc::SYS_getrandom,
164            libc::SYS_futex,
165            libc::SYS_set_tid_address,
166            libc::SYS_set_robust_list,
167            libc::SYS_get_robust_list,
168            // L8: sysinfo removed – leaks host RAM, uptime, and process count.
169            // Applications needing this info should use /proc/meminfo instead.
170            libc::SYS_umask,
171            // prlimit64 moved to arg-filtered section (M3)
172            libc::SYS_getrusage,
173            libc::SYS_times,
174            libc::SYS_sched_yield,
175            libc::SYS_sched_getaffinity,
176            libc::SYS_sched_setaffinity,
177            libc::SYS_sched_getparam,
178            libc::SYS_sched_getscheduler,
179            libc::SYS_getcpu,
180            // Extended attributes – read-only queries, safe
181            libc::SYS_getxattr,
182            libc::SYS_lgetxattr,
183            libc::SYS_fgetxattr,
184            libc::SYS_listxattr,
185            libc::SYS_llistxattr,
186            libc::SYS_flistxattr,
187            libc::SYS_rseq,
188            libc::SYS_close_range,
189            // Ownership – safe after capability drop (CAP_CHOWN/CAP_FOWNER gone;
190            // operations on files not owned by the container UID will EPERM).
191            libc::SYS_fchown,
192            libc::SYS_fchownat,
193            // Legacy AIO – used by databases and storage engines. Operations are
194            // bounded by the process's existing fd permissions.
195            libc::SYS_io_setup,
196            libc::SYS_io_destroy,
197            libc::SYS_io_submit,
198            libc::SYS_io_getevents,
199            // NOTE: io_uring intentionally excluded from defaults – large kernel
200            // attack surface with a history of CVEs. Applications needing io_uring
201            // (e.g. PostgreSQL 18+ io_method=io_uring) should use a custom seccomp
202            // profile that adds io_uring_setup/io_uring_enter/io_uring_register.
203            // Process groups – safe in PID namespace (can only affect own pgrp).
204            libc::SYS_setpgid,
205            libc::SYS_getpgid,
206            // NOTE: memfd_create intentionally excluded – combined with execveat
207            // it enables fileless code execution bypassing all FS controls (SEC-02).
208            // Landlock bootstrap (runtime applies seccomp before Landlock)
209            libc::SYS_landlock_create_ruleset,
210            libc::SYS_landlock_add_rule,
211            libc::SYS_landlock_restrict_self,
212            // Socket/Network (safe introspection + local socketpair)
213            libc::SYS_getsockname,
214            libc::SYS_getpeername,
215            libc::SYS_socketpair,
216            libc::SYS_getsockopt,
217            // Poll/Select
218            libc::SYS_ppoll,
219            libc::SYS_pselect6,
220            libc::SYS_epoll_create1,
221            libc::SYS_epoll_ctl,
222            libc::SYS_epoll_pwait,
223            libc::SYS_eventfd2,
224            libc::SYS_signalfd4,
225            libc::SYS_timerfd_create,
226            libc::SYS_timerfd_settime,
227            libc::SYS_timerfd_gettime,
228        ];
229
230        // Legacy syscalls only available on x86_64 (aarch64 only has the *at variants)
231        #[cfg(target_arch = "x86_64")]
232        syscalls.extend_from_slice(&[
233            libc::SYS_open,
234            libc::SYS_stat,
235            libc::SYS_lstat,
236            libc::SYS_access,
237            libc::SYS_readlink,
238            libc::SYS_dup2,
239            libc::SYS_pipe,
240            libc::SYS_unlink,
241            libc::SYS_rename,
242            libc::SYS_link,
243            libc::SYS_symlink,
244            libc::SYS_chmod,
245            libc::SYS_mkdir,
246            libc::SYS_rmdir,
247            libc::SYS_getdents,
248            libc::SYS_getpgrp,
249            libc::SYS_chown,
250            libc::SYS_fchown,
251            libc::SYS_lchown,
252            libc::SYS_arch_prctl,
253            libc::SYS_getrlimit,
254            libc::SYS_poll,
255            libc::SYS_select,
256            libc::SYS_epoll_create,
257            libc::SYS_epoll_wait,
258            libc::SYS_eventfd,
259            libc::SYS_signalfd,
260        ]);
261
262        syscalls
263    }
264
265    fn allowed_socket_domains(allow_network: bool) -> Vec<i32> {
266        if allow_network {
267            vec![libc::AF_UNIX, libc::AF_INET, libc::AF_INET6]
268        } else {
269            vec![libc::AF_UNIX]
270        }
271    }
272
273    fn network_mode_syscalls(allow_network: bool) -> Vec<i64> {
274        if allow_network {
275            vec![
276                libc::SYS_connect,
277                libc::SYS_sendto,
278                libc::SYS_recvfrom,
279                libc::SYS_sendmsg,
280                libc::SYS_recvmsg,
281                libc::SYS_shutdown,
282                libc::SYS_bind,
283                libc::SYS_listen,
284                libc::SYS_accept,
285                libc::SYS_accept4,
286                libc::SYS_setsockopt,
287            ]
288        } else {
289            Vec::new()
290        }
291    }
292
    /// Get minimal syscall whitelist for basic container operation
    ///
    /// This is a restrictive whitelist that blocks dangerous syscalls:
    /// - ptrace (process tracing)
    /// - kexec_load (kernel loading)
    /// - add_key, request_key, keyctl (kernel keyring)
    /// - bpf (eBPF programs)
    /// - perf_event_open (performance monitoring)
    /// - userfaultfd (user fault handling)
    ///
    /// These are blocked simply by never being inserted into the returned map.
    ///
    /// # Arguments
    /// * `allow_network` - selects the permitted `socket()` domains and whether
    ///   the syscalls from `network_mode_syscalls` are added.
    /// * `extra_syscalls` - names passed via `--seccomp-allow`; only names that
    ///   are in `OPT_IN_SYSCALLS` (and not security-critical) are added, all
    ///   other names are skipped with a warning.
    ///
    /// # Returns
    /// A map from syscall number to its rule list. Syscalls inserted with an
    /// empty rule list are the unconditionally allowed ones; the others carry
    /// one rule per accepted argument pattern.
    ///
    /// # Errors
    /// Returns `NucleusError::SeccompError` if any seccomp condition or rule
    /// cannot be constructed.
    fn minimal_filter(
        allow_network: bool,
        extra_syscalls: &[String],
    ) -> Result<BTreeMap<i64, Vec<SeccompRule>>> {
        let mut rules: BTreeMap<i64, Vec<SeccompRule>> = BTreeMap::new();

        // Essential syscalls for basic operation
        let allowed_syscalls = Self::base_allowed_syscalls();

        // Allow all these syscalls unconditionally
        for syscall in allowed_syscalls {
            rules.insert(syscall, Vec::new());
        }

        // Add network-mode-specific syscalls
        for syscall in Self::network_mode_syscalls(allow_network) {
            rules.insert(syscall, Vec::new());
        }

        // Add user-requested extra syscalls (--seccomp-allow).
        // - Already in default/arg-filtered: silently accepted (no-op).
        // - In OPT_IN_SYSCALLS: added to allowlist.
        // - Security-critical denied names: WARN and blocked even if later
        //   accidentally added to OPT_IN_SYSCALLS.
        // - Known but not opt-in: WARN and blocked (defense-in-depth).
        // - Unknown name: WARN and blocked.
        //
        // This loop runs before the argument-filtered rules below are inserted,
        // which is why arg-filtered names are skipped explicitly here: they are
        // not in the map yet.
        for name in extra_syscalls {
            if Self::ARG_FILTERED_SYSCALLS.contains(&name.as_str()) {
                continue;
            }

            if let Some(nr) = syscall_name_to_number(name) {
                // Only a syscall that is not yet in the map can be added; an
                // occupied entry means it is already allowed by default.
                if let std::collections::btree_map::Entry::Vacant(entry) = rules.entry(nr) {
                    if Self::SECURITY_CRITICAL_DENIED_SYSCALLS.contains(&name.as_str()) {
                        warn!(
                            "--seccomp-allow: security-critical syscall '{}' is always blocked",
                            name
                        );
                    } else if Self::OPT_IN_SYSCALLS.contains(&name.as_str()) {
                        entry.insert(Vec::new());
                    } else {
                        warn!(
                            "--seccomp-allow: syscall '{}' is not in the opt-in allowlist – blocked",
                            name
                        );
                    }
                }
            } else {
                warn!("--seccomp-allow: unknown syscall '{}' – blocked", name);
            }
        }

        // Restrict socket() domains by network mode.
        // none: AF_UNIX only; network-enabled: AF_UNIX/AF_INET/AF_INET6.
        // One rule per permitted domain.
        let mut socket_rules = Vec::new();
        for domain in Self::allowed_socket_domains(allow_network) {
            let condition = SeccompCondition::new(
                0, // arg0 is socket(domain, type, protocol)
                seccompiler::SeccompCmpArgLen::Dword,
                seccompiler::SeccompCmpOp::Eq,
                domain as u64,
            )
            .map_err(|e| {
                NucleusError::SeccompError(format!(
                    "Failed to create socket domain condition: {}",
                    e
                ))
            })?;
            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
                NucleusError::SeccompError(format!("Failed to create socket rule: {}", e))
            })?;
            socket_rules.push(rule);
        }
        rules.insert(libc::SYS_socket, socket_rules);

        // ioctl: allow only safe terminal operations (arg1 = request code)
        let ioctl_allowed: &[u64] = &[
            0x5401, // TCGETS
            0x5402, // TCSETS
            0x5403, // TCSETSW
            0x5404, // TCSETSF
            0x540B, // TCFLSH
            0x540F, // TIOCGPGRP
            0x5410, // TIOCSPGRP
            0x5413, // TIOCGWINSZ
            0x5429, // TIOCGSID
            0x541B, // FIONREAD
            0x5421, // M12: FIONBIO – allowed because fcntl(F_SETFL, O_NONBLOCK)
            // achieves the same result and is already permitted. Blocking
            // FIONBIO only breaks tokio/mio for no security gain.
            0x5451, // FIOCLEX
            0x5450, // FIONCLEX
        ];
        let mut ioctl_rules = Vec::new();
        for &request in ioctl_allowed {
            let condition = SeccompCondition::new(
                1, // arg1 is the request code for ioctl(fd, request, ...)
                seccompiler::SeccompCmpArgLen::Dword,
                seccompiler::SeccompCmpOp::Eq,
                request,
            )
            .map_err(|e| {
                NucleusError::SeccompError(format!("Failed to create ioctl condition: {}", e))
            })?;
            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
                NucleusError::SeccompError(format!("Failed to create ioctl rule: {}", e))
            })?;
            ioctl_rules.push(rule);
        }
        rules.insert(libc::SYS_ioctl, ioctl_rules);

        // prctl: allow only safe operations.
        // Notably absent (hit default deny):
        //   PR_CAPBSET_DROP (24) – could weaken the capability bounding set
        //   PR_SET_SECUREBITS (28) – could disable secure-exec restrictions
        //   PR_CAP_AMBIENT mutations – could activate retained inheritable caps
        let prctl_allowed: &[u64] = &[
            1,  // PR_SET_PDEATHSIG
            2,  // PR_GET_PDEATHSIG
            15, // PR_SET_NAME
            16, // PR_GET_NAME
            23, // PR_CAPBSET_READ – glibc probes this at startup to discover
            // cap_last_cap when /proc/sys is masked. Read-only, harmless
            // after capabilities have been dropped.
            27, // PR_GET_SECUREBITS – read-only query of securebits flags
            36, // PR_SET_CHILD_SUBREAPER – safe, only affects own descendants
            37, // PR_GET_CHILD_SUBREAPER
            38, // PR_SET_NO_NEW_PRIVS
            40, // PR_GET_TID_ADDRESS – read-only, returns thread ID address
            39, // PR_GET_NO_NEW_PRIVS
        ];
        let mut prctl_rules = Vec::new();
        for &option in prctl_allowed {
            let condition = SeccompCondition::new(
                0, // arg0 is the option for prctl(option, ...)
                seccompiler::SeccompCmpArgLen::Dword,
                seccompiler::SeccompCmpOp::Eq,
                option,
            )
            .map_err(|e| {
                NucleusError::SeccompError(format!("Failed to create prctl condition: {}", e))
            })?;
            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
                NucleusError::SeccompError(format!("Failed to create prctl rule: {}", e))
            })?;
            prctl_rules.push(rule);
        }

        // PR_CAP_AMBIENT is allowed only for the read-only IS_SET probe: this
        // rule requires both the option (arg0) and the subcommand (arg1) to
        // match, so every other PR_CAP_AMBIENT subcommand stays denied.
        let ambient_option = SeccompCondition::new(
            0, // arg0 is the option for prctl(option, ...)
            seccompiler::SeccompCmpArgLen::Dword,
            seccompiler::SeccompCmpOp::Eq,
            libc::PR_CAP_AMBIENT as u64,
        )
        .map_err(|e| {
            NucleusError::SeccompError(format!(
                "Failed to create PR_CAP_AMBIENT prctl condition: {}",
                e
            ))
        })?;
        let ambient_is_set = SeccompCondition::new(
            1, // arg1 is the PR_CAP_AMBIENT subcommand.
            seccompiler::SeccompCmpArgLen::Dword,
            seccompiler::SeccompCmpOp::Eq,
            libc::PR_CAP_AMBIENT_IS_SET as u64,
        )
        .map_err(|e| {
            NucleusError::SeccompError(format!(
                "Failed to create PR_CAP_AMBIENT_IS_SET prctl condition: {}",
                e
            ))
        })?;
        let ambient_probe_rule =
            SeccompRule::new(vec![ambient_option, ambient_is_set]).map_err(|e| {
                NucleusError::SeccompError(format!(
                    "Failed to create PR_CAP_AMBIENT_IS_SET prctl rule: {}",
                    e
                ))
            })?;
        prctl_rules.push(ambient_probe_rule);
        rules.insert(libc::SYS_prctl, prctl_rules);

        // M3: prlimit64 – only allow GET (new_limit == NULL, i.e. arg2 == 0).
        // SET operations could raise RLIMIT_NPROC to bypass fork-bomb protection.
        let prlimit_condition = SeccompCondition::new(
            2, // arg2 = new_limit pointer for prlimit64(pid, resource, new_limit, old_limit)
            seccompiler::SeccompCmpArgLen::Qword,
            seccompiler::SeccompCmpOp::Eq,
            0u64, // new_limit == NULL means GET-only
        )
        .map_err(|e| {
            NucleusError::SeccompError(format!("Failed to create prlimit64 condition: {}", e))
        })?;
        let prlimit_rule = SeccompRule::new(vec![prlimit_condition]).map_err(|e| {
            NucleusError::SeccompError(format!("Failed to create prlimit64 rule: {}", e))
        })?;
        rules.insert(libc::SYS_prlimit64, vec![prlimit_rule]);

        // mprotect: permit RW or RX transitions, but reject PROT_WRITE|PROT_EXEC.
        // Each rule masks prot with (PROT_WRITE | PROT_EXEC) and accepts one of
        // the three results: neither bit, write only, or exec only.
        let mut mprotect_rules = Vec::new();
        for allowed in [0, libc::PROT_WRITE as u64, libc::PROT_EXEC as u64] {
            let condition = SeccompCondition::new(
                2, // arg2 is prot for mprotect(addr, len, prot)
                seccompiler::SeccompCmpArgLen::Dword,
                seccompiler::SeccompCmpOp::MaskedEq((libc::PROT_WRITE | libc::PROT_EXEC) as u64),
                allowed,
            )
            .map_err(|e| {
                NucleusError::SeccompError(format!("Failed to create mprotect condition: {}", e))
            })?;
            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
                NucleusError::SeccompError(format!("Failed to create mprotect rule: {}", e))
            })?;
            mprotect_rules.push(rule);
        }
        rules.insert(libc::SYS_mprotect, mprotect_rules);

        // preadv2/pwritev2: allow only flags == 0, which makes them equivalent
        // to preadv/pwritev. Nonzero RWF_* flags include cache-observing
        // behavior such as RWF_NOWAIT and future flags seccomp cannot review.
        for (syscall, name) in [
            (libc::SYS_preadv2, "preadv2"),
            (libc::SYS_pwritev2, "pwritev2"),
        ] {
            let condition = SeccompCondition::new(
                5, // arg5 = flags for preadv2/pwritev2(fd, iov, iovcnt, off_lo, off_hi, flags)
                seccompiler::SeccompCmpArgLen::Qword,
                seccompiler::SeccompCmpOp::Eq,
                0,
            )
            .map_err(|e| {
                NucleusError::SeccompError(format!(
                    "Failed to create {} flags condition: {}",
                    name, e
                ))
            })?;
            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
                NucleusError::SeccompError(format!("Failed to create {} rule: {}", name, e))
            })?;
            rules.insert(syscall, vec![rule]);
        }

        // clone3 is intentionally absent from the allow map. Its flags live in a
        // user pointer (struct clone_args), which seccomp BPF cannot dereference,
        // so it cannot be safely namespace-filtered. The BPF compiler adds an
        // exact-match ENOSYS deny for clone3 so libc falls back to filtered clone.

        // clone: allow but deny namespace-creating flags to prevent nested namespace creation
        let clone_condition = SeccompCondition::new(
            0, // arg0 = flags
            seccompiler::SeccompCmpArgLen::Qword,
            seccompiler::SeccompCmpOp::MaskedEq(DENIED_CLONE_NAMESPACE_FLAGS),
            0, // (flags & ns_flags) == 0: none of the namespace flags set
        )
        .map_err(|e| {
            NucleusError::SeccompError(format!("Failed to create clone condition: {}", e))
        })?;
        let clone_rule = SeccompRule::new(vec![clone_condition]).map_err(|e| {
            NucleusError::SeccompError(format!("Failed to create clone rule: {}", e))
        })?;
        rules.insert(libc::SYS_clone, vec![clone_rule]);

        // execveat: allow but block AT_EMPTY_PATH (0x1000) to prevent fileless
        // execution. With AT_EMPTY_PATH, execveat can execute code from any open
        // fd (e.g., open + unlink, or even a socket fd), bypassing filesystem
        // controls – not just memfd_create. Blocking memfd_create alone is
        // insufficient. Normal execveat with dirfd+pathname (no AT_EMPTY_PATH)
        // remains allowed.
        let execveat_condition = SeccompCondition::new(
            4, // arg4 = flags for execveat(dirfd, pathname, argv, envp, flags)
            seccompiler::SeccompCmpArgLen::Dword,
            seccompiler::SeccompCmpOp::MaskedEq(libc::AT_EMPTY_PATH as u64),
            0, // (flags & AT_EMPTY_PATH) == 0: AT_EMPTY_PATH not set
        )
        .map_err(|e| {
            NucleusError::SeccompError(format!("Failed to create execveat condition: {}", e))
        })?;
        let execveat_rule = SeccompRule::new(vec![execveat_condition]).map_err(|e| {
            NucleusError::SeccompError(format!("Failed to create execveat rule: {}", e))
        })?;
        rules.insert(libc::SYS_execveat, vec![execveat_rule]);

        Ok(rules)
    }
586
587    /// Validate `--seccomp-allow` entries for production mode.
588    ///
589    /// Development runs warn and ignore unsupported names at filter construction
590    /// time. Production mode rejects them early so operators cannot accidentally
591    /// believe a weakened or unsupported syscall fragment was applied.
592    pub fn validate_extra_syscalls_for_production(
593        allow_network: bool,
594        extra_syscalls: &[String],
595    ) -> Result<()> {
596        let base_syscalls = Self::base_allowed_syscalls();
597        let network_syscalls = Self::network_mode_syscalls(allow_network);
598
599        for name in extra_syscalls {
600            let Some(nr) = syscall_name_to_number(name) else {
601                return Err(NucleusError::ConfigError(format!(
602                    "Production mode rejects unknown --seccomp-allow syscall '{}'",
603                    name
604                )));
605            };
606
607            if base_syscalls.contains(&nr)
608                || network_syscalls.contains(&nr)
609                || Self::ARG_FILTERED_SYSCALLS.contains(&name.as_str())
610            {
611                continue;
612            }
613
614            if Self::SECURITY_CRITICAL_DENIED_SYSCALLS.contains(&name.as_str()) {
615                return Err(NucleusError::ConfigError(format!(
616                    "Production mode forbids --seccomp-allow for security-critical syscall '{}'",
617                    name
618                )));
619            }
620
621            if Self::OPT_IN_SYSCALLS.contains(&name.as_str()) {
622                continue;
623            }
624
625            return Err(NucleusError::ConfigError(format!(
626                "Production mode rejects unsupported --seccomp-allow syscall '{}'",
627                name
628            )));
629        }
630
631        Ok(())
632    }
633
634    /// Compile the minimal BPF filter without applying it
635    ///
636    /// This is useful for benchmarking filter compilation overhead
637    /// without the irreversible side effect of applying the filter.
638    ///
639    /// Uses bitmap-based BPF compilation for O(1) syscall dispatch.
640    pub fn compile_minimal_filter() -> Result<BpfProgram> {
641        let rules = Self::minimal_filter(true, &[])?;
642        let target_arch = std::env::consts::ARCH.try_into().map_err(|e| {
643            NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
644        })?;
645        super::seccomp_bpf::compile_bitmap_bpf_with_errno_syscalls(
646            rules,
647            Self::errno_denied_syscalls(),
648            SeccompAction::KillProcess,
649            SeccompAction::Allow,
650            target_arch,
651        )
652    }
653
    /// Expose minimal_filter for tests in sibling modules.
    ///
    /// Test-only convenience: forwards both arguments unchanged and panics
    /// (via `unwrap`) if rule construction fails.
    #[cfg(test)]
    pub(crate) fn minimal_filter_for_test(
        allow_network: bool,
        extra_syscalls: &[String],
    ) -> BTreeMap<i64, Vec<SeccompRule>> {
        Self::minimal_filter(allow_network, extra_syscalls).unwrap()
    }
662
    /// Expose `errno_denied_syscalls` for tests in sibling modules.
    ///
    /// Returns the same static table of `(i64, u32)` pairs, presumably
    /// (syscall number, errno) – confirm against `errno_denied_syscalls`.
    #[cfg(test)]
    pub(crate) fn errno_denied_syscalls_for_test() -> &'static [(i64, u32)] {
        Self::errno_denied_syscalls()
    }
667
    /// Apply seccomp filter
    ///
    /// This implements the transition: no_filter -> whitelist_active
    /// in the seccomp state machine (NucleusSecurity_Seccomp_SeccompEnforcement.tla)
    ///
    /// Once applied, the filter cannot be removed (irreversible property).
    ///
    /// This entry point is fail-closed: it delegates to
    /// `apply_minimal_filter_with_mode` with `best_effort = false` and
    /// `log_denied = false`, so a seccomp setup failure is returned as an
    /// error instead of being logged and ignored. Callers that want
    /// warn-and-continue behavior must call `apply_minimal_filter_with_mode`
    /// with `best_effort = true`.
    pub fn apply_minimal_filter(&mut self) -> Result<bool> {
        self.apply_minimal_filter_with_mode(false, false)
    }
678
    /// Apply seccomp filter with configurable failure behavior
    ///
    /// When `best_effort` is true, failures are logged and execution continues.
    /// When false, seccomp setup is fail-closed.
    ///
    /// Delegates to `apply_filter_for_network_mode` with networking enabled
    /// (`allow_network = true`) and no extra syscalls; `log_denied` is
    /// forwarded unchanged.
    pub fn apply_minimal_filter_with_mode(
        &mut self,
        best_effort: bool,
        log_denied: bool,
    ) -> Result<bool> {
        self.apply_filter_for_network_mode(true, best_effort, log_denied, &[])
    }
690
691    /// Apply seccomp filter with network-mode-aware socket restrictions
692    ///
693    /// When `allow_network` is false, `SYS_socket` is restricted to AF_UNIX only,
694    /// preventing creation of network sockets (AF_INET, AF_INET6, etc.).
695    /// When `allow_network` is true, all socket domains are permitted.
696    ///
697    /// When `best_effort` is true, failures are logged and execution continues.
698    /// When false, seccomp setup is fail-closed.
699    pub fn apply_filter_for_network_mode(
700        &mut self,
701        allow_network: bool,
702        best_effort: bool,
703        log_denied: bool,
704        extra_syscalls: &[String],
705    ) -> Result<bool> {
706        if self.applied {
707            debug!("Seccomp filter already applied, skipping");
708            return Ok(true);
709        }
710
711        info!(allow_network, "Applying seccomp filter");
712
713        let rules = match Self::minimal_filter(allow_network, extra_syscalls) {
714            Ok(r) => r,
715            Err(e) => {
716                if best_effort {
717                    warn!(
718                        "Failed to create seccomp rules: {} (continuing without seccomp)",
719                        e
720                    );
721                    return Ok(false);
722                }
723                return Err(e);
724            }
725        };
726
727        let target_arch = match std::env::consts::ARCH.try_into() {
728            Ok(a) => a,
729            Err(e) => {
730                let msg = format!("Unsupported architecture: {:?}", e);
731                if best_effort {
732                    warn!("{} (continuing without seccomp)", msg);
733                    return Ok(false);
734                }
735                return Err(NucleusError::SeccompError(msg));
736            }
737        };
738
739        let bpf_prog: BpfProgram = match super::seccomp_bpf::compile_bitmap_bpf_with_errno_syscalls(
740            rules,
741            Self::errno_denied_syscalls(),
742            SeccompAction::KillProcess,
743            SeccompAction::Allow,
744            target_arch,
745        ) {
746            Ok(p) => p,
747            Err(e) => {
748                if best_effort {
749                    warn!(
750                        "Failed to compile BPF program: {} (continuing without seccomp)",
751                        e
752                    );
753                    return Ok(false);
754                }
755                return Err(e);
756            }
757        };
758
759        // Apply the filter
760        match Self::apply_bpf_program(&bpf_prog, log_denied) {
761            Ok(_) => {
762                self.applied = true;
763                info!("Successfully applied seccomp filter");
764                Ok(true)
765            }
766            Err(e) => {
767                if best_effort {
768                    warn!(
769                        "Failed to apply seccomp filter: {} (continuing without seccomp)",
770                        e
771                    );
772                    Ok(false)
773                } else {
774                    Err(NucleusError::SeccompError(format!(
775                        "Failed to apply seccomp filter: {}",
776                        e
777                    )))
778                }
779            }
780        }
781    }
782
783    /// Apply a seccomp profile loaded from a JSON file.
784    ///
785    /// The profile format is a JSON object with:
786    /// ```json
787    /// {
788    ///   "defaultAction": "SCMP_ACT_ERRNO",
789    ///   "syscalls": [
790    ///     { "names": ["read", "write", "open", ...], "action": "SCMP_ACT_ALLOW" }
791    ///   ]
792    /// }
793    /// ```
794    ///
795    /// This is a subset of the OCI seccomp profile format. Only the syscall name
796    /// allowlist is used; argument-level filtering from the built-in profile is
797    /// not applied when using a custom profile.
798    ///
799    /// If `expected_sha256` is provided, the file's SHA-256 hash is verified
800    /// against it before loading. This prevents silent profile tampering.
801    pub fn apply_profile_from_file(
802        &mut self,
803        profile_path: &Path,
804        expected_sha256: Option<&str>,
805        audit_mode: bool,
806    ) -> Result<bool> {
807        if self.applied {
808            debug!("Seccomp filter already applied, skipping");
809            return Ok(true);
810        }
811
812        info!("Loading seccomp profile from {:?}", profile_path);
813
814        // Read profile file
815        let content = std::fs::read(profile_path).map_err(|e| {
816            NucleusError::SeccompError(format!(
817                "Failed to read seccomp profile {:?}: {}",
818                profile_path, e
819            ))
820        })?;
821
822        // Verify SHA-256 hash if expected
823        if let Some(expected) = expected_sha256 {
824            let actual = sha256_hex(&content);
825            if actual != expected {
826                return Err(NucleusError::SeccompError(format!(
827                    "Seccomp profile hash mismatch: expected {}, got {}",
828                    expected, actual
829                )));
830            }
831            info!("Seccomp profile hash verified: {}", actual);
832        }
833
834        // Parse profile
835        let profile: SeccompProfile = serde_json::from_slice(&content).map_err(|e| {
836            NucleusError::SeccompError(format!("Failed to parse seccomp profile: {}", e))
837        })?;
838
839        // Warn when custom profile allows security-critical syscalls without
840        // argument-level filtering. The built-in filter restricts clone, ioctl,
841        // prctl, and socket at the argument level; a custom profile that allows
842        // them by name only silently removes all of that hardening.
843        Self::warn_missing_arg_filters(&profile);
844
845        // Build filter from profile
846        let mut rules: BTreeMap<i64, Vec<SeccompRule>> = BTreeMap::new();
847
848        for syscall_group in &profile.syscalls {
849            if syscall_group.action == "SCMP_ACT_ALLOW" {
850                for name in &syscall_group.names {
851                    if name == "clone3" {
852                        warn!(
853                            "Custom seccomp profile requested clone3; ignoring it and returning \
854                             ENOSYS because clone3 namespace flags cannot be argument-filtered"
855                        );
856                        continue;
857                    }
858                    if let Some(nr) = syscall_name_to_number(name) {
859                        rules.insert(nr, Vec::new());
860                    } else {
861                        warn!("Unknown syscall in profile: {} (skipping)", name);
862                    }
863                }
864            }
865        }
866
867        // SEC-01: Merge built-in argument filters for security-critical syscalls.
868        // Custom profiles that allow clone/ioctl/prctl/socket/mprotect by name
869        // without argument-level filters would silently remove all hardening.
870        // Overwrite their empty rules with the built-in argument-filtered rules.
871        let builtin_rules = Self::minimal_filter(true, &[])?;
872        for syscall_name in Self::ARG_FILTERED_SYSCALLS {
873            if let Some(nr) = syscall_name_to_number(syscall_name) {
874                if let std::collections::btree_map::Entry::Occupied(mut entry) = rules.entry(nr) {
875                    if let Some(builtin) = builtin_rules.get(&nr) {
876                        if !builtin.is_empty() {
877                            info!(
878                                "Merging built-in argument filters for '{}' into custom profile",
879                                syscall_name
880                            );
881                            entry.insert(builtin.clone());
882                        }
883                    }
884                }
885            }
886        }
887        let target_arch = std::env::consts::ARCH.try_into().map_err(|e| {
888            NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
889        })?;
890
891        let bpf_prog: BpfProgram = super::seccomp_bpf::compile_bitmap_bpf_with_errno_syscalls(
892            rules,
893            Self::errno_denied_syscalls(),
894            SeccompAction::KillProcess,
895            SeccompAction::Allow,
896            target_arch,
897        )?;
898
899        match Self::apply_bpf_program(&bpf_prog, audit_mode) {
900            Ok(_) => {
901                self.applied = true;
902                info!(
903                    "Seccomp profile applied from {:?} (log_denied={})",
904                    profile_path, audit_mode
905                );
906                Ok(true)
907            }
908            Err(e) => Err(e),
909        }
910    }
911
912    /// Install an allow-all seccomp filter with SECCOMP_FILTER_FLAG_LOG.
913    ///
914    /// Used in trace mode: all syscalls are allowed but logged to the kernel
915    /// audit subsystem. A separate reader collects the logged syscalls.
916    pub fn apply_trace_filter(&mut self) -> Result<bool> {
917        if self.applied {
918            debug!("Seccomp filter already applied, skipping trace filter");
919            return Ok(true);
920        }
921
922        info!("Applying seccomp trace filter (allow-all + LOG)");
923
924        // Create an empty rule set – with SeccompAction::Allow as default,
925        // every syscall is permitted. The LOG flag causes the kernel to
926        // audit each syscall decision.
927        let filter = SeccompFilter::new(
928            BTreeMap::new(),
929            SeccompAction::Allow, // default: allow everything
930            SeccompAction::Allow, // match action (unused – no rules)
931            std::env::consts::ARCH.try_into().map_err(|e| {
932                NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
933            })?,
934        )
935        .map_err(|e| NucleusError::SeccompError(format!("Failed to create trace filter: {}", e)))?;
936
937        let bpf_prog: BpfProgram = filter.try_into().map_err(|e| {
938            NucleusError::SeccompError(format!("Failed to compile trace BPF: {}", e))
939        })?;
940
941        // Apply with LOG flag so kernel audits every syscall
942        Self::apply_bpf_program(&bpf_prog, true)?;
943        self.applied = true;
944        info!("Seccomp trace filter applied (all syscalls allowed + logged)");
945        Ok(true)
946    }
947
948    /// Syscalls that the built-in filter restricts at the argument level.
949    /// Custom profiles allowing these without argument filters weaken security.
950    const ARG_FILTERED_SYSCALLS: &'static [&'static str] = &[
951        "clone", "execveat", "ioctl", "mprotect", "preadv2", "prctl", "pwritev2", "socket",
952    ];
953
954    /// Security-critical syscalls that must not be re-enabled by the
955    /// convenience `--seccomp-allow` fragment mechanism.
956    const SECURITY_CRITICAL_DENIED_SYSCALLS: &'static [&'static str] = &[
957        // Namespace entry/creation bypasses clone namespace-flag filtering.
958        "clone3",
959        "unshare",
960        "setns",
961        // Kernel keyrings are deliberately outside the built-in policy.
962        "add_key",
963        "request_key",
964        "keyctl",
965    ];
966
967    fn errno_denied_syscalls() -> &'static [(i64, u32)] {
968        &[(libc::SYS_clone3, libc::ENOSYS as u32)]
969    }
970
971    /// Non-default syscalls that may be opted into via `--seccomp-allow`.
972    ///
973    /// Every syscall known to `syscall_name_to_number` but absent from both
974    /// `base_allowed_syscalls` and `ARG_FILTERED_SYSCALLS` must appear here
975    /// to be enableable. Requesting a known syscall that is NOT in this list
976    /// emits a WARN and is silently dropped (defense-in-depth).
977    const OPT_IN_SYSCALLS: &'static [&'static str] = &[
978        // io_uring – large attack surface but needed by modern databases
979        "io_uring_setup",
980        "io_uring_enter",
981        "io_uring_register",
982        // SysV message queues
983        "msgget",
984        "msgsnd",
985        "msgrcv",
986        "msgctl",
987        // POSIX message queues
988        "mq_open",
989        "mq_unlink",
990        "mq_timedsend",
991        "mq_timedreceive",
992        "mq_notify",
993        "mq_getsetattr",
994        // POSIX timers
995        "timer_create",
996        "timer_settime",
997        "timer_gettime",
998        "timer_getoverrun",
999        "timer_delete",
1000        // Inotify / fanotify
1001        "inotify_init",
1002        "inotify_init1",
1003        "inotify_add_watch",
1004        "inotify_rm_watch",
1005        "fanotify_init",
1006        "fanotify_mark",
1007        // Memory (non-default)
1008        "mincore",
1009        "mlockall",
1010        "munlockall",
1011        "membarrier",
1012        "process_madvise",
1013        "mbind",
1014        "set_mempolicy",
1015        "get_mempolicy",
1016        "set_mempolicy_home_node",
1017        "pkey_mprotect",
1018        "pkey_alloc",
1019        "pkey_free",
1020        "cachestat",
1021        "remap_file_pages",
1022        // File I/O (non-default)
1023        "sync",
1024        "syncfs",
1025        "sync_file_range",
1026        "readahead",
1027        "vmsplice",
1028        "openat2",
1029        "name_to_handle_at",
1030        "open_by_handle_at",
1031        "io_cancel",
1032        "io_pgetevents",
1033        "creat",
1034        "fchmodat2",
1035        "statmount",
1036        "listmount",
1037        "utimensat",
1038        "utimes",
1039        "utime",
1040        "futimesat",
1041        // Extended attributes (write)
1042        "setxattr",
1043        "lsetxattr",
1044        "fsetxattr",
1045        "removexattr",
1046        "lremovexattr",
1047        "fremovexattr",
1048        "setxattrat",
1049        "getxattrat",
1050        "listxattrat",
1051        "removexattrat",
1052        // Network (non-default)
1053        "recvmmsg",
1054        "sendmmsg",
1055        // Scheduling (non-default)
1056        "sched_setparam",
1057        "sched_setscheduler",
1058        "sched_get_priority_max",
1059        "sched_get_priority_min",
1060        "sched_rr_get_interval",
1061        "sched_setattr",
1062        "sched_getattr",
1063        // Resource limits / priority
1064        "setrlimit",
1065        "getpriority",
1066        "setpriority",
1067        "ioprio_set",
1068        "ioprio_get",
1069        // Process (non-default, low risk)
1070        "vfork",
1071        "pause",
1072        "alarm",
1073        "tkill",
1074        "sysinfo",
1075        "personality",
1076        "vhangup",
1077        "time",
1078        "pidfd_open",
1079        "pidfd_send_signal",
1080        "pidfd_getfd",
1081        // UID/GID
1082        "setuid",
1083        "setgid",
1084        "setreuid",
1085        "setregid",
1086        "setresuid",
1087        "getresuid",
1088        "setresgid",
1089        "getresgid",
1090        "setfsuid",
1091        "setfsgid",
1092        "setgroups",
1093        "getsid",
1094        // Capabilities (read-only query)
1095        "capget",
1096        // Signals (non-default)
1097        "rt_tgsigqueueinfo",
1098        // Misc
1099        "mknod",
1100        "mknodat",
1101        "syslog",
1102        "clock_settime",
1103        "clock_adjtime",
1104        "adjtimex",
1105        "kcmp",
1106        "epoll_pwait2",
1107        // Futex (non-default)
1108        "futex_waitv",
1109        "futex_wake",
1110        "futex_wait",
1111        "futex_requeue",
1112        // Landlock (already in default but listed for completeness)
1113        "seccomp",
1114    ];
1115
1116    /// Warn when a custom seccomp profile allows security-critical syscalls
1117    /// without argument-level filtering.
1118    fn warn_missing_arg_filters(profile: &SeccompProfile) {
1119        for group in &profile.syscalls {
1120            if group.action != "SCMP_ACT_ALLOW" {
1121                continue;
1122            }
1123            for name in &group.names {
1124                if Self::ARG_FILTERED_SYSCALLS.contains(&name.as_str()) && group.args.is_empty() {
1125                    warn!(
1126                        "Custom seccomp profile allows '{}' without argument filters. \
1127                         The built-in filter restricts this syscall at the argument level. \
1128                         This profile weakens security compared to the default.",
1129                        name
1130                    );
1131                }
1132            }
1133        }
1134    }
1135
1136    /// Check if seccomp filter has been applied
1137    pub fn is_applied(&self) -> bool {
1138        self.applied
1139    }
1140
1141    fn apply_bpf_program(bpf_prog: &BpfProgram, log_denied: bool) -> Result<()> {
1142        let mut flags: libc::c_ulong = 0;
1143        if log_denied {
1144            flags |= libc::SECCOMP_FILTER_FLAG_LOG as libc::c_ulong;
1145        }
1146
1147        match Self::apply_bpf_program_with_flags(bpf_prog, flags) {
1148            Ok(()) => Ok(()),
1149            Err(err)
1150                if log_denied
1151                    && err.raw_os_error() == Some(libc::EINVAL)
1152                    && libc::SECCOMP_FILTER_FLAG_LOG != 0 =>
1153            {
1154                warn!(
1155                    "Kernel rejected SECCOMP_FILTER_FLAG_LOG; continuing with seccomp \
1156                     enforcement without deny logging"
1157                );
1158                Self::apply_bpf_program_with_flags(bpf_prog, 0)?;
1159                Ok(())
1160            }
1161            Err(err) => Err(NucleusError::SeccompError(format!(
1162                "Failed to apply seccomp filter: {}",
1163                err
1164            ))),
1165        }
1166    }
1167
1168    fn apply_bpf_program_with_flags(
1169        bpf_prog: &BpfProgram,
1170        flags: libc::c_ulong,
1171    ) -> std::io::Result<()> {
1172        // SAFETY: `prctl(PR_SET_NO_NEW_PRIVS, ...)` has no pointer arguments here
1173        // and only affects the current thread/process as required before seccomp.
1174        let rc = unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
1175        if rc != 0 {
1176            return Err(std::io::Error::last_os_error());
1177        }
1178
1179        let prog = libc::sock_fprog {
1180            len: bpf_prog.len() as u16,
1181            filter: bpf_prog.as_ptr() as *mut libc::sock_filter,
1182        };
1183
1184        // SAFETY: `prog` points to a live BPF program buffer for the duration of
1185        // the syscall and the kernel copies the pointed-to filter immediately.
1186        let rc = unsafe {
1187            libc::syscall(
1188                libc::SYS_seccomp,
1189                libc::SECCOMP_SET_MODE_FILTER,
1190                flags,
1191                &prog as *const libc::sock_fprog,
1192            )
1193        };
1194
1195        if rc < 0 {
1196            return Err(std::io::Error::last_os_error());
1197        }
1198
1199        Ok(())
1200    }
1201}
1202
1203// SeccompProfile and SeccompSyscallGroup are defined in seccomp_generate.rs
1204use crate::security::seccomp_generate::SeccompProfile;
1205
1206/// Map a syscall name (e.g. "read", "write") to its Linux syscall number.
1207///
1208/// Covers the most common syscalls. Unknown names return None.
1209fn syscall_name_to_number(name: &str) -> Option<i64> {
1210    match name {
1211        // File I/O
1212        "read" => Some(libc::SYS_read),
1213        "write" => Some(libc::SYS_write),
1214        #[cfg(target_arch = "x86_64")]
1215        "open" => Some(libc::SYS_open),
1216        "openat" => Some(libc::SYS_openat),
1217        "close" => Some(libc::SYS_close),
1218        #[cfg(target_arch = "x86_64")]
1219        "stat" => Some(libc::SYS_stat),
1220        "fstat" => Some(libc::SYS_fstat),
1221        #[cfg(target_arch = "x86_64")]
1222        "lstat" => Some(libc::SYS_lstat),
1223        "lseek" => Some(libc::SYS_lseek),
1224        #[cfg(target_arch = "x86_64")]
1225        "access" => Some(libc::SYS_access),
1226        "fcntl" => Some(libc::SYS_fcntl),
1227        "readv" => Some(libc::SYS_readv),
1228        "writev" => Some(libc::SYS_writev),
1229        "preadv" => Some(libc::SYS_preadv),
1230        "pwritev" => Some(libc::SYS_pwritev),
1231        "preadv2" => Some(libc::SYS_preadv2),
1232        "pwritev2" => Some(libc::SYS_pwritev2),
1233        "pread64" => Some(libc::SYS_pread64),
1234        "pwrite64" => Some(libc::SYS_pwrite64),
1235        #[cfg(target_arch = "x86_64")]
1236        "readlink" => Some(libc::SYS_readlink),
1237        "readlinkat" => Some(libc::SYS_readlinkat),
1238        "newfstatat" => Some(libc::SYS_newfstatat),
1239        "statx" => Some(libc::SYS_statx),
1240        "faccessat" => Some(libc::SYS_faccessat),
1241        "faccessat2" => Some(libc::SYS_faccessat2),
1242        "dup" => Some(libc::SYS_dup),
1243        #[cfg(target_arch = "x86_64")]
1244        "dup2" => Some(libc::SYS_dup2),
1245        "dup3" => Some(libc::SYS_dup3),
1246        #[cfg(target_arch = "x86_64")]
1247        "pipe" => Some(libc::SYS_pipe),
1248        "pipe2" => Some(libc::SYS_pipe2),
1249        #[cfg(target_arch = "x86_64")]
1250        "unlink" => Some(libc::SYS_unlink),
1251        "unlinkat" => Some(libc::SYS_unlinkat),
1252        #[cfg(target_arch = "x86_64")]
1253        "rename" => Some(libc::SYS_rename),
1254        "renameat" => Some(libc::SYS_renameat),
1255        "renameat2" => Some(libc::SYS_renameat2),
1256        #[cfg(target_arch = "x86_64")]
1257        "link" => Some(libc::SYS_link),
1258        "linkat" => Some(libc::SYS_linkat),
1259        #[cfg(target_arch = "x86_64")]
1260        "symlink" => Some(libc::SYS_symlink),
1261        "symlinkat" => Some(libc::SYS_symlinkat),
1262        #[cfg(target_arch = "x86_64")]
1263        "chmod" => Some(libc::SYS_chmod),
1264        "fchmod" => Some(libc::SYS_fchmod),
1265        "fchmodat" => Some(libc::SYS_fchmodat),
1266        "truncate" => Some(libc::SYS_truncate),
1267        "ftruncate" => Some(libc::SYS_ftruncate),
1268        "fallocate" => Some(libc::SYS_fallocate),
1269        #[cfg(any(
1270            target_arch = "x86_64",
1271            target_arch = "aarch64",
1272            target_arch = "riscv64"
1273        ))]
1274        "fadvise64" => Some(SYS_FADVISE64),
1275        "fsync" => Some(libc::SYS_fsync),
1276        "fdatasync" => Some(libc::SYS_fdatasync),
1277        "flock" => Some(libc::SYS_flock),
1278        #[cfg(any(
1279            target_arch = "x86_64",
1280            target_arch = "aarch64",
1281            target_arch = "riscv64"
1282        ))]
1283        "sendfile" => Some(SYS_SENDFILE),
1284        "copy_file_range" => Some(libc::SYS_copy_file_range),
1285        "splice" => Some(libc::SYS_splice),
1286        "tee" => Some(libc::SYS_tee),
1287        // Memory
1288        "mmap" => Some(libc::SYS_mmap),
1289        "munmap" => Some(libc::SYS_munmap),
1290        "mprotect" => Some(libc::SYS_mprotect),
1291        "brk" => Some(libc::SYS_brk),
1292        "mremap" => Some(libc::SYS_mremap),
1293        "madvise" => Some(libc::SYS_madvise),
1294        "msync" => Some(libc::SYS_msync),
1295        "mlock" => Some(libc::SYS_mlock),
1296        "mlock2" => Some(libc::SYS_mlock2),
1297        "munlock" => Some(libc::SYS_munlock),
1298        // SysV IPC
1299        "shmget" => Some(libc::SYS_shmget),
1300        "shmat" => Some(libc::SYS_shmat),
1301        "shmdt" => Some(libc::SYS_shmdt),
1302        "shmctl" => Some(libc::SYS_shmctl),
1303        "semget" => Some(libc::SYS_semget),
1304        "semop" => Some(libc::SYS_semop),
1305        "semctl" => Some(libc::SYS_semctl),
1306        "semtimedop" => Some(libc::SYS_semtimedop),
1307        // Process
1308        #[cfg(target_arch = "x86_64")]
1309        "fork" => Some(libc::SYS_fork),
1310        "clone" => Some(libc::SYS_clone),
1311        "clone3" => Some(libc::SYS_clone3),
1312        "execve" => Some(libc::SYS_execve),
1313        "execveat" => Some(libc::SYS_execveat),
1314        "wait4" => Some(libc::SYS_wait4),
1315        "waitid" => Some(libc::SYS_waitid),
1316        "exit" => Some(libc::SYS_exit),
1317        "exit_group" => Some(libc::SYS_exit_group),
1318        "getpid" => Some(libc::SYS_getpid),
1319        "gettid" => Some(libc::SYS_gettid),
1320        "getuid" => Some(libc::SYS_getuid),
1321        "getgid" => Some(libc::SYS_getgid),
1322        "geteuid" => Some(libc::SYS_geteuid),
1323        "getegid" => Some(libc::SYS_getegid),
1324        "getppid" => Some(libc::SYS_getppid),
1325        #[cfg(target_arch = "x86_64")]
1326        "getpgrp" => Some(libc::SYS_getpgrp),
1327        "setsid" => Some(libc::SYS_setsid),
1328        "getgroups" => Some(libc::SYS_getgroups),
1329        // Signals
1330        "rt_sigaction" => Some(libc::SYS_rt_sigaction),
1331        "rt_sigprocmask" => Some(libc::SYS_rt_sigprocmask),
1332        "rt_sigreturn" => Some(libc::SYS_rt_sigreturn),
1333        "rt_sigsuspend" => Some(libc::SYS_rt_sigsuspend),
1334        "rt_sigtimedwait" => Some(libc::SYS_rt_sigtimedwait),
1335        "rt_sigpending" => Some(libc::SYS_rt_sigpending),
1336        "rt_sigqueueinfo" => Some(libc::SYS_rt_sigqueueinfo),
1337        "sigaltstack" => Some(libc::SYS_sigaltstack),
1338        "restart_syscall" => Some(libc::SYS_restart_syscall),
1339        "kill" => Some(libc::SYS_kill),
1340        "tgkill" => Some(libc::SYS_tgkill),
1341        // Time
1342        "clock_gettime" => Some(libc::SYS_clock_gettime),
1343        "clock_getres" => Some(libc::SYS_clock_getres),
1344        "clock_nanosleep" => Some(libc::SYS_clock_nanosleep),
1345        "gettimeofday" => Some(libc::SYS_gettimeofday),
1346        "nanosleep" => Some(libc::SYS_nanosleep),
1347        // Directories
1348        "getcwd" => Some(libc::SYS_getcwd),
1349        "chdir" => Some(libc::SYS_chdir),
1350        "fchdir" => Some(libc::SYS_fchdir),
1351        #[cfg(target_arch = "x86_64")]
1352        "mkdir" => Some(libc::SYS_mkdir),
1353        "mkdirat" => Some(libc::SYS_mkdirat),
1354        #[cfg(target_arch = "x86_64")]
1355        "rmdir" => Some(libc::SYS_rmdir),
1356        #[cfg(target_arch = "x86_64")]
1357        "getdents" => Some(libc::SYS_getdents),
1358        "getdents64" => Some(libc::SYS_getdents64),
1359        // Network
1360        "socket" => Some(libc::SYS_socket),
1361        "connect" => Some(libc::SYS_connect),
1362        "sendto" => Some(libc::SYS_sendto),
1363        "recvfrom" => Some(libc::SYS_recvfrom),
1364        "sendmsg" => Some(libc::SYS_sendmsg),
1365        "recvmsg" => Some(libc::SYS_recvmsg),
1366        "shutdown" => Some(libc::SYS_shutdown),
1367        "bind" => Some(libc::SYS_bind),
1368        "listen" => Some(libc::SYS_listen),
1369        "accept" => Some(libc::SYS_accept),
1370        "accept4" => Some(libc::SYS_accept4),
1371        "setsockopt" => Some(libc::SYS_setsockopt),
1372        "getsockopt" => Some(libc::SYS_getsockopt),
1373        "getsockname" => Some(libc::SYS_getsockname),
1374        "getpeername" => Some(libc::SYS_getpeername),
1375        "socketpair" => Some(libc::SYS_socketpair),
1376        // Poll/Select
1377        #[cfg(target_arch = "x86_64")]
1378        "poll" => Some(libc::SYS_poll),
1379        "ppoll" => Some(libc::SYS_ppoll),
1380        #[cfg(target_arch = "x86_64")]
1381        "select" => Some(libc::SYS_select),
1382        "pselect6" => Some(libc::SYS_pselect6),
1383        #[cfg(target_arch = "x86_64")]
1384        "epoll_create" => Some(libc::SYS_epoll_create),
1385        "epoll_create1" => Some(libc::SYS_epoll_create1),
1386        "epoll_ctl" => Some(libc::SYS_epoll_ctl),
1387        #[cfg(target_arch = "x86_64")]
1388        "epoll_wait" => Some(libc::SYS_epoll_wait),
1389        "epoll_pwait" => Some(libc::SYS_epoll_pwait),
1390        #[cfg(target_arch = "x86_64")]
1391        "eventfd" => Some(libc::SYS_eventfd),
1392        "eventfd2" => Some(libc::SYS_eventfd2),
1393        #[cfg(target_arch = "x86_64")]
1394        "signalfd" => Some(libc::SYS_signalfd),
1395        "signalfd4" => Some(libc::SYS_signalfd4),
1396        "timerfd_create" => Some(libc::SYS_timerfd_create),
1397        "timerfd_settime" => Some(libc::SYS_timerfd_settime),
1398        "timerfd_gettime" => Some(libc::SYS_timerfd_gettime),
1399        // Misc
1400        "uname" => Some(libc::SYS_uname),
1401        "getrandom" => Some(libc::SYS_getrandom),
1402        "futex" => Some(libc::SYS_futex),
1403        "set_tid_address" => Some(libc::SYS_set_tid_address),
1404        "set_robust_list" => Some(libc::SYS_set_robust_list),
1405        "get_robust_list" => Some(libc::SYS_get_robust_list),
1406        #[cfg(target_arch = "x86_64")]
1407        "arch_prctl" => Some(libc::SYS_arch_prctl),
1408        "sysinfo" => Some(libc::SYS_sysinfo),
1409        "umask" => Some(libc::SYS_umask),
1410        #[cfg(target_arch = "x86_64")]
1411        "getrlimit" => Some(libc::SYS_getrlimit),
1412        "prlimit64" => Some(libc::SYS_prlimit64),
1413        "getrusage" => Some(libc::SYS_getrusage),
1414        "times" => Some(libc::SYS_times),
1415        "sched_yield" => Some(libc::SYS_sched_yield),
1416        "sched_getaffinity" => Some(libc::SYS_sched_getaffinity),
1417        "getcpu" => Some(libc::SYS_getcpu),
1418        "rseq" => Some(libc::SYS_rseq),
1419        "close_range" => Some(libc::SYS_close_range),
1420        // Ownership
1421        "fchown" => Some(libc::SYS_fchown),
1422        "fchownat" => Some(libc::SYS_fchownat),
1423        #[cfg(target_arch = "x86_64")]
1424        "chown" => Some(libc::SYS_chown),
1425        #[cfg(target_arch = "x86_64")]
1426        "lchown" => Some(libc::SYS_lchown),
1427        // io_uring
1428        "io_uring_setup" => Some(libc::SYS_io_uring_setup),
1429        "io_uring_enter" => Some(libc::SYS_io_uring_enter),
1430        "io_uring_register" => Some(libc::SYS_io_uring_register),
1431        // Legacy AIO
1432        "io_setup" => Some(libc::SYS_io_setup),
1433        "io_destroy" => Some(libc::SYS_io_destroy),
1434        "io_submit" => Some(libc::SYS_io_submit),
1435        "io_getevents" => Some(libc::SYS_io_getevents),
1436        // Timers
1437        "setitimer" => Some(libc::SYS_setitimer),
1438        "getitimer" => Some(libc::SYS_getitimer),
1439        // Process groups
1440        "setpgid" => Some(libc::SYS_setpgid),
1441        "getpgid" => Some(libc::SYS_getpgid),
1442        "memfd_create" => Some(libc::SYS_memfd_create),
1443        "ioctl" => Some(libc::SYS_ioctl),
1444        "prctl" => Some(libc::SYS_prctl),
1445        // Landlock
1446        "landlock_create_ruleset" => Some(libc::SYS_landlock_create_ruleset),
1447        "landlock_add_rule" => Some(libc::SYS_landlock_add_rule),
1448        "landlock_restrict_self" => Some(libc::SYS_landlock_restrict_self),
1449        // --- Additional syscalls (not in default allowlist, available via --seccomp-allow) ---
1450        // Memory
1451        "mincore" => Some(libc::SYS_mincore),
1452        "mlockall" => Some(libc::SYS_mlockall),
1453        "munlockall" => Some(libc::SYS_munlockall),
1454        "mbind" => Some(libc::SYS_mbind),
1455        "set_mempolicy" => Some(libc::SYS_set_mempolicy),
1456        "get_mempolicy" => Some(libc::SYS_get_mempolicy),
1457        "memfd_secret" => Some(libc::SYS_memfd_secret),
1458        "membarrier" => Some(libc::SYS_membarrier),
1459        "process_madvise" => Some(libc::SYS_process_madvise),
1460        "pkey_mprotect" => Some(libc::SYS_pkey_mprotect),
1461        "pkey_alloc" => Some(libc::SYS_pkey_alloc),
1462        "pkey_free" => Some(libc::SYS_pkey_free),
1463        "mseal" => Some(libc::SYS_mseal),
1464        "map_shadow_stack" => Some(453),
1465        "remap_file_pages" => Some(libc::SYS_remap_file_pages),
1466        "set_mempolicy_home_node" => Some(libc::SYS_set_mempolicy_home_node),
1467        "cachestat" => Some(451),
1468        // Process
1469        #[cfg(target_arch = "x86_64")]
1470        "vfork" => Some(libc::SYS_vfork),
1471        #[cfg(target_arch = "x86_64")]
1472        "pause" => Some(libc::SYS_pause),
1473        #[cfg(target_arch = "x86_64")]
1474        "alarm" => Some(libc::SYS_alarm),
1475        "tkill" => Some(libc::SYS_tkill),
1476        "ptrace" => Some(libc::SYS_ptrace),
1477        "process_vm_readv" => Some(libc::SYS_process_vm_readv),
1478        "process_vm_writev" => Some(libc::SYS_process_vm_writev),
1479        "process_mrelease" => Some(libc::SYS_process_mrelease),
1480        "kcmp" => Some(libc::SYS_kcmp),
1481        "unshare" => Some(libc::SYS_unshare),
1482        "setns" => Some(libc::SYS_setns),
1483        "pidfd_open" => Some(libc::SYS_pidfd_open),
1484        "pidfd_send_signal" => Some(libc::SYS_pidfd_send_signal),
1485        "pidfd_getfd" => Some(libc::SYS_pidfd_getfd),
1486        // UID/GID
1487        "setuid" => Some(libc::SYS_setuid),
1488        "setgid" => Some(libc::SYS_setgid),
1489        "setreuid" => Some(libc::SYS_setreuid),
1490        "setregid" => Some(libc::SYS_setregid),
1491        "setresuid" => Some(libc::SYS_setresuid),
1492        "getresuid" => Some(libc::SYS_getresuid),
1493        "setresgid" => Some(libc::SYS_setresgid),
1494        "getresgid" => Some(libc::SYS_getresgid),
1495        "setfsuid" => Some(libc::SYS_setfsuid),
1496        "setfsgid" => Some(libc::SYS_setfsgid),
1497        "setgroups" => Some(libc::SYS_setgroups),
1498        "getsid" => Some(libc::SYS_getsid),
1499        // Capabilities
1500        "capget" => Some(libc::SYS_capget),
1501        "capset" => Some(libc::SYS_capset),
1502        // Signals
1503        "rt_tgsigqueueinfo" => Some(libc::SYS_rt_tgsigqueueinfo),
1504        // SysV message queues
1505        "msgget" => Some(libc::SYS_msgget),
1506        "msgsnd" => Some(libc::SYS_msgsnd),
1507        "msgrcv" => Some(libc::SYS_msgrcv),
1508        "msgctl" => Some(libc::SYS_msgctl),
1509        // Timers
1510        "timer_create" => Some(libc::SYS_timer_create),
1511        "timer_settime" => Some(libc::SYS_timer_settime),
1512        "timer_gettime" => Some(libc::SYS_timer_gettime),
1513        "timer_getoverrun" => Some(libc::SYS_timer_getoverrun),
1514        "timer_delete" => Some(libc::SYS_timer_delete),
1515        "clock_settime" => Some(libc::SYS_clock_settime),
1516        "clock_adjtime" => Some(libc::SYS_clock_adjtime),
1517        #[cfg(target_arch = "x86_64")]
1518        "time" => Some(libc::SYS_time),
1519        // File I/O (non-default)
1520        #[cfg(target_arch = "x86_64")]
1521        "creat" => Some(libc::SYS_creat),
1522        "readahead" => Some(libc::SYS_readahead),
1523        "sync" => Some(libc::SYS_sync),
1524        "syncfs" => Some(libc::SYS_syncfs),
1525        "vmsplice" => Some(libc::SYS_vmsplice),
1526        "utimensat" => Some(libc::SYS_utimensat),
1527        #[cfg(target_arch = "x86_64")]
1528        "utimes" => Some(libc::SYS_utimes),
1529        #[cfg(target_arch = "x86_64")]
1530        "utime" => Some(libc::SYS_utime),
1531        #[cfg(target_arch = "x86_64")]
1532        "futimesat" => Some(libc::SYS_futimesat),
1533        "openat2" => Some(libc::SYS_openat2),
1534        "name_to_handle_at" => Some(libc::SYS_name_to_handle_at),
1535        "open_by_handle_at" => Some(libc::SYS_open_by_handle_at),
1536        "fchmodat2" => Some(libc::SYS_fchmodat2),
1537        "statmount" => Some(457),
1538        "listmount" => Some(458),
1539        // Extended attributes (write)
1540        "setxattr" => Some(libc::SYS_setxattr),
1541        "lsetxattr" => Some(libc::SYS_lsetxattr),
1542        "fsetxattr" => Some(libc::SYS_fsetxattr),
1543        "removexattr" => Some(libc::SYS_removexattr),
1544        "lremovexattr" => Some(libc::SYS_lremovexattr),
1545        "fremovexattr" => Some(libc::SYS_fremovexattr),
1546        "setxattrat" => Some(463),
1547        "getxattrat" => Some(464),
1548        "listxattrat" => Some(465),
1549        "removexattrat" => Some(466),
1550        // Network (non-default)
1551        "recvmmsg" => Some(libc::SYS_recvmmsg),
1552        "sendmmsg" => Some(libc::SYS_sendmmsg),
1553        // Inotify
1554        #[cfg(target_arch = "x86_64")]
1555        "inotify_init" => Some(libc::SYS_inotify_init),
1556        "inotify_init1" => Some(libc::SYS_inotify_init1),
1557        "inotify_add_watch" => Some(libc::SYS_inotify_add_watch),
1558        "inotify_rm_watch" => Some(libc::SYS_inotify_rm_watch),
1559        // Fanotify
1560        "fanotify_init" => Some(libc::SYS_fanotify_init),
1561        "fanotify_mark" => Some(libc::SYS_fanotify_mark),
1562        // Epoll (non-default)
1563        "epoll_pwait2" => Some(libc::SYS_epoll_pwait2),
1564        // Scheduling (non-default)
1565        "sched_setparam" => Some(libc::SYS_sched_setparam),
1566        "sched_setscheduler" => Some(libc::SYS_sched_setscheduler),
1567        "sched_get_priority_max" => Some(libc::SYS_sched_get_priority_max),
1568        "sched_get_priority_min" => Some(libc::SYS_sched_get_priority_min),
1569        "sched_rr_get_interval" => Some(libc::SYS_sched_rr_get_interval),
1570        "sched_setattr" => Some(libc::SYS_sched_setattr),
1571        "sched_getattr" => Some(libc::SYS_sched_getattr),
1572        "sched_setaffinity" => Some(libc::SYS_sched_setaffinity),
1573        // Resource limits
1574        #[cfg(target_arch = "x86_64")]
1575        "setrlimit" => Some(libc::SYS_setrlimit),
1576        "getpriority" => Some(libc::SYS_getpriority),
1577        "setpriority" => Some(libc::SYS_setpriority),
1578        "ioprio_set" => Some(libc::SYS_ioprio_set),
1579        "ioprio_get" => Some(libc::SYS_ioprio_get),
1580        // Futex (non-default)
1581        "futex_waitv" => Some(libc::SYS_futex_waitv),
1582        "futex_wake" => Some(454),
1583        "futex_wait" => Some(455),
1584        "futex_requeue" => Some(456),
1585        // Kernel modules
1586        "init_module" => Some(libc::SYS_init_module),
1587        "finit_module" => Some(libc::SYS_finit_module),
1588        "delete_module" => Some(libc::SYS_delete_module),
1589        // eBPF and performance
1590        "bpf" => Some(libc::SYS_bpf),
1591        "perf_event_open" => Some(libc::SYS_perf_event_open),
1592        // Seccomp
1593        "seccomp" => Some(libc::SYS_seccomp),
1594        // Userfaultfd
1595        "userfaultfd" => Some(libc::SYS_userfaultfd),
1596        // Mount (non-default)
1597        "mount" => Some(libc::SYS_mount),
1598        "umount2" => Some(libc::SYS_umount2),
1599        "pivot_root" => Some(libc::SYS_pivot_root),
1600        "mount_setattr" => Some(libc::SYS_mount_setattr),
1601        "open_tree" => Some(libc::SYS_open_tree),
1602        "open_tree_attr" => Some(467),
1603        "move_mount" => Some(libc::SYS_move_mount),
1604        "fsopen" => Some(libc::SYS_fsopen),
1605        "fsconfig" => Some(libc::SYS_fsconfig),
1606        "fsmount" => Some(libc::SYS_fsmount),
1607        "fspick" => Some(libc::SYS_fspick),
1608        // Misc (non-default)
1609        "syslog" => Some(libc::SYS_syslog),
1610        "reboot" => Some(libc::SYS_reboot),
1611        "swapon" => Some(libc::SYS_swapon),
1612        "swapoff" => Some(libc::SYS_swapoff),
1613        "chroot" => Some(libc::SYS_chroot),
1614        "acct" => Some(libc::SYS_acct),
1615        "settimeofday" => Some(libc::SYS_settimeofday),
1616        "sethostname" => Some(libc::SYS_sethostname),
1617        "setdomainname" => Some(libc::SYS_setdomainname),
1618        "adjtimex" => Some(libc::SYS_adjtimex),
1619        #[cfg(target_arch = "x86_64")]
1620        "modify_ldt" => Some(libc::SYS_modify_ldt),
1621        #[cfg(target_arch = "x86_64")]
1622        "iopl" => Some(libc::SYS_iopl),
1623        #[cfg(target_arch = "x86_64")]
1624        "ioperm" => Some(libc::SYS_ioperm),
1625        "quotactl" => Some(libc::SYS_quotactl),
1626        "quotactl_fd" => Some(libc::SYS_quotactl_fd),
1627        "personality" => Some(libc::SYS_personality),
1628        "vhangup" => Some(libc::SYS_vhangup),
1629        #[cfg(target_arch = "x86_64")]
1630        "ustat" => Some(libc::SYS_ustat),
1631        #[cfg(target_arch = "x86_64")]
1632        "sysfs" => Some(libc::SYS_sysfs),
1633        "mknod" => Some(libc::SYS_mknod),
1634        "mknodat" => Some(libc::SYS_mknodat),
1635        "migrate_pages" => Some(libc::SYS_migrate_pages),
1636        "move_pages" => Some(libc::SYS_move_pages),
1637        #[cfg(target_arch = "x86_64")]
1638        "kexec_load" => Some(libc::SYS_kexec_load),
1639        "kexec_file_load" => Some(libc::SYS_kexec_file_load),
1640        // POSIX message queues
1641        "mq_open" => Some(libc::SYS_mq_open),
1642        "mq_unlink" => Some(libc::SYS_mq_unlink),
1643        "mq_timedsend" => Some(libc::SYS_mq_timedsend),
1644        "mq_timedreceive" => Some(libc::SYS_mq_timedreceive),
1645        "mq_notify" => Some(libc::SYS_mq_notify),
1646        "mq_getsetattr" => Some(libc::SYS_mq_getsetattr),
1647        // Keyring
1648        "add_key" => Some(libc::SYS_add_key),
1649        "request_key" => Some(libc::SYS_request_key),
1650        "keyctl" => Some(libc::SYS_keyctl),
1651        // IO pgetevents
1652        "io_pgetevents" => Some(333),
1653        // LSM
1654        "lsm_get_self_attr" => Some(459),
1655        "lsm_set_self_attr" => Some(460),
1656        "lsm_list_modules" => Some(461),
1657        #[cfg(target_arch = "x86_64")]
1658        "lookup_dcookie" => Some(libc::SYS_lookup_dcookie),
1659        "uretprobe" => Some(335),
1660        _ => None,
1661    }
1662}
1663
1664impl Default for SeccompManager {
1665    fn default() -> Self {
1666        Self::new()
1667    }
1668}
1669
#[cfg(test)]
mod tests {
    //! Unit tests for the seccomp allowlists and the rule map produced by
    //! `SeccompManager::minimal_filter`. No filter is ever installed here;
    //! the tests only inspect the data structures that would be compiled.

    use super::*;

    #[test]
    fn test_seccomp_manager_initial_state() {
        let mgr = SeccompManager::new();
        assert!(!mgr.is_applied());
    }

    #[test]
    fn test_apply_idempotent() {
        let mgr = SeccompManager::new();
        // Note: We can't actually test application in unit tests
        // as it would affect the test process itself
        // This is tested in integration tests instead
        // NOTE(review): as written this only re-checks the initial state; it
        // does not exercise idempotency of applying the filter.
        assert!(!mgr.is_applied());
    }

    #[test]
    fn test_clone_denied_flags_include_newcgroup() {
        assert_ne!(
            DENIED_CLONE_NAMESPACE_FLAGS & libc::CLONE_NEWCGROUP as u64,
            0
        );
    }

    #[test]
    fn test_clone_denied_flags_include_newtime() {
        assert_ne!(
            DENIED_CLONE_NAMESPACE_FLAGS & libc::CLONE_NEWTIME as u64,
            0,
            "CLONE_NEWTIME must be in denied clone namespace flags"
        );
    }

    #[test]
    fn test_network_none_socket_domains_are_unix_only() {
        let domains = SeccompManager::allowed_socket_domains(false);
        assert_eq!(domains, vec![libc::AF_UNIX]);
    }

    #[test]
    fn test_network_enabled_socket_domains_exclude_netlink() {
        let domains = SeccompManager::allowed_socket_domains(true);
        assert!(domains.contains(&libc::AF_UNIX));
        assert!(domains.contains(&libc::AF_INET));
        assert!(domains.contains(&libc::AF_INET6));
        assert!(!domains.contains(&libc::AF_NETLINK));
    }

    #[test]
    fn test_network_mode_syscalls_only_enabled_when_network_allowed() {
        let none = SeccompManager::network_mode_syscalls(false);
        assert!(none.is_empty());

        let enabled = SeccompManager::network_mode_syscalls(true);
        assert!(enabled.contains(&libc::SYS_connect));
        assert!(enabled.contains(&libc::SYS_bind));
        assert!(enabled.contains(&libc::SYS_listen));
        assert!(enabled.contains(&libc::SYS_accept));
        assert!(enabled.contains(&libc::SYS_setsockopt));
    }

    #[test]
    fn test_landlock_bootstrap_syscalls_present_in_base_allowlist() {
        let base = SeccompManager::base_allowed_syscalls();
        assert!(base.contains(&libc::SYS_landlock_create_ruleset));
        assert!(base.contains(&libc::SYS_landlock_add_rule));
        assert!(base.contains(&libc::SYS_landlock_restrict_self));
    }

    #[cfg(any(
        target_arch = "x86_64",
        target_arch = "aarch64",
        target_arch = "riscv64"
    ))]
    #[test]
    fn test_generic_file_syscalls_present_in_base_allowlist() {
        let base = SeccompManager::base_allowed_syscalls();
        assert!(
            base.contains(&SYS_FADVISE64),
            "fadvise64 must be allowed on architectures with a generic syscall table"
        );
        assert!(
            base.contains(&SYS_SENDFILE),
            "sendfile must be allowed on architectures with a generic syscall table"
        );
    }

    #[cfg(any(
        target_arch = "x86_64",
        target_arch = "aarch64",
        target_arch = "riscv64"
    ))]
    #[test]
    fn test_generic_file_syscall_names_resolve_for_profiles() {
        assert_eq!(syscall_name_to_number("fadvise64"), Some(SYS_FADVISE64));
        assert_eq!(syscall_name_to_number("sendfile"), Some(SYS_SENDFILE));
    }

    #[test]
    fn test_x32_legacy_range_not_allowlisted() {
        let base = SeccompManager::base_allowed_syscalls();
        let net = SeccompManager::network_mode_syscalls(true);
        for nr in 512_i64..=547_i64 {
            assert!(
                !base.contains(&nr) && !net.contains(&nr),
                "x32 syscall number {} unexpectedly allowlisted",
                nr
            );
        }
    }

    #[test]
    fn test_i386_compat_socketcall_range_not_allowlisted() {
        let base = SeccompManager::base_allowed_syscalls();
        let net = SeccompManager::network_mode_syscalls(true);
        // i386 compat per syscall_32.tbl: socket..shutdown live at 359..373.
        // On x86_64 these numbers are outside our native allowlist surface.
        for nr in 359_i64..=373_i64 {
            assert!(
                !base.contains(&nr) && !net.contains(&nr),
                "i386 compat syscall number {} unexpectedly allowlisted",
                nr
            );
        }
    }

    #[test]
    fn test_minimal_filter_allowlist_counts_are_stable() {
        let base = SeccompManager::base_allowed_syscalls();
        let net = SeccompManager::network_mode_syscalls(true);

        // Snapshot counts to catch unintended policy drift.
        // +9 accounts for conditional rules inserted in minimal_filter():
        // socket/ioctl/prctl/prlimit64/mprotect/preadv2/pwritev2/clone/execveat.
        // fork removed (forces through filtered clone path).
        // execveat removed from base (arg-filtered separately).
        // sysinfo removed (L8: leaks host info).
        // prlimit64 moved to arg-filtered (M3).
        assert_eq!(base.len(), 171);
        assert_eq!(net.len(), 11);
        assert_eq!(base.len() + 9, 180);
        assert_eq!(base.len() + net.len() + 9, 191);
    }

    #[test]
    fn test_arg_filtered_syscalls_list_includes_critical_syscalls() {
        // These syscalls must be in the arg-filtered list so custom profiles
        // get warnings when they allow them without filters.
        for name in &[
            "clone", "execveat", "ioctl", "preadv2", "prctl", "pwritev2", "socket",
        ] {
            assert!(
                SeccompManager::ARG_FILTERED_SYSCALLS.contains(name),
                "'{}' must be in ARG_FILTERED_SYSCALLS",
                name
            );
        }
    }

    #[test]
    fn test_clone3_not_allowlisted_in_minimal_filter() {
        // clone3 carries flags through struct clone_args, which seccomp BPF
        // cannot inspect. It must not be unconditionally allowed; the compiler
        // adds an exact ENOSYS deny so libc falls back to filtered clone.
        let rules = SeccompManager::minimal_filter(true, &[]).unwrap();
        assert!(
            !rules.contains_key(&libc::SYS_clone3),
            "clone3 must not be in the seccomp allowlist"
        );
        assert!(
            SeccompManager::errno_denied_syscalls()
                .iter()
                .any(|(nr, errno)| *nr == libc::SYS_clone3 && *errno == libc::ENOSYS as u32),
            "clone3 must be denied with ENOSYS to trigger libc fallback"
        );
    }

    #[test]
    fn test_clone_is_allowed_with_arg_filter() {
        // clone (not clone3) should still be in the rules with arg filtering
        let rules = SeccompManager::minimal_filter(true, &[]).unwrap();
        assert!(
            rules.contains_key(&libc::SYS_clone),
            "clone must be in the seccomp allowlist with arg filters"
        );
    }

    #[test]
    fn test_high_risk_syscalls_removed_from_base_allowlist() {
        let base = SeccompManager::base_allowed_syscalls();
        // chown/fchown/lchown/fchownat: allowed – safe after CAP_CHOWN/CAP_FOWNER drop
        // mlock/munlock: allowed – needed by databases, bounded by RLIMIT_MEMLOCK
        let removed = [
            libc::SYS_sync,
            libc::SYS_syncfs,
            libc::SYS_mincore,
            libc::SYS_vfork,
            libc::SYS_tkill,
            // io_uring: large attack surface, many CVEs – require custom profile
            libc::SYS_io_uring_setup,
            libc::SYS_io_uring_enter,
            libc::SYS_io_uring_register,
        ];

        for syscall in removed {
            assert!(
                !base.contains(&syscall),
                "syscall {} unexpectedly present in base allowlist",
                syscall
            );
        }
    }

    #[test]
    fn test_custom_profile_preserves_clone_arg_filters() {
        // SEC-01: Custom seccomp profiles that allow "clone" must still get
        // argument-level filtering to block namespace-creating flags.
        // Verify by inspecting the built-in filter rules that serve as the
        // merge source for apply_profile_from_file.
        let rules = SeccompManager::minimal_filter(true, &[]).unwrap();

        // Every ARG_FILTERED_SYSCALLS entry must have non-empty argument-level
        // rules in the built-in filter so apply_profile_from_file can merge them.
        // NOTE(review): names that do not resolve to a syscall number are
        // skipped silently here; confirm that is intended.
        for name in SeccompManager::ARG_FILTERED_SYSCALLS {
            if let Some(nr) = syscall_name_to_number(name) {
                assert!(
                    rules.get(&nr).is_some_and(|chain| !chain.is_empty()),
                    "built-in filter must have argument-level rules for '{}' \
                     so apply_profile_from_file can merge them into custom profiles",
                    name
                );
            }
        }
    }

    #[test]
    fn test_memfd_create_not_in_default_allowlist() {
        // SEC-02: memfd_create enables fileless code execution when combined with execveat.
        let base = SeccompManager::base_allowed_syscalls();
        assert!(
            !base.contains(&libc::SYS_memfd_create),
            "memfd_create must not be in the default seccomp allowlist (fileless exec risk)"
        );
        // Also verify it's not sneaked into the compiled filter rules
        let rules = SeccompManager::minimal_filter(true, &[]).unwrap();
        assert!(
            !rules.contains_key(&libc::SYS_memfd_create),
            "memfd_create must not be in the compiled seccomp filter rules"
        );
    }

    #[test]
    fn test_mprotect_has_arg_filtering() {
        // SEC-03: mprotect must have argument-level filtering to prevent W^X
        // (PROT_WRITE|PROT_EXEC) violations. Verify via runtime data structures.

        // mprotect must NOT be in the unconditional base allowlist
        let base = SeccompManager::base_allowed_syscalls();
        assert!(
            !base.contains(&libc::SYS_mprotect),
            "SYS_mprotect must not be unconditionally allowed - needs arg filtering"
        );

        // mprotect must be present in the compiled filter with non-empty
        // argument conditions (the conditions enforce W^X)
        let rules = SeccompManager::minimal_filter(true, &[]).unwrap();
        let mprotect_rules = rules.get(&libc::SYS_mprotect);
        assert!(
            mprotect_rules.is_some(),
            "mprotect must be present in the seccomp filter rules"
        );
        assert!(
            !mprotect_rules.unwrap().is_empty(),
            "mprotect must have argument-level conditions to prevent W^X violations"
        );
    }

    #[test]
    fn test_preadv2_pwritev2_have_flags_arg_filtering() {
        // v2 vectored I/O has an extra flags argument. The built-in profile
        // permits the v2 entrypoints only when flags == 0, leaving preadv/pwritev
        // available for ordinary positioned vectored I/O.
        let base = SeccompManager::base_allowed_syscalls();
        let rules = SeccompManager::minimal_filter(true, &[]).unwrap();

        for (name, syscall) in [
            ("preadv2", libc::SYS_preadv2),
            ("pwritev2", libc::SYS_pwritev2),
        ] {
            assert!(
                !base.contains(&syscall),
                "{} must not be unconditionally allowed",
                name
            );
            assert!(
                rules.get(&syscall).is_some_and(|chain| !chain.is_empty()),
                "{} must have argument-level conditions",
                name
            );
            assert!(
                SeccompManager::ARG_FILTERED_SYSCALLS.contains(&name),
                "{} must be listed as argument-filtered for custom profiles",
                name
            );
        }
    }

    /// Returns the largest char boundary of `s` that is `<= idx`, clamping
    /// `idx` to `s.len()` first, so the result is always a valid slice bound.
    fn floor_char_boundary(s: &str, idx: usize) -> usize {
        let mut boundary = idx.min(s.len());
        // Offset 0 is always a char boundary, so this loop terminates.
        while !s.is_char_boundary(boundary) {
            boundary -= 1;
        }
        boundary
    }

    #[test]
    fn test_unsafe_blocks_have_safety_comments() {
        // SEC-08: All unsafe blocks must have // SAFETY: documentation
        let source = include_str!("seccomp.rs");
        // This scan also hits the needle literal on the next line; the
        // "SAFETY:" mention in this comment is what satisfies that self-match.
        for (abs_idx, _) in source.match_indices("unsafe {") {
            // Look for a SAFETY comment in the 200 bytes before the block.
            // Offsets are snapped to char boundaries because the file contains
            // multi-byte characters and raw byte offsets could split one.
            let start = floor_char_boundary(source, abs_idx.saturating_sub(200));
            let context = &source[start..abs_idx];

            // Excerpt for the failure message, clamped to the end of the file.
            let excerpt_start = floor_char_boundary(source, abs_idx.saturating_sub(80));
            let excerpt_end = floor_char_boundary(source, abs_idx + 10);
            assert!(
                context.contains("SAFETY:"),
                "unsafe block at byte {} must have a // SAFETY: comment. Context: ...{}...",
                abs_idx,
                &source[excerpt_start..excerpt_end]
            );
        }
    }

    // --- H-1: mprotect MaskedEq logic verification ---
    //
    // The mprotect filter uses MaskedEq((PROT_WRITE | PROT_EXEC), value) to
    // allow only combinations where the W|X bits match one of {0, W, X}.
    // These tests prove the logic is correct without installing a real
    // seccomp filter (which would affect the test process).

    /// Helper: simulates the MaskedEq check that the seccomp BPF would perform.
    /// Returns true if the prot value would be ALLOWED by one of the rules.
    fn mprotect_would_allow(prot: u64) -> bool {
        let mask = (libc::PROT_WRITE | libc::PROT_EXEC) as u64;
        let allowed_values: &[u64] = &[0, libc::PROT_WRITE as u64, libc::PROT_EXEC as u64];
        let masked = prot & mask;
        allowed_values.contains(&masked)
    }

    #[test]
    fn test_mprotect_allows_prot_none() {
        assert!(mprotect_would_allow(0), "PROT_NONE must be allowed");
    }

    #[test]
    fn test_mprotect_allows_prot_read_only() {
        assert!(
            mprotect_would_allow(libc::PROT_READ as u64),
            "PROT_READ must be allowed (W|X bits are 0)"
        );
    }

    #[test]
    fn test_mprotect_allows_prot_read_write() {
        assert!(
            mprotect_would_allow((libc::PROT_READ | libc::PROT_WRITE) as u64),
            "PROT_READ|PROT_WRITE must be allowed"
        );
    }

    #[test]
    fn test_mprotect_allows_prot_read_exec() {
        assert!(
            mprotect_would_allow((libc::PROT_READ | libc::PROT_EXEC) as u64),
            "PROT_READ|PROT_EXEC must be allowed"
        );
    }

    #[test]
    fn test_mprotect_rejects_prot_write_exec() {
        assert!(
            !mprotect_would_allow((libc::PROT_WRITE | libc::PROT_EXEC) as u64),
            "PROT_WRITE|PROT_EXEC (W^X violation) must be REJECTED"
        );
    }

    #[test]
    fn test_mprotect_rejects_prot_read_write_exec() {
        assert!(
            !mprotect_would_allow((libc::PROT_READ | libc::PROT_WRITE | libc::PROT_EXEC) as u64),
            "PROT_READ|PROT_WRITE|PROT_EXEC (W^X violation) must be REJECTED"
        );
    }

    #[test]
    fn test_mprotect_allows_prot_write_alone() {
        assert!(
            mprotect_would_allow(libc::PROT_WRITE as u64),
            "PROT_WRITE alone must be allowed"
        );
    }

    #[test]
    fn test_mprotect_allows_prot_exec_alone() {
        assert!(
            mprotect_would_allow(libc::PROT_EXEC as u64),
            "PROT_EXEC alone must be allowed"
        );
    }

    // --- Extra syscall allowlist tests ---

    #[test]
    fn test_extra_syscalls_are_merged_into_filter() {
        let extra = vec!["io_uring_setup".to_string(), "sysinfo".to_string()];
        let rules = SeccompManager::minimal_filter(true, &extra).unwrap();
        assert!(
            rules.contains_key(&libc::SYS_io_uring_setup),
            "io_uring_setup must be in filter when requested via extra_syscalls"
        );
        assert!(
            rules.contains_key(&libc::SYS_sysinfo),
            "sysinfo must be in filter when requested via extra_syscalls"
        );
    }

    #[test]
    fn test_extra_syscalls_do_not_override_arg_filtered() {
        // If a user requests an arg-filtered syscall via extra_syscalls, the
        // version from the built-in filter should still be present (not replaced
        // with an unconditional allow).
        let extra = vec![
            "clone".to_string(),
            "preadv2".to_string(),
            "pwritev2".to_string(),
        ];
        let rules = SeccompManager::minimal_filter(true, &extra).unwrap();
        for (name, syscall) in [
            ("clone", libc::SYS_clone),
            ("preadv2", libc::SYS_preadv2),
            ("pwritev2", libc::SYS_pwritev2),
        ] {
            assert!(
                rules.get(&syscall).is_some_and(|chain| !chain.is_empty()),
                "{} must retain argument-level filtering even when in extra_syscalls",
                name
            );
        }
    }

    #[test]
    fn test_extra_syscalls_unknown_name_is_warned_and_skipped() {
        // Unknown syscall names emit a WARN and are skipped (not fatal)
        let extra = vec!["not_a_real_syscall".to_string()];
        let result = SeccompManager::minimal_filter(true, &extra);
        assert!(
            result.is_ok(),
            "Unknown syscall name should warn and skip, not error"
        );
    }

    #[test]
    fn test_extra_syscalls_empty_is_noop() {
        // Two builds with an empty extras list must agree on the exact set of
        // syscalls, not merely on how many there are.
        let baseline = SeccompManager::minimal_filter(true, &[]).unwrap();
        let rebuilt = SeccompManager::minimal_filter(true, &[]).unwrap();
        assert_eq!(baseline.len(), rebuilt.len());
        assert!(
            baseline.keys().all(|nr| rebuilt.contains_key(nr)),
            "filters built with empty extra_syscalls must contain the same syscalls"
        );

        // An empty list must not pull in a syscall that is only available on
        // request (io_uring_setup is added when named in extra_syscalls).
        assert!(
            !baseline.contains_key(&libc::SYS_io_uring_setup),
            "io_uring_setup must be absent when extra_syscalls is empty"
        );
    }

    #[test]
    fn test_extra_syscalls_duplicate_of_default_is_harmless() {
        // Requesting a syscall that's already in the default allowlist should work fine
        let extra = vec!["read".to_string()];
        let rules = SeccompManager::minimal_filter(true, &extra).unwrap();
        assert!(rules.contains_key(&libc::SYS_read));
    }

    #[test]
    fn test_extra_syscalls_blocked_known_syscall_not_added() {
        // A known syscall that is NOT in OPT_IN_SYSCALLS must be blocked
        // (not added to the filter rules). E.g. kexec_load, bpf, ptrace.
        let extra = vec!["kexec_load".to_string()];
        let rules = SeccompManager::minimal_filter(true, &extra).unwrap();
        assert!(
            !rules.contains_key(&libc::SYS_kexec_load),
            "kexec_load must be blocked even when requested via --seccomp-allow"
        );
    }

    #[test]
    fn test_extra_syscalls_unshare_remains_blocked() {
        let extra = vec!["unshare".to_string()];
        let rules = SeccompManager::minimal_filter(true, &extra).unwrap();
        assert!(
            !rules.contains_key(&libc::SYS_unshare),
            "unshare must stay blocked even when requested via --seccomp-allow"
        );
    }

    #[test]
    fn test_extra_syscalls_keyring_remain_blocked() {
        let extra = vec![
            "add_key".to_string(),
            "request_key".to_string(),
            "keyctl".to_string(),
        ];
        let rules = SeccompManager::minimal_filter(true, &extra).unwrap();

        for (name, syscall) in [
            ("add_key", libc::SYS_add_key),
            ("request_key", libc::SYS_request_key),
            ("keyctl", libc::SYS_keyctl),
        ] {
            assert!(
                !rules.contains_key(&syscall),
                "{} must stay blocked even when requested via --seccomp-allow",
                name
            );
            assert!(
                !SeccompManager::OPT_IN_SYSCALLS.contains(&name),
                "{} must not be in the seccomp opt-in allowlist",
                name
            );
        }
    }

    #[test]
    fn test_security_critical_syscalls_remain_absent_from_filter() {
        let extra = SeccompManager::SECURITY_CRITICAL_DENIED_SYSCALLS
            .iter()
            .map(|name| (*name).to_string())
            .collect::<Vec<_>>();
        let rules = SeccompManager::minimal_filter(true, &extra).unwrap();

        for name in SeccompManager::SECURITY_CRITICAL_DENIED_SYSCALLS {
            let syscall = syscall_name_to_number(name).unwrap();
            assert!(
                !rules.contains_key(&syscall),
                "{} must not appear in the built-in filter even when requested via --seccomp-allow",
                name
            );
            assert!(
                !SeccompManager::OPT_IN_SYSCALLS.contains(name),
                "{} must not be in the seccomp opt-in allowlist",
                name
            );
        }
    }

    #[test]
    fn test_extra_syscalls_clone3_remains_blocked() {
        let extra = vec!["clone3".to_string()];
        let rules = SeccompManager::minimal_filter(true, &extra).unwrap();
        assert!(
            !rules.contains_key(&libc::SYS_clone3),
            "clone3 must stay out of the allowlist even when requested via --seccomp-allow"
        );
        assert!(
            SeccompManager::errno_denied_syscalls()
                .iter()
                .any(|(nr, _)| *nr == libc::SYS_clone3),
            "clone3 must remain covered by the exact ENOSYS deny"
        );
    }

    #[test]
    fn test_extra_syscalls_opt_in_syscall_is_added() {
        // Syscalls in OPT_IN_SYSCALLS must be added when requested
        let extra = vec!["io_uring_setup".to_string()];
        let rules = SeccompManager::minimal_filter(true, &extra).unwrap();
        assert!(
            rules.contains_key(&libc::SYS_io_uring_setup),
            "io_uring_setup is in OPT_IN_SYSCALLS and must be added"
        );
    }

    #[test]
    fn test_production_validation_rejects_security_critical_extra_syscalls() {
        for name in [
            "clone3",
            "unshare",
            "setns",
            "add_key",
            "request_key",
            "keyctl",
        ] {
            let extra = vec![name.to_string()];
            let err =
                SeccompManager::validate_extra_syscalls_for_production(true, &extra).unwrap_err();
            assert!(err.to_string().contains("security-critical"));
            assert!(err.to_string().contains(name));
        }
    }

    #[test]
    fn test_production_validation_rejects_unsupported_extra_syscalls() {
        let extra = vec!["kexec_load".to_string()];
        let err = SeccompManager::validate_extra_syscalls_for_production(true, &extra).unwrap_err();
        assert!(err.to_string().contains("unsupported"));
        assert!(err.to_string().contains("kexec_load"));
    }

    #[test]
    fn test_production_validation_allows_supported_extra_syscalls() {
        let extra = vec![
            "read".to_string(),
            "clone".to_string(),
            "connect".to_string(),
            "io_uring_setup".to_string(),
        ];
        SeccompManager::validate_extra_syscalls_for_production(true, &extra).unwrap();
    }
}