Skip to main content

nucleus/security/
seccomp.rs

1use crate::error::{NucleusError, Result};
2use crate::security::policy::sha256_hex;
3use seccompiler::{BpfProgram, SeccompAction, SeccompCondition, SeccompFilter, SeccompRule};
4use std::collections::BTreeMap;
5use std::path::Path;
6use tracing::{debug, info, warn};
7
/// Seccomp filter manager.
///
/// Implements syscall whitelisting for the security state machine
/// (NucleusSecurity_Seccomp_SeccompEnforcement.tla).
pub struct SeccompManager {
    /// Set to `true` after a filter has been installed successfully; the
    /// apply methods check it and return early once it is set.
    applied: bool,
}
15
16const DENIED_CLONE_NAMESPACE_FLAGS: u64 = (libc::CLONE_NEWUSER
17    | libc::CLONE_NEWNS
18    | libc::CLONE_NEWNET
19    | libc::CLONE_NEWIPC
20    | libc::CLONE_NEWUTS
21    | libc::CLONE_NEWPID
22    | libc::CLONE_NEWCGROUP
23    | libc::CLONE_NEWTIME) as u64;
24
25impl SeccompManager {
26    pub fn new() -> Self {
27        Self { applied: false }
28    }
29
30    fn base_allowed_syscalls() -> Vec<i64> {
31        let mut syscalls = vec![
32            // File I/O
33            libc::SYS_read,
34            libc::SYS_write,
35            libc::SYS_openat,
36            libc::SYS_close,
37            libc::SYS_fstat,
38            libc::SYS_lseek,
39            libc::SYS_fcntl,
40            libc::SYS_readv,
41            libc::SYS_writev,
42            libc::SYS_preadv,
43            libc::SYS_pwritev,
44            libc::SYS_preadv2,
45            libc::SYS_pwritev2,
46            libc::SYS_pread64,
47            libc::SYS_pwrite64,
48            libc::SYS_readlinkat,
49            libc::SYS_newfstatat,
50            libc::SYS_statx,
51            libc::SYS_faccessat,
52            libc::SYS_faccessat2,
53            libc::SYS_dup,
54            libc::SYS_dup3,
55            libc::SYS_pipe2,
56            libc::SYS_unlinkat,
57            libc::SYS_renameat,
58            libc::SYS_renameat2,
59            libc::SYS_linkat,
60            libc::SYS_symlinkat,
61            libc::SYS_fchmod,
62            libc::SYS_fchmodat,
63            libc::SYS_truncate,
64            libc::SYS_ftruncate,
65            libc::SYS_fallocate,
66            #[cfg(target_arch = "x86_64")]
67            libc::SYS_fadvise64,
68            libc::SYS_fsync,
69            libc::SYS_fdatasync,
70            libc::SYS_sync_file_range,
71            libc::SYS_flock,
72            libc::SYS_fstatfs,
73            libc::SYS_statfs,
74            #[cfg(target_arch = "x86_64")]
75            libc::SYS_sendfile,
76            libc::SYS_copy_file_range,
77            libc::SYS_splice,
78            libc::SYS_tee,
79            // Memory management
80            libc::SYS_mmap,
81            libc::SYS_munmap,
82            libc::SYS_brk,
83            libc::SYS_mremap,
84            libc::SYS_madvise,
85            libc::SYS_msync,
86            libc::SYS_mlock,
87            libc::SYS_munlock,
88            libc::SYS_mlock2,
89            // SysV shared memory – used by PostgreSQL, Redis, and many databases
90            // for shared buffer pools. Safe in PID/IPC namespaces (isolated keyspace).
91            libc::SYS_shmget,
92            libc::SYS_shmat,
93            libc::SYS_shmdt,
94            libc::SYS_shmctl,
95            // POSIX semaphores (used by PostgreSQL for lightweight locking)
96            libc::SYS_semget,
97            libc::SYS_semop,
98            libc::SYS_semctl,
99            libc::SYS_semtimedop,
100            // Process management
101            // fork intentionally excluded – modern glibc/musl use clone(), which
102            // has namespace-flag filtering. Removing SYS_fork forces all forks
103            // through the filtered clone path (defense-in-depth against fork bombs
104            // and unfiltered namespace creation).
105            libc::SYS_execve,
106            // execveat is conditionally allowed below (AT_EMPTY_PATH blocked)
107            libc::SYS_wait4,
108            libc::SYS_waitid,
109            libc::SYS_exit,
110            libc::SYS_exit_group,
111            libc::SYS_getpid,
112            libc::SYS_gettid,
113            libc::SYS_getuid,
114            libc::SYS_getgid,
115            libc::SYS_geteuid,
116            libc::SYS_getegid,
117            libc::SYS_getppid,
118            libc::SYS_setsid,
119            libc::SYS_getgroups,
120            // Signals
121            libc::SYS_rt_sigaction,
122            libc::SYS_rt_sigprocmask,
123            libc::SYS_rt_sigreturn,
124            libc::SYS_rt_sigsuspend,
125            libc::SYS_rt_sigtimedwait,
126            libc::SYS_rt_sigpending,
127            libc::SYS_rt_sigqueueinfo,
128            libc::SYS_sigaltstack,
129            libc::SYS_restart_syscall,
130            // L7: kill/tgkill are safe when PID namespace is active (container
131            // can only signal its own processes). If PID namespace creation fails,
132            // the runtime aborts, so this is safe.
133            libc::SYS_kill,
134            libc::SYS_tgkill,
135            // Time and timers
136            libc::SYS_clock_gettime,
137            libc::SYS_clock_getres,
138            libc::SYS_clock_nanosleep,
139            libc::SYS_gettimeofday,
140            libc::SYS_nanosleep,
141            libc::SYS_setitimer,
142            libc::SYS_getitimer,
143            // Directories
144            libc::SYS_getcwd,
145            libc::SYS_chdir,
146            libc::SYS_fchdir,
147            libc::SYS_mkdirat,
148            libc::SYS_getdents64,
149            // Misc
150            libc::SYS_uname,
151            libc::SYS_getrandom,
152            libc::SYS_futex,
153            libc::SYS_set_tid_address,
154            libc::SYS_set_robust_list,
155            libc::SYS_get_robust_list,
156            // L8: sysinfo removed – leaks host RAM, uptime, and process count.
157            // Applications needing this info should use /proc/meminfo instead.
158            libc::SYS_umask,
159            // prlimit64 moved to arg-filtered section (M3)
160            libc::SYS_getrusage,
161            libc::SYS_times,
162            libc::SYS_sched_yield,
163            libc::SYS_sched_getaffinity,
164            libc::SYS_sched_setaffinity,
165            libc::SYS_sched_getparam,
166            libc::SYS_sched_getscheduler,
167            libc::SYS_getcpu,
168            // Extended attributes – read-only queries, safe
169            libc::SYS_getxattr,
170            libc::SYS_lgetxattr,
171            libc::SYS_fgetxattr,
172            libc::SYS_listxattr,
173            libc::SYS_llistxattr,
174            libc::SYS_flistxattr,
175            libc::SYS_rseq,
176            libc::SYS_close_range,
177            // Ownership – safe after capability drop (CAP_CHOWN/CAP_FOWNER gone;
178            // operations on files not owned by the container UID will EPERM).
179            libc::SYS_fchown,
180            libc::SYS_fchownat,
181            // Legacy AIO – used by databases and storage engines. Operations are
182            // bounded by the process's existing fd permissions.
183            libc::SYS_io_setup,
184            libc::SYS_io_destroy,
185            libc::SYS_io_submit,
186            libc::SYS_io_getevents,
187            // NOTE: io_uring intentionally excluded from defaults – large kernel
188            // attack surface with a history of CVEs. Applications needing io_uring
189            // (e.g. PostgreSQL 18+ io_method=io_uring) should use a custom seccomp
190            // profile that adds io_uring_setup/io_uring_enter/io_uring_register.
191            // Process groups – safe in PID namespace (can only affect own pgrp).
192            libc::SYS_setpgid,
193            libc::SYS_getpgid,
194            // NOTE: memfd_create intentionally excluded – combined with execveat
195            // it enables fileless code execution bypassing all FS controls (SEC-02).
196            // Landlock bootstrap (runtime applies seccomp before Landlock)
197            libc::SYS_landlock_create_ruleset,
198            libc::SYS_landlock_add_rule,
199            libc::SYS_landlock_restrict_self,
200            // Socket/Network (safe introspection + local socketpair)
201            libc::SYS_getsockname,
202            libc::SYS_getpeername,
203            libc::SYS_socketpair,
204            libc::SYS_getsockopt,
205            // Poll/Select
206            libc::SYS_ppoll,
207            libc::SYS_pselect6,
208            libc::SYS_epoll_create1,
209            libc::SYS_epoll_ctl,
210            libc::SYS_epoll_pwait,
211            libc::SYS_eventfd2,
212            libc::SYS_signalfd4,
213            libc::SYS_timerfd_create,
214            libc::SYS_timerfd_settime,
215            libc::SYS_timerfd_gettime,
216        ];
217
218        // Legacy syscalls only available on x86_64 (aarch64 only has the *at variants)
219        #[cfg(target_arch = "x86_64")]
220        syscalls.extend_from_slice(&[
221            libc::SYS_open,
222            libc::SYS_stat,
223            libc::SYS_lstat,
224            libc::SYS_access,
225            libc::SYS_readlink,
226            libc::SYS_dup2,
227            libc::SYS_pipe,
228            libc::SYS_unlink,
229            libc::SYS_rename,
230            libc::SYS_link,
231            libc::SYS_symlink,
232            libc::SYS_chmod,
233            libc::SYS_mkdir,
234            libc::SYS_rmdir,
235            libc::SYS_getdents,
236            libc::SYS_getpgrp,
237            libc::SYS_chown,
238            libc::SYS_fchown,
239            libc::SYS_lchown,
240            libc::SYS_arch_prctl,
241            libc::SYS_getrlimit,
242            libc::SYS_poll,
243            libc::SYS_select,
244            libc::SYS_epoll_create,
245            libc::SYS_epoll_wait,
246            libc::SYS_eventfd,
247            libc::SYS_signalfd,
248        ]);
249
250        syscalls
251    }
252
253    fn allowed_socket_domains(allow_network: bool) -> Vec<i32> {
254        if allow_network {
255            vec![libc::AF_UNIX, libc::AF_INET, libc::AF_INET6]
256        } else {
257            vec![libc::AF_UNIX]
258        }
259    }
260
261    fn network_mode_syscalls(allow_network: bool) -> Vec<i64> {
262        if allow_network {
263            vec![
264                libc::SYS_connect,
265                libc::SYS_sendto,
266                libc::SYS_recvfrom,
267                libc::SYS_sendmsg,
268                libc::SYS_recvmsg,
269                libc::SYS_shutdown,
270                libc::SYS_bind,
271                libc::SYS_listen,
272                libc::SYS_accept,
273                libc::SYS_accept4,
274                libc::SYS_setsockopt,
275            ]
276        } else {
277            Vec::new()
278        }
279    }
280
281    /// Get minimal syscall whitelist for basic container operation
282    ///
283    /// This is a restrictive whitelist that blocks dangerous syscalls:
284    /// - ptrace (process tracing)
285    /// - kexec_load (kernel loading)
286    /// - add_key, request_key, keyctl (kernel keyring)
287    /// - bpf (eBPF programs)
288    /// - perf_event_open (performance monitoring)
289    /// - userfaultfd (user fault handling)
290    fn minimal_filter(
291        allow_network: bool,
292        extra_syscalls: &[String],
293    ) -> Result<BTreeMap<i64, Vec<SeccompRule>>> {
294        let mut rules: BTreeMap<i64, Vec<SeccompRule>> = BTreeMap::new();
295
296        // Essential syscalls for basic operation
297        let allowed_syscalls = Self::base_allowed_syscalls();
298
299        // Allow all these syscalls unconditionally
300        for syscall in allowed_syscalls {
301            rules.insert(syscall, Vec::new());
302        }
303
304        // Add network-mode-specific syscalls
305        for syscall in Self::network_mode_syscalls(allow_network) {
306            rules.insert(syscall, Vec::new());
307        }
308
309        // Add user-requested extra syscalls (--seccomp-allow).
310        // - Already in default/arg-filtered: silently accepted (no-op).
311        // - In OPT_IN_SYSCALLS: added to allowlist.
312        // - Known but not opt-in: WARN and blocked (defense-in-depth).
313        // - Unknown name: WARN and blocked.
314        for name in extra_syscalls {
315            if let Some(nr) = syscall_name_to_number(name) {
316                if let std::collections::btree_map::Entry::Vacant(entry) = rules.entry(nr) {
317                    if Self::OPT_IN_SYSCALLS.contains(&name.as_str()) {
318                        entry.insert(Vec::new());
319                    } else {
320                        warn!(
321                            "--seccomp-allow: syscall '{}' is not in the opt-in allowlist – blocked",
322                            name
323                        );
324                    }
325                }
326            } else {
327                warn!("--seccomp-allow: unknown syscall '{}' – blocked", name);
328            }
329        }
330
331        // Restrict socket() domains by network mode.
332        // none: AF_UNIX only; network-enabled: AF_UNIX/AF_INET/AF_INET6.
333        let mut socket_rules = Vec::new();
334        for domain in Self::allowed_socket_domains(allow_network) {
335            let condition = SeccompCondition::new(
336                0, // arg0 is socket(domain, type, protocol)
337                seccompiler::SeccompCmpArgLen::Dword,
338                seccompiler::SeccompCmpOp::Eq,
339                domain as u64,
340            )
341            .map_err(|e| {
342                NucleusError::SeccompError(format!(
343                    "Failed to create socket domain condition: {}",
344                    e
345                ))
346            })?;
347            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
348                NucleusError::SeccompError(format!("Failed to create socket rule: {}", e))
349            })?;
350            socket_rules.push(rule);
351        }
352        rules.insert(libc::SYS_socket, socket_rules);
353
354        // ioctl: allow only safe terminal operations (arg0 = request code)
355        let ioctl_allowed: &[u64] = &[
356            0x5401, // TCGETS
357            0x5402, // TCSETS
358            0x5403, // TCSETSW
359            0x5404, // TCSETSF
360            0x540B, // TCFLSH
361            0x540F, // TIOCGPGRP
362            0x5410, // TIOCSPGRP
363            0x5413, // TIOCGWINSZ
364            0x5429, // TIOCGSID
365            0x541B, // FIONREAD
366            0x5421, // M12: FIONBIO – allowed because fcntl(F_SETFL, O_NONBLOCK)
367            // achieves the same result and is already permitted. Blocking
368            // FIONBIO only breaks tokio/mio for no security gain.
369            0x5451, // FIOCLEX
370            0x5450, // FIONCLEX
371        ];
372        let mut ioctl_rules = Vec::new();
373        for &request in ioctl_allowed {
374            let condition = SeccompCondition::new(
375                1, // arg1 is the request code for ioctl(fd, request, ...)
376                seccompiler::SeccompCmpArgLen::Dword,
377                seccompiler::SeccompCmpOp::Eq,
378                request,
379            )
380            .map_err(|e| {
381                NucleusError::SeccompError(format!("Failed to create ioctl condition: {}", e))
382            })?;
383            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
384                NucleusError::SeccompError(format!("Failed to create ioctl rule: {}", e))
385            })?;
386            ioctl_rules.push(rule);
387        }
388        rules.insert(libc::SYS_ioctl, ioctl_rules);
389
390        // prctl: allow only safe operations (arg0 = option).
391        // Notably absent (hit default deny):
392        //   PR_CAPBSET_DROP (24) – could weaken the capability bounding set
393        //   PR_SET_SECUREBITS (28) – could disable secure-exec restrictions
394        let prctl_allowed: &[u64] = &[
395            1,  // PR_SET_PDEATHSIG
396            2,  // PR_GET_PDEATHSIG
397            15, // PR_SET_NAME
398            16, // PR_GET_NAME
399            23, // PR_CAPBSET_READ – glibc probes this at startup to discover
400            // cap_last_cap when /proc/sys is masked. Read-only, harmless
401            // after capabilities have been dropped.
402            27, // PR_GET_SECUREBITS – read-only query of securebits flags
403            36, // PR_SET_CHILD_SUBREAPER – safe, only affects own descendants
404            37, // PR_GET_CHILD_SUBREAPER
405            38, // PR_SET_NO_NEW_PRIVS
406            40, // PR_GET_TID_ADDRESS – read-only, returns thread ID address
407            47, // PR_CAP_AMBIENT – glibc probes ambient caps at startup (read-only
408            // IS_SET queries). Safe after caps are dropped.
409            39, // PR_GET_NO_NEW_PRIVS
410        ];
411        let mut prctl_rules = Vec::new();
412        for &option in prctl_allowed {
413            let condition = SeccompCondition::new(
414                0, // arg0 is the option for prctl(option, ...)
415                seccompiler::SeccompCmpArgLen::Dword,
416                seccompiler::SeccompCmpOp::Eq,
417                option,
418            )
419            .map_err(|e| {
420                NucleusError::SeccompError(format!("Failed to create prctl condition: {}", e))
421            })?;
422            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
423                NucleusError::SeccompError(format!("Failed to create prctl rule: {}", e))
424            })?;
425            prctl_rules.push(rule);
426        }
427        rules.insert(libc::SYS_prctl, prctl_rules);
428
429        // M3: prlimit64 – only allow GET (new_limit == NULL, i.e. arg2 == 0).
430        // SET operations could raise RLIMIT_NPROC to bypass fork-bomb protection.
431        let prlimit_condition = SeccompCondition::new(
432            2, // arg2 = new_limit pointer for prlimit64(pid, resource, new_limit, old_limit)
433            seccompiler::SeccompCmpArgLen::Qword,
434            seccompiler::SeccompCmpOp::Eq,
435            0u64, // new_limit == NULL means GET-only
436        )
437        .map_err(|e| {
438            NucleusError::SeccompError(format!("Failed to create prlimit64 condition: {}", e))
439        })?;
440        let prlimit_rule = SeccompRule::new(vec![prlimit_condition]).map_err(|e| {
441            NucleusError::SeccompError(format!("Failed to create prlimit64 rule: {}", e))
442        })?;
443        rules.insert(libc::SYS_prlimit64, vec![prlimit_rule]);
444
445        // mprotect: permit RW or RX transitions, but reject PROT_WRITE|PROT_EXEC.
446        let mut mprotect_rules = Vec::new();
447        for allowed in [0, libc::PROT_WRITE as u64, libc::PROT_EXEC as u64] {
448            let condition = SeccompCondition::new(
449                2, // arg2 is prot for mprotect(addr, len, prot)
450                seccompiler::SeccompCmpArgLen::Dword,
451                seccompiler::SeccompCmpOp::MaskedEq((libc::PROT_WRITE | libc::PROT_EXEC) as u64),
452                allowed,
453            )
454            .map_err(|e| {
455                NucleusError::SeccompError(format!("Failed to create mprotect condition: {}", e))
456            })?;
457            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
458                NucleusError::SeccompError(format!("Failed to create mprotect rule: {}", e))
459            })?;
460            mprotect_rules.push(rule);
461        }
462        rules.insert(libc::SYS_mprotect, mprotect_rules);
463
464        // clone3: ALLOWED unconditionally. clone3 passes flags inside a struct
465        // pointer that seccomp BPF cannot dereference, so namespace-flag filtering
466        // is impossible at the BPF level. However, glibc 2.34+ and newer musl use
467        // clone3 internally for posix_spawn/fork – blocking it breaks
468        // std::process::Command and any child-process spawning on modern systems.
469        //
470        // SECURITY INVARIANT: Namespace creation via clone3 is prevented solely by
471        // dropping CAP_SYS_ADMIN (and other namespace caps) *before* this seccomp
472        // filter is installed. If capability dropping is bypassed, clone3 becomes
473        // an unfiltered path to namespace creation. This is a known single point
474        // of failure – see CapabilityManager::drop_all() which must run first.
475        //
476        // Verify the invariant: CAP_SYS_ADMIN must not be in the effective set.
477        // CAP_SYS_ADMIN = capability bit 21
478        if Self::has_effective_cap(21) {
479            return Err(NucleusError::SeccompError(
480                "SECURITY: CAP_SYS_ADMIN is still in the effective capability set. \
481                 Capabilities must be dropped before installing seccomp filters \
482                 (clone3 is allowed unconditionally)."
483                    .to_string(),
484            ));
485        }
486        rules.insert(libc::SYS_clone3, Vec::new());
487
488        // clone: allow but deny namespace-creating flags to prevent nested namespace creation
489        let clone_condition = SeccompCondition::new(
490            0, // arg0 = flags
491            seccompiler::SeccompCmpArgLen::Qword,
492            seccompiler::SeccompCmpOp::MaskedEq(DENIED_CLONE_NAMESPACE_FLAGS),
493            0, // (flags & ns_flags) == 0: none of the namespace flags set
494        )
495        .map_err(|e| {
496            NucleusError::SeccompError(format!("Failed to create clone condition: {}", e))
497        })?;
498        let clone_rule = SeccompRule::new(vec![clone_condition]).map_err(|e| {
499            NucleusError::SeccompError(format!("Failed to create clone rule: {}", e))
500        })?;
501        rules.insert(libc::SYS_clone, vec![clone_rule]);
502
503        // execveat: allow but block AT_EMPTY_PATH (0x1000) to prevent fileless
504        // execution. With AT_EMPTY_PATH, execveat can execute code from any open
505        // fd (e.g., open + unlink, or even a socket fd), bypassing filesystem
506        // controls – not just memfd_create. Blocking memfd_create alone is
507        // insufficient. Normal execveat with dirfd+pathname (no AT_EMPTY_PATH)
508        // remains allowed.
509        let execveat_condition = SeccompCondition::new(
510            4, // arg4 = flags for execveat(dirfd, pathname, argv, envp, flags)
511            seccompiler::SeccompCmpArgLen::Dword,
512            seccompiler::SeccompCmpOp::MaskedEq(libc::AT_EMPTY_PATH as u64),
513            0, // (flags & AT_EMPTY_PATH) == 0: AT_EMPTY_PATH not set
514        )
515        .map_err(|e| {
516            NucleusError::SeccompError(format!("Failed to create execveat condition: {}", e))
517        })?;
518        let execveat_rule = SeccompRule::new(vec![execveat_condition]).map_err(|e| {
519            NucleusError::SeccompError(format!("Failed to create execveat rule: {}", e))
520        })?;
521        rules.insert(libc::SYS_execveat, vec![execveat_rule]);
522
523        Ok(rules)
524    }
525
526    /// Compile the minimal BPF filter without applying it
527    ///
528    /// This is useful for benchmarking filter compilation overhead
529    /// without the irreversible side effect of applying the filter.
530    ///
531    /// Uses bitmap-based BPF compilation for O(1) syscall dispatch.
532    pub fn compile_minimal_filter() -> Result<BpfProgram> {
533        let rules = Self::minimal_filter(true, &[])?;
534        let target_arch = std::env::consts::ARCH.try_into().map_err(|e| {
535            NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
536        })?;
537        super::seccomp_bpf::compile_bitmap_bpf(
538            rules,
539            SeccompAction::KillProcess,
540            SeccompAction::Allow,
541            target_arch,
542        )
543    }
544
545    /// Expose minimal_filter for tests in sibling modules.
546    #[cfg(test)]
547    pub(crate) fn minimal_filter_for_test(
548        allow_network: bool,
549        extra_syscalls: &[String],
550    ) -> BTreeMap<i64, Vec<SeccompRule>> {
551        Self::minimal_filter(allow_network, extra_syscalls).unwrap()
552    }
553
554    /// Apply seccomp filter
555    ///
556    /// This implements the transition: no_filter -> whitelist_active
557    /// in the seccomp state machine (NucleusSecurity_Seccomp_SeccompEnforcement.tla)
558    ///
559    /// Once applied, the filter cannot be removed (irreversible property)
560    /// In rootless mode or if seccomp setup fails, this will warn and continue
561    pub fn apply_minimal_filter(&mut self) -> Result<bool> {
562        self.apply_minimal_filter_with_mode(false, false)
563    }
564
565    /// Apply seccomp filter with configurable failure behavior
566    ///
567    /// When `best_effort` is true, failures are logged and execution continues.
568    /// When false, seccomp setup is fail-closed.
569    pub fn apply_minimal_filter_with_mode(
570        &mut self,
571        best_effort: bool,
572        log_denied: bool,
573    ) -> Result<bool> {
574        self.apply_filter_for_network_mode(true, best_effort, log_denied, &[])
575    }
576
577    /// Apply seccomp filter with network-mode-aware socket restrictions
578    ///
579    /// When `allow_network` is false, `SYS_socket` is restricted to AF_UNIX only,
580    /// preventing creation of network sockets (AF_INET, AF_INET6, etc.).
581    /// When `allow_network` is true, all socket domains are permitted.
582    ///
583    /// When `best_effort` is true, failures are logged and execution continues.
584    /// When false, seccomp setup is fail-closed.
585    pub fn apply_filter_for_network_mode(
586        &mut self,
587        allow_network: bool,
588        best_effort: bool,
589        log_denied: bool,
590        extra_syscalls: &[String],
591    ) -> Result<bool> {
592        if self.applied {
593            debug!("Seccomp filter already applied, skipping");
594            return Ok(true);
595        }
596
597        info!(allow_network, "Applying seccomp filter");
598
599        let rules = match Self::minimal_filter(allow_network, extra_syscalls) {
600            Ok(r) => r,
601            Err(e) => {
602                if best_effort {
603                    warn!(
604                        "Failed to create seccomp rules: {} (continuing without seccomp)",
605                        e
606                    );
607                    return Ok(false);
608                }
609                return Err(e);
610            }
611        };
612
613        let target_arch = match std::env::consts::ARCH.try_into() {
614            Ok(a) => a,
615            Err(e) => {
616                let msg = format!("Unsupported architecture: {:?}", e);
617                if best_effort {
618                    warn!("{} (continuing without seccomp)", msg);
619                    return Ok(false);
620                }
621                return Err(NucleusError::SeccompError(msg));
622            }
623        };
624
625        let bpf_prog: BpfProgram = match super::seccomp_bpf::compile_bitmap_bpf(
626            rules,
627            SeccompAction::KillProcess,
628            SeccompAction::Allow,
629            target_arch,
630        ) {
631            Ok(p) => p,
632            Err(e) => {
633                if best_effort {
634                    warn!(
635                        "Failed to compile BPF program: {} (continuing without seccomp)",
636                        e
637                    );
638                    return Ok(false);
639                }
640                return Err(e);
641            }
642        };
643
644        // Apply the filter
645        match Self::apply_bpf_program(&bpf_prog, log_denied) {
646            Ok(_) => {
647                self.applied = true;
648                info!("Successfully applied seccomp filter");
649                Ok(true)
650            }
651            Err(e) => {
652                if best_effort {
653                    warn!(
654                        "Failed to apply seccomp filter: {} (continuing without seccomp)",
655                        e
656                    );
657                    Ok(false)
658                } else {
659                    Err(NucleusError::SeccompError(format!(
660                        "Failed to apply seccomp filter: {}",
661                        e
662                    )))
663                }
664            }
665        }
666    }
667
668    /// Apply a seccomp profile loaded from a JSON file.
669    ///
670    /// The profile format is a JSON object with:
671    /// ```json
672    /// {
673    ///   "defaultAction": "SCMP_ACT_ERRNO",
674    ///   "syscalls": [
675    ///     { "names": ["read", "write", "open", ...], "action": "SCMP_ACT_ALLOW" }
676    ///   ]
677    /// }
678    /// ```
679    ///
680    /// This is a subset of the OCI seccomp profile format. Only the syscall name
681    /// allowlist is used; argument-level filtering from the built-in profile is
682    /// not applied when using a custom profile.
683    ///
684    /// If `expected_sha256` is provided, the file's SHA-256 hash is verified
685    /// against it before loading. This prevents silent profile tampering.
686    pub fn apply_profile_from_file(
687        &mut self,
688        profile_path: &Path,
689        expected_sha256: Option<&str>,
690        audit_mode: bool,
691    ) -> Result<bool> {
692        if self.applied {
693            debug!("Seccomp filter already applied, skipping");
694            return Ok(true);
695        }
696
697        info!("Loading seccomp profile from {:?}", profile_path);
698
699        // Read profile file
700        let content = std::fs::read(profile_path).map_err(|e| {
701            NucleusError::SeccompError(format!(
702                "Failed to read seccomp profile {:?}: {}",
703                profile_path, e
704            ))
705        })?;
706
707        // Verify SHA-256 hash if expected
708        if let Some(expected) = expected_sha256 {
709            let actual = sha256_hex(&content);
710            if actual != expected {
711                return Err(NucleusError::SeccompError(format!(
712                    "Seccomp profile hash mismatch: expected {}, got {}",
713                    expected, actual
714                )));
715            }
716            info!("Seccomp profile hash verified: {}", actual);
717        }
718
719        // Parse profile
720        let profile: SeccompProfile = serde_json::from_slice(&content).map_err(|e| {
721            NucleusError::SeccompError(format!("Failed to parse seccomp profile: {}", e))
722        })?;
723
724        // Warn when custom profile allows security-critical syscalls without
725        // argument-level filtering. The built-in filter restricts clone, ioctl,
726        // prctl, and socket at the argument level; a custom profile that allows
727        // them by name only silently removes all of that hardening.
728        Self::warn_missing_arg_filters(&profile);
729
730        // Build filter from profile
731        let mut rules: BTreeMap<i64, Vec<SeccompRule>> = BTreeMap::new();
732
733        for syscall_group in &profile.syscalls {
734            if syscall_group.action == "SCMP_ACT_ALLOW" {
735                for name in &syscall_group.names {
736                    if let Some(nr) = syscall_name_to_number(name) {
737                        rules.insert(nr, Vec::new());
738                    } else {
739                        warn!("Unknown syscall in profile: {} (skipping)", name);
740                    }
741                }
742            }
743        }
744
745        // SEC-01: Merge built-in argument filters for security-critical syscalls.
746        // Custom profiles that allow clone/ioctl/prctl/socket/mprotect by name
747        // without argument-level filters would silently remove all hardening.
748        // Overwrite their empty rules with the built-in argument-filtered rules.
749        let builtin_rules = Self::minimal_filter(true, &[])?;
750        for syscall_name in Self::ARG_FILTERED_SYSCALLS {
751            if let Some(nr) = syscall_name_to_number(syscall_name) {
752                if let std::collections::btree_map::Entry::Occupied(mut entry) = rules.entry(nr) {
753                    if let Some(builtin) = builtin_rules.get(&nr) {
754                        if !builtin.is_empty() {
755                            info!(
756                                "Merging built-in argument filters for '{}' into custom profile",
757                                syscall_name
758                            );
759                            entry.insert(builtin.clone());
760                        }
761                    }
762                }
763            }
764        }
765        // H2: clone3 is allowed in the built-in filter (needed for glibc 2.34+).
766        // Apply the same policy to custom profiles for consistency. The security
767        // invariant against namespace creation via clone3 is enforced by dropping
768        // CAP_SYS_ADMIN *before* seccomp is installed (see verify_no_namespace_caps).
769        // If the custom profile doesn't include clone3, add it.
770        rules.entry(libc::SYS_clone3).or_default();
771
772        let target_arch = std::env::consts::ARCH.try_into().map_err(|e| {
773            NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
774        })?;
775
776        let bpf_prog: BpfProgram = super::seccomp_bpf::compile_bitmap_bpf(
777            rules,
778            SeccompAction::KillProcess,
779            SeccompAction::Allow,
780            target_arch,
781        )?;
782
783        match Self::apply_bpf_program(&bpf_prog, audit_mode) {
784            Ok(_) => {
785                self.applied = true;
786                info!(
787                    "Seccomp profile applied from {:?} (log_denied={})",
788                    profile_path, audit_mode
789                );
790                Ok(true)
791            }
792            Err(e) => Err(e),
793        }
794    }
795
796    /// Install an allow-all seccomp filter with SECCOMP_FILTER_FLAG_LOG.
797    ///
798    /// Used in trace mode: all syscalls are allowed but logged to the kernel
799    /// audit subsystem. A separate reader collects the logged syscalls.
800    pub fn apply_trace_filter(&mut self) -> Result<bool> {
801        if self.applied {
802            debug!("Seccomp filter already applied, skipping trace filter");
803            return Ok(true);
804        }
805
806        info!("Applying seccomp trace filter (allow-all + LOG)");
807
808        // Create an empty rule set – with SeccompAction::Allow as default,
809        // every syscall is permitted. The LOG flag causes the kernel to
810        // audit each syscall decision.
811        let filter = SeccompFilter::new(
812            BTreeMap::new(),
813            SeccompAction::Allow, // default: allow everything
814            SeccompAction::Allow, // match action (unused – no rules)
815            std::env::consts::ARCH.try_into().map_err(|e| {
816                NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
817            })?,
818        )
819        .map_err(|e| NucleusError::SeccompError(format!("Failed to create trace filter: {}", e)))?;
820
821        let bpf_prog: BpfProgram = filter.try_into().map_err(|e| {
822            NucleusError::SeccompError(format!("Failed to compile trace BPF: {}", e))
823        })?;
824
825        // Apply with LOG flag so kernel audits every syscall
826        Self::apply_bpf_program(&bpf_prog, true)?;
827        self.applied = true;
828        info!("Seccomp trace filter applied (all syscalls allowed + logged)");
829        Ok(true)
830    }
831
832    /// Syscalls that the built-in filter restricts at the argument level.
833    /// Custom profiles allowing these without argument filters weaken security.
834    const ARG_FILTERED_SYSCALLS: &'static [&'static str] = &[
835        "clone", "clone3", "execveat", "ioctl", "mprotect", "prctl", "socket",
836    ];
837
838    /// Non-default syscalls that may be opted into via `--seccomp-allow`.
839    ///
840    /// Every syscall known to `syscall_name_to_number` but absent from both
841    /// `base_allowed_syscalls` and `ARG_FILTERED_SYSCALLS` must appear here
842    /// to be enableable. Requesting a known syscall that is NOT in this list
843    /// emits a WARN and is silently dropped (defense-in-depth).
844    const OPT_IN_SYSCALLS: &'static [&'static str] = &[
845        // io_uring – large attack surface but needed by modern databases
846        "io_uring_setup",
847        "io_uring_enter",
848        "io_uring_register",
849        // SysV message queues
850        "msgget",
851        "msgsnd",
852        "msgrcv",
853        "msgctl",
854        // POSIX message queues
855        "mq_open",
856        "mq_unlink",
857        "mq_timedsend",
858        "mq_timedreceive",
859        "mq_notify",
860        "mq_getsetattr",
861        // POSIX timers
862        "timer_create",
863        "timer_settime",
864        "timer_gettime",
865        "timer_getoverrun",
866        "timer_delete",
867        // Inotify / fanotify
868        "inotify_init",
869        "inotify_init1",
870        "inotify_add_watch",
871        "inotify_rm_watch",
872        "fanotify_init",
873        "fanotify_mark",
874        // Memory (non-default)
875        "mincore",
876        "mlockall",
877        "munlockall",
878        "membarrier",
879        "process_madvise",
880        "mbind",
881        "set_mempolicy",
882        "get_mempolicy",
883        "set_mempolicy_home_node",
884        "pkey_mprotect",
885        "pkey_alloc",
886        "pkey_free",
887        "cachestat",
888        "remap_file_pages",
889        // File I/O (non-default)
890        "sync",
891        "syncfs",
892        "sync_file_range",
893        "readahead",
894        "vmsplice",
895        "openat2",
896        "name_to_handle_at",
897        "open_by_handle_at",
898        "io_cancel",
899        "io_pgetevents",
900        "creat",
901        "fchmodat2",
902        "statmount",
903        "listmount",
904        "utimensat",
905        "utimes",
906        "utime",
907        "futimesat",
908        // Extended attributes (write)
909        "setxattr",
910        "lsetxattr",
911        "fsetxattr",
912        "removexattr",
913        "lremovexattr",
914        "fremovexattr",
915        "setxattrat",
916        "getxattrat",
917        "listxattrat",
918        "removexattrat",
919        // Network (non-default)
920        "recvmmsg",
921        "sendmmsg",
922        // Scheduling (non-default)
923        "sched_setparam",
924        "sched_setscheduler",
925        "sched_get_priority_max",
926        "sched_get_priority_min",
927        "sched_rr_get_interval",
928        "sched_setattr",
929        "sched_getattr",
930        // Resource limits / priority
931        "setrlimit",
932        "getpriority",
933        "setpriority",
934        "ioprio_set",
935        "ioprio_get",
936        // Process (non-default, low risk)
937        "vfork",
938        "pause",
939        "alarm",
940        "tkill",
941        "sysinfo",
942        "personality",
943        "vhangup",
944        "time",
945        "pidfd_open",
946        "pidfd_send_signal",
947        "pidfd_getfd",
948        // UID/GID
949        "setuid",
950        "setgid",
951        "setreuid",
952        "setregid",
953        "setresuid",
954        "getresuid",
955        "setresgid",
956        "getresgid",
957        "setfsuid",
958        "setfsgid",
959        "setgroups",
960        "getsid",
961        // Capabilities (read-only query)
962        "capget",
963        // Signals (non-default)
964        "rt_tgsigqueueinfo",
965        // Misc
966        "mknod",
967        "mknodat",
968        "syslog",
969        "clock_settime",
970        "clock_adjtime",
971        "adjtimex",
972        "kcmp",
973        "epoll_pwait2",
974        // Futex (non-default)
975        "futex_waitv",
976        "futex_wake",
977        "futex_wait",
978        "futex_requeue",
979        // Landlock (already in default but listed for completeness)
980        "seccomp",
981        // Keyring
982        "add_key",
983        "request_key",
984        "keyctl",
985    ];
986
987    /// Warn when a custom seccomp profile allows security-critical syscalls
988    /// without argument-level filtering.
989    fn warn_missing_arg_filters(profile: &SeccompProfile) {
990        for group in &profile.syscalls {
991            if group.action != "SCMP_ACT_ALLOW" {
992                continue;
993            }
994            for name in &group.names {
995                if Self::ARG_FILTERED_SYSCALLS.contains(&name.as_str()) && group.args.is_empty() {
996                    warn!(
997                        "Custom seccomp profile allows '{}' without argument filters. \
998                         The built-in filter restricts this syscall at the argument level. \
999                         This profile weakens security compared to the default.",
1000                        name
1001                    );
1002                }
1003            }
1004        }
1005    }
1006
1007    /// Check whether a capability is in the current thread's effective set
1008    /// by reading /proc/self/status (CapEff line).
1009    fn has_effective_cap(cap: i32) -> bool {
1010        let Ok(status) = std::fs::read_to_string("/proc/self/status") else {
1011            // If we can't read, assume worst case for safety.
1012            return true;
1013        };
1014        for line in status.lines() {
1015            if let Some(hex) = line.strip_prefix("CapEff:\t") {
1016                if let Ok(eff) = u64::from_str_radix(hex.trim(), 16) {
1017                    return eff & (1u64 << cap) != 0;
1018                }
1019            }
1020        }
1021        true // assume worst case
1022    }
1023
1024    /// Check if seccomp filter has been applied
1025    pub fn is_applied(&self) -> bool {
1026        self.applied
1027    }
1028
1029    fn apply_bpf_program(bpf_prog: &BpfProgram, log_denied: bool) -> Result<()> {
1030        let mut flags: libc::c_ulong = 0;
1031        if log_denied {
1032            flags |= libc::SECCOMP_FILTER_FLAG_LOG as libc::c_ulong;
1033        }
1034
1035        match Self::apply_bpf_program_with_flags(bpf_prog, flags) {
1036            Ok(()) => Ok(()),
1037            Err(err)
1038                if log_denied
1039                    && err.raw_os_error() == Some(libc::EINVAL)
1040                    && libc::SECCOMP_FILTER_FLAG_LOG != 0 =>
1041            {
1042                warn!(
1043                    "Kernel rejected SECCOMP_FILTER_FLAG_LOG; continuing with seccomp \
1044                     enforcement without deny logging"
1045                );
1046                Self::apply_bpf_program_with_flags(bpf_prog, 0)?;
1047                Ok(())
1048            }
1049            Err(err) => Err(NucleusError::SeccompError(format!(
1050                "Failed to apply seccomp filter: {}",
1051                err
1052            ))),
1053        }
1054    }
1055
1056    fn apply_bpf_program_with_flags(
1057        bpf_prog: &BpfProgram,
1058        flags: libc::c_ulong,
1059    ) -> std::io::Result<()> {
1060        // SAFETY: `prctl(PR_SET_NO_NEW_PRIVS, ...)` has no pointer arguments here
1061        // and only affects the current thread/process as required before seccomp.
1062        let rc = unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
1063        if rc != 0 {
1064            return Err(std::io::Error::last_os_error());
1065        }
1066
1067        let prog = libc::sock_fprog {
1068            len: bpf_prog.len() as u16,
1069            filter: bpf_prog.as_ptr() as *mut libc::sock_filter,
1070        };
1071
1072        // SAFETY: `prog` points to a live BPF program buffer for the duration of
1073        // the syscall and the kernel copies the pointed-to filter immediately.
1074        let rc = unsafe {
1075            libc::syscall(
1076                libc::SYS_seccomp,
1077                libc::SECCOMP_SET_MODE_FILTER,
1078                flags,
1079                &prog as *const libc::sock_fprog,
1080            )
1081        };
1082
1083        if rc < 0 {
1084            return Err(std::io::Error::last_os_error());
1085        }
1086
1087        Ok(())
1088    }
1089}
1090
1091// SeccompProfile and SeccompSyscallGroup are defined in seccomp_generate.rs
1092use crate::security::seccomp_generate::SeccompProfile;
1093
1094/// Map a syscall name (e.g. "read", "write") to its Linux syscall number.
1095///
1096/// Covers the most common syscalls. Unknown names return None.
1097fn syscall_name_to_number(name: &str) -> Option<i64> {
1098    match name {
1099        // File I/O
1100        "read" => Some(libc::SYS_read),
1101        "write" => Some(libc::SYS_write),
1102        #[cfg(target_arch = "x86_64")]
1103        "open" => Some(libc::SYS_open),
1104        "openat" => Some(libc::SYS_openat),
1105        "close" => Some(libc::SYS_close),
1106        #[cfg(target_arch = "x86_64")]
1107        "stat" => Some(libc::SYS_stat),
1108        "fstat" => Some(libc::SYS_fstat),
1109        #[cfg(target_arch = "x86_64")]
1110        "lstat" => Some(libc::SYS_lstat),
1111        "lseek" => Some(libc::SYS_lseek),
1112        #[cfg(target_arch = "x86_64")]
1113        "access" => Some(libc::SYS_access),
1114        "fcntl" => Some(libc::SYS_fcntl),
1115        "readv" => Some(libc::SYS_readv),
1116        "writev" => Some(libc::SYS_writev),
1117        "pread64" => Some(libc::SYS_pread64),
1118        "pwrite64" => Some(libc::SYS_pwrite64),
1119        #[cfg(target_arch = "x86_64")]
1120        "readlink" => Some(libc::SYS_readlink),
1121        "readlinkat" => Some(libc::SYS_readlinkat),
1122        "newfstatat" => Some(libc::SYS_newfstatat),
1123        "statx" => Some(libc::SYS_statx),
1124        "faccessat" => Some(libc::SYS_faccessat),
1125        "faccessat2" => Some(libc::SYS_faccessat2),
1126        "dup" => Some(libc::SYS_dup),
1127        #[cfg(target_arch = "x86_64")]
1128        "dup2" => Some(libc::SYS_dup2),
1129        "dup3" => Some(libc::SYS_dup3),
1130        #[cfg(target_arch = "x86_64")]
1131        "pipe" => Some(libc::SYS_pipe),
1132        "pipe2" => Some(libc::SYS_pipe2),
1133        #[cfg(target_arch = "x86_64")]
1134        "unlink" => Some(libc::SYS_unlink),
1135        "unlinkat" => Some(libc::SYS_unlinkat),
1136        #[cfg(target_arch = "x86_64")]
1137        "rename" => Some(libc::SYS_rename),
1138        "renameat" => Some(libc::SYS_renameat),
1139        "renameat2" => Some(libc::SYS_renameat2),
1140        #[cfg(target_arch = "x86_64")]
1141        "link" => Some(libc::SYS_link),
1142        "linkat" => Some(libc::SYS_linkat),
1143        #[cfg(target_arch = "x86_64")]
1144        "symlink" => Some(libc::SYS_symlink),
1145        "symlinkat" => Some(libc::SYS_symlinkat),
1146        #[cfg(target_arch = "x86_64")]
1147        "chmod" => Some(libc::SYS_chmod),
1148        "fchmod" => Some(libc::SYS_fchmod),
1149        "fchmodat" => Some(libc::SYS_fchmodat),
1150        "truncate" => Some(libc::SYS_truncate),
1151        "ftruncate" => Some(libc::SYS_ftruncate),
1152        "fallocate" => Some(libc::SYS_fallocate),
1153        #[cfg(target_arch = "x86_64")]
1154        "fadvise64" => Some(libc::SYS_fadvise64),
1155        "fsync" => Some(libc::SYS_fsync),
1156        "fdatasync" => Some(libc::SYS_fdatasync),
1157        "flock" => Some(libc::SYS_flock),
1158        #[cfg(target_arch = "x86_64")]
1159        "sendfile" => Some(libc::SYS_sendfile),
1160        "copy_file_range" => Some(libc::SYS_copy_file_range),
1161        "splice" => Some(libc::SYS_splice),
1162        "tee" => Some(libc::SYS_tee),
1163        // Memory
1164        "mmap" => Some(libc::SYS_mmap),
1165        "munmap" => Some(libc::SYS_munmap),
1166        "mprotect" => Some(libc::SYS_mprotect),
1167        "brk" => Some(libc::SYS_brk),
1168        "mremap" => Some(libc::SYS_mremap),
1169        "madvise" => Some(libc::SYS_madvise),
1170        "msync" => Some(libc::SYS_msync),
1171        "mlock" => Some(libc::SYS_mlock),
1172        "mlock2" => Some(libc::SYS_mlock2),
1173        "munlock" => Some(libc::SYS_munlock),
1174        // SysV IPC
1175        "shmget" => Some(libc::SYS_shmget),
1176        "shmat" => Some(libc::SYS_shmat),
1177        "shmdt" => Some(libc::SYS_shmdt),
1178        "shmctl" => Some(libc::SYS_shmctl),
1179        "semget" => Some(libc::SYS_semget),
1180        "semop" => Some(libc::SYS_semop),
1181        "semctl" => Some(libc::SYS_semctl),
1182        "semtimedop" => Some(libc::SYS_semtimedop),
1183        // Process
1184        #[cfg(target_arch = "x86_64")]
1185        "fork" => Some(libc::SYS_fork),
1186        "clone" => Some(libc::SYS_clone),
1187        "clone3" => Some(libc::SYS_clone3),
1188        "execve" => Some(libc::SYS_execve),
1189        "execveat" => Some(libc::SYS_execveat),
1190        "wait4" => Some(libc::SYS_wait4),
1191        "waitid" => Some(libc::SYS_waitid),
1192        "exit" => Some(libc::SYS_exit),
1193        "exit_group" => Some(libc::SYS_exit_group),
1194        "getpid" => Some(libc::SYS_getpid),
1195        "gettid" => Some(libc::SYS_gettid),
1196        "getuid" => Some(libc::SYS_getuid),
1197        "getgid" => Some(libc::SYS_getgid),
1198        "geteuid" => Some(libc::SYS_geteuid),
1199        "getegid" => Some(libc::SYS_getegid),
1200        "getppid" => Some(libc::SYS_getppid),
1201        #[cfg(target_arch = "x86_64")]
1202        "getpgrp" => Some(libc::SYS_getpgrp),
1203        "setsid" => Some(libc::SYS_setsid),
1204        "getgroups" => Some(libc::SYS_getgroups),
1205        // Signals
1206        "rt_sigaction" => Some(libc::SYS_rt_sigaction),
1207        "rt_sigprocmask" => Some(libc::SYS_rt_sigprocmask),
1208        "rt_sigreturn" => Some(libc::SYS_rt_sigreturn),
1209        "rt_sigsuspend" => Some(libc::SYS_rt_sigsuspend),
1210        "rt_sigtimedwait" => Some(libc::SYS_rt_sigtimedwait),
1211        "rt_sigpending" => Some(libc::SYS_rt_sigpending),
1212        "rt_sigqueueinfo" => Some(libc::SYS_rt_sigqueueinfo),
1213        "sigaltstack" => Some(libc::SYS_sigaltstack),
1214        "restart_syscall" => Some(libc::SYS_restart_syscall),
1215        "kill" => Some(libc::SYS_kill),
1216        "tgkill" => Some(libc::SYS_tgkill),
1217        // Time
1218        "clock_gettime" => Some(libc::SYS_clock_gettime),
1219        "clock_getres" => Some(libc::SYS_clock_getres),
1220        "clock_nanosleep" => Some(libc::SYS_clock_nanosleep),
1221        "gettimeofday" => Some(libc::SYS_gettimeofday),
1222        "nanosleep" => Some(libc::SYS_nanosleep),
1223        // Directories
1224        "getcwd" => Some(libc::SYS_getcwd),
1225        "chdir" => Some(libc::SYS_chdir),
1226        "fchdir" => Some(libc::SYS_fchdir),
1227        #[cfg(target_arch = "x86_64")]
1228        "mkdir" => Some(libc::SYS_mkdir),
1229        "mkdirat" => Some(libc::SYS_mkdirat),
1230        #[cfg(target_arch = "x86_64")]
1231        "rmdir" => Some(libc::SYS_rmdir),
1232        #[cfg(target_arch = "x86_64")]
1233        "getdents" => Some(libc::SYS_getdents),
1234        "getdents64" => Some(libc::SYS_getdents64),
1235        // Network
1236        "socket" => Some(libc::SYS_socket),
1237        "connect" => Some(libc::SYS_connect),
1238        "sendto" => Some(libc::SYS_sendto),
1239        "recvfrom" => Some(libc::SYS_recvfrom),
1240        "sendmsg" => Some(libc::SYS_sendmsg),
1241        "recvmsg" => Some(libc::SYS_recvmsg),
1242        "shutdown" => Some(libc::SYS_shutdown),
1243        "bind" => Some(libc::SYS_bind),
1244        "listen" => Some(libc::SYS_listen),
1245        "accept" => Some(libc::SYS_accept),
1246        "accept4" => Some(libc::SYS_accept4),
1247        "setsockopt" => Some(libc::SYS_setsockopt),
1248        "getsockopt" => Some(libc::SYS_getsockopt),
1249        "getsockname" => Some(libc::SYS_getsockname),
1250        "getpeername" => Some(libc::SYS_getpeername),
1251        "socketpair" => Some(libc::SYS_socketpair),
1252        // Poll/Select
1253        #[cfg(target_arch = "x86_64")]
1254        "poll" => Some(libc::SYS_poll),
1255        "ppoll" => Some(libc::SYS_ppoll),
1256        #[cfg(target_arch = "x86_64")]
1257        "select" => Some(libc::SYS_select),
1258        "pselect6" => Some(libc::SYS_pselect6),
1259        #[cfg(target_arch = "x86_64")]
1260        "epoll_create" => Some(libc::SYS_epoll_create),
1261        "epoll_create1" => Some(libc::SYS_epoll_create1),
1262        "epoll_ctl" => Some(libc::SYS_epoll_ctl),
1263        #[cfg(target_arch = "x86_64")]
1264        "epoll_wait" => Some(libc::SYS_epoll_wait),
1265        "epoll_pwait" => Some(libc::SYS_epoll_pwait),
1266        #[cfg(target_arch = "x86_64")]
1267        "eventfd" => Some(libc::SYS_eventfd),
1268        "eventfd2" => Some(libc::SYS_eventfd2),
1269        #[cfg(target_arch = "x86_64")]
1270        "signalfd" => Some(libc::SYS_signalfd),
1271        "signalfd4" => Some(libc::SYS_signalfd4),
1272        "timerfd_create" => Some(libc::SYS_timerfd_create),
1273        "timerfd_settime" => Some(libc::SYS_timerfd_settime),
1274        "timerfd_gettime" => Some(libc::SYS_timerfd_gettime),
1275        // Misc
1276        "uname" => Some(libc::SYS_uname),
1277        "getrandom" => Some(libc::SYS_getrandom),
1278        "futex" => Some(libc::SYS_futex),
1279        "set_tid_address" => Some(libc::SYS_set_tid_address),
1280        "set_robust_list" => Some(libc::SYS_set_robust_list),
1281        "get_robust_list" => Some(libc::SYS_get_robust_list),
1282        #[cfg(target_arch = "x86_64")]
1283        "arch_prctl" => Some(libc::SYS_arch_prctl),
1284        "sysinfo" => Some(libc::SYS_sysinfo),
1285        "umask" => Some(libc::SYS_umask),
1286        #[cfg(target_arch = "x86_64")]
1287        "getrlimit" => Some(libc::SYS_getrlimit),
1288        "prlimit64" => Some(libc::SYS_prlimit64),
1289        "getrusage" => Some(libc::SYS_getrusage),
1290        "times" => Some(libc::SYS_times),
1291        "sched_yield" => Some(libc::SYS_sched_yield),
1292        "sched_getaffinity" => Some(libc::SYS_sched_getaffinity),
1293        "getcpu" => Some(libc::SYS_getcpu),
1294        "rseq" => Some(libc::SYS_rseq),
1295        "close_range" => Some(libc::SYS_close_range),
1296        // Ownership
1297        "fchown" => Some(libc::SYS_fchown),
1298        "fchownat" => Some(libc::SYS_fchownat),
1299        #[cfg(target_arch = "x86_64")]
1300        "chown" => Some(libc::SYS_chown),
1301        #[cfg(target_arch = "x86_64")]
1302        "lchown" => Some(libc::SYS_lchown),
1303        // io_uring
1304        "io_uring_setup" => Some(libc::SYS_io_uring_setup),
1305        "io_uring_enter" => Some(libc::SYS_io_uring_enter),
1306        "io_uring_register" => Some(libc::SYS_io_uring_register),
1307        // Legacy AIO
1308        "io_setup" => Some(libc::SYS_io_setup),
1309        "io_destroy" => Some(libc::SYS_io_destroy),
1310        "io_submit" => Some(libc::SYS_io_submit),
1311        "io_getevents" => Some(libc::SYS_io_getevents),
1312        // Timers
1313        "setitimer" => Some(libc::SYS_setitimer),
1314        "getitimer" => Some(libc::SYS_getitimer),
1315        // Process groups
1316        "setpgid" => Some(libc::SYS_setpgid),
1317        "getpgid" => Some(libc::SYS_getpgid),
1318        "memfd_create" => Some(libc::SYS_memfd_create),
1319        "ioctl" => Some(libc::SYS_ioctl),
1320        "prctl" => Some(libc::SYS_prctl),
1321        // Landlock
1322        "landlock_create_ruleset" => Some(libc::SYS_landlock_create_ruleset),
1323        "landlock_add_rule" => Some(libc::SYS_landlock_add_rule),
1324        "landlock_restrict_self" => Some(libc::SYS_landlock_restrict_self),
1325        // --- Additional syscalls (not in default allowlist, available via --seccomp-allow) ---
1326        // Memory
1327        "mincore" => Some(libc::SYS_mincore),
1328        "mlockall" => Some(libc::SYS_mlockall),
1329        "munlockall" => Some(libc::SYS_munlockall),
1330        "mbind" => Some(libc::SYS_mbind),
1331        "set_mempolicy" => Some(libc::SYS_set_mempolicy),
1332        "get_mempolicy" => Some(libc::SYS_get_mempolicy),
1333        "memfd_secret" => Some(libc::SYS_memfd_secret),
1334        "membarrier" => Some(libc::SYS_membarrier),
1335        "process_madvise" => Some(libc::SYS_process_madvise),
1336        "pkey_mprotect" => Some(libc::SYS_pkey_mprotect),
1337        "pkey_alloc" => Some(libc::SYS_pkey_alloc),
1338        "pkey_free" => Some(libc::SYS_pkey_free),
1339        "mseal" => Some(libc::SYS_mseal),
1340        "map_shadow_stack" => Some(453),
1341        "remap_file_pages" => Some(libc::SYS_remap_file_pages),
1342        "set_mempolicy_home_node" => Some(libc::SYS_set_mempolicy_home_node),
1343        "cachestat" => Some(451),
1344        // Process
1345        #[cfg(target_arch = "x86_64")]
1346        "vfork" => Some(libc::SYS_vfork),
1347        #[cfg(target_arch = "x86_64")]
1348        "pause" => Some(libc::SYS_pause),
1349        #[cfg(target_arch = "x86_64")]
1350        "alarm" => Some(libc::SYS_alarm),
1351        "tkill" => Some(libc::SYS_tkill),
1352        "ptrace" => Some(libc::SYS_ptrace),
1353        "process_vm_readv" => Some(libc::SYS_process_vm_readv),
1354        "process_vm_writev" => Some(libc::SYS_process_vm_writev),
1355        "process_mrelease" => Some(libc::SYS_process_mrelease),
1356        "kcmp" => Some(libc::SYS_kcmp),
1357        "unshare" => Some(libc::SYS_unshare),
1358        "setns" => Some(libc::SYS_setns),
1359        "pidfd_open" => Some(libc::SYS_pidfd_open),
1360        "pidfd_send_signal" => Some(libc::SYS_pidfd_send_signal),
1361        "pidfd_getfd" => Some(libc::SYS_pidfd_getfd),
1362        // UID/GID
1363        "setuid" => Some(libc::SYS_setuid),
1364        "setgid" => Some(libc::SYS_setgid),
1365        "setreuid" => Some(libc::SYS_setreuid),
1366        "setregid" => Some(libc::SYS_setregid),
1367        "setresuid" => Some(libc::SYS_setresuid),
1368        "getresuid" => Some(libc::SYS_getresuid),
1369        "setresgid" => Some(libc::SYS_setresgid),
1370        "getresgid" => Some(libc::SYS_getresgid),
1371        "setfsuid" => Some(libc::SYS_setfsuid),
1372        "setfsgid" => Some(libc::SYS_setfsgid),
1373        "setgroups" => Some(libc::SYS_setgroups),
1374        "getsid" => Some(libc::SYS_getsid),
1375        // Capabilities
1376        "capget" => Some(libc::SYS_capget),
1377        "capset" => Some(libc::SYS_capset),
1378        // Signals
1379        "rt_tgsigqueueinfo" => Some(libc::SYS_rt_tgsigqueueinfo),
1380        // SysV message queues
1381        "msgget" => Some(libc::SYS_msgget),
1382        "msgsnd" => Some(libc::SYS_msgsnd),
1383        "msgrcv" => Some(libc::SYS_msgrcv),
1384        "msgctl" => Some(libc::SYS_msgctl),
1385        // Timers
1386        "timer_create" => Some(libc::SYS_timer_create),
1387        "timer_settime" => Some(libc::SYS_timer_settime),
1388        "timer_gettime" => Some(libc::SYS_timer_gettime),
1389        "timer_getoverrun" => Some(libc::SYS_timer_getoverrun),
1390        "timer_delete" => Some(libc::SYS_timer_delete),
1391        "clock_settime" => Some(libc::SYS_clock_settime),
1392        "clock_adjtime" => Some(libc::SYS_clock_adjtime),
1393        #[cfg(target_arch = "x86_64")]
1394        "time" => Some(libc::SYS_time),
1395        // File I/O (non-default)
1396        #[cfg(target_arch = "x86_64")]
1397        "creat" => Some(libc::SYS_creat),
1398        "readahead" => Some(libc::SYS_readahead),
1399        "sync" => Some(libc::SYS_sync),
1400        "syncfs" => Some(libc::SYS_syncfs),
1401        "vmsplice" => Some(libc::SYS_vmsplice),
1402        "utimensat" => Some(libc::SYS_utimensat),
1403        #[cfg(target_arch = "x86_64")]
1404        "utimes" => Some(libc::SYS_utimes),
1405        #[cfg(target_arch = "x86_64")]
1406        "utime" => Some(libc::SYS_utime),
1407        #[cfg(target_arch = "x86_64")]
1408        "futimesat" => Some(libc::SYS_futimesat),
1409        "openat2" => Some(libc::SYS_openat2),
1410        "name_to_handle_at" => Some(libc::SYS_name_to_handle_at),
1411        "open_by_handle_at" => Some(libc::SYS_open_by_handle_at),
1412        "fchmodat2" => Some(libc::SYS_fchmodat2),
1413        "statmount" => Some(457),
1414        "listmount" => Some(458),
1415        // Extended attributes (write)
1416        "setxattr" => Some(libc::SYS_setxattr),
1417        "lsetxattr" => Some(libc::SYS_lsetxattr),
1418        "fsetxattr" => Some(libc::SYS_fsetxattr),
1419        "removexattr" => Some(libc::SYS_removexattr),
1420        "lremovexattr" => Some(libc::SYS_lremovexattr),
1421        "fremovexattr" => Some(libc::SYS_fremovexattr),
1422        "setxattrat" => Some(463),
1423        "getxattrat" => Some(464),
1424        "listxattrat" => Some(465),
1425        "removexattrat" => Some(466),
1426        // Network (non-default)
1427        "recvmmsg" => Some(libc::SYS_recvmmsg),
1428        "sendmmsg" => Some(libc::SYS_sendmmsg),
1429        // Inotify
1430        #[cfg(target_arch = "x86_64")]
1431        "inotify_init" => Some(libc::SYS_inotify_init),
1432        "inotify_init1" => Some(libc::SYS_inotify_init1),
1433        "inotify_add_watch" => Some(libc::SYS_inotify_add_watch),
1434        "inotify_rm_watch" => Some(libc::SYS_inotify_rm_watch),
1435        // Fanotify
1436        "fanotify_init" => Some(libc::SYS_fanotify_init),
1437        "fanotify_mark" => Some(libc::SYS_fanotify_mark),
1438        // Epoll (non-default)
1439        "epoll_pwait2" => Some(libc::SYS_epoll_pwait2),
1440        // Scheduling (non-default)
1441        "sched_setparam" => Some(libc::SYS_sched_setparam),
1442        "sched_setscheduler" => Some(libc::SYS_sched_setscheduler),
1443        "sched_get_priority_max" => Some(libc::SYS_sched_get_priority_max),
1444        "sched_get_priority_min" => Some(libc::SYS_sched_get_priority_min),
1445        "sched_rr_get_interval" => Some(libc::SYS_sched_rr_get_interval),
1446        "sched_setattr" => Some(libc::SYS_sched_setattr),
1447        "sched_getattr" => Some(libc::SYS_sched_getattr),
1448        "sched_setaffinity" => Some(libc::SYS_sched_setaffinity),
1449        // Resource limits
1450        #[cfg(target_arch = "x86_64")]
1451        "setrlimit" => Some(libc::SYS_setrlimit),
1452        "getpriority" => Some(libc::SYS_getpriority),
1453        "setpriority" => Some(libc::SYS_setpriority),
1454        "ioprio_set" => Some(libc::SYS_ioprio_set),
1455        "ioprio_get" => Some(libc::SYS_ioprio_get),
1456        // Futex (non-default)
1457        "futex_waitv" => Some(libc::SYS_futex_waitv),
1458        "futex_wake" => Some(454),
1459        "futex_wait" => Some(455),
1460        "futex_requeue" => Some(456),
1461        // Kernel modules
1462        "init_module" => Some(libc::SYS_init_module),
1463        "finit_module" => Some(libc::SYS_finit_module),
1464        "delete_module" => Some(libc::SYS_delete_module),
1465        // eBPF and performance
1466        "bpf" => Some(libc::SYS_bpf),
1467        "perf_event_open" => Some(libc::SYS_perf_event_open),
1468        // Seccomp
1469        "seccomp" => Some(libc::SYS_seccomp),
1470        // Userfaultfd
1471        "userfaultfd" => Some(libc::SYS_userfaultfd),
1472        // Mount (non-default)
1473        "mount" => Some(libc::SYS_mount),
1474        "umount2" => Some(libc::SYS_umount2),
1475        "pivot_root" => Some(libc::SYS_pivot_root),
1476        "mount_setattr" => Some(libc::SYS_mount_setattr),
1477        "open_tree" => Some(libc::SYS_open_tree),
1478        "open_tree_attr" => Some(467),
1479        "move_mount" => Some(libc::SYS_move_mount),
1480        "fsopen" => Some(libc::SYS_fsopen),
1481        "fsconfig" => Some(libc::SYS_fsconfig),
1482        "fsmount" => Some(libc::SYS_fsmount),
1483        "fspick" => Some(libc::SYS_fspick),
1484        // Misc (non-default)
1485        "syslog" => Some(libc::SYS_syslog),
1486        "reboot" => Some(libc::SYS_reboot),
1487        "swapon" => Some(libc::SYS_swapon),
1488        "swapoff" => Some(libc::SYS_swapoff),
1489        "chroot" => Some(libc::SYS_chroot),
1490        "acct" => Some(libc::SYS_acct),
1491        "settimeofday" => Some(libc::SYS_settimeofday),
1492        "sethostname" => Some(libc::SYS_sethostname),
1493        "setdomainname" => Some(libc::SYS_setdomainname),
1494        "adjtimex" => Some(libc::SYS_adjtimex),
1495        #[cfg(target_arch = "x86_64")]
1496        "modify_ldt" => Some(libc::SYS_modify_ldt),
1497        #[cfg(target_arch = "x86_64")]
1498        "iopl" => Some(libc::SYS_iopl),
1499        #[cfg(target_arch = "x86_64")]
1500        "ioperm" => Some(libc::SYS_ioperm),
1501        "quotactl" => Some(libc::SYS_quotactl),
1502        "quotactl_fd" => Some(libc::SYS_quotactl_fd),
1503        "personality" => Some(libc::SYS_personality),
1504        "vhangup" => Some(libc::SYS_vhangup),
1505        #[cfg(target_arch = "x86_64")]
1506        "ustat" => Some(libc::SYS_ustat),
1507        #[cfg(target_arch = "x86_64")]
1508        "sysfs" => Some(libc::SYS_sysfs),
1509        "mknod" => Some(libc::SYS_mknod),
1510        "mknodat" => Some(libc::SYS_mknodat),
1511        "migrate_pages" => Some(libc::SYS_migrate_pages),
1512        "move_pages" => Some(libc::SYS_move_pages),
1513        #[cfg(target_arch = "x86_64")]
1514        "kexec_load" => Some(libc::SYS_kexec_load),
1515        "kexec_file_load" => Some(libc::SYS_kexec_file_load),
1516        // POSIX message queues
1517        "mq_open" => Some(libc::SYS_mq_open),
1518        "mq_unlink" => Some(libc::SYS_mq_unlink),
1519        "mq_timedsend" => Some(libc::SYS_mq_timedsend),
1520        "mq_timedreceive" => Some(libc::SYS_mq_timedreceive),
1521        "mq_notify" => Some(libc::SYS_mq_notify),
1522        "mq_getsetattr" => Some(libc::SYS_mq_getsetattr),
1523        // Keyring
1524        "add_key" => Some(libc::SYS_add_key),
1525        "request_key" => Some(libc::SYS_request_key),
1526        "keyctl" => Some(libc::SYS_keyctl),
1527        // IO pgetevents
1528        "io_pgetevents" => Some(333),
1529        // LSM
1530        "lsm_get_self_attr" => Some(459),
1531        "lsm_set_self_attr" => Some(460),
1532        "lsm_list_modules" => Some(461),
1533        #[cfg(target_arch = "x86_64")]
1534        "lookup_dcookie" => Some(libc::SYS_lookup_dcookie),
1535        "uretprobe" => Some(335),
1536        _ => None,
1537    }
1538}
1539
1540impl Default for SeccompManager {
1541    fn default() -> Self {
1542        Self::new()
1543    }
1544}
1545
1546#[cfg(test)]
1547mod tests {
1548    use super::*;
1549
1550    #[test]
1551    fn test_seccomp_manager_initial_state() {
1552        let mgr = SeccompManager::new();
1553        assert!(!mgr.is_applied());
1554    }
1555
1556    #[test]
1557    fn test_apply_idempotent() {
1558        let mgr = SeccompManager::new();
1559        // Note: We can't actually test application in unit tests
1560        // as it would affect the test process itself
1561        // This is tested in integration tests instead
1562        assert!(!mgr.is_applied());
1563    }
1564
1565    #[test]
1566    fn test_clone_denied_flags_include_newcgroup() {
1567        assert_ne!(
1568            DENIED_CLONE_NAMESPACE_FLAGS & libc::CLONE_NEWCGROUP as u64,
1569            0
1570        );
1571    }
1572
1573    #[test]
1574    fn test_clone_denied_flags_include_newtime() {
1575        assert_ne!(
1576            DENIED_CLONE_NAMESPACE_FLAGS & libc::CLONE_NEWTIME as u64,
1577            0,
1578            "CLONE_NEWTIME must be in denied clone namespace flags"
1579        );
1580    }
1581
1582    #[test]
1583    fn test_network_none_socket_domains_are_unix_only() {
1584        let domains = SeccompManager::allowed_socket_domains(false);
1585        assert_eq!(domains, vec![libc::AF_UNIX]);
1586    }
1587
1588    #[test]
1589    fn test_network_enabled_socket_domains_exclude_netlink() {
1590        let domains = SeccompManager::allowed_socket_domains(true);
1591        assert!(domains.contains(&libc::AF_UNIX));
1592        assert!(domains.contains(&libc::AF_INET));
1593        assert!(domains.contains(&libc::AF_INET6));
1594        assert!(!domains.contains(&libc::AF_NETLINK));
1595    }
1596
1597    #[test]
1598    fn test_network_mode_syscalls_only_enabled_when_network_allowed() {
1599        let none = SeccompManager::network_mode_syscalls(false);
1600        assert!(none.is_empty());
1601
1602        let enabled = SeccompManager::network_mode_syscalls(true);
1603        assert!(enabled.contains(&libc::SYS_connect));
1604        assert!(enabled.contains(&libc::SYS_bind));
1605        assert!(enabled.contains(&libc::SYS_listen));
1606        assert!(enabled.contains(&libc::SYS_accept));
1607        assert!(enabled.contains(&libc::SYS_setsockopt));
1608    }
1609
1610    #[test]
1611    fn test_landlock_bootstrap_syscalls_present_in_base_allowlist() {
1612        let base = SeccompManager::base_allowed_syscalls();
1613        assert!(base.contains(&libc::SYS_landlock_create_ruleset));
1614        assert!(base.contains(&libc::SYS_landlock_add_rule));
1615        assert!(base.contains(&libc::SYS_landlock_restrict_self));
1616    }
1617
1618    #[test]
1619    fn test_x32_legacy_range_not_allowlisted() {
1620        let base = SeccompManager::base_allowed_syscalls();
1621        let net = SeccompManager::network_mode_syscalls(true);
1622        for nr in 512_i64..=547_i64 {
1623            assert!(
1624                !base.contains(&nr) && !net.contains(&nr),
1625                "x32 syscall number {} unexpectedly allowlisted",
1626                nr
1627            );
1628        }
1629    }
1630
1631    #[test]
1632    fn test_i386_compat_socketcall_range_not_allowlisted() {
1633        let base = SeccompManager::base_allowed_syscalls();
1634        let net = SeccompManager::network_mode_syscalls(true);
1635        // i386 compat per syscall_32.tbl: socket..shutdown live at 359..373.
1636        // On x86_64 these numbers are outside our native allowlist surface.
1637        for nr in 359_i64..=373_i64 {
1638            assert!(
1639                !base.contains(&nr) && !net.contains(&nr),
1640                "i386 compat syscall number {} unexpectedly allowlisted",
1641                nr
1642            );
1643        }
1644    }
1645
1646    #[test]
1647    fn test_minimal_filter_allowlist_counts_are_stable() {
1648        let base = SeccompManager::base_allowed_syscalls();
1649        let net = SeccompManager::network_mode_syscalls(true);
1650
1651        // Snapshot counts to catch unintended policy drift.
1652        // +8 accounts for conditional rules inserted in minimal_filter():
1653        // socket/ioctl/prctl/prlimit64/mprotect/clone/clone3/execveat.
1654        // fork removed (forces through filtered clone path).
1655        // execveat removed from base (arg-filtered separately).
1656        // sysinfo removed (L8: leaks host info).
1657        // prlimit64 moved to arg-filtered (M3).
1658        assert_eq!(base.len(), 173);
1659        assert_eq!(net.len(), 11);
1660        assert_eq!(base.len() + 8, 181);
1661        assert_eq!(base.len() + net.len() + 8, 192);
1662    }
1663
1664    #[test]
1665    fn test_arg_filtered_syscalls_list_includes_critical_syscalls() {
1666        // These syscalls must be in the arg-filtered list so custom profiles
1667        // get warnings when they allow them without filters.
1668        for name in &["clone", "clone3", "execveat", "ioctl", "prctl", "socket"] {
1669            assert!(
1670                SeccompManager::ARG_FILTERED_SYSCALLS.contains(name),
1671                "'{}' must be in ARG_FILTERED_SYSCALLS",
1672                name
1673            );
1674        }
1675    }
1676
1677    #[test]
1678    fn test_clone3_allowed_in_minimal_filter() {
1679        // clone3 MUST be in the BPF rules map – glibc 2.34+ and newer musl
1680        // use clone3 internally for posix_spawn/fork. Blocking it breaks
1681        // std::process::Command on modern systems. Namespace creation is
1682        // prevented by dropped capabilities (CAP_SYS_ADMIN etc.), not seccomp.
1683        let rules = SeccompManager::minimal_filter(true, &[]).unwrap();
1684        assert!(
1685            rules.contains_key(&libc::SYS_clone3),
1686            "clone3 must be in the seccomp allowlist (glibc 2.34+ requires it)"
1687        );
1688    }
1689
1690    #[test]
1691    fn test_clone_is_allowed_with_arg_filter() {
1692        // clone (not clone3) should still be in the rules with arg filtering
1693        let rules = SeccompManager::minimal_filter(true, &[]).unwrap();
1694        assert!(
1695            rules.contains_key(&libc::SYS_clone),
1696            "clone must be in the seccomp allowlist with arg filters"
1697        );
1698    }
1699
1700    #[test]
1701    fn test_high_risk_syscalls_removed_from_base_allowlist() {
1702        let base = SeccompManager::base_allowed_syscalls();
1703        // chown/fchown/lchown/fchownat: allowed – safe after CAP_CHOWN/CAP_FOWNER drop
1704        // mlock/munlock: allowed – needed by databases, bounded by RLIMIT_MEMLOCK
1705        let removed = [
1706            libc::SYS_sync,
1707            libc::SYS_syncfs,
1708            libc::SYS_mincore,
1709            libc::SYS_vfork,
1710            libc::SYS_tkill,
1711            // io_uring: large attack surface, many CVEs – require custom profile
1712            libc::SYS_io_uring_setup,
1713            libc::SYS_io_uring_enter,
1714            libc::SYS_io_uring_register,
1715        ];
1716
1717        for syscall in removed {
1718            assert!(
1719                !base.contains(&syscall),
1720                "syscall {} unexpectedly present in base allowlist",
1721                syscall
1722            );
1723        }
1724    }
1725
1726    #[test]
1727    fn test_custom_profile_preserves_clone_arg_filters() {
1728        // SEC-01: Custom seccomp profiles that allow "clone" must still get
1729        // argument-level filtering to block namespace-creating flags.
1730        // Verify by inspecting the built-in filter rules that serve as the
1731        // merge source for apply_profile_from_file.
1732        let rules = SeccompManager::minimal_filter(true, &[]).unwrap();
1733
1734        // Every ARG_FILTERED_SYSCALLS entry (except clone3, which is allowed
1735        // unconditionally since BPF can't inspect its struct-based flags) must
1736        // have non-empty argument-level rules in the built-in filter so that
1737        // apply_profile_from_file can merge them.
1738        for name in SeccompManager::ARG_FILTERED_SYSCALLS {
1739            if *name == "clone3" {
1740                // clone3 is allowed unconditionally – BPF cannot dereference
1741                // the clone_args struct, so arg filtering is impossible.
1742                // Namespace defense relies on dropped capabilities.
1743                continue;
1744            }
1745            if let Some(nr) = syscall_name_to_number(name) {
1746                let entry = rules.get(&nr);
1747                assert!(
1748                    entry.is_some() && !entry.unwrap().is_empty(),
1749                    "built-in filter must have argument-level rules for '{}' \
1750                     so apply_profile_from_file can merge them into custom profiles",
1751                    name
1752                );
1753            }
1754        }
1755    }
1756
1757    #[test]
1758    fn test_memfd_create_not_in_default_allowlist() {
1759        // SEC-02: memfd_create enables fileless code execution when combined with execveat.
1760        let base = SeccompManager::base_allowed_syscalls();
1761        assert!(
1762            !base.contains(&libc::SYS_memfd_create),
1763            "memfd_create must not be in the default seccomp allowlist (fileless exec risk)"
1764        );
1765        // Also verify it's not sneaked into the compiled filter rules
1766        let rules = SeccompManager::minimal_filter(true, &[]).unwrap();
1767        assert!(
1768            !rules.contains_key(&libc::SYS_memfd_create),
1769            "memfd_create must not be in the compiled seccomp filter rules"
1770        );
1771    }
1772
1773    #[test]
1774    fn test_mprotect_has_arg_filtering() {
1775        // SEC-03: mprotect must have argument-level filtering to prevent W^X
1776        // (PROT_WRITE|PROT_EXEC) violations. Verify via runtime data structures.
1777
1778        // mprotect must NOT be in the unconditional base allowlist
1779        let base = SeccompManager::base_allowed_syscalls();
1780        assert!(
1781            !base.contains(&libc::SYS_mprotect),
1782            "SYS_mprotect must not be unconditionally allowed - needs arg filtering"
1783        );
1784
1785        // mprotect must be present in the compiled filter with non-empty
1786        // argument conditions (the conditions enforce W^X)
1787        let rules = SeccompManager::minimal_filter(true, &[]).unwrap();
1788        let mprotect_rules = rules.get(&libc::SYS_mprotect);
1789        assert!(
1790            mprotect_rules.is_some(),
1791            "mprotect must be present in the seccomp filter rules"
1792        );
1793        assert!(
1794            !mprotect_rules.unwrap().is_empty(),
1795            "mprotect must have argument-level conditions to prevent W^X violations"
1796        );
1797    }
1798
1799    #[test]
1800    fn test_unsafe_blocks_have_safety_comments() {
1801        // SEC-08: All unsafe blocks must have // SAFETY: documentation
1802        let source = include_str!("seccomp.rs");
1803        let mut pos = 0;
1804        while let Some(idx) = source[pos..].find("unsafe {") {
1805            let abs_idx = pos + idx;
1806            // Check that there's a SAFETY comment within 200 chars before the unsafe block
1807            let start = abs_idx.saturating_sub(200);
1808            let context = &source[start..abs_idx];
1809            assert!(
1810                context.contains("SAFETY:"),
1811                "unsafe block at byte {} must have a // SAFETY: comment. Context: ...{}...",
1812                abs_idx,
1813                &source[abs_idx.saturating_sub(80)..abs_idx + 10]
1814            );
1815            pos = abs_idx + 1;
1816        }
1817    }
1818
1819    // --- H-1: mprotect MaskedEq logic verification ---
1820    //
1821    // The mprotect filter uses MaskedEq((PROT_WRITE | PROT_EXEC), value) to
1822    // allow only combinations where the W|X bits match one of {0, W, X}.
1823    // These tests prove the logic is correct without installing a real
1824    // seccomp filter (which would affect the test process).
1825
1826    /// Helper: simulates the MaskedEq check that the seccomp BPF would perform.
1827    /// Returns true if the prot value would be ALLOWED by one of the rules.
1828    fn mprotect_would_allow(prot: u64) -> bool {
1829        let mask = (libc::PROT_WRITE | libc::PROT_EXEC) as u64;
1830        let allowed_values: &[u64] = &[0, libc::PROT_WRITE as u64, libc::PROT_EXEC as u64];
1831        let masked = prot & mask;
1832        allowed_values.contains(&masked)
1833    }
1834
1835    #[test]
1836    fn test_mprotect_allows_prot_none() {
1837        assert!(mprotect_would_allow(0), "PROT_NONE must be allowed");
1838    }
1839
1840    #[test]
1841    fn test_mprotect_allows_prot_read_only() {
1842        assert!(
1843            mprotect_would_allow(libc::PROT_READ as u64),
1844            "PROT_READ must be allowed (W|X bits are 0)"
1845        );
1846    }
1847
1848    #[test]
1849    fn test_mprotect_allows_prot_read_write() {
1850        assert!(
1851            mprotect_would_allow((libc::PROT_READ | libc::PROT_WRITE) as u64),
1852            "PROT_READ|PROT_WRITE must be allowed"
1853        );
1854    }
1855
1856    #[test]
1857    fn test_mprotect_allows_prot_read_exec() {
1858        assert!(
1859            mprotect_would_allow((libc::PROT_READ | libc::PROT_EXEC) as u64),
1860            "PROT_READ|PROT_EXEC must be allowed"
1861        );
1862    }
1863
1864    #[test]
1865    fn test_mprotect_rejects_prot_write_exec() {
1866        assert!(
1867            !mprotect_would_allow((libc::PROT_WRITE | libc::PROT_EXEC) as u64),
1868            "PROT_WRITE|PROT_EXEC (W^X violation) must be REJECTED"
1869        );
1870    }
1871
1872    #[test]
1873    fn test_mprotect_rejects_prot_read_write_exec() {
1874        assert!(
1875            !mprotect_would_allow((libc::PROT_READ | libc::PROT_WRITE | libc::PROT_EXEC) as u64),
1876            "PROT_READ|PROT_WRITE|PROT_EXEC (W^X violation) must be REJECTED"
1877        );
1878    }
1879
1880    #[test]
1881    fn test_mprotect_allows_prot_write_alone() {
1882        assert!(
1883            mprotect_would_allow(libc::PROT_WRITE as u64),
1884            "PROT_WRITE alone must be allowed"
1885        );
1886    }
1887
1888    #[test]
1889    fn test_mprotect_allows_prot_exec_alone() {
1890        assert!(
1891            mprotect_would_allow(libc::PROT_EXEC as u64),
1892            "PROT_EXEC alone must be allowed"
1893        );
1894    }
1895
1896    // --- Extra syscall allowlist tests ---
1897
1898    #[test]
1899    fn test_extra_syscalls_are_merged_into_filter() {
1900        let extra = vec!["io_uring_setup".to_string(), "sysinfo".to_string()];
1901        let rules = SeccompManager::minimal_filter(true, &extra).unwrap();
1902        assert!(
1903            rules.contains_key(&libc::SYS_io_uring_setup),
1904            "io_uring_setup must be in filter when requested via extra_syscalls"
1905        );
1906        assert!(
1907            rules.contains_key(&libc::SYS_sysinfo),
1908            "sysinfo must be in filter when requested via extra_syscalls"
1909        );
1910    }
1911
1912    #[test]
1913    fn test_extra_syscalls_do_not_override_arg_filtered() {
1914        // If a user requests "clone" via extra_syscalls, the arg-filtered
1915        // version from the built-in filter should still be present (not
1916        // replaced with an unconditional allow).
1917        let extra = vec!["clone".to_string()];
1918        let rules = SeccompManager::minimal_filter(true, &extra).unwrap();
1919        let clone_rules = rules.get(&libc::SYS_clone);
1920        assert!(
1921            clone_rules.is_some() && !clone_rules.unwrap().is_empty(),
1922            "clone must retain argument-level filtering even when in extra_syscalls"
1923        );
1924    }
1925
1926    #[test]
1927    fn test_extra_syscalls_unknown_name_is_warned_and_skipped() {
1928        // Unknown syscall names emit a WARN and are skipped (not fatal)
1929        let extra = vec!["not_a_real_syscall".to_string()];
1930        let result = SeccompManager::minimal_filter(true, &extra);
1931        assert!(
1932            result.is_ok(),
1933            "Unknown syscall name should warn and skip, not error"
1934        );
1935    }
1936
1937    #[test]
1938    fn test_extra_syscalls_empty_is_noop() {
1939        let rules_without = SeccompManager::minimal_filter(true, &[]).unwrap();
1940        let rules_with = SeccompManager::minimal_filter(true, &[]).unwrap();
1941        assert_eq!(rules_without.len(), rules_with.len());
1942    }
1943
1944    #[test]
1945    fn test_extra_syscalls_duplicate_of_default_is_harmless() {
1946        // Requesting a syscall that's already in the default allowlist should work fine
1947        let extra = vec!["read".to_string()];
1948        let rules = SeccompManager::minimal_filter(true, &extra).unwrap();
1949        assert!(rules.contains_key(&libc::SYS_read));
1950    }
1951
1952    #[test]
1953    fn test_extra_syscalls_blocked_known_syscall_not_added() {
1954        // A known syscall that is NOT in OPT_IN_SYSCALLS must be blocked
1955        // (not added to the filter rules). E.g. kexec_load, bpf, ptrace.
1956        let extra = vec!["kexec_load".to_string()];
1957        let rules = SeccompManager::minimal_filter(true, &extra).unwrap();
1958        assert!(
1959            !rules.contains_key(&libc::SYS_kexec_load),
1960            "kexec_load must be blocked even when requested via --seccomp-allow"
1961        );
1962    }
1963
1964    #[test]
1965    fn test_extra_syscalls_unshare_remains_blocked() {
1966        let extra = vec!["unshare".to_string()];
1967        let rules = SeccompManager::minimal_filter(true, &extra).unwrap();
1968        assert!(
1969            !rules.contains_key(&libc::SYS_unshare),
1970            "unshare must stay blocked even when requested via --seccomp-allow"
1971        );
1972    }
1973
1974    #[test]
1975    fn test_extra_syscalls_opt_in_syscall_is_added() {
1976        // Syscalls in OPT_IN_SYSCALLS must be added when requested
1977        let extra = vec!["io_uring_setup".to_string()];
1978        let rules = SeccompManager::minimal_filter(true, &extra).unwrap();
1979        assert!(
1980            rules.contains_key(&libc::SYS_io_uring_setup),
1981            "io_uring_setup is in OPT_IN_SYSCALLS and must be added"
1982        );
1983    }
1984}