Skip to main content

nucleus/security/
seccomp.rs

1use crate::error::{NucleusError, Result};
2use crate::security::policy::sha256_hex;
3use seccompiler::{BpfProgram, SeccompAction, SeccompCondition, SeccompFilter, SeccompRule};
4use std::collections::BTreeMap;
5use std::path::Path;
6use tracing::{debug, info, warn};
7
/// Seccomp filter manager
///
/// Implements syscall whitelisting for the security state machine
/// (NucleusSecurity_Seccomp_SeccompEnforcement.tla)
pub struct SeccompManager {
    /// `true` once a filter has been installed successfully. The apply methods
    /// check this flag first and return early instead of installing a second
    /// filter.
    applied: bool,
}
15
16const DENIED_CLONE_NAMESPACE_FLAGS: u64 = (libc::CLONE_NEWUSER
17    | libc::CLONE_NEWNS
18    | libc::CLONE_NEWNET
19    | libc::CLONE_NEWIPC
20    | libc::CLONE_NEWUTS
21    | libc::CLONE_NEWPID
22    | libc::CLONE_NEWCGROUP
23    | libc::CLONE_NEWTIME) as u64;
24
25impl SeccompManager {
26    pub fn new() -> Self {
27        Self { applied: false }
28    }
29
30    fn base_allowed_syscalls() -> Vec<i64> {
31        let mut syscalls = vec![
32            // File I/O
33            libc::SYS_read,
34            libc::SYS_write,
35            libc::SYS_openat,
36            libc::SYS_close,
37            libc::SYS_fstat,
38            libc::SYS_lseek,
39            libc::SYS_fcntl,
40            libc::SYS_readv,
41            libc::SYS_writev,
42            libc::SYS_preadv,
43            libc::SYS_pwritev,
44            libc::SYS_preadv2,
45            libc::SYS_pwritev2,
46            libc::SYS_pread64,
47            libc::SYS_pwrite64,
48            libc::SYS_readlinkat,
49            libc::SYS_newfstatat,
50            libc::SYS_statx,
51            libc::SYS_faccessat,
52            libc::SYS_faccessat2,
53            libc::SYS_dup,
54            libc::SYS_dup3,
55            libc::SYS_pipe2,
56            libc::SYS_unlinkat,
57            libc::SYS_renameat,
58            libc::SYS_renameat2,
59            libc::SYS_linkat,
60            libc::SYS_symlinkat,
61            libc::SYS_fchmod,
62            libc::SYS_fchmodat,
63            libc::SYS_truncate,
64            libc::SYS_ftruncate,
65            libc::SYS_fallocate,
66            #[cfg(target_arch = "x86_64")]
67            libc::SYS_fadvise64,
68            libc::SYS_fsync,
69            libc::SYS_fdatasync,
70            libc::SYS_sync_file_range,
71            libc::SYS_flock,
72            libc::SYS_fstatfs,
73            libc::SYS_statfs,
74            #[cfg(target_arch = "x86_64")]
75            libc::SYS_sendfile,
76            libc::SYS_copy_file_range,
77            libc::SYS_splice,
78            libc::SYS_tee,
79            // Memory management
80            libc::SYS_mmap,
81            libc::SYS_munmap,
82            libc::SYS_brk,
83            libc::SYS_mremap,
84            libc::SYS_madvise,
85            libc::SYS_msync,
86            libc::SYS_mlock,
87            libc::SYS_munlock,
88            libc::SYS_mlock2,
89            // SysV shared memory – used by PostgreSQL, Redis, and many databases
90            // for shared buffer pools. Safe in PID/IPC namespaces (isolated keyspace).
91            libc::SYS_shmget,
92            libc::SYS_shmat,
93            libc::SYS_shmdt,
94            libc::SYS_shmctl,
95            // POSIX semaphores (used by PostgreSQL for lightweight locking)
96            libc::SYS_semget,
97            libc::SYS_semop,
98            libc::SYS_semctl,
99            libc::SYS_semtimedop,
100            // Process management
101            // fork intentionally excluded – modern glibc/musl use clone(), which
102            // has namespace-flag filtering. Removing SYS_fork forces all forks
103            // through the filtered clone path (defense-in-depth against fork bombs
104            // and unfiltered namespace creation).
105            libc::SYS_execve,
106            // execveat is conditionally allowed below (AT_EMPTY_PATH blocked)
107            libc::SYS_wait4,
108            libc::SYS_waitid,
109            libc::SYS_exit,
110            libc::SYS_exit_group,
111            libc::SYS_getpid,
112            libc::SYS_gettid,
113            libc::SYS_getuid,
114            libc::SYS_getgid,
115            libc::SYS_geteuid,
116            libc::SYS_getegid,
117            libc::SYS_getppid,
118            libc::SYS_setsid,
119            libc::SYS_getgroups,
120            // Signals
121            libc::SYS_rt_sigaction,
122            libc::SYS_rt_sigprocmask,
123            libc::SYS_rt_sigreturn,
124            libc::SYS_rt_sigsuspend,
125            libc::SYS_rt_sigtimedwait,
126            libc::SYS_rt_sigpending,
127            libc::SYS_rt_sigqueueinfo,
128            libc::SYS_sigaltstack,
129            libc::SYS_restart_syscall,
130            // L7: kill/tgkill are safe when PID namespace is active (container
131            // can only signal its own processes). If PID namespace creation fails,
132            // the runtime aborts, so this is safe.
133            libc::SYS_kill,
134            libc::SYS_tgkill,
135            // Time and timers
136            libc::SYS_clock_gettime,
137            libc::SYS_clock_getres,
138            libc::SYS_clock_nanosleep,
139            libc::SYS_gettimeofday,
140            libc::SYS_nanosleep,
141            libc::SYS_setitimer,
142            libc::SYS_getitimer,
143            // Directories
144            libc::SYS_getcwd,
145            libc::SYS_chdir,
146            libc::SYS_fchdir,
147            libc::SYS_mkdirat,
148            libc::SYS_getdents64,
149            // Misc
150            libc::SYS_uname,
151            libc::SYS_getrandom,
152            libc::SYS_futex,
153            libc::SYS_set_tid_address,
154            libc::SYS_set_robust_list,
155            libc::SYS_get_robust_list,
156            // L8: sysinfo removed – leaks host RAM, uptime, and process count.
157            // Applications needing this info should use /proc/meminfo instead.
158            libc::SYS_umask,
159            // prlimit64 moved to arg-filtered section (M3)
160            libc::SYS_getrusage,
161            libc::SYS_times,
162            libc::SYS_sched_yield,
163            libc::SYS_sched_getaffinity,
164            libc::SYS_sched_setaffinity,
165            libc::SYS_sched_getparam,
166            libc::SYS_sched_getscheduler,
167            libc::SYS_getcpu,
168            // Extended attributes – read-only queries, safe
169            libc::SYS_getxattr,
170            libc::SYS_lgetxattr,
171            libc::SYS_fgetxattr,
172            libc::SYS_listxattr,
173            libc::SYS_llistxattr,
174            libc::SYS_flistxattr,
175            libc::SYS_rseq,
176            libc::SYS_close_range,
177            // Ownership – safe after capability drop (CAP_CHOWN/CAP_FOWNER gone;
178            // operations on files not owned by the container UID will EPERM).
179            libc::SYS_fchown,
180            libc::SYS_fchownat,
181            // Legacy AIO – used by databases and storage engines. Operations are
182            // bounded by the process's existing fd permissions.
183            libc::SYS_io_setup,
184            libc::SYS_io_destroy,
185            libc::SYS_io_submit,
186            libc::SYS_io_getevents,
187            // NOTE: io_uring intentionally excluded from defaults – large kernel
188            // attack surface with a history of CVEs. Applications needing io_uring
189            // (e.g. PostgreSQL 18+ io_method=io_uring) should use a custom seccomp
190            // profile that adds io_uring_setup/io_uring_enter/io_uring_register.
191            // Process groups – safe in PID namespace (can only affect own pgrp).
192            libc::SYS_setpgid,
193            libc::SYS_getpgid,
194            // NOTE: memfd_create intentionally excluded – combined with execveat
195            // it enables fileless code execution bypassing all FS controls (SEC-02).
196            // Landlock bootstrap (runtime applies seccomp before Landlock)
197            libc::SYS_landlock_create_ruleset,
198            libc::SYS_landlock_add_rule,
199            libc::SYS_landlock_restrict_self,
200            // Socket/Network (safe introspection + local socketpair)
201            libc::SYS_getsockname,
202            libc::SYS_getpeername,
203            libc::SYS_socketpair,
204            libc::SYS_getsockopt,
205            // Poll/Select
206            libc::SYS_ppoll,
207            libc::SYS_pselect6,
208            libc::SYS_epoll_create1,
209            libc::SYS_epoll_ctl,
210            libc::SYS_epoll_pwait,
211            libc::SYS_eventfd2,
212            libc::SYS_signalfd4,
213            libc::SYS_timerfd_create,
214            libc::SYS_timerfd_settime,
215            libc::SYS_timerfd_gettime,
216        ];
217
218        // Legacy syscalls only available on x86_64 (aarch64 only has the *at variants)
219        #[cfg(target_arch = "x86_64")]
220        syscalls.extend_from_slice(&[
221            libc::SYS_open,
222            libc::SYS_stat,
223            libc::SYS_lstat,
224            libc::SYS_access,
225            libc::SYS_readlink,
226            libc::SYS_dup2,
227            libc::SYS_pipe,
228            libc::SYS_unlink,
229            libc::SYS_rename,
230            libc::SYS_link,
231            libc::SYS_symlink,
232            libc::SYS_chmod,
233            libc::SYS_mkdir,
234            libc::SYS_rmdir,
235            libc::SYS_getdents,
236            libc::SYS_getpgrp,
237            libc::SYS_chown,
238            libc::SYS_fchown,
239            libc::SYS_lchown,
240            libc::SYS_arch_prctl,
241            libc::SYS_getrlimit,
242            libc::SYS_poll,
243            libc::SYS_select,
244            libc::SYS_epoll_create,
245            libc::SYS_epoll_wait,
246            libc::SYS_eventfd,
247            libc::SYS_signalfd,
248        ]);
249
250        syscalls
251    }
252
253    fn allowed_socket_domains(allow_network: bool) -> Vec<i32> {
254        if allow_network {
255            vec![libc::AF_UNIX, libc::AF_INET, libc::AF_INET6]
256        } else {
257            vec![libc::AF_UNIX]
258        }
259    }
260
261    fn network_mode_syscalls(allow_network: bool) -> Vec<i64> {
262        if allow_network {
263            vec![
264                libc::SYS_connect,
265                libc::SYS_sendto,
266                libc::SYS_recvfrom,
267                libc::SYS_sendmsg,
268                libc::SYS_recvmsg,
269                libc::SYS_shutdown,
270                libc::SYS_bind,
271                libc::SYS_listen,
272                libc::SYS_accept,
273                libc::SYS_accept4,
274                libc::SYS_setsockopt,
275            ]
276        } else {
277            Vec::new()
278        }
279    }
280
281    /// Get minimal syscall whitelist for basic container operation
282    ///
283    /// This is a restrictive whitelist that blocks dangerous syscalls:
284    /// - ptrace (process tracing)
285    /// - kexec_load (kernel loading)
286    /// - add_key, request_key, keyctl (kernel keyring)
287    /// - bpf (eBPF programs)
288    /// - perf_event_open (performance monitoring)
289    /// - userfaultfd (user fault handling)
290    fn minimal_filter(
291        allow_network: bool,
292        extra_syscalls: &[String],
293    ) -> Result<BTreeMap<i64, Vec<SeccompRule>>> {
294        let mut rules: BTreeMap<i64, Vec<SeccompRule>> = BTreeMap::new();
295
296        // Essential syscalls for basic operation
297        let allowed_syscalls = Self::base_allowed_syscalls();
298
299        // Allow all these syscalls unconditionally
300        for syscall in allowed_syscalls {
301            rules.insert(syscall, Vec::new());
302        }
303
304        // Add network-mode-specific syscalls
305        for syscall in Self::network_mode_syscalls(allow_network) {
306            rules.insert(syscall, Vec::new());
307        }
308
309        // Add user-requested extra syscalls (--seccomp-allow).
310        // - Already in default/arg-filtered: silently accepted (no-op).
311        // - In OPT_IN_SYSCALLS: added to allowlist.
312        // - Known but not opt-in: WARN and blocked (defense-in-depth).
313        // - Unknown name: WARN and blocked.
314        for name in extra_syscalls {
315            if let Some(nr) = syscall_name_to_number(name) {
316                if rules.contains_key(&nr) {
317                    // Already allowed by default or arg-filtered – no-op.
318                } else if Self::OPT_IN_SYSCALLS.contains(&name.as_str()) {
319                    rules.insert(nr, Vec::new());
320                } else {
321                    warn!(
322                        "--seccomp-allow: syscall '{}' is not in the opt-in allowlist – blocked",
323                        name
324                    );
325                }
326            } else {
327                warn!("--seccomp-allow: unknown syscall '{}' – blocked", name);
328            }
329        }
330
331        // Restrict socket() domains by network mode.
332        // none: AF_UNIX only; network-enabled: AF_UNIX/AF_INET/AF_INET6.
333        let mut socket_rules = Vec::new();
334        for domain in Self::allowed_socket_domains(allow_network) {
335            let condition = SeccompCondition::new(
336                0, // arg0 is socket(domain, type, protocol)
337                seccompiler::SeccompCmpArgLen::Dword,
338                seccompiler::SeccompCmpOp::Eq,
339                domain as u64,
340            )
341            .map_err(|e| {
342                NucleusError::SeccompError(format!(
343                    "Failed to create socket domain condition: {}",
344                    e
345                ))
346            })?;
347            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
348                NucleusError::SeccompError(format!("Failed to create socket rule: {}", e))
349            })?;
350            socket_rules.push(rule);
351        }
352        rules.insert(libc::SYS_socket, socket_rules);
353
354        // ioctl: allow only safe terminal operations (arg0 = request code)
355        let ioctl_allowed: &[u64] = &[
356            0x5401, // TCGETS
357            0x5402, // TCSETS
358            0x5403, // TCSETSW
359            0x5404, // TCSETSF
360            0x540B, // TCFLSH
361            0x540F, // TIOCGPGRP
362            0x5410, // TIOCSPGRP
363            0x5413, // TIOCGWINSZ
364            0x5429, // TIOCGSID
365            0x541B, // FIONREAD
366            0x5421, // M12: FIONBIO – allowed because fcntl(F_SETFL, O_NONBLOCK)
367            // achieves the same result and is already permitted. Blocking
368            // FIONBIO only breaks tokio/mio for no security gain.
369            0x5451, // FIOCLEX
370            0x5450, // FIONCLEX
371        ];
372        let mut ioctl_rules = Vec::new();
373        for &request in ioctl_allowed {
374            let condition = SeccompCondition::new(
375                1, // arg1 is the request code for ioctl(fd, request, ...)
376                seccompiler::SeccompCmpArgLen::Dword,
377                seccompiler::SeccompCmpOp::Eq,
378                request,
379            )
380            .map_err(|e| {
381                NucleusError::SeccompError(format!("Failed to create ioctl condition: {}", e))
382            })?;
383            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
384                NucleusError::SeccompError(format!("Failed to create ioctl rule: {}", e))
385            })?;
386            ioctl_rules.push(rule);
387        }
388        rules.insert(libc::SYS_ioctl, ioctl_rules);
389
390        // prctl: allow only safe operations (arg0 = option).
391        // Notably absent (hit default deny):
392        //   PR_CAPBSET_DROP (24) – could weaken the capability bounding set
393        //   PR_SET_SECUREBITS (28) – could disable secure-exec restrictions
394        let prctl_allowed: &[u64] = &[
395            1,  // PR_SET_PDEATHSIG
396            2,  // PR_GET_PDEATHSIG
397            15, // PR_SET_NAME
398            16, // PR_GET_NAME
399            23, // PR_CAPBSET_READ – glibc probes this at startup to discover
400            // cap_last_cap when /proc/sys is masked. Read-only, harmless
401            // after capabilities have been dropped.
402            27, // PR_GET_SECUREBITS – read-only query of securebits flags
403            36, // PR_SET_CHILD_SUBREAPER – safe, only affects own descendants
404            37, // PR_GET_CHILD_SUBREAPER
405            38, // PR_SET_NO_NEW_PRIVS
406            40, // PR_GET_TID_ADDRESS – read-only, returns thread ID address
407            47, // PR_CAP_AMBIENT – glibc probes ambient caps at startup (read-only
408            // IS_SET queries). Safe after caps are dropped.
409            39, // PR_GET_NO_NEW_PRIVS
410        ];
411        let mut prctl_rules = Vec::new();
412        for &option in prctl_allowed {
413            let condition = SeccompCondition::new(
414                0, // arg0 is the option for prctl(option, ...)
415                seccompiler::SeccompCmpArgLen::Dword,
416                seccompiler::SeccompCmpOp::Eq,
417                option,
418            )
419            .map_err(|e| {
420                NucleusError::SeccompError(format!("Failed to create prctl condition: {}", e))
421            })?;
422            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
423                NucleusError::SeccompError(format!("Failed to create prctl rule: {}", e))
424            })?;
425            prctl_rules.push(rule);
426        }
427        rules.insert(libc::SYS_prctl, prctl_rules);
428
429        // M3: prlimit64 – only allow GET (new_limit == NULL, i.e. arg2 == 0).
430        // SET operations could raise RLIMIT_NPROC to bypass fork-bomb protection.
431        let prlimit_condition = SeccompCondition::new(
432            2, // arg2 = new_limit pointer for prlimit64(pid, resource, new_limit, old_limit)
433            seccompiler::SeccompCmpArgLen::Qword,
434            seccompiler::SeccompCmpOp::Eq,
435            0u64, // new_limit == NULL means GET-only
436        )
437        .map_err(|e| {
438            NucleusError::SeccompError(format!("Failed to create prlimit64 condition: {}", e))
439        })?;
440        let prlimit_rule = SeccompRule::new(vec![prlimit_condition]).map_err(|e| {
441            NucleusError::SeccompError(format!("Failed to create prlimit64 rule: {}", e))
442        })?;
443        rules.insert(libc::SYS_prlimit64, vec![prlimit_rule]);
444
445        // mprotect: permit RW or RX transitions, but reject PROT_WRITE|PROT_EXEC.
446        let mut mprotect_rules = Vec::new();
447        for allowed in [0, libc::PROT_WRITE as u64, libc::PROT_EXEC as u64] {
448            let condition = SeccompCondition::new(
449                2, // arg2 is prot for mprotect(addr, len, prot)
450                seccompiler::SeccompCmpArgLen::Dword,
451                seccompiler::SeccompCmpOp::MaskedEq((libc::PROT_WRITE | libc::PROT_EXEC) as u64),
452                allowed,
453            )
454            .map_err(|e| {
455                NucleusError::SeccompError(format!("Failed to create mprotect condition: {}", e))
456            })?;
457            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
458                NucleusError::SeccompError(format!("Failed to create mprotect rule: {}", e))
459            })?;
460            mprotect_rules.push(rule);
461        }
462        rules.insert(libc::SYS_mprotect, mprotect_rules);
463
464        // clone3: ALLOWED unconditionally. clone3 passes flags inside a struct
465        // pointer that seccomp BPF cannot dereference, so namespace-flag filtering
466        // is impossible at the BPF level. However, glibc 2.34+ and newer musl use
467        // clone3 internally for posix_spawn/fork – blocking it breaks
468        // std::process::Command and any child-process spawning on modern systems.
469        //
470        // SECURITY INVARIANT: Namespace creation via clone3 is prevented solely by
471        // dropping CAP_SYS_ADMIN (and other namespace caps) *before* this seccomp
472        // filter is installed. If capability dropping is bypassed, clone3 becomes
473        // an unfiltered path to namespace creation. This is a known single point
474        // of failure – see CapabilityManager::drop_all() which must run first.
475        //
476        // Verify the invariant: CAP_SYS_ADMIN must not be in the effective set.
477        // CAP_SYS_ADMIN = capability bit 21
478        if Self::has_effective_cap(21) {
479            return Err(NucleusError::SeccompError(
480                "SECURITY: CAP_SYS_ADMIN is still in the effective capability set. \
481                 Capabilities must be dropped before installing seccomp filters \
482                 (clone3 is allowed unconditionally)."
483                    .to_string(),
484            ));
485        }
486        rules.insert(libc::SYS_clone3, Vec::new());
487
488        // clone: allow but deny namespace-creating flags to prevent nested namespace creation
489        let clone_condition = SeccompCondition::new(
490            0, // arg0 = flags
491            seccompiler::SeccompCmpArgLen::Qword,
492            seccompiler::SeccompCmpOp::MaskedEq(DENIED_CLONE_NAMESPACE_FLAGS),
493            0, // (flags & ns_flags) == 0: none of the namespace flags set
494        )
495        .map_err(|e| {
496            NucleusError::SeccompError(format!("Failed to create clone condition: {}", e))
497        })?;
498        let clone_rule = SeccompRule::new(vec![clone_condition]).map_err(|e| {
499            NucleusError::SeccompError(format!("Failed to create clone rule: {}", e))
500        })?;
501        rules.insert(libc::SYS_clone, vec![clone_rule]);
502
503        // execveat: allow but block AT_EMPTY_PATH (0x1000) to prevent fileless
504        // execution. With AT_EMPTY_PATH, execveat can execute code from any open
505        // fd (e.g., open + unlink, or even a socket fd), bypassing filesystem
506        // controls – not just memfd_create. Blocking memfd_create alone is
507        // insufficient. Normal execveat with dirfd+pathname (no AT_EMPTY_PATH)
508        // remains allowed.
509        let execveat_condition = SeccompCondition::new(
510            4, // arg4 = flags for execveat(dirfd, pathname, argv, envp, flags)
511            seccompiler::SeccompCmpArgLen::Dword,
512            seccompiler::SeccompCmpOp::MaskedEq(libc::AT_EMPTY_PATH as u64),
513            0, // (flags & AT_EMPTY_PATH) == 0: AT_EMPTY_PATH not set
514        )
515        .map_err(|e| {
516            NucleusError::SeccompError(format!("Failed to create execveat condition: {}", e))
517        })?;
518        let execveat_rule = SeccompRule::new(vec![execveat_condition]).map_err(|e| {
519            NucleusError::SeccompError(format!("Failed to create execveat rule: {}", e))
520        })?;
521        rules.insert(libc::SYS_execveat, vec![execveat_rule]);
522
523        Ok(rules)
524    }
525
526    /// Compile the minimal BPF filter without applying it
527    ///
528    /// This is useful for benchmarking filter compilation overhead
529    /// without the irreversible side effect of applying the filter.
530    ///
531    /// Uses bitmap-based BPF compilation for O(1) syscall dispatch.
532    pub fn compile_minimal_filter() -> Result<BpfProgram> {
533        let rules = Self::minimal_filter(true, &[])?;
534        let target_arch = std::env::consts::ARCH.try_into().map_err(|e| {
535            NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
536        })?;
537        super::seccomp_bpf::compile_bitmap_bpf(
538            rules,
539            SeccompAction::KillProcess,
540            SeccompAction::Allow,
541            target_arch,
542        )
543    }
544
545    /// Expose minimal_filter for tests in sibling modules.
546    #[cfg(test)]
547    pub(crate) fn minimal_filter_for_test(
548        allow_network: bool,
549        extra_syscalls: &[String],
550    ) -> BTreeMap<i64, Vec<SeccompRule>> {
551        Self::minimal_filter(allow_network, extra_syscalls).unwrap()
552    }
553
554    /// Apply seccomp filter
555    ///
556    /// This implements the transition: no_filter -> whitelist_active
557    /// in the seccomp state machine (NucleusSecurity_Seccomp_SeccompEnforcement.tla)
558    ///
559    /// Once applied, the filter cannot be removed (irreversible property)
560    /// In rootless mode or if seccomp setup fails, this will warn and continue
561    pub fn apply_minimal_filter(&mut self) -> Result<bool> {
562        self.apply_minimal_filter_with_mode(false, false)
563    }
564
565    /// Apply seccomp filter with configurable failure behavior
566    ///
567    /// When `best_effort` is true, failures are logged and execution continues.
568    /// When false, seccomp setup is fail-closed.
569    pub fn apply_minimal_filter_with_mode(
570        &mut self,
571        best_effort: bool,
572        log_denied: bool,
573    ) -> Result<bool> {
574        self.apply_filter_for_network_mode(true, best_effort, log_denied, &[])
575    }
576
577    /// Apply seccomp filter with network-mode-aware socket restrictions
578    ///
579    /// When `allow_network` is false, `SYS_socket` is restricted to AF_UNIX only,
580    /// preventing creation of network sockets (AF_INET, AF_INET6, etc.).
581    /// When `allow_network` is true, all socket domains are permitted.
582    ///
583    /// When `best_effort` is true, failures are logged and execution continues.
584    /// When false, seccomp setup is fail-closed.
585    pub fn apply_filter_for_network_mode(
586        &mut self,
587        allow_network: bool,
588        best_effort: bool,
589        log_denied: bool,
590        extra_syscalls: &[String],
591    ) -> Result<bool> {
592        if self.applied {
593            debug!("Seccomp filter already applied, skipping");
594            return Ok(true);
595        }
596
597        info!(allow_network, "Applying seccomp filter");
598
599        let rules = match Self::minimal_filter(allow_network, extra_syscalls) {
600            Ok(r) => r,
601            Err(e) => {
602                if best_effort {
603                    warn!(
604                        "Failed to create seccomp rules: {} (continuing without seccomp)",
605                        e
606                    );
607                    return Ok(false);
608                }
609                return Err(e);
610            }
611        };
612
613        let target_arch = match std::env::consts::ARCH.try_into() {
614            Ok(a) => a,
615            Err(e) => {
616                let msg = format!("Unsupported architecture: {:?}", e);
617                if best_effort {
618                    warn!("{} (continuing without seccomp)", msg);
619                    return Ok(false);
620                }
621                return Err(NucleusError::SeccompError(msg));
622            }
623        };
624
625        let bpf_prog: BpfProgram = match super::seccomp_bpf::compile_bitmap_bpf(
626            rules,
627            SeccompAction::KillProcess,
628            SeccompAction::Allow,
629            target_arch,
630        ) {
631            Ok(p) => p,
632            Err(e) => {
633                if best_effort {
634                    warn!(
635                        "Failed to compile BPF program: {} (continuing without seccomp)",
636                        e
637                    );
638                    return Ok(false);
639                }
640                return Err(e);
641            }
642        };
643
644        // Apply the filter
645        match Self::apply_bpf_program(&bpf_prog, log_denied) {
646            Ok(_) => {
647                self.applied = true;
648                info!("Successfully applied seccomp filter");
649                Ok(true)
650            }
651            Err(e) => {
652                if best_effort {
653                    warn!(
654                        "Failed to apply seccomp filter: {} (continuing without seccomp)",
655                        e
656                    );
657                    Ok(false)
658                } else {
659                    Err(NucleusError::SeccompError(format!(
660                        "Failed to apply seccomp filter: {}",
661                        e
662                    )))
663                }
664            }
665        }
666    }
667
668    /// Apply a seccomp profile loaded from a JSON file.
669    ///
670    /// The profile format is a JSON object with:
671    /// ```json
672    /// {
673    ///   "defaultAction": "SCMP_ACT_ERRNO",
674    ///   "syscalls": [
675    ///     { "names": ["read", "write", "open", ...], "action": "SCMP_ACT_ALLOW" }
676    ///   ]
677    /// }
678    /// ```
679    ///
680    /// This is a subset of the OCI seccomp profile format. Only the syscall name
681    /// allowlist is used; argument-level filtering from the built-in profile is
682    /// not applied when using a custom profile.
683    ///
684    /// If `expected_sha256` is provided, the file's SHA-256 hash is verified
685    /// against it before loading. This prevents silent profile tampering.
686    pub fn apply_profile_from_file(
687        &mut self,
688        profile_path: &Path,
689        expected_sha256: Option<&str>,
690        audit_mode: bool,
691    ) -> Result<bool> {
692        if self.applied {
693            debug!("Seccomp filter already applied, skipping");
694            return Ok(true);
695        }
696
697        info!("Loading seccomp profile from {:?}", profile_path);
698
699        // Read profile file
700        let content = std::fs::read(profile_path).map_err(|e| {
701            NucleusError::SeccompError(format!(
702                "Failed to read seccomp profile {:?}: {}",
703                profile_path, e
704            ))
705        })?;
706
707        // Verify SHA-256 hash if expected
708        if let Some(expected) = expected_sha256 {
709            let actual = sha256_hex(&content);
710            if actual != expected {
711                return Err(NucleusError::SeccompError(format!(
712                    "Seccomp profile hash mismatch: expected {}, got {}",
713                    expected, actual
714                )));
715            }
716            info!("Seccomp profile hash verified: {}", actual);
717        }
718
719        // Parse profile
720        let profile: SeccompProfile = serde_json::from_slice(&content).map_err(|e| {
721            NucleusError::SeccompError(format!("Failed to parse seccomp profile: {}", e))
722        })?;
723
724        // Warn when custom profile allows security-critical syscalls without
725        // argument-level filtering. The built-in filter restricts clone, ioctl,
726        // prctl, and socket at the argument level; a custom profile that allows
727        // them by name only silently removes all of that hardening.
728        Self::warn_missing_arg_filters(&profile);
729
730        // Build filter from profile
731        let mut rules: BTreeMap<i64, Vec<SeccompRule>> = BTreeMap::new();
732
733        for syscall_group in &profile.syscalls {
734            if syscall_group.action == "SCMP_ACT_ALLOW" {
735                for name in &syscall_group.names {
736                    if let Some(nr) = syscall_name_to_number(name) {
737                        rules.insert(nr, Vec::new());
738                    } else {
739                        warn!("Unknown syscall in profile: {} (skipping)", name);
740                    }
741                }
742            }
743        }
744
745        // SEC-01: Merge built-in argument filters for security-critical syscalls.
746        // Custom profiles that allow clone/ioctl/prctl/socket/mprotect by name
747        // without argument-level filters would silently remove all hardening.
748        // Overwrite their empty rules with the built-in argument-filtered rules.
749        let builtin_rules = Self::minimal_filter(true, &[])?;
750        for syscall_name in Self::ARG_FILTERED_SYSCALLS {
751            if let Some(nr) = syscall_name_to_number(syscall_name) {
752                if let std::collections::btree_map::Entry::Occupied(mut entry) = rules.entry(nr) {
753                    if let Some(builtin) = builtin_rules.get(&nr) {
754                        if !builtin.is_empty() {
755                            info!(
756                                "Merging built-in argument filters for '{}' into custom profile",
757                                syscall_name
758                            );
759                            entry.insert(builtin.clone());
760                        }
761                    }
762                }
763            }
764        }
765        // H2: clone3 is allowed in the built-in filter (needed for glibc 2.34+).
766        // Apply the same policy to custom profiles for consistency. The security
767        // invariant against namespace creation via clone3 is enforced by dropping
768        // CAP_SYS_ADMIN *before* seccomp is installed (see verify_no_namespace_caps).
769        // If the custom profile doesn't include clone3, add it.
770        if !rules.contains_key(&libc::SYS_clone3) {
771            rules.insert(libc::SYS_clone3, Vec::new());
772        }
773
774        let target_arch = std::env::consts::ARCH.try_into().map_err(|e| {
775            NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
776        })?;
777
778        let bpf_prog: BpfProgram = super::seccomp_bpf::compile_bitmap_bpf(
779            rules,
780            SeccompAction::KillProcess,
781            SeccompAction::Allow,
782            target_arch,
783        )?;
784
785        match Self::apply_bpf_program(&bpf_prog, audit_mode) {
786            Ok(_) => {
787                self.applied = true;
788                info!(
789                    "Seccomp profile applied from {:?} (log_denied={})",
790                    profile_path, audit_mode
791                );
792                Ok(true)
793            }
794            Err(e) => Err(e),
795        }
796    }
797
798    /// Install an allow-all seccomp filter with SECCOMP_FILTER_FLAG_LOG.
799    ///
800    /// Used in trace mode: all syscalls are allowed but logged to the kernel
801    /// audit subsystem. A separate reader collects the logged syscalls.
802    pub fn apply_trace_filter(&mut self) -> Result<bool> {
803        if self.applied {
804            debug!("Seccomp filter already applied, skipping trace filter");
805            return Ok(true);
806        }
807
808        info!("Applying seccomp trace filter (allow-all + LOG)");
809
810        // Create an empty rule set – with SeccompAction::Allow as default,
811        // every syscall is permitted. The LOG flag causes the kernel to
812        // audit each syscall decision.
813        let filter = SeccompFilter::new(
814            BTreeMap::new(),
815            SeccompAction::Allow, // default: allow everything
816            SeccompAction::Allow, // match action (unused – no rules)
817            std::env::consts::ARCH.try_into().map_err(|e| {
818                NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
819            })?,
820        )
821        .map_err(|e| NucleusError::SeccompError(format!("Failed to create trace filter: {}", e)))?;
822
823        let bpf_prog: BpfProgram = filter.try_into().map_err(|e| {
824            NucleusError::SeccompError(format!("Failed to compile trace BPF: {}", e))
825        })?;
826
827        // Apply with LOG flag so kernel audits every syscall
828        Self::apply_bpf_program(&bpf_prog, true)?;
829        self.applied = true;
830        info!("Seccomp trace filter applied (all syscalls allowed + logged)");
831        Ok(true)
832    }
833
834    /// Syscalls that the built-in filter restricts at the argument level.
835    /// Custom profiles allowing these without argument filters weaken security.
836    const ARG_FILTERED_SYSCALLS: &'static [&'static str] = &[
837        "clone", "clone3", "execveat", "ioctl", "mprotect", "prctl", "socket",
838    ];
839
840    /// Non-default syscalls that may be opted into via `--seccomp-allow`.
841    ///
842    /// Every syscall known to `syscall_name_to_number` but absent from both
843    /// `base_allowed_syscalls` and `ARG_FILTERED_SYSCALLS` must appear here
844    /// to be enableable. Requesting a known syscall that is NOT in this list
845    /// emits a WARN and is silently dropped (defense-in-depth).
846    const OPT_IN_SYSCALLS: &'static [&'static str] = &[
847        // io_uring – large attack surface but needed by modern databases
848        "io_uring_setup",
849        "io_uring_enter",
850        "io_uring_register",
851        // SysV message queues
852        "msgget",
853        "msgsnd",
854        "msgrcv",
855        "msgctl",
856        // POSIX message queues
857        "mq_open",
858        "mq_unlink",
859        "mq_timedsend",
860        "mq_timedreceive",
861        "mq_notify",
862        "mq_getsetattr",
863        // POSIX timers
864        "timer_create",
865        "timer_settime",
866        "timer_gettime",
867        "timer_getoverrun",
868        "timer_delete",
869        // Inotify / fanotify
870        "inotify_init",
871        "inotify_init1",
872        "inotify_add_watch",
873        "inotify_rm_watch",
874        "fanotify_init",
875        "fanotify_mark",
876        // Memory (non-default)
877        "mincore",
878        "mlockall",
879        "munlockall",
880        "membarrier",
881        "process_madvise",
882        "mbind",
883        "set_mempolicy",
884        "get_mempolicy",
885        "set_mempolicy_home_node",
886        "pkey_mprotect",
887        "pkey_alloc",
888        "pkey_free",
889        "cachestat",
890        "remap_file_pages",
891        // File I/O (non-default)
892        "sync",
893        "syncfs",
894        "sync_file_range",
895        "readahead",
896        "vmsplice",
897        "openat2",
898        "name_to_handle_at",
899        "open_by_handle_at",
900        "io_cancel",
901        "io_pgetevents",
902        "creat",
903        "fchmodat2",
904        "statmount",
905        "listmount",
906        "utimensat",
907        "utimes",
908        "utime",
909        "futimesat",
910        // Extended attributes (write)
911        "setxattr",
912        "lsetxattr",
913        "fsetxattr",
914        "removexattr",
915        "lremovexattr",
916        "fremovexattr",
917        "setxattrat",
918        "getxattrat",
919        "listxattrat",
920        "removexattrat",
921        // Network (non-default)
922        "recvmmsg",
923        "sendmmsg",
924        // Scheduling (non-default)
925        "sched_setparam",
926        "sched_setscheduler",
927        "sched_get_priority_max",
928        "sched_get_priority_min",
929        "sched_rr_get_interval",
930        "sched_setattr",
931        "sched_getattr",
932        // Resource limits / priority
933        "setrlimit",
934        "getpriority",
935        "setpriority",
936        "ioprio_set",
937        "ioprio_get",
938        // Process (non-default, low risk)
939        "vfork",
940        "pause",
941        "alarm",
942        "tkill",
943        "sysinfo",
944        "personality",
945        "vhangup",
946        "time",
947        "pidfd_open",
948        "pidfd_send_signal",
949        "pidfd_getfd",
950        // UID/GID
951        "setuid",
952        "setgid",
953        "setreuid",
954        "setregid",
955        "setresuid",
956        "getresuid",
957        "setresgid",
958        "getresgid",
959        "setfsuid",
960        "setfsgid",
961        "setgroups",
962        "getsid",
963        // Capabilities (read-only query)
964        "capget",
965        // Signals (non-default)
966        "rt_tgsigqueueinfo",
967        // Misc
968        "mknod",
969        "mknodat",
970        "syslog",
971        "clock_settime",
972        "clock_adjtime",
973        "adjtimex",
974        "unshare",
975        "kcmp",
976        "epoll_pwait2",
977        // Futex (non-default)
978        "futex_waitv",
979        "futex_wake",
980        "futex_wait",
981        "futex_requeue",
982        // Landlock (already in default but listed for completeness)
983        "seccomp",
984        // Keyring
985        "add_key",
986        "request_key",
987        "keyctl",
988    ];
989
990    /// Warn when a custom seccomp profile allows security-critical syscalls
991    /// without argument-level filtering.
992    fn warn_missing_arg_filters(profile: &SeccompProfile) {
993        for group in &profile.syscalls {
994            if group.action != "SCMP_ACT_ALLOW" {
995                continue;
996            }
997            for name in &group.names {
998                if Self::ARG_FILTERED_SYSCALLS.contains(&name.as_str()) && group.args.is_empty() {
999                    warn!(
1000                        "Custom seccomp profile allows '{}' without argument filters. \
1001                         The built-in filter restricts this syscall at the argument level. \
1002                         This profile weakens security compared to the default.",
1003                        name
1004                    );
1005                }
1006            }
1007        }
1008    }
1009
1010    /// Check whether a capability is in the current thread's effective set
1011    /// by reading /proc/self/status (CapEff line).
1012    fn has_effective_cap(cap: i32) -> bool {
1013        let Ok(status) = std::fs::read_to_string("/proc/self/status") else {
1014            // If we can't read, assume worst case for safety.
1015            return true;
1016        };
1017        for line in status.lines() {
1018            if let Some(hex) = line.strip_prefix("CapEff:\t") {
1019                if let Ok(eff) = u64::from_str_radix(hex.trim(), 16) {
1020                    return eff & (1u64 << cap) != 0;
1021                }
1022            }
1023        }
1024        true // assume worst case
1025    }
1026
1027    /// Check if seccomp filter has been applied
1028    pub fn is_applied(&self) -> bool {
1029        self.applied
1030    }
1031
1032    fn apply_bpf_program(bpf_prog: &BpfProgram, log_denied: bool) -> Result<()> {
1033        let mut flags: libc::c_ulong = 0;
1034        if log_denied {
1035            flags |= libc::SECCOMP_FILTER_FLAG_LOG as libc::c_ulong;
1036        }
1037
1038        match Self::apply_bpf_program_with_flags(bpf_prog, flags) {
1039            Ok(()) => Ok(()),
1040            Err(err)
1041                if log_denied
1042                    && err.raw_os_error() == Some(libc::EINVAL)
1043                    && libc::SECCOMP_FILTER_FLAG_LOG != 0 =>
1044            {
1045                warn!(
1046                    "Kernel rejected SECCOMP_FILTER_FLAG_LOG; continuing with seccomp \
1047                     enforcement without deny logging"
1048                );
1049                Self::apply_bpf_program_with_flags(bpf_prog, 0)?;
1050                Ok(())
1051            }
1052            Err(err) => Err(NucleusError::SeccompError(format!(
1053                "Failed to apply seccomp filter: {}",
1054                err
1055            ))),
1056        }
1057    }
1058
1059    fn apply_bpf_program_with_flags(
1060        bpf_prog: &BpfProgram,
1061        flags: libc::c_ulong,
1062    ) -> std::io::Result<()> {
1063        // SAFETY: `prctl(PR_SET_NO_NEW_PRIVS, ...)` has no pointer arguments here
1064        // and only affects the current thread/process as required before seccomp.
1065        let rc = unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
1066        if rc != 0 {
1067            return Err(std::io::Error::last_os_error());
1068        }
1069
1070        let prog = libc::sock_fprog {
1071            len: bpf_prog.len() as u16,
1072            filter: bpf_prog.as_ptr() as *mut libc::sock_filter,
1073        };
1074
1075        // SAFETY: `prog` points to a live BPF program buffer for the duration of
1076        // the syscall and the kernel copies the pointed-to filter immediately.
1077        let rc = unsafe {
1078            libc::syscall(
1079                libc::SYS_seccomp,
1080                libc::SECCOMP_SET_MODE_FILTER,
1081                flags,
1082                &prog as *const libc::sock_fprog,
1083            )
1084        };
1085
1086        if rc < 0 {
1087            return Err(std::io::Error::last_os_error());
1088        }
1089
1090        Ok(())
1091    }
1092}
1093
1094// SeccompProfile and SeccompSyscallGroup are defined in seccomp_generate.rs
1095use crate::security::seccomp_generate::SeccompProfile;
1096
1097/// Map a syscall name (e.g. "read", "write") to its Linux syscall number.
1098///
1099/// Covers the most common syscalls. Unknown names return None.
1100fn syscall_name_to_number(name: &str) -> Option<i64> {
1101    match name {
1102        // File I/O
1103        "read" => Some(libc::SYS_read),
1104        "write" => Some(libc::SYS_write),
1105        #[cfg(target_arch = "x86_64")]
1106        "open" => Some(libc::SYS_open),
1107        "openat" => Some(libc::SYS_openat),
1108        "close" => Some(libc::SYS_close),
1109        #[cfg(target_arch = "x86_64")]
1110        "stat" => Some(libc::SYS_stat),
1111        "fstat" => Some(libc::SYS_fstat),
1112        #[cfg(target_arch = "x86_64")]
1113        "lstat" => Some(libc::SYS_lstat),
1114        "lseek" => Some(libc::SYS_lseek),
1115        #[cfg(target_arch = "x86_64")]
1116        "access" => Some(libc::SYS_access),
1117        "fcntl" => Some(libc::SYS_fcntl),
1118        "readv" => Some(libc::SYS_readv),
1119        "writev" => Some(libc::SYS_writev),
1120        "pread64" => Some(libc::SYS_pread64),
1121        "pwrite64" => Some(libc::SYS_pwrite64),
1122        #[cfg(target_arch = "x86_64")]
1123        "readlink" => Some(libc::SYS_readlink),
1124        "readlinkat" => Some(libc::SYS_readlinkat),
1125        "newfstatat" => Some(libc::SYS_newfstatat),
1126        "statx" => Some(libc::SYS_statx),
1127        "faccessat" => Some(libc::SYS_faccessat),
1128        "faccessat2" => Some(libc::SYS_faccessat2),
1129        "dup" => Some(libc::SYS_dup),
1130        #[cfg(target_arch = "x86_64")]
1131        "dup2" => Some(libc::SYS_dup2),
1132        "dup3" => Some(libc::SYS_dup3),
1133        #[cfg(target_arch = "x86_64")]
1134        "pipe" => Some(libc::SYS_pipe),
1135        "pipe2" => Some(libc::SYS_pipe2),
1136        #[cfg(target_arch = "x86_64")]
1137        "unlink" => Some(libc::SYS_unlink),
1138        "unlinkat" => Some(libc::SYS_unlinkat),
1139        #[cfg(target_arch = "x86_64")]
1140        "rename" => Some(libc::SYS_rename),
1141        "renameat" => Some(libc::SYS_renameat),
1142        "renameat2" => Some(libc::SYS_renameat2),
1143        #[cfg(target_arch = "x86_64")]
1144        "link" => Some(libc::SYS_link),
1145        "linkat" => Some(libc::SYS_linkat),
1146        #[cfg(target_arch = "x86_64")]
1147        "symlink" => Some(libc::SYS_symlink),
1148        "symlinkat" => Some(libc::SYS_symlinkat),
1149        #[cfg(target_arch = "x86_64")]
1150        "chmod" => Some(libc::SYS_chmod),
1151        "fchmod" => Some(libc::SYS_fchmod),
1152        "fchmodat" => Some(libc::SYS_fchmodat),
1153        "truncate" => Some(libc::SYS_truncate),
1154        "ftruncate" => Some(libc::SYS_ftruncate),
1155        "fallocate" => Some(libc::SYS_fallocate),
1156        #[cfg(target_arch = "x86_64")]
1157        "fadvise64" => Some(libc::SYS_fadvise64),
1158        "fsync" => Some(libc::SYS_fsync),
1159        "fdatasync" => Some(libc::SYS_fdatasync),
1160        "flock" => Some(libc::SYS_flock),
1161        #[cfg(target_arch = "x86_64")]
1162        "sendfile" => Some(libc::SYS_sendfile),
1163        "copy_file_range" => Some(libc::SYS_copy_file_range),
1164        "splice" => Some(libc::SYS_splice),
1165        "tee" => Some(libc::SYS_tee),
1166        // Memory
1167        "mmap" => Some(libc::SYS_mmap),
1168        "munmap" => Some(libc::SYS_munmap),
1169        "mprotect" => Some(libc::SYS_mprotect),
1170        "brk" => Some(libc::SYS_brk),
1171        "mremap" => Some(libc::SYS_mremap),
1172        "madvise" => Some(libc::SYS_madvise),
1173        "msync" => Some(libc::SYS_msync),
1174        "mlock" => Some(libc::SYS_mlock),
1175        "mlock2" => Some(libc::SYS_mlock2),
1176        "munlock" => Some(libc::SYS_munlock),
1177        // SysV IPC
1178        "shmget" => Some(libc::SYS_shmget),
1179        "shmat" => Some(libc::SYS_shmat),
1180        "shmdt" => Some(libc::SYS_shmdt),
1181        "shmctl" => Some(libc::SYS_shmctl),
1182        "semget" => Some(libc::SYS_semget),
1183        "semop" => Some(libc::SYS_semop),
1184        "semctl" => Some(libc::SYS_semctl),
1185        "semtimedop" => Some(libc::SYS_semtimedop),
1186        // Process
1187        #[cfg(target_arch = "x86_64")]
1188        "fork" => Some(libc::SYS_fork),
1189        "clone" => Some(libc::SYS_clone),
1190        "clone3" => Some(libc::SYS_clone3),
1191        "execve" => Some(libc::SYS_execve),
1192        "execveat" => Some(libc::SYS_execveat),
1193        "wait4" => Some(libc::SYS_wait4),
1194        "waitid" => Some(libc::SYS_waitid),
1195        "exit" => Some(libc::SYS_exit),
1196        "exit_group" => Some(libc::SYS_exit_group),
1197        "getpid" => Some(libc::SYS_getpid),
1198        "gettid" => Some(libc::SYS_gettid),
1199        "getuid" => Some(libc::SYS_getuid),
1200        "getgid" => Some(libc::SYS_getgid),
1201        "geteuid" => Some(libc::SYS_geteuid),
1202        "getegid" => Some(libc::SYS_getegid),
1203        "getppid" => Some(libc::SYS_getppid),
1204        #[cfg(target_arch = "x86_64")]
1205        "getpgrp" => Some(libc::SYS_getpgrp),
1206        "setsid" => Some(libc::SYS_setsid),
1207        "getgroups" => Some(libc::SYS_getgroups),
1208        // Signals
1209        "rt_sigaction" => Some(libc::SYS_rt_sigaction),
1210        "rt_sigprocmask" => Some(libc::SYS_rt_sigprocmask),
1211        "rt_sigreturn" => Some(libc::SYS_rt_sigreturn),
1212        "rt_sigsuspend" => Some(libc::SYS_rt_sigsuspend),
1213        "rt_sigtimedwait" => Some(libc::SYS_rt_sigtimedwait),
1214        "rt_sigpending" => Some(libc::SYS_rt_sigpending),
1215        "rt_sigqueueinfo" => Some(libc::SYS_rt_sigqueueinfo),
1216        "sigaltstack" => Some(libc::SYS_sigaltstack),
1217        "restart_syscall" => Some(libc::SYS_restart_syscall),
1218        "kill" => Some(libc::SYS_kill),
1219        "tgkill" => Some(libc::SYS_tgkill),
1220        // Time
1221        "clock_gettime" => Some(libc::SYS_clock_gettime),
1222        "clock_getres" => Some(libc::SYS_clock_getres),
1223        "clock_nanosleep" => Some(libc::SYS_clock_nanosleep),
1224        "gettimeofday" => Some(libc::SYS_gettimeofday),
1225        "nanosleep" => Some(libc::SYS_nanosleep),
1226        // Directories
1227        "getcwd" => Some(libc::SYS_getcwd),
1228        "chdir" => Some(libc::SYS_chdir),
1229        "fchdir" => Some(libc::SYS_fchdir),
1230        #[cfg(target_arch = "x86_64")]
1231        "mkdir" => Some(libc::SYS_mkdir),
1232        "mkdirat" => Some(libc::SYS_mkdirat),
1233        #[cfg(target_arch = "x86_64")]
1234        "rmdir" => Some(libc::SYS_rmdir),
1235        #[cfg(target_arch = "x86_64")]
1236        "getdents" => Some(libc::SYS_getdents),
1237        "getdents64" => Some(libc::SYS_getdents64),
1238        // Network
1239        "socket" => Some(libc::SYS_socket),
1240        "connect" => Some(libc::SYS_connect),
1241        "sendto" => Some(libc::SYS_sendto),
1242        "recvfrom" => Some(libc::SYS_recvfrom),
1243        "sendmsg" => Some(libc::SYS_sendmsg),
1244        "recvmsg" => Some(libc::SYS_recvmsg),
1245        "shutdown" => Some(libc::SYS_shutdown),
1246        "bind" => Some(libc::SYS_bind),
1247        "listen" => Some(libc::SYS_listen),
1248        "accept" => Some(libc::SYS_accept),
1249        "accept4" => Some(libc::SYS_accept4),
1250        "setsockopt" => Some(libc::SYS_setsockopt),
1251        "getsockopt" => Some(libc::SYS_getsockopt),
1252        "getsockname" => Some(libc::SYS_getsockname),
1253        "getpeername" => Some(libc::SYS_getpeername),
1254        "socketpair" => Some(libc::SYS_socketpair),
1255        // Poll/Select
1256        #[cfg(target_arch = "x86_64")]
1257        "poll" => Some(libc::SYS_poll),
1258        "ppoll" => Some(libc::SYS_ppoll),
1259        #[cfg(target_arch = "x86_64")]
1260        "select" => Some(libc::SYS_select),
1261        "pselect6" => Some(libc::SYS_pselect6),
1262        #[cfg(target_arch = "x86_64")]
1263        "epoll_create" => Some(libc::SYS_epoll_create),
1264        "epoll_create1" => Some(libc::SYS_epoll_create1),
1265        "epoll_ctl" => Some(libc::SYS_epoll_ctl),
1266        #[cfg(target_arch = "x86_64")]
1267        "epoll_wait" => Some(libc::SYS_epoll_wait),
1268        "epoll_pwait" => Some(libc::SYS_epoll_pwait),
1269        #[cfg(target_arch = "x86_64")]
1270        "eventfd" => Some(libc::SYS_eventfd),
1271        "eventfd2" => Some(libc::SYS_eventfd2),
1272        #[cfg(target_arch = "x86_64")]
1273        "signalfd" => Some(libc::SYS_signalfd),
1274        "signalfd4" => Some(libc::SYS_signalfd4),
1275        "timerfd_create" => Some(libc::SYS_timerfd_create),
1276        "timerfd_settime" => Some(libc::SYS_timerfd_settime),
1277        "timerfd_gettime" => Some(libc::SYS_timerfd_gettime),
1278        // Misc
1279        "uname" => Some(libc::SYS_uname),
1280        "getrandom" => Some(libc::SYS_getrandom),
1281        "futex" => Some(libc::SYS_futex),
1282        "set_tid_address" => Some(libc::SYS_set_tid_address),
1283        "set_robust_list" => Some(libc::SYS_set_robust_list),
1284        "get_robust_list" => Some(libc::SYS_get_robust_list),
1285        #[cfg(target_arch = "x86_64")]
1286        "arch_prctl" => Some(libc::SYS_arch_prctl),
1287        "sysinfo" => Some(libc::SYS_sysinfo),
1288        "umask" => Some(libc::SYS_umask),
1289        #[cfg(target_arch = "x86_64")]
1290        "getrlimit" => Some(libc::SYS_getrlimit),
1291        "prlimit64" => Some(libc::SYS_prlimit64),
1292        "getrusage" => Some(libc::SYS_getrusage),
1293        "times" => Some(libc::SYS_times),
1294        "sched_yield" => Some(libc::SYS_sched_yield),
1295        "sched_getaffinity" => Some(libc::SYS_sched_getaffinity),
1296        "getcpu" => Some(libc::SYS_getcpu),
1297        "rseq" => Some(libc::SYS_rseq),
1298        "close_range" => Some(libc::SYS_close_range),
1299        // Ownership
1300        "fchown" => Some(libc::SYS_fchown),
1301        "fchownat" => Some(libc::SYS_fchownat),
1302        #[cfg(target_arch = "x86_64")]
1303        "chown" => Some(libc::SYS_chown),
1304        #[cfg(target_arch = "x86_64")]
1305        "lchown" => Some(libc::SYS_lchown),
1306        // io_uring
1307        "io_uring_setup" => Some(libc::SYS_io_uring_setup),
1308        "io_uring_enter" => Some(libc::SYS_io_uring_enter),
1309        "io_uring_register" => Some(libc::SYS_io_uring_register),
1310        // Legacy AIO
1311        "io_setup" => Some(libc::SYS_io_setup),
1312        "io_destroy" => Some(libc::SYS_io_destroy),
1313        "io_submit" => Some(libc::SYS_io_submit),
1314        "io_getevents" => Some(libc::SYS_io_getevents),
1315        // Timers
1316        "setitimer" => Some(libc::SYS_setitimer),
1317        "getitimer" => Some(libc::SYS_getitimer),
1318        // Process groups
1319        "setpgid" => Some(libc::SYS_setpgid),
1320        "getpgid" => Some(libc::SYS_getpgid),
1321        "memfd_create" => Some(libc::SYS_memfd_create),
1322        "ioctl" => Some(libc::SYS_ioctl),
1323        "prctl" => Some(libc::SYS_prctl),
1324        // Landlock
1325        "landlock_create_ruleset" => Some(libc::SYS_landlock_create_ruleset),
1326        "landlock_add_rule" => Some(libc::SYS_landlock_add_rule),
1327        "landlock_restrict_self" => Some(libc::SYS_landlock_restrict_self),
1328        // --- Additional syscalls (not in default allowlist, available via --seccomp-allow) ---
1329        // Memory
1330        "mincore" => Some(libc::SYS_mincore),
1331        "mlockall" => Some(libc::SYS_mlockall),
1332        "munlockall" => Some(libc::SYS_munlockall),
1333        "mbind" => Some(libc::SYS_mbind),
1334        "set_mempolicy" => Some(libc::SYS_set_mempolicy),
1335        "get_mempolicy" => Some(libc::SYS_get_mempolicy),
1336        "memfd_secret" => Some(libc::SYS_memfd_secret),
1337        "membarrier" => Some(libc::SYS_membarrier),
1338        "process_madvise" => Some(libc::SYS_process_madvise),
1339        "pkey_mprotect" => Some(libc::SYS_pkey_mprotect),
1340        "pkey_alloc" => Some(libc::SYS_pkey_alloc),
1341        "pkey_free" => Some(libc::SYS_pkey_free),
1342        "mseal" => Some(libc::SYS_mseal),
1343        "map_shadow_stack" => Some(453),
1344        "remap_file_pages" => Some(libc::SYS_remap_file_pages),
1345        "set_mempolicy_home_node" => Some(libc::SYS_set_mempolicy_home_node),
1346        "cachestat" => Some(451),
1347        // Process
1348        #[cfg(target_arch = "x86_64")]
1349        "vfork" => Some(libc::SYS_vfork),
1350        #[cfg(target_arch = "x86_64")]
1351        "pause" => Some(libc::SYS_pause),
1352        #[cfg(target_arch = "x86_64")]
1353        "alarm" => Some(libc::SYS_alarm),
1354        "tkill" => Some(libc::SYS_tkill),
1355        "ptrace" => Some(libc::SYS_ptrace),
1356        "process_vm_readv" => Some(libc::SYS_process_vm_readv),
1357        "process_vm_writev" => Some(libc::SYS_process_vm_writev),
1358        "process_mrelease" => Some(libc::SYS_process_mrelease),
1359        "kcmp" => Some(libc::SYS_kcmp),
1360        "unshare" => Some(libc::SYS_unshare),
1361        "setns" => Some(libc::SYS_setns),
1362        "pidfd_open" => Some(libc::SYS_pidfd_open),
1363        "pidfd_send_signal" => Some(libc::SYS_pidfd_send_signal),
1364        "pidfd_getfd" => Some(libc::SYS_pidfd_getfd),
1365        // UID/GID
1366        "setuid" => Some(libc::SYS_setuid),
1367        "setgid" => Some(libc::SYS_setgid),
1368        "setreuid" => Some(libc::SYS_setreuid),
1369        "setregid" => Some(libc::SYS_setregid),
1370        "setresuid" => Some(libc::SYS_setresuid),
1371        "getresuid" => Some(libc::SYS_getresuid),
1372        "setresgid" => Some(libc::SYS_setresgid),
1373        "getresgid" => Some(libc::SYS_getresgid),
1374        "setfsuid" => Some(libc::SYS_setfsuid),
1375        "setfsgid" => Some(libc::SYS_setfsgid),
1376        "setgroups" => Some(libc::SYS_setgroups),
1377        "getsid" => Some(libc::SYS_getsid),
1378        // Capabilities
1379        "capget" => Some(libc::SYS_capget),
1380        "capset" => Some(libc::SYS_capset),
1381        // Signals
1382        "rt_tgsigqueueinfo" => Some(libc::SYS_rt_tgsigqueueinfo),
1383        // SysV message queues
1384        "msgget" => Some(libc::SYS_msgget),
1385        "msgsnd" => Some(libc::SYS_msgsnd),
1386        "msgrcv" => Some(libc::SYS_msgrcv),
1387        "msgctl" => Some(libc::SYS_msgctl),
1388        // Timers
1389        "timer_create" => Some(libc::SYS_timer_create),
1390        "timer_settime" => Some(libc::SYS_timer_settime),
1391        "timer_gettime" => Some(libc::SYS_timer_gettime),
1392        "timer_getoverrun" => Some(libc::SYS_timer_getoverrun),
1393        "timer_delete" => Some(libc::SYS_timer_delete),
1394        "clock_settime" => Some(libc::SYS_clock_settime),
1395        "clock_adjtime" => Some(libc::SYS_clock_adjtime),
1396        #[cfg(target_arch = "x86_64")]
1397        "time" => Some(libc::SYS_time),
1398        // File I/O (non-default)
1399        #[cfg(target_arch = "x86_64")]
1400        "creat" => Some(libc::SYS_creat),
1401        "readahead" => Some(libc::SYS_readahead),
1402        "sync" => Some(libc::SYS_sync),
1403        "syncfs" => Some(libc::SYS_syncfs),
1404        "vmsplice" => Some(libc::SYS_vmsplice),
1405        "utimensat" => Some(libc::SYS_utimensat),
1406        #[cfg(target_arch = "x86_64")]
1407        "utimes" => Some(libc::SYS_utimes),
1408        #[cfg(target_arch = "x86_64")]
1409        "utime" => Some(libc::SYS_utime),
1410        #[cfg(target_arch = "x86_64")]
1411        "futimesat" => Some(libc::SYS_futimesat),
1412        "openat2" => Some(libc::SYS_openat2),
1413        "name_to_handle_at" => Some(libc::SYS_name_to_handle_at),
1414        "open_by_handle_at" => Some(libc::SYS_open_by_handle_at),
1415        "fchmodat2" => Some(libc::SYS_fchmodat2),
1416        "statmount" => Some(457),
1417        "listmount" => Some(458),
1418        // Extended attributes (write)
1419        "setxattr" => Some(libc::SYS_setxattr),
1420        "lsetxattr" => Some(libc::SYS_lsetxattr),
1421        "fsetxattr" => Some(libc::SYS_fsetxattr),
1422        "removexattr" => Some(libc::SYS_removexattr),
1423        "lremovexattr" => Some(libc::SYS_lremovexattr),
1424        "fremovexattr" => Some(libc::SYS_fremovexattr),
1425        "setxattrat" => Some(463),
1426        "getxattrat" => Some(464),
1427        "listxattrat" => Some(465),
1428        "removexattrat" => Some(466),
1429        // Network (non-default)
1430        "recvmmsg" => Some(libc::SYS_recvmmsg),
1431        "sendmmsg" => Some(libc::SYS_sendmmsg),
1432        // Inotify
1433        #[cfg(target_arch = "x86_64")]
1434        "inotify_init" => Some(libc::SYS_inotify_init),
1435        "inotify_init1" => Some(libc::SYS_inotify_init1),
1436        "inotify_add_watch" => Some(libc::SYS_inotify_add_watch),
1437        "inotify_rm_watch" => Some(libc::SYS_inotify_rm_watch),
1438        // Fanotify
1439        "fanotify_init" => Some(libc::SYS_fanotify_init),
1440        "fanotify_mark" => Some(libc::SYS_fanotify_mark),
1441        // Epoll (non-default)
1442        "epoll_pwait2" => Some(libc::SYS_epoll_pwait2),
1443        // Scheduling (non-default)
1444        "sched_setparam" => Some(libc::SYS_sched_setparam),
1445        "sched_setscheduler" => Some(libc::SYS_sched_setscheduler),
1446        "sched_get_priority_max" => Some(libc::SYS_sched_get_priority_max),
1447        "sched_get_priority_min" => Some(libc::SYS_sched_get_priority_min),
1448        "sched_rr_get_interval" => Some(libc::SYS_sched_rr_get_interval),
1449        "sched_setattr" => Some(libc::SYS_sched_setattr),
1450        "sched_getattr" => Some(libc::SYS_sched_getattr),
1451        "sched_setaffinity" => Some(libc::SYS_sched_setaffinity),
1452        // Resource limits
1453        #[cfg(target_arch = "x86_64")]
1454        "setrlimit" => Some(libc::SYS_setrlimit),
1455        "getpriority" => Some(libc::SYS_getpriority),
1456        "setpriority" => Some(libc::SYS_setpriority),
1457        "ioprio_set" => Some(libc::SYS_ioprio_set),
1458        "ioprio_get" => Some(libc::SYS_ioprio_get),
1459        // Futex (non-default)
1460        "futex_waitv" => Some(libc::SYS_futex_waitv),
1461        "futex_wake" => Some(454),
1462        "futex_wait" => Some(455),
1463        "futex_requeue" => Some(456),
1464        // Kernel modules
1465        "init_module" => Some(libc::SYS_init_module),
1466        "finit_module" => Some(libc::SYS_finit_module),
1467        "delete_module" => Some(libc::SYS_delete_module),
1468        // eBPF and performance
1469        "bpf" => Some(libc::SYS_bpf),
1470        "perf_event_open" => Some(libc::SYS_perf_event_open),
1471        // Seccomp
1472        "seccomp" => Some(libc::SYS_seccomp),
1473        // Userfaultfd
1474        "userfaultfd" => Some(libc::SYS_userfaultfd),
1475        // Mount (non-default)
1476        "mount" => Some(libc::SYS_mount),
1477        "umount2" => Some(libc::SYS_umount2),
1478        "pivot_root" => Some(libc::SYS_pivot_root),
1479        "mount_setattr" => Some(libc::SYS_mount_setattr),
1480        "open_tree" => Some(libc::SYS_open_tree),
1481        "open_tree_attr" => Some(467),
1482        "move_mount" => Some(libc::SYS_move_mount),
1483        "fsopen" => Some(libc::SYS_fsopen),
1484        "fsconfig" => Some(libc::SYS_fsconfig),
1485        "fsmount" => Some(libc::SYS_fsmount),
1486        "fspick" => Some(libc::SYS_fspick),
1487        // Misc (non-default)
1488        "syslog" => Some(libc::SYS_syslog),
1489        "reboot" => Some(libc::SYS_reboot),
1490        "swapon" => Some(libc::SYS_swapon),
1491        "swapoff" => Some(libc::SYS_swapoff),
1492        "chroot" => Some(libc::SYS_chroot),
1493        "acct" => Some(libc::SYS_acct),
1494        "settimeofday" => Some(libc::SYS_settimeofday),
1495        "sethostname" => Some(libc::SYS_sethostname),
1496        "setdomainname" => Some(libc::SYS_setdomainname),
1497        "adjtimex" => Some(libc::SYS_adjtimex),
1498        #[cfg(target_arch = "x86_64")]
1499        "modify_ldt" => Some(libc::SYS_modify_ldt),
1500        #[cfg(target_arch = "x86_64")]
1501        "iopl" => Some(libc::SYS_iopl),
1502        #[cfg(target_arch = "x86_64")]
1503        "ioperm" => Some(libc::SYS_ioperm),
1504        "quotactl" => Some(libc::SYS_quotactl),
1505        "quotactl_fd" => Some(libc::SYS_quotactl_fd),
1506        "personality" => Some(libc::SYS_personality),
1507        "vhangup" => Some(libc::SYS_vhangup),
1508        #[cfg(target_arch = "x86_64")]
1509        "ustat" => Some(libc::SYS_ustat),
1510        #[cfg(target_arch = "x86_64")]
1511        "sysfs" => Some(libc::SYS_sysfs),
1512        "mknod" => Some(libc::SYS_mknod),
1513        "mknodat" => Some(libc::SYS_mknodat),
1514        "migrate_pages" => Some(libc::SYS_migrate_pages),
1515        "move_pages" => Some(libc::SYS_move_pages),
1516        #[cfg(target_arch = "x86_64")]
1517        "kexec_load" => Some(libc::SYS_kexec_load),
1518        "kexec_file_load" => Some(libc::SYS_kexec_file_load),
1519        // POSIX message queues
1520        "mq_open" => Some(libc::SYS_mq_open),
1521        "mq_unlink" => Some(libc::SYS_mq_unlink),
1522        "mq_timedsend" => Some(libc::SYS_mq_timedsend),
1523        "mq_timedreceive" => Some(libc::SYS_mq_timedreceive),
1524        "mq_notify" => Some(libc::SYS_mq_notify),
1525        "mq_getsetattr" => Some(libc::SYS_mq_getsetattr),
1526        // Keyring
1527        "add_key" => Some(libc::SYS_add_key),
1528        "request_key" => Some(libc::SYS_request_key),
1529        "keyctl" => Some(libc::SYS_keyctl),
1530        // IO pgetevents
1531        "io_pgetevents" => Some(333),
1532        // LSM
1533        "lsm_get_self_attr" => Some(459),
1534        "lsm_set_self_attr" => Some(460),
1535        "lsm_list_modules" => Some(461),
1536        #[cfg(target_arch = "x86_64")]
1537        "lookup_dcookie" => Some(libc::SYS_lookup_dcookie),
1538        "uretprobe" => Some(335),
1539        _ => None,
1540    }
1541}
1542
1543impl Default for SeccompManager {
1544    fn default() -> Self {
1545        Self::new()
1546    }
1547}
1548
1549#[cfg(test)]
1550mod tests {
1551    use super::*;
1552
1553    #[test]
1554    fn test_seccomp_manager_initial_state() {
1555        let mgr = SeccompManager::new();
1556        assert!(!mgr.is_applied());
1557    }
1558
1559    #[test]
1560    fn test_apply_idempotent() {
1561        let mgr = SeccompManager::new();
1562        // Note: We can't actually test application in unit tests
1563        // as it would affect the test process itself
1564        // This is tested in integration tests instead
1565        assert!(!mgr.is_applied());
1566    }
1567
1568    #[test]
1569    fn test_clone_denied_flags_include_newcgroup() {
1570        assert_ne!(
1571            DENIED_CLONE_NAMESPACE_FLAGS & libc::CLONE_NEWCGROUP as u64,
1572            0
1573        );
1574    }
1575
1576    #[test]
1577    fn test_clone_denied_flags_include_newtime() {
1578        assert_ne!(
1579            DENIED_CLONE_NAMESPACE_FLAGS & libc::CLONE_NEWTIME as u64,
1580            0,
1581            "CLONE_NEWTIME must be in denied clone namespace flags"
1582        );
1583    }
1584
1585    #[test]
1586    fn test_network_none_socket_domains_are_unix_only() {
1587        let domains = SeccompManager::allowed_socket_domains(false);
1588        assert_eq!(domains, vec![libc::AF_UNIX]);
1589    }
1590
1591    #[test]
1592    fn test_network_enabled_socket_domains_exclude_netlink() {
1593        let domains = SeccompManager::allowed_socket_domains(true);
1594        assert!(domains.contains(&libc::AF_UNIX));
1595        assert!(domains.contains(&libc::AF_INET));
1596        assert!(domains.contains(&libc::AF_INET6));
1597        assert!(!domains.contains(&libc::AF_NETLINK));
1598    }
1599
1600    #[test]
1601    fn test_network_mode_syscalls_only_enabled_when_network_allowed() {
1602        let none = SeccompManager::network_mode_syscalls(false);
1603        assert!(none.is_empty());
1604
1605        let enabled = SeccompManager::network_mode_syscalls(true);
1606        assert!(enabled.contains(&libc::SYS_connect));
1607        assert!(enabled.contains(&libc::SYS_bind));
1608        assert!(enabled.contains(&libc::SYS_listen));
1609        assert!(enabled.contains(&libc::SYS_accept));
1610        assert!(enabled.contains(&libc::SYS_setsockopt));
1611    }
1612
1613    #[test]
1614    fn test_landlock_bootstrap_syscalls_present_in_base_allowlist() {
1615        let base = SeccompManager::base_allowed_syscalls();
1616        assert!(base.contains(&libc::SYS_landlock_create_ruleset));
1617        assert!(base.contains(&libc::SYS_landlock_add_rule));
1618        assert!(base.contains(&libc::SYS_landlock_restrict_self));
1619    }
1620
1621    #[test]
1622    fn test_x32_legacy_range_not_allowlisted() {
1623        let base = SeccompManager::base_allowed_syscalls();
1624        let net = SeccompManager::network_mode_syscalls(true);
1625        for nr in 512_i64..=547_i64 {
1626            assert!(
1627                !base.contains(&nr) && !net.contains(&nr),
1628                "x32 syscall number {} unexpectedly allowlisted",
1629                nr
1630            );
1631        }
1632    }
1633
1634    #[test]
1635    fn test_i386_compat_socketcall_range_not_allowlisted() {
1636        let base = SeccompManager::base_allowed_syscalls();
1637        let net = SeccompManager::network_mode_syscalls(true);
1638        // i386 compat per syscall_32.tbl: socket..shutdown live at 359..373.
1639        // On x86_64 these numbers are outside our native allowlist surface.
1640        for nr in 359_i64..=373_i64 {
1641            assert!(
1642                !base.contains(&nr) && !net.contains(&nr),
1643                "i386 compat syscall number {} unexpectedly allowlisted",
1644                nr
1645            );
1646        }
1647    }
1648
1649    #[test]
1650    fn test_minimal_filter_allowlist_counts_are_stable() {
1651        let base = SeccompManager::base_allowed_syscalls();
1652        let net = SeccompManager::network_mode_syscalls(true);
1653
1654        // Snapshot counts to catch unintended policy drift.
1655        // +8 accounts for conditional rules inserted in minimal_filter():
1656        // socket/ioctl/prctl/prlimit64/mprotect/clone/clone3/execveat.
1657        // fork removed (forces through filtered clone path).
1658        // execveat removed from base (arg-filtered separately).
1659        // sysinfo removed (L8: leaks host info).
1660        // prlimit64 moved to arg-filtered (M3).
1661        assert_eq!(base.len(), 173);
1662        assert_eq!(net.len(), 11);
1663        assert_eq!(base.len() + 8, 181);
1664        assert_eq!(base.len() + net.len() + 8, 192);
1665    }
1666
1667    #[test]
1668    fn test_arg_filtered_syscalls_list_includes_critical_syscalls() {
1669        // These syscalls must be in the arg-filtered list so custom profiles
1670        // get warnings when they allow them without filters.
1671        for name in &["clone", "clone3", "execveat", "ioctl", "prctl", "socket"] {
1672            assert!(
1673                SeccompManager::ARG_FILTERED_SYSCALLS.contains(name),
1674                "'{}' must be in ARG_FILTERED_SYSCALLS",
1675                name
1676            );
1677        }
1678    }
1679
1680    #[test]
1681    fn test_clone3_allowed_in_minimal_filter() {
1682        // clone3 MUST be in the BPF rules map – glibc 2.34+ and newer musl
1683        // use clone3 internally for posix_spawn/fork. Blocking it breaks
1684        // std::process::Command on modern systems. Namespace creation is
1685        // prevented by dropped capabilities (CAP_SYS_ADMIN etc.), not seccomp.
1686        let rules = SeccompManager::minimal_filter(true, &[]).unwrap();
1687        assert!(
1688            rules.contains_key(&libc::SYS_clone3),
1689            "clone3 must be in the seccomp allowlist (glibc 2.34+ requires it)"
1690        );
1691    }
1692
1693    #[test]
1694    fn test_clone_is_allowed_with_arg_filter() {
1695        // clone (not clone3) should still be in the rules with arg filtering
1696        let rules = SeccompManager::minimal_filter(true, &[]).unwrap();
1697        assert!(
1698            rules.contains_key(&libc::SYS_clone),
1699            "clone must be in the seccomp allowlist with arg filters"
1700        );
1701    }
1702
1703    #[test]
1704    fn test_high_risk_syscalls_removed_from_base_allowlist() {
1705        let base = SeccompManager::base_allowed_syscalls();
1706        // chown/fchown/lchown/fchownat: allowed – safe after CAP_CHOWN/CAP_FOWNER drop
1707        // mlock/munlock: allowed – needed by databases, bounded by RLIMIT_MEMLOCK
1708        let removed = [
1709            libc::SYS_sync,
1710            libc::SYS_syncfs,
1711            libc::SYS_mincore,
1712            libc::SYS_vfork,
1713            libc::SYS_tkill,
1714            // io_uring: large attack surface, many CVEs – require custom profile
1715            libc::SYS_io_uring_setup,
1716            libc::SYS_io_uring_enter,
1717            libc::SYS_io_uring_register,
1718        ];
1719
1720        for syscall in removed {
1721            assert!(
1722                !base.contains(&syscall),
1723                "syscall {} unexpectedly present in base allowlist",
1724                syscall
1725            );
1726        }
1727    }
1728
1729    #[test]
1730    fn test_custom_profile_preserves_clone_arg_filters() {
1731        // SEC-01: Custom seccomp profiles that allow "clone" must still get
1732        // argument-level filtering to block namespace-creating flags.
1733        // Verify by inspecting the built-in filter rules that serve as the
1734        // merge source for apply_profile_from_file.
1735        let rules = SeccompManager::minimal_filter(true, &[]).unwrap();
1736
1737        // Every ARG_FILTERED_SYSCALLS entry (except clone3, which is allowed
1738        // unconditionally since BPF can't inspect its struct-based flags) must
1739        // have non-empty argument-level rules in the built-in filter so that
1740        // apply_profile_from_file can merge them.
1741        for name in SeccompManager::ARG_FILTERED_SYSCALLS {
1742            if *name == "clone3" {
1743                // clone3 is allowed unconditionally – BPF cannot dereference
1744                // the clone_args struct, so arg filtering is impossible.
1745                // Namespace defense relies on dropped capabilities.
1746                continue;
1747            }
1748            if let Some(nr) = syscall_name_to_number(name) {
1749                let entry = rules.get(&nr);
1750                assert!(
1751                    entry.is_some() && !entry.unwrap().is_empty(),
1752                    "built-in filter must have argument-level rules for '{}' \
1753                     so apply_profile_from_file can merge them into custom profiles",
1754                    name
1755                );
1756            }
1757        }
1758    }
1759
1760    #[test]
1761    fn test_memfd_create_not_in_default_allowlist() {
1762        // SEC-02: memfd_create enables fileless code execution when combined with execveat.
1763        let base = SeccompManager::base_allowed_syscalls();
1764        assert!(
1765            !base.contains(&libc::SYS_memfd_create),
1766            "memfd_create must not be in the default seccomp allowlist (fileless exec risk)"
1767        );
1768        // Also verify it's not sneaked into the compiled filter rules
1769        let rules = SeccompManager::minimal_filter(true, &[]).unwrap();
1770        assert!(
1771            !rules.contains_key(&libc::SYS_memfd_create),
1772            "memfd_create must not be in the compiled seccomp filter rules"
1773        );
1774    }
1775
1776    #[test]
1777    fn test_mprotect_has_arg_filtering() {
1778        // SEC-03: mprotect must have argument-level filtering to prevent W^X
1779        // (PROT_WRITE|PROT_EXEC) violations. Verify via runtime data structures.
1780
1781        // mprotect must NOT be in the unconditional base allowlist
1782        let base = SeccompManager::base_allowed_syscalls();
1783        assert!(
1784            !base.contains(&libc::SYS_mprotect),
1785            "SYS_mprotect must not be unconditionally allowed - needs arg filtering"
1786        );
1787
1788        // mprotect must be present in the compiled filter with non-empty
1789        // argument conditions (the conditions enforce W^X)
1790        let rules = SeccompManager::minimal_filter(true, &[]).unwrap();
1791        let mprotect_rules = rules.get(&libc::SYS_mprotect);
1792        assert!(
1793            mprotect_rules.is_some(),
1794            "mprotect must be present in the seccomp filter rules"
1795        );
1796        assert!(
1797            !mprotect_rules.unwrap().is_empty(),
1798            "mprotect must have argument-level conditions to prevent W^X violations"
1799        );
1800    }
1801
1802    #[test]
1803    fn test_unsafe_blocks_have_safety_comments() {
1804        // SEC-08: All unsafe blocks must have // SAFETY: documentation
1805        let source = include_str!("seccomp.rs");
1806        let mut pos = 0;
1807        while let Some(idx) = source[pos..].find("unsafe {") {
1808            let abs_idx = pos + idx;
1809            // Check that there's a SAFETY comment within 200 chars before the unsafe block
1810            let start = abs_idx.saturating_sub(200);
1811            let context = &source[start..abs_idx];
1812            assert!(
1813                context.contains("SAFETY:"),
1814                "unsafe block at byte {} must have a // SAFETY: comment. Context: ...{}...",
1815                abs_idx,
1816                &source[abs_idx.saturating_sub(80)..abs_idx + 10]
1817            );
1818            pos = abs_idx + 1;
1819        }
1820    }
1821
1822    // --- H-1: mprotect MaskedEq logic verification ---
1823    //
1824    // The mprotect filter uses MaskedEq((PROT_WRITE | PROT_EXEC), value) to
1825    // allow only combinations where the W|X bits match one of {0, W, X}.
1826    // These tests prove the logic is correct without installing a real
1827    // seccomp filter (which would affect the test process).
1828
1829    /// Helper: simulates the MaskedEq check that the seccomp BPF would perform.
1830    /// Returns true if the prot value would be ALLOWED by one of the rules.
1831    fn mprotect_would_allow(prot: u64) -> bool {
1832        let mask = (libc::PROT_WRITE | libc::PROT_EXEC) as u64;
1833        let allowed_values: &[u64] = &[0, libc::PROT_WRITE as u64, libc::PROT_EXEC as u64];
1834        let masked = prot & mask;
1835        allowed_values.contains(&masked)
1836    }
1837
1838    #[test]
1839    fn test_mprotect_allows_prot_none() {
1840        assert!(mprotect_would_allow(0), "PROT_NONE must be allowed");
1841    }
1842
1843    #[test]
1844    fn test_mprotect_allows_prot_read_only() {
1845        assert!(
1846            mprotect_would_allow(libc::PROT_READ as u64),
1847            "PROT_READ must be allowed (W|X bits are 0)"
1848        );
1849    }
1850
1851    #[test]
1852    fn test_mprotect_allows_prot_read_write() {
1853        assert!(
1854            mprotect_would_allow((libc::PROT_READ | libc::PROT_WRITE) as u64),
1855            "PROT_READ|PROT_WRITE must be allowed"
1856        );
1857    }
1858
1859    #[test]
1860    fn test_mprotect_allows_prot_read_exec() {
1861        assert!(
1862            mprotect_would_allow((libc::PROT_READ | libc::PROT_EXEC) as u64),
1863            "PROT_READ|PROT_EXEC must be allowed"
1864        );
1865    }
1866
1867    #[test]
1868    fn test_mprotect_rejects_prot_write_exec() {
1869        assert!(
1870            !mprotect_would_allow((libc::PROT_WRITE | libc::PROT_EXEC) as u64),
1871            "PROT_WRITE|PROT_EXEC (W^X violation) must be REJECTED"
1872        );
1873    }
1874
1875    #[test]
1876    fn test_mprotect_rejects_prot_read_write_exec() {
1877        assert!(
1878            !mprotect_would_allow((libc::PROT_READ | libc::PROT_WRITE | libc::PROT_EXEC) as u64),
1879            "PROT_READ|PROT_WRITE|PROT_EXEC (W^X violation) must be REJECTED"
1880        );
1881    }
1882
1883    #[test]
1884    fn test_mprotect_allows_prot_write_alone() {
1885        assert!(
1886            mprotect_would_allow(libc::PROT_WRITE as u64),
1887            "PROT_WRITE alone must be allowed"
1888        );
1889    }
1890
1891    #[test]
1892    fn test_mprotect_allows_prot_exec_alone() {
1893        assert!(
1894            mprotect_would_allow(libc::PROT_EXEC as u64),
1895            "PROT_EXEC alone must be allowed"
1896        );
1897    }
1898
1899    // --- Extra syscall allowlist tests ---
1900
1901    #[test]
1902    fn test_extra_syscalls_are_merged_into_filter() {
1903        let extra = vec!["io_uring_setup".to_string(), "sysinfo".to_string()];
1904        let rules = SeccompManager::minimal_filter(true, &extra).unwrap();
1905        assert!(
1906            rules.contains_key(&libc::SYS_io_uring_setup),
1907            "io_uring_setup must be in filter when requested via extra_syscalls"
1908        );
1909        assert!(
1910            rules.contains_key(&libc::SYS_sysinfo),
1911            "sysinfo must be in filter when requested via extra_syscalls"
1912        );
1913    }
1914
1915    #[test]
1916    fn test_extra_syscalls_do_not_override_arg_filtered() {
1917        // If a user requests "clone" via extra_syscalls, the arg-filtered
1918        // version from the built-in filter should still be present (not
1919        // replaced with an unconditional allow).
1920        let extra = vec!["clone".to_string()];
1921        let rules = SeccompManager::minimal_filter(true, &extra).unwrap();
1922        let clone_rules = rules.get(&libc::SYS_clone);
1923        assert!(
1924            clone_rules.is_some() && !clone_rules.unwrap().is_empty(),
1925            "clone must retain argument-level filtering even when in extra_syscalls"
1926        );
1927    }
1928
1929    #[test]
1930    fn test_extra_syscalls_unknown_name_is_warned_and_skipped() {
1931        // Unknown syscall names emit a WARN and are skipped (not fatal)
1932        let extra = vec!["not_a_real_syscall".to_string()];
1933        let result = SeccompManager::minimal_filter(true, &extra);
1934        assert!(
1935            result.is_ok(),
1936            "Unknown syscall name should warn and skip, not error"
1937        );
1938    }
1939
1940    #[test]
1941    fn test_extra_syscalls_empty_is_noop() {
1942        let rules_without = SeccompManager::minimal_filter(true, &[]).unwrap();
1943        let rules_with = SeccompManager::minimal_filter(true, &[]).unwrap();
1944        assert_eq!(rules_without.len(), rules_with.len());
1945    }
1946
1947    #[test]
1948    fn test_extra_syscalls_duplicate_of_default_is_harmless() {
1949        // Requesting a syscall that's already in the default allowlist should work fine
1950        let extra = vec!["read".to_string()];
1951        let rules = SeccompManager::minimal_filter(true, &extra).unwrap();
1952        assert!(rules.contains_key(&libc::SYS_read));
1953    }
1954
1955    #[test]
1956    fn test_extra_syscalls_blocked_known_syscall_not_added() {
1957        // A known syscall that is NOT in OPT_IN_SYSCALLS must be blocked
1958        // (not added to the filter rules). E.g. kexec_load, bpf, ptrace.
1959        let extra = vec!["kexec_load".to_string()];
1960        let rules = SeccompManager::minimal_filter(true, &extra).unwrap();
1961        assert!(
1962            !rules.contains_key(&libc::SYS_kexec_load),
1963            "kexec_load must be blocked even when requested via --seccomp-allow"
1964        );
1965    }
1966
1967    #[test]
1968    fn test_extra_syscalls_opt_in_syscall_is_added() {
1969        // Syscalls in OPT_IN_SYSCALLS must be added when requested
1970        let extra = vec!["io_uring_setup".to_string()];
1971        let rules = SeccompManager::minimal_filter(true, &extra).unwrap();
1972        assert!(
1973            rules.contains_key(&libc::SYS_io_uring_setup),
1974            "io_uring_setup is in OPT_IN_SYSCALLS and must be added"
1975        );
1976    }
1977}