Skip to main content

nucleus/security/
seccomp.rs

1use crate::error::{NucleusError, Result};
2use crate::security::policy::sha256_hex;
3use seccompiler::{BpfProgram, SeccompAction, SeccompCondition, SeccompFilter, SeccompRule};
4use std::collections::BTreeMap;
5use std::path::Path;
6use tracing::{debug, info, warn};
7
/// Seccomp filter manager.
///
/// Implements syscall whitelisting for the security state machine
/// (NucleusSecurity_Seccomp_SeccompEnforcement.tla).
pub struct SeccompManager {
    /// Set to `true` once a filter has been installed through this manager;
    /// the apply methods return early when it is already set.
    applied: bool,
}
15
16const DENIED_CLONE_NAMESPACE_FLAGS: u64 = (libc::CLONE_NEWUSER
17    | libc::CLONE_NEWNS
18    | libc::CLONE_NEWNET
19    | libc::CLONE_NEWIPC
20    | libc::CLONE_NEWUTS
21    | libc::CLONE_NEWPID
22    | libc::CLONE_NEWCGROUP
23    | libc::CLONE_NEWTIME) as u64;
24
25impl SeccompManager {
26    pub fn new() -> Self {
27        Self { applied: false }
28    }
29
30    fn base_allowed_syscalls() -> Vec<i64> {
31        let mut syscalls = vec![
32            // File I/O
33            libc::SYS_read,
34            libc::SYS_write,
35            libc::SYS_openat,
36            libc::SYS_close,
37            libc::SYS_fstat,
38            libc::SYS_lseek,
39            libc::SYS_fcntl,
40            libc::SYS_readv,
41            libc::SYS_writev,
42            libc::SYS_pread64,
43            libc::SYS_pwrite64,
44            libc::SYS_readlinkat,
45            libc::SYS_newfstatat,
46            libc::SYS_statx,
47            libc::SYS_faccessat,
48            libc::SYS_faccessat2,
49            libc::SYS_dup,
50            libc::SYS_dup3,
51            libc::SYS_pipe2,
52            libc::SYS_unlinkat,
53            libc::SYS_renameat,
54            libc::SYS_renameat2,
55            libc::SYS_linkat,
56            libc::SYS_symlinkat,
57            libc::SYS_fchmod,
58            libc::SYS_fchmodat,
59            libc::SYS_truncate,
60            libc::SYS_ftruncate,
61            libc::SYS_fallocate,
62            #[cfg(target_arch = "x86_64")]
63            libc::SYS_fadvise64,
64            libc::SYS_fsync,
65            libc::SYS_fdatasync,
66            libc::SYS_flock,
67            #[cfg(target_arch = "x86_64")]
68            libc::SYS_sendfile,
69            libc::SYS_copy_file_range,
70            libc::SYS_splice,
71            libc::SYS_tee,
72            // Memory management
73            libc::SYS_mmap,
74            libc::SYS_munmap,
75            libc::SYS_brk,
76            libc::SYS_mremap,
77            libc::SYS_madvise,
78            libc::SYS_msync,
79            // Process management
80            // fork intentionally excluded — modern glibc/musl use clone(), which
81            // has namespace-flag filtering. Removing SYS_fork forces all forks
82            // through the filtered clone path (defense-in-depth against fork bombs
83            // and unfiltered namespace creation).
84            libc::SYS_execve,
85            // execveat is conditionally allowed below (AT_EMPTY_PATH blocked)
86            libc::SYS_wait4,
87            libc::SYS_waitid,
88            libc::SYS_exit,
89            libc::SYS_exit_group,
90            libc::SYS_getpid,
91            libc::SYS_gettid,
92            libc::SYS_getuid,
93            libc::SYS_getgid,
94            libc::SYS_geteuid,
95            libc::SYS_getegid,
96            libc::SYS_getppid,
97            libc::SYS_setsid,
98            libc::SYS_getgroups,
99            // Signals
100            libc::SYS_rt_sigaction,
101            libc::SYS_rt_sigprocmask,
102            libc::SYS_rt_sigreturn,
103            libc::SYS_rt_sigsuspend,
104            libc::SYS_sigaltstack,
105            libc::SYS_kill,
106            libc::SYS_tgkill,
107            // Time
108            libc::SYS_clock_gettime,
109            libc::SYS_clock_getres,
110            libc::SYS_clock_nanosleep,
111            libc::SYS_gettimeofday,
112            libc::SYS_nanosleep,
113            // Directories
114            libc::SYS_getcwd,
115            libc::SYS_chdir,
116            libc::SYS_fchdir,
117            libc::SYS_mkdirat,
118            libc::SYS_getdents64,
119            // Misc
120            libc::SYS_uname,
121            libc::SYS_getrandom,
122            libc::SYS_futex,
123            libc::SYS_set_tid_address,
124            libc::SYS_set_robust_list,
125            libc::SYS_get_robust_list,
126            libc::SYS_sysinfo,
127            libc::SYS_umask,
128            libc::SYS_prlimit64,
129            libc::SYS_getrusage,
130            libc::SYS_times,
131            libc::SYS_sched_yield,
132            libc::SYS_sched_getaffinity,
133            libc::SYS_getcpu,
134            libc::SYS_rseq,
135            libc::SYS_close_range,
136            // NOTE: memfd_create intentionally excluded — combined with execveat
137            // it enables fileless code execution bypassing all FS controls (SEC-02).
138            // Landlock bootstrap (runtime applies seccomp before Landlock)
139            libc::SYS_landlock_create_ruleset,
140            libc::SYS_landlock_add_rule,
141            libc::SYS_landlock_restrict_self,
142            // Socket/Network (safe introspection + local socketpair)
143            libc::SYS_getsockname,
144            libc::SYS_getpeername,
145            libc::SYS_socketpair,
146            libc::SYS_getsockopt,
147            // Poll/Select
148            libc::SYS_ppoll,
149            libc::SYS_pselect6,
150            libc::SYS_epoll_create1,
151            libc::SYS_epoll_ctl,
152            libc::SYS_epoll_pwait,
153            libc::SYS_eventfd2,
154            libc::SYS_signalfd4,
155            libc::SYS_timerfd_create,
156            libc::SYS_timerfd_settime,
157            libc::SYS_timerfd_gettime,
158        ];
159
160        // Legacy syscalls only available on x86_64 (aarch64 only has the *at variants)
161        #[cfg(target_arch = "x86_64")]
162        syscalls.extend_from_slice(&[
163            libc::SYS_open,
164            libc::SYS_stat,
165            libc::SYS_lstat,
166            libc::SYS_access,
167            libc::SYS_readlink,
168            libc::SYS_dup2,
169            libc::SYS_pipe,
170            libc::SYS_unlink,
171            libc::SYS_rename,
172            libc::SYS_link,
173            libc::SYS_symlink,
174            libc::SYS_chmod,
175            libc::SYS_mkdir,
176            libc::SYS_rmdir,
177            libc::SYS_getdents,
178            libc::SYS_getpgrp,
179            libc::SYS_arch_prctl,
180            libc::SYS_getrlimit,
181            libc::SYS_poll,
182            libc::SYS_select,
183            libc::SYS_epoll_create,
184            libc::SYS_epoll_wait,
185            libc::SYS_eventfd,
186            libc::SYS_signalfd,
187        ]);
188
189        syscalls
190    }
191
192    fn allowed_socket_domains(allow_network: bool) -> Vec<i32> {
193        if allow_network {
194            vec![libc::AF_UNIX, libc::AF_INET, libc::AF_INET6]
195        } else {
196            vec![libc::AF_UNIX]
197        }
198    }
199
200    fn network_mode_syscalls(allow_network: bool) -> Vec<i64> {
201        if allow_network {
202            vec![
203                libc::SYS_connect,
204                libc::SYS_sendto,
205                libc::SYS_recvfrom,
206                libc::SYS_sendmsg,
207                libc::SYS_recvmsg,
208                libc::SYS_shutdown,
209                libc::SYS_bind,
210                libc::SYS_listen,
211                libc::SYS_accept,
212                libc::SYS_accept4,
213                libc::SYS_setsockopt,
214            ]
215        } else {
216            Vec::new()
217        }
218    }
219
220    /// Get minimal syscall whitelist for basic container operation
221    ///
222    /// This is a restrictive whitelist that blocks dangerous syscalls:
223    /// - ptrace (process tracing)
224    /// - kexec_load (kernel loading)
225    /// - add_key, request_key, keyctl (kernel keyring)
226    /// - bpf (eBPF programs)
227    /// - perf_event_open (performance monitoring)
228    /// - userfaultfd (user fault handling)
229    fn minimal_filter(allow_network: bool) -> Result<BTreeMap<i64, Vec<SeccompRule>>> {
230        let mut rules: BTreeMap<i64, Vec<SeccompRule>> = BTreeMap::new();
231
232        // Essential syscalls for basic operation
233        let allowed_syscalls = Self::base_allowed_syscalls();
234
235        // Allow all these syscalls unconditionally
236        for syscall in allowed_syscalls {
237            rules.insert(syscall, Vec::new());
238        }
239
240        // Add network-mode-specific syscalls
241        for syscall in Self::network_mode_syscalls(allow_network) {
242            rules.insert(syscall, Vec::new());
243        }
244
245        // Restrict socket() domains by network mode.
246        // none: AF_UNIX only; network-enabled: AF_UNIX/AF_INET/AF_INET6.
247        let mut socket_rules = Vec::new();
248        for domain in Self::allowed_socket_domains(allow_network) {
249            let condition = SeccompCondition::new(
250                0, // arg0 is socket(domain, type, protocol)
251                seccompiler::SeccompCmpArgLen::Dword,
252                seccompiler::SeccompCmpOp::Eq,
253                domain as u64,
254            )
255            .map_err(|e| {
256                NucleusError::SeccompError(format!(
257                    "Failed to create socket domain condition: {}",
258                    e
259                ))
260            })?;
261            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
262                NucleusError::SeccompError(format!("Failed to create socket rule: {}", e))
263            })?;
264            socket_rules.push(rule);
265        }
266        rules.insert(libc::SYS_socket, socket_rules);
267
268        // ioctl: allow only safe terminal operations (arg0 = request code)
269        let ioctl_allowed: &[u64] = &[
270            0x5401, // TCGETS
271            0x5402, // TCSETS
272            0x5403, // TCSETSW
273            0x5404, // TCSETSF
274            0x540B, // TCFLSH
275            0x540F, // TIOCGPGRP
276            0x5410, // TIOCSPGRP
277            0x5413, // TIOCGWINSZ
278            0x5429, // TIOCGSID
279            0x541B, // FIONREAD
280            // FIONBIO (0x5421) intentionally excluded — sets non-blocking mode
281            // on network sockets, enabling sophisticated network exploitation.
282            0x5451, // FIOCLEX
283            0x5450, // FIONCLEX
284        ];
285        let mut ioctl_rules = Vec::new();
286        for &request in ioctl_allowed {
287            let condition = SeccompCondition::new(
288                1, // arg1 is the request code for ioctl(fd, request, ...)
289                seccompiler::SeccompCmpArgLen::Dword,
290                seccompiler::SeccompCmpOp::Eq,
291                request,
292            )
293            .map_err(|e| {
294                NucleusError::SeccompError(format!("Failed to create ioctl condition: {}", e))
295            })?;
296            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
297                NucleusError::SeccompError(format!("Failed to create ioctl rule: {}", e))
298            })?;
299            ioctl_rules.push(rule);
300        }
301        rules.insert(libc::SYS_ioctl, ioctl_rules);
302
303        // prctl: allow only safe operations (arg0 = option).
304        // Notably absent (hit default deny):
305        //   PR_CAPBSET_READ (23) — leaks capability bounding set info
306        //   PR_CAPBSET_DROP (24) — could weaken the capability bounding set
307        //   PR_SET_SECUREBITS (28) — could disable secure-exec restrictions
308        let prctl_allowed: &[u64] = &[
309            1,  // PR_SET_PDEATHSIG
310            2,  // PR_GET_PDEATHSIG
311            15, // PR_SET_NAME
312            16, // PR_GET_NAME
313            38, // PR_SET_NO_NEW_PRIVS
314            39, // PR_GET_NO_NEW_PRIVS
315        ];
316        let mut prctl_rules = Vec::new();
317        for &option in prctl_allowed {
318            let condition = SeccompCondition::new(
319                0, // arg0 is the option for prctl(option, ...)
320                seccompiler::SeccompCmpArgLen::Dword,
321                seccompiler::SeccompCmpOp::Eq,
322                option,
323            )
324            .map_err(|e| {
325                NucleusError::SeccompError(format!("Failed to create prctl condition: {}", e))
326            })?;
327            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
328                NucleusError::SeccompError(format!("Failed to create prctl rule: {}", e))
329            })?;
330            prctl_rules.push(rule);
331        }
332        rules.insert(libc::SYS_prctl, prctl_rules);
333
334        // mprotect: permit RW or RX transitions, but reject PROT_WRITE|PROT_EXEC.
335        let mut mprotect_rules = Vec::new();
336        for allowed in [0, libc::PROT_WRITE as u64, libc::PROT_EXEC as u64] {
337            let condition = SeccompCondition::new(
338                2, // arg2 is prot for mprotect(addr, len, prot)
339                seccompiler::SeccompCmpArgLen::Dword,
340                seccompiler::SeccompCmpOp::MaskedEq((libc::PROT_WRITE | libc::PROT_EXEC) as u64),
341                allowed,
342            )
343            .map_err(|e| {
344                NucleusError::SeccompError(format!("Failed to create mprotect condition: {}", e))
345            })?;
346            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
347                NucleusError::SeccompError(format!("Failed to create mprotect rule: {}", e))
348            })?;
349            mprotect_rules.push(rule);
350        }
351        rules.insert(libc::SYS_mprotect, mprotect_rules);
352
353        // clone3: ALLOWED unconditionally. clone3 passes flags inside a struct
354        // pointer that seccomp BPF cannot dereference, so namespace-flag filtering
355        // is impossible at the BPF level. However, glibc 2.34+ and newer musl use
356        // clone3 internally for posix_spawn/fork — blocking it breaks
357        // std::process::Command and any child-process spawning on modern systems.
358        //
359        // SECURITY INVARIANT: Namespace creation via clone3 is prevented solely by
360        // dropping CAP_SYS_ADMIN (and other namespace caps) *before* this seccomp
361        // filter is installed. If capability dropping is bypassed, clone3 becomes
362        // an unfiltered path to namespace creation. This is a known single point
363        // of failure — see CapabilityManager::drop_all() which must run first.
364        rules.insert(libc::SYS_clone3, Vec::new());
365
366        // clone: allow but deny namespace-creating flags to prevent nested namespace creation
367        let clone_condition = SeccompCondition::new(
368            0, // arg0 = flags
369            seccompiler::SeccompCmpArgLen::Qword,
370            seccompiler::SeccompCmpOp::MaskedEq(DENIED_CLONE_NAMESPACE_FLAGS),
371            0, // (flags & ns_flags) == 0: none of the namespace flags set
372        )
373        .map_err(|e| {
374            NucleusError::SeccompError(format!("Failed to create clone condition: {}", e))
375        })?;
376        let clone_rule = SeccompRule::new(vec![clone_condition]).map_err(|e| {
377            NucleusError::SeccompError(format!("Failed to create clone rule: {}", e))
378        })?;
379        rules.insert(libc::SYS_clone, vec![clone_rule]);
380
381        // execveat: allow but block AT_EMPTY_PATH (0x1000) to prevent fileless
382        // execution. With AT_EMPTY_PATH, execveat can execute code from any open
383        // fd (e.g., open + unlink, or even a socket fd), bypassing filesystem
384        // controls — not just memfd_create. Blocking memfd_create alone is
385        // insufficient. Normal execveat with dirfd+pathname (no AT_EMPTY_PATH)
386        // remains allowed.
387        let execveat_condition = SeccompCondition::new(
388            4, // arg4 = flags for execveat(dirfd, pathname, argv, envp, flags)
389            seccompiler::SeccompCmpArgLen::Dword,
390            seccompiler::SeccompCmpOp::MaskedEq(libc::AT_EMPTY_PATH as u64),
391            0, // (flags & AT_EMPTY_PATH) == 0: AT_EMPTY_PATH not set
392        )
393        .map_err(|e| {
394            NucleusError::SeccompError(format!("Failed to create execveat condition: {}", e))
395        })?;
396        let execveat_rule = SeccompRule::new(vec![execveat_condition]).map_err(|e| {
397            NucleusError::SeccompError(format!("Failed to create execveat rule: {}", e))
398        })?;
399        rules.insert(libc::SYS_execveat, vec![execveat_rule]);
400
401        Ok(rules)
402    }
403
404    /// Compile the minimal BPF filter without applying it
405    ///
406    /// This is useful for benchmarking filter compilation overhead
407    /// without the irreversible side effect of applying the filter.
408    pub fn compile_minimal_filter() -> Result<BpfProgram> {
409        let rules = Self::minimal_filter(true)?;
410        let filter = SeccompFilter::new(
411            rules,
412            SeccompAction::Errno(libc::EPERM as u32),
413            SeccompAction::Allow,
414            std::env::consts::ARCH.try_into().map_err(|e| {
415                NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
416            })?,
417        )
418        .map_err(|e| {
419            NucleusError::SeccompError(format!("Failed to create seccomp filter: {}", e))
420        })?;
421
422        let bpf_prog: BpfProgram = filter.try_into().map_err(|e| {
423            NucleusError::SeccompError(format!("Failed to compile BPF program: {}", e))
424        })?;
425
426        Ok(bpf_prog)
427    }
428
429    /// Apply seccomp filter
430    ///
431    /// This implements the transition: no_filter -> whitelist_active
432    /// in the seccomp state machine (NucleusSecurity_Seccomp_SeccompEnforcement.tla)
433    ///
434    /// Once applied, the filter cannot be removed (irreversible property)
435    /// In rootless mode or if seccomp setup fails, this will warn and continue
436    pub fn apply_minimal_filter(&mut self) -> Result<bool> {
437        self.apply_minimal_filter_with_mode(false, false)
438    }
439
440    /// Apply seccomp filter with configurable failure behavior
441    ///
442    /// When `best_effort` is true, failures are logged and execution continues.
443    /// When false, seccomp setup is fail-closed.
444    pub fn apply_minimal_filter_with_mode(
445        &mut self,
446        best_effort: bool,
447        log_denied: bool,
448    ) -> Result<bool> {
449        self.apply_filter_for_network_mode(true, best_effort, log_denied)
450    }
451
452    /// Apply seccomp filter with network-mode-aware socket restrictions
453    ///
454    /// When `allow_network` is false, `SYS_socket` is restricted to AF_UNIX only,
455    /// preventing creation of network sockets (AF_INET, AF_INET6, etc.).
456    /// When `allow_network` is true, all socket domains are permitted.
457    ///
458    /// When `best_effort` is true, failures are logged and execution continues.
459    /// When false, seccomp setup is fail-closed.
460    pub fn apply_filter_for_network_mode(
461        &mut self,
462        allow_network: bool,
463        best_effort: bool,
464        log_denied: bool,
465    ) -> Result<bool> {
466        if self.applied {
467            debug!("Seccomp filter already applied, skipping");
468            return Ok(true);
469        }
470
471        info!(allow_network, "Applying seccomp filter");
472
473        let rules = match Self::minimal_filter(allow_network) {
474            Ok(r) => r,
475            Err(e) => {
476                if best_effort {
477                    warn!(
478                        "Failed to create seccomp rules: {} (continuing without seccomp)",
479                        e
480                    );
481                    return Ok(false);
482                }
483                return Err(e);
484            }
485        };
486
487        let filter = match SeccompFilter::new(
488            rules,
489            SeccompAction::Errno(libc::EPERM as u32), // Default: deny with EPERM
490            SeccompAction::Allow,                     // Match action: allow
491            std::env::consts::ARCH.try_into().map_err(|e| {
492                NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
493            })?,
494        ) {
495            Ok(f) => f,
496            Err(e) => {
497                if best_effort {
498                    warn!(
499                        "Failed to create seccomp filter: {} (continuing without seccomp)",
500                        e
501                    );
502                    return Ok(false);
503                }
504                return Err(NucleusError::SeccompError(format!(
505                    "Failed to create seccomp filter: {}",
506                    e
507                )));
508            }
509        };
510
511        let bpf_prog: BpfProgram = match filter.try_into() {
512            Ok(p) => p,
513            Err(e) => {
514                if best_effort {
515                    warn!(
516                        "Failed to compile BPF program: {} (continuing without seccomp)",
517                        e
518                    );
519                    return Ok(false);
520                }
521                return Err(NucleusError::SeccompError(format!(
522                    "Failed to compile BPF program: {}",
523                    e
524                )));
525            }
526        };
527
528        // Apply the filter
529        match Self::apply_bpf_program(&bpf_prog, log_denied) {
530            Ok(_) => {
531                self.applied = true;
532                info!("Successfully applied seccomp filter");
533                Ok(true)
534            }
535            Err(e) => {
536                if best_effort {
537                    warn!(
538                        "Failed to apply seccomp filter: {} (continuing without seccomp)",
539                        e
540                    );
541                    Ok(false)
542                } else {
543                    Err(NucleusError::SeccompError(format!(
544                        "Failed to apply seccomp filter: {}",
545                        e
546                    )))
547                }
548            }
549        }
550    }
551
552    /// Apply a seccomp profile loaded from a JSON file.
553    ///
554    /// The profile format is a JSON object with:
555    /// ```json
556    /// {
557    ///   "defaultAction": "SCMP_ACT_ERRNO",
558    ///   "syscalls": [
559    ///     { "names": ["read", "write", "open", ...], "action": "SCMP_ACT_ALLOW" }
560    ///   ]
561    /// }
562    /// ```
563    ///
564    /// This is a subset of the OCI seccomp profile format. Only the syscall name
565    /// allowlist is used; argument-level filtering from the built-in profile is
566    /// not applied when using a custom profile.
567    ///
568    /// If `expected_sha256` is provided, the file's SHA-256 hash is verified
569    /// against it before loading. This prevents silent profile tampering.
570    pub fn apply_profile_from_file(
571        &mut self,
572        profile_path: &Path,
573        expected_sha256: Option<&str>,
574        audit_mode: bool,
575    ) -> Result<bool> {
576        if self.applied {
577            debug!("Seccomp filter already applied, skipping");
578            return Ok(true);
579        }
580
581        info!("Loading seccomp profile from {:?}", profile_path);
582
583        // Read profile file
584        let content = std::fs::read(profile_path).map_err(|e| {
585            NucleusError::SeccompError(format!(
586                "Failed to read seccomp profile {:?}: {}",
587                profile_path, e
588            ))
589        })?;
590
591        // Verify SHA-256 hash if expected
592        if let Some(expected) = expected_sha256 {
593            let actual = sha256_hex(&content);
594            if actual != expected {
595                return Err(NucleusError::SeccompError(format!(
596                    "Seccomp profile hash mismatch: expected {}, got {}",
597                    expected, actual
598                )));
599            }
600            info!("Seccomp profile hash verified: {}", actual);
601        }
602
603        // Parse profile
604        let profile: SeccompProfile = serde_json::from_slice(&content).map_err(|e| {
605            NucleusError::SeccompError(format!("Failed to parse seccomp profile: {}", e))
606        })?;
607
608        // Warn when custom profile allows security-critical syscalls without
609        // argument-level filtering. The built-in filter restricts clone, ioctl,
610        // prctl, and socket at the argument level; a custom profile that allows
611        // them by name only silently removes all of that hardening.
612        Self::warn_missing_arg_filters(&profile);
613
614        // Build filter from profile
615        let mut rules: BTreeMap<i64, Vec<SeccompRule>> = BTreeMap::new();
616
617        for syscall_group in &profile.syscalls {
618            if syscall_group.action == "SCMP_ACT_ALLOW" {
619                for name in &syscall_group.names {
620                    if let Some(nr) = syscall_name_to_number(name) {
621                        rules.insert(nr, Vec::new());
622                    } else {
623                        warn!("Unknown syscall in profile: {} (skipping)", name);
624                    }
625                }
626            }
627        }
628
629        // SEC-01: Merge built-in argument filters for security-critical syscalls.
630        // Custom profiles that allow clone/ioctl/prctl/socket/mprotect by name
631        // without argument-level filters would silently remove all hardening.
632        // Overwrite their empty rules with the built-in argument-filtered rules.
633        let builtin_rules = Self::minimal_filter(true)?;
634        for syscall_name in Self::ARG_FILTERED_SYSCALLS {
635            if let Some(nr) = syscall_name_to_number(syscall_name) {
636                if let std::collections::btree_map::Entry::Occupied(mut entry) = rules.entry(nr) {
637                    if let Some(builtin) = builtin_rules.get(&nr) {
638                        if !builtin.is_empty() {
639                            info!(
640                                "Merging built-in argument filters for '{}' into custom profile",
641                                syscall_name
642                            );
643                            entry.insert(builtin.clone());
644                        }
645                    }
646                }
647            }
648        }
649        // Also enforce clone3 denial — it cannot be argument-filtered
650        rules.remove(&libc::SYS_clone3);
651
652        let filter = SeccompFilter::new(
653            rules,
654            SeccompAction::Errno(libc::EPERM as u32),
655            SeccompAction::Allow,
656            std::env::consts::ARCH.try_into().map_err(|e| {
657                NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
658            })?,
659        )
660        .map_err(|e| {
661            NucleusError::SeccompError(format!(
662                "Failed to create seccomp filter from profile: {}",
663                e
664            ))
665        })?;
666
667        let bpf_prog: BpfProgram = filter.try_into().map_err(|e| {
668            NucleusError::SeccompError(format!("Failed to compile BPF program from profile: {}", e))
669        })?;
670
671        match Self::apply_bpf_program(&bpf_prog, audit_mode) {
672            Ok(_) => {
673                self.applied = true;
674                info!(
675                    "Seccomp profile applied from {:?} (log_denied={})",
676                    profile_path, audit_mode
677                );
678                Ok(true)
679            }
680            Err(e) => Err(e),
681        }
682    }
683
684    /// Install an allow-all seccomp filter with SECCOMP_FILTER_FLAG_LOG.
685    ///
686    /// Used in trace mode: all syscalls are allowed but logged to the kernel
687    /// audit subsystem. A separate reader collects the logged syscalls.
688    pub fn apply_trace_filter(&mut self) -> Result<bool> {
689        if self.applied {
690            debug!("Seccomp filter already applied, skipping trace filter");
691            return Ok(true);
692        }
693
694        info!("Applying seccomp trace filter (allow-all + LOG)");
695
696        // Create an empty rule set — with SeccompAction::Allow as default,
697        // every syscall is permitted. The LOG flag causes the kernel to
698        // audit each syscall decision.
699        let filter = SeccompFilter::new(
700            BTreeMap::new(),
701            SeccompAction::Allow, // default: allow everything
702            SeccompAction::Allow, // match action (unused — no rules)
703            std::env::consts::ARCH.try_into().map_err(|e| {
704                NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
705            })?,
706        )
707        .map_err(|e| NucleusError::SeccompError(format!("Failed to create trace filter: {}", e)))?;
708
709        let bpf_prog: BpfProgram = filter.try_into().map_err(|e| {
710            NucleusError::SeccompError(format!("Failed to compile trace BPF: {}", e))
711        })?;
712
713        // Apply with LOG flag so kernel audits every syscall
714        Self::apply_bpf_program(&bpf_prog, true)?;
715        self.applied = true;
716        info!("Seccomp trace filter applied (all syscalls allowed + logged)");
717        Ok(true)
718    }
719
720    /// Syscalls that the built-in filter restricts at the argument level.
721    /// Custom profiles allowing these without argument filters weaken security.
722    const ARG_FILTERED_SYSCALLS: &'static [&'static str] = &[
723        "clone", "clone3", "execveat", "ioctl", "mprotect", "prctl", "socket",
724    ];
725
726    /// Warn when a custom seccomp profile allows security-critical syscalls
727    /// without argument-level filtering.
728    fn warn_missing_arg_filters(profile: &SeccompProfile) {
729        for group in &profile.syscalls {
730            if group.action != "SCMP_ACT_ALLOW" {
731                continue;
732            }
733            for name in &group.names {
734                if Self::ARG_FILTERED_SYSCALLS.contains(&name.as_str()) && group.args.is_empty() {
735                    warn!(
736                        "Custom seccomp profile allows '{}' without argument filters. \
737                         The built-in filter restricts this syscall at the argument level. \
738                         This profile weakens security compared to the default.",
739                        name
740                    );
741                }
742            }
743        }
744    }
745
746    /// Check if seccomp filter has been applied
747    pub fn is_applied(&self) -> bool {
748        self.applied
749    }
750
751    fn apply_bpf_program(bpf_prog: &BpfProgram, log_denied: bool) -> Result<()> {
752        let mut flags: libc::c_ulong = 0;
753        if log_denied {
754            flags |= libc::SECCOMP_FILTER_FLAG_LOG as libc::c_ulong;
755        }
756
757        match Self::apply_bpf_program_with_flags(bpf_prog, flags) {
758            Ok(()) => Ok(()),
759            Err(err)
760                if log_denied
761                    && err.raw_os_error() == Some(libc::EINVAL)
762                    && libc::SECCOMP_FILTER_FLAG_LOG != 0 =>
763            {
764                warn!(
765                    "Kernel rejected SECCOMP_FILTER_FLAG_LOG; continuing with seccomp \
766                     enforcement without deny logging"
767                );
768                Self::apply_bpf_program_with_flags(bpf_prog, 0)?;
769                Ok(())
770            }
771            Err(err) => Err(NucleusError::SeccompError(format!(
772                "Failed to apply seccomp filter: {}",
773                err
774            ))),
775        }
776    }
777
778    fn apply_bpf_program_with_flags(
779        bpf_prog: &BpfProgram,
780        flags: libc::c_ulong,
781    ) -> std::io::Result<()> {
782        // SAFETY: `prctl(PR_SET_NO_NEW_PRIVS, ...)` has no pointer arguments here
783        // and only affects the current thread/process as required before seccomp.
784        let rc = unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
785        if rc != 0 {
786            return Err(std::io::Error::last_os_error());
787        }
788
789        let prog = libc::sock_fprog {
790            len: bpf_prog.len() as u16,
791            filter: bpf_prog.as_ptr() as *mut libc::sock_filter,
792        };
793
794        // SAFETY: `prog` points to a live BPF program buffer for the duration of
795        // the syscall and the kernel copies the pointed-to filter immediately.
796        let rc = unsafe {
797            libc::syscall(
798                libc::SYS_seccomp,
799                libc::SECCOMP_SET_MODE_FILTER,
800                flags,
801                &prog as *const libc::sock_fprog,
802            )
803        };
804
805        if rc < 0 {
806            return Err(std::io::Error::last_os_error());
807        }
808
809        Ok(())
810    }
811}
812
813// SeccompProfile and SeccompSyscallGroup are defined in seccomp_generate.rs
814use crate::security::seccomp_generate::SeccompProfile;
815
816/// Map a syscall name (e.g. "read", "write") to its Linux syscall number.
817///
818/// Covers the most common syscalls. Unknown names return None.
819fn syscall_name_to_number(name: &str) -> Option<i64> {
820    match name {
821        // File I/O
822        "read" => Some(libc::SYS_read),
823        "write" => Some(libc::SYS_write),
824        #[cfg(target_arch = "x86_64")]
825        "open" => Some(libc::SYS_open),
826        "openat" => Some(libc::SYS_openat),
827        "close" => Some(libc::SYS_close),
828        #[cfg(target_arch = "x86_64")]
829        "stat" => Some(libc::SYS_stat),
830        "fstat" => Some(libc::SYS_fstat),
831        #[cfg(target_arch = "x86_64")]
832        "lstat" => Some(libc::SYS_lstat),
833        "lseek" => Some(libc::SYS_lseek),
834        #[cfg(target_arch = "x86_64")]
835        "access" => Some(libc::SYS_access),
836        "fcntl" => Some(libc::SYS_fcntl),
837        "readv" => Some(libc::SYS_readv),
838        "writev" => Some(libc::SYS_writev),
839        "pread64" => Some(libc::SYS_pread64),
840        "pwrite64" => Some(libc::SYS_pwrite64),
841        #[cfg(target_arch = "x86_64")]
842        "readlink" => Some(libc::SYS_readlink),
843        "readlinkat" => Some(libc::SYS_readlinkat),
844        "newfstatat" => Some(libc::SYS_newfstatat),
845        "statx" => Some(libc::SYS_statx),
846        "faccessat" => Some(libc::SYS_faccessat),
847        "faccessat2" => Some(libc::SYS_faccessat2),
848        "dup" => Some(libc::SYS_dup),
849        #[cfg(target_arch = "x86_64")]
850        "dup2" => Some(libc::SYS_dup2),
851        "dup3" => Some(libc::SYS_dup3),
852        #[cfg(target_arch = "x86_64")]
853        "pipe" => Some(libc::SYS_pipe),
854        "pipe2" => Some(libc::SYS_pipe2),
855        #[cfg(target_arch = "x86_64")]
856        "unlink" => Some(libc::SYS_unlink),
857        "unlinkat" => Some(libc::SYS_unlinkat),
858        #[cfg(target_arch = "x86_64")]
859        "rename" => Some(libc::SYS_rename),
860        "renameat" => Some(libc::SYS_renameat),
861        "renameat2" => Some(libc::SYS_renameat2),
862        #[cfg(target_arch = "x86_64")]
863        "link" => Some(libc::SYS_link),
864        "linkat" => Some(libc::SYS_linkat),
865        #[cfg(target_arch = "x86_64")]
866        "symlink" => Some(libc::SYS_symlink),
867        "symlinkat" => Some(libc::SYS_symlinkat),
868        #[cfg(target_arch = "x86_64")]
869        "chmod" => Some(libc::SYS_chmod),
870        "fchmod" => Some(libc::SYS_fchmod),
871        "fchmodat" => Some(libc::SYS_fchmodat),
872        "truncate" => Some(libc::SYS_truncate),
873        "ftruncate" => Some(libc::SYS_ftruncate),
874        "fallocate" => Some(libc::SYS_fallocate),
875        #[cfg(target_arch = "x86_64")]
876        "fadvise64" => Some(libc::SYS_fadvise64),
877        "fsync" => Some(libc::SYS_fsync),
878        "fdatasync" => Some(libc::SYS_fdatasync),
879        "flock" => Some(libc::SYS_flock),
880        #[cfg(target_arch = "x86_64")]
881        "sendfile" => Some(libc::SYS_sendfile),
882        "copy_file_range" => Some(libc::SYS_copy_file_range),
883        "splice" => Some(libc::SYS_splice),
884        "tee" => Some(libc::SYS_tee),
885        // Memory
886        "mmap" => Some(libc::SYS_mmap),
887        "munmap" => Some(libc::SYS_munmap),
888        "mprotect" => Some(libc::SYS_mprotect),
889        "brk" => Some(libc::SYS_brk),
890        "mremap" => Some(libc::SYS_mremap),
891        "madvise" => Some(libc::SYS_madvise),
892        "msync" => Some(libc::SYS_msync),
893        "mlock" => Some(libc::SYS_mlock),
894        "munlock" => Some(libc::SYS_munlock),
895        // Process
896        #[cfg(target_arch = "x86_64")]
897        "fork" => Some(libc::SYS_fork),
898        "clone" => Some(libc::SYS_clone),
899        "clone3" => Some(libc::SYS_clone3),
900        "execve" => Some(libc::SYS_execve),
901        "execveat" => Some(libc::SYS_execveat),
902        "wait4" => Some(libc::SYS_wait4),
903        "waitid" => Some(libc::SYS_waitid),
904        "exit" => Some(libc::SYS_exit),
905        "exit_group" => Some(libc::SYS_exit_group),
906        "getpid" => Some(libc::SYS_getpid),
907        "gettid" => Some(libc::SYS_gettid),
908        "getuid" => Some(libc::SYS_getuid),
909        "getgid" => Some(libc::SYS_getgid),
910        "geteuid" => Some(libc::SYS_geteuid),
911        "getegid" => Some(libc::SYS_getegid),
912        "getppid" => Some(libc::SYS_getppid),
913        #[cfg(target_arch = "x86_64")]
914        "getpgrp" => Some(libc::SYS_getpgrp),
915        "setsid" => Some(libc::SYS_setsid),
916        "getgroups" => Some(libc::SYS_getgroups),
917        // Signals
918        "rt_sigaction" => Some(libc::SYS_rt_sigaction),
919        "rt_sigprocmask" => Some(libc::SYS_rt_sigprocmask),
920        "rt_sigreturn" => Some(libc::SYS_rt_sigreturn),
921        "rt_sigsuspend" => Some(libc::SYS_rt_sigsuspend),
922        "sigaltstack" => Some(libc::SYS_sigaltstack),
923        "kill" => Some(libc::SYS_kill),
924        "tgkill" => Some(libc::SYS_tgkill),
925        // Time
926        "clock_gettime" => Some(libc::SYS_clock_gettime),
927        "clock_getres" => Some(libc::SYS_clock_getres),
928        "clock_nanosleep" => Some(libc::SYS_clock_nanosleep),
929        "gettimeofday" => Some(libc::SYS_gettimeofday),
930        "nanosleep" => Some(libc::SYS_nanosleep),
931        // Directories
932        "getcwd" => Some(libc::SYS_getcwd),
933        "chdir" => Some(libc::SYS_chdir),
934        "fchdir" => Some(libc::SYS_fchdir),
935        #[cfg(target_arch = "x86_64")]
936        "mkdir" => Some(libc::SYS_mkdir),
937        "mkdirat" => Some(libc::SYS_mkdirat),
938        #[cfg(target_arch = "x86_64")]
939        "rmdir" => Some(libc::SYS_rmdir),
940        #[cfg(target_arch = "x86_64")]
941        "getdents" => Some(libc::SYS_getdents),
942        "getdents64" => Some(libc::SYS_getdents64),
943        // Network
944        "socket" => Some(libc::SYS_socket),
945        "connect" => Some(libc::SYS_connect),
946        "sendto" => Some(libc::SYS_sendto),
947        "recvfrom" => Some(libc::SYS_recvfrom),
948        "sendmsg" => Some(libc::SYS_sendmsg),
949        "recvmsg" => Some(libc::SYS_recvmsg),
950        "shutdown" => Some(libc::SYS_shutdown),
951        "bind" => Some(libc::SYS_bind),
952        "listen" => Some(libc::SYS_listen),
953        "accept" => Some(libc::SYS_accept),
954        "accept4" => Some(libc::SYS_accept4),
955        "setsockopt" => Some(libc::SYS_setsockopt),
956        "getsockopt" => Some(libc::SYS_getsockopt),
957        "getsockname" => Some(libc::SYS_getsockname),
958        "getpeername" => Some(libc::SYS_getpeername),
959        "socketpair" => Some(libc::SYS_socketpair),
960        // Poll/Select
961        #[cfg(target_arch = "x86_64")]
962        "poll" => Some(libc::SYS_poll),
963        "ppoll" => Some(libc::SYS_ppoll),
964        #[cfg(target_arch = "x86_64")]
965        "select" => Some(libc::SYS_select),
966        "pselect6" => Some(libc::SYS_pselect6),
967        #[cfg(target_arch = "x86_64")]
968        "epoll_create" => Some(libc::SYS_epoll_create),
969        "epoll_create1" => Some(libc::SYS_epoll_create1),
970        "epoll_ctl" => Some(libc::SYS_epoll_ctl),
971        #[cfg(target_arch = "x86_64")]
972        "epoll_wait" => Some(libc::SYS_epoll_wait),
973        "epoll_pwait" => Some(libc::SYS_epoll_pwait),
974        #[cfg(target_arch = "x86_64")]
975        "eventfd" => Some(libc::SYS_eventfd),
976        "eventfd2" => Some(libc::SYS_eventfd2),
977        #[cfg(target_arch = "x86_64")]
978        "signalfd" => Some(libc::SYS_signalfd),
979        "signalfd4" => Some(libc::SYS_signalfd4),
980        "timerfd_create" => Some(libc::SYS_timerfd_create),
981        "timerfd_settime" => Some(libc::SYS_timerfd_settime),
982        "timerfd_gettime" => Some(libc::SYS_timerfd_gettime),
983        // Misc
984        "uname" => Some(libc::SYS_uname),
985        "getrandom" => Some(libc::SYS_getrandom),
986        "futex" => Some(libc::SYS_futex),
987        "set_tid_address" => Some(libc::SYS_set_tid_address),
988        "set_robust_list" => Some(libc::SYS_set_robust_list),
989        "get_robust_list" => Some(libc::SYS_get_robust_list),
990        #[cfg(target_arch = "x86_64")]
991        "arch_prctl" => Some(libc::SYS_arch_prctl),
992        "sysinfo" => Some(libc::SYS_sysinfo),
993        "umask" => Some(libc::SYS_umask),
994        #[cfg(target_arch = "x86_64")]
995        "getrlimit" => Some(libc::SYS_getrlimit),
996        "prlimit64" => Some(libc::SYS_prlimit64),
997        "getrusage" => Some(libc::SYS_getrusage),
998        "times" => Some(libc::SYS_times),
999        "sched_yield" => Some(libc::SYS_sched_yield),
1000        "sched_getaffinity" => Some(libc::SYS_sched_getaffinity),
1001        "getcpu" => Some(libc::SYS_getcpu),
1002        "rseq" => Some(libc::SYS_rseq),
1003        "close_range" => Some(libc::SYS_close_range),
1004        "memfd_create" => Some(libc::SYS_memfd_create),
1005        "ioctl" => Some(libc::SYS_ioctl),
1006        "prctl" => Some(libc::SYS_prctl),
1007        // Landlock
1008        "landlock_create_ruleset" => Some(libc::SYS_landlock_create_ruleset),
1009        "landlock_add_rule" => Some(libc::SYS_landlock_add_rule),
1010        "landlock_restrict_self" => Some(libc::SYS_landlock_restrict_self),
1011        _ => None,
1012    }
1013}
1014
1015impl Default for SeccompManager {
1016    fn default() -> Self {
1017        Self::new()
1018    }
1019}
1020
#[cfg(test)]
mod tests {
    use super::*;

    /// Largest index not greater than `idx` that lies on a UTF-8 character
    /// boundary of `text`. `idx` must not exceed `text.len()`.
    fn char_floor(text: &str, mut idx: usize) -> usize {
        // Index 0 is always a boundary, so the loop terminates.
        while !text.is_char_boundary(idx) {
            idx -= 1;
        }
        idx
    }

    #[test]
    fn test_seccomp_manager_initial_state() {
        let mgr = SeccompManager::new();
        assert!(!mgr.is_applied());
    }

    #[test]
    fn test_apply_idempotent() {
        let mgr = SeccompManager::new();
        // Note: We can't actually test application in unit tests
        // as it would affect the test process itself.
        // This is tested in integration tests instead; here only the
        // pre-application state is checked.
        assert!(!mgr.is_applied());
    }

    #[test]
    fn test_clone_denied_flags_include_newcgroup() {
        assert_ne!(
            DENIED_CLONE_NAMESPACE_FLAGS & libc::CLONE_NEWCGROUP as u64,
            0
        );
    }

    #[test]
    fn test_clone_denied_flags_include_newtime() {
        assert_ne!(
            DENIED_CLONE_NAMESPACE_FLAGS & libc::CLONE_NEWTIME as u64,
            0,
            "CLONE_NEWTIME must be in denied clone namespace flags"
        );
    }

    #[test]
    fn test_network_none_socket_domains_are_unix_only() {
        let domains = SeccompManager::allowed_socket_domains(false);
        assert_eq!(domains, vec![libc::AF_UNIX]);
    }

    #[test]
    fn test_network_enabled_socket_domains_exclude_netlink() {
        let domains = SeccompManager::allowed_socket_domains(true);
        assert!(domains.contains(&libc::AF_UNIX));
        assert!(domains.contains(&libc::AF_INET));
        assert!(domains.contains(&libc::AF_INET6));
        assert!(!domains.contains(&libc::AF_NETLINK));
    }

    #[test]
    fn test_network_mode_syscalls_only_enabled_when_network_allowed() {
        let none = SeccompManager::network_mode_syscalls(false);
        assert!(none.is_empty());

        let enabled = SeccompManager::network_mode_syscalls(true);
        assert!(enabled.contains(&libc::SYS_connect));
        assert!(enabled.contains(&libc::SYS_bind));
        assert!(enabled.contains(&libc::SYS_listen));
        assert!(enabled.contains(&libc::SYS_accept));
        assert!(enabled.contains(&libc::SYS_setsockopt));
    }

    #[test]
    fn test_landlock_bootstrap_syscalls_present_in_base_allowlist() {
        let base = SeccompManager::base_allowed_syscalls();
        assert!(base.contains(&libc::SYS_landlock_create_ruleset));
        assert!(base.contains(&libc::SYS_landlock_add_rule));
        assert!(base.contains(&libc::SYS_landlock_restrict_self));
    }

    #[test]
    fn test_x32_legacy_range_not_allowlisted() {
        let base = SeccompManager::base_allowed_syscalls();
        let net = SeccompManager::network_mode_syscalls(true);
        for nr in 512_i64..=547_i64 {
            assert!(
                !base.contains(&nr) && !net.contains(&nr),
                "x32 syscall number {} unexpectedly allowlisted",
                nr
            );
        }
    }

    #[test]
    fn test_i386_compat_socketcall_range_not_allowlisted() {
        let base = SeccompManager::base_allowed_syscalls();
        let net = SeccompManager::network_mode_syscalls(true);
        // i386 compat per syscall_32.tbl: socket..shutdown live at 359..373.
        // On x86_64 these numbers are outside our native allowlist surface.
        for nr in 359_i64..=373_i64 {
            assert!(
                !base.contains(&nr) && !net.contains(&nr),
                "i386 compat syscall number {} unexpectedly allowlisted",
                nr
            );
        }
    }

    #[test]
    fn test_minimal_filter_allowlist_counts_are_stable() {
        let base = SeccompManager::base_allowed_syscalls();
        let net = SeccompManager::network_mode_syscalls(true);

        // Snapshot counts to catch unintended policy drift.
        // +7 accounts for conditional rules inserted in minimal_filter():
        // socket/ioctl/prctl/mprotect/clone/clone3/execveat.
        // fork removed (forces through filtered clone path).
        // execveat removed from base (arg-filtered separately).
        // NOTE(review): the base allowlist contains x86_64-only entries, so
        // these numbers presumably describe x86_64 - confirm before relying on
        // this test for other architectures.
        assert_eq!(base.len(), 131);
        assert_eq!(net.len(), 11);
        assert_eq!(base.len() + 7, 138);
        assert_eq!(base.len() + net.len() + 7, 149);
    }

    #[test]
    fn test_arg_filtered_syscalls_list_includes_critical_syscalls() {
        // These syscalls must be in the arg-filtered list so custom profiles
        // get warnings when they allow them without filters.
        for name in &["clone", "clone3", "execveat", "ioctl", "prctl", "socket"] {
            assert!(
                SeccompManager::ARG_FILTERED_SYSCALLS.contains(name),
                "'{}' must be in ARG_FILTERED_SYSCALLS",
                name
            );
        }
    }

    #[test]
    fn test_clone3_allowed_in_minimal_filter() {
        // clone3 MUST be in the BPF rules map: glibc 2.34+ and newer musl
        // use clone3 internally for posix_spawn/fork. Blocking it breaks
        // std::process::Command on modern systems. Namespace creation is
        // prevented by dropped capabilities (CAP_SYS_ADMIN etc.), not seccomp.
        let rules = SeccompManager::minimal_filter(true).unwrap();
        assert!(
            rules.contains_key(&libc::SYS_clone3),
            "clone3 must be in the seccomp allowlist (glibc 2.34+ requires it)"
        );
    }

    #[test]
    fn test_clone_is_allowed_with_arg_filter() {
        // clone (not clone3) should still be in the rules with arg filtering
        let rules = SeccompManager::minimal_filter(true).unwrap();
        assert!(
            rules.contains_key(&libc::SYS_clone),
            "clone must be in the seccomp allowlist with arg filters"
        );
    }

    #[test]
    fn test_high_risk_syscalls_removed_from_base_allowlist() {
        let base = SeccompManager::base_allowed_syscalls();
        // chown, lchown and vfork are gated like the other legacy syscalls in
        // this file: libc does not define them for every architecture (for
        // example aarch64), so naming them unconditionally breaks the test
        // build there.
        let removed = [
            #[cfg(target_arch = "x86_64")]
            libc::SYS_chown,
            libc::SYS_fchown,
            #[cfg(target_arch = "x86_64")]
            libc::SYS_lchown,
            libc::SYS_fchownat,
            libc::SYS_sync,
            libc::SYS_syncfs,
            libc::SYS_mlock,
            libc::SYS_munlock,
            libc::SYS_mincore,
            #[cfg(target_arch = "x86_64")]
            libc::SYS_vfork,
            libc::SYS_tkill,
        ];

        for syscall in removed {
            assert!(
                !base.contains(&syscall),
                "syscall {} unexpectedly present in base allowlist",
                syscall
            );
        }
    }

    #[test]
    fn test_custom_profile_preserves_clone_arg_filters() {
        // SEC-01: Custom seccomp profiles that allow "clone" must still get
        // argument-level filtering to block namespace-creating flags.
        // Verify by inspecting the built-in filter rules that serve as the
        // merge source for apply_profile_from_file.
        let rules = SeccompManager::minimal_filter(true).unwrap();

        // Every ARG_FILTERED_SYSCALLS entry (except clone3, which is allowed
        // unconditionally since BPF can't inspect its struct-based flags) must
        // have non-empty argument-level rules in the built-in filter so that
        // apply_profile_from_file can merge them.
        for name in SeccompManager::ARG_FILTERED_SYSCALLS {
            if *name == "clone3" {
                // clone3 is allowed unconditionally: BPF cannot dereference
                // the clone_args struct, so arg filtering is impossible.
                // Namespace defense relies on dropped capabilities.
                continue;
            }
            if let Some(nr) = syscall_name_to_number(name) {
                let has_arg_rules = matches!(rules.get(&nr), Some(conds) if !conds.is_empty());
                assert!(
                    has_arg_rules,
                    "built-in filter must have argument-level rules for '{}' \
                     so apply_profile_from_file can merge them into custom profiles",
                    name
                );
            }
        }
    }

    #[test]
    fn test_memfd_create_not_in_default_allowlist() {
        // SEC-02: memfd_create enables fileless code execution when combined with execveat.
        let base = SeccompManager::base_allowed_syscalls();
        assert!(
            !base.contains(&libc::SYS_memfd_create),
            "memfd_create must not be in the default seccomp allowlist (fileless exec risk)"
        );
        // Also verify it's not sneaked into the compiled filter rules
        let rules = SeccompManager::minimal_filter(true).unwrap();
        assert!(
            !rules.contains_key(&libc::SYS_memfd_create),
            "memfd_create must not be in the compiled seccomp filter rules"
        );
    }

    #[test]
    fn test_mprotect_has_arg_filtering() {
        // SEC-03: mprotect must have argument-level filtering to prevent W^X
        // (PROT_WRITE|PROT_EXEC) violations. Verify via runtime data structures.

        // mprotect must NOT be in the unconditional base allowlist
        let base = SeccompManager::base_allowed_syscalls();
        assert!(
            !base.contains(&libc::SYS_mprotect),
            "SYS_mprotect must not be unconditionally allowed - needs arg filtering"
        );

        // mprotect must be present in the compiled filter with non-empty
        // argument conditions (the conditions enforce W^X)
        let rules = SeccompManager::minimal_filter(true).unwrap();
        let mprotect_rules = rules.get(&libc::SYS_mprotect);
        assert!(
            mprotect_rules.is_some(),
            "mprotect must be present in the seccomp filter rules"
        );
        assert!(
            !mprotect_rules.unwrap().is_empty(),
            "mprotect must have argument-level conditions to prevent W^X violations"
        );
    }

    #[test]
    fn test_unsafe_blocks_have_safety_comments() {
        // SEC-08: every unsafe block must be preceded by a `// SAFETY:` comment.
        let source = include_str!("seccomp.rs");
        // Assembled with concat! so this test's own source text never contains
        // the search pattern and therefore cannot match itself.
        let needle = concat!("unsafe", " {");

        for (abs_idx, _) in source.match_indices(needle) {
            // The justification must appear in the 200 bytes before the block.
            // Offsets are snapped to character boundaries because the file may
            // contain non-ASCII text, and slicing inside a character panics.
            let context = &source[char_floor(source, abs_idx.saturating_sub(200))..abs_idx];
            let excerpt_start = char_floor(source, abs_idx.saturating_sub(80));
            let excerpt_end = char_floor(source, (abs_idx + 10).min(source.len()));
            assert!(
                context.contains("SAFETY:"),
                "unsafe block at byte {} must have a // SAFETY: comment. Context: ...{}...",
                abs_idx,
                &source[excerpt_start..excerpt_end]
            );
        }
    }

    // --- H-1: mprotect MaskedEq logic verification ---
    //
    // The mprotect filter uses MaskedEq((PROT_WRITE | PROT_EXEC), value) to
    // allow only combinations where the W|X bits match one of {0, W, X}.
    // These tests prove the logic is correct without installing a real
    // seccomp filter (which would affect the test process).

    /// Helper: simulates the MaskedEq check that the seccomp BPF would perform.
    /// Returns true if the prot value would be ALLOWED by one of the rules.
    fn mprotect_would_allow(prot: u64) -> bool {
        let write = libc::PROT_WRITE as u64;
        let exec = libc::PROT_EXEC as u64;
        // Only the W and X bits take part in the comparison.
        let masked = prot & (write | exec);
        [0, write, exec].contains(&masked)
    }

    #[test]
    fn test_mprotect_allows_prot_none() {
        assert!(mprotect_would_allow(0), "PROT_NONE must be allowed");
    }

    #[test]
    fn test_mprotect_allows_prot_read_only() {
        assert!(
            mprotect_would_allow(libc::PROT_READ as u64),
            "PROT_READ must be allowed (W|X bits are 0)"
        );
    }

    #[test]
    fn test_mprotect_allows_prot_read_write() {
        assert!(
            mprotect_would_allow((libc::PROT_READ | libc::PROT_WRITE) as u64),
            "PROT_READ|PROT_WRITE must be allowed"
        );
    }

    #[test]
    fn test_mprotect_allows_prot_read_exec() {
        assert!(
            mprotect_would_allow((libc::PROT_READ | libc::PROT_EXEC) as u64),
            "PROT_READ|PROT_EXEC must be allowed"
        );
    }

    #[test]
    fn test_mprotect_rejects_prot_write_exec() {
        assert!(
            !mprotect_would_allow((libc::PROT_WRITE | libc::PROT_EXEC) as u64),
            "PROT_WRITE|PROT_EXEC (W^X violation) must be REJECTED"
        );
    }

    #[test]
    fn test_mprotect_rejects_prot_read_write_exec() {
        assert!(
            !mprotect_would_allow((libc::PROT_READ | libc::PROT_WRITE | libc::PROT_EXEC) as u64),
            "PROT_READ|PROT_WRITE|PROT_EXEC (W^X violation) must be REJECTED"
        );
    }

    #[test]
    fn test_mprotect_allows_prot_write_alone() {
        assert!(
            mprotect_would_allow(libc::PROT_WRITE as u64),
            "PROT_WRITE alone must be allowed"
        );
    }

    #[test]
    fn test_mprotect_allows_prot_exec_alone() {
        assert!(
            mprotect_would_allow(libc::PROT_EXEC as u64),
            "PROT_EXEC alone must be allowed"
        );
    }
}