Skip to main content

nucleus/security/
seccomp.rs

1use crate::error::{NucleusError, Result};
2use crate::security::policy::sha256_hex;
3use seccompiler::{BpfProgram, SeccompAction, SeccompCondition, SeccompFilter, SeccompRule};
4use std::collections::BTreeMap;
5use std::path::Path;
6use tracing::{debug, info, warn};
7
/// Manager for the process seccomp filter.
///
/// Implements syscall whitelisting for the security state machine
/// (NucleusSecurity_Seccomp_SeccompEnforcement.tla).
pub struct SeccompManager {
    /// `true` once a filter has been installed through this manager; the
    /// apply methods return early when it is set.
    applied: bool,
}
15
16const DENIED_CLONE_NAMESPACE_FLAGS: u64 = (libc::CLONE_NEWUSER
17    | libc::CLONE_NEWNS
18    | libc::CLONE_NEWNET
19    | libc::CLONE_NEWIPC
20    | libc::CLONE_NEWUTS
21    | libc::CLONE_NEWPID
22    | libc::CLONE_NEWCGROUP
23    | libc::CLONE_NEWTIME) as u64;
24
25impl SeccompManager {
26    pub fn new() -> Self {
27        Self { applied: false }
28    }
29
30    fn base_allowed_syscalls() -> Vec<i64> {
31        let mut syscalls = vec![
32            // File I/O
33            libc::SYS_read,
34            libc::SYS_write,
35            libc::SYS_openat,
36            libc::SYS_close,
37            libc::SYS_fstat,
38            libc::SYS_lseek,
39            libc::SYS_fcntl,
40            libc::SYS_readv,
41            libc::SYS_writev,
42            libc::SYS_pread64,
43            libc::SYS_pwrite64,
44            libc::SYS_readlinkat,
45            libc::SYS_newfstatat,
46            libc::SYS_statx,
47            libc::SYS_faccessat,
48            libc::SYS_faccessat2,
49            libc::SYS_dup,
50            libc::SYS_dup3,
51            libc::SYS_pipe2,
52            libc::SYS_unlinkat,
53            libc::SYS_renameat,
54            libc::SYS_renameat2,
55            libc::SYS_linkat,
56            libc::SYS_symlinkat,
57            libc::SYS_fchmod,
58            libc::SYS_fchmodat,
59            libc::SYS_truncate,
60            libc::SYS_ftruncate,
61            libc::SYS_fallocate,
62            #[cfg(target_arch = "x86_64")]
63            libc::SYS_fadvise64,
64            libc::SYS_fsync,
65            libc::SYS_fdatasync,
66            libc::SYS_flock,
67            #[cfg(target_arch = "x86_64")]
68            libc::SYS_sendfile,
69            libc::SYS_copy_file_range,
70            libc::SYS_splice,
71            libc::SYS_tee,
72            // Memory management
73            libc::SYS_mmap,
74            libc::SYS_munmap,
75            libc::SYS_brk,
76            libc::SYS_mremap,
77            libc::SYS_madvise,
78            libc::SYS_msync,
79            // Process management
80            // fork intentionally excluded — modern glibc/musl use clone(), which
81            // has namespace-flag filtering. Removing SYS_fork forces all forks
82            // through the filtered clone path (defense-in-depth against fork bombs
83            // and unfiltered namespace creation).
84            libc::SYS_execve,
85            // execveat is conditionally allowed below (AT_EMPTY_PATH blocked)
86            libc::SYS_wait4,
87            libc::SYS_waitid,
88            libc::SYS_exit,
89            libc::SYS_exit_group,
90            libc::SYS_getpid,
91            libc::SYS_gettid,
92            libc::SYS_getuid,
93            libc::SYS_getgid,
94            libc::SYS_geteuid,
95            libc::SYS_getegid,
96            libc::SYS_getppid,
97            libc::SYS_setsid,
98            libc::SYS_getgroups,
99            // Signals
100            libc::SYS_rt_sigaction,
101            libc::SYS_rt_sigprocmask,
102            libc::SYS_rt_sigreturn,
103            libc::SYS_rt_sigsuspend,
104            libc::SYS_sigaltstack,
105            libc::SYS_kill,
106            libc::SYS_tgkill,
107            // Time
108            libc::SYS_clock_gettime,
109            libc::SYS_clock_getres,
110            libc::SYS_clock_nanosleep,
111            libc::SYS_gettimeofday,
112            libc::SYS_nanosleep,
113            // Directories
114            libc::SYS_getcwd,
115            libc::SYS_chdir,
116            libc::SYS_fchdir,
117            libc::SYS_mkdirat,
118            libc::SYS_getdents64,
119            // Misc
120            libc::SYS_uname,
121            libc::SYS_getrandom,
122            libc::SYS_futex,
123            libc::SYS_set_tid_address,
124            libc::SYS_set_robust_list,
125            libc::SYS_get_robust_list,
126            libc::SYS_sysinfo,
127            libc::SYS_umask,
128            libc::SYS_prlimit64,
129            libc::SYS_getrusage,
130            libc::SYS_times,
131            libc::SYS_sched_yield,
132            libc::SYS_sched_getaffinity,
133            libc::SYS_getcpu,
134            libc::SYS_rseq,
135            libc::SYS_close_range,
136            // NOTE: memfd_create intentionally excluded — combined with execveat
137            // it enables fileless code execution bypassing all FS controls (SEC-02).
138            // Landlock bootstrap (runtime applies seccomp before Landlock)
139            libc::SYS_landlock_create_ruleset,
140            libc::SYS_landlock_add_rule,
141            libc::SYS_landlock_restrict_self,
142            // Socket/Network (safe introspection + local socketpair)
143            libc::SYS_getsockname,
144            libc::SYS_getpeername,
145            libc::SYS_socketpair,
146            libc::SYS_getsockopt,
147            // Poll/Select
148            libc::SYS_ppoll,
149            libc::SYS_pselect6,
150            libc::SYS_epoll_create1,
151            libc::SYS_epoll_ctl,
152            libc::SYS_epoll_pwait,
153            libc::SYS_eventfd2,
154            libc::SYS_signalfd4,
155            libc::SYS_timerfd_create,
156            libc::SYS_timerfd_settime,
157            libc::SYS_timerfd_gettime,
158        ];
159
160        // Legacy syscalls only available on x86_64 (aarch64 only has the *at variants)
161        #[cfg(target_arch = "x86_64")]
162        syscalls.extend_from_slice(&[
163            libc::SYS_open,
164            libc::SYS_stat,
165            libc::SYS_lstat,
166            libc::SYS_access,
167            libc::SYS_readlink,
168            libc::SYS_dup2,
169            libc::SYS_pipe,
170            libc::SYS_unlink,
171            libc::SYS_rename,
172            libc::SYS_link,
173            libc::SYS_symlink,
174            libc::SYS_chmod,
175            libc::SYS_mkdir,
176            libc::SYS_rmdir,
177            libc::SYS_getdents,
178            libc::SYS_getpgrp,
179            libc::SYS_arch_prctl,
180            libc::SYS_getrlimit,
181            libc::SYS_poll,
182            libc::SYS_select,
183            libc::SYS_epoll_create,
184            libc::SYS_epoll_wait,
185            libc::SYS_eventfd,
186            libc::SYS_signalfd,
187        ]);
188
189        syscalls
190    }
191
192    fn allowed_socket_domains(allow_network: bool) -> Vec<i32> {
193        if allow_network {
194            vec![libc::AF_UNIX, libc::AF_INET, libc::AF_INET6]
195        } else {
196            vec![libc::AF_UNIX]
197        }
198    }
199
200    fn network_mode_syscalls(allow_network: bool) -> Vec<i64> {
201        if allow_network {
202            vec![
203                libc::SYS_connect,
204                libc::SYS_sendto,
205                libc::SYS_recvfrom,
206                libc::SYS_sendmsg,
207                libc::SYS_recvmsg,
208                libc::SYS_shutdown,
209                libc::SYS_bind,
210                libc::SYS_listen,
211                libc::SYS_accept,
212                libc::SYS_accept4,
213                libc::SYS_setsockopt,
214            ]
215        } else {
216            Vec::new()
217        }
218    }
219
220    /// Get minimal syscall whitelist for basic container operation
221    ///
222    /// This is a restrictive whitelist that blocks dangerous syscalls:
223    /// - ptrace (process tracing)
224    /// - kexec_load (kernel loading)
225    /// - add_key, request_key, keyctl (kernel keyring)
226    /// - bpf (eBPF programs)
227    /// - perf_event_open (performance monitoring)
228    /// - userfaultfd (user fault handling)
229    fn minimal_filter(allow_network: bool) -> Result<BTreeMap<i64, Vec<SeccompRule>>> {
230        let mut rules: BTreeMap<i64, Vec<SeccompRule>> = BTreeMap::new();
231
232        // Essential syscalls for basic operation
233        let allowed_syscalls = Self::base_allowed_syscalls();
234
235        // Allow all these syscalls unconditionally
236        for syscall in allowed_syscalls {
237            rules.insert(syscall, Vec::new());
238        }
239
240        // Add network-mode-specific syscalls
241        for syscall in Self::network_mode_syscalls(allow_network) {
242            rules.insert(syscall, Vec::new());
243        }
244
245        // Restrict socket() domains by network mode.
246        // none: AF_UNIX only; network-enabled: AF_UNIX/AF_INET/AF_INET6.
247        let mut socket_rules = Vec::new();
248        for domain in Self::allowed_socket_domains(allow_network) {
249            let condition = SeccompCondition::new(
250                0, // arg0 is socket(domain, type, protocol)
251                seccompiler::SeccompCmpArgLen::Dword,
252                seccompiler::SeccompCmpOp::Eq,
253                domain as u64,
254            )
255            .map_err(|e| {
256                NucleusError::SeccompError(format!(
257                    "Failed to create socket domain condition: {}",
258                    e
259                ))
260            })?;
261            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
262                NucleusError::SeccompError(format!("Failed to create socket rule: {}", e))
263            })?;
264            socket_rules.push(rule);
265        }
266        rules.insert(libc::SYS_socket, socket_rules);
267
268        // ioctl: allow only safe terminal operations (arg0 = request code)
269        let ioctl_allowed: &[u64] = &[
270            0x5401, // TCGETS
271            0x5402, // TCSETS
272            0x5403, // TCSETSW
273            0x5404, // TCSETSF
274            0x540B, // TCFLSH
275            0x540F, // TIOCGPGRP
276            0x5410, // TIOCSPGRP
277            0x5413, // TIOCGWINSZ
278            0x5429, // TIOCGSID
279            0x541B, // FIONREAD
280            // FIONBIO (0x5421) intentionally excluded — sets non-blocking mode
281            // on network sockets, enabling sophisticated network exploitation.
282            0x5451, // FIOCLEX
283            0x5450, // FIONCLEX
284        ];
285        let mut ioctl_rules = Vec::new();
286        for &request in ioctl_allowed {
287            let condition = SeccompCondition::new(
288                1, // arg1 is the request code for ioctl(fd, request, ...)
289                seccompiler::SeccompCmpArgLen::Dword,
290                seccompiler::SeccompCmpOp::Eq,
291                request,
292            )
293            .map_err(|e| {
294                NucleusError::SeccompError(format!("Failed to create ioctl condition: {}", e))
295            })?;
296            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
297                NucleusError::SeccompError(format!("Failed to create ioctl rule: {}", e))
298            })?;
299            ioctl_rules.push(rule);
300        }
301        rules.insert(libc::SYS_ioctl, ioctl_rules);
302
303        // prctl: allow only safe operations (arg0 = option).
304        // Notably absent (hit default deny):
305        //   PR_CAPBSET_READ (23) — leaks capability bounding set info
306        //   PR_CAPBSET_DROP (24) — could weaken the capability bounding set
307        //   PR_SET_SECUREBITS (28) — could disable secure-exec restrictions
308        let prctl_allowed: &[u64] = &[
309            1,  // PR_SET_PDEATHSIG
310            2,  // PR_GET_PDEATHSIG
311            15, // PR_SET_NAME
312            16, // PR_GET_NAME
313            38, // PR_SET_NO_NEW_PRIVS
314            39, // PR_GET_NO_NEW_PRIVS
315        ];
316        let mut prctl_rules = Vec::new();
317        for &option in prctl_allowed {
318            let condition = SeccompCondition::new(
319                0, // arg0 is the option for prctl(option, ...)
320                seccompiler::SeccompCmpArgLen::Dword,
321                seccompiler::SeccompCmpOp::Eq,
322                option,
323            )
324            .map_err(|e| {
325                NucleusError::SeccompError(format!("Failed to create prctl condition: {}", e))
326            })?;
327            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
328                NucleusError::SeccompError(format!("Failed to create prctl rule: {}", e))
329            })?;
330            prctl_rules.push(rule);
331        }
332        rules.insert(libc::SYS_prctl, prctl_rules);
333
334        // mprotect: permit RW or RX transitions, but reject PROT_WRITE|PROT_EXEC.
335        let mut mprotect_rules = Vec::new();
336        for allowed in [0, libc::PROT_WRITE as u64, libc::PROT_EXEC as u64] {
337            let condition = SeccompCondition::new(
338                2, // arg2 is prot for mprotect(addr, len, prot)
339                seccompiler::SeccompCmpArgLen::Dword,
340                seccompiler::SeccompCmpOp::MaskedEq((libc::PROT_WRITE | libc::PROT_EXEC) as u64),
341                allowed,
342            )
343            .map_err(|e| {
344                NucleusError::SeccompError(format!("Failed to create mprotect condition: {}", e))
345            })?;
346            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
347                NucleusError::SeccompError(format!("Failed to create mprotect rule: {}", e))
348            })?;
349            mprotect_rules.push(rule);
350        }
351        rules.insert(libc::SYS_mprotect, mprotect_rules);
352
353        // clone3: ALLOWED unconditionally. clone3 passes flags inside a struct
354        // pointer that seccomp BPF cannot dereference, so namespace-flag filtering
355        // is impossible at the BPF level. However, glibc 2.34+ and newer musl use
356        // clone3 internally for posix_spawn/fork — blocking it breaks
357        // std::process::Command and any child-process spawning on modern systems.
358        //
359        // SECURITY INVARIANT: Namespace creation via clone3 is prevented solely by
360        // dropping CAP_SYS_ADMIN (and other namespace caps) *before* this seccomp
361        // filter is installed. If capability dropping is bypassed, clone3 becomes
362        // an unfiltered path to namespace creation. This is a known single point
363        // of failure — see CapabilityManager::drop_all() which must run first.
364        rules.insert(libc::SYS_clone3, Vec::new());
365
366        // clone: allow but deny namespace-creating flags to prevent nested namespace creation
367        let clone_condition = SeccompCondition::new(
368            0, // arg0 = flags
369            seccompiler::SeccompCmpArgLen::Qword,
370            seccompiler::SeccompCmpOp::MaskedEq(DENIED_CLONE_NAMESPACE_FLAGS),
371            0, // (flags & ns_flags) == 0: none of the namespace flags set
372        )
373        .map_err(|e| {
374            NucleusError::SeccompError(format!("Failed to create clone condition: {}", e))
375        })?;
376        let clone_rule = SeccompRule::new(vec![clone_condition]).map_err(|e| {
377            NucleusError::SeccompError(format!("Failed to create clone rule: {}", e))
378        })?;
379        rules.insert(libc::SYS_clone, vec![clone_rule]);
380
381        // execveat: allow but block AT_EMPTY_PATH (0x1000) to prevent fileless
382        // execution. With AT_EMPTY_PATH, execveat can execute code from any open
383        // fd (e.g., open + unlink, or even a socket fd), bypassing filesystem
384        // controls — not just memfd_create. Blocking memfd_create alone is
385        // insufficient. Normal execveat with dirfd+pathname (no AT_EMPTY_PATH)
386        // remains allowed.
387        let execveat_condition = SeccompCondition::new(
388            4, // arg4 = flags for execveat(dirfd, pathname, argv, envp, flags)
389            seccompiler::SeccompCmpArgLen::Dword,
390            seccompiler::SeccompCmpOp::MaskedEq(libc::AT_EMPTY_PATH as u64),
391            0, // (flags & AT_EMPTY_PATH) == 0: AT_EMPTY_PATH not set
392        )
393        .map_err(|e| {
394            NucleusError::SeccompError(format!("Failed to create execveat condition: {}", e))
395        })?;
396        let execveat_rule = SeccompRule::new(vec![execveat_condition]).map_err(|e| {
397            NucleusError::SeccompError(format!("Failed to create execveat rule: {}", e))
398        })?;
399        rules.insert(libc::SYS_execveat, vec![execveat_rule]);
400
401        Ok(rules)
402    }
403
404    /// Compile the minimal BPF filter without applying it
405    ///
406    /// This is useful for benchmarking filter compilation overhead
407    /// without the irreversible side effect of applying the filter.
408    pub fn compile_minimal_filter() -> Result<BpfProgram> {
409        let rules = Self::minimal_filter(true)?;
410        let filter = SeccompFilter::new(
411            rules,
412            SeccompAction::Errno(libc::EPERM as u32),
413            SeccompAction::Allow,
414            std::env::consts::ARCH.try_into().map_err(|e| {
415                NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
416            })?,
417        )
418        .map_err(|e| {
419            NucleusError::SeccompError(format!("Failed to create seccomp filter: {}", e))
420        })?;
421
422        let bpf_prog: BpfProgram = filter.try_into().map_err(|e| {
423            NucleusError::SeccompError(format!("Failed to compile BPF program: {}", e))
424        })?;
425
426        Ok(bpf_prog)
427    }
428
429    /// Apply seccomp filter
430    ///
431    /// This implements the transition: no_filter -> whitelist_active
432    /// in the seccomp state machine (NucleusSecurity_Seccomp_SeccompEnforcement.tla)
433    ///
434    /// Once applied, the filter cannot be removed (irreversible property)
435    /// In rootless mode or if seccomp setup fails, this will warn and continue
436    pub fn apply_minimal_filter(&mut self) -> Result<bool> {
437        self.apply_minimal_filter_with_mode(false, false)
438    }
439
440    /// Apply seccomp filter with configurable failure behavior
441    ///
442    /// When `best_effort` is true, failures are logged and execution continues.
443    /// When false, seccomp setup is fail-closed.
444    pub fn apply_minimal_filter_with_mode(
445        &mut self,
446        best_effort: bool,
447        log_denied: bool,
448    ) -> Result<bool> {
449        self.apply_filter_for_network_mode(true, best_effort, log_denied)
450    }
451
452    /// Apply seccomp filter with network-mode-aware socket restrictions
453    ///
454    /// When `allow_network` is false, `SYS_socket` is restricted to AF_UNIX only,
455    /// preventing creation of network sockets (AF_INET, AF_INET6, etc.).
456    /// When `allow_network` is true, all socket domains are permitted.
457    ///
458    /// When `best_effort` is true, failures are logged and execution continues.
459    /// When false, seccomp setup is fail-closed.
460    pub fn apply_filter_for_network_mode(
461        &mut self,
462        allow_network: bool,
463        best_effort: bool,
464        log_denied: bool,
465    ) -> Result<bool> {
466        if self.applied {
467            debug!("Seccomp filter already applied, skipping");
468            return Ok(true);
469        }
470
471        info!(allow_network, "Applying seccomp filter");
472
473        let rules = match Self::minimal_filter(allow_network) {
474            Ok(r) => r,
475            Err(e) => {
476                if best_effort {
477                    warn!(
478                        "Failed to create seccomp rules: {} (continuing without seccomp)",
479                        e
480                    );
481                    return Ok(false);
482                }
483                return Err(e);
484            }
485        };
486
487        let filter = match SeccompFilter::new(
488            rules,
489            SeccompAction::Errno(libc::EPERM as u32), // Default: deny with EPERM
490            SeccompAction::Allow,                     // Match action: allow
491            std::env::consts::ARCH.try_into().map_err(|e| {
492                NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
493            })?,
494        ) {
495            Ok(f) => f,
496            Err(e) => {
497                if best_effort {
498                    warn!(
499                        "Failed to create seccomp filter: {} (continuing without seccomp)",
500                        e
501                    );
502                    return Ok(false);
503                }
504                return Err(NucleusError::SeccompError(format!(
505                    "Failed to create seccomp filter: {}",
506                    e
507                )));
508            }
509        };
510
511        let bpf_prog: BpfProgram = match filter.try_into() {
512            Ok(p) => p,
513            Err(e) => {
514                if best_effort {
515                    warn!(
516                        "Failed to compile BPF program: {} (continuing without seccomp)",
517                        e
518                    );
519                    return Ok(false);
520                }
521                return Err(NucleusError::SeccompError(format!(
522                    "Failed to compile BPF program: {}",
523                    e
524                )));
525            }
526        };
527
528        // Apply the filter
529        match Self::apply_bpf_program(&bpf_prog, log_denied) {
530            Ok(_) => {
531                self.applied = true;
532                info!("Successfully applied seccomp filter");
533                Ok(true)
534            }
535            Err(e) => {
536                if best_effort {
537                    warn!(
538                        "Failed to apply seccomp filter: {} (continuing without seccomp)",
539                        e
540                    );
541                    Ok(false)
542                } else {
543                    Err(NucleusError::SeccompError(format!(
544                        "Failed to apply seccomp filter: {}",
545                        e
546                    )))
547                }
548            }
549        }
550    }
551
552    /// Apply a seccomp profile loaded from a JSON file.
553    ///
554    /// The profile format is a JSON object with:
555    /// ```json
556    /// {
557    ///   "defaultAction": "SCMP_ACT_ERRNO",
558    ///   "syscalls": [
559    ///     { "names": ["read", "write", "open", ...], "action": "SCMP_ACT_ALLOW" }
560    ///   ]
561    /// }
562    /// ```
563    ///
564    /// This is a subset of the OCI seccomp profile format. Only the syscall name
565    /// allowlist is used; argument-level filtering from the built-in profile is
566    /// not applied when using a custom profile.
567    ///
568    /// If `expected_sha256` is provided, the file's SHA-256 hash is verified
569    /// against it before loading. This prevents silent profile tampering.
570    pub fn apply_profile_from_file(
571        &mut self,
572        profile_path: &Path,
573        expected_sha256: Option<&str>,
574        audit_mode: bool,
575    ) -> Result<bool> {
576        if self.applied {
577            debug!("Seccomp filter already applied, skipping");
578            return Ok(true);
579        }
580
581        info!("Loading seccomp profile from {:?}", profile_path);
582
583        // Read profile file
584        let content = std::fs::read(profile_path).map_err(|e| {
585            NucleusError::SeccompError(format!(
586                "Failed to read seccomp profile {:?}: {}",
587                profile_path, e
588            ))
589        })?;
590
591        // Verify SHA-256 hash if expected
592        if let Some(expected) = expected_sha256 {
593            let actual = sha256_hex(&content);
594            if actual != expected {
595                return Err(NucleusError::SeccompError(format!(
596                    "Seccomp profile hash mismatch: expected {}, got {}",
597                    expected, actual
598                )));
599            }
600            info!("Seccomp profile hash verified: {}", actual);
601        }
602
603        // Parse profile
604        let profile: SeccompProfile = serde_json::from_slice(&content).map_err(|e| {
605            NucleusError::SeccompError(format!("Failed to parse seccomp profile: {}", e))
606        })?;
607
608        // Warn when custom profile allows security-critical syscalls without
609        // argument-level filtering. The built-in filter restricts clone, ioctl,
610        // prctl, and socket at the argument level; a custom profile that allows
611        // them by name only silently removes all of that hardening.
612        Self::warn_missing_arg_filters(&profile);
613
614        // Build filter from profile
615        let mut rules: BTreeMap<i64, Vec<SeccompRule>> = BTreeMap::new();
616
617        for syscall_group in &profile.syscalls {
618            if syscall_group.action == "SCMP_ACT_ALLOW" {
619                for name in &syscall_group.names {
620                    if let Some(nr) = syscall_name_to_number(name) {
621                        rules.insert(nr, Vec::new());
622                    } else {
623                        warn!("Unknown syscall in profile: {} (skipping)", name);
624                    }
625                }
626            }
627        }
628
629        // SEC-01: Merge built-in argument filters for security-critical syscalls.
630        // Custom profiles that allow clone/ioctl/prctl/socket/mprotect by name
631        // without argument-level filters would silently remove all hardening.
632        // Overwrite their empty rules with the built-in argument-filtered rules.
633        let builtin_rules = Self::minimal_filter(true)?;
634        for syscall_name in Self::ARG_FILTERED_SYSCALLS {
635            if let Some(nr) = syscall_name_to_number(syscall_name) {
636                if let std::collections::btree_map::Entry::Occupied(mut entry) = rules.entry(nr) {
637                    if let Some(builtin) = builtin_rules.get(&nr) {
638                        if !builtin.is_empty() {
639                            info!(
640                                "Merging built-in argument filters for '{}' into custom profile",
641                                syscall_name
642                            );
643                            entry.insert(builtin.clone());
644                        }
645                    }
646                }
647            }
648        }
649        // Also enforce clone3 denial — it cannot be argument-filtered
650        rules.remove(&libc::SYS_clone3);
651
652        let filter = SeccompFilter::new(
653            rules,
654            SeccompAction::Errno(libc::EPERM as u32),
655            SeccompAction::Allow,
656            std::env::consts::ARCH.try_into().map_err(|e| {
657                NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
658            })?,
659        )
660        .map_err(|e| {
661            NucleusError::SeccompError(format!(
662                "Failed to create seccomp filter from profile: {}",
663                e
664            ))
665        })?;
666
667        let bpf_prog: BpfProgram = filter.try_into().map_err(|e| {
668            NucleusError::SeccompError(format!("Failed to compile BPF program from profile: {}", e))
669        })?;
670
671        match Self::apply_bpf_program(&bpf_prog, audit_mode) {
672            Ok(_) => {
673                self.applied = true;
674                info!(
675                    "Seccomp profile applied from {:?} (log_denied={})",
676                    profile_path, audit_mode
677                );
678                Ok(true)
679            }
680            Err(e) => Err(e),
681        }
682    }
683
684    /// Install an allow-all seccomp filter with SECCOMP_FILTER_FLAG_LOG.
685    ///
686    /// Used in trace mode: all syscalls are allowed but logged to the kernel
687    /// audit subsystem. A separate reader collects the logged syscalls.
688    pub fn apply_trace_filter(&mut self) -> Result<bool> {
689        if self.applied {
690            debug!("Seccomp filter already applied, skipping trace filter");
691            return Ok(true);
692        }
693
694        info!("Applying seccomp trace filter (allow-all + LOG)");
695
696        // Create an empty rule set — with SeccompAction::Allow as default,
697        // every syscall is permitted. The LOG flag causes the kernel to
698        // audit each syscall decision.
699        let filter = SeccompFilter::new(
700            BTreeMap::new(),
701            SeccompAction::Allow, // default: allow everything
702            SeccompAction::Allow, // match action (unused — no rules)
703            std::env::consts::ARCH.try_into().map_err(|e| {
704                NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
705            })?,
706        )
707        .map_err(|e| NucleusError::SeccompError(format!("Failed to create trace filter: {}", e)))?;
708
709        let bpf_prog: BpfProgram = filter.try_into().map_err(|e| {
710            NucleusError::SeccompError(format!("Failed to compile trace BPF: {}", e))
711        })?;
712
713        // Apply with LOG flag so kernel audits every syscall
714        Self::apply_bpf_program(&bpf_prog, true)?;
715        self.applied = true;
716        info!("Seccomp trace filter applied (all syscalls allowed + logged)");
717        Ok(true)
718    }
719
720    /// Syscalls that the built-in filter restricts at the argument level.
721    /// Custom profiles allowing these without argument filters weaken security.
722    const ARG_FILTERED_SYSCALLS: &'static [&'static str] =
723        &["clone", "clone3", "execveat", "ioctl", "mprotect", "prctl", "socket"];
724
725    /// Warn when a custom seccomp profile allows security-critical syscalls
726    /// without argument-level filtering.
727    fn warn_missing_arg_filters(profile: &SeccompProfile) {
728        for group in &profile.syscalls {
729            if group.action != "SCMP_ACT_ALLOW" {
730                continue;
731            }
732            for name in &group.names {
733                if Self::ARG_FILTERED_SYSCALLS.contains(&name.as_str()) && group.args.is_empty() {
734                    warn!(
735                        "Custom seccomp profile allows '{}' without argument filters. \
736                         The built-in filter restricts this syscall at the argument level. \
737                         This profile weakens security compared to the default.",
738                        name
739                    );
740                }
741            }
742        }
743    }
744
745    /// Check if seccomp filter has been applied
746    pub fn is_applied(&self) -> bool {
747        self.applied
748    }
749
750    fn apply_bpf_program(bpf_prog: &BpfProgram, log_denied: bool) -> Result<()> {
751        let mut flags: libc::c_ulong = 0;
752        if log_denied {
753            flags |= libc::SECCOMP_FILTER_FLAG_LOG as libc::c_ulong;
754        }
755
756        match Self::apply_bpf_program_with_flags(bpf_prog, flags) {
757            Ok(()) => Ok(()),
758            Err(err)
759                if log_denied
760                    && err.raw_os_error() == Some(libc::EINVAL)
761                    && libc::SECCOMP_FILTER_FLAG_LOG != 0 =>
762            {
763                warn!(
764                    "Kernel rejected SECCOMP_FILTER_FLAG_LOG; continuing with seccomp \
765                     enforcement without deny logging"
766                );
767                Self::apply_bpf_program_with_flags(bpf_prog, 0)?;
768                Ok(())
769            }
770            Err(err) => Err(NucleusError::SeccompError(format!(
771                "Failed to apply seccomp filter: {}",
772                err
773            ))),
774        }
775    }
776
777    fn apply_bpf_program_with_flags(
778        bpf_prog: &BpfProgram,
779        flags: libc::c_ulong,
780    ) -> std::io::Result<()> {
781        // SAFETY: `prctl(PR_SET_NO_NEW_PRIVS, ...)` has no pointer arguments here
782        // and only affects the current thread/process as required before seccomp.
783        let rc = unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
784        if rc != 0 {
785            return Err(std::io::Error::last_os_error());
786        }
787
788        let prog = libc::sock_fprog {
789            len: bpf_prog.len() as u16,
790            filter: bpf_prog.as_ptr() as *mut libc::sock_filter,
791        };
792
793        // SAFETY: `prog` points to a live BPF program buffer for the duration of
794        // the syscall and the kernel copies the pointed-to filter immediately.
795        let rc = unsafe {
796            libc::syscall(
797                libc::SYS_seccomp,
798                libc::SECCOMP_SET_MODE_FILTER,
799                flags,
800                &prog as *const libc::sock_fprog,
801            )
802        };
803
804        if rc < 0 {
805            return Err(std::io::Error::last_os_error());
806        }
807
808        Ok(())
809    }
810}
811
812// SeccompProfile and SeccompSyscallGroup are defined in seccomp_generate.rs
813use crate::security::seccomp_generate::SeccompProfile;
814
815/// Map a syscall name (e.g. "read", "write") to its Linux syscall number.
816///
817/// Covers the most common syscalls. Unknown names return None.
818fn syscall_name_to_number(name: &str) -> Option<i64> {
819    match name {
820        // File I/O
821        "read" => Some(libc::SYS_read),
822        "write" => Some(libc::SYS_write),
823        #[cfg(target_arch = "x86_64")]
824        "open" => Some(libc::SYS_open),
825        "openat" => Some(libc::SYS_openat),
826        "close" => Some(libc::SYS_close),
827        #[cfg(target_arch = "x86_64")]
828        "stat" => Some(libc::SYS_stat),
829        "fstat" => Some(libc::SYS_fstat),
830        #[cfg(target_arch = "x86_64")]
831        "lstat" => Some(libc::SYS_lstat),
832        "lseek" => Some(libc::SYS_lseek),
833        #[cfg(target_arch = "x86_64")]
834        "access" => Some(libc::SYS_access),
835        "fcntl" => Some(libc::SYS_fcntl),
836        "readv" => Some(libc::SYS_readv),
837        "writev" => Some(libc::SYS_writev),
838        "pread64" => Some(libc::SYS_pread64),
839        "pwrite64" => Some(libc::SYS_pwrite64),
840        #[cfg(target_arch = "x86_64")]
841        "readlink" => Some(libc::SYS_readlink),
842        "readlinkat" => Some(libc::SYS_readlinkat),
843        "newfstatat" => Some(libc::SYS_newfstatat),
844        "statx" => Some(libc::SYS_statx),
845        "faccessat" => Some(libc::SYS_faccessat),
846        "faccessat2" => Some(libc::SYS_faccessat2),
847        "dup" => Some(libc::SYS_dup),
848        #[cfg(target_arch = "x86_64")]
849        "dup2" => Some(libc::SYS_dup2),
850        "dup3" => Some(libc::SYS_dup3),
851        #[cfg(target_arch = "x86_64")]
852        "pipe" => Some(libc::SYS_pipe),
853        "pipe2" => Some(libc::SYS_pipe2),
854        #[cfg(target_arch = "x86_64")]
855        "unlink" => Some(libc::SYS_unlink),
856        "unlinkat" => Some(libc::SYS_unlinkat),
857        #[cfg(target_arch = "x86_64")]
858        "rename" => Some(libc::SYS_rename),
859        "renameat" => Some(libc::SYS_renameat),
860        "renameat2" => Some(libc::SYS_renameat2),
861        #[cfg(target_arch = "x86_64")]
862        "link" => Some(libc::SYS_link),
863        "linkat" => Some(libc::SYS_linkat),
864        #[cfg(target_arch = "x86_64")]
865        "symlink" => Some(libc::SYS_symlink),
866        "symlinkat" => Some(libc::SYS_symlinkat),
867        #[cfg(target_arch = "x86_64")]
868        "chmod" => Some(libc::SYS_chmod),
869        "fchmod" => Some(libc::SYS_fchmod),
870        "fchmodat" => Some(libc::SYS_fchmodat),
871        "truncate" => Some(libc::SYS_truncate),
872        "ftruncate" => Some(libc::SYS_ftruncate),
873        "fallocate" => Some(libc::SYS_fallocate),
874        #[cfg(target_arch = "x86_64")]
875        "fadvise64" => Some(libc::SYS_fadvise64),
876        "fsync" => Some(libc::SYS_fsync),
877        "fdatasync" => Some(libc::SYS_fdatasync),
878        "flock" => Some(libc::SYS_flock),
879        #[cfg(target_arch = "x86_64")]
880        "sendfile" => Some(libc::SYS_sendfile),
881        "copy_file_range" => Some(libc::SYS_copy_file_range),
882        "splice" => Some(libc::SYS_splice),
883        "tee" => Some(libc::SYS_tee),
884        // Memory
885        "mmap" => Some(libc::SYS_mmap),
886        "munmap" => Some(libc::SYS_munmap),
887        "mprotect" => Some(libc::SYS_mprotect),
888        "brk" => Some(libc::SYS_brk),
889        "mremap" => Some(libc::SYS_mremap),
890        "madvise" => Some(libc::SYS_madvise),
891        "msync" => Some(libc::SYS_msync),
892        "mlock" => Some(libc::SYS_mlock),
893        "munlock" => Some(libc::SYS_munlock),
894        // Process
895        #[cfg(target_arch = "x86_64")]
896        "fork" => Some(libc::SYS_fork),
897        "clone" => Some(libc::SYS_clone),
898        "clone3" => Some(libc::SYS_clone3),
899        "execve" => Some(libc::SYS_execve),
900        "execveat" => Some(libc::SYS_execveat),
901        "wait4" => Some(libc::SYS_wait4),
902        "waitid" => Some(libc::SYS_waitid),
903        "exit" => Some(libc::SYS_exit),
904        "exit_group" => Some(libc::SYS_exit_group),
905        "getpid" => Some(libc::SYS_getpid),
906        "gettid" => Some(libc::SYS_gettid),
907        "getuid" => Some(libc::SYS_getuid),
908        "getgid" => Some(libc::SYS_getgid),
909        "geteuid" => Some(libc::SYS_geteuid),
910        "getegid" => Some(libc::SYS_getegid),
911        "getppid" => Some(libc::SYS_getppid),
912        #[cfg(target_arch = "x86_64")]
913        "getpgrp" => Some(libc::SYS_getpgrp),
914        "setsid" => Some(libc::SYS_setsid),
915        "getgroups" => Some(libc::SYS_getgroups),
916        // Signals
917        "rt_sigaction" => Some(libc::SYS_rt_sigaction),
918        "rt_sigprocmask" => Some(libc::SYS_rt_sigprocmask),
919        "rt_sigreturn" => Some(libc::SYS_rt_sigreturn),
920        "rt_sigsuspend" => Some(libc::SYS_rt_sigsuspend),
921        "sigaltstack" => Some(libc::SYS_sigaltstack),
922        "kill" => Some(libc::SYS_kill),
923        "tgkill" => Some(libc::SYS_tgkill),
924        // Time
925        "clock_gettime" => Some(libc::SYS_clock_gettime),
926        "clock_getres" => Some(libc::SYS_clock_getres),
927        "clock_nanosleep" => Some(libc::SYS_clock_nanosleep),
928        "gettimeofday" => Some(libc::SYS_gettimeofday),
929        "nanosleep" => Some(libc::SYS_nanosleep),
930        // Directories
931        "getcwd" => Some(libc::SYS_getcwd),
932        "chdir" => Some(libc::SYS_chdir),
933        "fchdir" => Some(libc::SYS_fchdir),
934        #[cfg(target_arch = "x86_64")]
935        "mkdir" => Some(libc::SYS_mkdir),
936        "mkdirat" => Some(libc::SYS_mkdirat),
937        #[cfg(target_arch = "x86_64")]
938        "rmdir" => Some(libc::SYS_rmdir),
939        #[cfg(target_arch = "x86_64")]
940        "getdents" => Some(libc::SYS_getdents),
941        "getdents64" => Some(libc::SYS_getdents64),
942        // Network
943        "socket" => Some(libc::SYS_socket),
944        "connect" => Some(libc::SYS_connect),
945        "sendto" => Some(libc::SYS_sendto),
946        "recvfrom" => Some(libc::SYS_recvfrom),
947        "sendmsg" => Some(libc::SYS_sendmsg),
948        "recvmsg" => Some(libc::SYS_recvmsg),
949        "shutdown" => Some(libc::SYS_shutdown),
950        "bind" => Some(libc::SYS_bind),
951        "listen" => Some(libc::SYS_listen),
952        "accept" => Some(libc::SYS_accept),
953        "accept4" => Some(libc::SYS_accept4),
954        "setsockopt" => Some(libc::SYS_setsockopt),
955        "getsockopt" => Some(libc::SYS_getsockopt),
956        "getsockname" => Some(libc::SYS_getsockname),
957        "getpeername" => Some(libc::SYS_getpeername),
958        "socketpair" => Some(libc::SYS_socketpair),
959        // Poll/Select
960        #[cfg(target_arch = "x86_64")]
961        "poll" => Some(libc::SYS_poll),
962        "ppoll" => Some(libc::SYS_ppoll),
963        #[cfg(target_arch = "x86_64")]
964        "select" => Some(libc::SYS_select),
965        "pselect6" => Some(libc::SYS_pselect6),
966        #[cfg(target_arch = "x86_64")]
967        "epoll_create" => Some(libc::SYS_epoll_create),
968        "epoll_create1" => Some(libc::SYS_epoll_create1),
969        "epoll_ctl" => Some(libc::SYS_epoll_ctl),
970        #[cfg(target_arch = "x86_64")]
971        "epoll_wait" => Some(libc::SYS_epoll_wait),
972        "epoll_pwait" => Some(libc::SYS_epoll_pwait),
973        #[cfg(target_arch = "x86_64")]
974        "eventfd" => Some(libc::SYS_eventfd),
975        "eventfd2" => Some(libc::SYS_eventfd2),
976        #[cfg(target_arch = "x86_64")]
977        "signalfd" => Some(libc::SYS_signalfd),
978        "signalfd4" => Some(libc::SYS_signalfd4),
979        "timerfd_create" => Some(libc::SYS_timerfd_create),
980        "timerfd_settime" => Some(libc::SYS_timerfd_settime),
981        "timerfd_gettime" => Some(libc::SYS_timerfd_gettime),
982        // Misc
983        "uname" => Some(libc::SYS_uname),
984        "getrandom" => Some(libc::SYS_getrandom),
985        "futex" => Some(libc::SYS_futex),
986        "set_tid_address" => Some(libc::SYS_set_tid_address),
987        "set_robust_list" => Some(libc::SYS_set_robust_list),
988        "get_robust_list" => Some(libc::SYS_get_robust_list),
989        #[cfg(target_arch = "x86_64")]
990        "arch_prctl" => Some(libc::SYS_arch_prctl),
991        "sysinfo" => Some(libc::SYS_sysinfo),
992        "umask" => Some(libc::SYS_umask),
993        #[cfg(target_arch = "x86_64")]
994        "getrlimit" => Some(libc::SYS_getrlimit),
995        "prlimit64" => Some(libc::SYS_prlimit64),
996        "getrusage" => Some(libc::SYS_getrusage),
997        "times" => Some(libc::SYS_times),
998        "sched_yield" => Some(libc::SYS_sched_yield),
999        "sched_getaffinity" => Some(libc::SYS_sched_getaffinity),
1000        "getcpu" => Some(libc::SYS_getcpu),
1001        "rseq" => Some(libc::SYS_rseq),
1002        "close_range" => Some(libc::SYS_close_range),
1003        "memfd_create" => Some(libc::SYS_memfd_create),
1004        "ioctl" => Some(libc::SYS_ioctl),
1005        "prctl" => Some(libc::SYS_prctl),
1006        // Landlock
1007        "landlock_create_ruleset" => Some(libc::SYS_landlock_create_ruleset),
1008        "landlock_add_rule" => Some(libc::SYS_landlock_add_rule),
1009        "landlock_restrict_self" => Some(libc::SYS_landlock_restrict_self),
1010        _ => None,
1011    }
1012}
1013
1014impl Default for SeccompManager {
1015    fn default() -> Self {
1016        Self::new()
1017    }
1018}
1019
1020#[cfg(test)]
1021mod tests {
1022    use super::*;
1023
1024    #[test]
1025    fn test_seccomp_manager_initial_state() {
1026        let mgr = SeccompManager::new();
1027        assert!(!mgr.is_applied());
1028    }
1029
1030    #[test]
1031    fn test_apply_idempotent() {
1032        let mgr = SeccompManager::new();
1033        // Note: We can't actually test application in unit tests
1034        // as it would affect the test process itself
1035        // This is tested in integration tests instead
1036        assert!(!mgr.is_applied());
1037    }
1038
1039    #[test]
1040    fn test_clone_denied_flags_include_newcgroup() {
1041        assert_ne!(
1042            DENIED_CLONE_NAMESPACE_FLAGS & libc::CLONE_NEWCGROUP as u64,
1043            0
1044        );
1045    }
1046
1047    #[test]
1048    fn test_clone_denied_flags_include_newtime() {
1049        assert_ne!(
1050            DENIED_CLONE_NAMESPACE_FLAGS & libc::CLONE_NEWTIME as u64,
1051            0,
1052            "CLONE_NEWTIME must be in denied clone namespace flags"
1053        );
1054    }
1055
1056    #[test]
1057    fn test_network_none_socket_domains_are_unix_only() {
1058        let domains = SeccompManager::allowed_socket_domains(false);
1059        assert_eq!(domains, vec![libc::AF_UNIX]);
1060    }
1061
1062    #[test]
1063    fn test_network_enabled_socket_domains_exclude_netlink() {
1064        let domains = SeccompManager::allowed_socket_domains(true);
1065        assert!(domains.contains(&libc::AF_UNIX));
1066        assert!(domains.contains(&libc::AF_INET));
1067        assert!(domains.contains(&libc::AF_INET6));
1068        assert!(!domains.contains(&libc::AF_NETLINK));
1069    }
1070
1071    #[test]
1072    fn test_network_mode_syscalls_only_enabled_when_network_allowed() {
1073        let none = SeccompManager::network_mode_syscalls(false);
1074        assert!(none.is_empty());
1075
1076        let enabled = SeccompManager::network_mode_syscalls(true);
1077        assert!(enabled.contains(&libc::SYS_connect));
1078        assert!(enabled.contains(&libc::SYS_bind));
1079        assert!(enabled.contains(&libc::SYS_listen));
1080        assert!(enabled.contains(&libc::SYS_accept));
1081        assert!(enabled.contains(&libc::SYS_setsockopt));
1082    }
1083
1084    #[test]
1085    fn test_landlock_bootstrap_syscalls_present_in_base_allowlist() {
1086        let base = SeccompManager::base_allowed_syscalls();
1087        assert!(base.contains(&libc::SYS_landlock_create_ruleset));
1088        assert!(base.contains(&libc::SYS_landlock_add_rule));
1089        assert!(base.contains(&libc::SYS_landlock_restrict_self));
1090    }
1091
1092    #[test]
1093    fn test_x32_legacy_range_not_allowlisted() {
1094        let base = SeccompManager::base_allowed_syscalls();
1095        let net = SeccompManager::network_mode_syscalls(true);
1096        for nr in 512_i64..=547_i64 {
1097            assert!(
1098                !base.contains(&nr) && !net.contains(&nr),
1099                "x32 syscall number {} unexpectedly allowlisted",
1100                nr
1101            );
1102        }
1103    }
1104
1105    #[test]
1106    fn test_i386_compat_socketcall_range_not_allowlisted() {
1107        let base = SeccompManager::base_allowed_syscalls();
1108        let net = SeccompManager::network_mode_syscalls(true);
1109        // i386 compat per syscall_32.tbl: socket..shutdown live at 359..373.
1110        // On x86_64 these numbers are outside our native allowlist surface.
1111        for nr in 359_i64..=373_i64 {
1112            assert!(
1113                !base.contains(&nr) && !net.contains(&nr),
1114                "i386 compat syscall number {} unexpectedly allowlisted",
1115                nr
1116            );
1117        }
1118    }
1119
1120    #[test]
1121    fn test_minimal_filter_allowlist_counts_are_stable() {
1122        let base = SeccompManager::base_allowed_syscalls();
1123        let net = SeccompManager::network_mode_syscalls(true);
1124
1125        // Snapshot counts to catch unintended policy drift.
1126        // +7 accounts for conditional rules inserted in minimal_filter():
1127        // socket/ioctl/prctl/mprotect/clone/clone3/execveat.
1128        // fork removed (forces through filtered clone path).
1129        // execveat removed from base (arg-filtered separately).
1130        assert_eq!(base.len(), 131);
1131        assert_eq!(net.len(), 11);
1132        assert_eq!(base.len() + 7, 138);
1133        assert_eq!(base.len() + net.len() + 7, 149);
1134    }
1135
1136    #[test]
1137    fn test_arg_filtered_syscalls_list_includes_critical_syscalls() {
1138        // These syscalls must be in the arg-filtered list so custom profiles
1139        // get warnings when they allow them without filters.
1140        for name in &["clone", "clone3", "execveat", "ioctl", "prctl", "socket"] {
1141            assert!(
1142                SeccompManager::ARG_FILTERED_SYSCALLS.contains(name),
1143                "'{}' must be in ARG_FILTERED_SYSCALLS",
1144                name
1145            );
1146        }
1147    }
1148
1149    #[test]
1150    fn test_clone3_allowed_in_minimal_filter() {
1151        // clone3 MUST be in the BPF rules map — glibc 2.34+ and newer musl
1152        // use clone3 internally for posix_spawn/fork. Blocking it breaks
1153        // std::process::Command on modern systems. Namespace creation is
1154        // prevented by dropped capabilities (CAP_SYS_ADMIN etc.), not seccomp.
1155        let rules = SeccompManager::minimal_filter(true).unwrap();
1156        assert!(
1157            rules.contains_key(&libc::SYS_clone3),
1158            "clone3 must be in the seccomp allowlist (glibc 2.34+ requires it)"
1159        );
1160    }
1161
1162    #[test]
1163    fn test_clone_is_allowed_with_arg_filter() {
1164        // clone (not clone3) should still be in the rules with arg filtering
1165        let rules = SeccompManager::minimal_filter(true).unwrap();
1166        assert!(
1167            rules.contains_key(&libc::SYS_clone),
1168            "clone must be in the seccomp allowlist with arg filters"
1169        );
1170    }
1171
1172    #[test]
1173    fn test_high_risk_syscalls_removed_from_base_allowlist() {
1174        let base = SeccompManager::base_allowed_syscalls();
1175        let removed = [
1176            libc::SYS_chown,
1177            libc::SYS_fchown,
1178            libc::SYS_lchown,
1179            libc::SYS_fchownat,
1180            libc::SYS_sync,
1181            libc::SYS_syncfs,
1182            libc::SYS_mlock,
1183            libc::SYS_munlock,
1184            libc::SYS_mincore,
1185            libc::SYS_vfork,
1186            libc::SYS_tkill,
1187        ];
1188
1189        for syscall in removed {
1190            assert!(
1191                !base.contains(&syscall),
1192                "syscall {} unexpectedly present in base allowlist",
1193                syscall
1194            );
1195        }
1196    }
1197
1198    #[test]
1199    fn test_custom_profile_preserves_clone_arg_filters() {
1200        // SEC-01: Custom seccomp profiles that allow "clone" must still get
1201        // argument-level filtering to block namespace-creating flags.
1202        // Verify by inspecting the built-in filter rules that serve as the
1203        // merge source for apply_profile_from_file.
1204        let rules = SeccompManager::minimal_filter(true).unwrap();
1205
1206        // Every ARG_FILTERED_SYSCALLS entry (except clone3, which is allowed
1207        // unconditionally since BPF can't inspect its struct-based flags) must
1208        // have non-empty argument-level rules in the built-in filter so that
1209        // apply_profile_from_file can merge them.
1210        for name in SeccompManager::ARG_FILTERED_SYSCALLS {
1211            if *name == "clone3" {
1212                // clone3 is allowed unconditionally — BPF cannot dereference
1213                // the clone_args struct, so arg filtering is impossible.
1214                // Namespace defense relies on dropped capabilities.
1215                continue;
1216            }
1217            if let Some(nr) = syscall_name_to_number(name) {
1218                let entry = rules.get(&nr);
1219                assert!(
1220                    entry.is_some() && !entry.unwrap().is_empty(),
1221                    "built-in filter must have argument-level rules for '{}' \
1222                     so apply_profile_from_file can merge them into custom profiles",
1223                    name
1224                );
1225            }
1226        }
1227    }
1228
1229    #[test]
1230    fn test_memfd_create_not_in_default_allowlist() {
1231        // SEC-02: memfd_create enables fileless code execution when combined with execveat.
1232        let base = SeccompManager::base_allowed_syscalls();
1233        assert!(
1234            !base.contains(&libc::SYS_memfd_create),
1235            "memfd_create must not be in the default seccomp allowlist (fileless exec risk)"
1236        );
1237        // Also verify it's not sneaked into the compiled filter rules
1238        let rules = SeccompManager::minimal_filter(true).unwrap();
1239        assert!(
1240            !rules.contains_key(&libc::SYS_memfd_create),
1241            "memfd_create must not be in the compiled seccomp filter rules"
1242        );
1243    }
1244
1245    #[test]
1246    fn test_mprotect_has_arg_filtering() {
1247        // SEC-03: mprotect must have argument-level filtering to prevent W^X
1248        // (PROT_WRITE|PROT_EXEC) violations. Verify via runtime data structures.
1249
1250        // mprotect must NOT be in the unconditional base allowlist
1251        let base = SeccompManager::base_allowed_syscalls();
1252        assert!(
1253            !base.contains(&libc::SYS_mprotect),
1254            "SYS_mprotect must not be unconditionally allowed - needs arg filtering"
1255        );
1256
1257        // mprotect must be present in the compiled filter with non-empty
1258        // argument conditions (the conditions enforce W^X)
1259        let rules = SeccompManager::minimal_filter(true).unwrap();
1260        let mprotect_rules = rules.get(&libc::SYS_mprotect);
1261        assert!(
1262            mprotect_rules.is_some(),
1263            "mprotect must be present in the seccomp filter rules"
1264        );
1265        assert!(
1266            !mprotect_rules.unwrap().is_empty(),
1267            "mprotect must have argument-level conditions to prevent W^X violations"
1268        );
1269    }
1270
1271    #[test]
1272    fn test_unsafe_blocks_have_safety_comments() {
1273        // SEC-08: All unsafe blocks must have // SAFETY: documentation
1274        let source = include_str!("seccomp.rs");
1275        let mut pos = 0;
1276        while let Some(idx) = source[pos..].find("unsafe {") {
1277            let abs_idx = pos + idx;
1278            // Check that there's a SAFETY comment within 200 chars before the unsafe block
1279            let start = abs_idx.saturating_sub(200);
1280            let context = &source[start..abs_idx];
1281            assert!(
1282                context.contains("SAFETY:"),
1283                "unsafe block at byte {} must have a // SAFETY: comment. Context: ...{}...",
1284                abs_idx,
1285                &source[abs_idx.saturating_sub(80)..abs_idx + 10]
1286            );
1287            pos = abs_idx + 1;
1288        }
1289    }
1290
1291    // --- H-1: mprotect MaskedEq logic verification ---
1292    //
1293    // The mprotect filter uses MaskedEq((PROT_WRITE | PROT_EXEC), value) to
1294    // allow only combinations where the W|X bits match one of {0, W, X}.
1295    // These tests prove the logic is correct without installing a real
1296    // seccomp filter (which would affect the test process).
1297
1298    /// Helper: simulates the MaskedEq check that the seccomp BPF would perform.
1299    /// Returns true if the prot value would be ALLOWED by one of the rules.
1300    fn mprotect_would_allow(prot: u64) -> bool {
1301        let mask = (libc::PROT_WRITE | libc::PROT_EXEC) as u64;
1302        let allowed_values: &[u64] = &[0, libc::PROT_WRITE as u64, libc::PROT_EXEC as u64];
1303        let masked = prot & mask;
1304        allowed_values.contains(&masked)
1305    }
1306
1307    #[test]
1308    fn test_mprotect_allows_prot_none() {
1309        assert!(mprotect_would_allow(0), "PROT_NONE must be allowed");
1310    }
1311
1312    #[test]
1313    fn test_mprotect_allows_prot_read_only() {
1314        assert!(
1315            mprotect_would_allow(libc::PROT_READ as u64),
1316            "PROT_READ must be allowed (W|X bits are 0)"
1317        );
1318    }
1319
1320    #[test]
1321    fn test_mprotect_allows_prot_read_write() {
1322        assert!(
1323            mprotect_would_allow((libc::PROT_READ | libc::PROT_WRITE) as u64),
1324            "PROT_READ|PROT_WRITE must be allowed"
1325        );
1326    }
1327
1328    #[test]
1329    fn test_mprotect_allows_prot_read_exec() {
1330        assert!(
1331            mprotect_would_allow((libc::PROT_READ | libc::PROT_EXEC) as u64),
1332            "PROT_READ|PROT_EXEC must be allowed"
1333        );
1334    }
1335
1336    #[test]
1337    fn test_mprotect_rejects_prot_write_exec() {
1338        assert!(
1339            !mprotect_would_allow((libc::PROT_WRITE | libc::PROT_EXEC) as u64),
1340            "PROT_WRITE|PROT_EXEC (W^X violation) must be REJECTED"
1341        );
1342    }
1343
1344    #[test]
1345    fn test_mprotect_rejects_prot_read_write_exec() {
1346        assert!(
1347            !mprotect_would_allow((libc::PROT_READ | libc::PROT_WRITE | libc::PROT_EXEC) as u64),
1348            "PROT_READ|PROT_WRITE|PROT_EXEC (W^X violation) must be REJECTED"
1349        );
1350    }
1351
1352    #[test]
1353    fn test_mprotect_allows_prot_write_alone() {
1354        assert!(
1355            mprotect_would_allow(libc::PROT_WRITE as u64),
1356            "PROT_WRITE alone must be allowed"
1357        );
1358    }
1359
1360    #[test]
1361    fn test_mprotect_allows_prot_exec_alone() {
1362        assert!(
1363            mprotect_would_allow(libc::PROT_EXEC as u64),
1364            "PROT_EXEC alone must be allowed"
1365        );
1366    }
1367}