Skip to main content

nucleus/security/
seccomp.rs

1use crate::error::{NucleusError, Result};
2use crate::security::policy::sha256_hex;
3use seccompiler::{BpfProgram, SeccompAction, SeccompCondition, SeccompFilter, SeccompRule};
4use std::collections::BTreeMap;
5use std::path::Path;
6use tracing::{debug, info, warn};
7
/// Manager for the process's seccomp syscall filter.
///
/// Implements the syscall-whitelisting part of the security state machine
/// (NucleusSecurity_Seccomp_SeccompEnforcement.tla). The manager remembers
/// whether a filter has already been installed through it, so repeated
/// `apply_*` calls become no-ops once one has succeeded.
pub struct SeccompManager {
    /// `true` once a filter has been successfully installed via this manager.
    applied: bool,
}
15
16const DENIED_CLONE_NAMESPACE_FLAGS: u64 = (libc::CLONE_NEWUSER
17    | libc::CLONE_NEWNS
18    | libc::CLONE_NEWNET
19    | libc::CLONE_NEWIPC
20    | libc::CLONE_NEWUTS
21    | libc::CLONE_NEWPID
22    | libc::CLONE_NEWCGROUP
23    | libc::CLONE_NEWTIME) as u64;
24
25impl SeccompManager {
26    pub fn new() -> Self {
27        Self { applied: false }
28    }
29
30    fn base_allowed_syscalls() -> Vec<i64> {
31        vec![
32            // File I/O
33            libc::SYS_read,
34            libc::SYS_write,
35            libc::SYS_open,
36            libc::SYS_openat,
37            libc::SYS_close,
38            libc::SYS_stat,
39            libc::SYS_fstat,
40            libc::SYS_lstat,
41            libc::SYS_lseek,
42            libc::SYS_access,
43            libc::SYS_fcntl,
44            libc::SYS_readv,
45            libc::SYS_writev,
46            libc::SYS_pread64,
47            libc::SYS_pwrite64,
48            libc::SYS_readlink,
49            libc::SYS_readlinkat,
50            libc::SYS_newfstatat,
51            libc::SYS_statx,
52            libc::SYS_faccessat,
53            libc::SYS_faccessat2,
54            libc::SYS_dup,
55            libc::SYS_dup2,
56            libc::SYS_dup3,
57            libc::SYS_pipe,
58            libc::SYS_pipe2,
59            libc::SYS_unlink,
60            libc::SYS_unlinkat,
61            libc::SYS_rename,
62            libc::SYS_renameat,
63            libc::SYS_renameat2,
64            libc::SYS_link,
65            libc::SYS_linkat,
66            libc::SYS_symlink,
67            libc::SYS_symlinkat,
68            libc::SYS_chmod,
69            libc::SYS_fchmod,
70            libc::SYS_fchmodat,
71            libc::SYS_truncate,
72            libc::SYS_ftruncate,
73            libc::SYS_fallocate,
74            libc::SYS_fadvise64,
75            libc::SYS_fsync,
76            libc::SYS_fdatasync,
77            libc::SYS_flock,
78            libc::SYS_sendfile,
79            libc::SYS_copy_file_range,
80            libc::SYS_splice,
81            libc::SYS_tee,
82            // Memory management
83            libc::SYS_mmap,
84            libc::SYS_munmap,
85            libc::SYS_brk,
86            libc::SYS_mremap,
87            libc::SYS_madvise,
88            libc::SYS_msync,
89            // Process management
90            // fork intentionally excluded — modern glibc/musl use clone(), which
91            // has namespace-flag filtering. Removing SYS_fork forces all forks
92            // through the filtered clone path (defense-in-depth against fork bombs
93            // and unfiltered namespace creation).
94            libc::SYS_execve,
95            // execveat is conditionally allowed below (AT_EMPTY_PATH blocked)
96            libc::SYS_wait4,
97            libc::SYS_waitid,
98            libc::SYS_exit,
99            libc::SYS_exit_group,
100            libc::SYS_getpid,
101            libc::SYS_gettid,
102            libc::SYS_getuid,
103            libc::SYS_getgid,
104            libc::SYS_geteuid,
105            libc::SYS_getegid,
106            libc::SYS_getppid,
107            libc::SYS_getpgrp,
108            libc::SYS_setsid,
109            libc::SYS_getgroups,
110            // Signals
111            libc::SYS_rt_sigaction,
112            libc::SYS_rt_sigprocmask,
113            libc::SYS_rt_sigreturn,
114            libc::SYS_rt_sigsuspend,
115            libc::SYS_sigaltstack,
116            libc::SYS_kill,
117            libc::SYS_tgkill,
118            // Time
119            libc::SYS_clock_gettime,
120            libc::SYS_clock_getres,
121            libc::SYS_clock_nanosleep,
122            libc::SYS_gettimeofday,
123            libc::SYS_nanosleep,
124            // Directories
125            libc::SYS_getcwd,
126            libc::SYS_chdir,
127            libc::SYS_fchdir,
128            libc::SYS_mkdir,
129            libc::SYS_mkdirat,
130            libc::SYS_rmdir,
131            libc::SYS_getdents,
132            libc::SYS_getdents64,
133            // Misc
134            libc::SYS_uname,
135            libc::SYS_getrandom,
136            libc::SYS_futex,
137            libc::SYS_set_tid_address,
138            libc::SYS_set_robust_list,
139            libc::SYS_get_robust_list,
140            libc::SYS_arch_prctl,
141            libc::SYS_sysinfo,
142            libc::SYS_umask,
143            libc::SYS_getrlimit,
144            libc::SYS_prlimit64,
145            libc::SYS_getrusage,
146            libc::SYS_times,
147            libc::SYS_sched_yield,
148            libc::SYS_sched_getaffinity,
149            libc::SYS_getcpu,
150            libc::SYS_rseq,
151            libc::SYS_close_range,
152            // NOTE: memfd_create intentionally excluded — combined with execveat
153            // it enables fileless code execution bypassing all FS controls (SEC-02).
154            // Landlock bootstrap (runtime applies seccomp before Landlock)
155            libc::SYS_landlock_create_ruleset,
156            libc::SYS_landlock_add_rule,
157            libc::SYS_landlock_restrict_self,
158            // Socket/Network (safe introspection + local socketpair)
159            libc::SYS_getsockname,
160            libc::SYS_getpeername,
161            libc::SYS_socketpair,
162            libc::SYS_getsockopt,
163            // Poll/Select
164            libc::SYS_poll,
165            libc::SYS_ppoll,
166            libc::SYS_select,
167            libc::SYS_pselect6,
168            libc::SYS_epoll_create,
169            libc::SYS_epoll_create1,
170            libc::SYS_epoll_ctl,
171            libc::SYS_epoll_wait,
172            libc::SYS_epoll_pwait,
173            libc::SYS_eventfd,
174            libc::SYS_eventfd2,
175            libc::SYS_signalfd,
176            libc::SYS_signalfd4,
177            libc::SYS_timerfd_create,
178            libc::SYS_timerfd_settime,
179            libc::SYS_timerfd_gettime,
180        ]
181    }
182
183    fn allowed_socket_domains(allow_network: bool) -> Vec<i32> {
184        if allow_network {
185            vec![libc::AF_UNIX, libc::AF_INET, libc::AF_INET6]
186        } else {
187            vec![libc::AF_UNIX]
188        }
189    }
190
191    fn network_mode_syscalls(allow_network: bool) -> Vec<i64> {
192        if allow_network {
193            vec![
194                libc::SYS_connect,
195                libc::SYS_sendto,
196                libc::SYS_recvfrom,
197                libc::SYS_sendmsg,
198                libc::SYS_recvmsg,
199                libc::SYS_shutdown,
200                libc::SYS_bind,
201                libc::SYS_listen,
202                libc::SYS_accept,
203                libc::SYS_accept4,
204                libc::SYS_setsockopt,
205            ]
206        } else {
207            Vec::new()
208        }
209    }
210
211    /// Get minimal syscall whitelist for basic container operation
212    ///
213    /// This is a restrictive whitelist that blocks dangerous syscalls:
214    /// - ptrace (process tracing)
215    /// - kexec_load (kernel loading)
216    /// - add_key, request_key, keyctl (kernel keyring)
217    /// - bpf (eBPF programs)
218    /// - perf_event_open (performance monitoring)
219    /// - userfaultfd (user fault handling)
220    fn minimal_filter(allow_network: bool) -> Result<BTreeMap<i64, Vec<SeccompRule>>> {
221        let mut rules: BTreeMap<i64, Vec<SeccompRule>> = BTreeMap::new();
222
223        // Essential syscalls for basic operation
224        let allowed_syscalls = Self::base_allowed_syscalls();
225
226        // Allow all these syscalls unconditionally
227        for syscall in allowed_syscalls {
228            rules.insert(syscall, Vec::new());
229        }
230
231        // Add network-mode-specific syscalls
232        for syscall in Self::network_mode_syscalls(allow_network) {
233            rules.insert(syscall, Vec::new());
234        }
235
236        // Restrict socket() domains by network mode.
237        // none: AF_UNIX only; network-enabled: AF_UNIX/AF_INET/AF_INET6.
238        let mut socket_rules = Vec::new();
239        for domain in Self::allowed_socket_domains(allow_network) {
240            let condition = SeccompCondition::new(
241                0, // arg0 is socket(domain, type, protocol)
242                seccompiler::SeccompCmpArgLen::Dword,
243                seccompiler::SeccompCmpOp::Eq,
244                domain as u64,
245            )
246            .map_err(|e| {
247                NucleusError::SeccompError(format!(
248                    "Failed to create socket domain condition: {}",
249                    e
250                ))
251            })?;
252            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
253                NucleusError::SeccompError(format!("Failed to create socket rule: {}", e))
254            })?;
255            socket_rules.push(rule);
256        }
257        rules.insert(libc::SYS_socket, socket_rules);
258
259        // ioctl: allow only safe terminal operations (arg0 = request code)
260        let ioctl_allowed: &[u64] = &[
261            0x5401, // TCGETS
262            0x5402, // TCSETS
263            0x5403, // TCSETSW
264            0x5404, // TCSETSF
265            0x540B, // TCFLSH
266            0x540F, // TIOCGPGRP
267            0x5410, // TIOCSPGRP
268            0x5413, // TIOCGWINSZ
269            0x5429, // TIOCGSID
270            0x541B, // FIONREAD
271            // FIONBIO (0x5421) intentionally excluded — sets non-blocking mode
272            // on network sockets, enabling sophisticated network exploitation.
273            0x5451, // FIOCLEX
274            0x5450, // FIONCLEX
275        ];
276        let mut ioctl_rules = Vec::new();
277        for &request in ioctl_allowed {
278            let condition = SeccompCondition::new(
279                1, // arg1 is the request code for ioctl(fd, request, ...)
280                seccompiler::SeccompCmpArgLen::Dword,
281                seccompiler::SeccompCmpOp::Eq,
282                request,
283            )
284            .map_err(|e| {
285                NucleusError::SeccompError(format!("Failed to create ioctl condition: {}", e))
286            })?;
287            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
288                NucleusError::SeccompError(format!("Failed to create ioctl rule: {}", e))
289            })?;
290            ioctl_rules.push(rule);
291        }
292        rules.insert(libc::SYS_ioctl, ioctl_rules);
293
294        // prctl: allow only safe operations (arg0 = option).
295        // Notably absent (hit default deny):
296        //   PR_CAPBSET_READ (23) — leaks capability bounding set info
297        //   PR_CAPBSET_DROP (24) — could weaken the capability bounding set
298        //   PR_SET_SECUREBITS (28) — could disable secure-exec restrictions
299        let prctl_allowed: &[u64] = &[
300            1,  // PR_SET_PDEATHSIG
301            2,  // PR_GET_PDEATHSIG
302            15, // PR_SET_NAME
303            16, // PR_GET_NAME
304            38, // PR_SET_NO_NEW_PRIVS
305            39, // PR_GET_NO_NEW_PRIVS
306        ];
307        let mut prctl_rules = Vec::new();
308        for &option in prctl_allowed {
309            let condition = SeccompCondition::new(
310                0, // arg0 is the option for prctl(option, ...)
311                seccompiler::SeccompCmpArgLen::Dword,
312                seccompiler::SeccompCmpOp::Eq,
313                option,
314            )
315            .map_err(|e| {
316                NucleusError::SeccompError(format!("Failed to create prctl condition: {}", e))
317            })?;
318            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
319                NucleusError::SeccompError(format!("Failed to create prctl rule: {}", e))
320            })?;
321            prctl_rules.push(rule);
322        }
323        rules.insert(libc::SYS_prctl, prctl_rules);
324
325        // mprotect: permit RW or RX transitions, but reject PROT_WRITE|PROT_EXEC.
326        let mut mprotect_rules = Vec::new();
327        for allowed in [0, libc::PROT_WRITE as u64, libc::PROT_EXEC as u64] {
328            let condition = SeccompCondition::new(
329                2, // arg2 is prot for mprotect(addr, len, prot)
330                seccompiler::SeccompCmpArgLen::Dword,
331                seccompiler::SeccompCmpOp::MaskedEq((libc::PROT_WRITE | libc::PROT_EXEC) as u64),
332                allowed,
333            )
334            .map_err(|e| {
335                NucleusError::SeccompError(format!("Failed to create mprotect condition: {}", e))
336            })?;
337            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
338                NucleusError::SeccompError(format!("Failed to create mprotect rule: {}", e))
339            })?;
340            mprotect_rules.push(rule);
341        }
342        rules.insert(libc::SYS_mprotect, mprotect_rules);
343
344        // clone3: ALLOWED unconditionally. clone3 passes flags inside a struct
345        // pointer that seccomp BPF cannot dereference, so namespace-flag filtering
346        // is impossible at the BPF level. However, glibc 2.34+ and newer musl use
347        // clone3 internally for posix_spawn/fork — blocking it breaks
348        // std::process::Command and any child-process spawning on modern systems.
349        //
350        // SECURITY INVARIANT: Namespace creation via clone3 is prevented solely by
351        // dropping CAP_SYS_ADMIN (and other namespace caps) *before* this seccomp
352        // filter is installed. If capability dropping is bypassed, clone3 becomes
353        // an unfiltered path to namespace creation. This is a known single point
354        // of failure — see CapabilityManager::drop_all() which must run first.
355        rules.insert(libc::SYS_clone3, Vec::new());
356
357        // clone: allow but deny namespace-creating flags to prevent nested namespace creation
358        let clone_condition = SeccompCondition::new(
359            0, // arg0 = flags
360            seccompiler::SeccompCmpArgLen::Qword,
361            seccompiler::SeccompCmpOp::MaskedEq(DENIED_CLONE_NAMESPACE_FLAGS),
362            0, // (flags & ns_flags) == 0: none of the namespace flags set
363        )
364        .map_err(|e| {
365            NucleusError::SeccompError(format!("Failed to create clone condition: {}", e))
366        })?;
367        let clone_rule = SeccompRule::new(vec![clone_condition]).map_err(|e| {
368            NucleusError::SeccompError(format!("Failed to create clone rule: {}", e))
369        })?;
370        rules.insert(libc::SYS_clone, vec![clone_rule]);
371
372        // execveat: allow but block AT_EMPTY_PATH (0x1000) to prevent fileless
373        // execution. With AT_EMPTY_PATH, execveat can execute code from any open
374        // fd (e.g., open + unlink, or even a socket fd), bypassing filesystem
375        // controls — not just memfd_create. Blocking memfd_create alone is
376        // insufficient. Normal execveat with dirfd+pathname (no AT_EMPTY_PATH)
377        // remains allowed.
378        let execveat_condition = SeccompCondition::new(
379            4, // arg4 = flags for execveat(dirfd, pathname, argv, envp, flags)
380            seccompiler::SeccompCmpArgLen::Dword,
381            seccompiler::SeccompCmpOp::MaskedEq(libc::AT_EMPTY_PATH as u64),
382            0, // (flags & AT_EMPTY_PATH) == 0: AT_EMPTY_PATH not set
383        )
384        .map_err(|e| {
385            NucleusError::SeccompError(format!("Failed to create execveat condition: {}", e))
386        })?;
387        let execveat_rule = SeccompRule::new(vec![execveat_condition]).map_err(|e| {
388            NucleusError::SeccompError(format!("Failed to create execveat rule: {}", e))
389        })?;
390        rules.insert(libc::SYS_execveat, vec![execveat_rule]);
391
392        Ok(rules)
393    }
394
395    /// Compile the minimal BPF filter without applying it
396    ///
397    /// This is useful for benchmarking filter compilation overhead
398    /// without the irreversible side effect of applying the filter.
399    pub fn compile_minimal_filter() -> Result<BpfProgram> {
400        let rules = Self::minimal_filter(true)?;
401        let filter = SeccompFilter::new(
402            rules,
403            SeccompAction::Errno(libc::EPERM as u32),
404            SeccompAction::Allow,
405            std::env::consts::ARCH.try_into().map_err(|e| {
406                NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
407            })?,
408        )
409        .map_err(|e| {
410            NucleusError::SeccompError(format!("Failed to create seccomp filter: {}", e))
411        })?;
412
413        let bpf_prog: BpfProgram = filter.try_into().map_err(|e| {
414            NucleusError::SeccompError(format!("Failed to compile BPF program: {}", e))
415        })?;
416
417        Ok(bpf_prog)
418    }
419
420    /// Apply seccomp filter
421    ///
422    /// This implements the transition: no_filter -> whitelist_active
423    /// in the seccomp state machine (NucleusSecurity_Seccomp_SeccompEnforcement.tla)
424    ///
425    /// Once applied, the filter cannot be removed (irreversible property)
426    /// In rootless mode or if seccomp setup fails, this will warn and continue
427    pub fn apply_minimal_filter(&mut self) -> Result<bool> {
428        self.apply_minimal_filter_with_mode(false, false)
429    }
430
431    /// Apply seccomp filter with configurable failure behavior
432    ///
433    /// When `best_effort` is true, failures are logged and execution continues.
434    /// When false, seccomp setup is fail-closed.
435    pub fn apply_minimal_filter_with_mode(
436        &mut self,
437        best_effort: bool,
438        log_denied: bool,
439    ) -> Result<bool> {
440        self.apply_filter_for_network_mode(true, best_effort, log_denied)
441    }
442
443    /// Apply seccomp filter with network-mode-aware socket restrictions
444    ///
445    /// When `allow_network` is false, `SYS_socket` is restricted to AF_UNIX only,
446    /// preventing creation of network sockets (AF_INET, AF_INET6, etc.).
447    /// When `allow_network` is true, all socket domains are permitted.
448    ///
449    /// When `best_effort` is true, failures are logged and execution continues.
450    /// When false, seccomp setup is fail-closed.
451    pub fn apply_filter_for_network_mode(
452        &mut self,
453        allow_network: bool,
454        best_effort: bool,
455        log_denied: bool,
456    ) -> Result<bool> {
457        if self.applied {
458            debug!("Seccomp filter already applied, skipping");
459            return Ok(true);
460        }
461
462        info!(allow_network, "Applying seccomp filter");
463
464        let rules = match Self::minimal_filter(allow_network) {
465            Ok(r) => r,
466            Err(e) => {
467                if best_effort {
468                    warn!(
469                        "Failed to create seccomp rules: {} (continuing without seccomp)",
470                        e
471                    );
472                    return Ok(false);
473                }
474                return Err(e);
475            }
476        };
477
478        let filter = match SeccompFilter::new(
479            rules,
480            SeccompAction::Errno(libc::EPERM as u32), // Default: deny with EPERM
481            SeccompAction::Allow,                     // Match action: allow
482            std::env::consts::ARCH.try_into().map_err(|e| {
483                NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
484            })?,
485        ) {
486            Ok(f) => f,
487            Err(e) => {
488                if best_effort {
489                    warn!(
490                        "Failed to create seccomp filter: {} (continuing without seccomp)",
491                        e
492                    );
493                    return Ok(false);
494                }
495                return Err(NucleusError::SeccompError(format!(
496                    "Failed to create seccomp filter: {}",
497                    e
498                )));
499            }
500        };
501
502        let bpf_prog: BpfProgram = match filter.try_into() {
503            Ok(p) => p,
504            Err(e) => {
505                if best_effort {
506                    warn!(
507                        "Failed to compile BPF program: {} (continuing without seccomp)",
508                        e
509                    );
510                    return Ok(false);
511                }
512                return Err(NucleusError::SeccompError(format!(
513                    "Failed to compile BPF program: {}",
514                    e
515                )));
516            }
517        };
518
519        // Apply the filter
520        match Self::apply_bpf_program(&bpf_prog, log_denied) {
521            Ok(_) => {
522                self.applied = true;
523                info!("Successfully applied seccomp filter");
524                Ok(true)
525            }
526            Err(e) => {
527                if best_effort {
528                    warn!(
529                        "Failed to apply seccomp filter: {} (continuing without seccomp)",
530                        e
531                    );
532                    Ok(false)
533                } else {
534                    Err(NucleusError::SeccompError(format!(
535                        "Failed to apply seccomp filter: {}",
536                        e
537                    )))
538                }
539            }
540        }
541    }
542
543    /// Apply a seccomp profile loaded from a JSON file.
544    ///
545    /// The profile format is a JSON object with:
546    /// ```json
547    /// {
548    ///   "defaultAction": "SCMP_ACT_ERRNO",
549    ///   "syscalls": [
550    ///     { "names": ["read", "write", "open", ...], "action": "SCMP_ACT_ALLOW" }
551    ///   ]
552    /// }
553    /// ```
554    ///
555    /// This is a subset of the OCI seccomp profile format. Only the syscall name
556    /// allowlist is used; argument-level filtering from the built-in profile is
557    /// not applied when using a custom profile.
558    ///
559    /// If `expected_sha256` is provided, the file's SHA-256 hash is verified
560    /// against it before loading. This prevents silent profile tampering.
561    pub fn apply_profile_from_file(
562        &mut self,
563        profile_path: &Path,
564        expected_sha256: Option<&str>,
565        audit_mode: bool,
566    ) -> Result<bool> {
567        if self.applied {
568            debug!("Seccomp filter already applied, skipping");
569            return Ok(true);
570        }
571
572        info!("Loading seccomp profile from {:?}", profile_path);
573
574        // Read profile file
575        let content = std::fs::read(profile_path).map_err(|e| {
576            NucleusError::SeccompError(format!(
577                "Failed to read seccomp profile {:?}: {}",
578                profile_path, e
579            ))
580        })?;
581
582        // Verify SHA-256 hash if expected
583        if let Some(expected) = expected_sha256 {
584            let actual = sha256_hex(&content);
585            if actual != expected {
586                return Err(NucleusError::SeccompError(format!(
587                    "Seccomp profile hash mismatch: expected {}, got {}",
588                    expected, actual
589                )));
590            }
591            info!("Seccomp profile hash verified: {}", actual);
592        }
593
594        // Parse profile
595        let profile: SeccompProfile = serde_json::from_slice(&content).map_err(|e| {
596            NucleusError::SeccompError(format!("Failed to parse seccomp profile: {}", e))
597        })?;
598
599        // Warn when custom profile allows security-critical syscalls without
600        // argument-level filtering. The built-in filter restricts clone, ioctl,
601        // prctl, and socket at the argument level; a custom profile that allows
602        // them by name only silently removes all of that hardening.
603        Self::warn_missing_arg_filters(&profile);
604
605        // Build filter from profile
606        let mut rules: BTreeMap<i64, Vec<SeccompRule>> = BTreeMap::new();
607
608        for syscall_group in &profile.syscalls {
609            if syscall_group.action == "SCMP_ACT_ALLOW" {
610                for name in &syscall_group.names {
611                    if let Some(nr) = syscall_name_to_number(name) {
612                        rules.insert(nr, Vec::new());
613                    } else {
614                        warn!("Unknown syscall in profile: {} (skipping)", name);
615                    }
616                }
617            }
618        }
619
620        // SEC-01: Merge built-in argument filters for security-critical syscalls.
621        // Custom profiles that allow clone/ioctl/prctl/socket/mprotect by name
622        // without argument-level filters would silently remove all hardening.
623        // Overwrite their empty rules with the built-in argument-filtered rules.
624        let builtin_rules = Self::minimal_filter(true)?;
625        for syscall_name in Self::ARG_FILTERED_SYSCALLS {
626            if let Some(nr) = syscall_name_to_number(syscall_name) {
627                if let std::collections::btree_map::Entry::Occupied(mut entry) = rules.entry(nr) {
628                    if let Some(builtin) = builtin_rules.get(&nr) {
629                        if !builtin.is_empty() {
630                            info!(
631                                "Merging built-in argument filters for '{}' into custom profile",
632                                syscall_name
633                            );
634                            entry.insert(builtin.clone());
635                        }
636                    }
637                }
638            }
639        }
640        // Also enforce clone3 denial — it cannot be argument-filtered
641        rules.remove(&libc::SYS_clone3);
642
643        let filter = SeccompFilter::new(
644            rules,
645            SeccompAction::Errno(libc::EPERM as u32),
646            SeccompAction::Allow,
647            std::env::consts::ARCH.try_into().map_err(|e| {
648                NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
649            })?,
650        )
651        .map_err(|e| {
652            NucleusError::SeccompError(format!(
653                "Failed to create seccomp filter from profile: {}",
654                e
655            ))
656        })?;
657
658        let bpf_prog: BpfProgram = filter.try_into().map_err(|e| {
659            NucleusError::SeccompError(format!("Failed to compile BPF program from profile: {}", e))
660        })?;
661
662        match Self::apply_bpf_program(&bpf_prog, audit_mode) {
663            Ok(_) => {
664                self.applied = true;
665                info!(
666                    "Seccomp profile applied from {:?} (log_denied={})",
667                    profile_path, audit_mode
668                );
669                Ok(true)
670            }
671            Err(e) => Err(e),
672        }
673    }
674
675    /// Install an allow-all seccomp filter with SECCOMP_FILTER_FLAG_LOG.
676    ///
677    /// Used in trace mode: all syscalls are allowed but logged to the kernel
678    /// audit subsystem. A separate reader collects the logged syscalls.
679    pub fn apply_trace_filter(&mut self) -> Result<bool> {
680        if self.applied {
681            debug!("Seccomp filter already applied, skipping trace filter");
682            return Ok(true);
683        }
684
685        info!("Applying seccomp trace filter (allow-all + LOG)");
686
687        // Create an empty rule set — with SeccompAction::Allow as default,
688        // every syscall is permitted. The LOG flag causes the kernel to
689        // audit each syscall decision.
690        let filter = SeccompFilter::new(
691            BTreeMap::new(),
692            SeccompAction::Allow, // default: allow everything
693            SeccompAction::Allow, // match action (unused — no rules)
694            std::env::consts::ARCH.try_into().map_err(|e| {
695                NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
696            })?,
697        )
698        .map_err(|e| NucleusError::SeccompError(format!("Failed to create trace filter: {}", e)))?;
699
700        let bpf_prog: BpfProgram = filter.try_into().map_err(|e| {
701            NucleusError::SeccompError(format!("Failed to compile trace BPF: {}", e))
702        })?;
703
704        // Apply with LOG flag so kernel audits every syscall
705        Self::apply_bpf_program(&bpf_prog, true)?;
706        self.applied = true;
707        info!("Seccomp trace filter applied (all syscalls allowed + logged)");
708        Ok(true)
709    }
710
711    /// Syscalls that the built-in filter restricts at the argument level.
712    /// Custom profiles allowing these without argument filters weaken security.
713    const ARG_FILTERED_SYSCALLS: &'static [&'static str] =
714        &["clone", "clone3", "execveat", "ioctl", "mprotect", "prctl", "socket"];
715
716    /// Warn when a custom seccomp profile allows security-critical syscalls
717    /// without argument-level filtering.
718    fn warn_missing_arg_filters(profile: &SeccompProfile) {
719        for group in &profile.syscalls {
720            if group.action != "SCMP_ACT_ALLOW" {
721                continue;
722            }
723            for name in &group.names {
724                if Self::ARG_FILTERED_SYSCALLS.contains(&name.as_str()) && group.args.is_empty() {
725                    warn!(
726                        "Custom seccomp profile allows '{}' without argument filters. \
727                         The built-in filter restricts this syscall at the argument level. \
728                         This profile weakens security compared to the default.",
729                        name
730                    );
731                }
732            }
733        }
734    }
735
736    /// Check if seccomp filter has been applied
737    pub fn is_applied(&self) -> bool {
738        self.applied
739    }
740
741    fn apply_bpf_program(bpf_prog: &BpfProgram, log_denied: bool) -> Result<()> {
742        let mut flags: libc::c_ulong = 0;
743        if log_denied {
744            flags |= libc::SECCOMP_FILTER_FLAG_LOG as libc::c_ulong;
745        }
746
747        match Self::apply_bpf_program_with_flags(bpf_prog, flags) {
748            Ok(()) => Ok(()),
749            Err(err)
750                if log_denied
751                    && err.raw_os_error() == Some(libc::EINVAL)
752                    && libc::SECCOMP_FILTER_FLAG_LOG != 0 =>
753            {
754                warn!(
755                    "Kernel rejected SECCOMP_FILTER_FLAG_LOG; continuing with seccomp \
756                     enforcement without deny logging"
757                );
758                Self::apply_bpf_program_with_flags(bpf_prog, 0)?;
759                Ok(())
760            }
761            Err(err) => Err(NucleusError::SeccompError(format!(
762                "Failed to apply seccomp filter: {}",
763                err
764            ))),
765        }
766    }
767
768    fn apply_bpf_program_with_flags(
769        bpf_prog: &BpfProgram,
770        flags: libc::c_ulong,
771    ) -> std::io::Result<()> {
772        // SAFETY: `prctl(PR_SET_NO_NEW_PRIVS, ...)` has no pointer arguments here
773        // and only affects the current thread/process as required before seccomp.
774        let rc = unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
775        if rc != 0 {
776            return Err(std::io::Error::last_os_error());
777        }
778
779        let prog = libc::sock_fprog {
780            len: bpf_prog.len() as u16,
781            filter: bpf_prog.as_ptr() as *mut libc::sock_filter,
782        };
783
784        // SAFETY: `prog` points to a live BPF program buffer for the duration of
785        // the syscall and the kernel copies the pointed-to filter immediately.
786        let rc = unsafe {
787            libc::syscall(
788                libc::SYS_seccomp,
789                libc::SECCOMP_SET_MODE_FILTER,
790                flags,
791                &prog as *const libc::sock_fprog,
792            )
793        };
794
795        if rc < 0 {
796            return Err(std::io::Error::last_os_error());
797        }
798
799        Ok(())
800    }
801}
802
803// SeccompProfile and SeccompSyscallGroup are defined in seccomp_generate.rs
804use crate::security::seccomp_generate::SeccompProfile;
805
806/// Map a syscall name (e.g. "read", "write") to its Linux syscall number.
807///
808/// Covers the most common syscalls. Unknown names return None.
809fn syscall_name_to_number(name: &str) -> Option<i64> {
810    match name {
811        // File I/O
812        "read" => Some(libc::SYS_read),
813        "write" => Some(libc::SYS_write),
814        "open" => Some(libc::SYS_open),
815        "openat" => Some(libc::SYS_openat),
816        "close" => Some(libc::SYS_close),
817        "stat" => Some(libc::SYS_stat),
818        "fstat" => Some(libc::SYS_fstat),
819        "lstat" => Some(libc::SYS_lstat),
820        "lseek" => Some(libc::SYS_lseek),
821        "access" => Some(libc::SYS_access),
822        "fcntl" => Some(libc::SYS_fcntl),
823        "readv" => Some(libc::SYS_readv),
824        "writev" => Some(libc::SYS_writev),
825        "pread64" => Some(libc::SYS_pread64),
826        "pwrite64" => Some(libc::SYS_pwrite64),
827        "readlink" => Some(libc::SYS_readlink),
828        "readlinkat" => Some(libc::SYS_readlinkat),
829        "newfstatat" => Some(libc::SYS_newfstatat),
830        "statx" => Some(libc::SYS_statx),
831        "faccessat" => Some(libc::SYS_faccessat),
832        "faccessat2" => Some(libc::SYS_faccessat2),
833        "dup" => Some(libc::SYS_dup),
834        "dup2" => Some(libc::SYS_dup2),
835        "dup3" => Some(libc::SYS_dup3),
836        "pipe" => Some(libc::SYS_pipe),
837        "pipe2" => Some(libc::SYS_pipe2),
838        "unlink" => Some(libc::SYS_unlink),
839        "unlinkat" => Some(libc::SYS_unlinkat),
840        "rename" => Some(libc::SYS_rename),
841        "renameat" => Some(libc::SYS_renameat),
842        "renameat2" => Some(libc::SYS_renameat2),
843        "link" => Some(libc::SYS_link),
844        "linkat" => Some(libc::SYS_linkat),
845        "symlink" => Some(libc::SYS_symlink),
846        "symlinkat" => Some(libc::SYS_symlinkat),
847        "chmod" => Some(libc::SYS_chmod),
848        "fchmod" => Some(libc::SYS_fchmod),
849        "fchmodat" => Some(libc::SYS_fchmodat),
850        "truncate" => Some(libc::SYS_truncate),
851        "ftruncate" => Some(libc::SYS_ftruncate),
852        "fallocate" => Some(libc::SYS_fallocate),
853        "fadvise64" => Some(libc::SYS_fadvise64),
854        "fsync" => Some(libc::SYS_fsync),
855        "fdatasync" => Some(libc::SYS_fdatasync),
856        "flock" => Some(libc::SYS_flock),
857        "sendfile" => Some(libc::SYS_sendfile),
858        "copy_file_range" => Some(libc::SYS_copy_file_range),
859        "splice" => Some(libc::SYS_splice),
860        "tee" => Some(libc::SYS_tee),
861        // Memory
862        "mmap" => Some(libc::SYS_mmap),
863        "munmap" => Some(libc::SYS_munmap),
864        "mprotect" => Some(libc::SYS_mprotect),
865        "brk" => Some(libc::SYS_brk),
866        "mremap" => Some(libc::SYS_mremap),
867        "madvise" => Some(libc::SYS_madvise),
868        "msync" => Some(libc::SYS_msync),
869        "mlock" => Some(libc::SYS_mlock),
870        "munlock" => Some(libc::SYS_munlock),
871        // Process
872        "fork" => Some(libc::SYS_fork),
873        "clone" => Some(libc::SYS_clone),
874        "clone3" => Some(libc::SYS_clone3),
875        "execve" => Some(libc::SYS_execve),
876        "execveat" => Some(libc::SYS_execveat),
877        "wait4" => Some(libc::SYS_wait4),
878        "waitid" => Some(libc::SYS_waitid),
879        "exit" => Some(libc::SYS_exit),
880        "exit_group" => Some(libc::SYS_exit_group),
881        "getpid" => Some(libc::SYS_getpid),
882        "gettid" => Some(libc::SYS_gettid),
883        "getuid" => Some(libc::SYS_getuid),
884        "getgid" => Some(libc::SYS_getgid),
885        "geteuid" => Some(libc::SYS_geteuid),
886        "getegid" => Some(libc::SYS_getegid),
887        "getppid" => Some(libc::SYS_getppid),
888        "getpgrp" => Some(libc::SYS_getpgrp),
889        "setsid" => Some(libc::SYS_setsid),
890        "getgroups" => Some(libc::SYS_getgroups),
891        // Signals
892        "rt_sigaction" => Some(libc::SYS_rt_sigaction),
893        "rt_sigprocmask" => Some(libc::SYS_rt_sigprocmask),
894        "rt_sigreturn" => Some(libc::SYS_rt_sigreturn),
895        "rt_sigsuspend" => Some(libc::SYS_rt_sigsuspend),
896        "sigaltstack" => Some(libc::SYS_sigaltstack),
897        "kill" => Some(libc::SYS_kill),
898        "tgkill" => Some(libc::SYS_tgkill),
899        // Time
900        "clock_gettime" => Some(libc::SYS_clock_gettime),
901        "clock_getres" => Some(libc::SYS_clock_getres),
902        "clock_nanosleep" => Some(libc::SYS_clock_nanosleep),
903        "gettimeofday" => Some(libc::SYS_gettimeofday),
904        "nanosleep" => Some(libc::SYS_nanosleep),
905        // Directories
906        "getcwd" => Some(libc::SYS_getcwd),
907        "chdir" => Some(libc::SYS_chdir),
908        "fchdir" => Some(libc::SYS_fchdir),
909        "mkdir" => Some(libc::SYS_mkdir),
910        "mkdirat" => Some(libc::SYS_mkdirat),
911        "rmdir" => Some(libc::SYS_rmdir),
912        "getdents" => Some(libc::SYS_getdents),
913        "getdents64" => Some(libc::SYS_getdents64),
914        // Network
915        "socket" => Some(libc::SYS_socket),
916        "connect" => Some(libc::SYS_connect),
917        "sendto" => Some(libc::SYS_sendto),
918        "recvfrom" => Some(libc::SYS_recvfrom),
919        "sendmsg" => Some(libc::SYS_sendmsg),
920        "recvmsg" => Some(libc::SYS_recvmsg),
921        "shutdown" => Some(libc::SYS_shutdown),
922        "bind" => Some(libc::SYS_bind),
923        "listen" => Some(libc::SYS_listen),
924        "accept" => Some(libc::SYS_accept),
925        "accept4" => Some(libc::SYS_accept4),
926        "setsockopt" => Some(libc::SYS_setsockopt),
927        "getsockopt" => Some(libc::SYS_getsockopt),
928        "getsockname" => Some(libc::SYS_getsockname),
929        "getpeername" => Some(libc::SYS_getpeername),
930        "socketpair" => Some(libc::SYS_socketpair),
931        // Poll/Select
932        "poll" => Some(libc::SYS_poll),
933        "ppoll" => Some(libc::SYS_ppoll),
934        "select" => Some(libc::SYS_select),
935        "pselect6" => Some(libc::SYS_pselect6),
936        "epoll_create" => Some(libc::SYS_epoll_create),
937        "epoll_create1" => Some(libc::SYS_epoll_create1),
938        "epoll_ctl" => Some(libc::SYS_epoll_ctl),
939        "epoll_wait" => Some(libc::SYS_epoll_wait),
940        "epoll_pwait" => Some(libc::SYS_epoll_pwait),
941        "eventfd" => Some(libc::SYS_eventfd),
942        "eventfd2" => Some(libc::SYS_eventfd2),
943        "signalfd" => Some(libc::SYS_signalfd),
944        "signalfd4" => Some(libc::SYS_signalfd4),
945        "timerfd_create" => Some(libc::SYS_timerfd_create),
946        "timerfd_settime" => Some(libc::SYS_timerfd_settime),
947        "timerfd_gettime" => Some(libc::SYS_timerfd_gettime),
948        // Misc
949        "uname" => Some(libc::SYS_uname),
950        "getrandom" => Some(libc::SYS_getrandom),
951        "futex" => Some(libc::SYS_futex),
952        "set_tid_address" => Some(libc::SYS_set_tid_address),
953        "set_robust_list" => Some(libc::SYS_set_robust_list),
954        "get_robust_list" => Some(libc::SYS_get_robust_list),
955        "arch_prctl" => Some(libc::SYS_arch_prctl),
956        "sysinfo" => Some(libc::SYS_sysinfo),
957        "umask" => Some(libc::SYS_umask),
958        "getrlimit" => Some(libc::SYS_getrlimit),
959        "prlimit64" => Some(libc::SYS_prlimit64),
960        "getrusage" => Some(libc::SYS_getrusage),
961        "times" => Some(libc::SYS_times),
962        "sched_yield" => Some(libc::SYS_sched_yield),
963        "sched_getaffinity" => Some(libc::SYS_sched_getaffinity),
964        "getcpu" => Some(libc::SYS_getcpu),
965        "rseq" => Some(libc::SYS_rseq),
966        "close_range" => Some(libc::SYS_close_range),
967        "memfd_create" => Some(libc::SYS_memfd_create),
968        "ioctl" => Some(libc::SYS_ioctl),
969        "prctl" => Some(libc::SYS_prctl),
970        // Landlock
971        "landlock_create_ruleset" => Some(libc::SYS_landlock_create_ruleset),
972        "landlock_add_rule" => Some(libc::SYS_landlock_add_rule),
973        "landlock_restrict_self" => Some(libc::SYS_landlock_restrict_self),
974        _ => None,
975    }
976}
977
978impl Default for SeccompManager {
979    fn default() -> Self {
980        Self::new()
981    }
982}
983
#[cfg(test)]
mod tests {
    // Unit tests for the seccomp allowlists and filter construction.
    //
    // None of these tests installs a real filter: doing so would affect the
    // test process itself. They inspect the allowlist vectors and the rule
    // map returned by `minimal_filter` instead.
    use super::*;

    #[test]
    fn test_seccomp_manager_initial_state() {
        let mgr = SeccompManager::new();
        assert!(!mgr.is_applied());
    }

    #[test]
    fn test_apply_idempotent() {
        let mgr = SeccompManager::new();
        // Note: We can't actually test application in unit tests
        // as it would affect the test process itself.
        // That is covered by integration tests instead; here we only
        // re-check that a fresh manager reports "not applied".
        assert!(!mgr.is_applied());
    }

    #[test]
    fn test_clone_denied_flags_include_newcgroup() {
        assert_ne!(
            DENIED_CLONE_NAMESPACE_FLAGS & libc::CLONE_NEWCGROUP as u64,
            0,
            "CLONE_NEWCGROUP must be in denied clone namespace flags"
        );
    }

    #[test]
    fn test_clone_denied_flags_include_newtime() {
        assert_ne!(
            DENIED_CLONE_NAMESPACE_FLAGS & libc::CLONE_NEWTIME as u64,
            0,
            "CLONE_NEWTIME must be in denied clone namespace flags"
        );
    }

    #[test]
    fn test_network_none_socket_domains_are_unix_only() {
        let domains = SeccompManager::allowed_socket_domains(false);
        assert_eq!(domains, vec![libc::AF_UNIX]);
    }

    #[test]
    fn test_network_enabled_socket_domains_exclude_netlink() {
        let domains = SeccompManager::allowed_socket_domains(true);
        assert!(domains.contains(&libc::AF_UNIX));
        assert!(domains.contains(&libc::AF_INET));
        assert!(domains.contains(&libc::AF_INET6));
        assert!(!domains.contains(&libc::AF_NETLINK));
    }

    #[test]
    fn test_network_mode_syscalls_only_enabled_when_network_allowed() {
        let none = SeccompManager::network_mode_syscalls(false);
        assert!(none.is_empty());

        let enabled = SeccompManager::network_mode_syscalls(true);
        assert!(enabled.contains(&libc::SYS_connect));
        assert!(enabled.contains(&libc::SYS_bind));
        assert!(enabled.contains(&libc::SYS_listen));
        assert!(enabled.contains(&libc::SYS_accept));
        assert!(enabled.contains(&libc::SYS_setsockopt));
    }

    #[test]
    fn test_landlock_bootstrap_syscalls_present_in_base_allowlist() {
        let base = SeccompManager::base_allowed_syscalls();
        assert!(base.contains(&libc::SYS_landlock_create_ruleset));
        assert!(base.contains(&libc::SYS_landlock_add_rule));
        assert!(base.contains(&libc::SYS_landlock_restrict_self));
    }

    #[test]
    fn test_x32_legacy_range_not_allowlisted() {
        let base = SeccompManager::base_allowed_syscalls();
        let net = SeccompManager::network_mode_syscalls(true);
        for nr in 512_i64..=547_i64 {
            assert!(
                !base.contains(&nr) && !net.contains(&nr),
                "x32 syscall number {} unexpectedly allowlisted",
                nr
            );
        }
    }

    #[test]
    fn test_i386_compat_socketcall_range_not_allowlisted() {
        let base = SeccompManager::base_allowed_syscalls();
        let net = SeccompManager::network_mode_syscalls(true);
        // i386 compat per syscall_32.tbl: socket..shutdown live at 359..373.
        // On x86_64 these numbers are outside our native allowlist surface.
        for nr in 359_i64..=373_i64 {
            assert!(
                !base.contains(&nr) && !net.contains(&nr),
                "i386 compat syscall number {} unexpectedly allowlisted",
                nr
            );
        }
    }

    #[test]
    fn test_minimal_filter_allowlist_counts_are_stable() {
        let base = SeccompManager::base_allowed_syscalls();
        let net = SeccompManager::network_mode_syscalls(true);

        // Snapshot counts to catch unintended policy drift.
        // +7 accounts for conditional rules inserted in minimal_filter():
        // socket/ioctl/prctl/mprotect/clone/clone3/execveat.
        // fork removed (forces through filtered clone path).
        // execveat removed from base (arg-filtered separately).
        assert_eq!(base.len(), 131);
        assert_eq!(net.len(), 11);
        assert_eq!(base.len() + 7, 138);
        assert_eq!(base.len() + net.len() + 7, 149);
    }

    #[test]
    fn test_arg_filtered_syscalls_list_includes_critical_syscalls() {
        // These syscalls must be in the arg-filtered list so custom profiles
        // get warnings when they allow them without filters.
        for name in &["clone", "clone3", "execveat", "ioctl", "prctl", "socket"] {
            assert!(
                SeccompManager::ARG_FILTERED_SYSCALLS.contains(name),
                "'{}' must be in ARG_FILTERED_SYSCALLS",
                name
            );
        }
    }

    #[test]
    fn test_clone3_allowed_in_minimal_filter() {
        // clone3 MUST be in the BPF rules map - glibc 2.34+ and newer musl
        // use clone3 internally for posix_spawn/fork. Blocking it breaks
        // std::process::Command on modern systems. Namespace creation is
        // prevented by dropped capabilities (CAP_SYS_ADMIN etc.), not seccomp.
        let rules = SeccompManager::minimal_filter(true).unwrap();
        assert!(
            rules.contains_key(&libc::SYS_clone3),
            "clone3 must be in the seccomp allowlist (glibc 2.34+ requires it)"
        );
    }

    #[test]
    fn test_clone_is_allowed_with_arg_filter() {
        // clone (not clone3) should still be in the rules with arg filtering
        let rules = SeccompManager::minimal_filter(true).unwrap();
        assert!(
            rules.contains_key(&libc::SYS_clone),
            "clone must be in the seccomp allowlist with arg filters"
        );
    }

    #[test]
    fn test_high_risk_syscalls_removed_from_base_allowlist() {
        let base = SeccompManager::base_allowed_syscalls();
        let removed = [
            libc::SYS_chown,
            libc::SYS_fchown,
            libc::SYS_lchown,
            libc::SYS_fchownat,
            libc::SYS_sync,
            libc::SYS_syncfs,
            libc::SYS_mlock,
            libc::SYS_munlock,
            libc::SYS_mincore,
            libc::SYS_vfork,
            libc::SYS_tkill,
        ];

        for syscall in removed {
            assert!(
                !base.contains(&syscall),
                "syscall {} unexpectedly present in base allowlist",
                syscall
            );
        }
    }

    #[test]
    fn test_custom_profile_preserves_clone_arg_filters() {
        // SEC-01: Custom seccomp profiles that allow "clone" must still get
        // argument-level filtering to block namespace-creating flags.
        // Verify by inspecting the built-in filter rules that serve as the
        // merge source for apply_profile_from_file.
        let rules = SeccompManager::minimal_filter(true).unwrap();

        // Every ARG_FILTERED_SYSCALLS entry (except clone3, which is allowed
        // unconditionally since BPF can't inspect its struct-based flags) must
        // have non-empty argument-level rules in the built-in filter so that
        // apply_profile_from_file can merge them.
        for name in SeccompManager::ARG_FILTERED_SYSCALLS {
            if *name == "clone3" {
                // clone3 is allowed unconditionally - BPF cannot dereference
                // the clone_args struct, so arg filtering is impossible.
                // Namespace defense relies on dropped capabilities.
                continue;
            }
            // Names without a known syscall number are skipped, not failed.
            if let Some(nr) = syscall_name_to_number(name) {
                assert!(
                    matches!(rules.get(&nr), Some(conds) if !conds.is_empty()),
                    "built-in filter must have argument-level rules for '{}' \
                     so apply_profile_from_file can merge them into custom profiles",
                    name
                );
            }
        }
    }

    #[test]
    fn test_memfd_create_not_in_default_allowlist() {
        // SEC-02: memfd_create enables fileless code execution when combined with execveat.
        let base = SeccompManager::base_allowed_syscalls();
        assert!(
            !base.contains(&libc::SYS_memfd_create),
            "memfd_create must not be in the default seccomp allowlist (fileless exec risk)"
        );
        // Also verify it's not sneaked into the compiled filter rules
        let rules = SeccompManager::minimal_filter(true).unwrap();
        assert!(
            !rules.contains_key(&libc::SYS_memfd_create),
            "memfd_create must not be in the compiled seccomp filter rules"
        );
    }

    #[test]
    fn test_mprotect_has_arg_filtering() {
        // SEC-03: mprotect must have argument-level filtering to prevent W^X
        // (PROT_WRITE|PROT_EXEC) violations. Verify via runtime data structures.

        // mprotect must NOT be in the unconditional base allowlist
        let base = SeccompManager::base_allowed_syscalls();
        assert!(
            !base.contains(&libc::SYS_mprotect),
            "SYS_mprotect must not be unconditionally allowed - needs arg filtering"
        );

        // mprotect must be present in the compiled filter with non-empty
        // argument conditions (the conditions enforce W^X)
        let rules = SeccompManager::minimal_filter(true).unwrap();
        let mprotect_rules = rules
            .get(&libc::SYS_mprotect)
            .expect("mprotect must be present in the seccomp filter rules");
        assert!(
            !mprotect_rules.is_empty(),
            "mprotect must have argument-level conditions to prevent W^X violations"
        );
    }

    #[test]
    fn test_unsafe_blocks_have_safety_comments() {
        // SEC-08: All unsafe blocks must have // SAFETY: documentation
        let source = include_str!("seccomp.rs");

        // Assemble the needle from two pieces so that this test's own source
        // text does not contain the pattern and cannot match itself.
        let needle = concat!("unsafe", " {");

        // Snap a byte offset forward / backward to the nearest char boundary
        // so slicing never panics on multi-byte characters in the file.
        let ceil_boundary = |offset: usize| {
            let mut i = offset;
            while !source.is_char_boundary(i) {
                i += 1;
            }
            i
        };
        let floor_boundary = |offset: usize| {
            let mut i = offset.min(source.len());
            while !source.is_char_boundary(i) {
                i -= 1;
            }
            i
        };

        for (abs_idx, _) in source.match_indices(needle) {
            // Check that there's a SAFETY comment within 200 bytes before the block
            let start = ceil_boundary(abs_idx.saturating_sub(200));
            let context = &source[start..abs_idx];
            assert!(
                context.contains("SAFETY:"),
                "unsafe block at byte {} must have a // SAFETY: comment. Context: ...{}...",
                abs_idx,
                &source[ceil_boundary(abs_idx.saturating_sub(80))..floor_boundary(abs_idx + 10)]
            );
        }
    }

    // --- H-1: mprotect MaskedEq logic verification ---
    //
    // The mprotect filter uses MaskedEq((PROT_WRITE | PROT_EXEC), value) to
    // allow only combinations where the W|X bits match one of {0, W, X}.
    // These tests prove the logic is correct without installing a real
    // seccomp filter (which would affect the test process).

    /// Helper: simulates the MaskedEq check that the seccomp BPF would perform.
    /// Returns true if the prot value would be ALLOWED by one of the rules.
    fn mprotect_would_allow(prot: u64) -> bool {
        let mask = (libc::PROT_WRITE | libc::PROT_EXEC) as u64;
        let allowed_values: &[u64] = &[0, libc::PROT_WRITE as u64, libc::PROT_EXEC as u64];
        let masked = prot & mask;
        allowed_values.contains(&masked)
    }

    #[test]
    fn test_mprotect_allows_prot_none() {
        assert!(
            mprotect_would_allow(libc::PROT_NONE as u64),
            "PROT_NONE must be allowed"
        );
    }

    #[test]
    fn test_mprotect_allows_prot_read_only() {
        assert!(
            mprotect_would_allow(libc::PROT_READ as u64),
            "PROT_READ must be allowed (W|X bits are 0)"
        );
    }

    #[test]
    fn test_mprotect_allows_prot_read_write() {
        assert!(
            mprotect_would_allow((libc::PROT_READ | libc::PROT_WRITE) as u64),
            "PROT_READ|PROT_WRITE must be allowed"
        );
    }

    #[test]
    fn test_mprotect_allows_prot_read_exec() {
        assert!(
            mprotect_would_allow((libc::PROT_READ | libc::PROT_EXEC) as u64),
            "PROT_READ|PROT_EXEC must be allowed"
        );
    }

    #[test]
    fn test_mprotect_rejects_prot_write_exec() {
        assert!(
            !mprotect_would_allow((libc::PROT_WRITE | libc::PROT_EXEC) as u64),
            "PROT_WRITE|PROT_EXEC (W^X violation) must be REJECTED"
        );
    }

    #[test]
    fn test_mprotect_rejects_prot_read_write_exec() {
        assert!(
            !mprotect_would_allow((libc::PROT_READ | libc::PROT_WRITE | libc::PROT_EXEC) as u64),
            "PROT_READ|PROT_WRITE|PROT_EXEC (W^X violation) must be REJECTED"
        );
    }

    #[test]
    fn test_mprotect_allows_prot_write_alone() {
        assert!(
            mprotect_would_allow(libc::PROT_WRITE as u64),
            "PROT_WRITE alone must be allowed"
        );
    }

    #[test]
    fn test_mprotect_allows_prot_exec_alone() {
        assert!(
            mprotect_would_allow(libc::PROT_EXEC as u64),
            "PROT_EXEC alone must be allowed"
        );
    }
}