Skip to main content

nucleus/security/
seccomp.rs

1use crate::error::{NucleusError, Result};
2use crate::security::policy::sha256_hex;
3use seccompiler::{BpfProgram, SeccompAction, SeccompCondition, SeccompFilter, SeccompRule};
4use std::collections::BTreeMap;
5use std::path::Path;
6use tracing::{debug, info, warn};
7
/// Manages installation of the process's seccomp syscall filter.
///
/// Provides the syscall allowlist used by the security state machine
/// (NucleusSecurity_Seccomp_SeccompEnforcement.tla).
pub struct SeccompManager {
    /// Set to `true` once a filter has been installed through this manager;
    /// later apply calls become no-ops.
    applied: bool,
}
15
16const DENIED_CLONE_NAMESPACE_FLAGS: u64 = (libc::CLONE_NEWUSER
17    | libc::CLONE_NEWNS
18    | libc::CLONE_NEWNET
19    | libc::CLONE_NEWIPC
20    | libc::CLONE_NEWUTS
21    | libc::CLONE_NEWPID
22    | libc::CLONE_NEWCGROUP
23    | libc::CLONE_NEWTIME) as u64;
24
25impl SeccompManager {
26    pub fn new() -> Self {
27        Self { applied: false }
28    }
29
30    fn base_allowed_syscalls() -> Vec<i64> {
31        let mut syscalls = vec![
32            // File I/O
33            libc::SYS_read,
34            libc::SYS_write,
35            libc::SYS_openat,
36            libc::SYS_close,
37            libc::SYS_fstat,
38            libc::SYS_lseek,
39            libc::SYS_fcntl,
40            libc::SYS_readv,
41            libc::SYS_writev,
42            libc::SYS_pread64,
43            libc::SYS_pwrite64,
44            libc::SYS_readlinkat,
45            libc::SYS_newfstatat,
46            libc::SYS_statx,
47            libc::SYS_faccessat,
48            libc::SYS_faccessat2,
49            libc::SYS_dup,
50            libc::SYS_dup3,
51            libc::SYS_pipe2,
52            libc::SYS_unlinkat,
53            libc::SYS_renameat,
54            libc::SYS_renameat2,
55            libc::SYS_linkat,
56            libc::SYS_symlinkat,
57            libc::SYS_fchmod,
58            libc::SYS_fchmodat,
59            libc::SYS_truncate,
60            libc::SYS_ftruncate,
61            libc::SYS_fallocate,
62            #[cfg(target_arch = "x86_64")]
63            libc::SYS_fadvise64,
64            libc::SYS_fsync,
65            libc::SYS_fdatasync,
66            libc::SYS_flock,
67            #[cfg(target_arch = "x86_64")]
68            libc::SYS_sendfile,
69            libc::SYS_copy_file_range,
70            libc::SYS_splice,
71            libc::SYS_tee,
72            // Memory management
73            libc::SYS_mmap,
74            libc::SYS_munmap,
75            libc::SYS_brk,
76            libc::SYS_mremap,
77            libc::SYS_madvise,
78            libc::SYS_msync,
79            // Process management
80            // fork intentionally excluded — modern glibc/musl use clone(), which
81            // has namespace-flag filtering. Removing SYS_fork forces all forks
82            // through the filtered clone path (defense-in-depth against fork bombs
83            // and unfiltered namespace creation).
84            libc::SYS_execve,
85            // execveat is conditionally allowed below (AT_EMPTY_PATH blocked)
86            libc::SYS_wait4,
87            libc::SYS_waitid,
88            libc::SYS_exit,
89            libc::SYS_exit_group,
90            libc::SYS_getpid,
91            libc::SYS_gettid,
92            libc::SYS_getuid,
93            libc::SYS_getgid,
94            libc::SYS_geteuid,
95            libc::SYS_getegid,
96            libc::SYS_getppid,
97            libc::SYS_setsid,
98            libc::SYS_getgroups,
99            // Signals
100            libc::SYS_rt_sigaction,
101            libc::SYS_rt_sigprocmask,
102            libc::SYS_rt_sigreturn,
103            libc::SYS_rt_sigsuspend,
104            libc::SYS_sigaltstack,
105            // L7: kill/tgkill are safe when PID namespace is active (container
106            // can only signal its own processes). If PID namespace creation fails,
107            // the runtime aborts, so this is safe.
108            libc::SYS_kill,
109            libc::SYS_tgkill,
110            // Time
111            libc::SYS_clock_gettime,
112            libc::SYS_clock_getres,
113            libc::SYS_clock_nanosleep,
114            libc::SYS_gettimeofday,
115            libc::SYS_nanosleep,
116            // Directories
117            libc::SYS_getcwd,
118            libc::SYS_chdir,
119            libc::SYS_fchdir,
120            libc::SYS_mkdirat,
121            libc::SYS_getdents64,
122            // Misc
123            libc::SYS_uname,
124            libc::SYS_getrandom,
125            libc::SYS_futex,
126            libc::SYS_set_tid_address,
127            libc::SYS_set_robust_list,
128            libc::SYS_get_robust_list,
129            // L8: sysinfo removed — leaks host RAM, uptime, and process count.
130            // Applications needing this info should use /proc/meminfo instead.
131            libc::SYS_umask,
132            // prlimit64 moved to arg-filtered section (M3)
133            libc::SYS_getrusage,
134            libc::SYS_times,
135            libc::SYS_sched_yield,
136            libc::SYS_sched_getaffinity,
137            libc::SYS_getcpu,
138            libc::SYS_rseq,
139            libc::SYS_close_range,
140            // NOTE: memfd_create intentionally excluded — combined with execveat
141            // it enables fileless code execution bypassing all FS controls (SEC-02).
142            // Landlock bootstrap (runtime applies seccomp before Landlock)
143            libc::SYS_landlock_create_ruleset,
144            libc::SYS_landlock_add_rule,
145            libc::SYS_landlock_restrict_self,
146            // Socket/Network (safe introspection + local socketpair)
147            libc::SYS_getsockname,
148            libc::SYS_getpeername,
149            libc::SYS_socketpair,
150            libc::SYS_getsockopt,
151            // Poll/Select
152            libc::SYS_ppoll,
153            libc::SYS_pselect6,
154            libc::SYS_epoll_create1,
155            libc::SYS_epoll_ctl,
156            libc::SYS_epoll_pwait,
157            libc::SYS_eventfd2,
158            libc::SYS_signalfd4,
159            libc::SYS_timerfd_create,
160            libc::SYS_timerfd_settime,
161            libc::SYS_timerfd_gettime,
162        ];
163
164        // Legacy syscalls only available on x86_64 (aarch64 only has the *at variants)
165        #[cfg(target_arch = "x86_64")]
166        syscalls.extend_from_slice(&[
167            libc::SYS_open,
168            libc::SYS_stat,
169            libc::SYS_lstat,
170            libc::SYS_access,
171            libc::SYS_readlink,
172            libc::SYS_dup2,
173            libc::SYS_pipe,
174            libc::SYS_unlink,
175            libc::SYS_rename,
176            libc::SYS_link,
177            libc::SYS_symlink,
178            libc::SYS_chmod,
179            libc::SYS_mkdir,
180            libc::SYS_rmdir,
181            libc::SYS_getdents,
182            libc::SYS_getpgrp,
183            libc::SYS_arch_prctl,
184            libc::SYS_getrlimit,
185            libc::SYS_poll,
186            libc::SYS_select,
187            libc::SYS_epoll_create,
188            libc::SYS_epoll_wait,
189            libc::SYS_eventfd,
190            libc::SYS_signalfd,
191        ]);
192
193        syscalls
194    }
195
196    fn allowed_socket_domains(allow_network: bool) -> Vec<i32> {
197        if allow_network {
198            vec![libc::AF_UNIX, libc::AF_INET, libc::AF_INET6]
199        } else {
200            vec![libc::AF_UNIX]
201        }
202    }
203
204    fn network_mode_syscalls(allow_network: bool) -> Vec<i64> {
205        if allow_network {
206            vec![
207                libc::SYS_connect,
208                libc::SYS_sendto,
209                libc::SYS_recvfrom,
210                libc::SYS_sendmsg,
211                libc::SYS_recvmsg,
212                libc::SYS_shutdown,
213                libc::SYS_bind,
214                libc::SYS_listen,
215                libc::SYS_accept,
216                libc::SYS_accept4,
217                libc::SYS_setsockopt,
218            ]
219        } else {
220            Vec::new()
221        }
222    }
223
224    /// Get minimal syscall whitelist for basic container operation
225    ///
226    /// This is a restrictive whitelist that blocks dangerous syscalls:
227    /// - ptrace (process tracing)
228    /// - kexec_load (kernel loading)
229    /// - add_key, request_key, keyctl (kernel keyring)
230    /// - bpf (eBPF programs)
231    /// - perf_event_open (performance monitoring)
232    /// - userfaultfd (user fault handling)
233    fn minimal_filter(allow_network: bool) -> Result<BTreeMap<i64, Vec<SeccompRule>>> {
234        let mut rules: BTreeMap<i64, Vec<SeccompRule>> = BTreeMap::new();
235
236        // Essential syscalls for basic operation
237        let allowed_syscalls = Self::base_allowed_syscalls();
238
239        // Allow all these syscalls unconditionally
240        for syscall in allowed_syscalls {
241            rules.insert(syscall, Vec::new());
242        }
243
244        // Add network-mode-specific syscalls
245        for syscall in Self::network_mode_syscalls(allow_network) {
246            rules.insert(syscall, Vec::new());
247        }
248
249        // Restrict socket() domains by network mode.
250        // none: AF_UNIX only; network-enabled: AF_UNIX/AF_INET/AF_INET6.
251        let mut socket_rules = Vec::new();
252        for domain in Self::allowed_socket_domains(allow_network) {
253            let condition = SeccompCondition::new(
254                0, // arg0 is socket(domain, type, protocol)
255                seccompiler::SeccompCmpArgLen::Dword,
256                seccompiler::SeccompCmpOp::Eq,
257                domain as u64,
258            )
259            .map_err(|e| {
260                NucleusError::SeccompError(format!(
261                    "Failed to create socket domain condition: {}",
262                    e
263                ))
264            })?;
265            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
266                NucleusError::SeccompError(format!("Failed to create socket rule: {}", e))
267            })?;
268            socket_rules.push(rule);
269        }
270        rules.insert(libc::SYS_socket, socket_rules);
271
272        // ioctl: allow only safe terminal operations (arg0 = request code)
273        let ioctl_allowed: &[u64] = &[
274            0x5401, // TCGETS
275            0x5402, // TCSETS
276            0x5403, // TCSETSW
277            0x5404, // TCSETSF
278            0x540B, // TCFLSH
279            0x540F, // TIOCGPGRP
280            0x5410, // TIOCSPGRP
281            0x5413, // TIOCGWINSZ
282            0x5429, // TIOCGSID
283            0x541B, // FIONREAD
284            0x5421, // M12: FIONBIO — allowed because fcntl(F_SETFL, O_NONBLOCK)
285            // achieves the same result and is already permitted. Blocking
286            // FIONBIO only breaks tokio/mio for no security gain.
287            0x5451, // FIOCLEX
288            0x5450, // FIONCLEX
289        ];
290        let mut ioctl_rules = Vec::new();
291        for &request in ioctl_allowed {
292            let condition = SeccompCondition::new(
293                1, // arg1 is the request code for ioctl(fd, request, ...)
294                seccompiler::SeccompCmpArgLen::Dword,
295                seccompiler::SeccompCmpOp::Eq,
296                request,
297            )
298            .map_err(|e| {
299                NucleusError::SeccompError(format!("Failed to create ioctl condition: {}", e))
300            })?;
301            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
302                NucleusError::SeccompError(format!("Failed to create ioctl rule: {}", e))
303            })?;
304            ioctl_rules.push(rule);
305        }
306        rules.insert(libc::SYS_ioctl, ioctl_rules);
307
308        // prctl: allow only safe operations (arg0 = option).
309        // Notably absent (hit default deny):
310        //   PR_CAPBSET_READ (23) — leaks capability bounding set info
311        //   PR_CAPBSET_DROP (24) — could weaken the capability bounding set
312        //   PR_SET_SECUREBITS (28) — could disable secure-exec restrictions
313        let prctl_allowed: &[u64] = &[
314            1,  // PR_SET_PDEATHSIG
315            2,  // PR_GET_PDEATHSIG
316            15, // PR_SET_NAME
317            16, // PR_GET_NAME
318            38, // PR_SET_NO_NEW_PRIVS
319            39, // PR_GET_NO_NEW_PRIVS
320        ];
321        let mut prctl_rules = Vec::new();
322        for &option in prctl_allowed {
323            let condition = SeccompCondition::new(
324                0, // arg0 is the option for prctl(option, ...)
325                seccompiler::SeccompCmpArgLen::Dword,
326                seccompiler::SeccompCmpOp::Eq,
327                option,
328            )
329            .map_err(|e| {
330                NucleusError::SeccompError(format!("Failed to create prctl condition: {}", e))
331            })?;
332            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
333                NucleusError::SeccompError(format!("Failed to create prctl rule: {}", e))
334            })?;
335            prctl_rules.push(rule);
336        }
337        rules.insert(libc::SYS_prctl, prctl_rules);
338
339        // M3: prlimit64 — only allow GET (new_limit == NULL, i.e. arg2 == 0).
340        // SET operations could raise RLIMIT_NPROC to bypass fork-bomb protection.
341        let prlimit_condition = SeccompCondition::new(
342            2, // arg2 = new_limit pointer for prlimit64(pid, resource, new_limit, old_limit)
343            seccompiler::SeccompCmpArgLen::Qword,
344            seccompiler::SeccompCmpOp::Eq,
345            0u64, // new_limit == NULL means GET-only
346        )
347        .map_err(|e| {
348            NucleusError::SeccompError(format!("Failed to create prlimit64 condition: {}", e))
349        })?;
350        let prlimit_rule = SeccompRule::new(vec![prlimit_condition]).map_err(|e| {
351            NucleusError::SeccompError(format!("Failed to create prlimit64 rule: {}", e))
352        })?;
353        rules.insert(libc::SYS_prlimit64, vec![prlimit_rule]);
354
355        // mprotect: permit RW or RX transitions, but reject PROT_WRITE|PROT_EXEC.
356        let mut mprotect_rules = Vec::new();
357        for allowed in [0, libc::PROT_WRITE as u64, libc::PROT_EXEC as u64] {
358            let condition = SeccompCondition::new(
359                2, // arg2 is prot for mprotect(addr, len, prot)
360                seccompiler::SeccompCmpArgLen::Dword,
361                seccompiler::SeccompCmpOp::MaskedEq((libc::PROT_WRITE | libc::PROT_EXEC) as u64),
362                allowed,
363            )
364            .map_err(|e| {
365                NucleusError::SeccompError(format!("Failed to create mprotect condition: {}", e))
366            })?;
367            let rule = SeccompRule::new(vec![condition]).map_err(|e| {
368                NucleusError::SeccompError(format!("Failed to create mprotect rule: {}", e))
369            })?;
370            mprotect_rules.push(rule);
371        }
372        rules.insert(libc::SYS_mprotect, mprotect_rules);
373
374        // clone3: ALLOWED unconditionally. clone3 passes flags inside a struct
375        // pointer that seccomp BPF cannot dereference, so namespace-flag filtering
376        // is impossible at the BPF level. However, glibc 2.34+ and newer musl use
377        // clone3 internally for posix_spawn/fork — blocking it breaks
378        // std::process::Command and any child-process spawning on modern systems.
379        //
380        // SECURITY INVARIANT: Namespace creation via clone3 is prevented solely by
381        // dropping CAP_SYS_ADMIN (and other namespace caps) *before* this seccomp
382        // filter is installed. If capability dropping is bypassed, clone3 becomes
383        // an unfiltered path to namespace creation. This is a known single point
384        // of failure — see CapabilityManager::drop_all() which must run first.
385        //
386        // Verify the invariant: CAP_SYS_ADMIN must not be in the effective set.
387        // CAP_SYS_ADMIN = capability bit 21
388        if Self::has_effective_cap(21) {
389            return Err(NucleusError::SeccompError(
390                "SECURITY: CAP_SYS_ADMIN is still in the effective capability set. \
391                 Capabilities must be dropped before installing seccomp filters \
392                 (clone3 is allowed unconditionally)."
393                    .to_string(),
394            ));
395        }
396        rules.insert(libc::SYS_clone3, Vec::new());
397
398        // clone: allow but deny namespace-creating flags to prevent nested namespace creation
399        let clone_condition = SeccompCondition::new(
400            0, // arg0 = flags
401            seccompiler::SeccompCmpArgLen::Qword,
402            seccompiler::SeccompCmpOp::MaskedEq(DENIED_CLONE_NAMESPACE_FLAGS),
403            0, // (flags & ns_flags) == 0: none of the namespace flags set
404        )
405        .map_err(|e| {
406            NucleusError::SeccompError(format!("Failed to create clone condition: {}", e))
407        })?;
408        let clone_rule = SeccompRule::new(vec![clone_condition]).map_err(|e| {
409            NucleusError::SeccompError(format!("Failed to create clone rule: {}", e))
410        })?;
411        rules.insert(libc::SYS_clone, vec![clone_rule]);
412
413        // execveat: allow but block AT_EMPTY_PATH (0x1000) to prevent fileless
414        // execution. With AT_EMPTY_PATH, execveat can execute code from any open
415        // fd (e.g., open + unlink, or even a socket fd), bypassing filesystem
416        // controls — not just memfd_create. Blocking memfd_create alone is
417        // insufficient. Normal execveat with dirfd+pathname (no AT_EMPTY_PATH)
418        // remains allowed.
419        let execveat_condition = SeccompCondition::new(
420            4, // arg4 = flags for execveat(dirfd, pathname, argv, envp, flags)
421            seccompiler::SeccompCmpArgLen::Dword,
422            seccompiler::SeccompCmpOp::MaskedEq(libc::AT_EMPTY_PATH as u64),
423            0, // (flags & AT_EMPTY_PATH) == 0: AT_EMPTY_PATH not set
424        )
425        .map_err(|e| {
426            NucleusError::SeccompError(format!("Failed to create execveat condition: {}", e))
427        })?;
428        let execveat_rule = SeccompRule::new(vec![execveat_condition]).map_err(|e| {
429            NucleusError::SeccompError(format!("Failed to create execveat rule: {}", e))
430        })?;
431        rules.insert(libc::SYS_execveat, vec![execveat_rule]);
432
433        Ok(rules)
434    }
435
436    /// Compile the minimal BPF filter without applying it
437    ///
438    /// This is useful for benchmarking filter compilation overhead
439    /// without the irreversible side effect of applying the filter.
440    pub fn compile_minimal_filter() -> Result<BpfProgram> {
441        let rules = Self::minimal_filter(true)?;
442        let filter = SeccompFilter::new(
443            rules,
444            SeccompAction::KillProcess,
445            SeccompAction::Allow,
446            std::env::consts::ARCH.try_into().map_err(|e| {
447                NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
448            })?,
449        )
450        .map_err(|e| {
451            NucleusError::SeccompError(format!("Failed to create seccomp filter: {}", e))
452        })?;
453
454        let bpf_prog: BpfProgram = filter.try_into().map_err(|e| {
455            NucleusError::SeccompError(format!("Failed to compile BPF program: {}", e))
456        })?;
457
458        Ok(bpf_prog)
459    }
460
461    /// Apply seccomp filter
462    ///
463    /// This implements the transition: no_filter -> whitelist_active
464    /// in the seccomp state machine (NucleusSecurity_Seccomp_SeccompEnforcement.tla)
465    ///
466    /// Once applied, the filter cannot be removed (irreversible property)
467    /// In rootless mode or if seccomp setup fails, this will warn and continue
468    pub fn apply_minimal_filter(&mut self) -> Result<bool> {
469        self.apply_minimal_filter_with_mode(false, false)
470    }
471
472    /// Apply seccomp filter with configurable failure behavior
473    ///
474    /// When `best_effort` is true, failures are logged and execution continues.
475    /// When false, seccomp setup is fail-closed.
476    pub fn apply_minimal_filter_with_mode(
477        &mut self,
478        best_effort: bool,
479        log_denied: bool,
480    ) -> Result<bool> {
481        self.apply_filter_for_network_mode(true, best_effort, log_denied)
482    }
483
484    /// Apply seccomp filter with network-mode-aware socket restrictions
485    ///
486    /// When `allow_network` is false, `SYS_socket` is restricted to AF_UNIX only,
487    /// preventing creation of network sockets (AF_INET, AF_INET6, etc.).
488    /// When `allow_network` is true, all socket domains are permitted.
489    ///
490    /// When `best_effort` is true, failures are logged and execution continues.
491    /// When false, seccomp setup is fail-closed.
492    pub fn apply_filter_for_network_mode(
493        &mut self,
494        allow_network: bool,
495        best_effort: bool,
496        log_denied: bool,
497    ) -> Result<bool> {
498        if self.applied {
499            debug!("Seccomp filter already applied, skipping");
500            return Ok(true);
501        }
502
503        info!(allow_network, "Applying seccomp filter");
504
505        let rules = match Self::minimal_filter(allow_network) {
506            Ok(r) => r,
507            Err(e) => {
508                if best_effort {
509                    warn!(
510                        "Failed to create seccomp rules: {} (continuing without seccomp)",
511                        e
512                    );
513                    return Ok(false);
514                }
515                return Err(e);
516            }
517        };
518
519        let filter = match SeccompFilter::new(
520            rules,
521            SeccompAction::KillProcess, // Default: kill on blocked syscall
522            SeccompAction::Allow,       // Match action: allow
523            std::env::consts::ARCH.try_into().map_err(|e| {
524                NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
525            })?,
526        ) {
527            Ok(f) => f,
528            Err(e) => {
529                if best_effort {
530                    warn!(
531                        "Failed to create seccomp filter: {} (continuing without seccomp)",
532                        e
533                    );
534                    return Ok(false);
535                }
536                return Err(NucleusError::SeccompError(format!(
537                    "Failed to create seccomp filter: {}",
538                    e
539                )));
540            }
541        };
542
543        let bpf_prog: BpfProgram = match filter.try_into() {
544            Ok(p) => p,
545            Err(e) => {
546                if best_effort {
547                    warn!(
548                        "Failed to compile BPF program: {} (continuing without seccomp)",
549                        e
550                    );
551                    return Ok(false);
552                }
553                return Err(NucleusError::SeccompError(format!(
554                    "Failed to compile BPF program: {}",
555                    e
556                )));
557            }
558        };
559
560        // Apply the filter
561        match Self::apply_bpf_program(&bpf_prog, log_denied) {
562            Ok(_) => {
563                self.applied = true;
564                info!("Successfully applied seccomp filter");
565                Ok(true)
566            }
567            Err(e) => {
568                if best_effort {
569                    warn!(
570                        "Failed to apply seccomp filter: {} (continuing without seccomp)",
571                        e
572                    );
573                    Ok(false)
574                } else {
575                    Err(NucleusError::SeccompError(format!(
576                        "Failed to apply seccomp filter: {}",
577                        e
578                    )))
579                }
580            }
581        }
582    }
583
584    /// Apply a seccomp profile loaded from a JSON file.
585    ///
586    /// The profile format is a JSON object with:
587    /// ```json
588    /// {
589    ///   "defaultAction": "SCMP_ACT_ERRNO",
590    ///   "syscalls": [
591    ///     { "names": ["read", "write", "open", ...], "action": "SCMP_ACT_ALLOW" }
592    ///   ]
593    /// }
594    /// ```
595    ///
596    /// This is a subset of the OCI seccomp profile format. Only the syscall name
597    /// allowlist is used; argument-level filtering from the built-in profile is
598    /// not applied when using a custom profile.
599    ///
600    /// If `expected_sha256` is provided, the file's SHA-256 hash is verified
601    /// against it before loading. This prevents silent profile tampering.
602    pub fn apply_profile_from_file(
603        &mut self,
604        profile_path: &Path,
605        expected_sha256: Option<&str>,
606        audit_mode: bool,
607    ) -> Result<bool> {
608        if self.applied {
609            debug!("Seccomp filter already applied, skipping");
610            return Ok(true);
611        }
612
613        info!("Loading seccomp profile from {:?}", profile_path);
614
615        // Read profile file
616        let content = std::fs::read(profile_path).map_err(|e| {
617            NucleusError::SeccompError(format!(
618                "Failed to read seccomp profile {:?}: {}",
619                profile_path, e
620            ))
621        })?;
622
623        // Verify SHA-256 hash if expected
624        if let Some(expected) = expected_sha256 {
625            let actual = sha256_hex(&content);
626            if actual != expected {
627                return Err(NucleusError::SeccompError(format!(
628                    "Seccomp profile hash mismatch: expected {}, got {}",
629                    expected, actual
630                )));
631            }
632            info!("Seccomp profile hash verified: {}", actual);
633        }
634
635        // Parse profile
636        let profile: SeccompProfile = serde_json::from_slice(&content).map_err(|e| {
637            NucleusError::SeccompError(format!("Failed to parse seccomp profile: {}", e))
638        })?;
639
640        // Warn when custom profile allows security-critical syscalls without
641        // argument-level filtering. The built-in filter restricts clone, ioctl,
642        // prctl, and socket at the argument level; a custom profile that allows
643        // them by name only silently removes all of that hardening.
644        Self::warn_missing_arg_filters(&profile);
645
646        // Build filter from profile
647        let mut rules: BTreeMap<i64, Vec<SeccompRule>> = BTreeMap::new();
648
649        for syscall_group in &profile.syscalls {
650            if syscall_group.action == "SCMP_ACT_ALLOW" {
651                for name in &syscall_group.names {
652                    if let Some(nr) = syscall_name_to_number(name) {
653                        rules.insert(nr, Vec::new());
654                    } else {
655                        warn!("Unknown syscall in profile: {} (skipping)", name);
656                    }
657                }
658            }
659        }
660
661        // SEC-01: Merge built-in argument filters for security-critical syscalls.
662        // Custom profiles that allow clone/ioctl/prctl/socket/mprotect by name
663        // without argument-level filters would silently remove all hardening.
664        // Overwrite their empty rules with the built-in argument-filtered rules.
665        let builtin_rules = Self::minimal_filter(true)?;
666        for syscall_name in Self::ARG_FILTERED_SYSCALLS {
667            if let Some(nr) = syscall_name_to_number(syscall_name) {
668                if let std::collections::btree_map::Entry::Occupied(mut entry) = rules.entry(nr) {
669                    if let Some(builtin) = builtin_rules.get(&nr) {
670                        if !builtin.is_empty() {
671                            info!(
672                                "Merging built-in argument filters for '{}' into custom profile",
673                                syscall_name
674                            );
675                            entry.insert(builtin.clone());
676                        }
677                    }
678                }
679            }
680        }
681        // H2: clone3 is allowed in the built-in filter (needed for glibc 2.34+).
682        // Apply the same policy to custom profiles for consistency. The security
683        // invariant against namespace creation via clone3 is enforced by dropping
684        // CAP_SYS_ADMIN *before* seccomp is installed (see verify_no_namespace_caps).
685        // If the custom profile doesn't include clone3, add it.
686        if !rules.contains_key(&libc::SYS_clone3) {
687            rules.insert(libc::SYS_clone3, Vec::new());
688        }
689
690        let filter = SeccompFilter::new(
691            rules,
692            SeccompAction::KillProcess,
693            SeccompAction::Allow,
694            std::env::consts::ARCH.try_into().map_err(|e| {
695                NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
696            })?,
697        )
698        .map_err(|e| {
699            NucleusError::SeccompError(format!(
700                "Failed to create seccomp filter from profile: {}",
701                e
702            ))
703        })?;
704
705        let bpf_prog: BpfProgram = filter.try_into().map_err(|e| {
706            NucleusError::SeccompError(format!("Failed to compile BPF program from profile: {}", e))
707        })?;
708
709        match Self::apply_bpf_program(&bpf_prog, audit_mode) {
710            Ok(_) => {
711                self.applied = true;
712                info!(
713                    "Seccomp profile applied from {:?} (log_denied={})",
714                    profile_path, audit_mode
715                );
716                Ok(true)
717            }
718            Err(e) => Err(e),
719        }
720    }
721
722    /// Install an allow-all seccomp filter with SECCOMP_FILTER_FLAG_LOG.
723    ///
724    /// Used in trace mode: all syscalls are allowed but logged to the kernel
725    /// audit subsystem. A separate reader collects the logged syscalls.
726    pub fn apply_trace_filter(&mut self) -> Result<bool> {
727        if self.applied {
728            debug!("Seccomp filter already applied, skipping trace filter");
729            return Ok(true);
730        }
731
732        info!("Applying seccomp trace filter (allow-all + LOG)");
733
734        // Create an empty rule set — with SeccompAction::Allow as default,
735        // every syscall is permitted. The LOG flag causes the kernel to
736        // audit each syscall decision.
737        let filter = SeccompFilter::new(
738            BTreeMap::new(),
739            SeccompAction::Allow, // default: allow everything
740            SeccompAction::Allow, // match action (unused — no rules)
741            std::env::consts::ARCH.try_into().map_err(|e| {
742                NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
743            })?,
744        )
745        .map_err(|e| NucleusError::SeccompError(format!("Failed to create trace filter: {}", e)))?;
746
747        let bpf_prog: BpfProgram = filter.try_into().map_err(|e| {
748            NucleusError::SeccompError(format!("Failed to compile trace BPF: {}", e))
749        })?;
750
751        // Apply with LOG flag so kernel audits every syscall
752        Self::apply_bpf_program(&bpf_prog, true)?;
753        self.applied = true;
754        info!("Seccomp trace filter applied (all syscalls allowed + logged)");
755        Ok(true)
756    }
757
    /// Syscalls that the built-in filter restricts at the argument level.
    /// Custom profiles allowing these without argument filters weaken security.
    ///
    /// `warn_missing_arg_filters` matches profile syscall names against this
    /// list (exact string comparison) to decide when to emit a warning.
    // NOTE(review): the allowlist snapshot test in this file also names
    // `prlimit64` as arg-filtered; confirm whether it should be listed here.
    const ARG_FILTERED_SYSCALLS: &'static [&'static str] = &[
        "clone", "clone3", "execveat", "ioctl", "mprotect", "prctl", "socket",
    ];
763
764    /// Warn when a custom seccomp profile allows security-critical syscalls
765    /// without argument-level filtering.
766    fn warn_missing_arg_filters(profile: &SeccompProfile) {
767        for group in &profile.syscalls {
768            if group.action != "SCMP_ACT_ALLOW" {
769                continue;
770            }
771            for name in &group.names {
772                if Self::ARG_FILTERED_SYSCALLS.contains(&name.as_str()) && group.args.is_empty() {
773                    warn!(
774                        "Custom seccomp profile allows '{}' without argument filters. \
775                         The built-in filter restricts this syscall at the argument level. \
776                         This profile weakens security compared to the default.",
777                        name
778                    );
779                }
780            }
781        }
782    }
783
784    /// Check whether a capability is in the current thread's effective set
785    /// by reading /proc/self/status (CapEff line).
786    fn has_effective_cap(cap: i32) -> bool {
787        let Ok(status) = std::fs::read_to_string("/proc/self/status") else {
788            // If we can't read, assume worst case for safety.
789            return true;
790        };
791        for line in status.lines() {
792            if let Some(hex) = line.strip_prefix("CapEff:\t") {
793                if let Ok(eff) = u64::from_str_radix(hex.trim(), 16) {
794                    return eff & (1u64 << cap) != 0;
795                }
796            }
797        }
798        true // assume worst case
799    }
800
    /// Check if seccomp filter has been applied
    ///
    /// Returns the manager's `applied` flag, which the apply methods set to
    /// `true` after a filter has been installed successfully. It reflects only
    /// what this manager instance has recorded.
    pub fn is_applied(&self) -> bool {
        self.applied
    }
805
806    fn apply_bpf_program(bpf_prog: &BpfProgram, log_denied: bool) -> Result<()> {
807        let mut flags: libc::c_ulong = 0;
808        if log_denied {
809            flags |= libc::SECCOMP_FILTER_FLAG_LOG as libc::c_ulong;
810        }
811
812        match Self::apply_bpf_program_with_flags(bpf_prog, flags) {
813            Ok(()) => Ok(()),
814            Err(err)
815                if log_denied
816                    && err.raw_os_error() == Some(libc::EINVAL)
817                    && libc::SECCOMP_FILTER_FLAG_LOG != 0 =>
818            {
819                warn!(
820                    "Kernel rejected SECCOMP_FILTER_FLAG_LOG; continuing with seccomp \
821                     enforcement without deny logging"
822                );
823                Self::apply_bpf_program_with_flags(bpf_prog, 0)?;
824                Ok(())
825            }
826            Err(err) => Err(NucleusError::SeccompError(format!(
827                "Failed to apply seccomp filter: {}",
828                err
829            ))),
830        }
831    }
832
833    fn apply_bpf_program_with_flags(
834        bpf_prog: &BpfProgram,
835        flags: libc::c_ulong,
836    ) -> std::io::Result<()> {
837        // SAFETY: `prctl(PR_SET_NO_NEW_PRIVS, ...)` has no pointer arguments here
838        // and only affects the current thread/process as required before seccomp.
839        let rc = unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
840        if rc != 0 {
841            return Err(std::io::Error::last_os_error());
842        }
843
844        let prog = libc::sock_fprog {
845            len: bpf_prog.len() as u16,
846            filter: bpf_prog.as_ptr() as *mut libc::sock_filter,
847        };
848
849        // SAFETY: `prog` points to a live BPF program buffer for the duration of
850        // the syscall and the kernel copies the pointed-to filter immediately.
851        let rc = unsafe {
852            libc::syscall(
853                libc::SYS_seccomp,
854                libc::SECCOMP_SET_MODE_FILTER,
855                flags,
856                &prog as *const libc::sock_fprog,
857            )
858        };
859
860        if rc < 0 {
861            return Err(std::io::Error::last_os_error());
862        }
863
864        Ok(())
865    }
866}
867
868// SeccompProfile and SeccompSyscallGroup are defined in seccomp_generate.rs
869use crate::security::seccomp_generate::SeccompProfile;
870
/// Map a syscall name (e.g. "read", "write") to its Linux syscall number.
///
/// The lookup is an exact, case-sensitive string match against the names
/// listed below; each hit returns the matching `libc::SYS_*` constant for
/// the architecture being compiled. Arms marked
/// `#[cfg(target_arch = "x86_64")]` exist only in x86_64 builds, so those
/// names resolve to `None` on every other target.
///
/// Covers the most common syscalls. Unknown names return None.
fn syscall_name_to_number(name: &str) -> Option<i64> {
    match name {
        // File I/O
        "read" => Some(libc::SYS_read),
        "write" => Some(libc::SYS_write),
        #[cfg(target_arch = "x86_64")]
        "open" => Some(libc::SYS_open),
        "openat" => Some(libc::SYS_openat),
        "close" => Some(libc::SYS_close),
        #[cfg(target_arch = "x86_64")]
        "stat" => Some(libc::SYS_stat),
        "fstat" => Some(libc::SYS_fstat),
        #[cfg(target_arch = "x86_64")]
        "lstat" => Some(libc::SYS_lstat),
        "lseek" => Some(libc::SYS_lseek),
        #[cfg(target_arch = "x86_64")]
        "access" => Some(libc::SYS_access),
        "fcntl" => Some(libc::SYS_fcntl),
        "readv" => Some(libc::SYS_readv),
        "writev" => Some(libc::SYS_writev),
        "pread64" => Some(libc::SYS_pread64),
        "pwrite64" => Some(libc::SYS_pwrite64),
        #[cfg(target_arch = "x86_64")]
        "readlink" => Some(libc::SYS_readlink),
        "readlinkat" => Some(libc::SYS_readlinkat),
        "newfstatat" => Some(libc::SYS_newfstatat),
        "statx" => Some(libc::SYS_statx),
        "faccessat" => Some(libc::SYS_faccessat),
        "faccessat2" => Some(libc::SYS_faccessat2),
        "dup" => Some(libc::SYS_dup),
        #[cfg(target_arch = "x86_64")]
        "dup2" => Some(libc::SYS_dup2),
        "dup3" => Some(libc::SYS_dup3),
        #[cfg(target_arch = "x86_64")]
        "pipe" => Some(libc::SYS_pipe),
        "pipe2" => Some(libc::SYS_pipe2),
        #[cfg(target_arch = "x86_64")]
        "unlink" => Some(libc::SYS_unlink),
        "unlinkat" => Some(libc::SYS_unlinkat),
        #[cfg(target_arch = "x86_64")]
        "rename" => Some(libc::SYS_rename),
        "renameat" => Some(libc::SYS_renameat),
        "renameat2" => Some(libc::SYS_renameat2),
        #[cfg(target_arch = "x86_64")]
        "link" => Some(libc::SYS_link),
        "linkat" => Some(libc::SYS_linkat),
        #[cfg(target_arch = "x86_64")]
        "symlink" => Some(libc::SYS_symlink),
        "symlinkat" => Some(libc::SYS_symlinkat),
        #[cfg(target_arch = "x86_64")]
        "chmod" => Some(libc::SYS_chmod),
        "fchmod" => Some(libc::SYS_fchmod),
        "fchmodat" => Some(libc::SYS_fchmodat),
        "truncate" => Some(libc::SYS_truncate),
        "ftruncate" => Some(libc::SYS_ftruncate),
        "fallocate" => Some(libc::SYS_fallocate),
        #[cfg(target_arch = "x86_64")]
        "fadvise64" => Some(libc::SYS_fadvise64),
        "fsync" => Some(libc::SYS_fsync),
        "fdatasync" => Some(libc::SYS_fdatasync),
        "flock" => Some(libc::SYS_flock),
        #[cfg(target_arch = "x86_64")]
        "sendfile" => Some(libc::SYS_sendfile),
        "copy_file_range" => Some(libc::SYS_copy_file_range),
        "splice" => Some(libc::SYS_splice),
        "tee" => Some(libc::SYS_tee),
        // Memory
        "mmap" => Some(libc::SYS_mmap),
        "munmap" => Some(libc::SYS_munmap),
        "mprotect" => Some(libc::SYS_mprotect),
        "brk" => Some(libc::SYS_brk),
        "mremap" => Some(libc::SYS_mremap),
        "madvise" => Some(libc::SYS_madvise),
        "msync" => Some(libc::SYS_msync),
        "mlock" => Some(libc::SYS_mlock),
        "munlock" => Some(libc::SYS_munlock),
        // Process
        #[cfg(target_arch = "x86_64")]
        "fork" => Some(libc::SYS_fork),
        "clone" => Some(libc::SYS_clone),
        "clone3" => Some(libc::SYS_clone3),
        "execve" => Some(libc::SYS_execve),
        "execveat" => Some(libc::SYS_execveat),
        "wait4" => Some(libc::SYS_wait4),
        "waitid" => Some(libc::SYS_waitid),
        "exit" => Some(libc::SYS_exit),
        "exit_group" => Some(libc::SYS_exit_group),
        "getpid" => Some(libc::SYS_getpid),
        "gettid" => Some(libc::SYS_gettid),
        "getuid" => Some(libc::SYS_getuid),
        "getgid" => Some(libc::SYS_getgid),
        "geteuid" => Some(libc::SYS_geteuid),
        "getegid" => Some(libc::SYS_getegid),
        "getppid" => Some(libc::SYS_getppid),
        #[cfg(target_arch = "x86_64")]
        "getpgrp" => Some(libc::SYS_getpgrp),
        "setsid" => Some(libc::SYS_setsid),
        "getgroups" => Some(libc::SYS_getgroups),
        // Signals
        "rt_sigaction" => Some(libc::SYS_rt_sigaction),
        "rt_sigprocmask" => Some(libc::SYS_rt_sigprocmask),
        "rt_sigreturn" => Some(libc::SYS_rt_sigreturn),
        "rt_sigsuspend" => Some(libc::SYS_rt_sigsuspend),
        "sigaltstack" => Some(libc::SYS_sigaltstack),
        "kill" => Some(libc::SYS_kill),
        "tgkill" => Some(libc::SYS_tgkill),
        // Time
        "clock_gettime" => Some(libc::SYS_clock_gettime),
        "clock_getres" => Some(libc::SYS_clock_getres),
        "clock_nanosleep" => Some(libc::SYS_clock_nanosleep),
        "gettimeofday" => Some(libc::SYS_gettimeofday),
        "nanosleep" => Some(libc::SYS_nanosleep),
        // Directories
        "getcwd" => Some(libc::SYS_getcwd),
        "chdir" => Some(libc::SYS_chdir),
        "fchdir" => Some(libc::SYS_fchdir),
        #[cfg(target_arch = "x86_64")]
        "mkdir" => Some(libc::SYS_mkdir),
        "mkdirat" => Some(libc::SYS_mkdirat),
        #[cfg(target_arch = "x86_64")]
        "rmdir" => Some(libc::SYS_rmdir),
        #[cfg(target_arch = "x86_64")]
        "getdents" => Some(libc::SYS_getdents),
        "getdents64" => Some(libc::SYS_getdents64),
        // Network
        "socket" => Some(libc::SYS_socket),
        "connect" => Some(libc::SYS_connect),
        "sendto" => Some(libc::SYS_sendto),
        "recvfrom" => Some(libc::SYS_recvfrom),
        "sendmsg" => Some(libc::SYS_sendmsg),
        "recvmsg" => Some(libc::SYS_recvmsg),
        "shutdown" => Some(libc::SYS_shutdown),
        "bind" => Some(libc::SYS_bind),
        "listen" => Some(libc::SYS_listen),
        "accept" => Some(libc::SYS_accept),
        "accept4" => Some(libc::SYS_accept4),
        "setsockopt" => Some(libc::SYS_setsockopt),
        "getsockopt" => Some(libc::SYS_getsockopt),
        "getsockname" => Some(libc::SYS_getsockname),
        "getpeername" => Some(libc::SYS_getpeername),
        "socketpair" => Some(libc::SYS_socketpair),
        // Poll/Select
        #[cfg(target_arch = "x86_64")]
        "poll" => Some(libc::SYS_poll),
        "ppoll" => Some(libc::SYS_ppoll),
        #[cfg(target_arch = "x86_64")]
        "select" => Some(libc::SYS_select),
        "pselect6" => Some(libc::SYS_pselect6),
        #[cfg(target_arch = "x86_64")]
        "epoll_create" => Some(libc::SYS_epoll_create),
        "epoll_create1" => Some(libc::SYS_epoll_create1),
        "epoll_ctl" => Some(libc::SYS_epoll_ctl),
        #[cfg(target_arch = "x86_64")]
        "epoll_wait" => Some(libc::SYS_epoll_wait),
        "epoll_pwait" => Some(libc::SYS_epoll_pwait),
        #[cfg(target_arch = "x86_64")]
        "eventfd" => Some(libc::SYS_eventfd),
        "eventfd2" => Some(libc::SYS_eventfd2),
        #[cfg(target_arch = "x86_64")]
        "signalfd" => Some(libc::SYS_signalfd),
        "signalfd4" => Some(libc::SYS_signalfd4),
        "timerfd_create" => Some(libc::SYS_timerfd_create),
        "timerfd_settime" => Some(libc::SYS_timerfd_settime),
        "timerfd_gettime" => Some(libc::SYS_timerfd_gettime),
        // Misc
        "uname" => Some(libc::SYS_uname),
        "getrandom" => Some(libc::SYS_getrandom),
        "futex" => Some(libc::SYS_futex),
        "set_tid_address" => Some(libc::SYS_set_tid_address),
        "set_robust_list" => Some(libc::SYS_set_robust_list),
        "get_robust_list" => Some(libc::SYS_get_robust_list),
        #[cfg(target_arch = "x86_64")]
        "arch_prctl" => Some(libc::SYS_arch_prctl),
        "sysinfo" => Some(libc::SYS_sysinfo),
        "umask" => Some(libc::SYS_umask),
        #[cfg(target_arch = "x86_64")]
        "getrlimit" => Some(libc::SYS_getrlimit),
        "prlimit64" => Some(libc::SYS_prlimit64),
        "getrusage" => Some(libc::SYS_getrusage),
        "times" => Some(libc::SYS_times),
        "sched_yield" => Some(libc::SYS_sched_yield),
        "sched_getaffinity" => Some(libc::SYS_sched_getaffinity),
        "getcpu" => Some(libc::SYS_getcpu),
        "rseq" => Some(libc::SYS_rseq),
        "close_range" => Some(libc::SYS_close_range),
        "memfd_create" => Some(libc::SYS_memfd_create),
        "ioctl" => Some(libc::SYS_ioctl),
        "prctl" => Some(libc::SYS_prctl),
        // Landlock
        "landlock_create_ruleset" => Some(libc::SYS_landlock_create_ruleset),
        "landlock_add_rule" => Some(libc::SYS_landlock_add_rule),
        "landlock_restrict_self" => Some(libc::SYS_landlock_restrict_self),
        // Anything not listed above (or compiled out for this target).
        _ => None,
    }
}
1069
/// `Default` for [`SeccompManager`] delegates to [`SeccompManager::new`].
impl Default for SeccompManager {
    /// Build a manager exactly as [`SeccompManager::new`] does.
    fn default() -> Self {
        Self::new()
    }
}
1075
/// Unit tests for the seccomp policy tables and filter construction.
///
/// None of these tests install a real seccomp filter (that would restrict
/// the test process itself); they inspect the allowlists, the rule map
/// returned by `minimal_filter`, and small pure helpers instead.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_seccomp_manager_initial_state() {
        let mgr = SeccompManager::new();
        assert!(!mgr.is_applied());
    }

    #[test]
    fn test_apply_idempotent() {
        let mgr = SeccompManager::new();
        // Note: We can't actually test application in unit tests
        // as it would affect the test process itself
        // This is tested in integration tests instead
        assert!(!mgr.is_applied());
    }

    #[test]
    fn test_clone_denied_flags_include_newcgroup() {
        assert_ne!(
            DENIED_CLONE_NAMESPACE_FLAGS & libc::CLONE_NEWCGROUP as u64,
            0
        );
    }

    #[test]
    fn test_clone_denied_flags_include_newtime() {
        assert_ne!(
            DENIED_CLONE_NAMESPACE_FLAGS & libc::CLONE_NEWTIME as u64,
            0,
            "CLONE_NEWTIME must be in denied clone namespace flags"
        );
    }

    #[test]
    fn test_network_none_socket_domains_are_unix_only() {
        let domains = SeccompManager::allowed_socket_domains(false);
        assert_eq!(domains, vec![libc::AF_UNIX]);
    }

    #[test]
    fn test_network_enabled_socket_domains_exclude_netlink() {
        let domains = SeccompManager::allowed_socket_domains(true);
        assert!(domains.contains(&libc::AF_UNIX));
        assert!(domains.contains(&libc::AF_INET));
        assert!(domains.contains(&libc::AF_INET6));
        assert!(!domains.contains(&libc::AF_NETLINK));
    }

    #[test]
    fn test_network_mode_syscalls_only_enabled_when_network_allowed() {
        let none = SeccompManager::network_mode_syscalls(false);
        assert!(none.is_empty());

        let enabled = SeccompManager::network_mode_syscalls(true);
        assert!(enabled.contains(&libc::SYS_connect));
        assert!(enabled.contains(&libc::SYS_bind));
        assert!(enabled.contains(&libc::SYS_listen));
        assert!(enabled.contains(&libc::SYS_accept));
        assert!(enabled.contains(&libc::SYS_setsockopt));
    }

    #[test]
    fn test_landlock_bootstrap_syscalls_present_in_base_allowlist() {
        let base = SeccompManager::base_allowed_syscalls();
        assert!(base.contains(&libc::SYS_landlock_create_ruleset));
        assert!(base.contains(&libc::SYS_landlock_add_rule));
        assert!(base.contains(&libc::SYS_landlock_restrict_self));
    }

    #[test]
    fn test_x32_legacy_range_not_allowlisted() {
        let base = SeccompManager::base_allowed_syscalls();
        let net = SeccompManager::network_mode_syscalls(true);
        for nr in 512_i64..=547_i64 {
            assert!(
                !base.contains(&nr) && !net.contains(&nr),
                "x32 syscall number {} unexpectedly allowlisted",
                nr
            );
        }
    }

    #[test]
    fn test_i386_compat_socketcall_range_not_allowlisted() {
        let base = SeccompManager::base_allowed_syscalls();
        let net = SeccompManager::network_mode_syscalls(true);
        // i386 compat per syscall_32.tbl: socket..shutdown live at 359..373.
        // On x86_64 these numbers are outside our native allowlist surface.
        for nr in 359_i64..=373_i64 {
            assert!(
                !base.contains(&nr) && !net.contains(&nr),
                "i386 compat syscall number {} unexpectedly allowlisted",
                nr
            );
        }
    }

    // The base allowlist contains entries that are compiled only for x86_64,
    // so the snapshot counts below are only meaningful on that target.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn test_minimal_filter_allowlist_counts_are_stable() {
        let base = SeccompManager::base_allowed_syscalls();
        let net = SeccompManager::network_mode_syscalls(true);

        // Snapshot counts to catch unintended policy drift.
        // +8 accounts for conditional rules inserted in minimal_filter():
        // socket/ioctl/prctl/prlimit64/mprotect/clone/clone3/execveat.
        // fork removed (forces through filtered clone path).
        // execveat removed from base (arg-filtered separately).
        // sysinfo removed (L8: leaks host info).
        // prlimit64 moved to arg-filtered (M3).
        assert_eq!(base.len(), 129);
        assert_eq!(net.len(), 11);
        assert_eq!(base.len() + 8, 137);
        assert_eq!(base.len() + net.len() + 8, 148);
    }

    #[test]
    fn test_arg_filtered_syscalls_list_includes_critical_syscalls() {
        // These syscalls must be in the arg-filtered list so custom profiles
        // get warnings when they allow them without filters.
        for name in &[
            "clone", "clone3", "execveat", "ioctl", "mprotect", "prctl", "socket",
        ] {
            assert!(
                SeccompManager::ARG_FILTERED_SYSCALLS.contains(name),
                "'{}' must be in ARG_FILTERED_SYSCALLS",
                name
            );
        }
    }

    #[test]
    fn test_clone3_allowed_in_minimal_filter() {
        // clone3 MUST be in the BPF rules map — glibc 2.34+ and newer musl
        // use clone3 internally for posix_spawn/fork. Blocking it breaks
        // std::process::Command on modern systems. Namespace creation is
        // prevented by dropped capabilities (CAP_SYS_ADMIN etc.), not seccomp.
        let rules = SeccompManager::minimal_filter(true).unwrap();
        assert!(
            rules.contains_key(&libc::SYS_clone3),
            "clone3 must be in the seccomp allowlist (glibc 2.34+ requires it)"
        );
    }

    #[test]
    fn test_clone_is_allowed_with_arg_filter() {
        // clone (not clone3) should still be in the rules with arg filtering
        let rules = SeccompManager::minimal_filter(true).unwrap();
        assert!(
            rules.contains_key(&libc::SYS_clone),
            "clone must be in the seccomp allowlist with arg filters"
        );
    }

    #[test]
    fn test_high_risk_syscalls_removed_from_base_allowlist() {
        let base = SeccompManager::base_allowed_syscalls();
        // chown/lchown/vfork are gated to x86_64 so this test also compiles
        // on targets whose libc does not define those constants.
        let removed = [
            #[cfg(target_arch = "x86_64")]
            libc::SYS_chown,
            libc::SYS_fchown,
            #[cfg(target_arch = "x86_64")]
            libc::SYS_lchown,
            libc::SYS_fchownat,
            libc::SYS_sync,
            libc::SYS_syncfs,
            libc::SYS_mlock,
            libc::SYS_munlock,
            libc::SYS_mincore,
            #[cfg(target_arch = "x86_64")]
            libc::SYS_vfork,
            libc::SYS_tkill,
        ];

        for syscall in removed {
            assert!(
                !base.contains(&syscall),
                "syscall {} unexpectedly present in base allowlist",
                syscall
            );
        }
    }

    #[test]
    fn test_custom_profile_preserves_clone_arg_filters() {
        // SEC-01: Custom seccomp profiles that allow "clone" must still get
        // argument-level filtering to block namespace-creating flags.
        // Verify by inspecting the built-in filter rules that serve as the
        // merge source for apply_profile_from_file.
        let rules = SeccompManager::minimal_filter(true).unwrap();

        // Every ARG_FILTERED_SYSCALLS entry (except clone3, which is allowed
        // unconditionally since BPF can't inspect its struct-based flags) must
        // have non-empty argument-level rules in the built-in filter so that
        // apply_profile_from_file can merge them.
        for name in SeccompManager::ARG_FILTERED_SYSCALLS {
            if *name == "clone3" {
                // clone3 is allowed unconditionally — BPF cannot dereference
                // the clone_args struct, so arg filtering is impossible.
                // Namespace defense relies on dropped capabilities.
                continue;
            }
            // A name with no number mapping must fail the test rather than be
            // skipped, otherwise the entry would go unchecked silently.
            let nr = syscall_name_to_number(name).unwrap_or_else(|| {
                panic!(
                    "ARG_FILTERED_SYSCALLS entry '{}' has no syscall number mapping",
                    name
                )
            });
            let has_arg_rules = rules.get(&nr).map_or(false, |entry| !entry.is_empty());
            assert!(
                has_arg_rules,
                "built-in filter must have argument-level rules for '{}' \
                 so apply_profile_from_file can merge them into custom profiles",
                name
            );
        }
    }

    #[test]
    fn test_memfd_create_not_in_default_allowlist() {
        // SEC-02: memfd_create enables fileless code execution when combined with execveat.
        let base = SeccompManager::base_allowed_syscalls();
        assert!(
            !base.contains(&libc::SYS_memfd_create),
            "memfd_create must not be in the default seccomp allowlist (fileless exec risk)"
        );
        // Also verify it's not sneaked into the compiled filter rules
        let rules = SeccompManager::minimal_filter(true).unwrap();
        assert!(
            !rules.contains_key(&libc::SYS_memfd_create),
            "memfd_create must not be in the compiled seccomp filter rules"
        );
    }

    #[test]
    fn test_mprotect_has_arg_filtering() {
        // SEC-03: mprotect must have argument-level filtering to prevent W^X
        // (PROT_WRITE|PROT_EXEC) violations. Verify via runtime data structures.

        // mprotect must NOT be in the unconditional base allowlist
        let base = SeccompManager::base_allowed_syscalls();
        assert!(
            !base.contains(&libc::SYS_mprotect),
            "SYS_mprotect must not be unconditionally allowed - needs arg filtering"
        );

        // mprotect must be present in the compiled filter with non-empty
        // argument conditions (the conditions enforce W^X)
        let rules = SeccompManager::minimal_filter(true).unwrap();
        let mprotect_rules = rules.get(&libc::SYS_mprotect);
        assert!(
            mprotect_rules.is_some(),
            "mprotect must be present in the seccomp filter rules"
        );
        assert!(
            !mprotect_rules.unwrap().is_empty(),
            "mprotect must have argument-level conditions to prevent W^X violations"
        );
    }

    #[test]
    fn test_unsafe_blocks_have_safety_comments() {
        // SEC-08: All unsafe blocks must have // SAFETY: documentation
        let source = include_str!("seccomp.rs");

        // Clamp a byte offset down to the nearest char boundary (and to the
        // end of the file) so the slices below cannot panic on the multi-byte
        // characters that appear in this file's comments.
        let floor_boundary = |offset: usize| {
            let mut i = offset.min(source.len());
            while !source.is_char_boundary(i) {
                i -= 1;
            }
            i
        };

        let mut pos = 0;
        // A SAFETY: comment must appear within ~200 bytes before each block.
        while let Some(idx) = source[pos..].find("unsafe {") {
            let abs_idx = pos + idx;
            let start = floor_boundary(abs_idx.saturating_sub(200));
            let context = &source[start..abs_idx];
            assert!(
                context.contains("SAFETY:"),
                "unsafe block at byte {} must have a // SAFETY: comment. Context: ...{}...",
                abs_idx,
                &source[floor_boundary(abs_idx.saturating_sub(80))..floor_boundary(abs_idx + 10)]
            );
            // The match starts with an ASCII byte, so +1 is a char boundary.
            pos = abs_idx + 1;
        }
    }

    // --- H-1: mprotect MaskedEq logic verification ---
    //
    // The mprotect filter uses MaskedEq((PROT_WRITE | PROT_EXEC), value) to
    // allow only combinations where the W|X bits match one of {0, W, X}.
    // These tests prove the logic is correct without installing a real
    // seccomp filter (which would affect the test process).

    /// Helper: simulates the MaskedEq check that the seccomp BPF would perform.
    /// Returns true if the prot value would be ALLOWED by one of the rules.
    fn mprotect_would_allow(prot: u64) -> bool {
        let mask = (libc::PROT_WRITE | libc::PROT_EXEC) as u64;
        let allowed_values: &[u64] = &[0, libc::PROT_WRITE as u64, libc::PROT_EXEC as u64];
        let masked = prot & mask;
        allowed_values.contains(&masked)
    }

    #[test]
    fn test_mprotect_allows_prot_none() {
        assert!(mprotect_would_allow(0), "PROT_NONE must be allowed");
    }

    #[test]
    fn test_mprotect_allows_prot_read_only() {
        assert!(
            mprotect_would_allow(libc::PROT_READ as u64),
            "PROT_READ must be allowed (W|X bits are 0)"
        );
    }

    #[test]
    fn test_mprotect_allows_prot_read_write() {
        assert!(
            mprotect_would_allow((libc::PROT_READ | libc::PROT_WRITE) as u64),
            "PROT_READ|PROT_WRITE must be allowed"
        );
    }

    #[test]
    fn test_mprotect_allows_prot_read_exec() {
        assert!(
            mprotect_would_allow((libc::PROT_READ | libc::PROT_EXEC) as u64),
            "PROT_READ|PROT_EXEC must be allowed"
        );
    }

    #[test]
    fn test_mprotect_rejects_prot_write_exec() {
        assert!(
            !mprotect_would_allow((libc::PROT_WRITE | libc::PROT_EXEC) as u64),
            "PROT_WRITE|PROT_EXEC (W^X violation) must be REJECTED"
        );
    }

    #[test]
    fn test_mprotect_rejects_prot_read_write_exec() {
        assert!(
            !mprotect_would_allow((libc::PROT_READ | libc::PROT_WRITE | libc::PROT_EXEC) as u64),
            "PROT_READ|PROT_WRITE|PROT_EXEC (W^X violation) must be REJECTED"
        );
    }

    #[test]
    fn test_mprotect_allows_prot_write_alone() {
        assert!(
            mprotect_would_allow(libc::PROT_WRITE as u64),
            "PROT_WRITE alone must be allowed"
        );
    }

    #[test]
    fn test_mprotect_allows_prot_exec_alone() {
        assert!(
            mprotect_would_allow(libc::PROT_EXEC as u64),
            "PROT_EXEC alone must be allowed"
        );
    }
}