1use crate::error::{NucleusError, Result};
2use crate::security::policy::sha256_hex;
3use seccompiler::{BpfProgram, SeccompAction, SeccompCondition, SeccompFilter, SeccompRule};
4use std::collections::BTreeMap;
5use std::path::Path;
6use tracing::{debug, info, warn};
7
/// Tracks whether a seccomp filter has already been installed through this manager.
pub struct SeccompManager {
    /// Set to `true` once one of the `apply_*` methods has installed a filter;
    /// subsequent `apply_*` calls then return early without installing another.
    applied: bool,
}
15
/// Bitmask of the `CLONE_NEW*` namespace-creation flags.
///
/// `minimal_filter` uses it as the mask of the `clone` argument rule: `clone` is
/// only allowed when none of these bits are set in its first argument.
const DENIED_CLONE_NAMESPACE_FLAGS: u64 = (libc::CLONE_NEWUSER
    | libc::CLONE_NEWNS
    | libc::CLONE_NEWNET
    | libc::CLONE_NEWIPC
    | libc::CLONE_NEWUTS
    | libc::CLONE_NEWPID
    | libc::CLONE_NEWCGROUP
    | libc::CLONE_NEWTIME) as u64;
24
25impl SeccompManager {
26 pub fn new() -> Self {
27 Self { applied: false }
28 }
29
30 fn base_allowed_syscalls() -> Vec<i64> {
31 let mut syscalls = vec![
32 libc::SYS_read,
34 libc::SYS_write,
35 libc::SYS_openat,
36 libc::SYS_close,
37 libc::SYS_fstat,
38 libc::SYS_lseek,
39 libc::SYS_fcntl,
40 libc::SYS_readv,
41 libc::SYS_writev,
42 libc::SYS_pread64,
43 libc::SYS_pwrite64,
44 libc::SYS_readlinkat,
45 libc::SYS_newfstatat,
46 libc::SYS_statx,
47 libc::SYS_faccessat,
48 libc::SYS_faccessat2,
49 libc::SYS_dup,
50 libc::SYS_dup3,
51 libc::SYS_pipe2,
52 libc::SYS_unlinkat,
53 libc::SYS_renameat,
54 libc::SYS_renameat2,
55 libc::SYS_linkat,
56 libc::SYS_symlinkat,
57 libc::SYS_fchmod,
58 libc::SYS_fchmodat,
59 libc::SYS_truncate,
60 libc::SYS_ftruncate,
61 libc::SYS_fallocate,
62 #[cfg(target_arch = "x86_64")]
63 libc::SYS_fadvise64,
64 libc::SYS_fsync,
65 libc::SYS_fdatasync,
66 libc::SYS_flock,
67 #[cfg(target_arch = "x86_64")]
68 libc::SYS_sendfile,
69 libc::SYS_copy_file_range,
70 libc::SYS_splice,
71 libc::SYS_tee,
72 libc::SYS_mmap,
74 libc::SYS_munmap,
75 libc::SYS_brk,
76 libc::SYS_mremap,
77 libc::SYS_madvise,
78 libc::SYS_msync,
79 libc::SYS_execve,
85 libc::SYS_wait4,
87 libc::SYS_waitid,
88 libc::SYS_exit,
89 libc::SYS_exit_group,
90 libc::SYS_getpid,
91 libc::SYS_gettid,
92 libc::SYS_getuid,
93 libc::SYS_getgid,
94 libc::SYS_geteuid,
95 libc::SYS_getegid,
96 libc::SYS_getppid,
97 libc::SYS_setsid,
98 libc::SYS_getgroups,
99 libc::SYS_rt_sigaction,
101 libc::SYS_rt_sigprocmask,
102 libc::SYS_rt_sigreturn,
103 libc::SYS_rt_sigsuspend,
104 libc::SYS_sigaltstack,
105 libc::SYS_kill,
109 libc::SYS_tgkill,
110 libc::SYS_clock_gettime,
112 libc::SYS_clock_getres,
113 libc::SYS_clock_nanosleep,
114 libc::SYS_gettimeofday,
115 libc::SYS_nanosleep,
116 libc::SYS_getcwd,
118 libc::SYS_chdir,
119 libc::SYS_fchdir,
120 libc::SYS_mkdirat,
121 libc::SYS_getdents64,
122 libc::SYS_uname,
124 libc::SYS_getrandom,
125 libc::SYS_futex,
126 libc::SYS_set_tid_address,
127 libc::SYS_set_robust_list,
128 libc::SYS_get_robust_list,
129 libc::SYS_umask,
132 libc::SYS_getrusage,
134 libc::SYS_times,
135 libc::SYS_sched_yield,
136 libc::SYS_sched_getaffinity,
137 libc::SYS_getcpu,
138 libc::SYS_rseq,
139 libc::SYS_close_range,
140 libc::SYS_landlock_create_ruleset,
144 libc::SYS_landlock_add_rule,
145 libc::SYS_landlock_restrict_self,
146 libc::SYS_getsockname,
148 libc::SYS_getpeername,
149 libc::SYS_socketpair,
150 libc::SYS_getsockopt,
151 libc::SYS_ppoll,
153 libc::SYS_pselect6,
154 libc::SYS_epoll_create1,
155 libc::SYS_epoll_ctl,
156 libc::SYS_epoll_pwait,
157 libc::SYS_eventfd2,
158 libc::SYS_signalfd4,
159 libc::SYS_timerfd_create,
160 libc::SYS_timerfd_settime,
161 libc::SYS_timerfd_gettime,
162 ];
163
164 #[cfg(target_arch = "x86_64")]
166 syscalls.extend_from_slice(&[
167 libc::SYS_open,
168 libc::SYS_stat,
169 libc::SYS_lstat,
170 libc::SYS_access,
171 libc::SYS_readlink,
172 libc::SYS_dup2,
173 libc::SYS_pipe,
174 libc::SYS_unlink,
175 libc::SYS_rename,
176 libc::SYS_link,
177 libc::SYS_symlink,
178 libc::SYS_chmod,
179 libc::SYS_mkdir,
180 libc::SYS_rmdir,
181 libc::SYS_getdents,
182 libc::SYS_getpgrp,
183 libc::SYS_arch_prctl,
184 libc::SYS_getrlimit,
185 libc::SYS_poll,
186 libc::SYS_select,
187 libc::SYS_epoll_create,
188 libc::SYS_epoll_wait,
189 libc::SYS_eventfd,
190 libc::SYS_signalfd,
191 ]);
192
193 syscalls
194 }
195
196 fn allowed_socket_domains(allow_network: bool) -> Vec<i32> {
197 if allow_network {
198 vec![libc::AF_UNIX, libc::AF_INET, libc::AF_INET6]
199 } else {
200 vec![libc::AF_UNIX]
201 }
202 }
203
204 fn network_mode_syscalls(allow_network: bool) -> Vec<i64> {
205 if allow_network {
206 vec![
207 libc::SYS_connect,
208 libc::SYS_sendto,
209 libc::SYS_recvfrom,
210 libc::SYS_sendmsg,
211 libc::SYS_recvmsg,
212 libc::SYS_shutdown,
213 libc::SYS_bind,
214 libc::SYS_listen,
215 libc::SYS_accept,
216 libc::SYS_accept4,
217 libc::SYS_setsockopt,
218 ]
219 } else {
220 Vec::new()
221 }
222 }
223
224 fn minimal_filter(allow_network: bool) -> Result<BTreeMap<i64, Vec<SeccompRule>>> {
234 let mut rules: BTreeMap<i64, Vec<SeccompRule>> = BTreeMap::new();
235
236 let allowed_syscalls = Self::base_allowed_syscalls();
238
239 for syscall in allowed_syscalls {
241 rules.insert(syscall, Vec::new());
242 }
243
244 for syscall in Self::network_mode_syscalls(allow_network) {
246 rules.insert(syscall, Vec::new());
247 }
248
249 let mut socket_rules = Vec::new();
252 for domain in Self::allowed_socket_domains(allow_network) {
253 let condition = SeccompCondition::new(
254 0, seccompiler::SeccompCmpArgLen::Dword,
256 seccompiler::SeccompCmpOp::Eq,
257 domain as u64,
258 )
259 .map_err(|e| {
260 NucleusError::SeccompError(format!(
261 "Failed to create socket domain condition: {}",
262 e
263 ))
264 })?;
265 let rule = SeccompRule::new(vec![condition]).map_err(|e| {
266 NucleusError::SeccompError(format!("Failed to create socket rule: {}", e))
267 })?;
268 socket_rules.push(rule);
269 }
270 rules.insert(libc::SYS_socket, socket_rules);
271
272 let ioctl_allowed: &[u64] = &[
274 0x5401, 0x5402, 0x5403, 0x5404, 0x540B, 0x540F, 0x5410, 0x5413, 0x5429, 0x541B, 0x5421, 0x5451, 0x5450, ];
290 let mut ioctl_rules = Vec::new();
291 for &request in ioctl_allowed {
292 let condition = SeccompCondition::new(
293 1, seccompiler::SeccompCmpArgLen::Dword,
295 seccompiler::SeccompCmpOp::Eq,
296 request,
297 )
298 .map_err(|e| {
299 NucleusError::SeccompError(format!("Failed to create ioctl condition: {}", e))
300 })?;
301 let rule = SeccompRule::new(vec![condition]).map_err(|e| {
302 NucleusError::SeccompError(format!("Failed to create ioctl rule: {}", e))
303 })?;
304 ioctl_rules.push(rule);
305 }
306 rules.insert(libc::SYS_ioctl, ioctl_rules);
307
308 let prctl_allowed: &[u64] = &[
314 1, 2, 15, 16, 38, 39, ];
321 let mut prctl_rules = Vec::new();
322 for &option in prctl_allowed {
323 let condition = SeccompCondition::new(
324 0, seccompiler::SeccompCmpArgLen::Dword,
326 seccompiler::SeccompCmpOp::Eq,
327 option,
328 )
329 .map_err(|e| {
330 NucleusError::SeccompError(format!("Failed to create prctl condition: {}", e))
331 })?;
332 let rule = SeccompRule::new(vec![condition]).map_err(|e| {
333 NucleusError::SeccompError(format!("Failed to create prctl rule: {}", e))
334 })?;
335 prctl_rules.push(rule);
336 }
337 rules.insert(libc::SYS_prctl, prctl_rules);
338
339 let prlimit_condition = SeccompCondition::new(
342 2, seccompiler::SeccompCmpArgLen::Qword,
344 seccompiler::SeccompCmpOp::Eq,
345 0u64, )
347 .map_err(|e| {
348 NucleusError::SeccompError(format!("Failed to create prlimit64 condition: {}", e))
349 })?;
350 let prlimit_rule = SeccompRule::new(vec![prlimit_condition]).map_err(|e| {
351 NucleusError::SeccompError(format!("Failed to create prlimit64 rule: {}", e))
352 })?;
353 rules.insert(libc::SYS_prlimit64, vec![prlimit_rule]);
354
355 let mut mprotect_rules = Vec::new();
357 for allowed in [0, libc::PROT_WRITE as u64, libc::PROT_EXEC as u64] {
358 let condition = SeccompCondition::new(
359 2, seccompiler::SeccompCmpArgLen::Dword,
361 seccompiler::SeccompCmpOp::MaskedEq((libc::PROT_WRITE | libc::PROT_EXEC) as u64),
362 allowed,
363 )
364 .map_err(|e| {
365 NucleusError::SeccompError(format!("Failed to create mprotect condition: {}", e))
366 })?;
367 let rule = SeccompRule::new(vec![condition]).map_err(|e| {
368 NucleusError::SeccompError(format!("Failed to create mprotect rule: {}", e))
369 })?;
370 mprotect_rules.push(rule);
371 }
372 rules.insert(libc::SYS_mprotect, mprotect_rules);
373
374 if Self::has_effective_cap(21) {
389 return Err(NucleusError::SeccompError(
390 "SECURITY: CAP_SYS_ADMIN is still in the effective capability set. \
391 Capabilities must be dropped before installing seccomp filters \
392 (clone3 is allowed unconditionally)."
393 .to_string(),
394 ));
395 }
396 rules.insert(libc::SYS_clone3, Vec::new());
397
398 let clone_condition = SeccompCondition::new(
400 0, seccompiler::SeccompCmpArgLen::Qword,
402 seccompiler::SeccompCmpOp::MaskedEq(DENIED_CLONE_NAMESPACE_FLAGS),
403 0, )
405 .map_err(|e| {
406 NucleusError::SeccompError(format!("Failed to create clone condition: {}", e))
407 })?;
408 let clone_rule = SeccompRule::new(vec![clone_condition]).map_err(|e| {
409 NucleusError::SeccompError(format!("Failed to create clone rule: {}", e))
410 })?;
411 rules.insert(libc::SYS_clone, vec![clone_rule]);
412
413 let execveat_condition = SeccompCondition::new(
420 4, seccompiler::SeccompCmpArgLen::Dword,
422 seccompiler::SeccompCmpOp::MaskedEq(libc::AT_EMPTY_PATH as u64),
423 0, )
425 .map_err(|e| {
426 NucleusError::SeccompError(format!("Failed to create execveat condition: {}", e))
427 })?;
428 let execveat_rule = SeccompRule::new(vec![execveat_condition]).map_err(|e| {
429 NucleusError::SeccompError(format!("Failed to create execveat rule: {}", e))
430 })?;
431 rules.insert(libc::SYS_execveat, vec![execveat_rule]);
432
433 Ok(rules)
434 }
435
436 pub fn compile_minimal_filter() -> Result<BpfProgram> {
441 let rules = Self::minimal_filter(true)?;
442 let filter = SeccompFilter::new(
443 rules,
444 SeccompAction::KillProcess,
445 SeccompAction::Allow,
446 std::env::consts::ARCH.try_into().map_err(|e| {
447 NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
448 })?,
449 )
450 .map_err(|e| {
451 NucleusError::SeccompError(format!("Failed to create seccomp filter: {}", e))
452 })?;
453
454 let bpf_prog: BpfProgram = filter.try_into().map_err(|e| {
455 NucleusError::SeccompError(format!("Failed to compile BPF program: {}", e))
456 })?;
457
458 Ok(bpf_prog)
459 }
460
461 pub fn apply_minimal_filter(&mut self) -> Result<bool> {
469 self.apply_minimal_filter_with_mode(false, false)
470 }
471
472 pub fn apply_minimal_filter_with_mode(
477 &mut self,
478 best_effort: bool,
479 log_denied: bool,
480 ) -> Result<bool> {
481 self.apply_filter_for_network_mode(true, best_effort, log_denied)
482 }
483
484 pub fn apply_filter_for_network_mode(
493 &mut self,
494 allow_network: bool,
495 best_effort: bool,
496 log_denied: bool,
497 ) -> Result<bool> {
498 if self.applied {
499 debug!("Seccomp filter already applied, skipping");
500 return Ok(true);
501 }
502
503 info!(allow_network, "Applying seccomp filter");
504
505 let rules = match Self::minimal_filter(allow_network) {
506 Ok(r) => r,
507 Err(e) => {
508 if best_effort {
509 warn!(
510 "Failed to create seccomp rules: {} (continuing without seccomp)",
511 e
512 );
513 return Ok(false);
514 }
515 return Err(e);
516 }
517 };
518
519 let filter = match SeccompFilter::new(
520 rules,
521 SeccompAction::KillProcess, SeccompAction::Allow, std::env::consts::ARCH.try_into().map_err(|e| {
524 NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
525 })?,
526 ) {
527 Ok(f) => f,
528 Err(e) => {
529 if best_effort {
530 warn!(
531 "Failed to create seccomp filter: {} (continuing without seccomp)",
532 e
533 );
534 return Ok(false);
535 }
536 return Err(NucleusError::SeccompError(format!(
537 "Failed to create seccomp filter: {}",
538 e
539 )));
540 }
541 };
542
543 let bpf_prog: BpfProgram = match filter.try_into() {
544 Ok(p) => p,
545 Err(e) => {
546 if best_effort {
547 warn!(
548 "Failed to compile BPF program: {} (continuing without seccomp)",
549 e
550 );
551 return Ok(false);
552 }
553 return Err(NucleusError::SeccompError(format!(
554 "Failed to compile BPF program: {}",
555 e
556 )));
557 }
558 };
559
560 match Self::apply_bpf_program(&bpf_prog, log_denied) {
562 Ok(_) => {
563 self.applied = true;
564 info!("Successfully applied seccomp filter");
565 Ok(true)
566 }
567 Err(e) => {
568 if best_effort {
569 warn!(
570 "Failed to apply seccomp filter: {} (continuing without seccomp)",
571 e
572 );
573 Ok(false)
574 } else {
575 Err(NucleusError::SeccompError(format!(
576 "Failed to apply seccomp filter: {}",
577 e
578 )))
579 }
580 }
581 }
582 }
583
584 pub fn apply_profile_from_file(
603 &mut self,
604 profile_path: &Path,
605 expected_sha256: Option<&str>,
606 audit_mode: bool,
607 ) -> Result<bool> {
608 if self.applied {
609 debug!("Seccomp filter already applied, skipping");
610 return Ok(true);
611 }
612
613 info!("Loading seccomp profile from {:?}", profile_path);
614
615 let content = std::fs::read(profile_path).map_err(|e| {
617 NucleusError::SeccompError(format!(
618 "Failed to read seccomp profile {:?}: {}",
619 profile_path, e
620 ))
621 })?;
622
623 if let Some(expected) = expected_sha256 {
625 let actual = sha256_hex(&content);
626 if actual != expected {
627 return Err(NucleusError::SeccompError(format!(
628 "Seccomp profile hash mismatch: expected {}, got {}",
629 expected, actual
630 )));
631 }
632 info!("Seccomp profile hash verified: {}", actual);
633 }
634
635 let profile: SeccompProfile = serde_json::from_slice(&content).map_err(|e| {
637 NucleusError::SeccompError(format!("Failed to parse seccomp profile: {}", e))
638 })?;
639
640 Self::warn_missing_arg_filters(&profile);
645
646 let mut rules: BTreeMap<i64, Vec<SeccompRule>> = BTreeMap::new();
648
649 for syscall_group in &profile.syscalls {
650 if syscall_group.action == "SCMP_ACT_ALLOW" {
651 for name in &syscall_group.names {
652 if let Some(nr) = syscall_name_to_number(name) {
653 rules.insert(nr, Vec::new());
654 } else {
655 warn!("Unknown syscall in profile: {} (skipping)", name);
656 }
657 }
658 }
659 }
660
661 let builtin_rules = Self::minimal_filter(true)?;
666 for syscall_name in Self::ARG_FILTERED_SYSCALLS {
667 if let Some(nr) = syscall_name_to_number(syscall_name) {
668 if let std::collections::btree_map::Entry::Occupied(mut entry) = rules.entry(nr) {
669 if let Some(builtin) = builtin_rules.get(&nr) {
670 if !builtin.is_empty() {
671 info!(
672 "Merging built-in argument filters for '{}' into custom profile",
673 syscall_name
674 );
675 entry.insert(builtin.clone());
676 }
677 }
678 }
679 }
680 }
681 if !rules.contains_key(&libc::SYS_clone3) {
687 rules.insert(libc::SYS_clone3, Vec::new());
688 }
689
690 let filter = SeccompFilter::new(
691 rules,
692 SeccompAction::KillProcess,
693 SeccompAction::Allow,
694 std::env::consts::ARCH.try_into().map_err(|e| {
695 NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
696 })?,
697 )
698 .map_err(|e| {
699 NucleusError::SeccompError(format!(
700 "Failed to create seccomp filter from profile: {}",
701 e
702 ))
703 })?;
704
705 let bpf_prog: BpfProgram = filter.try_into().map_err(|e| {
706 NucleusError::SeccompError(format!("Failed to compile BPF program from profile: {}", e))
707 })?;
708
709 match Self::apply_bpf_program(&bpf_prog, audit_mode) {
710 Ok(_) => {
711 self.applied = true;
712 info!(
713 "Seccomp profile applied from {:?} (log_denied={})",
714 profile_path, audit_mode
715 );
716 Ok(true)
717 }
718 Err(e) => Err(e),
719 }
720 }
721
722 pub fn apply_trace_filter(&mut self) -> Result<bool> {
727 if self.applied {
728 debug!("Seccomp filter already applied, skipping trace filter");
729 return Ok(true);
730 }
731
732 info!("Applying seccomp trace filter (allow-all + LOG)");
733
734 let filter = SeccompFilter::new(
738 BTreeMap::new(),
739 SeccompAction::Allow, SeccompAction::Allow, std::env::consts::ARCH.try_into().map_err(|e| {
742 NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
743 })?,
744 )
745 .map_err(|e| NucleusError::SeccompError(format!("Failed to create trace filter: {}", e)))?;
746
747 let bpf_prog: BpfProgram = filter.try_into().map_err(|e| {
748 NucleusError::SeccompError(format!("Failed to compile trace BPF: {}", e))
749 })?;
750
751 Self::apply_bpf_program(&bpf_prog, true)?;
753 self.applied = true;
754 info!("Seccomp trace filter applied (all syscalls allowed + logged)");
755 Ok(true)
756 }
757
    /// Syscall names that get special treatment when a custom profile is loaded.
    ///
    /// `warn_missing_arg_filters` warns when a profile allows one of them without
    /// `args`, and `apply_profile_from_file` replaces the profile's entry with the
    /// built-in argument rules whenever the built-in rule list is non-empty.
    const ARG_FILTERED_SYSCALLS: &'static [&'static str] = &[
        "clone", "clone3", "execveat", "ioctl", "mprotect", "prctl", "socket",
    ];
763
764 fn warn_missing_arg_filters(profile: &SeccompProfile) {
767 for group in &profile.syscalls {
768 if group.action != "SCMP_ACT_ALLOW" {
769 continue;
770 }
771 for name in &group.names {
772 if Self::ARG_FILTERED_SYSCALLS.contains(&name.as_str()) && group.args.is_empty() {
773 warn!(
774 "Custom seccomp profile allows '{}' without argument filters. \
775 The built-in filter restricts this syscall at the argument level. \
776 This profile weakens security compared to the default.",
777 name
778 );
779 }
780 }
781 }
782 }
783
784 fn has_effective_cap(cap: i32) -> bool {
787 let Ok(status) = std::fs::read_to_string("/proc/self/status") else {
788 return true;
790 };
791 for line in status.lines() {
792 if let Some(hex) = line.strip_prefix("CapEff:\t") {
793 if let Ok(eff) = u64::from_str_radix(hex.trim(), 16) {
794 return eff & (1u64 << cap) != 0;
795 }
796 }
797 }
798 true }
800
801 pub fn is_applied(&self) -> bool {
803 self.applied
804 }
805
806 fn apply_bpf_program(bpf_prog: &BpfProgram, log_denied: bool) -> Result<()> {
807 let mut flags: libc::c_ulong = 0;
808 if log_denied {
809 flags |= libc::SECCOMP_FILTER_FLAG_LOG as libc::c_ulong;
810 }
811
812 match Self::apply_bpf_program_with_flags(bpf_prog, flags) {
813 Ok(()) => Ok(()),
814 Err(err)
815 if log_denied
816 && err.raw_os_error() == Some(libc::EINVAL)
817 && libc::SECCOMP_FILTER_FLAG_LOG != 0 =>
818 {
819 warn!(
820 "Kernel rejected SECCOMP_FILTER_FLAG_LOG; continuing with seccomp \
821 enforcement without deny logging"
822 );
823 Self::apply_bpf_program_with_flags(bpf_prog, 0)?;
824 Ok(())
825 }
826 Err(err) => Err(NucleusError::SeccompError(format!(
827 "Failed to apply seccomp filter: {}",
828 err
829 ))),
830 }
831 }
832
833 fn apply_bpf_program_with_flags(
834 bpf_prog: &BpfProgram,
835 flags: libc::c_ulong,
836 ) -> std::io::Result<()> {
837 let rc = unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
840 if rc != 0 {
841 return Err(std::io::Error::last_os_error());
842 }
843
844 let prog = libc::sock_fprog {
845 len: bpf_prog.len() as u16,
846 filter: bpf_prog.as_ptr() as *mut libc::sock_filter,
847 };
848
849 let rc = unsafe {
852 libc::syscall(
853 libc::SYS_seccomp,
854 libc::SECCOMP_SET_MODE_FILTER,
855 flags,
856 &prog as *const libc::sock_fprog,
857 )
858 };
859
860 if rc < 0 {
861 return Err(std::io::Error::last_os_error());
862 }
863
864 Ok(())
865 }
866}
867
868use crate::security::seccomp_generate::SeccompProfile;
870
871fn syscall_name_to_number(name: &str) -> Option<i64> {
875 match name {
876 "read" => Some(libc::SYS_read),
878 "write" => Some(libc::SYS_write),
879 #[cfg(target_arch = "x86_64")]
880 "open" => Some(libc::SYS_open),
881 "openat" => Some(libc::SYS_openat),
882 "close" => Some(libc::SYS_close),
883 #[cfg(target_arch = "x86_64")]
884 "stat" => Some(libc::SYS_stat),
885 "fstat" => Some(libc::SYS_fstat),
886 #[cfg(target_arch = "x86_64")]
887 "lstat" => Some(libc::SYS_lstat),
888 "lseek" => Some(libc::SYS_lseek),
889 #[cfg(target_arch = "x86_64")]
890 "access" => Some(libc::SYS_access),
891 "fcntl" => Some(libc::SYS_fcntl),
892 "readv" => Some(libc::SYS_readv),
893 "writev" => Some(libc::SYS_writev),
894 "pread64" => Some(libc::SYS_pread64),
895 "pwrite64" => Some(libc::SYS_pwrite64),
896 #[cfg(target_arch = "x86_64")]
897 "readlink" => Some(libc::SYS_readlink),
898 "readlinkat" => Some(libc::SYS_readlinkat),
899 "newfstatat" => Some(libc::SYS_newfstatat),
900 "statx" => Some(libc::SYS_statx),
901 "faccessat" => Some(libc::SYS_faccessat),
902 "faccessat2" => Some(libc::SYS_faccessat2),
903 "dup" => Some(libc::SYS_dup),
904 #[cfg(target_arch = "x86_64")]
905 "dup2" => Some(libc::SYS_dup2),
906 "dup3" => Some(libc::SYS_dup3),
907 #[cfg(target_arch = "x86_64")]
908 "pipe" => Some(libc::SYS_pipe),
909 "pipe2" => Some(libc::SYS_pipe2),
910 #[cfg(target_arch = "x86_64")]
911 "unlink" => Some(libc::SYS_unlink),
912 "unlinkat" => Some(libc::SYS_unlinkat),
913 #[cfg(target_arch = "x86_64")]
914 "rename" => Some(libc::SYS_rename),
915 "renameat" => Some(libc::SYS_renameat),
916 "renameat2" => Some(libc::SYS_renameat2),
917 #[cfg(target_arch = "x86_64")]
918 "link" => Some(libc::SYS_link),
919 "linkat" => Some(libc::SYS_linkat),
920 #[cfg(target_arch = "x86_64")]
921 "symlink" => Some(libc::SYS_symlink),
922 "symlinkat" => Some(libc::SYS_symlinkat),
923 #[cfg(target_arch = "x86_64")]
924 "chmod" => Some(libc::SYS_chmod),
925 "fchmod" => Some(libc::SYS_fchmod),
926 "fchmodat" => Some(libc::SYS_fchmodat),
927 "truncate" => Some(libc::SYS_truncate),
928 "ftruncate" => Some(libc::SYS_ftruncate),
929 "fallocate" => Some(libc::SYS_fallocate),
930 #[cfg(target_arch = "x86_64")]
931 "fadvise64" => Some(libc::SYS_fadvise64),
932 "fsync" => Some(libc::SYS_fsync),
933 "fdatasync" => Some(libc::SYS_fdatasync),
934 "flock" => Some(libc::SYS_flock),
935 #[cfg(target_arch = "x86_64")]
936 "sendfile" => Some(libc::SYS_sendfile),
937 "copy_file_range" => Some(libc::SYS_copy_file_range),
938 "splice" => Some(libc::SYS_splice),
939 "tee" => Some(libc::SYS_tee),
940 "mmap" => Some(libc::SYS_mmap),
942 "munmap" => Some(libc::SYS_munmap),
943 "mprotect" => Some(libc::SYS_mprotect),
944 "brk" => Some(libc::SYS_brk),
945 "mremap" => Some(libc::SYS_mremap),
946 "madvise" => Some(libc::SYS_madvise),
947 "msync" => Some(libc::SYS_msync),
948 "mlock" => Some(libc::SYS_mlock),
949 "munlock" => Some(libc::SYS_munlock),
950 #[cfg(target_arch = "x86_64")]
952 "fork" => Some(libc::SYS_fork),
953 "clone" => Some(libc::SYS_clone),
954 "clone3" => Some(libc::SYS_clone3),
955 "execve" => Some(libc::SYS_execve),
956 "execveat" => Some(libc::SYS_execveat),
957 "wait4" => Some(libc::SYS_wait4),
958 "waitid" => Some(libc::SYS_waitid),
959 "exit" => Some(libc::SYS_exit),
960 "exit_group" => Some(libc::SYS_exit_group),
961 "getpid" => Some(libc::SYS_getpid),
962 "gettid" => Some(libc::SYS_gettid),
963 "getuid" => Some(libc::SYS_getuid),
964 "getgid" => Some(libc::SYS_getgid),
965 "geteuid" => Some(libc::SYS_geteuid),
966 "getegid" => Some(libc::SYS_getegid),
967 "getppid" => Some(libc::SYS_getppid),
968 #[cfg(target_arch = "x86_64")]
969 "getpgrp" => Some(libc::SYS_getpgrp),
970 "setsid" => Some(libc::SYS_setsid),
971 "getgroups" => Some(libc::SYS_getgroups),
972 "rt_sigaction" => Some(libc::SYS_rt_sigaction),
974 "rt_sigprocmask" => Some(libc::SYS_rt_sigprocmask),
975 "rt_sigreturn" => Some(libc::SYS_rt_sigreturn),
976 "rt_sigsuspend" => Some(libc::SYS_rt_sigsuspend),
977 "sigaltstack" => Some(libc::SYS_sigaltstack),
978 "kill" => Some(libc::SYS_kill),
979 "tgkill" => Some(libc::SYS_tgkill),
980 "clock_gettime" => Some(libc::SYS_clock_gettime),
982 "clock_getres" => Some(libc::SYS_clock_getres),
983 "clock_nanosleep" => Some(libc::SYS_clock_nanosleep),
984 "gettimeofday" => Some(libc::SYS_gettimeofday),
985 "nanosleep" => Some(libc::SYS_nanosleep),
986 "getcwd" => Some(libc::SYS_getcwd),
988 "chdir" => Some(libc::SYS_chdir),
989 "fchdir" => Some(libc::SYS_fchdir),
990 #[cfg(target_arch = "x86_64")]
991 "mkdir" => Some(libc::SYS_mkdir),
992 "mkdirat" => Some(libc::SYS_mkdirat),
993 #[cfg(target_arch = "x86_64")]
994 "rmdir" => Some(libc::SYS_rmdir),
995 #[cfg(target_arch = "x86_64")]
996 "getdents" => Some(libc::SYS_getdents),
997 "getdents64" => Some(libc::SYS_getdents64),
998 "socket" => Some(libc::SYS_socket),
1000 "connect" => Some(libc::SYS_connect),
1001 "sendto" => Some(libc::SYS_sendto),
1002 "recvfrom" => Some(libc::SYS_recvfrom),
1003 "sendmsg" => Some(libc::SYS_sendmsg),
1004 "recvmsg" => Some(libc::SYS_recvmsg),
1005 "shutdown" => Some(libc::SYS_shutdown),
1006 "bind" => Some(libc::SYS_bind),
1007 "listen" => Some(libc::SYS_listen),
1008 "accept" => Some(libc::SYS_accept),
1009 "accept4" => Some(libc::SYS_accept4),
1010 "setsockopt" => Some(libc::SYS_setsockopt),
1011 "getsockopt" => Some(libc::SYS_getsockopt),
1012 "getsockname" => Some(libc::SYS_getsockname),
1013 "getpeername" => Some(libc::SYS_getpeername),
1014 "socketpair" => Some(libc::SYS_socketpair),
1015 #[cfg(target_arch = "x86_64")]
1017 "poll" => Some(libc::SYS_poll),
1018 "ppoll" => Some(libc::SYS_ppoll),
1019 #[cfg(target_arch = "x86_64")]
1020 "select" => Some(libc::SYS_select),
1021 "pselect6" => Some(libc::SYS_pselect6),
1022 #[cfg(target_arch = "x86_64")]
1023 "epoll_create" => Some(libc::SYS_epoll_create),
1024 "epoll_create1" => Some(libc::SYS_epoll_create1),
1025 "epoll_ctl" => Some(libc::SYS_epoll_ctl),
1026 #[cfg(target_arch = "x86_64")]
1027 "epoll_wait" => Some(libc::SYS_epoll_wait),
1028 "epoll_pwait" => Some(libc::SYS_epoll_pwait),
1029 #[cfg(target_arch = "x86_64")]
1030 "eventfd" => Some(libc::SYS_eventfd),
1031 "eventfd2" => Some(libc::SYS_eventfd2),
1032 #[cfg(target_arch = "x86_64")]
1033 "signalfd" => Some(libc::SYS_signalfd),
1034 "signalfd4" => Some(libc::SYS_signalfd4),
1035 "timerfd_create" => Some(libc::SYS_timerfd_create),
1036 "timerfd_settime" => Some(libc::SYS_timerfd_settime),
1037 "timerfd_gettime" => Some(libc::SYS_timerfd_gettime),
1038 "uname" => Some(libc::SYS_uname),
1040 "getrandom" => Some(libc::SYS_getrandom),
1041 "futex" => Some(libc::SYS_futex),
1042 "set_tid_address" => Some(libc::SYS_set_tid_address),
1043 "set_robust_list" => Some(libc::SYS_set_robust_list),
1044 "get_robust_list" => Some(libc::SYS_get_robust_list),
1045 #[cfg(target_arch = "x86_64")]
1046 "arch_prctl" => Some(libc::SYS_arch_prctl),
1047 "sysinfo" => Some(libc::SYS_sysinfo),
1048 "umask" => Some(libc::SYS_umask),
1049 #[cfg(target_arch = "x86_64")]
1050 "getrlimit" => Some(libc::SYS_getrlimit),
1051 "prlimit64" => Some(libc::SYS_prlimit64),
1052 "getrusage" => Some(libc::SYS_getrusage),
1053 "times" => Some(libc::SYS_times),
1054 "sched_yield" => Some(libc::SYS_sched_yield),
1055 "sched_getaffinity" => Some(libc::SYS_sched_getaffinity),
1056 "getcpu" => Some(libc::SYS_getcpu),
1057 "rseq" => Some(libc::SYS_rseq),
1058 "close_range" => Some(libc::SYS_close_range),
1059 "memfd_create" => Some(libc::SYS_memfd_create),
1060 "ioctl" => Some(libc::SYS_ioctl),
1061 "prctl" => Some(libc::SYS_prctl),
1062 "landlock_create_ruleset" => Some(libc::SYS_landlock_create_ruleset),
1064 "landlock_add_rule" => Some(libc::SYS_landlock_add_rule),
1065 "landlock_restrict_self" => Some(libc::SYS_landlock_restrict_self),
1066 _ => None,
1067 }
1068}
1069
1070impl Default for SeccompManager {
1071 fn default() -> Self {
1072 Self::new()
1073 }
1074}
1075
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed manager reports that no filter is installed.
    #[test]
    fn test_seccomp_manager_initial_state() {
        let manager = SeccompManager::new();
        assert!(!manager.is_applied());
    }

    /// Only checks the pre-application state; no filter is installed by this test.
    #[test]
    fn test_apply_idempotent() {
        let manager = SeccompManager::new();
        assert!(!manager.is_applied());
    }

    #[test]
    fn test_clone_denied_flags_include_newcgroup() {
        let newcgroup = libc::CLONE_NEWCGROUP as u64;
        assert_ne!(DENIED_CLONE_NAMESPACE_FLAGS & newcgroup, 0);
    }

    #[test]
    fn test_clone_denied_flags_include_newtime() {
        let newtime = libc::CLONE_NEWTIME as u64;
        assert_ne!(
            DENIED_CLONE_NAMESPACE_FLAGS & newtime,
            0,
            "CLONE_NEWTIME must be in denied clone namespace flags"
        );
    }

    #[test]
    fn test_network_none_socket_domains_are_unix_only() {
        assert_eq!(
            SeccompManager::allowed_socket_domains(false),
            vec![libc::AF_UNIX]
        );
    }

    #[test]
    fn test_network_enabled_socket_domains_exclude_netlink() {
        let domains = SeccompManager::allowed_socket_domains(true);
        for expected in [libc::AF_UNIX, libc::AF_INET, libc::AF_INET6] {
            assert!(domains.contains(&expected));
        }
        assert!(!domains.contains(&libc::AF_NETLINK));
    }

    #[test]
    fn test_network_mode_syscalls_only_enabled_when_network_allowed() {
        assert!(SeccompManager::network_mode_syscalls(false).is_empty());

        let with_network = SeccompManager::network_mode_syscalls(true);
        for expected in [
            libc::SYS_connect,
            libc::SYS_bind,
            libc::SYS_listen,
            libc::SYS_accept,
            libc::SYS_setsockopt,
        ] {
            assert!(with_network.contains(&expected));
        }
    }

    #[test]
    fn test_landlock_bootstrap_syscalls_present_in_base_allowlist() {
        let base = SeccompManager::base_allowed_syscalls();
        for expected in [
            libc::SYS_landlock_create_ruleset,
            libc::SYS_landlock_add_rule,
            libc::SYS_landlock_restrict_self,
        ] {
            assert!(base.contains(&expected));
        }
    }

    /// Numbers 512..=547 must not appear in either allowlist.
    #[test]
    fn test_x32_legacy_range_not_allowlisted() {
        let base = SeccompManager::base_allowed_syscalls();
        let net = SeccompManager::network_mode_syscalls(true);
        for nr in 512_i64..=547_i64 {
            let listed = base.contains(&nr) || net.contains(&nr);
            assert!(
                !listed,
                "x32 syscall number {} unexpectedly allowlisted",
                nr
            );
        }
    }

    /// Numbers 359..=373 must not appear in either allowlist.
    #[test]
    fn test_i386_compat_socketcall_range_not_allowlisted() {
        let base = SeccompManager::base_allowed_syscalls();
        let net = SeccompManager::network_mode_syscalls(true);
        for nr in 359_i64..=373_i64 {
            let listed = base.contains(&nr) || net.contains(&nr);
            assert!(
                !listed,
                "i386 compat syscall number {} unexpectedly allowlisted",
                nr
            );
        }
    }

    /// Pins the allowlist sizes; the `8` is the number of syscalls that
    /// `minimal_filter` inserts separately from the two lists.
    #[test]
    fn test_minimal_filter_allowlist_counts_are_stable() {
        let base_count = SeccompManager::base_allowed_syscalls().len();
        let net_count = SeccompManager::network_mode_syscalls(true).len();

        assert_eq!(base_count, 129);
        assert_eq!(net_count, 11);
        assert_eq!(base_count + 8, 137);
        assert_eq!(base_count + net_count + 8, 148);
    }

    #[test]
    fn test_arg_filtered_syscalls_list_includes_critical_syscalls() {
        for name in ["clone", "clone3", "execveat", "ioctl", "prctl", "socket"] {
            assert!(
                SeccompManager::ARG_FILTERED_SYSCALLS.contains(&name),
                "'{}' must be in ARG_FILTERED_SYSCALLS",
                name
            );
        }
    }

    #[test]
    fn test_clone3_allowed_in_minimal_filter() {
        let rules = SeccompManager::minimal_filter(true).unwrap();
        let has_clone3 = rules.contains_key(&libc::SYS_clone3);
        assert!(
            has_clone3,
            "clone3 must be in the seccomp allowlist (glibc 2.34+ requires it)"
        );
    }

    #[test]
    fn test_clone_is_allowed_with_arg_filter() {
        let rules = SeccompManager::minimal_filter(true).unwrap();
        let has_clone = rules.contains_key(&libc::SYS_clone);
        assert!(
            has_clone,
            "clone must be in the seccomp allowlist with arg filters"
        );
    }

    #[test]
    fn test_high_risk_syscalls_removed_from_base_allowlist() {
        let base = SeccompManager::base_allowed_syscalls();
        let must_be_absent = [
            libc::SYS_chown,
            libc::SYS_fchown,
            libc::SYS_lchown,
            libc::SYS_fchownat,
            libc::SYS_sync,
            libc::SYS_syncfs,
            libc::SYS_mlock,
            libc::SYS_munlock,
            libc::SYS_mincore,
            libc::SYS_vfork,
            libc::SYS_tkill,
        ];

        for syscall in must_be_absent {
            assert!(
                !base.contains(&syscall),
                "syscall {} unexpectedly present in base allowlist",
                syscall
            );
        }
    }

    /// Every name in `ARG_FILTERED_SYSCALLS` except `clone3` must carry at least
    /// one built-in rule.
    #[test]
    fn test_custom_profile_preserves_clone_arg_filters() {
        let rules = SeccompManager::minimal_filter(true).unwrap();

        let filtered_names = SeccompManager::ARG_FILTERED_SYSCALLS
            .iter()
            .filter(|name| **name != "clone3");
        for name in filtered_names {
            let Some(nr) = syscall_name_to_number(name) else {
                continue;
            };
            let has_arg_rules = matches!(rules.get(&nr), Some(entry) if !entry.is_empty());
            assert!(
                has_arg_rules,
                "built-in filter must have argument-level rules for '{}' \
                 so apply_profile_from_file can merge them into custom profiles",
                name
            );
        }
    }

    #[test]
    fn test_memfd_create_not_in_default_allowlist() {
        let in_base = SeccompManager::base_allowed_syscalls().contains(&libc::SYS_memfd_create);
        assert!(
            !in_base,
            "memfd_create must not be in the default seccomp allowlist (fileless exec risk)"
        );

        let in_rules = SeccompManager::minimal_filter(true)
            .unwrap()
            .contains_key(&libc::SYS_memfd_create);
        assert!(
            !in_rules,
            "memfd_create must not be in the compiled seccomp filter rules"
        );
    }

    #[test]
    fn test_mprotect_has_arg_filtering() {
        let in_base = SeccompManager::base_allowed_syscalls().contains(&libc::SYS_mprotect);
        assert!(
            !in_base,
            "SYS_mprotect must not be unconditionally allowed - needs arg filtering"
        );

        let rules = SeccompManager::minimal_filter(true).unwrap();
        let mprotect_rules = rules
            .get(&libc::SYS_mprotect)
            .expect("mprotect must be present in the seccomp filter rules");
        assert!(
            !mprotect_rules.is_empty(),
            "mprotect must have argument-level conditions to prevent W^X violations"
        );
    }

    /// Scans the text of `seccomp.rs` and requires a marker comment within the
    /// 200 bytes that precede every occurrence of the needle.
    #[test]
    fn test_unsafe_blocks_have_safety_comments() {
        let source = include_str!("seccomp.rs");
        // Each match must be preceded (within 200 bytes) by a "SAFETY:" marker.
        let needle = "unsafe {";
        let mut cursor = 0;
        while let Some(offset) = source[cursor..].find(needle) {
            let abs_idx = cursor + offset;
            let window_start = abs_idx.saturating_sub(200);
            let preceding = &source[window_start..abs_idx];
            assert!(
                preceding.contains("SAFETY:"),
                "unsafe block at byte {} must have a // SAFETY: comment. Context: ...{}...",
                abs_idx,
                &source[abs_idx.saturating_sub(80)..abs_idx + 10]
            );
            cursor = abs_idx + 1;
        }
    }

    /// Reference model of the mprotect rule set built by `minimal_filter`: the
    /// WRITE|EXEC bits of `prot` must be none, WRITE only or EXEC only.
    fn mprotect_would_allow(prot: u64) -> bool {
        let write = libc::PROT_WRITE as u64;
        let exec = libc::PROT_EXEC as u64;
        let wx_bits = prot & (libc::PROT_WRITE | libc::PROT_EXEC) as u64;
        [0, write, exec].contains(&wx_bits)
    }

    #[test]
    fn test_mprotect_allows_prot_none() {
        assert!(mprotect_would_allow(0), "PROT_NONE must be allowed");
    }

    #[test]
    fn test_mprotect_allows_prot_read_only() {
        let prot = libc::PROT_READ as u64;
        assert!(
            mprotect_would_allow(prot),
            "PROT_READ must be allowed (W|X bits are 0)"
        );
    }

    #[test]
    fn test_mprotect_allows_prot_read_write() {
        let prot = (libc::PROT_READ | libc::PROT_WRITE) as u64;
        assert!(
            mprotect_would_allow(prot),
            "PROT_READ|PROT_WRITE must be allowed"
        );
    }

    #[test]
    fn test_mprotect_allows_prot_read_exec() {
        let prot = (libc::PROT_READ | libc::PROT_EXEC) as u64;
        assert!(
            mprotect_would_allow(prot),
            "PROT_READ|PROT_EXEC must be allowed"
        );
    }

    #[test]
    fn test_mprotect_rejects_prot_write_exec() {
        let prot = (libc::PROT_WRITE | libc::PROT_EXEC) as u64;
        assert!(
            !mprotect_would_allow(prot),
            "PROT_WRITE|PROT_EXEC (W^X violation) must be REJECTED"
        );
    }

    #[test]
    fn test_mprotect_rejects_prot_read_write_exec() {
        let prot = (libc::PROT_READ | libc::PROT_WRITE | libc::PROT_EXEC) as u64;
        assert!(
            !mprotect_would_allow(prot),
            "PROT_READ|PROT_WRITE|PROT_EXEC (W^X violation) must be REJECTED"
        );
    }

    #[test]
    fn test_mprotect_allows_prot_write_alone() {
        let prot = libc::PROT_WRITE as u64;
        assert!(
            mprotect_would_allow(prot),
            "PROT_WRITE alone must be allowed"
        );
    }

    #[test]
    fn test_mprotect_allows_prot_exec_alone() {
        let prot = libc::PROT_EXEC as u64;
        assert!(
            mprotect_would_allow(prot),
            "PROT_EXEC alone must be allowed"
        );
    }
}