1use crate::error::{NucleusError, Result};
2use crate::security::policy::sha256_hex;
3use seccompiler::{BpfProgram, SeccompAction, SeccompCondition, SeccompFilter, SeccompRule};
4use std::collections::BTreeMap;
5use std::path::Path;
6use tracing::{debug, info, warn};
7
/// Builds and installs seccomp-BPF syscall filters.
///
/// Remembers whether a filter has already been installed through this
/// manager so the `apply_*` methods can return early on repeated calls.
pub struct SeccompManager {
    /// `true` once one of the `apply_*` methods has installed a filter.
    applied: bool,
}
15
/// Bitmask of the `CLONE_NEW*` namespace-creation flags.
///
/// Used as the mask of the `clone` argument filter: the built-in filter only
/// allows `clone` when none of these bits are set in its first argument.
const DENIED_CLONE_NAMESPACE_FLAGS: u64 = (libc::CLONE_NEWUSER
    | libc::CLONE_NEWNS
    | libc::CLONE_NEWNET
    | libc::CLONE_NEWIPC
    | libc::CLONE_NEWUTS
    | libc::CLONE_NEWPID
    | libc::CLONE_NEWCGROUP
    | libc::CLONE_NEWTIME) as u64;
24
25impl SeccompManager {
26 pub fn new() -> Self {
27 Self { applied: false }
28 }
29
30 fn base_allowed_syscalls() -> Vec<i64> {
31 let mut syscalls = vec![
32 libc::SYS_read,
34 libc::SYS_write,
35 libc::SYS_openat,
36 libc::SYS_close,
37 libc::SYS_fstat,
38 libc::SYS_lseek,
39 libc::SYS_fcntl,
40 libc::SYS_readv,
41 libc::SYS_writev,
42 libc::SYS_preadv,
43 libc::SYS_pwritev,
44 libc::SYS_preadv2,
45 libc::SYS_pwritev2,
46 libc::SYS_pread64,
47 libc::SYS_pwrite64,
48 libc::SYS_readlinkat,
49 libc::SYS_newfstatat,
50 libc::SYS_statx,
51 libc::SYS_faccessat,
52 libc::SYS_faccessat2,
53 libc::SYS_dup,
54 libc::SYS_dup3,
55 libc::SYS_pipe2,
56 libc::SYS_unlinkat,
57 libc::SYS_renameat,
58 libc::SYS_renameat2,
59 libc::SYS_linkat,
60 libc::SYS_symlinkat,
61 libc::SYS_fchmod,
62 libc::SYS_fchmodat,
63 libc::SYS_truncate,
64 libc::SYS_ftruncate,
65 libc::SYS_fallocate,
66 #[cfg(target_arch = "x86_64")]
67 libc::SYS_fadvise64,
68 libc::SYS_fsync,
69 libc::SYS_fdatasync,
70 libc::SYS_sync_file_range,
71 libc::SYS_flock,
72 libc::SYS_fstatfs,
73 libc::SYS_statfs,
74 #[cfg(target_arch = "x86_64")]
75 libc::SYS_sendfile,
76 libc::SYS_copy_file_range,
77 libc::SYS_splice,
78 libc::SYS_tee,
79 libc::SYS_mmap,
81 libc::SYS_munmap,
82 libc::SYS_brk,
83 libc::SYS_mremap,
84 libc::SYS_madvise,
85 libc::SYS_msync,
86 libc::SYS_mlock,
87 libc::SYS_munlock,
88 libc::SYS_mlock2,
89 libc::SYS_shmget,
92 libc::SYS_shmat,
93 libc::SYS_shmdt,
94 libc::SYS_shmctl,
95 libc::SYS_semget,
97 libc::SYS_semop,
98 libc::SYS_semctl,
99 libc::SYS_semtimedop,
100 libc::SYS_execve,
106 libc::SYS_wait4,
108 libc::SYS_waitid,
109 libc::SYS_exit,
110 libc::SYS_exit_group,
111 libc::SYS_getpid,
112 libc::SYS_gettid,
113 libc::SYS_getuid,
114 libc::SYS_getgid,
115 libc::SYS_geteuid,
116 libc::SYS_getegid,
117 libc::SYS_getppid,
118 libc::SYS_setsid,
119 libc::SYS_getgroups,
120 libc::SYS_rt_sigaction,
122 libc::SYS_rt_sigprocmask,
123 libc::SYS_rt_sigreturn,
124 libc::SYS_rt_sigsuspend,
125 libc::SYS_rt_sigtimedwait,
126 libc::SYS_rt_sigpending,
127 libc::SYS_rt_sigqueueinfo,
128 libc::SYS_sigaltstack,
129 libc::SYS_restart_syscall,
130 libc::SYS_kill,
134 libc::SYS_tgkill,
135 libc::SYS_clock_gettime,
137 libc::SYS_clock_getres,
138 libc::SYS_clock_nanosleep,
139 libc::SYS_gettimeofday,
140 libc::SYS_nanosleep,
141 libc::SYS_setitimer,
142 libc::SYS_getitimer,
143 libc::SYS_getcwd,
145 libc::SYS_chdir,
146 libc::SYS_fchdir,
147 libc::SYS_mkdirat,
148 libc::SYS_getdents64,
149 libc::SYS_uname,
151 libc::SYS_getrandom,
152 libc::SYS_futex,
153 libc::SYS_set_tid_address,
154 libc::SYS_set_robust_list,
155 libc::SYS_get_robust_list,
156 libc::SYS_umask,
159 libc::SYS_getrusage,
161 libc::SYS_times,
162 libc::SYS_sched_yield,
163 libc::SYS_sched_getaffinity,
164 libc::SYS_sched_setaffinity,
165 libc::SYS_sched_getparam,
166 libc::SYS_sched_getscheduler,
167 libc::SYS_getcpu,
168 libc::SYS_getxattr,
170 libc::SYS_lgetxattr,
171 libc::SYS_fgetxattr,
172 libc::SYS_listxattr,
173 libc::SYS_llistxattr,
174 libc::SYS_flistxattr,
175 libc::SYS_rseq,
176 libc::SYS_close_range,
177 libc::SYS_fchown,
180 libc::SYS_fchownat,
181 libc::SYS_io_setup,
184 libc::SYS_io_destroy,
185 libc::SYS_io_submit,
186 libc::SYS_io_getevents,
187 libc::SYS_setpgid,
193 libc::SYS_getpgid,
194 libc::SYS_landlock_create_ruleset,
198 libc::SYS_landlock_add_rule,
199 libc::SYS_landlock_restrict_self,
200 libc::SYS_getsockname,
202 libc::SYS_getpeername,
203 libc::SYS_socketpair,
204 libc::SYS_getsockopt,
205 libc::SYS_ppoll,
207 libc::SYS_pselect6,
208 libc::SYS_epoll_create1,
209 libc::SYS_epoll_ctl,
210 libc::SYS_epoll_pwait,
211 libc::SYS_eventfd2,
212 libc::SYS_signalfd4,
213 libc::SYS_timerfd_create,
214 libc::SYS_timerfd_settime,
215 libc::SYS_timerfd_gettime,
216 ];
217
218 #[cfg(target_arch = "x86_64")]
220 syscalls.extend_from_slice(&[
221 libc::SYS_open,
222 libc::SYS_stat,
223 libc::SYS_lstat,
224 libc::SYS_access,
225 libc::SYS_readlink,
226 libc::SYS_dup2,
227 libc::SYS_pipe,
228 libc::SYS_unlink,
229 libc::SYS_rename,
230 libc::SYS_link,
231 libc::SYS_symlink,
232 libc::SYS_chmod,
233 libc::SYS_mkdir,
234 libc::SYS_rmdir,
235 libc::SYS_getdents,
236 libc::SYS_getpgrp,
237 libc::SYS_chown,
238 libc::SYS_fchown,
239 libc::SYS_lchown,
240 libc::SYS_arch_prctl,
241 libc::SYS_getrlimit,
242 libc::SYS_poll,
243 libc::SYS_select,
244 libc::SYS_epoll_create,
245 libc::SYS_epoll_wait,
246 libc::SYS_eventfd,
247 libc::SYS_signalfd,
248 ]);
249
250 syscalls
251 }
252
253 fn allowed_socket_domains(allow_network: bool) -> Vec<i32> {
254 if allow_network {
255 vec![libc::AF_UNIX, libc::AF_INET, libc::AF_INET6]
256 } else {
257 vec![libc::AF_UNIX]
258 }
259 }
260
261 fn network_mode_syscalls(allow_network: bool) -> Vec<i64> {
262 if allow_network {
263 vec![
264 libc::SYS_connect,
265 libc::SYS_sendto,
266 libc::SYS_recvfrom,
267 libc::SYS_sendmsg,
268 libc::SYS_recvmsg,
269 libc::SYS_shutdown,
270 libc::SYS_bind,
271 libc::SYS_listen,
272 libc::SYS_accept,
273 libc::SYS_accept4,
274 libc::SYS_setsockopt,
275 ]
276 } else {
277 Vec::new()
278 }
279 }
280
281 fn minimal_filter(
291 allow_network: bool,
292 extra_syscalls: &[String],
293 ) -> Result<BTreeMap<i64, Vec<SeccompRule>>> {
294 let mut rules: BTreeMap<i64, Vec<SeccompRule>> = BTreeMap::new();
295
296 let allowed_syscalls = Self::base_allowed_syscalls();
298
299 for syscall in allowed_syscalls {
301 rules.insert(syscall, Vec::new());
302 }
303
304 for syscall in Self::network_mode_syscalls(allow_network) {
306 rules.insert(syscall, Vec::new());
307 }
308
309 for name in extra_syscalls {
315 if let Some(nr) = syscall_name_to_number(name) {
316 if rules.contains_key(&nr) {
317 } else if Self::OPT_IN_SYSCALLS.contains(&name.as_str()) {
319 rules.insert(nr, Vec::new());
320 } else {
321 warn!(
322 "--seccomp-allow: syscall '{}' is not in the opt-in allowlist – blocked",
323 name
324 );
325 }
326 } else {
327 warn!("--seccomp-allow: unknown syscall '{}' – blocked", name);
328 }
329 }
330
331 let mut socket_rules = Vec::new();
334 for domain in Self::allowed_socket_domains(allow_network) {
335 let condition = SeccompCondition::new(
336 0, seccompiler::SeccompCmpArgLen::Dword,
338 seccompiler::SeccompCmpOp::Eq,
339 domain as u64,
340 )
341 .map_err(|e| {
342 NucleusError::SeccompError(format!(
343 "Failed to create socket domain condition: {}",
344 e
345 ))
346 })?;
347 let rule = SeccompRule::new(vec![condition]).map_err(|e| {
348 NucleusError::SeccompError(format!("Failed to create socket rule: {}", e))
349 })?;
350 socket_rules.push(rule);
351 }
352 rules.insert(libc::SYS_socket, socket_rules);
353
354 let ioctl_allowed: &[u64] = &[
356 0x5401, 0x5402, 0x5403, 0x5404, 0x540B, 0x540F, 0x5410, 0x5413, 0x5429, 0x541B, 0x5421, 0x5451, 0x5450, ];
372 let mut ioctl_rules = Vec::new();
373 for &request in ioctl_allowed {
374 let condition = SeccompCondition::new(
375 1, seccompiler::SeccompCmpArgLen::Dword,
377 seccompiler::SeccompCmpOp::Eq,
378 request,
379 )
380 .map_err(|e| {
381 NucleusError::SeccompError(format!("Failed to create ioctl condition: {}", e))
382 })?;
383 let rule = SeccompRule::new(vec![condition]).map_err(|e| {
384 NucleusError::SeccompError(format!("Failed to create ioctl rule: {}", e))
385 })?;
386 ioctl_rules.push(rule);
387 }
388 rules.insert(libc::SYS_ioctl, ioctl_rules);
389
390 let prctl_allowed: &[u64] = &[
395 1, 2, 15, 16, 23, 27, 36, 37, 38, 40, 47, 39, ];
411 let mut prctl_rules = Vec::new();
412 for &option in prctl_allowed {
413 let condition = SeccompCondition::new(
414 0, seccompiler::SeccompCmpArgLen::Dword,
416 seccompiler::SeccompCmpOp::Eq,
417 option,
418 )
419 .map_err(|e| {
420 NucleusError::SeccompError(format!("Failed to create prctl condition: {}", e))
421 })?;
422 let rule = SeccompRule::new(vec![condition]).map_err(|e| {
423 NucleusError::SeccompError(format!("Failed to create prctl rule: {}", e))
424 })?;
425 prctl_rules.push(rule);
426 }
427 rules.insert(libc::SYS_prctl, prctl_rules);
428
429 let prlimit_condition = SeccompCondition::new(
432 2, seccompiler::SeccompCmpArgLen::Qword,
434 seccompiler::SeccompCmpOp::Eq,
435 0u64, )
437 .map_err(|e| {
438 NucleusError::SeccompError(format!("Failed to create prlimit64 condition: {}", e))
439 })?;
440 let prlimit_rule = SeccompRule::new(vec![prlimit_condition]).map_err(|e| {
441 NucleusError::SeccompError(format!("Failed to create prlimit64 rule: {}", e))
442 })?;
443 rules.insert(libc::SYS_prlimit64, vec![prlimit_rule]);
444
445 let mut mprotect_rules = Vec::new();
447 for allowed in [0, libc::PROT_WRITE as u64, libc::PROT_EXEC as u64] {
448 let condition = SeccompCondition::new(
449 2, seccompiler::SeccompCmpArgLen::Dword,
451 seccompiler::SeccompCmpOp::MaskedEq((libc::PROT_WRITE | libc::PROT_EXEC) as u64),
452 allowed,
453 )
454 .map_err(|e| {
455 NucleusError::SeccompError(format!("Failed to create mprotect condition: {}", e))
456 })?;
457 let rule = SeccompRule::new(vec![condition]).map_err(|e| {
458 NucleusError::SeccompError(format!("Failed to create mprotect rule: {}", e))
459 })?;
460 mprotect_rules.push(rule);
461 }
462 rules.insert(libc::SYS_mprotect, mprotect_rules);
463
464 if Self::has_effective_cap(21) {
479 return Err(NucleusError::SeccompError(
480 "SECURITY: CAP_SYS_ADMIN is still in the effective capability set. \
481 Capabilities must be dropped before installing seccomp filters \
482 (clone3 is allowed unconditionally)."
483 .to_string(),
484 ));
485 }
486 rules.insert(libc::SYS_clone3, Vec::new());
487
488 let clone_condition = SeccompCondition::new(
490 0, seccompiler::SeccompCmpArgLen::Qword,
492 seccompiler::SeccompCmpOp::MaskedEq(DENIED_CLONE_NAMESPACE_FLAGS),
493 0, )
495 .map_err(|e| {
496 NucleusError::SeccompError(format!("Failed to create clone condition: {}", e))
497 })?;
498 let clone_rule = SeccompRule::new(vec![clone_condition]).map_err(|e| {
499 NucleusError::SeccompError(format!("Failed to create clone rule: {}", e))
500 })?;
501 rules.insert(libc::SYS_clone, vec![clone_rule]);
502
503 let execveat_condition = SeccompCondition::new(
510 4, seccompiler::SeccompCmpArgLen::Dword,
512 seccompiler::SeccompCmpOp::MaskedEq(libc::AT_EMPTY_PATH as u64),
513 0, )
515 .map_err(|e| {
516 NucleusError::SeccompError(format!("Failed to create execveat condition: {}", e))
517 })?;
518 let execveat_rule = SeccompRule::new(vec![execveat_condition]).map_err(|e| {
519 NucleusError::SeccompError(format!("Failed to create execveat rule: {}", e))
520 })?;
521 rules.insert(libc::SYS_execveat, vec![execveat_rule]);
522
523 Ok(rules)
524 }
525
526 pub fn compile_minimal_filter() -> Result<BpfProgram> {
533 let rules = Self::minimal_filter(true, &[])?;
534 let target_arch = std::env::consts::ARCH.try_into().map_err(|e| {
535 NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
536 })?;
537 super::seccomp_bpf::compile_bitmap_bpf(
538 rules,
539 SeccompAction::KillProcess,
540 SeccompAction::Allow,
541 target_arch,
542 )
543 }
544
    /// Test-only accessor for the rule map built by `minimal_filter`.
    ///
    /// # Panics
    ///
    /// Panics if `minimal_filter` returns an error.
    #[cfg(test)]
    pub(crate) fn minimal_filter_for_test(
        allow_network: bool,
        extra_syscalls: &[String],
    ) -> BTreeMap<i64, Vec<SeccompRule>> {
        Self::minimal_filter(allow_network, extra_syscalls).unwrap()
    }
553
554 pub fn apply_minimal_filter(&mut self) -> Result<bool> {
562 self.apply_minimal_filter_with_mode(false, false)
563 }
564
    /// Installs the built-in filter with networking allowed and no extra
    /// syscalls.
    ///
    /// * `best_effort` - when `true`, failures are logged and reported as
    ///   `Ok(false)` instead of being returned as errors.
    /// * `log_denied` - forwarded to `apply_filter_for_network_mode`, which
    ///   uses it to request kernel logging of denied syscalls.
    pub fn apply_minimal_filter_with_mode(
        &mut self,
        best_effort: bool,
        log_denied: bool,
    ) -> Result<bool> {
        self.apply_filter_for_network_mode(true, best_effort, log_denied, &[])
    }
576
577 pub fn apply_filter_for_network_mode(
586 &mut self,
587 allow_network: bool,
588 best_effort: bool,
589 log_denied: bool,
590 extra_syscalls: &[String],
591 ) -> Result<bool> {
592 if self.applied {
593 debug!("Seccomp filter already applied, skipping");
594 return Ok(true);
595 }
596
597 info!(allow_network, "Applying seccomp filter");
598
599 let rules = match Self::minimal_filter(allow_network, extra_syscalls) {
600 Ok(r) => r,
601 Err(e) => {
602 if best_effort {
603 warn!(
604 "Failed to create seccomp rules: {} (continuing without seccomp)",
605 e
606 );
607 return Ok(false);
608 }
609 return Err(e);
610 }
611 };
612
613 let target_arch = match std::env::consts::ARCH.try_into() {
614 Ok(a) => a,
615 Err(e) => {
616 let msg = format!("Unsupported architecture: {:?}", e);
617 if best_effort {
618 warn!("{} (continuing without seccomp)", msg);
619 return Ok(false);
620 }
621 return Err(NucleusError::SeccompError(msg));
622 }
623 };
624
625 let bpf_prog: BpfProgram = match super::seccomp_bpf::compile_bitmap_bpf(
626 rules,
627 SeccompAction::KillProcess,
628 SeccompAction::Allow,
629 target_arch,
630 ) {
631 Ok(p) => p,
632 Err(e) => {
633 if best_effort {
634 warn!(
635 "Failed to compile BPF program: {} (continuing without seccomp)",
636 e
637 );
638 return Ok(false);
639 }
640 return Err(e);
641 }
642 };
643
644 match Self::apply_bpf_program(&bpf_prog, log_denied) {
646 Ok(_) => {
647 self.applied = true;
648 info!("Successfully applied seccomp filter");
649 Ok(true)
650 }
651 Err(e) => {
652 if best_effort {
653 warn!(
654 "Failed to apply seccomp filter: {} (continuing without seccomp)",
655 e
656 );
657 Ok(false)
658 } else {
659 Err(NucleusError::SeccompError(format!(
660 "Failed to apply seccomp filter: {}",
661 e
662 )))
663 }
664 }
665 }
666 }
667
668 pub fn apply_profile_from_file(
687 &mut self,
688 profile_path: &Path,
689 expected_sha256: Option<&str>,
690 audit_mode: bool,
691 ) -> Result<bool> {
692 if self.applied {
693 debug!("Seccomp filter already applied, skipping");
694 return Ok(true);
695 }
696
697 info!("Loading seccomp profile from {:?}", profile_path);
698
699 let content = std::fs::read(profile_path).map_err(|e| {
701 NucleusError::SeccompError(format!(
702 "Failed to read seccomp profile {:?}: {}",
703 profile_path, e
704 ))
705 })?;
706
707 if let Some(expected) = expected_sha256 {
709 let actual = sha256_hex(&content);
710 if actual != expected {
711 return Err(NucleusError::SeccompError(format!(
712 "Seccomp profile hash mismatch: expected {}, got {}",
713 expected, actual
714 )));
715 }
716 info!("Seccomp profile hash verified: {}", actual);
717 }
718
719 let profile: SeccompProfile = serde_json::from_slice(&content).map_err(|e| {
721 NucleusError::SeccompError(format!("Failed to parse seccomp profile: {}", e))
722 })?;
723
724 Self::warn_missing_arg_filters(&profile);
729
730 let mut rules: BTreeMap<i64, Vec<SeccompRule>> = BTreeMap::new();
732
733 for syscall_group in &profile.syscalls {
734 if syscall_group.action == "SCMP_ACT_ALLOW" {
735 for name in &syscall_group.names {
736 if let Some(nr) = syscall_name_to_number(name) {
737 rules.insert(nr, Vec::new());
738 } else {
739 warn!("Unknown syscall in profile: {} (skipping)", name);
740 }
741 }
742 }
743 }
744
745 let builtin_rules = Self::minimal_filter(true, &[])?;
750 for syscall_name in Self::ARG_FILTERED_SYSCALLS {
751 if let Some(nr) = syscall_name_to_number(syscall_name) {
752 if let std::collections::btree_map::Entry::Occupied(mut entry) = rules.entry(nr) {
753 if let Some(builtin) = builtin_rules.get(&nr) {
754 if !builtin.is_empty() {
755 info!(
756 "Merging built-in argument filters for '{}' into custom profile",
757 syscall_name
758 );
759 entry.insert(builtin.clone());
760 }
761 }
762 }
763 }
764 }
765 if !rules.contains_key(&libc::SYS_clone3) {
771 rules.insert(libc::SYS_clone3, Vec::new());
772 }
773
774 let target_arch = std::env::consts::ARCH.try_into().map_err(|e| {
775 NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
776 })?;
777
778 let bpf_prog: BpfProgram = super::seccomp_bpf::compile_bitmap_bpf(
779 rules,
780 SeccompAction::KillProcess,
781 SeccompAction::Allow,
782 target_arch,
783 )?;
784
785 match Self::apply_bpf_program(&bpf_prog, audit_mode) {
786 Ok(_) => {
787 self.applied = true;
788 info!(
789 "Seccomp profile applied from {:?} (log_denied={})",
790 profile_path, audit_mode
791 );
792 Ok(true)
793 }
794 Err(e) => Err(e),
795 }
796 }
797
798 pub fn apply_trace_filter(&mut self) -> Result<bool> {
803 if self.applied {
804 debug!("Seccomp filter already applied, skipping trace filter");
805 return Ok(true);
806 }
807
808 info!("Applying seccomp trace filter (allow-all + LOG)");
809
810 let filter = SeccompFilter::new(
814 BTreeMap::new(),
815 SeccompAction::Allow, SeccompAction::Allow, std::env::consts::ARCH.try_into().map_err(|e| {
818 NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
819 })?,
820 )
821 .map_err(|e| NucleusError::SeccompError(format!("Failed to create trace filter: {}", e)))?;
822
823 let bpf_prog: BpfProgram = filter.try_into().map_err(|e| {
824 NucleusError::SeccompError(format!("Failed to compile trace BPF: {}", e))
825 })?;
826
827 Self::apply_bpf_program(&bpf_prog, true)?;
829 self.applied = true;
830 info!("Seccomp trace filter applied (all syscalls allowed + logged)");
831 Ok(true)
832 }
833
    /// Syscalls whose handling in custom profiles is tied to the built-in
    /// argument-level rules.
    ///
    /// `apply_profile_from_file` replaces a custom profile's unconditional
    /// allow for these names with the built-in rules (when those are
    /// non-empty), and `warn_missing_arg_filters` warns when a profile allows
    /// one of them without argument filters.
    ///
    /// NOTE(review): `minimal_filter` also argument-filters `prlimit64`, which
    /// is not listed here — confirm whether that omission is intentional.
    const ARG_FILTERED_SYSCALLS: &'static [&'static str] = &[
        "clone", "clone3", "execveat", "ioctl", "mprotect", "prctl", "socket",
    ];
839
    /// Syscall names that `--seccomp-allow` may add on top of the built-in
    /// allowlist.
    ///
    /// `minimal_filter` only honours a requested extra syscall when its name
    /// appears here; any other name is skipped with a warning.
    ///
    /// NOTE(review): `sync_file_range` is also part of the base allowlist, so
    /// its entry here has no effect.
    const OPT_IN_SYSCALLS: &'static [&'static str] = &[
        // io_uring
        "io_uring_setup",
        "io_uring_enter",
        "io_uring_register",
        // System V message queues
        "msgget",
        "msgsnd",
        "msgrcv",
        "msgctl",
        // POSIX message queues
        "mq_open",
        "mq_unlink",
        "mq_timedsend",
        "mq_timedreceive",
        "mq_notify",
        "mq_getsetattr",
        // POSIX timers
        "timer_create",
        "timer_settime",
        "timer_gettime",
        "timer_getoverrun",
        "timer_delete",
        // Filesystem notification
        "inotify_init",
        "inotify_init1",
        "inotify_add_watch",
        "inotify_rm_watch",
        "fanotify_init",
        "fanotify_mark",
        // Memory management
        "mincore",
        "mlockall",
        "munlockall",
        "membarrier",
        "process_madvise",
        "mbind",
        "set_mempolicy",
        "get_mempolicy",
        "set_mempolicy_home_node",
        "pkey_mprotect",
        "pkey_alloc",
        "pkey_free",
        "cachestat",
        "remap_file_pages",
        // File and filesystem operations
        "sync",
        "syncfs",
        "sync_file_range",
        "readahead",
        "vmsplice",
        "openat2",
        "name_to_handle_at",
        "open_by_handle_at",
        "io_cancel",
        "io_pgetevents",
        "creat",
        "fchmodat2",
        "statmount",
        "listmount",
        "utimensat",
        "utimes",
        "utime",
        "futimesat",
        // Extended attributes
        "setxattr",
        "lsetxattr",
        "fsetxattr",
        "removexattr",
        "lremovexattr",
        "fremovexattr",
        "setxattrat",
        "getxattrat",
        "listxattrat",
        "removexattrat",
        // Batched socket I/O
        "recvmmsg",
        "sendmmsg",
        // Scheduling
        "sched_setparam",
        "sched_setscheduler",
        "sched_get_priority_max",
        "sched_get_priority_min",
        "sched_rr_get_interval",
        "sched_setattr",
        "sched_getattr",
        // Resource limits and priorities
        "setrlimit",
        "getpriority",
        "setpriority",
        "ioprio_set",
        "ioprio_get",
        // Process control and miscellaneous
        "vfork",
        "pause",
        "alarm",
        "tkill",
        "sysinfo",
        "personality",
        "vhangup",
        "time",
        "pidfd_open",
        "pidfd_send_signal",
        "pidfd_getfd",
        // Credentials
        "setuid",
        "setgid",
        "setreuid",
        "setregid",
        "setresuid",
        "getresuid",
        "setresgid",
        "getresgid",
        "setfsuid",
        "setfsgid",
        "setgroups",
        "getsid",
        // Capabilities
        "capget",
        // Signals
        "rt_tgsigqueueinfo",
        // Miscellaneous
        "mknod",
        "mknodat",
        "syslog",
        "clock_settime",
        "clock_adjtime",
        "adjtimex",
        "unshare",
        "kcmp",
        "epoll_pwait2",
        // futex2
        "futex_waitv",
        "futex_wake",
        "futex_wait",
        "futex_requeue",
        // seccomp
        "seccomp",
        // Kernel keyring
        "add_key",
        "request_key",
        "keyctl",
    ];
989
990 fn warn_missing_arg_filters(profile: &SeccompProfile) {
993 for group in &profile.syscalls {
994 if group.action != "SCMP_ACT_ALLOW" {
995 continue;
996 }
997 for name in &group.names {
998 if Self::ARG_FILTERED_SYSCALLS.contains(&name.as_str()) && group.args.is_empty() {
999 warn!(
1000 "Custom seccomp profile allows '{}' without argument filters. \
1001 The built-in filter restricts this syscall at the argument level. \
1002 This profile weakens security compared to the default.",
1003 name
1004 );
1005 }
1006 }
1007 }
1008 }
1009
1010 fn has_effective_cap(cap: i32) -> bool {
1013 let Ok(status) = std::fs::read_to_string("/proc/self/status") else {
1014 return true;
1016 };
1017 for line in status.lines() {
1018 if let Some(hex) = line.strip_prefix("CapEff:\t") {
1019 if let Ok(eff) = u64::from_str_radix(hex.trim(), 16) {
1020 return eff & (1u64 << cap) != 0;
1021 }
1022 }
1023 }
1024 true }
1026
    /// Returns `true` if a filter has been installed through this manager.
    pub fn is_applied(&self) -> bool {
        self.applied
    }
1031
1032 fn apply_bpf_program(bpf_prog: &BpfProgram, log_denied: bool) -> Result<()> {
1033 let mut flags: libc::c_ulong = 0;
1034 if log_denied {
1035 flags |= libc::SECCOMP_FILTER_FLAG_LOG as libc::c_ulong;
1036 }
1037
1038 match Self::apply_bpf_program_with_flags(bpf_prog, flags) {
1039 Ok(()) => Ok(()),
1040 Err(err)
1041 if log_denied
1042 && err.raw_os_error() == Some(libc::EINVAL)
1043 && libc::SECCOMP_FILTER_FLAG_LOG != 0 =>
1044 {
1045 warn!(
1046 "Kernel rejected SECCOMP_FILTER_FLAG_LOG; continuing with seccomp \
1047 enforcement without deny logging"
1048 );
1049 Self::apply_bpf_program_with_flags(bpf_prog, 0)?;
1050 Ok(())
1051 }
1052 Err(err) => Err(NucleusError::SeccompError(format!(
1053 "Failed to apply seccomp filter: {}",
1054 err
1055 ))),
1056 }
1057 }
1058
1059 fn apply_bpf_program_with_flags(
1060 bpf_prog: &BpfProgram,
1061 flags: libc::c_ulong,
1062 ) -> std::io::Result<()> {
1063 let rc = unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
1066 if rc != 0 {
1067 return Err(std::io::Error::last_os_error());
1068 }
1069
1070 let prog = libc::sock_fprog {
1071 len: bpf_prog.len() as u16,
1072 filter: bpf_prog.as_ptr() as *mut libc::sock_filter,
1073 };
1074
1075 let rc = unsafe {
1078 libc::syscall(
1079 libc::SYS_seccomp,
1080 libc::SECCOMP_SET_MODE_FILTER,
1081 flags,
1082 &prog as *const libc::sock_fprog,
1083 )
1084 };
1085
1086 if rc < 0 {
1087 return Err(std::io::Error::last_os_error());
1088 }
1089
1090 Ok(())
1091 }
1092}
1093
1094use crate::security::seccomp_generate::SeccompProfile;
1096
1097fn syscall_name_to_number(name: &str) -> Option<i64> {
1101 match name {
1102 "read" => Some(libc::SYS_read),
1104 "write" => Some(libc::SYS_write),
1105 #[cfg(target_arch = "x86_64")]
1106 "open" => Some(libc::SYS_open),
1107 "openat" => Some(libc::SYS_openat),
1108 "close" => Some(libc::SYS_close),
1109 #[cfg(target_arch = "x86_64")]
1110 "stat" => Some(libc::SYS_stat),
1111 "fstat" => Some(libc::SYS_fstat),
1112 #[cfg(target_arch = "x86_64")]
1113 "lstat" => Some(libc::SYS_lstat),
1114 "lseek" => Some(libc::SYS_lseek),
1115 #[cfg(target_arch = "x86_64")]
1116 "access" => Some(libc::SYS_access),
1117 "fcntl" => Some(libc::SYS_fcntl),
1118 "readv" => Some(libc::SYS_readv),
1119 "writev" => Some(libc::SYS_writev),
1120 "pread64" => Some(libc::SYS_pread64),
1121 "pwrite64" => Some(libc::SYS_pwrite64),
1122 #[cfg(target_arch = "x86_64")]
1123 "readlink" => Some(libc::SYS_readlink),
1124 "readlinkat" => Some(libc::SYS_readlinkat),
1125 "newfstatat" => Some(libc::SYS_newfstatat),
1126 "statx" => Some(libc::SYS_statx),
1127 "faccessat" => Some(libc::SYS_faccessat),
1128 "faccessat2" => Some(libc::SYS_faccessat2),
1129 "dup" => Some(libc::SYS_dup),
1130 #[cfg(target_arch = "x86_64")]
1131 "dup2" => Some(libc::SYS_dup2),
1132 "dup3" => Some(libc::SYS_dup3),
1133 #[cfg(target_arch = "x86_64")]
1134 "pipe" => Some(libc::SYS_pipe),
1135 "pipe2" => Some(libc::SYS_pipe2),
1136 #[cfg(target_arch = "x86_64")]
1137 "unlink" => Some(libc::SYS_unlink),
1138 "unlinkat" => Some(libc::SYS_unlinkat),
1139 #[cfg(target_arch = "x86_64")]
1140 "rename" => Some(libc::SYS_rename),
1141 "renameat" => Some(libc::SYS_renameat),
1142 "renameat2" => Some(libc::SYS_renameat2),
1143 #[cfg(target_arch = "x86_64")]
1144 "link" => Some(libc::SYS_link),
1145 "linkat" => Some(libc::SYS_linkat),
1146 #[cfg(target_arch = "x86_64")]
1147 "symlink" => Some(libc::SYS_symlink),
1148 "symlinkat" => Some(libc::SYS_symlinkat),
1149 #[cfg(target_arch = "x86_64")]
1150 "chmod" => Some(libc::SYS_chmod),
1151 "fchmod" => Some(libc::SYS_fchmod),
1152 "fchmodat" => Some(libc::SYS_fchmodat),
1153 "truncate" => Some(libc::SYS_truncate),
1154 "ftruncate" => Some(libc::SYS_ftruncate),
1155 "fallocate" => Some(libc::SYS_fallocate),
1156 #[cfg(target_arch = "x86_64")]
1157 "fadvise64" => Some(libc::SYS_fadvise64),
1158 "fsync" => Some(libc::SYS_fsync),
1159 "fdatasync" => Some(libc::SYS_fdatasync),
1160 "flock" => Some(libc::SYS_flock),
1161 #[cfg(target_arch = "x86_64")]
1162 "sendfile" => Some(libc::SYS_sendfile),
1163 "copy_file_range" => Some(libc::SYS_copy_file_range),
1164 "splice" => Some(libc::SYS_splice),
1165 "tee" => Some(libc::SYS_tee),
1166 "mmap" => Some(libc::SYS_mmap),
1168 "munmap" => Some(libc::SYS_munmap),
1169 "mprotect" => Some(libc::SYS_mprotect),
1170 "brk" => Some(libc::SYS_brk),
1171 "mremap" => Some(libc::SYS_mremap),
1172 "madvise" => Some(libc::SYS_madvise),
1173 "msync" => Some(libc::SYS_msync),
1174 "mlock" => Some(libc::SYS_mlock),
1175 "mlock2" => Some(libc::SYS_mlock2),
1176 "munlock" => Some(libc::SYS_munlock),
1177 "shmget" => Some(libc::SYS_shmget),
1179 "shmat" => Some(libc::SYS_shmat),
1180 "shmdt" => Some(libc::SYS_shmdt),
1181 "shmctl" => Some(libc::SYS_shmctl),
1182 "semget" => Some(libc::SYS_semget),
1183 "semop" => Some(libc::SYS_semop),
1184 "semctl" => Some(libc::SYS_semctl),
1185 "semtimedop" => Some(libc::SYS_semtimedop),
1186 #[cfg(target_arch = "x86_64")]
1188 "fork" => Some(libc::SYS_fork),
1189 "clone" => Some(libc::SYS_clone),
1190 "clone3" => Some(libc::SYS_clone3),
1191 "execve" => Some(libc::SYS_execve),
1192 "execveat" => Some(libc::SYS_execveat),
1193 "wait4" => Some(libc::SYS_wait4),
1194 "waitid" => Some(libc::SYS_waitid),
1195 "exit" => Some(libc::SYS_exit),
1196 "exit_group" => Some(libc::SYS_exit_group),
1197 "getpid" => Some(libc::SYS_getpid),
1198 "gettid" => Some(libc::SYS_gettid),
1199 "getuid" => Some(libc::SYS_getuid),
1200 "getgid" => Some(libc::SYS_getgid),
1201 "geteuid" => Some(libc::SYS_geteuid),
1202 "getegid" => Some(libc::SYS_getegid),
1203 "getppid" => Some(libc::SYS_getppid),
1204 #[cfg(target_arch = "x86_64")]
1205 "getpgrp" => Some(libc::SYS_getpgrp),
1206 "setsid" => Some(libc::SYS_setsid),
1207 "getgroups" => Some(libc::SYS_getgroups),
1208 "rt_sigaction" => Some(libc::SYS_rt_sigaction),
1210 "rt_sigprocmask" => Some(libc::SYS_rt_sigprocmask),
1211 "rt_sigreturn" => Some(libc::SYS_rt_sigreturn),
1212 "rt_sigsuspend" => Some(libc::SYS_rt_sigsuspend),
1213 "rt_sigtimedwait" => Some(libc::SYS_rt_sigtimedwait),
1214 "rt_sigpending" => Some(libc::SYS_rt_sigpending),
1215 "rt_sigqueueinfo" => Some(libc::SYS_rt_sigqueueinfo),
1216 "sigaltstack" => Some(libc::SYS_sigaltstack),
1217 "restart_syscall" => Some(libc::SYS_restart_syscall),
1218 "kill" => Some(libc::SYS_kill),
1219 "tgkill" => Some(libc::SYS_tgkill),
1220 "clock_gettime" => Some(libc::SYS_clock_gettime),
1222 "clock_getres" => Some(libc::SYS_clock_getres),
1223 "clock_nanosleep" => Some(libc::SYS_clock_nanosleep),
1224 "gettimeofday" => Some(libc::SYS_gettimeofday),
1225 "nanosleep" => Some(libc::SYS_nanosleep),
1226 "getcwd" => Some(libc::SYS_getcwd),
1228 "chdir" => Some(libc::SYS_chdir),
1229 "fchdir" => Some(libc::SYS_fchdir),
1230 #[cfg(target_arch = "x86_64")]
1231 "mkdir" => Some(libc::SYS_mkdir),
1232 "mkdirat" => Some(libc::SYS_mkdirat),
1233 #[cfg(target_arch = "x86_64")]
1234 "rmdir" => Some(libc::SYS_rmdir),
1235 #[cfg(target_arch = "x86_64")]
1236 "getdents" => Some(libc::SYS_getdents),
1237 "getdents64" => Some(libc::SYS_getdents64),
1238 "socket" => Some(libc::SYS_socket),
1240 "connect" => Some(libc::SYS_connect),
1241 "sendto" => Some(libc::SYS_sendto),
1242 "recvfrom" => Some(libc::SYS_recvfrom),
1243 "sendmsg" => Some(libc::SYS_sendmsg),
1244 "recvmsg" => Some(libc::SYS_recvmsg),
1245 "shutdown" => Some(libc::SYS_shutdown),
1246 "bind" => Some(libc::SYS_bind),
1247 "listen" => Some(libc::SYS_listen),
1248 "accept" => Some(libc::SYS_accept),
1249 "accept4" => Some(libc::SYS_accept4),
1250 "setsockopt" => Some(libc::SYS_setsockopt),
1251 "getsockopt" => Some(libc::SYS_getsockopt),
1252 "getsockname" => Some(libc::SYS_getsockname),
1253 "getpeername" => Some(libc::SYS_getpeername),
1254 "socketpair" => Some(libc::SYS_socketpair),
1255 #[cfg(target_arch = "x86_64")]
1257 "poll" => Some(libc::SYS_poll),
1258 "ppoll" => Some(libc::SYS_ppoll),
1259 #[cfg(target_arch = "x86_64")]
1260 "select" => Some(libc::SYS_select),
1261 "pselect6" => Some(libc::SYS_pselect6),
1262 #[cfg(target_arch = "x86_64")]
1263 "epoll_create" => Some(libc::SYS_epoll_create),
1264 "epoll_create1" => Some(libc::SYS_epoll_create1),
1265 "epoll_ctl" => Some(libc::SYS_epoll_ctl),
1266 #[cfg(target_arch = "x86_64")]
1267 "epoll_wait" => Some(libc::SYS_epoll_wait),
1268 "epoll_pwait" => Some(libc::SYS_epoll_pwait),
1269 #[cfg(target_arch = "x86_64")]
1270 "eventfd" => Some(libc::SYS_eventfd),
1271 "eventfd2" => Some(libc::SYS_eventfd2),
1272 #[cfg(target_arch = "x86_64")]
1273 "signalfd" => Some(libc::SYS_signalfd),
1274 "signalfd4" => Some(libc::SYS_signalfd4),
1275 "timerfd_create" => Some(libc::SYS_timerfd_create),
1276 "timerfd_settime" => Some(libc::SYS_timerfd_settime),
1277 "timerfd_gettime" => Some(libc::SYS_timerfd_gettime),
1278 "uname" => Some(libc::SYS_uname),
1280 "getrandom" => Some(libc::SYS_getrandom),
1281 "futex" => Some(libc::SYS_futex),
1282 "set_tid_address" => Some(libc::SYS_set_tid_address),
1283 "set_robust_list" => Some(libc::SYS_set_robust_list),
1284 "get_robust_list" => Some(libc::SYS_get_robust_list),
1285 #[cfg(target_arch = "x86_64")]
1286 "arch_prctl" => Some(libc::SYS_arch_prctl),
1287 "sysinfo" => Some(libc::SYS_sysinfo),
1288 "umask" => Some(libc::SYS_umask),
1289 #[cfg(target_arch = "x86_64")]
1290 "getrlimit" => Some(libc::SYS_getrlimit),
1291 "prlimit64" => Some(libc::SYS_prlimit64),
1292 "getrusage" => Some(libc::SYS_getrusage),
1293 "times" => Some(libc::SYS_times),
1294 "sched_yield" => Some(libc::SYS_sched_yield),
1295 "sched_getaffinity" => Some(libc::SYS_sched_getaffinity),
1296 "getcpu" => Some(libc::SYS_getcpu),
1297 "rseq" => Some(libc::SYS_rseq),
1298 "close_range" => Some(libc::SYS_close_range),
1299 "fchown" => Some(libc::SYS_fchown),
1301 "fchownat" => Some(libc::SYS_fchownat),
1302 #[cfg(target_arch = "x86_64")]
1303 "chown" => Some(libc::SYS_chown),
1304 #[cfg(target_arch = "x86_64")]
1305 "lchown" => Some(libc::SYS_lchown),
1306 "io_uring_setup" => Some(libc::SYS_io_uring_setup),
1308 "io_uring_enter" => Some(libc::SYS_io_uring_enter),
1309 "io_uring_register" => Some(libc::SYS_io_uring_register),
1310 "io_setup" => Some(libc::SYS_io_setup),
1312 "io_destroy" => Some(libc::SYS_io_destroy),
1313 "io_submit" => Some(libc::SYS_io_submit),
1314 "io_getevents" => Some(libc::SYS_io_getevents),
1315 "setitimer" => Some(libc::SYS_setitimer),
1317 "getitimer" => Some(libc::SYS_getitimer),
1318 "setpgid" => Some(libc::SYS_setpgid),
1320 "getpgid" => Some(libc::SYS_getpgid),
1321 "memfd_create" => Some(libc::SYS_memfd_create),
1322 "ioctl" => Some(libc::SYS_ioctl),
1323 "prctl" => Some(libc::SYS_prctl),
1324 "landlock_create_ruleset" => Some(libc::SYS_landlock_create_ruleset),
1326 "landlock_add_rule" => Some(libc::SYS_landlock_add_rule),
1327 "landlock_restrict_self" => Some(libc::SYS_landlock_restrict_self),
1328 "mincore" => Some(libc::SYS_mincore),
1331 "mlockall" => Some(libc::SYS_mlockall),
1332 "munlockall" => Some(libc::SYS_munlockall),
1333 "mbind" => Some(libc::SYS_mbind),
1334 "set_mempolicy" => Some(libc::SYS_set_mempolicy),
1335 "get_mempolicy" => Some(libc::SYS_get_mempolicy),
1336 "memfd_secret" => Some(libc::SYS_memfd_secret),
1337 "membarrier" => Some(libc::SYS_membarrier),
1338 "process_madvise" => Some(libc::SYS_process_madvise),
1339 "pkey_mprotect" => Some(libc::SYS_pkey_mprotect),
1340 "pkey_alloc" => Some(libc::SYS_pkey_alloc),
1341 "pkey_free" => Some(libc::SYS_pkey_free),
1342 "mseal" => Some(libc::SYS_mseal),
1343 "map_shadow_stack" => Some(453),
1344 "remap_file_pages" => Some(libc::SYS_remap_file_pages),
1345 "set_mempolicy_home_node" => Some(libc::SYS_set_mempolicy_home_node),
1346 "cachestat" => Some(451),
1347 #[cfg(target_arch = "x86_64")]
1349 "vfork" => Some(libc::SYS_vfork),
1350 #[cfg(target_arch = "x86_64")]
1351 "pause" => Some(libc::SYS_pause),
1352 #[cfg(target_arch = "x86_64")]
1353 "alarm" => Some(libc::SYS_alarm),
1354 "tkill" => Some(libc::SYS_tkill),
1355 "ptrace" => Some(libc::SYS_ptrace),
1356 "process_vm_readv" => Some(libc::SYS_process_vm_readv),
1357 "process_vm_writev" => Some(libc::SYS_process_vm_writev),
1358 "process_mrelease" => Some(libc::SYS_process_mrelease),
1359 "kcmp" => Some(libc::SYS_kcmp),
1360 "unshare" => Some(libc::SYS_unshare),
1361 "setns" => Some(libc::SYS_setns),
1362 "pidfd_open" => Some(libc::SYS_pidfd_open),
1363 "pidfd_send_signal" => Some(libc::SYS_pidfd_send_signal),
1364 "pidfd_getfd" => Some(libc::SYS_pidfd_getfd),
1365 "setuid" => Some(libc::SYS_setuid),
1367 "setgid" => Some(libc::SYS_setgid),
1368 "setreuid" => Some(libc::SYS_setreuid),
1369 "setregid" => Some(libc::SYS_setregid),
1370 "setresuid" => Some(libc::SYS_setresuid),
1371 "getresuid" => Some(libc::SYS_getresuid),
1372 "setresgid" => Some(libc::SYS_setresgid),
1373 "getresgid" => Some(libc::SYS_getresgid),
1374 "setfsuid" => Some(libc::SYS_setfsuid),
1375 "setfsgid" => Some(libc::SYS_setfsgid),
1376 "setgroups" => Some(libc::SYS_setgroups),
1377 "getsid" => Some(libc::SYS_getsid),
1378 "capget" => Some(libc::SYS_capget),
1380 "capset" => Some(libc::SYS_capset),
1381 "rt_tgsigqueueinfo" => Some(libc::SYS_rt_tgsigqueueinfo),
1383 "msgget" => Some(libc::SYS_msgget),
1385 "msgsnd" => Some(libc::SYS_msgsnd),
1386 "msgrcv" => Some(libc::SYS_msgrcv),
1387 "msgctl" => Some(libc::SYS_msgctl),
1388 "timer_create" => Some(libc::SYS_timer_create),
1390 "timer_settime" => Some(libc::SYS_timer_settime),
1391 "timer_gettime" => Some(libc::SYS_timer_gettime),
1392 "timer_getoverrun" => Some(libc::SYS_timer_getoverrun),
1393 "timer_delete" => Some(libc::SYS_timer_delete),
1394 "clock_settime" => Some(libc::SYS_clock_settime),
1395 "clock_adjtime" => Some(libc::SYS_clock_adjtime),
1396 #[cfg(target_arch = "x86_64")]
1397 "time" => Some(libc::SYS_time),
1398 #[cfg(target_arch = "x86_64")]
1400 "creat" => Some(libc::SYS_creat),
1401 "readahead" => Some(libc::SYS_readahead),
1402 "sync" => Some(libc::SYS_sync),
1403 "syncfs" => Some(libc::SYS_syncfs),
1404 "vmsplice" => Some(libc::SYS_vmsplice),
1405 "utimensat" => Some(libc::SYS_utimensat),
1406 #[cfg(target_arch = "x86_64")]
1407 "utimes" => Some(libc::SYS_utimes),
1408 #[cfg(target_arch = "x86_64")]
1409 "utime" => Some(libc::SYS_utime),
1410 #[cfg(target_arch = "x86_64")]
1411 "futimesat" => Some(libc::SYS_futimesat),
1412 "openat2" => Some(libc::SYS_openat2),
1413 "name_to_handle_at" => Some(libc::SYS_name_to_handle_at),
1414 "open_by_handle_at" => Some(libc::SYS_open_by_handle_at),
1415 "fchmodat2" => Some(libc::SYS_fchmodat2),
1416 "statmount" => Some(457),
1417 "listmount" => Some(458),
1418 "setxattr" => Some(libc::SYS_setxattr),
1420 "lsetxattr" => Some(libc::SYS_lsetxattr),
1421 "fsetxattr" => Some(libc::SYS_fsetxattr),
1422 "removexattr" => Some(libc::SYS_removexattr),
1423 "lremovexattr" => Some(libc::SYS_lremovexattr),
1424 "fremovexattr" => Some(libc::SYS_fremovexattr),
1425 "setxattrat" => Some(463),
1426 "getxattrat" => Some(464),
1427 "listxattrat" => Some(465),
1428 "removexattrat" => Some(466),
1429 "recvmmsg" => Some(libc::SYS_recvmmsg),
1431 "sendmmsg" => Some(libc::SYS_sendmmsg),
1432 #[cfg(target_arch = "x86_64")]
1434 "inotify_init" => Some(libc::SYS_inotify_init),
1435 "inotify_init1" => Some(libc::SYS_inotify_init1),
1436 "inotify_add_watch" => Some(libc::SYS_inotify_add_watch),
1437 "inotify_rm_watch" => Some(libc::SYS_inotify_rm_watch),
1438 "fanotify_init" => Some(libc::SYS_fanotify_init),
1440 "fanotify_mark" => Some(libc::SYS_fanotify_mark),
1441 "epoll_pwait2" => Some(libc::SYS_epoll_pwait2),
1443 "sched_setparam" => Some(libc::SYS_sched_setparam),
1445 "sched_setscheduler" => Some(libc::SYS_sched_setscheduler),
1446 "sched_get_priority_max" => Some(libc::SYS_sched_get_priority_max),
1447 "sched_get_priority_min" => Some(libc::SYS_sched_get_priority_min),
1448 "sched_rr_get_interval" => Some(libc::SYS_sched_rr_get_interval),
1449 "sched_setattr" => Some(libc::SYS_sched_setattr),
1450 "sched_getattr" => Some(libc::SYS_sched_getattr),
1451 "sched_setaffinity" => Some(libc::SYS_sched_setaffinity),
1452 #[cfg(target_arch = "x86_64")]
1454 "setrlimit" => Some(libc::SYS_setrlimit),
1455 "getpriority" => Some(libc::SYS_getpriority),
1456 "setpriority" => Some(libc::SYS_setpriority),
1457 "ioprio_set" => Some(libc::SYS_ioprio_set),
1458 "ioprio_get" => Some(libc::SYS_ioprio_get),
1459 "futex_waitv" => Some(libc::SYS_futex_waitv),
1461 "futex_wake" => Some(454),
1462 "futex_wait" => Some(455),
1463 "futex_requeue" => Some(456),
1464 "init_module" => Some(libc::SYS_init_module),
1466 "finit_module" => Some(libc::SYS_finit_module),
1467 "delete_module" => Some(libc::SYS_delete_module),
1468 "bpf" => Some(libc::SYS_bpf),
1470 "perf_event_open" => Some(libc::SYS_perf_event_open),
1471 "seccomp" => Some(libc::SYS_seccomp),
1473 "userfaultfd" => Some(libc::SYS_userfaultfd),
1475 "mount" => Some(libc::SYS_mount),
1477 "umount2" => Some(libc::SYS_umount2),
1478 "pivot_root" => Some(libc::SYS_pivot_root),
1479 "mount_setattr" => Some(libc::SYS_mount_setattr),
1480 "open_tree" => Some(libc::SYS_open_tree),
1481 "open_tree_attr" => Some(467),
1482 "move_mount" => Some(libc::SYS_move_mount),
1483 "fsopen" => Some(libc::SYS_fsopen),
1484 "fsconfig" => Some(libc::SYS_fsconfig),
1485 "fsmount" => Some(libc::SYS_fsmount),
1486 "fspick" => Some(libc::SYS_fspick),
1487 "syslog" => Some(libc::SYS_syslog),
1489 "reboot" => Some(libc::SYS_reboot),
1490 "swapon" => Some(libc::SYS_swapon),
1491 "swapoff" => Some(libc::SYS_swapoff),
1492 "chroot" => Some(libc::SYS_chroot),
1493 "acct" => Some(libc::SYS_acct),
1494 "settimeofday" => Some(libc::SYS_settimeofday),
1495 "sethostname" => Some(libc::SYS_sethostname),
1496 "setdomainname" => Some(libc::SYS_setdomainname),
1497 "adjtimex" => Some(libc::SYS_adjtimex),
1498 #[cfg(target_arch = "x86_64")]
1499 "modify_ldt" => Some(libc::SYS_modify_ldt),
1500 #[cfg(target_arch = "x86_64")]
1501 "iopl" => Some(libc::SYS_iopl),
1502 #[cfg(target_arch = "x86_64")]
1503 "ioperm" => Some(libc::SYS_ioperm),
1504 "quotactl" => Some(libc::SYS_quotactl),
1505 "quotactl_fd" => Some(libc::SYS_quotactl_fd),
1506 "personality" => Some(libc::SYS_personality),
1507 "vhangup" => Some(libc::SYS_vhangup),
1508 #[cfg(target_arch = "x86_64")]
1509 "ustat" => Some(libc::SYS_ustat),
1510 #[cfg(target_arch = "x86_64")]
1511 "sysfs" => Some(libc::SYS_sysfs),
1512 "mknod" => Some(libc::SYS_mknod),
1513 "mknodat" => Some(libc::SYS_mknodat),
1514 "migrate_pages" => Some(libc::SYS_migrate_pages),
1515 "move_pages" => Some(libc::SYS_move_pages),
1516 #[cfg(target_arch = "x86_64")]
1517 "kexec_load" => Some(libc::SYS_kexec_load),
1518 "kexec_file_load" => Some(libc::SYS_kexec_file_load),
1519 "mq_open" => Some(libc::SYS_mq_open),
1521 "mq_unlink" => Some(libc::SYS_mq_unlink),
1522 "mq_timedsend" => Some(libc::SYS_mq_timedsend),
1523 "mq_timedreceive" => Some(libc::SYS_mq_timedreceive),
1524 "mq_notify" => Some(libc::SYS_mq_notify),
1525 "mq_getsetattr" => Some(libc::SYS_mq_getsetattr),
1526 "add_key" => Some(libc::SYS_add_key),
1528 "request_key" => Some(libc::SYS_request_key),
1529 "keyctl" => Some(libc::SYS_keyctl),
1530 "io_pgetevents" => Some(333),
1532 "lsm_get_self_attr" => Some(459),
1534 "lsm_set_self_attr" => Some(460),
1535 "lsm_list_modules" => Some(461),
1536 #[cfg(target_arch = "x86_64")]
1537 "lookup_dcookie" => Some(libc::SYS_lookup_dcookie),
1538 "uretprobe" => Some(335),
1539 _ => None,
1540 }
1541}
1542
1543impl Default for SeccompManager {
1544 fn default() -> Self {
1545 Self::new()
1546 }
1547}
1548
1549#[cfg(test)]
1550mod tests {
1551 use super::*;
1552
1553 #[test]
1554 fn test_seccomp_manager_initial_state() {
1555 let mgr = SeccompManager::new();
1556 assert!(!mgr.is_applied());
1557 }
1558
1559 #[test]
1560 fn test_apply_idempotent() {
1561 let mgr = SeccompManager::new();
1562 assert!(!mgr.is_applied());
1566 }
1567
1568 #[test]
1569 fn test_clone_denied_flags_include_newcgroup() {
1570 assert_ne!(
1571 DENIED_CLONE_NAMESPACE_FLAGS & libc::CLONE_NEWCGROUP as u64,
1572 0
1573 );
1574 }
1575
1576 #[test]
1577 fn test_clone_denied_flags_include_newtime() {
1578 assert_ne!(
1579 DENIED_CLONE_NAMESPACE_FLAGS & libc::CLONE_NEWTIME as u64,
1580 0,
1581 "CLONE_NEWTIME must be in denied clone namespace flags"
1582 );
1583 }
1584
1585 #[test]
1586 fn test_network_none_socket_domains_are_unix_only() {
1587 let domains = SeccompManager::allowed_socket_domains(false);
1588 assert_eq!(domains, vec![libc::AF_UNIX]);
1589 }
1590
1591 #[test]
1592 fn test_network_enabled_socket_domains_exclude_netlink() {
1593 let domains = SeccompManager::allowed_socket_domains(true);
1594 assert!(domains.contains(&libc::AF_UNIX));
1595 assert!(domains.contains(&libc::AF_INET));
1596 assert!(domains.contains(&libc::AF_INET6));
1597 assert!(!domains.contains(&libc::AF_NETLINK));
1598 }
1599
1600 #[test]
1601 fn test_network_mode_syscalls_only_enabled_when_network_allowed() {
1602 let none = SeccompManager::network_mode_syscalls(false);
1603 assert!(none.is_empty());
1604
1605 let enabled = SeccompManager::network_mode_syscalls(true);
1606 assert!(enabled.contains(&libc::SYS_connect));
1607 assert!(enabled.contains(&libc::SYS_bind));
1608 assert!(enabled.contains(&libc::SYS_listen));
1609 assert!(enabled.contains(&libc::SYS_accept));
1610 assert!(enabled.contains(&libc::SYS_setsockopt));
1611 }
1612
1613 #[test]
1614 fn test_landlock_bootstrap_syscalls_present_in_base_allowlist() {
1615 let base = SeccompManager::base_allowed_syscalls();
1616 assert!(base.contains(&libc::SYS_landlock_create_ruleset));
1617 assert!(base.contains(&libc::SYS_landlock_add_rule));
1618 assert!(base.contains(&libc::SYS_landlock_restrict_self));
1619 }
1620
1621 #[test]
1622 fn test_x32_legacy_range_not_allowlisted() {
1623 let base = SeccompManager::base_allowed_syscalls();
1624 let net = SeccompManager::network_mode_syscalls(true);
1625 for nr in 512_i64..=547_i64 {
1626 assert!(
1627 !base.contains(&nr) && !net.contains(&nr),
1628 "x32 syscall number {} unexpectedly allowlisted",
1629 nr
1630 );
1631 }
1632 }
1633
1634 #[test]
1635 fn test_i386_compat_socketcall_range_not_allowlisted() {
1636 let base = SeccompManager::base_allowed_syscalls();
1637 let net = SeccompManager::network_mode_syscalls(true);
1638 for nr in 359_i64..=373_i64 {
1641 assert!(
1642 !base.contains(&nr) && !net.contains(&nr),
1643 "i386 compat syscall number {} unexpectedly allowlisted",
1644 nr
1645 );
1646 }
1647 }
1648
1649 #[test]
1650 fn test_minimal_filter_allowlist_counts_are_stable() {
1651 let base = SeccompManager::base_allowed_syscalls();
1652 let net = SeccompManager::network_mode_syscalls(true);
1653
1654 assert_eq!(base.len(), 173);
1662 assert_eq!(net.len(), 11);
1663 assert_eq!(base.len() + 8, 181);
1664 assert_eq!(base.len() + net.len() + 8, 192);
1665 }
1666
1667 #[test]
1668 fn test_arg_filtered_syscalls_list_includes_critical_syscalls() {
1669 for name in &["clone", "clone3", "execveat", "ioctl", "prctl", "socket"] {
1672 assert!(
1673 SeccompManager::ARG_FILTERED_SYSCALLS.contains(name),
1674 "'{}' must be in ARG_FILTERED_SYSCALLS",
1675 name
1676 );
1677 }
1678 }
1679
1680 #[test]
1681 fn test_clone3_allowed_in_minimal_filter() {
1682 let rules = SeccompManager::minimal_filter(true, &[]).unwrap();
1687 assert!(
1688 rules.contains_key(&libc::SYS_clone3),
1689 "clone3 must be in the seccomp allowlist (glibc 2.34+ requires it)"
1690 );
1691 }
1692
1693 #[test]
1694 fn test_clone_is_allowed_with_arg_filter() {
1695 let rules = SeccompManager::minimal_filter(true, &[]).unwrap();
1697 assert!(
1698 rules.contains_key(&libc::SYS_clone),
1699 "clone must be in the seccomp allowlist with arg filters"
1700 );
1701 }
1702
1703 #[test]
1704 fn test_high_risk_syscalls_removed_from_base_allowlist() {
1705 let base = SeccompManager::base_allowed_syscalls();
1706 let removed = [
1709 libc::SYS_sync,
1710 libc::SYS_syncfs,
1711 libc::SYS_mincore,
1712 libc::SYS_vfork,
1713 libc::SYS_tkill,
1714 libc::SYS_io_uring_setup,
1716 libc::SYS_io_uring_enter,
1717 libc::SYS_io_uring_register,
1718 ];
1719
1720 for syscall in removed {
1721 assert!(
1722 !base.contains(&syscall),
1723 "syscall {} unexpectedly present in base allowlist",
1724 syscall
1725 );
1726 }
1727 }
1728
1729 #[test]
1730 fn test_custom_profile_preserves_clone_arg_filters() {
1731 let rules = SeccompManager::minimal_filter(true, &[]).unwrap();
1736
1737 for name in SeccompManager::ARG_FILTERED_SYSCALLS {
1742 if *name == "clone3" {
1743 continue;
1747 }
1748 if let Some(nr) = syscall_name_to_number(name) {
1749 let entry = rules.get(&nr);
1750 assert!(
1751 entry.is_some() && !entry.unwrap().is_empty(),
1752 "built-in filter must have argument-level rules for '{}' \
1753 so apply_profile_from_file can merge them into custom profiles",
1754 name
1755 );
1756 }
1757 }
1758 }
1759
1760 #[test]
1761 fn test_memfd_create_not_in_default_allowlist() {
1762 let base = SeccompManager::base_allowed_syscalls();
1764 assert!(
1765 !base.contains(&libc::SYS_memfd_create),
1766 "memfd_create must not be in the default seccomp allowlist (fileless exec risk)"
1767 );
1768 let rules = SeccompManager::minimal_filter(true, &[]).unwrap();
1770 assert!(
1771 !rules.contains_key(&libc::SYS_memfd_create),
1772 "memfd_create must not be in the compiled seccomp filter rules"
1773 );
1774 }
1775
1776 #[test]
1777 fn test_mprotect_has_arg_filtering() {
1778 let base = SeccompManager::base_allowed_syscalls();
1783 assert!(
1784 !base.contains(&libc::SYS_mprotect),
1785 "SYS_mprotect must not be unconditionally allowed - needs arg filtering"
1786 );
1787
1788 let rules = SeccompManager::minimal_filter(true, &[]).unwrap();
1791 let mprotect_rules = rules.get(&libc::SYS_mprotect);
1792 assert!(
1793 mprotect_rules.is_some(),
1794 "mprotect must be present in the seccomp filter rules"
1795 );
1796 assert!(
1797 !mprotect_rules.unwrap().is_empty(),
1798 "mprotect must have argument-level conditions to prevent W^X violations"
1799 );
1800 }
1801
1802 #[test]
1803 fn test_unsafe_blocks_have_safety_comments() {
1804 let source = include_str!("seccomp.rs");
1806 let mut pos = 0;
1807 while let Some(idx) = source[pos..].find("unsafe {") {
1808 let abs_idx = pos + idx;
1809 let start = abs_idx.saturating_sub(200);
1811 let context = &source[start..abs_idx];
1812 assert!(
1813 context.contains("SAFETY:"),
1814 "unsafe block at byte {} must have a // SAFETY: comment. Context: ...{}...",
1815 abs_idx,
1816 &source[abs_idx.saturating_sub(80)..abs_idx + 10]
1817 );
1818 pos = abs_idx + 1;
1819 }
1820 }
1821
1822 fn mprotect_would_allow(prot: u64) -> bool {
1832 let mask = (libc::PROT_WRITE | libc::PROT_EXEC) as u64;
1833 let allowed_values: &[u64] = &[0, libc::PROT_WRITE as u64, libc::PROT_EXEC as u64];
1834 let masked = prot & mask;
1835 allowed_values.contains(&masked)
1836 }
1837
1838 #[test]
1839 fn test_mprotect_allows_prot_none() {
1840 assert!(mprotect_would_allow(0), "PROT_NONE must be allowed");
1841 }
1842
1843 #[test]
1844 fn test_mprotect_allows_prot_read_only() {
1845 assert!(
1846 mprotect_would_allow(libc::PROT_READ as u64),
1847 "PROT_READ must be allowed (W|X bits are 0)"
1848 );
1849 }
1850
1851 #[test]
1852 fn test_mprotect_allows_prot_read_write() {
1853 assert!(
1854 mprotect_would_allow((libc::PROT_READ | libc::PROT_WRITE) as u64),
1855 "PROT_READ|PROT_WRITE must be allowed"
1856 );
1857 }
1858
1859 #[test]
1860 fn test_mprotect_allows_prot_read_exec() {
1861 assert!(
1862 mprotect_would_allow((libc::PROT_READ | libc::PROT_EXEC) as u64),
1863 "PROT_READ|PROT_EXEC must be allowed"
1864 );
1865 }
1866
1867 #[test]
1868 fn test_mprotect_rejects_prot_write_exec() {
1869 assert!(
1870 !mprotect_would_allow((libc::PROT_WRITE | libc::PROT_EXEC) as u64),
1871 "PROT_WRITE|PROT_EXEC (W^X violation) must be REJECTED"
1872 );
1873 }
1874
1875 #[test]
1876 fn test_mprotect_rejects_prot_read_write_exec() {
1877 assert!(
1878 !mprotect_would_allow((libc::PROT_READ | libc::PROT_WRITE | libc::PROT_EXEC) as u64),
1879 "PROT_READ|PROT_WRITE|PROT_EXEC (W^X violation) must be REJECTED"
1880 );
1881 }
1882
1883 #[test]
1884 fn test_mprotect_allows_prot_write_alone() {
1885 assert!(
1886 mprotect_would_allow(libc::PROT_WRITE as u64),
1887 "PROT_WRITE alone must be allowed"
1888 );
1889 }
1890
1891 #[test]
1892 fn test_mprotect_allows_prot_exec_alone() {
1893 assert!(
1894 mprotect_would_allow(libc::PROT_EXEC as u64),
1895 "PROT_EXEC alone must be allowed"
1896 );
1897 }
1898
1899 #[test]
1902 fn test_extra_syscalls_are_merged_into_filter() {
1903 let extra = vec!["io_uring_setup".to_string(), "sysinfo".to_string()];
1904 let rules = SeccompManager::minimal_filter(true, &extra).unwrap();
1905 assert!(
1906 rules.contains_key(&libc::SYS_io_uring_setup),
1907 "io_uring_setup must be in filter when requested via extra_syscalls"
1908 );
1909 assert!(
1910 rules.contains_key(&libc::SYS_sysinfo),
1911 "sysinfo must be in filter when requested via extra_syscalls"
1912 );
1913 }
1914
1915 #[test]
1916 fn test_extra_syscalls_do_not_override_arg_filtered() {
1917 let extra = vec!["clone".to_string()];
1921 let rules = SeccompManager::minimal_filter(true, &extra).unwrap();
1922 let clone_rules = rules.get(&libc::SYS_clone);
1923 assert!(
1924 clone_rules.is_some() && !clone_rules.unwrap().is_empty(),
1925 "clone must retain argument-level filtering even when in extra_syscalls"
1926 );
1927 }
1928
1929 #[test]
1930 fn test_extra_syscalls_unknown_name_is_warned_and_skipped() {
1931 let extra = vec!["not_a_real_syscall".to_string()];
1933 let result = SeccompManager::minimal_filter(true, &extra);
1934 assert!(
1935 result.is_ok(),
1936 "Unknown syscall name should warn and skip, not error"
1937 );
1938 }
1939
1940 #[test]
1941 fn test_extra_syscalls_empty_is_noop() {
1942 let rules_without = SeccompManager::minimal_filter(true, &[]).unwrap();
1943 let rules_with = SeccompManager::minimal_filter(true, &[]).unwrap();
1944 assert_eq!(rules_without.len(), rules_with.len());
1945 }
1946
1947 #[test]
1948 fn test_extra_syscalls_duplicate_of_default_is_harmless() {
1949 let extra = vec!["read".to_string()];
1951 let rules = SeccompManager::minimal_filter(true, &extra).unwrap();
1952 assert!(rules.contains_key(&libc::SYS_read));
1953 }
1954
1955 #[test]
1956 fn test_extra_syscalls_blocked_known_syscall_not_added() {
1957 let extra = vec!["kexec_load".to_string()];
1960 let rules = SeccompManager::minimal_filter(true, &extra).unwrap();
1961 assert!(
1962 !rules.contains_key(&libc::SYS_kexec_load),
1963 "kexec_load must be blocked even when requested via --seccomp-allow"
1964 );
1965 }
1966
1967 #[test]
1968 fn test_extra_syscalls_opt_in_syscall_is_added() {
1969 let extra = vec!["io_uring_setup".to_string()];
1971 let rules = SeccompManager::minimal_filter(true, &extra).unwrap();
1972 assert!(
1973 rules.contains_key(&libc::SYS_io_uring_setup),
1974 "io_uring_setup is in OPT_IN_SYSCALLS and must be added"
1975 );
1976 }
1977}