1use crate::error::{NucleusError, Result};
2use crate::security::policy::sha256_hex;
3use seccompiler::{BpfProgram, SeccompAction, SeccompCondition, SeccompFilter, SeccompRule};
4use std::collections::BTreeMap;
5use std::path::Path;
6use tracing::{debug, info, warn};
7
/// Builds and installs seccomp-BPF filters for the current process.
///
/// The manager only remembers whether one of its `apply_*` methods has already installed a
/// filter; every `apply_*` method returns early when that is the case.
pub struct SeccompManager {
    // Set to `true` after a filter was installed successfully; never reset in this file.
    applied: bool,
}
15
/// Union of the namespace-creating `CLONE_NEW*` flags.
///
/// `minimal_filter` uses this value as the mask of the `clone` argument filter: a `clone`
/// call is only allowed when none of these bits is set in its first argument.
const DENIED_CLONE_NAMESPACE_FLAGS: u64 = (libc::CLONE_NEWUSER
    | libc::CLONE_NEWNS
    | libc::CLONE_NEWNET
    | libc::CLONE_NEWIPC
    | libc::CLONE_NEWUTS
    | libc::CLONE_NEWPID
    | libc::CLONE_NEWCGROUP
    | libc::CLONE_NEWTIME) as u64;
24
25impl SeccompManager {
26 pub fn new() -> Self {
27 Self { applied: false }
28 }
29
30 fn base_allowed_syscalls() -> Vec<i64> {
31 let mut syscalls = vec![
32 libc::SYS_read,
34 libc::SYS_write,
35 libc::SYS_openat,
36 libc::SYS_close,
37 libc::SYS_fstat,
38 libc::SYS_lseek,
39 libc::SYS_fcntl,
40 libc::SYS_readv,
41 libc::SYS_writev,
42 libc::SYS_preadv,
43 libc::SYS_pwritev,
44 libc::SYS_preadv2,
45 libc::SYS_pwritev2,
46 libc::SYS_pread64,
47 libc::SYS_pwrite64,
48 libc::SYS_readlinkat,
49 libc::SYS_newfstatat,
50 libc::SYS_statx,
51 libc::SYS_faccessat,
52 libc::SYS_faccessat2,
53 libc::SYS_dup,
54 libc::SYS_dup3,
55 libc::SYS_pipe2,
56 libc::SYS_unlinkat,
57 libc::SYS_renameat,
58 libc::SYS_renameat2,
59 libc::SYS_linkat,
60 libc::SYS_symlinkat,
61 libc::SYS_fchmod,
62 libc::SYS_fchmodat,
63 libc::SYS_truncate,
64 libc::SYS_ftruncate,
65 libc::SYS_fallocate,
66 #[cfg(target_arch = "x86_64")]
67 libc::SYS_fadvise64,
68 libc::SYS_fsync,
69 libc::SYS_fdatasync,
70 libc::SYS_sync_file_range,
71 libc::SYS_flock,
72 libc::SYS_fstatfs,
73 libc::SYS_statfs,
74 #[cfg(target_arch = "x86_64")]
75 libc::SYS_sendfile,
76 libc::SYS_copy_file_range,
77 libc::SYS_splice,
78 libc::SYS_tee,
79 libc::SYS_mmap,
81 libc::SYS_munmap,
82 libc::SYS_brk,
83 libc::SYS_mremap,
84 libc::SYS_madvise,
85 libc::SYS_msync,
86 libc::SYS_mlock,
87 libc::SYS_munlock,
88 libc::SYS_mlock2,
89 libc::SYS_shmget,
92 libc::SYS_shmat,
93 libc::SYS_shmdt,
94 libc::SYS_shmctl,
95 libc::SYS_semget,
97 libc::SYS_semop,
98 libc::SYS_semctl,
99 libc::SYS_semtimedop,
100 libc::SYS_execve,
106 libc::SYS_wait4,
108 libc::SYS_waitid,
109 libc::SYS_exit,
110 libc::SYS_exit_group,
111 libc::SYS_getpid,
112 libc::SYS_gettid,
113 libc::SYS_getuid,
114 libc::SYS_getgid,
115 libc::SYS_geteuid,
116 libc::SYS_getegid,
117 libc::SYS_getppid,
118 libc::SYS_setsid,
119 libc::SYS_getgroups,
120 libc::SYS_rt_sigaction,
122 libc::SYS_rt_sigprocmask,
123 libc::SYS_rt_sigreturn,
124 libc::SYS_rt_sigsuspend,
125 libc::SYS_rt_sigtimedwait,
126 libc::SYS_rt_sigpending,
127 libc::SYS_rt_sigqueueinfo,
128 libc::SYS_sigaltstack,
129 libc::SYS_restart_syscall,
130 libc::SYS_kill,
134 libc::SYS_tgkill,
135 libc::SYS_clock_gettime,
137 libc::SYS_clock_getres,
138 libc::SYS_clock_nanosleep,
139 libc::SYS_gettimeofday,
140 libc::SYS_nanosleep,
141 libc::SYS_setitimer,
142 libc::SYS_getitimer,
143 libc::SYS_getcwd,
145 libc::SYS_chdir,
146 libc::SYS_fchdir,
147 libc::SYS_mkdirat,
148 libc::SYS_getdents64,
149 libc::SYS_uname,
151 libc::SYS_getrandom,
152 libc::SYS_futex,
153 libc::SYS_set_tid_address,
154 libc::SYS_set_robust_list,
155 libc::SYS_get_robust_list,
156 libc::SYS_umask,
159 libc::SYS_getrusage,
161 libc::SYS_times,
162 libc::SYS_sched_yield,
163 libc::SYS_sched_getaffinity,
164 libc::SYS_sched_setaffinity,
165 libc::SYS_sched_getparam,
166 libc::SYS_sched_getscheduler,
167 libc::SYS_getcpu,
168 libc::SYS_getxattr,
170 libc::SYS_lgetxattr,
171 libc::SYS_fgetxattr,
172 libc::SYS_listxattr,
173 libc::SYS_llistxattr,
174 libc::SYS_flistxattr,
175 libc::SYS_rseq,
176 libc::SYS_close_range,
177 libc::SYS_fchown,
180 libc::SYS_fchownat,
181 libc::SYS_io_setup,
184 libc::SYS_io_destroy,
185 libc::SYS_io_submit,
186 libc::SYS_io_getevents,
187 libc::SYS_setpgid,
193 libc::SYS_getpgid,
194 libc::SYS_landlock_create_ruleset,
198 libc::SYS_landlock_add_rule,
199 libc::SYS_landlock_restrict_self,
200 libc::SYS_getsockname,
202 libc::SYS_getpeername,
203 libc::SYS_socketpair,
204 libc::SYS_getsockopt,
205 libc::SYS_ppoll,
207 libc::SYS_pselect6,
208 libc::SYS_epoll_create1,
209 libc::SYS_epoll_ctl,
210 libc::SYS_epoll_pwait,
211 libc::SYS_eventfd2,
212 libc::SYS_signalfd4,
213 libc::SYS_timerfd_create,
214 libc::SYS_timerfd_settime,
215 libc::SYS_timerfd_gettime,
216 ];
217
218 #[cfg(target_arch = "x86_64")]
220 syscalls.extend_from_slice(&[
221 libc::SYS_open,
222 libc::SYS_stat,
223 libc::SYS_lstat,
224 libc::SYS_access,
225 libc::SYS_readlink,
226 libc::SYS_dup2,
227 libc::SYS_pipe,
228 libc::SYS_unlink,
229 libc::SYS_rename,
230 libc::SYS_link,
231 libc::SYS_symlink,
232 libc::SYS_chmod,
233 libc::SYS_mkdir,
234 libc::SYS_rmdir,
235 libc::SYS_getdents,
236 libc::SYS_getpgrp,
237 libc::SYS_chown,
238 libc::SYS_fchown,
239 libc::SYS_lchown,
240 libc::SYS_arch_prctl,
241 libc::SYS_getrlimit,
242 libc::SYS_poll,
243 libc::SYS_select,
244 libc::SYS_epoll_create,
245 libc::SYS_epoll_wait,
246 libc::SYS_eventfd,
247 libc::SYS_signalfd,
248 ]);
249
250 syscalls
251 }
252
253 fn allowed_socket_domains(allow_network: bool) -> Vec<i32> {
254 if allow_network {
255 vec![libc::AF_UNIX, libc::AF_INET, libc::AF_INET6]
256 } else {
257 vec![libc::AF_UNIX]
258 }
259 }
260
261 fn network_mode_syscalls(allow_network: bool) -> Vec<i64> {
262 if allow_network {
263 vec![
264 libc::SYS_connect,
265 libc::SYS_sendto,
266 libc::SYS_recvfrom,
267 libc::SYS_sendmsg,
268 libc::SYS_recvmsg,
269 libc::SYS_shutdown,
270 libc::SYS_bind,
271 libc::SYS_listen,
272 libc::SYS_accept,
273 libc::SYS_accept4,
274 libc::SYS_setsockopt,
275 ]
276 } else {
277 Vec::new()
278 }
279 }
280
281 fn minimal_filter(
291 allow_network: bool,
292 extra_syscalls: &[String],
293 ) -> Result<BTreeMap<i64, Vec<SeccompRule>>> {
294 let mut rules: BTreeMap<i64, Vec<SeccompRule>> = BTreeMap::new();
295
296 let allowed_syscalls = Self::base_allowed_syscalls();
298
299 for syscall in allowed_syscalls {
301 rules.insert(syscall, Vec::new());
302 }
303
304 for syscall in Self::network_mode_syscalls(allow_network) {
306 rules.insert(syscall, Vec::new());
307 }
308
309 for name in extra_syscalls {
315 if let Some(nr) = syscall_name_to_number(name) {
316 if let std::collections::btree_map::Entry::Vacant(entry) = rules.entry(nr) {
317 if Self::OPT_IN_SYSCALLS.contains(&name.as_str()) {
318 entry.insert(Vec::new());
319 } else {
320 warn!(
321 "--seccomp-allow: syscall '{}' is not in the opt-in allowlist – blocked",
322 name
323 );
324 }
325 }
326 } else {
327 warn!("--seccomp-allow: unknown syscall '{}' – blocked", name);
328 }
329 }
330
331 let mut socket_rules = Vec::new();
334 for domain in Self::allowed_socket_domains(allow_network) {
335 let condition = SeccompCondition::new(
336 0, seccompiler::SeccompCmpArgLen::Dword,
338 seccompiler::SeccompCmpOp::Eq,
339 domain as u64,
340 )
341 .map_err(|e| {
342 NucleusError::SeccompError(format!(
343 "Failed to create socket domain condition: {}",
344 e
345 ))
346 })?;
347 let rule = SeccompRule::new(vec![condition]).map_err(|e| {
348 NucleusError::SeccompError(format!("Failed to create socket rule: {}", e))
349 })?;
350 socket_rules.push(rule);
351 }
352 rules.insert(libc::SYS_socket, socket_rules);
353
354 let ioctl_allowed: &[u64] = &[
356 0x5401, 0x5402, 0x5403, 0x5404, 0x540B, 0x540F, 0x5410, 0x5413, 0x5429, 0x541B, 0x5421, 0x5451, 0x5450, ];
372 let mut ioctl_rules = Vec::new();
373 for &request in ioctl_allowed {
374 let condition = SeccompCondition::new(
375 1, seccompiler::SeccompCmpArgLen::Dword,
377 seccompiler::SeccompCmpOp::Eq,
378 request,
379 )
380 .map_err(|e| {
381 NucleusError::SeccompError(format!("Failed to create ioctl condition: {}", e))
382 })?;
383 let rule = SeccompRule::new(vec![condition]).map_err(|e| {
384 NucleusError::SeccompError(format!("Failed to create ioctl rule: {}", e))
385 })?;
386 ioctl_rules.push(rule);
387 }
388 rules.insert(libc::SYS_ioctl, ioctl_rules);
389
390 let prctl_allowed: &[u64] = &[
395 1, 2, 15, 16, 23, 27, 36, 37, 38, 40, 47, 39, ];
411 let mut prctl_rules = Vec::new();
412 for &option in prctl_allowed {
413 let condition = SeccompCondition::new(
414 0, seccompiler::SeccompCmpArgLen::Dword,
416 seccompiler::SeccompCmpOp::Eq,
417 option,
418 )
419 .map_err(|e| {
420 NucleusError::SeccompError(format!("Failed to create prctl condition: {}", e))
421 })?;
422 let rule = SeccompRule::new(vec![condition]).map_err(|e| {
423 NucleusError::SeccompError(format!("Failed to create prctl rule: {}", e))
424 })?;
425 prctl_rules.push(rule);
426 }
427 rules.insert(libc::SYS_prctl, prctl_rules);
428
429 let prlimit_condition = SeccompCondition::new(
432 2, seccompiler::SeccompCmpArgLen::Qword,
434 seccompiler::SeccompCmpOp::Eq,
435 0u64, )
437 .map_err(|e| {
438 NucleusError::SeccompError(format!("Failed to create prlimit64 condition: {}", e))
439 })?;
440 let prlimit_rule = SeccompRule::new(vec![prlimit_condition]).map_err(|e| {
441 NucleusError::SeccompError(format!("Failed to create prlimit64 rule: {}", e))
442 })?;
443 rules.insert(libc::SYS_prlimit64, vec![prlimit_rule]);
444
445 let mut mprotect_rules = Vec::new();
447 for allowed in [0, libc::PROT_WRITE as u64, libc::PROT_EXEC as u64] {
448 let condition = SeccompCondition::new(
449 2, seccompiler::SeccompCmpArgLen::Dword,
451 seccompiler::SeccompCmpOp::MaskedEq((libc::PROT_WRITE | libc::PROT_EXEC) as u64),
452 allowed,
453 )
454 .map_err(|e| {
455 NucleusError::SeccompError(format!("Failed to create mprotect condition: {}", e))
456 })?;
457 let rule = SeccompRule::new(vec![condition]).map_err(|e| {
458 NucleusError::SeccompError(format!("Failed to create mprotect rule: {}", e))
459 })?;
460 mprotect_rules.push(rule);
461 }
462 rules.insert(libc::SYS_mprotect, mprotect_rules);
463
464 if Self::has_effective_cap(21) {
479 return Err(NucleusError::SeccompError(
480 "SECURITY: CAP_SYS_ADMIN is still in the effective capability set. \
481 Capabilities must be dropped before installing seccomp filters \
482 (clone3 is allowed unconditionally)."
483 .to_string(),
484 ));
485 }
486 rules.insert(libc::SYS_clone3, Vec::new());
487
488 let clone_condition = SeccompCondition::new(
490 0, seccompiler::SeccompCmpArgLen::Qword,
492 seccompiler::SeccompCmpOp::MaskedEq(DENIED_CLONE_NAMESPACE_FLAGS),
493 0, )
495 .map_err(|e| {
496 NucleusError::SeccompError(format!("Failed to create clone condition: {}", e))
497 })?;
498 let clone_rule = SeccompRule::new(vec![clone_condition]).map_err(|e| {
499 NucleusError::SeccompError(format!("Failed to create clone rule: {}", e))
500 })?;
501 rules.insert(libc::SYS_clone, vec![clone_rule]);
502
503 let execveat_condition = SeccompCondition::new(
510 4, seccompiler::SeccompCmpArgLen::Dword,
512 seccompiler::SeccompCmpOp::MaskedEq(libc::AT_EMPTY_PATH as u64),
513 0, )
515 .map_err(|e| {
516 NucleusError::SeccompError(format!("Failed to create execveat condition: {}", e))
517 })?;
518 let execveat_rule = SeccompRule::new(vec![execveat_condition]).map_err(|e| {
519 NucleusError::SeccompError(format!("Failed to create execveat rule: {}", e))
520 })?;
521 rules.insert(libc::SYS_execveat, vec![execveat_rule]);
522
523 Ok(rules)
524 }
525
526 pub fn compile_minimal_filter() -> Result<BpfProgram> {
533 let rules = Self::minimal_filter(true, &[])?;
534 let target_arch = std::env::consts::ARCH.try_into().map_err(|e| {
535 NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
536 })?;
537 super::seccomp_bpf::compile_bitmap_bpf(
538 rules,
539 SeccompAction::KillProcess,
540 SeccompAction::Allow,
541 target_arch,
542 )
543 }
544
545 #[cfg(test)]
547 pub(crate) fn minimal_filter_for_test(
548 allow_network: bool,
549 extra_syscalls: &[String],
550 ) -> BTreeMap<i64, Vec<SeccompRule>> {
551 Self::minimal_filter(allow_network, extra_syscalls).unwrap()
552 }
553
554 pub fn apply_minimal_filter(&mut self) -> Result<bool> {
562 self.apply_minimal_filter_with_mode(false, false)
563 }
564
565 pub fn apply_minimal_filter_with_mode(
570 &mut self,
571 best_effort: bool,
572 log_denied: bool,
573 ) -> Result<bool> {
574 self.apply_filter_for_network_mode(true, best_effort, log_denied, &[])
575 }
576
577 pub fn apply_filter_for_network_mode(
586 &mut self,
587 allow_network: bool,
588 best_effort: bool,
589 log_denied: bool,
590 extra_syscalls: &[String],
591 ) -> Result<bool> {
592 if self.applied {
593 debug!("Seccomp filter already applied, skipping");
594 return Ok(true);
595 }
596
597 info!(allow_network, "Applying seccomp filter");
598
599 let rules = match Self::minimal_filter(allow_network, extra_syscalls) {
600 Ok(r) => r,
601 Err(e) => {
602 if best_effort {
603 warn!(
604 "Failed to create seccomp rules: {} (continuing without seccomp)",
605 e
606 );
607 return Ok(false);
608 }
609 return Err(e);
610 }
611 };
612
613 let target_arch = match std::env::consts::ARCH.try_into() {
614 Ok(a) => a,
615 Err(e) => {
616 let msg = format!("Unsupported architecture: {:?}", e);
617 if best_effort {
618 warn!("{} (continuing without seccomp)", msg);
619 return Ok(false);
620 }
621 return Err(NucleusError::SeccompError(msg));
622 }
623 };
624
625 let bpf_prog: BpfProgram = match super::seccomp_bpf::compile_bitmap_bpf(
626 rules,
627 SeccompAction::KillProcess,
628 SeccompAction::Allow,
629 target_arch,
630 ) {
631 Ok(p) => p,
632 Err(e) => {
633 if best_effort {
634 warn!(
635 "Failed to compile BPF program: {} (continuing without seccomp)",
636 e
637 );
638 return Ok(false);
639 }
640 return Err(e);
641 }
642 };
643
644 match Self::apply_bpf_program(&bpf_prog, log_denied) {
646 Ok(_) => {
647 self.applied = true;
648 info!("Successfully applied seccomp filter");
649 Ok(true)
650 }
651 Err(e) => {
652 if best_effort {
653 warn!(
654 "Failed to apply seccomp filter: {} (continuing without seccomp)",
655 e
656 );
657 Ok(false)
658 } else {
659 Err(NucleusError::SeccompError(format!(
660 "Failed to apply seccomp filter: {}",
661 e
662 )))
663 }
664 }
665 }
666 }
667
668 pub fn apply_profile_from_file(
687 &mut self,
688 profile_path: &Path,
689 expected_sha256: Option<&str>,
690 audit_mode: bool,
691 ) -> Result<bool> {
692 if self.applied {
693 debug!("Seccomp filter already applied, skipping");
694 return Ok(true);
695 }
696
697 info!("Loading seccomp profile from {:?}", profile_path);
698
699 let content = std::fs::read(profile_path).map_err(|e| {
701 NucleusError::SeccompError(format!(
702 "Failed to read seccomp profile {:?}: {}",
703 profile_path, e
704 ))
705 })?;
706
707 if let Some(expected) = expected_sha256 {
709 let actual = sha256_hex(&content);
710 if actual != expected {
711 return Err(NucleusError::SeccompError(format!(
712 "Seccomp profile hash mismatch: expected {}, got {}",
713 expected, actual
714 )));
715 }
716 info!("Seccomp profile hash verified: {}", actual);
717 }
718
719 let profile: SeccompProfile = serde_json::from_slice(&content).map_err(|e| {
721 NucleusError::SeccompError(format!("Failed to parse seccomp profile: {}", e))
722 })?;
723
724 Self::warn_missing_arg_filters(&profile);
729
730 let mut rules: BTreeMap<i64, Vec<SeccompRule>> = BTreeMap::new();
732
733 for syscall_group in &profile.syscalls {
734 if syscall_group.action == "SCMP_ACT_ALLOW" {
735 for name in &syscall_group.names {
736 if let Some(nr) = syscall_name_to_number(name) {
737 rules.insert(nr, Vec::new());
738 } else {
739 warn!("Unknown syscall in profile: {} (skipping)", name);
740 }
741 }
742 }
743 }
744
745 let builtin_rules = Self::minimal_filter(true, &[])?;
750 for syscall_name in Self::ARG_FILTERED_SYSCALLS {
751 if let Some(nr) = syscall_name_to_number(syscall_name) {
752 if let std::collections::btree_map::Entry::Occupied(mut entry) = rules.entry(nr) {
753 if let Some(builtin) = builtin_rules.get(&nr) {
754 if !builtin.is_empty() {
755 info!(
756 "Merging built-in argument filters for '{}' into custom profile",
757 syscall_name
758 );
759 entry.insert(builtin.clone());
760 }
761 }
762 }
763 }
764 }
765 rules.entry(libc::SYS_clone3).or_default();
771
772 let target_arch = std::env::consts::ARCH.try_into().map_err(|e| {
773 NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
774 })?;
775
776 let bpf_prog: BpfProgram = super::seccomp_bpf::compile_bitmap_bpf(
777 rules,
778 SeccompAction::KillProcess,
779 SeccompAction::Allow,
780 target_arch,
781 )?;
782
783 match Self::apply_bpf_program(&bpf_prog, audit_mode) {
784 Ok(_) => {
785 self.applied = true;
786 info!(
787 "Seccomp profile applied from {:?} (log_denied={})",
788 profile_path, audit_mode
789 );
790 Ok(true)
791 }
792 Err(e) => Err(e),
793 }
794 }
795
796 pub fn apply_trace_filter(&mut self) -> Result<bool> {
801 if self.applied {
802 debug!("Seccomp filter already applied, skipping trace filter");
803 return Ok(true);
804 }
805
806 info!("Applying seccomp trace filter (allow-all + LOG)");
807
808 let filter = SeccompFilter::new(
812 BTreeMap::new(),
813 SeccompAction::Allow, SeccompAction::Allow, std::env::consts::ARCH.try_into().map_err(|e| {
816 NucleusError::SeccompError(format!("Unsupported architecture: {:?}", e))
817 })?,
818 )
819 .map_err(|e| NucleusError::SeccompError(format!("Failed to create trace filter: {}", e)))?;
820
821 let bpf_prog: BpfProgram = filter.try_into().map_err(|e| {
822 NucleusError::SeccompError(format!("Failed to compile trace BPF: {}", e))
823 })?;
824
825 Self::apply_bpf_program(&bpf_prog, true)?;
827 self.applied = true;
828 info!("Seccomp trace filter applied (all syscalls allowed + logged)");
829 Ok(true)
830 }
831
    /// Syscall names that need argument-level attention in custom profiles.
    ///
    /// `warn_missing_arg_filters` warns when a profile allows one of these without `args`,
    /// and `apply_profile_from_file` replaces the profile's entry for them with the built-in
    /// rules from `minimal_filter` whenever those rules are non-empty.
    ///
    /// NOTE(review): `minimal_filter` also builds an argument rule for `prlimit64`, which is
    /// not listed here, and inserts `clone3` with an empty rule vector, so the merge is a
    /// no-op for it. Confirm both are intended.
    const ARG_FILTERED_SYSCALLS: &'static [&'static str] = &[
        "clone", "clone3", "execveat", "ioctl", "mprotect", "prctl", "socket",
    ];
837
    /// Syscall names a user may add on top of the built-in allowlist.
    ///
    /// `minimal_filter` consults this list for every entry of `extra_syscalls`: a requested
    /// syscall that is not yet allowed is only added when its name appears here; any other
    /// name is logged and stays blocked.
    const OPT_IN_SYSCALLS: &'static [&'static str] = &[
        // io_uring
        "io_uring_setup",
        "io_uring_enter",
        "io_uring_register",
        // System V message queues
        "msgget",
        "msgsnd",
        "msgrcv",
        "msgctl",
        // POSIX message queues
        "mq_open",
        "mq_unlink",
        "mq_timedsend",
        "mq_timedreceive",
        "mq_notify",
        "mq_getsetattr",
        // POSIX timers
        "timer_create",
        "timer_settime",
        "timer_gettime",
        "timer_getoverrun",
        "timer_delete",
        // inotify / fanotify
        "inotify_init",
        "inotify_init1",
        "inotify_add_watch",
        "inotify_rm_watch",
        "fanotify_init",
        "fanotify_mark",
        // Memory management extras
        "mincore",
        "mlockall",
        "munlockall",
        "membarrier",
        "process_madvise",
        "mbind",
        "set_mempolicy",
        "get_mempolicy",
        "set_mempolicy_home_node",
        "pkey_mprotect",
        "pkey_alloc",
        "pkey_free",
        "cachestat",
        "remap_file_pages",
        // File and filesystem extras
        "sync",
        "syncfs",
        "sync_file_range",
        "readahead",
        "vmsplice",
        "openat2",
        "name_to_handle_at",
        "open_by_handle_at",
        "io_cancel",
        "io_pgetevents",
        "creat",
        "fchmodat2",
        "statmount",
        "listmount",
        "utimensat",
        "utimes",
        "utime",
        "futimesat",
        // Extended attribute modification and *at variants
        "setxattr",
        "lsetxattr",
        "fsetxattr",
        "removexattr",
        "lremovexattr",
        "fremovexattr",
        "setxattrat",
        "getxattrat",
        "listxattrat",
        "removexattrat",
        // Batched socket messaging
        "recvmmsg",
        "sendmmsg",
        // Scheduling
        "sched_setparam",
        "sched_setscheduler",
        "sched_get_priority_max",
        "sched_get_priority_min",
        "sched_rr_get_interval",
        "sched_setattr",
        "sched_getattr",
        // Resource limits and priorities
        "setrlimit",
        "getpriority",
        "setpriority",
        "ioprio_set",
        "ioprio_get",
        // Miscellaneous process control
        "vfork",
        "pause",
        "alarm",
        "tkill",
        "sysinfo",
        "personality",
        "vhangup",
        "time",
        "pidfd_open",
        "pidfd_send_signal",
        "pidfd_getfd",
        // User and group identity
        "setuid",
        "setgid",
        "setreuid",
        "setregid",
        "setresuid",
        "getresuid",
        "setresgid",
        "getresgid",
        "setfsuid",
        "setfsgid",
        "setgroups",
        "getsid",
        // Capabilities (query only)
        "capget",
        // Thread-directed signal queueing
        "rt_tgsigqueueinfo",
        // Device nodes, kernel log, clock adjustment and other privileged-looking calls
        "mknod",
        "mknodat",
        "syslog",
        "clock_settime",
        "clock_adjtime",
        "adjtimex",
        "kcmp",
        "epoll_pwait2",
        // futex2 family
        "futex_waitv",
        "futex_wake",
        "futex_wait",
        "futex_requeue",
        // Installing further seccomp filters
        "seccomp",
        // Kernel keyring
        "add_key",
        "request_key",
        "keyctl",
    ];
986
987 fn warn_missing_arg_filters(profile: &SeccompProfile) {
990 for group in &profile.syscalls {
991 if group.action != "SCMP_ACT_ALLOW" {
992 continue;
993 }
994 for name in &group.names {
995 if Self::ARG_FILTERED_SYSCALLS.contains(&name.as_str()) && group.args.is_empty() {
996 warn!(
997 "Custom seccomp profile allows '{}' without argument filters. \
998 The built-in filter restricts this syscall at the argument level. \
999 This profile weakens security compared to the default.",
1000 name
1001 );
1002 }
1003 }
1004 }
1005 }
1006
1007 fn has_effective_cap(cap: i32) -> bool {
1010 let Ok(status) = std::fs::read_to_string("/proc/self/status") else {
1011 return true;
1013 };
1014 for line in status.lines() {
1015 if let Some(hex) = line.strip_prefix("CapEff:\t") {
1016 if let Ok(eff) = u64::from_str_radix(hex.trim(), 16) {
1017 return eff & (1u64 << cap) != 0;
1018 }
1019 }
1020 }
1021 true }
1023
    /// Returns `true` once one of the `apply_*` methods has installed a filter through
    /// this manager.
    pub fn is_applied(&self) -> bool {
        self.applied
    }
1028
1029 fn apply_bpf_program(bpf_prog: &BpfProgram, log_denied: bool) -> Result<()> {
1030 let mut flags: libc::c_ulong = 0;
1031 if log_denied {
1032 flags |= libc::SECCOMP_FILTER_FLAG_LOG as libc::c_ulong;
1033 }
1034
1035 match Self::apply_bpf_program_with_flags(bpf_prog, flags) {
1036 Ok(()) => Ok(()),
1037 Err(err)
1038 if log_denied
1039 && err.raw_os_error() == Some(libc::EINVAL)
1040 && libc::SECCOMP_FILTER_FLAG_LOG != 0 =>
1041 {
1042 warn!(
1043 "Kernel rejected SECCOMP_FILTER_FLAG_LOG; continuing with seccomp \
1044 enforcement without deny logging"
1045 );
1046 Self::apply_bpf_program_with_flags(bpf_prog, 0)?;
1047 Ok(())
1048 }
1049 Err(err) => Err(NucleusError::SeccompError(format!(
1050 "Failed to apply seccomp filter: {}",
1051 err
1052 ))),
1053 }
1054 }
1055
1056 fn apply_bpf_program_with_flags(
1057 bpf_prog: &BpfProgram,
1058 flags: libc::c_ulong,
1059 ) -> std::io::Result<()> {
1060 let rc = unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
1063 if rc != 0 {
1064 return Err(std::io::Error::last_os_error());
1065 }
1066
1067 let prog = libc::sock_fprog {
1068 len: bpf_prog.len() as u16,
1069 filter: bpf_prog.as_ptr() as *mut libc::sock_filter,
1070 };
1071
1072 let rc = unsafe {
1075 libc::syscall(
1076 libc::SYS_seccomp,
1077 libc::SECCOMP_SET_MODE_FILTER,
1078 flags,
1079 &prog as *const libc::sock_fprog,
1080 )
1081 };
1082
1083 if rc < 0 {
1084 return Err(std::io::Error::last_os_error());
1085 }
1086
1087 Ok(())
1088 }
1089}
1090
1091use crate::security::seccomp_generate::SeccompProfile;
1093
1094fn syscall_name_to_number(name: &str) -> Option<i64> {
1098 match name {
1099 "read" => Some(libc::SYS_read),
1101 "write" => Some(libc::SYS_write),
1102 #[cfg(target_arch = "x86_64")]
1103 "open" => Some(libc::SYS_open),
1104 "openat" => Some(libc::SYS_openat),
1105 "close" => Some(libc::SYS_close),
1106 #[cfg(target_arch = "x86_64")]
1107 "stat" => Some(libc::SYS_stat),
1108 "fstat" => Some(libc::SYS_fstat),
1109 #[cfg(target_arch = "x86_64")]
1110 "lstat" => Some(libc::SYS_lstat),
1111 "lseek" => Some(libc::SYS_lseek),
1112 #[cfg(target_arch = "x86_64")]
1113 "access" => Some(libc::SYS_access),
1114 "fcntl" => Some(libc::SYS_fcntl),
1115 "readv" => Some(libc::SYS_readv),
1116 "writev" => Some(libc::SYS_writev),
1117 "pread64" => Some(libc::SYS_pread64),
1118 "pwrite64" => Some(libc::SYS_pwrite64),
1119 #[cfg(target_arch = "x86_64")]
1120 "readlink" => Some(libc::SYS_readlink),
1121 "readlinkat" => Some(libc::SYS_readlinkat),
1122 "newfstatat" => Some(libc::SYS_newfstatat),
1123 "statx" => Some(libc::SYS_statx),
1124 "faccessat" => Some(libc::SYS_faccessat),
1125 "faccessat2" => Some(libc::SYS_faccessat2),
1126 "dup" => Some(libc::SYS_dup),
1127 #[cfg(target_arch = "x86_64")]
1128 "dup2" => Some(libc::SYS_dup2),
1129 "dup3" => Some(libc::SYS_dup3),
1130 #[cfg(target_arch = "x86_64")]
1131 "pipe" => Some(libc::SYS_pipe),
1132 "pipe2" => Some(libc::SYS_pipe2),
1133 #[cfg(target_arch = "x86_64")]
1134 "unlink" => Some(libc::SYS_unlink),
1135 "unlinkat" => Some(libc::SYS_unlinkat),
1136 #[cfg(target_arch = "x86_64")]
1137 "rename" => Some(libc::SYS_rename),
1138 "renameat" => Some(libc::SYS_renameat),
1139 "renameat2" => Some(libc::SYS_renameat2),
1140 #[cfg(target_arch = "x86_64")]
1141 "link" => Some(libc::SYS_link),
1142 "linkat" => Some(libc::SYS_linkat),
1143 #[cfg(target_arch = "x86_64")]
1144 "symlink" => Some(libc::SYS_symlink),
1145 "symlinkat" => Some(libc::SYS_symlinkat),
1146 #[cfg(target_arch = "x86_64")]
1147 "chmod" => Some(libc::SYS_chmod),
1148 "fchmod" => Some(libc::SYS_fchmod),
1149 "fchmodat" => Some(libc::SYS_fchmodat),
1150 "truncate" => Some(libc::SYS_truncate),
1151 "ftruncate" => Some(libc::SYS_ftruncate),
1152 "fallocate" => Some(libc::SYS_fallocate),
1153 #[cfg(target_arch = "x86_64")]
1154 "fadvise64" => Some(libc::SYS_fadvise64),
1155 "fsync" => Some(libc::SYS_fsync),
1156 "fdatasync" => Some(libc::SYS_fdatasync),
1157 "flock" => Some(libc::SYS_flock),
1158 #[cfg(target_arch = "x86_64")]
1159 "sendfile" => Some(libc::SYS_sendfile),
1160 "copy_file_range" => Some(libc::SYS_copy_file_range),
1161 "splice" => Some(libc::SYS_splice),
1162 "tee" => Some(libc::SYS_tee),
1163 "mmap" => Some(libc::SYS_mmap),
1165 "munmap" => Some(libc::SYS_munmap),
1166 "mprotect" => Some(libc::SYS_mprotect),
1167 "brk" => Some(libc::SYS_brk),
1168 "mremap" => Some(libc::SYS_mremap),
1169 "madvise" => Some(libc::SYS_madvise),
1170 "msync" => Some(libc::SYS_msync),
1171 "mlock" => Some(libc::SYS_mlock),
1172 "mlock2" => Some(libc::SYS_mlock2),
1173 "munlock" => Some(libc::SYS_munlock),
1174 "shmget" => Some(libc::SYS_shmget),
1176 "shmat" => Some(libc::SYS_shmat),
1177 "shmdt" => Some(libc::SYS_shmdt),
1178 "shmctl" => Some(libc::SYS_shmctl),
1179 "semget" => Some(libc::SYS_semget),
1180 "semop" => Some(libc::SYS_semop),
1181 "semctl" => Some(libc::SYS_semctl),
1182 "semtimedop" => Some(libc::SYS_semtimedop),
1183 #[cfg(target_arch = "x86_64")]
1185 "fork" => Some(libc::SYS_fork),
1186 "clone" => Some(libc::SYS_clone),
1187 "clone3" => Some(libc::SYS_clone3),
1188 "execve" => Some(libc::SYS_execve),
1189 "execveat" => Some(libc::SYS_execveat),
1190 "wait4" => Some(libc::SYS_wait4),
1191 "waitid" => Some(libc::SYS_waitid),
1192 "exit" => Some(libc::SYS_exit),
1193 "exit_group" => Some(libc::SYS_exit_group),
1194 "getpid" => Some(libc::SYS_getpid),
1195 "gettid" => Some(libc::SYS_gettid),
1196 "getuid" => Some(libc::SYS_getuid),
1197 "getgid" => Some(libc::SYS_getgid),
1198 "geteuid" => Some(libc::SYS_geteuid),
1199 "getegid" => Some(libc::SYS_getegid),
1200 "getppid" => Some(libc::SYS_getppid),
1201 #[cfg(target_arch = "x86_64")]
1202 "getpgrp" => Some(libc::SYS_getpgrp),
1203 "setsid" => Some(libc::SYS_setsid),
1204 "getgroups" => Some(libc::SYS_getgroups),
1205 "rt_sigaction" => Some(libc::SYS_rt_sigaction),
1207 "rt_sigprocmask" => Some(libc::SYS_rt_sigprocmask),
1208 "rt_sigreturn" => Some(libc::SYS_rt_sigreturn),
1209 "rt_sigsuspend" => Some(libc::SYS_rt_sigsuspend),
1210 "rt_sigtimedwait" => Some(libc::SYS_rt_sigtimedwait),
1211 "rt_sigpending" => Some(libc::SYS_rt_sigpending),
1212 "rt_sigqueueinfo" => Some(libc::SYS_rt_sigqueueinfo),
1213 "sigaltstack" => Some(libc::SYS_sigaltstack),
1214 "restart_syscall" => Some(libc::SYS_restart_syscall),
1215 "kill" => Some(libc::SYS_kill),
1216 "tgkill" => Some(libc::SYS_tgkill),
1217 "clock_gettime" => Some(libc::SYS_clock_gettime),
1219 "clock_getres" => Some(libc::SYS_clock_getres),
1220 "clock_nanosleep" => Some(libc::SYS_clock_nanosleep),
1221 "gettimeofday" => Some(libc::SYS_gettimeofday),
1222 "nanosleep" => Some(libc::SYS_nanosleep),
1223 "getcwd" => Some(libc::SYS_getcwd),
1225 "chdir" => Some(libc::SYS_chdir),
1226 "fchdir" => Some(libc::SYS_fchdir),
1227 #[cfg(target_arch = "x86_64")]
1228 "mkdir" => Some(libc::SYS_mkdir),
1229 "mkdirat" => Some(libc::SYS_mkdirat),
1230 #[cfg(target_arch = "x86_64")]
1231 "rmdir" => Some(libc::SYS_rmdir),
1232 #[cfg(target_arch = "x86_64")]
1233 "getdents" => Some(libc::SYS_getdents),
1234 "getdents64" => Some(libc::SYS_getdents64),
1235 "socket" => Some(libc::SYS_socket),
1237 "connect" => Some(libc::SYS_connect),
1238 "sendto" => Some(libc::SYS_sendto),
1239 "recvfrom" => Some(libc::SYS_recvfrom),
1240 "sendmsg" => Some(libc::SYS_sendmsg),
1241 "recvmsg" => Some(libc::SYS_recvmsg),
1242 "shutdown" => Some(libc::SYS_shutdown),
1243 "bind" => Some(libc::SYS_bind),
1244 "listen" => Some(libc::SYS_listen),
1245 "accept" => Some(libc::SYS_accept),
1246 "accept4" => Some(libc::SYS_accept4),
1247 "setsockopt" => Some(libc::SYS_setsockopt),
1248 "getsockopt" => Some(libc::SYS_getsockopt),
1249 "getsockname" => Some(libc::SYS_getsockname),
1250 "getpeername" => Some(libc::SYS_getpeername),
1251 "socketpair" => Some(libc::SYS_socketpair),
1252 #[cfg(target_arch = "x86_64")]
1254 "poll" => Some(libc::SYS_poll),
1255 "ppoll" => Some(libc::SYS_ppoll),
1256 #[cfg(target_arch = "x86_64")]
1257 "select" => Some(libc::SYS_select),
1258 "pselect6" => Some(libc::SYS_pselect6),
1259 #[cfg(target_arch = "x86_64")]
1260 "epoll_create" => Some(libc::SYS_epoll_create),
1261 "epoll_create1" => Some(libc::SYS_epoll_create1),
1262 "epoll_ctl" => Some(libc::SYS_epoll_ctl),
1263 #[cfg(target_arch = "x86_64")]
1264 "epoll_wait" => Some(libc::SYS_epoll_wait),
1265 "epoll_pwait" => Some(libc::SYS_epoll_pwait),
1266 #[cfg(target_arch = "x86_64")]
1267 "eventfd" => Some(libc::SYS_eventfd),
1268 "eventfd2" => Some(libc::SYS_eventfd2),
1269 #[cfg(target_arch = "x86_64")]
1270 "signalfd" => Some(libc::SYS_signalfd),
1271 "signalfd4" => Some(libc::SYS_signalfd4),
1272 "timerfd_create" => Some(libc::SYS_timerfd_create),
1273 "timerfd_settime" => Some(libc::SYS_timerfd_settime),
1274 "timerfd_gettime" => Some(libc::SYS_timerfd_gettime),
1275 "uname" => Some(libc::SYS_uname),
1277 "getrandom" => Some(libc::SYS_getrandom),
1278 "futex" => Some(libc::SYS_futex),
1279 "set_tid_address" => Some(libc::SYS_set_tid_address),
1280 "set_robust_list" => Some(libc::SYS_set_robust_list),
1281 "get_robust_list" => Some(libc::SYS_get_robust_list),
1282 #[cfg(target_arch = "x86_64")]
1283 "arch_prctl" => Some(libc::SYS_arch_prctl),
1284 "sysinfo" => Some(libc::SYS_sysinfo),
1285 "umask" => Some(libc::SYS_umask),
1286 #[cfg(target_arch = "x86_64")]
1287 "getrlimit" => Some(libc::SYS_getrlimit),
1288 "prlimit64" => Some(libc::SYS_prlimit64),
1289 "getrusage" => Some(libc::SYS_getrusage),
1290 "times" => Some(libc::SYS_times),
1291 "sched_yield" => Some(libc::SYS_sched_yield),
1292 "sched_getaffinity" => Some(libc::SYS_sched_getaffinity),
1293 "getcpu" => Some(libc::SYS_getcpu),
1294 "rseq" => Some(libc::SYS_rseq),
1295 "close_range" => Some(libc::SYS_close_range),
1296 "fchown" => Some(libc::SYS_fchown),
1298 "fchownat" => Some(libc::SYS_fchownat),
1299 #[cfg(target_arch = "x86_64")]
1300 "chown" => Some(libc::SYS_chown),
1301 #[cfg(target_arch = "x86_64")]
1302 "lchown" => Some(libc::SYS_lchown),
1303 "io_uring_setup" => Some(libc::SYS_io_uring_setup),
1305 "io_uring_enter" => Some(libc::SYS_io_uring_enter),
1306 "io_uring_register" => Some(libc::SYS_io_uring_register),
1307 "io_setup" => Some(libc::SYS_io_setup),
1309 "io_destroy" => Some(libc::SYS_io_destroy),
1310 "io_submit" => Some(libc::SYS_io_submit),
1311 "io_getevents" => Some(libc::SYS_io_getevents),
1312 "setitimer" => Some(libc::SYS_setitimer),
1314 "getitimer" => Some(libc::SYS_getitimer),
1315 "setpgid" => Some(libc::SYS_setpgid),
1317 "getpgid" => Some(libc::SYS_getpgid),
1318 "memfd_create" => Some(libc::SYS_memfd_create),
1319 "ioctl" => Some(libc::SYS_ioctl),
1320 "prctl" => Some(libc::SYS_prctl),
1321 "landlock_create_ruleset" => Some(libc::SYS_landlock_create_ruleset),
1323 "landlock_add_rule" => Some(libc::SYS_landlock_add_rule),
1324 "landlock_restrict_self" => Some(libc::SYS_landlock_restrict_self),
1325 "mincore" => Some(libc::SYS_mincore),
1328 "mlockall" => Some(libc::SYS_mlockall),
1329 "munlockall" => Some(libc::SYS_munlockall),
1330 "mbind" => Some(libc::SYS_mbind),
1331 "set_mempolicy" => Some(libc::SYS_set_mempolicy),
1332 "get_mempolicy" => Some(libc::SYS_get_mempolicy),
1333 "memfd_secret" => Some(libc::SYS_memfd_secret),
1334 "membarrier" => Some(libc::SYS_membarrier),
1335 "process_madvise" => Some(libc::SYS_process_madvise),
1336 "pkey_mprotect" => Some(libc::SYS_pkey_mprotect),
1337 "pkey_alloc" => Some(libc::SYS_pkey_alloc),
1338 "pkey_free" => Some(libc::SYS_pkey_free),
1339 "mseal" => Some(libc::SYS_mseal),
1340 "map_shadow_stack" => Some(453),
1341 "remap_file_pages" => Some(libc::SYS_remap_file_pages),
1342 "set_mempolicy_home_node" => Some(libc::SYS_set_mempolicy_home_node),
1343 "cachestat" => Some(451),
1344 #[cfg(target_arch = "x86_64")]
1346 "vfork" => Some(libc::SYS_vfork),
1347 #[cfg(target_arch = "x86_64")]
1348 "pause" => Some(libc::SYS_pause),
1349 #[cfg(target_arch = "x86_64")]
1350 "alarm" => Some(libc::SYS_alarm),
1351 "tkill" => Some(libc::SYS_tkill),
1352 "ptrace" => Some(libc::SYS_ptrace),
1353 "process_vm_readv" => Some(libc::SYS_process_vm_readv),
1354 "process_vm_writev" => Some(libc::SYS_process_vm_writev),
1355 "process_mrelease" => Some(libc::SYS_process_mrelease),
1356 "kcmp" => Some(libc::SYS_kcmp),
1357 "unshare" => Some(libc::SYS_unshare),
1358 "setns" => Some(libc::SYS_setns),
1359 "pidfd_open" => Some(libc::SYS_pidfd_open),
1360 "pidfd_send_signal" => Some(libc::SYS_pidfd_send_signal),
1361 "pidfd_getfd" => Some(libc::SYS_pidfd_getfd),
1362 "setuid" => Some(libc::SYS_setuid),
1364 "setgid" => Some(libc::SYS_setgid),
1365 "setreuid" => Some(libc::SYS_setreuid),
1366 "setregid" => Some(libc::SYS_setregid),
1367 "setresuid" => Some(libc::SYS_setresuid),
1368 "getresuid" => Some(libc::SYS_getresuid),
1369 "setresgid" => Some(libc::SYS_setresgid),
1370 "getresgid" => Some(libc::SYS_getresgid),
1371 "setfsuid" => Some(libc::SYS_setfsuid),
1372 "setfsgid" => Some(libc::SYS_setfsgid),
1373 "setgroups" => Some(libc::SYS_setgroups),
1374 "getsid" => Some(libc::SYS_getsid),
1375 "capget" => Some(libc::SYS_capget),
1377 "capset" => Some(libc::SYS_capset),
1378 "rt_tgsigqueueinfo" => Some(libc::SYS_rt_tgsigqueueinfo),
1380 "msgget" => Some(libc::SYS_msgget),
1382 "msgsnd" => Some(libc::SYS_msgsnd),
1383 "msgrcv" => Some(libc::SYS_msgrcv),
1384 "msgctl" => Some(libc::SYS_msgctl),
1385 "timer_create" => Some(libc::SYS_timer_create),
1387 "timer_settime" => Some(libc::SYS_timer_settime),
1388 "timer_gettime" => Some(libc::SYS_timer_gettime),
1389 "timer_getoverrun" => Some(libc::SYS_timer_getoverrun),
1390 "timer_delete" => Some(libc::SYS_timer_delete),
1391 "clock_settime" => Some(libc::SYS_clock_settime),
1392 "clock_adjtime" => Some(libc::SYS_clock_adjtime),
1393 #[cfg(target_arch = "x86_64")]
1394 "time" => Some(libc::SYS_time),
1395 #[cfg(target_arch = "x86_64")]
1397 "creat" => Some(libc::SYS_creat),
1398 "readahead" => Some(libc::SYS_readahead),
1399 "sync" => Some(libc::SYS_sync),
1400 "syncfs" => Some(libc::SYS_syncfs),
1401 "vmsplice" => Some(libc::SYS_vmsplice),
1402 "utimensat" => Some(libc::SYS_utimensat),
1403 #[cfg(target_arch = "x86_64")]
1404 "utimes" => Some(libc::SYS_utimes),
1405 #[cfg(target_arch = "x86_64")]
1406 "utime" => Some(libc::SYS_utime),
1407 #[cfg(target_arch = "x86_64")]
1408 "futimesat" => Some(libc::SYS_futimesat),
1409 "openat2" => Some(libc::SYS_openat2),
1410 "name_to_handle_at" => Some(libc::SYS_name_to_handle_at),
1411 "open_by_handle_at" => Some(libc::SYS_open_by_handle_at),
1412 "fchmodat2" => Some(libc::SYS_fchmodat2),
1413 "statmount" => Some(457),
1414 "listmount" => Some(458),
1415 "setxattr" => Some(libc::SYS_setxattr),
1417 "lsetxattr" => Some(libc::SYS_lsetxattr),
1418 "fsetxattr" => Some(libc::SYS_fsetxattr),
1419 "removexattr" => Some(libc::SYS_removexattr),
1420 "lremovexattr" => Some(libc::SYS_lremovexattr),
1421 "fremovexattr" => Some(libc::SYS_fremovexattr),
1422 "setxattrat" => Some(463),
1423 "getxattrat" => Some(464),
1424 "listxattrat" => Some(465),
1425 "removexattrat" => Some(466),
1426 "recvmmsg" => Some(libc::SYS_recvmmsg),
1428 "sendmmsg" => Some(libc::SYS_sendmmsg),
1429 #[cfg(target_arch = "x86_64")]
1431 "inotify_init" => Some(libc::SYS_inotify_init),
1432 "inotify_init1" => Some(libc::SYS_inotify_init1),
1433 "inotify_add_watch" => Some(libc::SYS_inotify_add_watch),
1434 "inotify_rm_watch" => Some(libc::SYS_inotify_rm_watch),
1435 "fanotify_init" => Some(libc::SYS_fanotify_init),
1437 "fanotify_mark" => Some(libc::SYS_fanotify_mark),
1438 "epoll_pwait2" => Some(libc::SYS_epoll_pwait2),
1440 "sched_setparam" => Some(libc::SYS_sched_setparam),
1442 "sched_setscheduler" => Some(libc::SYS_sched_setscheduler),
1443 "sched_get_priority_max" => Some(libc::SYS_sched_get_priority_max),
1444 "sched_get_priority_min" => Some(libc::SYS_sched_get_priority_min),
1445 "sched_rr_get_interval" => Some(libc::SYS_sched_rr_get_interval),
1446 "sched_setattr" => Some(libc::SYS_sched_setattr),
1447 "sched_getattr" => Some(libc::SYS_sched_getattr),
1448 "sched_setaffinity" => Some(libc::SYS_sched_setaffinity),
1449 #[cfg(target_arch = "x86_64")]
1451 "setrlimit" => Some(libc::SYS_setrlimit),
1452 "getpriority" => Some(libc::SYS_getpriority),
1453 "setpriority" => Some(libc::SYS_setpriority),
1454 "ioprio_set" => Some(libc::SYS_ioprio_set),
1455 "ioprio_get" => Some(libc::SYS_ioprio_get),
1456 "futex_waitv" => Some(libc::SYS_futex_waitv),
1458 "futex_wake" => Some(454),
1459 "futex_wait" => Some(455),
1460 "futex_requeue" => Some(456),
1461 "init_module" => Some(libc::SYS_init_module),
1463 "finit_module" => Some(libc::SYS_finit_module),
1464 "delete_module" => Some(libc::SYS_delete_module),
1465 "bpf" => Some(libc::SYS_bpf),
1467 "perf_event_open" => Some(libc::SYS_perf_event_open),
1468 "seccomp" => Some(libc::SYS_seccomp),
1470 "userfaultfd" => Some(libc::SYS_userfaultfd),
1472 "mount" => Some(libc::SYS_mount),
1474 "umount2" => Some(libc::SYS_umount2),
1475 "pivot_root" => Some(libc::SYS_pivot_root),
1476 "mount_setattr" => Some(libc::SYS_mount_setattr),
1477 "open_tree" => Some(libc::SYS_open_tree),
1478 "open_tree_attr" => Some(467),
1479 "move_mount" => Some(libc::SYS_move_mount),
1480 "fsopen" => Some(libc::SYS_fsopen),
1481 "fsconfig" => Some(libc::SYS_fsconfig),
1482 "fsmount" => Some(libc::SYS_fsmount),
1483 "fspick" => Some(libc::SYS_fspick),
1484 "syslog" => Some(libc::SYS_syslog),
1486 "reboot" => Some(libc::SYS_reboot),
1487 "swapon" => Some(libc::SYS_swapon),
1488 "swapoff" => Some(libc::SYS_swapoff),
1489 "chroot" => Some(libc::SYS_chroot),
1490 "acct" => Some(libc::SYS_acct),
1491 "settimeofday" => Some(libc::SYS_settimeofday),
1492 "sethostname" => Some(libc::SYS_sethostname),
1493 "setdomainname" => Some(libc::SYS_setdomainname),
1494 "adjtimex" => Some(libc::SYS_adjtimex),
1495 #[cfg(target_arch = "x86_64")]
1496 "modify_ldt" => Some(libc::SYS_modify_ldt),
1497 #[cfg(target_arch = "x86_64")]
1498 "iopl" => Some(libc::SYS_iopl),
1499 #[cfg(target_arch = "x86_64")]
1500 "ioperm" => Some(libc::SYS_ioperm),
1501 "quotactl" => Some(libc::SYS_quotactl),
1502 "quotactl_fd" => Some(libc::SYS_quotactl_fd),
1503 "personality" => Some(libc::SYS_personality),
1504 "vhangup" => Some(libc::SYS_vhangup),
1505 #[cfg(target_arch = "x86_64")]
1506 "ustat" => Some(libc::SYS_ustat),
1507 #[cfg(target_arch = "x86_64")]
1508 "sysfs" => Some(libc::SYS_sysfs),
1509 "mknod" => Some(libc::SYS_mknod),
1510 "mknodat" => Some(libc::SYS_mknodat),
1511 "migrate_pages" => Some(libc::SYS_migrate_pages),
1512 "move_pages" => Some(libc::SYS_move_pages),
1513 #[cfg(target_arch = "x86_64")]
1514 "kexec_load" => Some(libc::SYS_kexec_load),
1515 "kexec_file_load" => Some(libc::SYS_kexec_file_load),
1516 "mq_open" => Some(libc::SYS_mq_open),
1518 "mq_unlink" => Some(libc::SYS_mq_unlink),
1519 "mq_timedsend" => Some(libc::SYS_mq_timedsend),
1520 "mq_timedreceive" => Some(libc::SYS_mq_timedreceive),
1521 "mq_notify" => Some(libc::SYS_mq_notify),
1522 "mq_getsetattr" => Some(libc::SYS_mq_getsetattr),
1523 "add_key" => Some(libc::SYS_add_key),
1525 "request_key" => Some(libc::SYS_request_key),
1526 "keyctl" => Some(libc::SYS_keyctl),
1527 "io_pgetevents" => Some(333),
1529 "lsm_get_self_attr" => Some(459),
1531 "lsm_set_self_attr" => Some(460),
1532 "lsm_list_modules" => Some(461),
1533 #[cfg(target_arch = "x86_64")]
1534 "lookup_dcookie" => Some(libc::SYS_lookup_dcookie),
1535 "uretprobe" => Some(335),
1536 _ => None,
1537 }
1538}
1539
1540impl Default for SeccompManager {
1541 fn default() -> Self {
1542 Self::new()
1543 }
1544}
1545
1546#[cfg(test)]
1547mod tests {
1548 use super::*;
1549
/// A freshly constructed manager must report that no filter has been applied.
#[test]
fn test_seccomp_manager_initial_state() {
    assert!(!SeccompManager::new().is_applied());
}
1555
// NOTE(review): despite its name, this test never applies a filter; it only
// checks the pre-apply state. Presumably a real filter cannot be installed
// inside the test process — confirm before counting this as idempotency
// coverage.
#[test]
fn test_apply_idempotent() {
    let manager = SeccompManager::new();
    let already_applied = manager.is_applied();
    assert!(!already_applied);
}
1564
/// The denied clone-flag mask must have the `CLONE_NEWCGROUP` bit set.
#[test]
fn test_clone_denied_flags_include_newcgroup() {
    let cgroup_bit = libc::CLONE_NEWCGROUP as u64;
    assert_ne!(DENIED_CLONE_NAMESPACE_FLAGS & cgroup_bit, 0);
}
1572
/// The denied clone-flag mask must have the `CLONE_NEWTIME` bit set.
#[test]
fn test_clone_denied_flags_include_newtime() {
    let time_bit = libc::CLONE_NEWTIME as u64;
    let overlap = DENIED_CLONE_NAMESPACE_FLAGS & time_bit;
    assert_ne!(
        overlap, 0,
        "CLONE_NEWTIME must be in denied clone namespace flags"
    );
}
1581
/// With networking disabled, `AF_UNIX` is the only permitted socket domain.
#[test]
fn test_network_none_socket_domains_are_unix_only() {
    let unix_only = vec![libc::AF_UNIX];
    assert_eq!(SeccompManager::allowed_socket_domains(false), unix_only);
}
1587
/// With networking enabled, UNIX/INET/INET6 sockets are permitted but
/// `AF_NETLINK` is still absent from the allowed domains.
#[test]
fn test_network_enabled_socket_domains_exclude_netlink() {
    let domains = SeccompManager::allowed_socket_domains(true);

    for required in [libc::AF_UNIX, libc::AF_INET, libc::AF_INET6] {
        assert!(domains.contains(&required));
    }
    assert!(!domains.contains(&libc::AF_NETLINK));
}
1596
/// Network-mode syscalls are empty when networking is off and include the
/// core socket calls when it is on.
#[test]
fn test_network_mode_syscalls_only_enabled_when_network_allowed() {
    assert!(SeccompManager::network_mode_syscalls(false).is_empty());

    let with_network = SeccompManager::network_mode_syscalls(true);
    for nr in [
        libc::SYS_connect,
        libc::SYS_bind,
        libc::SYS_listen,
        libc::SYS_accept,
        libc::SYS_setsockopt,
    ] {
        assert!(with_network.contains(&nr));
    }
}
1609
/// All three Landlock syscalls must be present in the base allowlist.
#[test]
fn test_landlock_bootstrap_syscalls_present_in_base_allowlist() {
    let allowlist = SeccompManager::base_allowed_syscalls();

    for nr in [
        libc::SYS_landlock_create_ruleset,
        libc::SYS_landlock_add_rule,
        libc::SYS_landlock_restrict_self,
    ] {
        assert!(allowlist.contains(&nr));
    }
}
1617
/// No syscall number in 512..=547 (the range the assertion message labels as
/// x32) may appear in either the base or the network allowlist.
#[test]
fn test_x32_legacy_range_not_allowlisted() {
    let base = SeccompManager::base_allowed_syscalls();
    let net = SeccompManager::network_mode_syscalls(true);

    for nr in 512_i64..=547_i64 {
        let allowlisted = base.contains(&nr) || net.contains(&nr);
        assert!(
            !allowlisted,
            "x32 syscall number {} unexpectedly allowlisted",
            nr
        );
    }
}
1630
/// No syscall number in 359..=373 (the range the assertion message labels as
/// i386 compat) may appear in either the base or the network allowlist.
#[test]
fn test_i386_compat_socketcall_range_not_allowlisted() {
    let base = SeccompManager::base_allowed_syscalls();
    let net = SeccompManager::network_mode_syscalls(true);

    for nr in 359_i64..=373_i64 {
        let allowlisted = base.iter().chain(net.iter()).any(|&allowed| allowed == nr);
        assert!(
            !allowlisted,
            "i386 compat syscall number {} unexpectedly allowlisted",
            nr
        );
    }
}
1645
/// Pins the sizes of the base and network allowlists so any addition or
/// removal has to update this test deliberately.
#[test]
fn test_minimal_filter_allowlist_counts_are_stable() {
    let base_count = SeccompManager::base_allowed_syscalls().len();
    let net_count = SeccompManager::network_mode_syscalls(true).len();

    assert_eq!(base_count, 173);
    assert_eq!(net_count, 11);
    // NOTE(review): the extra 8 presumably accounts for the argument-filtered
    // syscalls added on top of the plain allowlists — confirm.
    assert_eq!(base_count + 8, 181);
    assert_eq!(base_count + net_count + 8, 192);
}
1663
/// Every syscall named here must be listed in `ARG_FILTERED_SYSCALLS`.
#[test]
fn test_arg_filtered_syscalls_list_includes_critical_syscalls() {
    const CRITICAL: [&str; 6] = ["clone", "clone3", "execveat", "ioctl", "prctl", "socket"];

    for name in CRITICAL.iter() {
        let listed = SeccompManager::ARG_FILTERED_SYSCALLS.contains(name);
        assert!(listed, "'{}' must be in ARG_FILTERED_SYSCALLS", name);
    }
}
1676
/// The built-in filter (network enabled, no extras) must contain `clone3`.
#[test]
fn test_clone3_allowed_in_minimal_filter() {
    let filter_rules = SeccompManager::minimal_filter(true, &[]).unwrap();
    let has_clone3 = filter_rules.contains_key(&libc::SYS_clone3);
    assert!(
        has_clone3,
        "clone3 must be in the seccomp allowlist (glibc 2.34+ requires it)"
    );
}
1689
/// The built-in filter (network enabled, no extras) must contain an entry
/// for `clone`.
#[test]
fn test_clone_is_allowed_with_arg_filter() {
    let filter_rules = SeccompManager::minimal_filter(true, &[]).unwrap();
    let clone_entry = filter_rules.get(&libc::SYS_clone);
    assert!(
        clone_entry.is_some(),
        "clone must be in the seccomp allowlist with arg filters"
    );
}
1699
/// Syscalls deliberately kept out of the base allowlist must stay out of it.
#[test]
fn test_high_risk_syscalls_removed_from_base_allowlist() {
    let base = SeccompManager::base_allowed_syscalls();
    let removed = [
        libc::SYS_sync,
        libc::SYS_syncfs,
        libc::SYS_mincore,
        // Gated exactly like the "vfork" arm of the name-to-number table in
        // this file, so the test module still compiles on targets where libc
        // does not define `SYS_vfork`.
        #[cfg(target_arch = "x86_64")]
        libc::SYS_vfork,
        libc::SYS_tkill,
        libc::SYS_io_uring_setup,
        libc::SYS_io_uring_enter,
        libc::SYS_io_uring_register,
    ];

    for syscall in removed {
        assert!(
            !base.contains(&syscall),
            "syscall {} unexpectedly present in base allowlist",
            syscall
        );
    }
}
1725
/// Every name in `ARG_FILTERED_SYSCALLS` (other than `clone3`) that resolves
/// to a syscall number must have a non-empty rule list in the built-in filter.
#[test]
fn test_custom_profile_preserves_clone_arg_filters() {
    let builtin = SeccompManager::minimal_filter(true, &[]).unwrap();

    for name in SeccompManager::ARG_FILTERED_SYSCALLS {
        // NOTE(review): clone3 is exempted here, presumably because it is
        // allowed without argument conditions — confirm.
        if *name == "clone3" {
            continue;
        }
        // Names the lookup table cannot resolve are skipped, not failed.
        let nr = match syscall_name_to_number(name) {
            Some(nr) => nr,
            None => continue,
        };
        let has_arg_rules = matches!(builtin.get(&nr), Some(conds) if !conds.is_empty());
        assert!(
            has_arg_rules,
            "built-in filter must have argument-level rules for '{}' so apply_profile_from_file can merge them into custom profiles",
            name
        );
    }
}
1756
/// `memfd_create` must be absent from both the base allowlist and the
/// built-in filter's rule map.
#[test]
fn test_memfd_create_not_in_default_allowlist() {
    let memfd = libc::SYS_memfd_create;

    let base = SeccompManager::base_allowed_syscalls();
    let in_base = base.contains(&memfd);
    assert!(
        !in_base,
        "memfd_create must not be in the default seccomp allowlist (fileless exec risk)"
    );

    let rules = SeccompManager::minimal_filter(true, &[]).unwrap();
    let in_rules = rules.contains_key(&memfd);
    assert!(
        !in_rules,
        "memfd_create must not be in the compiled seccomp filter rules"
    );
}
1772
/// `mprotect` must not be in the unconditional base allowlist, but must be in
/// the built-in filter with a non-empty rule list.
#[test]
fn test_mprotect_has_arg_filtering() {
    let unconditional = SeccompManager::base_allowed_syscalls();
    assert!(
        !unconditional.contains(&libc::SYS_mprotect),
        "SYS_mprotect must not be unconditionally allowed - needs arg filtering"
    );

    let rules = SeccompManager::minimal_filter(true, &[]).unwrap();
    let conditions = rules
        .get(&libc::SYS_mprotect)
        .expect("mprotect must be present in the seccomp filter rules");
    assert!(
        !conditions.is_empty(),
        "mprotect must have argument-level conditions to prevent W^X violations"
    );
}
1798
/// Scans this file's own source text and requires a `SAFETY:` marker within
/// the 200 bytes preceding every `unsafe` block opener.
///
/// All offsets are handled as raw bytes. Slicing the `&str` at
/// `offset - 200` / `offset - 80` would panic whenever that position falls
/// inside a multi-byte UTF-8 character (for example a non-ASCII character in
/// a nearby comment), which would hide the real diagnostic.
#[test]
fn test_unsafe_blocks_have_safety_comments() {
    // Assembled from two pieces so this test's own source never contains the
    // opener verbatim and is therefore not scanned as a block itself.
    const NEEDLE: &str = concat!("unsafe", " {");
    const MARKER: &[u8] = b"SAFETY:";
    const LOOKBACK: usize = 200;
    const EXCERPT_BEFORE: usize = 80;
    const EXCERPT_AFTER: usize = 10;

    let source = include_str!("seccomp.rs");
    let bytes = source.as_bytes();

    for (abs_idx, _) in source.match_indices(NEEDLE) {
        let window = &bytes[abs_idx.saturating_sub(LOOKBACK)..abs_idx];
        // MARKER is pure ASCII, so a byte-wise search finds exactly what a
        // string search would, without needing char-boundary-aligned ends.
        let documented = window.windows(MARKER.len()).any(|chunk| chunk == MARKER);
        assert!(
            documented,
            "unsafe block at byte {} must have a // SAFETY: comment. Context: ...{}...",
            abs_idx,
            // Clamp the end so a match near end-of-file cannot index out of
            // range; lossy decoding tolerates a cut through a multi-byte char.
            String::from_utf8_lossy(
                &bytes[abs_idx.saturating_sub(EXCERPT_BEFORE)
                    ..(abs_idx + EXCERPT_AFTER).min(bytes.len())]
            )
        );
    }
}
1818
1819 fn mprotect_would_allow(prot: u64) -> bool {
1829 let mask = (libc::PROT_WRITE | libc::PROT_EXEC) as u64;
1830 let allowed_values: &[u64] = &[0, libc::PROT_WRITE as u64, libc::PROT_EXEC as u64];
1831 let masked = prot & mask;
1832 allowed_values.contains(&masked)
1833 }
1834
/// A protection value of `0` (PROT_NONE, per the assertion message) passes.
#[test]
fn test_mprotect_allows_prot_none() {
    let allowed = mprotect_would_allow(0);
    assert!(allowed, "PROT_NONE must be allowed");
}
1839
/// `PROT_READ` on its own passes the model check.
#[test]
fn test_mprotect_allows_prot_read_only() {
    let prot = libc::PROT_READ as u64;
    assert!(
        mprotect_would_allow(prot),
        "PROT_READ must be allowed (W|X bits are 0)"
    );
}
1847
/// `PROT_READ | PROT_WRITE` passes the model check.
#[test]
fn test_mprotect_allows_prot_read_write() {
    let prot = (libc::PROT_READ | libc::PROT_WRITE) as u64;
    assert!(
        mprotect_would_allow(prot),
        "PROT_READ|PROT_WRITE must be allowed"
    );
}
1855
/// `PROT_READ | PROT_EXEC` passes the model check.
#[test]
fn test_mprotect_allows_prot_read_exec() {
    let prot = (libc::PROT_READ | libc::PROT_EXEC) as u64;
    assert!(
        mprotect_would_allow(prot),
        "PROT_READ|PROT_EXEC must be allowed"
    );
}
1863
/// `PROT_WRITE | PROT_EXEC` is rejected by the model check.
#[test]
fn test_mprotect_rejects_prot_write_exec() {
    let prot = (libc::PROT_WRITE | libc::PROT_EXEC) as u64;
    let allowed = mprotect_would_allow(prot);
    assert!(
        !allowed,
        "PROT_WRITE|PROT_EXEC (W^X violation) must be REJECTED"
    );
}
1871
/// `PROT_READ | PROT_WRITE | PROT_EXEC` is rejected by the model check.
#[test]
fn test_mprotect_rejects_prot_read_write_exec() {
    let prot = (libc::PROT_READ | libc::PROT_WRITE | libc::PROT_EXEC) as u64;
    let allowed = mprotect_would_allow(prot);
    assert!(
        !allowed,
        "PROT_READ|PROT_WRITE|PROT_EXEC (W^X violation) must be REJECTED"
    );
}
1879
/// `PROT_WRITE` on its own passes the model check.
#[test]
fn test_mprotect_allows_prot_write_alone() {
    let prot = libc::PROT_WRITE as u64;
    assert!(mprotect_would_allow(prot), "PROT_WRITE alone must be allowed");
}
1887
/// `PROT_EXEC` on its own passes the model check.
#[test]
fn test_mprotect_allows_prot_exec_alone() {
    let prot = libc::PROT_EXEC as u64;
    assert!(mprotect_would_allow(prot), "PROT_EXEC alone must be allowed");
}
1895
/// Syscalls requested through the extra list must show up in the filter.
#[test]
fn test_extra_syscalls_are_merged_into_filter() {
    let requested = vec![String::from("io_uring_setup"), String::from("sysinfo")];
    let rules = SeccompManager::minimal_filter(true, &requested).unwrap();

    let has_io_uring_setup = rules.contains_key(&libc::SYS_io_uring_setup);
    assert!(
        has_io_uring_setup,
        "io_uring_setup must be in filter when requested via extra_syscalls"
    );

    let has_sysinfo = rules.contains_key(&libc::SYS_sysinfo);
    assert!(
        has_sysinfo,
        "sysinfo must be in filter when requested via extra_syscalls"
    );
}
1911
/// Requesting `clone` via the extra list must still leave it with a
/// non-empty rule list in the resulting filter.
#[test]
fn test_extra_syscalls_do_not_override_arg_filtered() {
    let requested = vec![String::from("clone")];
    let rules = SeccompManager::minimal_filter(true, &requested).unwrap();

    let keeps_conditions = matches!(
        rules.get(&libc::SYS_clone),
        Some(conds) if !conds.is_empty()
    );
    assert!(
        keeps_conditions,
        "clone must retain argument-level filtering even when in extra_syscalls"
    );
}
1925
/// An unrecognised syscall name in the extra list must not make filter
/// construction fail.
#[test]
fn test_extra_syscalls_unknown_name_is_warned_and_skipped() {
    let requested = vec![String::from("not_a_real_syscall")];
    let built_ok = SeccompManager::minimal_filter(true, &requested).is_ok();
    assert!(
        built_ok,
        "Unknown syscall name should warn and skip, not error"
    );
}
1936
/// Building the filter twice with an empty extra list yields rule maps of the
/// same size. Both calls pass the same empty list, so this effectively checks
/// that construction is deterministic.
#[test]
fn test_extra_syscalls_empty_is_noop() {
    let baseline_len = SeccompManager::minimal_filter(true, &[]).unwrap().len();
    let repeated_len = SeccompManager::minimal_filter(true, &[]).unwrap().len();
    assert_eq!(baseline_len, repeated_len);
}
1943
/// Requesting `read` via the extra list still builds a filter containing it.
#[test]
fn test_extra_syscalls_duplicate_of_default_is_harmless() {
    let requested = vec![String::from("read")];
    let filter_rules = SeccompManager::minimal_filter(true, &requested).unwrap();
    assert!(filter_rules.contains_key(&libc::SYS_read));
}
1951
/// Requesting `kexec_load` via the extra list must not add it to the filter.
#[test]
fn test_extra_syscalls_blocked_known_syscall_not_added() {
    let requested = vec![String::from("kexec_load")];
    let rules = SeccompManager::minimal_filter(true, &requested).unwrap();

    let slipped_through = rules.contains_key(&libc::SYS_kexec_load);
    assert!(
        !slipped_through,
        "kexec_load must be blocked even when requested via --seccomp-allow"
    );
}
1963
/// Requesting `unshare` via the extra list must not add it to the filter.
#[test]
fn test_extra_syscalls_unshare_remains_blocked() {
    let requested = vec![String::from("unshare")];
    let rules = SeccompManager::minimal_filter(true, &requested).unwrap();

    let slipped_through = rules.contains_key(&libc::SYS_unshare);
    assert!(
        !slipped_through,
        "unshare must stay blocked even when requested via --seccomp-allow"
    );
}
1973
/// Requesting `io_uring_setup` via the extra list must add it to the filter.
#[test]
fn test_extra_syscalls_opt_in_syscall_is_added() {
    let requested = vec![String::from("io_uring_setup")];
    let rules = SeccompManager::minimal_filter(true, &requested).unwrap();

    let added = rules.contains_key(&libc::SYS_io_uring_setup);
    assert!(
        added,
        "io_uring_setup is in OPT_IN_SYSCALLS and must be added"
    );
}
1984}