1use rustix::io::Errno;
61
62use crate::last_errno;
63
// --- seccomp(2) operation and filter return values (linux/seccomp.h) -------

/// `seccomp(2)` operation that installs a BPF filter program.
const SECCOMP_SET_MODE_FILTER: u32 = 1;
/// Filter verdict: kill the whole process.
const SECCOMP_RET_KILL_PROCESS: u32 = 0x8000_0000;
/// Filter verdict: forward the syscall to a user-space notification listener.
const SECCOMP_RET_USER_NOTIF: u32 = 0x7fc0_0000;
/// Filter verdict: let the syscall proceed.
const SECCOMP_RET_ALLOW: u32 = 0x7fff_0000;
/// Filter verdict: fail the syscall with errno 38 (`ENOSYS` on Linux). The
/// high half, `0x0005_0000`, is the "return errno" action.
const SECCOMP_RET_ERRNO_ENOSYS: u32 = 0x0005_0000 | 38;

// --- classic BPF opcode pieces (linux/bpf_common.h) -------------------------

// Instruction classes.
const BPF_LD: u16 = 0x00;
const BPF_JMP: u16 = 0x05;
const BPF_RET: u16 = 0x06;

// Load size (32-bit word) and addressing mode (absolute offset).
const BPF_W: u16 = 0x00;
const BPF_ABS: u16 = 0x20;

// Jump conditions (equal / any-bit-set) and operand source (immediate `k`).
const BPF_JEQ: u16 = 0x10;
const BPF_JSET: u16 = 0x40;
const BPF_K: u16 = 0x00;

/// Audit architecture token the filters expect in the `arch` field.
const AUDIT_ARCH_X86_64: u32 = 0xc000_003e;

// --- byte offsets loaded from the seccomp data record -----------------------
// Each load is a 32-bit word (`BPF_W`), so for the 64-bit argument slots only
// the four bytes starting at the given offset are inspected.

const OFFSET_SYSCALL_NR: u32 = 0;
const OFFSET_ARCH: u32 = 4;
const OFFSET_ARGS_0: u32 = 16;
const OFFSET_ARGS_1: u32 = 24;

// --- clone(2) namespace flags ------------------------------------------------

const CLONE_NEWNS: u32 = 0x0002_0000;
const CLONE_NEWCGROUP: u32 = 0x0200_0000;
const CLONE_NEWUTS: u32 = 0x0400_0000;
const CLONE_NEWIPC: u32 = 0x0800_0000;
const CLONE_NEWUSER: u32 = 0x1000_0000;
const CLONE_NEWPID: u32 = 0x2000_0000;
const CLONE_NEWNET: u32 = 0x4000_0000;

/// Union of every namespace-creating flag; a `clone` call with any of these
/// bits set is rejected by the whitelist filter.
const BLOCKED_CLONE_FLAGS: u32 = CLONE_NEWNS
    | CLONE_NEWCGROUP
    | CLONE_NEWUTS
    | CLONE_NEWIPC
    | CLONE_NEWUSER
    | CLONE_NEWPID
    | CLONE_NEWNET;

// --- socket(2) arguments rejected by the whitelist filter --------------------

const AF_NETLINK: u32 = 16;
const SOCK_RAW: u32 = 3;

// --- ioctl(2) requests rejected by the whitelist filter ----------------------

const TIOCSTI: u32 = 0x5412;
const TIOCSETD: u32 = 0x5423;
const TIOCLINUX: u32 = 0x541C;

/// Largest syscall list the filter builders accept. Jump offsets are stored
/// in a `u8`, and the builders add a fixed number of instructions on top of
/// the list, so the list itself must stay well below 255 entries.
const MAX_WHITELIST_SIZE: usize = 200;
127
/// A single classic-BPF instruction.
///
/// The struct is `#[repr(C)]` with the field order `code`, `jt`, `jf`, `k`
/// (8 bytes in total) so that a slice of these can be handed to the kernel.
#[repr(C)]
#[derive(Debug, Clone, Copy, Default)]
pub struct SockFilter {
    /// Opcode: instruction class OR-ed with size/mode/operation bits.
    pub code: u16,
    /// Number of instructions to skip when a jump condition holds.
    pub jt: u8,
    /// Number of instructions to skip when a jump condition does not hold.
    pub jf: u8,
    /// Immediate operand (load offset, comparison value, or return value).
    pub k: u32,
}

impl SockFilter {
    /// Builds a conditional-jump instruction with explicit branch offsets.
    #[inline]
    pub const fn jump(code: u16, k: u32, jt: u8, jf: u8) -> Self {
        SockFilter { code, k, jt, jf }
    }

    /// Builds a non-branching instruction (load, return, ...): both jump
    /// offsets are zero.
    #[inline]
    pub const fn stmt(code: u16, k: u32) -> Self {
        Self::jump(code, k, 0, 0)
    }
}
153
/// Descriptor of a BPF program: a length plus a raw pointer to its
/// instructions. A reference to this struct is what
/// `seccomp_set_mode_filter` passes to the `seccomp` syscall.
///
/// The layout is `#[repr(C)]`; do not reorder or resize the fields.
#[repr(C)]
#[derive(Debug)]
pub struct SockFprog {
    /// Presumably the number of `SockFilter` instructions behind `filter`
    /// (mirrors the kernel's `struct sock_fprog`) - confirm against callers.
    pub len: u16,
    /// Raw pointer to the first instruction. Nothing in this type keeps the
    /// pointee alive; whoever builds the value must do so.
    pub filter: *const SockFilter,
}
160
161pub const DEFAULT_WHITELIST: &[i64] = &[
179 libc::SYS_read,
181 libc::SYS_write,
182 libc::SYS_close,
183 libc::SYS_close_range, libc::SYS_fstat,
185 libc::SYS_lseek,
186 libc::SYS_pread64,
187 libc::SYS_pwrite64,
188 libc::SYS_readv,
189 libc::SYS_writev,
190 libc::SYS_preadv,
191 libc::SYS_pwritev,
192 libc::SYS_preadv2,
193 libc::SYS_pwritev2,
194 libc::SYS_dup,
195 libc::SYS_dup2,
196 libc::SYS_dup3,
197 libc::SYS_fcntl,
198 libc::SYS_flock,
199 libc::SYS_fsync,
200 libc::SYS_fdatasync,
201 libc::SYS_ftruncate,
202 libc::SYS_fadvise64,
203 libc::SYS_access,
204 libc::SYS_pipe,
205 libc::SYS_pipe2,
206 libc::SYS_select,
207 libc::SYS_poll,
208 libc::SYS_ppoll,
209 libc::SYS_pselect6,
210 libc::SYS_sendfile,
212 libc::SYS_copy_file_range,
213 libc::SYS_splice,
214 libc::SYS_tee,
215 libc::SYS_mmap,
217 libc::SYS_mprotect,
218 libc::SYS_munmap,
219 libc::SYS_brk,
220 libc::SYS_mremap,
221 libc::SYS_msync,
222 libc::SYS_mincore,
223 libc::SYS_madvise,
224 libc::SYS_membarrier,
226 libc::SYS_mlock,
227 libc::SYS_mlock2,
228 libc::SYS_munlock,
229 libc::SYS_mlockall,
230 libc::SYS_munlockall,
231 libc::SYS_getpid,
233 libc::SYS_getppid,
234 libc::SYS_gettid, libc::SYS_getuid,
236 libc::SYS_getgid,
237 libc::SYS_geteuid,
238 libc::SYS_getegid,
239 libc::SYS_getresuid,
240 libc::SYS_getresgid,
241 libc::SYS_getpgrp,
243 libc::SYS_getgroups,
245 libc::SYS_getsid,
246 libc::SYS_uname,
247 libc::SYS_getrusage,
248 libc::SYS_times,
249 libc::SYS_sysinfo,
250 libc::SYS_clock_gettime,
252 libc::SYS_clock_getres,
253 libc::SYS_clock_nanosleep,
254 libc::SYS_gettimeofday,
255 libc::SYS_nanosleep,
256 libc::SYS_openat,
258 libc::SYS_open,
259 libc::SYS_creat,
260 libc::SYS_unlink,
261 libc::SYS_unlinkat,
262 libc::SYS_rename,
263 libc::SYS_renameat,
264 libc::SYS_renameat2,
265 libc::SYS_mkdir,
266 libc::SYS_mkdirat,
267 libc::SYS_rmdir,
268 libc::SYS_symlink,
269 libc::SYS_symlinkat,
270 libc::SYS_link,
271 libc::SYS_linkat,
272 libc::SYS_chmod,
273 libc::SYS_fchmod,
274 libc::SYS_fchmodat,
275 libc::SYS_chown,
276 libc::SYS_fchown,
277 libc::SYS_fchownat,
278 libc::SYS_lchown,
279 libc::SYS_utimensat,
280 libc::SYS_faccessat,
281 libc::SYS_faccessat2,
282 libc::SYS_stat,
283 libc::SYS_lstat,
284 libc::SYS_newfstatat,
285 libc::SYS_statfs,
286 libc::SYS_fstatfs,
287 libc::SYS_statx,
288 libc::SYS_getdents,
289 libc::SYS_getdents64,
290 libc::SYS_getcwd,
291 libc::SYS_chdir,
292 libc::SYS_fchdir,
293 libc::SYS_readlink,
294 libc::SYS_readlinkat,
295 libc::SYS_rt_sigaction,
297 libc::SYS_rt_sigprocmask,
298 libc::SYS_rt_sigreturn,
299 libc::SYS_rt_sigsuspend,
300 libc::SYS_rt_sigpending,
301 libc::SYS_rt_sigtimedwait,
302 libc::SYS_sigaltstack,
303 libc::SYS_kill, libc::SYS_tgkill, libc::SYS_tkill, libc::SYS_execve,
308 libc::SYS_fork, libc::SYS_vfork, libc::SYS_exit,
312 libc::SYS_exit_group,
313 libc::SYS_wait4,
314 libc::SYS_waitid,
315 libc::SYS_set_tid_address,
316 libc::SYS_futex,
317 libc::SYS_get_robust_list,
318 libc::SYS_set_robust_list,
319 libc::SYS_sched_yield,
320 libc::SYS_sched_getaffinity, libc::SYS_sched_setaffinity, libc::SYS_sched_getparam,
323 libc::SYS_sched_setparam,
324 libc::SYS_sched_getscheduler,
325 libc::SYS_sched_get_priority_max,
326 libc::SYS_sched_get_priority_min,
327 libc::SYS_arch_prctl,
328 libc::SYS_prctl, libc::SYS_getrandom,
330 libc::SYS_prlimit64,
331 libc::SYS_rseq,
332 libc::SYS_ioprio_get,
333 libc::SYS_eventfd,
338 libc::SYS_eventfd2,
339 libc::SYS_epoll_create,
340 libc::SYS_epoll_create1,
341 libc::SYS_epoll_ctl,
342 libc::SYS_epoll_wait,
343 libc::SYS_epoll_pwait,
344 libc::SYS_epoll_pwait2,
345 libc::SYS_timerfd_create,
346 libc::SYS_timerfd_settime,
347 libc::SYS_timerfd_gettime,
348 libc::SYS_signalfd,
349 libc::SYS_signalfd4,
350 libc::SYS_socketpair,
353 libc::SYS_connect,
354 libc::SYS_bind,
355 libc::SYS_listen,
356 libc::SYS_accept,
357 libc::SYS_accept4,
358 libc::SYS_getsockname,
359 libc::SYS_getpeername,
360 libc::SYS_sendto,
361 libc::SYS_recvfrom,
362 libc::SYS_setsockopt,
363 libc::SYS_getsockopt,
364 libc::SYS_shutdown,
365 libc::SYS_sendmsg,
366 libc::SYS_recvmsg,
367 libc::SYS_sendmmsg,
368 libc::SYS_recvmmsg,
369];
370
371pub fn build_whitelist_filter(syscalls: &[i64]) -> Vec<SockFilter> {
392 assert!(
393 syscalls.len() <= MAX_WHITELIST_SIZE,
394 "whitelist too large: {} > {} (BPF jump offset overflow)",
395 syscalls.len(),
396 MAX_WHITELIST_SIZE
397 );
398
399 let n = syscalls.len();
400 let mut filter = Vec::with_capacity(n + 20);
401
402 filter.push(SockFilter::stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARCH));
404 filter.push(SockFilter::jump(
405 BPF_JMP | BPF_JEQ | BPF_K,
406 AUDIT_ARCH_X86_64,
407 1,
408 0,
409 ));
410 filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS));
411
412 filter.push(SockFilter::stmt(
414 BPF_LD | BPF_W | BPF_ABS,
415 OFFSET_SYSCALL_NR,
416 ));
417
418 let clone3_errno_offset = (3 + n + 2) as u8;
423 filter.push(SockFilter::jump(
424 BPF_JMP | BPF_JEQ | BPF_K,
425 libc::SYS_clone3 as u32,
426 clone3_errno_offset,
427 0,
428 ));
429
430 let clone_handler_offset = (2 + n + 3) as u8;
433 filter.push(SockFilter::jump(
434 BPF_JMP | BPF_JEQ | BPF_K,
435 libc::SYS_clone as u32,
436 clone_handler_offset,
437 0,
438 ));
439
440 let socket_handler_offset = (1 + n + 3 + 4) as u8;
443 filter.push(SockFilter::jump(
444 BPF_JMP | BPF_JEQ | BPF_K,
445 libc::SYS_socket as u32,
446 socket_handler_offset,
447 0,
448 ));
449
450 let ioctl_handler_offset = (n + 3 + 4 + 6) as u8;
453 filter.push(SockFilter::jump(
454 BPF_JMP | BPF_JEQ | BPF_K,
455 libc::SYS_ioctl as u32,
456 ioctl_handler_offset,
457 0,
458 ));
459
460 for (i, &nr) in syscalls.iter().enumerate() {
462 let allow_offset = (n - i) as u8;
463 filter.push(SockFilter::jump(
464 BPF_JMP | BPF_JEQ | BPF_K,
465 nr as u32,
466 allow_offset,
467 0,
468 ));
469 }
470
471 filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS));
473
474 filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
476
477 filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_ERRNO_ENOSYS));
480
481 filter.push(SockFilter::stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS_0));
484 filter.push(SockFilter::jump(
486 BPF_JMP | BPF_JSET | BPF_K,
487 BLOCKED_CLONE_FLAGS,
488 1,
489 0,
490 ));
491 filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
493 filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS));
495
496 filter.push(SockFilter::stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS_0));
499 filter.push(SockFilter::jump(
501 BPF_JMP | BPF_JEQ | BPF_K,
502 AF_NETLINK,
503 3,
504 0,
505 )); filter.push(SockFilter::stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS_1));
509 filter.push(SockFilter::jump(BPF_JMP | BPF_JEQ | BPF_K, SOCK_RAW, 1, 0)); filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
516 filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS));
518
519 filter.push(SockFilter::stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS_1));
526 filter.push(SockFilter::jump(BPF_JMP | BPF_JEQ | BPF_K, TIOCSTI, 3, 0));
528 filter.push(SockFilter::jump(BPF_JMP | BPF_JEQ | BPF_K, TIOCSETD, 2, 0));
530 filter.push(SockFilter::jump(BPF_JMP | BPF_JEQ | BPF_K, TIOCLINUX, 1, 0));
532 filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
534 filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS));
536
537 filter
538}
539
540pub unsafe fn seccomp_set_mode_filter(fprog: &SockFprog) -> Result<(), Errno> {
550 let ret = unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
551 if ret != 0 {
552 return Err(last_errno());
553 }
554
555 let ret = unsafe {
556 libc::syscall(
557 libc::SYS_seccomp,
558 SECCOMP_SET_MODE_FILTER,
559 0u32,
560 fprog as *const _,
561 )
562 };
563 if ret != 0 { Err(last_errno()) } else { Ok(()) }
564}
565
566pub fn seccomp_available() -> bool {
568 unsafe { libc::prctl(libc::PR_GET_SECCOMP, 0, 0, 0, 0) >= 0 }
569}
570
571pub fn build_notify_filter(syscalls: &[i64]) -> Vec<SockFilter> {
584 assert!(
585 syscalls.len() <= MAX_WHITELIST_SIZE,
586 "notify syscall list too large: {} > {}",
587 syscalls.len(),
588 MAX_WHITELIST_SIZE
589 );
590
591 let n = syscalls.len();
592 let mut filter = Vec::with_capacity(n + 8);
593
594 filter.push(SockFilter::stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARCH));
596 filter.push(SockFilter::jump(
597 BPF_JMP | BPF_JEQ | BPF_K,
598 AUDIT_ARCH_X86_64,
599 1,
600 0,
601 ));
602 filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
603
604 filter.push(SockFilter::stmt(
606 BPF_LD | BPF_W | BPF_ABS,
607 OFFSET_SYSCALL_NR,
608 ));
609
610 for (i, &nr) in syscalls.iter().enumerate() {
612 let notify_offset = (n - i) as u8; filter.push(SockFilter::jump(
614 BPF_JMP | BPF_JEQ | BPF_K,
615 nr as u32,
616 notify_offset,
617 0,
618 ));
619 }
620
621 filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
623
624 filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_USER_NOTIF));
626
627 filter
628}
629
630pub const NOTIFY_FS_SYSCALLS: &[i64] = &[
632 libc::SYS_openat,
633 libc::SYS_open,
634 libc::SYS_creat,
635 libc::SYS_access,
636 libc::SYS_faccessat,
637 libc::SYS_faccessat2,
638 libc::SYS_stat,
639 libc::SYS_lstat,
640 libc::SYS_newfstatat,
641 libc::SYS_statx,
642 libc::SYS_readlink,
643 libc::SYS_readlinkat,
644];
645
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the default whitelist program and checks that the instruction
    /// at `index` compares against `syscall` and jumps forward on a match.
    fn assert_dispatch(index: usize, syscall: i64) {
        let program = build_whitelist_filter(DEFAULT_WHITELIST);
        let insn = &program[index];
        assert_eq!(insn.k, syscall as u32);
        assert!(insn.jt > 0);
    }

    #[test]
    fn filter_structure() {
        // Three whitelisted syscalls plus the fixed instructions.
        let program =
            build_whitelist_filter(&[libc::SYS_read, libc::SYS_write, libc::SYS_exit]);
        assert_eq!(program.len(), 30);
    }

    #[test]
    fn clone3_returns_enosys() {
        assert_dispatch(4, libc::SYS_clone3);
    }

    #[test]
    fn clone_has_flag_check() {
        assert_dispatch(5, libc::SYS_clone);

        // The clone handler is the only user of a bit-test jump.
        let jset = BPF_JMP | BPF_JSET | BPF_K;
        let program = build_whitelist_filter(DEFAULT_WHITELIST);
        assert!(program.iter().any(|insn| insn.code == jset));
    }

    #[test]
    fn socket_is_filtered() {
        assert_dispatch(6, libc::SYS_socket);
    }

    #[test]
    fn ioctl_is_filtered() {
        assert_dispatch(7, libc::SYS_ioctl);
    }

    #[test]
    fn blocked_clone_flags_mask() {
        let namespace_flags = [
            CLONE_NEWUSER,
            CLONE_NEWNET,
            CLONE_NEWNS,
            CLONE_NEWPID,
            CLONE_NEWIPC,
            CLONE_NEWUTS,
            CLONE_NEWCGROUP,
        ];
        for flag in namespace_flags {
            assert_ne!(BLOCKED_CLONE_FLAGS & flag, 0);
        }
    }

    #[test]
    fn dangerous_syscalls_removed() {
        let forbidden = [
            libc::SYS_clone,
            libc::SYS_clone3,
            libc::SYS_socket,
            libc::SYS_memfd_create,
            libc::SYS_execveat,
            libc::SYS_setresuid,
            libc::SYS_setresgid,
            libc::SYS_setsid,
            libc::SYS_setpgid,
        ];
        for nr in forbidden {
            assert!(
                !DEFAULT_WHITELIST.contains(&nr),
                "syscall {} must not be whitelisted",
                nr
            );
        }
    }

    #[test]
    fn safe_syscalls_present() {
        let required = [
            libc::SYS_fork,
            libc::SYS_vfork,
            libc::SYS_execve,
            libc::SYS_sendfile,
            libc::SYS_close_range,
        ];
        for nr in required {
            assert!(
                DEFAULT_WHITELIST.contains(&nr),
                "syscall {} must be whitelisted",
                nr
            );
        }
    }

    #[test]
    #[should_panic(expected = "whitelist too large")]
    fn whitelist_overflow_panics() {
        let huge: Vec<i64> = (0..300_i64).collect();
        build_whitelist_filter(&huge);
    }

    #[test]
    fn notify_filter_structure() {
        // Three listed syscalls plus the fixed instructions.
        let program =
            build_notify_filter(&[libc::SYS_openat, libc::SYS_open, libc::SYS_stat]);
        assert_eq!(program.len(), 9);
    }

    #[test]
    fn notify_fs_syscalls_present() {
        let expected = [
            libc::SYS_openat,
            libc::SYS_open,
            libc::SYS_stat,
            libc::SYS_readlink,
        ];
        for nr in expected {
            assert!(
                NOTIFY_FS_SYSCALLS.contains(&nr),
                "syscall {} must be in the notify list",
                nr
            );
        }
    }
}