1use std::ffi::CString;
5use std::io;
6use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd};
7
8use syscalls::{Sysno, SysnoSet};
9
10use crate::arch;
11use crate::sandbox::Sandbox;
12use crate::seccomp::bpf::{self, stmt, jump};
13use crate::sys::structs::{
14 AF_INET, AF_INET6,
15 BPF_ABS, BPF_ALU, BPF_AND, BPF_JEQ, BPF_JSET, BPF_JMP, BPF_K, BPF_LD, BPF_RET, BPF_W,
16 CLONE_NS_FLAGS, DEFAULT_BLOCKLIST_SYSCALLS, EPERM, SYSV_IPC_BLOCKLIST_SYSCALLS,
17 SECCOMP_RET_ALLOW, SECCOMP_RET_ERRNO,
18 SIOCETHTOOL, SIOCGIFADDR, SIOCGIFBRDADDR, SIOCGIFCONF, SIOCGIFDSTADDR,
19 SIOCGIFFLAGS, SIOCGIFHWADDR, SIOCGIFINDEX, SIOCGIFNAME, SIOCGIFNETMASK,
20 SOCK_DGRAM, SOCK_RAW, SOCK_TYPE_MASK, TIOCLINUX, TIOCSTI,
21 PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER,
22 OFFSET_ARGS0_LO, OFFSET_ARGS1_LO, OFFSET_ARGS2_LO, OFFSET_ARGS3_LO, OFFSET_NR,
23 SockFilter,
24};
25
26pub struct PipePair {
32 pub notif_r: OwnedFd,
34 pub notif_w: OwnedFd,
36 pub ready_r: OwnedFd,
38 pub ready_w: OwnedFd,
40}
41
42impl PipePair {
43 pub fn new() -> io::Result<Self> {
45 let mut notif_fds = [0i32; 2];
46 let mut ready_fds = [0i32; 2];
47
48 let ret = unsafe { libc::pipe2(notif_fds.as_mut_ptr(), libc::O_CLOEXEC) };
50 if ret < 0 {
51 return Err(io::Error::last_os_error());
52 }
53
54 let ret = unsafe { libc::pipe2(ready_fds.as_mut_ptr(), libc::O_CLOEXEC) };
55 if ret < 0 {
56 unsafe {
58 libc::close(notif_fds[0]);
59 libc::close(notif_fds[1]);
60 }
61 return Err(io::Error::last_os_error());
62 }
63
64 Ok(PipePair {
66 notif_r: unsafe { OwnedFd::from_raw_fd(notif_fds[0]) },
67 notif_w: unsafe { OwnedFd::from_raw_fd(notif_fds[1]) },
68 ready_r: unsafe { OwnedFd::from_raw_fd(ready_fds[0]) },
69 ready_w: unsafe { OwnedFd::from_raw_fd(ready_fds[1]) },
70 })
71 }
72}
73
74pub(crate) fn write_u32_fd(fd: RawFd, val: u32) -> io::Result<()> {
80 let buf = val.to_le_bytes();
81 let mut written = 0usize;
82 while written < 4 {
83 let ret = unsafe {
84 libc::write(
85 fd,
86 buf[written..].as_ptr() as *const libc::c_void,
87 4 - written,
88 )
89 };
90 if ret < 0 {
91 return Err(io::Error::last_os_error());
92 }
93 written += ret as usize;
94 }
95 Ok(())
96}
97
98pub(crate) fn read_u32_fd(fd: RawFd) -> io::Result<u32> {
100 let mut buf = [0u8; 4];
101 let mut total = 0usize;
102 while total < 4 {
103 let ret = unsafe {
104 libc::read(
105 fd,
106 buf[total..].as_mut_ptr() as *mut libc::c_void,
107 4 - total,
108 )
109 };
110 if ret < 0 {
111 return Err(io::Error::last_os_error());
112 }
113 if ret == 0 {
114 return Err(io::Error::new(
115 io::ErrorKind::UnexpectedEof,
116 "pipe closed before 4 bytes read",
117 ));
118 }
119 total += ret as usize;
120 }
121 Ok(u32::from_le_bytes(buf))
122}
123
124#[cfg(test)]
125use crate::seccomp::syscall::syscall_name_to_nr;
126
/// Accumulator for syscall numbers that are later handed out as a sorted,
/// duplicate-free `Vec<u32>`.
#[derive(Default)]
struct SyscallList {
    /// Numbers collected so far, in insertion order and possibly repeated.
    nrs: Vec<u32>,
}

impl SyscallList {
    /// Builds a list that already contains every number in `syscalls`.
    fn with(syscalls: &[i64]) -> Self {
        Self {
            nrs: syscalls.iter().map(|&nr| nr as u32).collect(),
        }
    }

    /// Appends one syscall number (narrowed to `u32` with `as`).
    fn push(&mut self, nr: i64) {
        self.nrs.push(nr as u32);
    }

    /// Appends every number in `syscalls`.
    fn extend(&mut self, syscalls: &[i64]) {
        for &nr in syscalls {
            self.push(nr);
        }
    }

    /// Appends `nr` when it is `Some`; does nothing for `None`.
    fn push_optional(&mut self, nr: Option<i64>) {
        match nr {
            Some(value) => self.push(value),
            None => {}
        }
    }

    /// Consumes the list and returns its numbers sorted ascending without
    /// duplicates.
    fn finish(self) -> Vec<u32> {
        let mut sorted = self.nrs;
        sorted.sort_unstable();
        sorted.dedup();
        sorted
    }
}
164
165const BASE_NOTIF_SYSCALLS: &[i64] = &[
166 libc::SYS_clone,
167 libc::SYS_clone3,
168 libc::SYS_wait4,
169 libc::SYS_waitid,
170];
171
172const MEMORY_NOTIF_SYSCALLS: &[i64] = &[
173 libc::SYS_mmap,
174 libc::SYS_munmap,
175 libc::SYS_brk,
176 libc::SYS_mremap,
177];
178
179const NETWORK_POLICY_SYSCALLS: &[i64] = &[
180 libc::SYS_connect,
181 libc::SYS_sendto,
182 libc::SYS_sendmsg,
183 libc::SYS_sendmmsg,
184 libc::SYS_bind,
185];
186
187const RANDOM_NOTIF_SYSCALLS: &[i64] = &[libc::SYS_getrandom, libc::SYS_openat];
189
190const TIME_NOTIF_SYSCALLS: &[i64] = &[
193 libc::SYS_clock_nanosleep,
194 libc::SYS_timerfd_settime,
195 libc::SYS_timer_settime,
196 libc::SYS_openat,
197];
198
199fn procfs_hosts_notif_syscalls() -> Vec<i64> {
209 let mut v = vec![libc::SYS_openat, arch::SYS_OPENAT2, libc::SYS_getdents64];
210 v.extend([arch::sys_open(), arch::sys_getdents()].into_iter().flatten());
211 v
212}
213
214const NETLINK_NOTIF_SYSCALLS: &[i64] = &[
223 libc::SYS_socket,
224 libc::SYS_bind,
225 libc::SYS_getsockname,
226 libc::SYS_recvfrom,
227 libc::SYS_recvmsg,
228 libc::SYS_close,
229];
230
231fn cow_path_syscalls() -> Vec<i64> {
232 let mut v = vec![
233 libc::SYS_openat,
234 libc::SYS_execve,
235 libc::SYS_execveat,
236 libc::SYS_unlinkat,
237 libc::SYS_mkdirat,
238 libc::SYS_renameat2,
239 libc::SYS_symlinkat,
240 libc::SYS_linkat,
241 libc::SYS_fchmodat,
242 libc::SYS_fchownat,
243 libc::SYS_truncate,
244 libc::SYS_utimensat,
245 libc::SYS_newfstatat,
246 libc::SYS_statx,
247 libc::SYS_faccessat,
248 arch::SYS_FACCESSAT2,
249 libc::SYS_readlinkat,
250 libc::SYS_getdents64,
251 libc::SYS_chdir,
252 libc::SYS_getcwd,
253 ];
254 v.extend(
255 [
256 arch::sys_open(),
257 arch::sys_unlink(),
258 arch::sys_rmdir(),
259 arch::sys_mkdir(),
260 arch::sys_rename(),
261 arch::sys_symlink(),
262 arch::sys_link(),
263 arch::sys_chmod(),
264 arch::sys_chown(),
265 arch::sys_lchown(),
266 arch::sys_stat(),
267 arch::sys_lstat(),
268 arch::sys_access(),
269 arch::sys_readlink(),
270 arch::sys_getdents(),
271 ]
272 .into_iter()
273 .flatten(),
274 );
275 v
276}
277
278fn chroot_path_syscalls() -> Vec<i64> {
279 let mut v = vec![
280 libc::SYS_openat,
281 libc::SYS_execve,
282 libc::SYS_execveat,
283 libc::SYS_unlinkat,
284 libc::SYS_mkdirat,
285 libc::SYS_renameat2,
286 libc::SYS_symlinkat,
287 libc::SYS_linkat,
288 libc::SYS_fchmodat,
289 libc::SYS_fchownat,
290 libc::SYS_truncate,
291 libc::SYS_newfstatat,
292 libc::SYS_statx,
293 libc::SYS_faccessat,
294 arch::SYS_FACCESSAT2,
295 libc::SYS_readlinkat,
296 libc::SYS_getdents64,
297 libc::SYS_chdir,
298 libc::SYS_getcwd,
299 libc::SYS_statfs,
300 libc::SYS_utimensat,
301 libc::SYS_getxattr,
306 libc::SYS_lgetxattr,
307 libc::SYS_setxattr,
308 libc::SYS_lsetxattr,
309 libc::SYS_listxattr,
310 libc::SYS_llistxattr,
311 libc::SYS_removexattr,
312 libc::SYS_lremovexattr,
313 ];
314 v.extend(
315 [
316 arch::sys_open(),
317 arch::sys_stat(),
318 arch::sys_lstat(),
319 arch::sys_access(),
320 arch::sys_readlink(),
321 arch::sys_getdents(),
322 arch::sys_unlink(),
323 arch::sys_rmdir(),
324 arch::sys_mkdir(),
325 arch::sys_rename(),
326 arch::sys_symlink(),
327 arch::sys_link(),
328 arch::sys_chmod(),
329 arch::sys_chown(),
330 arch::sys_lchown(),
331 ]
332 .into_iter()
333 .flatten(),
334 );
335 v
336}
337
338fn fs_denied_path_syscalls() -> Vec<i64> {
339 let mut v = vec![
340 libc::SYS_openat,
341 libc::SYS_execve,
342 libc::SYS_execveat,
343 libc::SYS_linkat,
344 libc::SYS_renameat2,
345 libc::SYS_symlinkat,
346 ];
347 v.extend(
348 [
349 arch::sys_open(),
350 arch::sys_link(),
351 arch::sys_rename(),
352 arch::sys_symlink(),
353 ]
354 .into_iter()
355 .flatten(),
356 );
357 v
358}
359
360const POLICY_EVENT_SYSCALLS: &[i64] = &[
361 libc::SYS_openat,
362 libc::SYS_connect,
363 libc::SYS_sendto,
364 libc::SYS_bind,
365 libc::SYS_execve,
366 libc::SYS_execveat,
367];
368
369const PORT_REMAP_SYSCALLS: &[i64] = &[
370 libc::SYS_bind,
371 libc::SYS_getsockname,
372];
373
374fn needs_network_supervision(policy: &Sandbox) -> bool {
375 !policy.net_allow.is_empty()
376 || !policy.net_deny.is_empty()
377 || !policy.net_deny_bind.is_empty()
378 || policy.policy_fn.is_some()
379 || !policy.http_allow.is_empty()
380 || !policy.http_deny.is_empty()
381}
382
383pub fn notif_syscalls(policy: &Sandbox, sandbox_name: Option<&str>) -> Vec<u32> {
385 let mut nrs = SyscallList::with(BASE_NOTIF_SYSCALLS);
386 nrs.push_optional(arch::sys_vfork());
387
388 if policy.policy_fn.is_some() {
395 nrs.push_optional(arch::sys_fork());
396 }
397
398 if policy.max_memory.is_some() {
399 nrs.extend(MEMORY_NOTIF_SYSCALLS);
400 if policy.allows_sysv_ipc() {
406 nrs.push(libc::SYS_shmget);
407 }
408 }
409
410 if needs_network_supervision(policy) {
411 nrs.extend(NETWORK_POLICY_SYSCALLS);
412 }
413
414 if policy.random_seed.is_some() {
415 nrs.extend(RANDOM_NOTIF_SYSCALLS);
416 }
417
418 if policy.time_start.is_some() {
419 nrs.extend(TIME_NOTIF_SYSCALLS);
420 }
421
422 nrs.extend(&procfs_hosts_notif_syscalls());
423 nrs.extend(NETLINK_NOTIF_SYSCALLS);
424
425 if policy.num_cpus.is_some() {
427 nrs.push(libc::SYS_sched_getaffinity);
428 }
429 if sandbox_name.is_some() {
430 nrs.extend(&[libc::SYS_uname, libc::SYS_openat]);
431 }
432
433 if policy.workdir.is_some() {
435 nrs.extend(&cow_path_syscalls());
436 }
437
438 if policy.chroot.is_some() {
440 nrs.extend(&chroot_path_syscalls());
441 }
442
443 if !policy.fs_denied.is_empty() {
445 nrs.extend(&fs_denied_path_syscalls());
446 }
447
448 if policy.policy_fn.is_some() {
450 nrs.extend(POLICY_EVENT_SYSCALLS);
451 }
452
453 if policy.port_remap {
455 nrs.extend(PORT_REMAP_SYSCALLS);
456 }
457
458 nrs.finish()
459}
460
461fn resolve_blocklist(base: &[&str], policy: &Sandbox) -> Vec<u32> {
470 let mut set: SysnoSet = base
471 .iter()
472 .copied()
473 .chain(policy.extra_deny_syscalls.iter().map(String::as_str))
474 .filter_map(|n| n.parse::<Sysno>().ok())
475 .collect();
476 if !policy.allows_sysv_ipc() {
477 for name in SYSV_IPC_BLOCKLIST_SYSCALLS {
478 if let Ok(sysno) = name.parse::<Sysno>() {
479 set.insert(sysno);
480 }
481 }
482 }
483 set.iter().map(|s| s.id() as u32).collect()
484}
485
486pub fn no_supervisor_blocklist_syscall_numbers(policy: &Sandbox) -> Vec<u32> {
489 use crate::sys::structs::NO_SUPERVISOR_BLOCKLIST_SYSCALLS;
490 resolve_blocklist(NO_SUPERVISOR_BLOCKLIST_SYSCALLS, policy)
491}
492
493pub fn blocklist_syscall_numbers(policy: &Sandbox) -> Vec<u32> {
498 resolve_blocklist(DEFAULT_BLOCKLIST_SYSCALLS, policy)
499}
500
501pub fn arg_filters(policy: &Sandbox) -> Vec<SockFilter> {
510 let ret_errno = SECCOMP_RET_ERRNO | EPERM as u32;
511 let nr_clone = libc::SYS_clone as u32;
512 let nr_ioctl = libc::SYS_ioctl as u32;
513 let nr_prctl = libc::SYS_prctl as u32;
514 let nr_socket = libc::SYS_socket as u32;
515
516 let mut insns: Vec<SockFilter> = Vec::new();
517
518 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
526 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_clone, 0, 3));
527 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
528 insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, CLONE_NS_FLAGS as u32, 0, 1));
529 insns.push(stmt(BPF_RET | BPF_K, ret_errno));
530
531 let dangerous_ioctls: &[u32] = &[
537 TIOCSTI as u32,
538 TIOCLINUX as u32,
539 SIOCGIFNAME as u32,
540 SIOCGIFCONF as u32,
541 SIOCGIFFLAGS as u32,
542 SIOCGIFADDR as u32,
543 SIOCGIFDSTADDR as u32,
544 SIOCGIFBRDADDR as u32,
545 SIOCGIFNETMASK as u32,
546 SIOCGIFHWADDR as u32,
547 SIOCGIFINDEX as u32,
548 SIOCETHTOOL as u32,
549 ];
550 let n_ioctls = dangerous_ioctls.len();
551 let skip_count = (1 + n_ioctls * 2) as u8;
552 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
553 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_ioctl, 0, skip_count));
554 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
555 for &cmd in dangerous_ioctls {
556 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, cmd, 0, 1));
557 insns.push(stmt(BPF_RET | BPF_K, ret_errno));
558 }
559
560 let dangerous_prctl_ops: &[u32] = &[PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER];
563 let n_ops = dangerous_prctl_ops.len();
564 let skip_count = (1 + n_ops * 2) as u8;
565 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
566 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_prctl, 0, skip_count));
567 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
568 for &op in dangerous_prctl_ops {
569 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, op, 0, 1));
570 insns.push(stmt(BPF_RET | BPF_K, ret_errno));
571 }
572
573 use crate::sandbox::Protocol;
587 let any_udp_rule = policy.net_allow.iter().any(|r| r.protocol == Protocol::Udp);
588 let any_icmp_rule = policy.net_allow.iter().any(|r| r.protocol == Protocol::Icmp);
589 let net_deny_active = !policy.net_deny.is_empty();
594 let mut blocked_types: Vec<u32> = Vec::new();
595 blocked_types.push(SOCK_RAW);
596 if !any_udp_rule && !any_icmp_rule && !net_deny_active {
597 blocked_types.push(SOCK_DGRAM);
598 }
599
600 if !blocked_types.is_empty() {
601 let n = blocked_types.len();
602 let after_domain = 2 + n + 1;
604 let skip_all = (3 + after_domain) as u8;
606
607 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
608 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_socket, 0, skip_all));
609 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
611 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET, 1, 0));
613 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET6, 0, after_domain as u8));
615 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
617 insns.push(stmt(BPF_ALU | BPF_AND | BPF_K, SOCK_TYPE_MASK));
618 for (i, &sock_type) in blocked_types.iter().enumerate() {
620 let remaining = n - i - 1;
621 let jf: u8 = if remaining == 0 { 1 } else { 0 };
625 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, sock_type, remaining as u8, jf));
626 }
627 insns.push(stmt(BPF_RET | BPF_K, ret_errno));
629 }
630
631 {
645 let nr_wait4 = libc::SYS_wait4 as u32;
646 let wnohang_or_wnowait = (libc::WNOHANG | 0x0100_0000) as u32;
647 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
648 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_wait4, 0, 3));
649 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS2_LO));
650 insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, wnohang_or_wnowait, 0, 1));
651 insns.push(stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
652 }
653
654 {
657 let nr_waitid = libc::SYS_waitid as u32;
658 let wnohang_or_wnowait = (libc::WNOHANG | 0x0100_0000) as u32;
659 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
660 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_waitid, 0, 3));
661 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS3_LO));
662 insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, wnohang_or_wnowait, 0, 1));
663 insns.push(stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
664 }
665
666 insns
667}
668
669fn close_fds_above(min_fd: RawFd, keep: &[RawFd]) {
675 let fds_to_close: Vec<RawFd> = {
679 let dir = match std::fs::read_dir("/proc/self/fd") {
680 Ok(d) => d,
681 Err(_) => return,
682 };
683 dir.flatten()
684 .filter_map(|entry| {
685 entry.file_name().into_string().ok()
686 .and_then(|name| name.parse::<RawFd>().ok())
687 })
688 .filter(|&fd| fd > min_fd && !keep.contains(&fd))
689 .collect()
690 };
691 for fd in fds_to_close {
693 unsafe { libc::close(fd) };
694 }
695}
696
/// Writes single-entry UID and GID mappings for the current process.
///
/// Three files are written in this order: `uid_map`, `setgroups` (with
/// `deny`), `gid_map`. Each mapping line maps the target id to the real id
/// with a range of one. Write errors are ignored.
fn write_id_maps(real_uid: u32, real_gid: u32, target_uid: u32, target_gid: u32) {
    let steps = [
        (
            "/proc/self/uid_map",
            format!("{} {} 1\n", target_uid, real_uid),
        ),
        ("/proc/self/setgroups", String::from("deny\n")),
        (
            "/proc/self/gid_map",
            format!("{} {} 1\n", target_gid, real_gid),
        ),
    ];
    for (path, contents) in steps.iter() {
        let _ = std::fs::write(path, contents);
    }
}
710
711pub(crate) struct ChildSpawnArgs<'a> {
723 pub sandbox: &'a Sandbox,
724 pub cmd: &'a [CString],
725 pub pipes: &'a PipePair,
726 pub no_supervisor: bool,
730 pub keep_fds: &'a [RawFd],
731 pub sandbox_name: Option<&'a str>,
734 pub extra_syscalls: &'a [u32],
738 pub parent_pid: libc::pid_t,
742}
743
744pub(crate) fn confine_child(args: ChildSpawnArgs<'_>) -> ! {
749 let ChildSpawnArgs {
750 sandbox,
751 cmd,
752 pipes,
753 no_supervisor,
754 keep_fds,
755 sandbox_name,
756 extra_syscalls,
757 parent_pid,
758 } = args;
759 macro_rules! fail {
761 ($msg:expr) => {{
762 let err = std::io::Error::last_os_error();
763 let _ = write!(std::io::stderr(), "sandlock child: {}: {}\n", $msg, err);
764 unsafe { libc::_exit(127) };
765 }};
766 }
767
768 use std::io::Write;
769
770 if unsafe { libc::setpgid(0, 0) } != 0 {
772 fail!("setpgid");
773 }
774
775 if unsafe { libc::isatty(0) } == 1 {
780 unsafe {
781 libc::signal(libc::SIGTTOU, libc::SIG_IGN);
782 libc::tcsetpgrp(0, libc::getpgrp());
783 libc::signal(libc::SIGTTOU, libc::SIG_DFL);
784 }
785 }
786
787 if unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) } != 0 {
789 fail!("prctl(PR_SET_PDEATHSIG)");
790 }
791
792 if unsafe { libc::getppid() } != parent_pid {
797 fail!("parent died before confinement");
798 }
799
800 if sandbox.no_randomize_memory {
802 const ADDR_NO_RANDOMIZE: libc::c_ulong = 0x0040000;
803 let current = unsafe { libc::personality(0xffffffff) };
805 if current == -1 {
806 fail!("personality(query)");
807 }
808 if unsafe { libc::personality(current as libc::c_ulong | ADDR_NO_RANDOMIZE) } == -1 {
809 fail!("personality(ADDR_NO_RANDOMIZE)");
810 }
811 }
812
813 if let Some(ref cores) = sandbox.cpu_cores {
815 if !cores.is_empty() {
816 let mut set = unsafe { std::mem::zeroed::<libc::cpu_set_t>() };
817 unsafe { libc::CPU_ZERO(&mut set) };
818 for &core in cores {
819 unsafe { libc::CPU_SET(core as usize, &mut set) };
820 }
821 if unsafe {
822 libc::sched_setaffinity(
823 0,
824 std::mem::size_of::<libc::cpu_set_t>(),
825 &set,
826 )
827 } != 0
828 {
829 fail!("sched_setaffinity");
830 }
831 }
832 }
833
834 if sandbox.no_huge_pages {
836 if unsafe { libc::prctl(libc::PR_SET_THP_DISABLE, 1, 0, 0, 0) } != 0 {
837 fail!("prctl(PR_SET_THP_DISABLE)");
838 }
839 }
840
841 if sandbox.no_coredump {
843 let rlim = libc::rlimit { rlim_cur: 0, rlim_max: 0 };
849 if unsafe { libc::setrlimit(libc::RLIMIT_CORE, &rlim) } != 0 {
850 fail!("setrlimit(RLIMIT_CORE, 0)");
851 }
852 }
853
854 let real_uid = unsafe { libc::getuid() };
856 let real_gid = unsafe { libc::getgid() };
857
858 if let Some(target_uid) = sandbox.uid {
860 if unsafe { libc::unshare(libc::CLONE_NEWUSER) } != 0 {
861 fail!("unshare(CLONE_NEWUSER)");
862 }
863 write_id_maps(real_uid, real_gid, target_uid, target_uid);
864 }
865
866 let effective_cwd = if let Some(ref cwd) = sandbox.cwd {
869 if let Some(ref chroot_root) = sandbox.chroot {
870 Some(chroot_root.join(cwd.strip_prefix("/").unwrap_or(cwd)))
871 } else {
872 Some(cwd.clone())
873 }
874 } else if let Some(ref chroot_root) = sandbox.chroot {
875 Some(chroot_root.to_path_buf())
877 } else if let Some(ref workdir) = sandbox.workdir {
878 Some(workdir.clone())
880 } else {
881 None
882 };
883
884 if let Some(ref cwd) = effective_cwd {
885 let c_path = match CString::new(cwd.as_os_str().as_encoded_bytes()) {
886 Ok(c) => c,
887 Err(_) => fail!("invalid cwd path"),
888 };
889 if unsafe { libc::chdir(c_path.as_ptr()) } != 0 {
890 fail!("chdir");
891 }
892 }
893
894 if unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) } != 0 {
896 fail!("prctl(PR_SET_NO_NEW_PRIVS)");
897 }
898
899 if let Err(e) = crate::landlock::confine(sandbox) {
901 fail!(format!("landlock: {}", e));
902 }
903
904 let args = arg_filters(sandbox);
906 let mut keep_fd: i32 = -1;
907
908 if no_supervisor {
909 let deny = no_supervisor_blocklist_syscall_numbers(sandbox);
918 let filter = match bpf::assemble_filter(&[], &deny, &args) {
919 Ok(f) => f,
920 Err(e) => fail!(format!("seccomp assemble: {}", e)),
921 };
922 if let Err(e) = bpf::install_deny_filter(&filter) {
923 fail!(format!("seccomp deny filter: {}", e));
924 }
925 if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), 0) {
927 fail!(format!("write no-supervisor signal: {}", e));
928 }
929 } else {
930 let deny = blocklist_syscall_numbers(sandbox);
931 let mut notif = notif_syscalls(sandbox, sandbox_name);
939 if !extra_syscalls.is_empty() {
940 notif.extend_from_slice(extra_syscalls);
941 }
942 let exec_extra = extra_syscalls.iter().any(|&n| {
949 n == libc::SYS_execve as u32 || n == libc::SYS_execveat as u32
950 });
951 if exec_extra {
952 arch::push_optional_syscall(&mut notif, arch::sys_fork());
953 }
954 notif.sort_unstable();
955 notif.dedup();
956 let filter = match bpf::assemble_filter(¬if, &deny, &args) {
957 Ok(f) => f,
958 Err(e) => fail!(format!("seccomp assemble: {}", e)),
959 };
960 let notif_fd = match bpf::install_filter(&filter) {
961 Ok(fd) => fd,
962 Err(e) => {
963 if e.raw_os_error() == Some(libc::EBUSY) {
969 let _ = write!(
970 std::io::stderr(),
971 "sandlock child: seccomp install: {} (an outer sandbox already owns the \
972 seccomp listener; pass --no-supervisor or Sandbox::no_supervisor(true) \
973 on this sandbox to nest)\n",
974 e,
975 );
976 unsafe { libc::_exit(127) };
977 }
978 fail!(format!("seccomp install: {}", e));
979 }
980 };
981 keep_fd = notif_fd.as_raw_fd();
982 if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), keep_fd as u32) {
983 fail!(format!("write notif fd: {}", e));
984 }
985 std::mem::forget(notif_fd);
986 }
987
988 match read_u32_fd(pipes.ready_r.as_raw_fd()) {
990 Ok(_) => {}
991 Err(e) => fail!(format!("read ready signal: {}", e)),
992 }
993
994 let mut fds_to_keep: Vec<RawFd> = keep_fds.to_vec();
996 if keep_fd >= 0 {
997 fds_to_keep.push(keep_fd);
998 }
999 close_fds_above(2, &fds_to_keep);
1000
1001 if sandbox.clean_env {
1003 for (key, _) in std::env::vars_os() {
1005 std::env::remove_var(&key);
1006 }
1007 }
1008 for (key, value) in &sandbox.env {
1009 std::env::set_var(key, value);
1010 }
1011
1012 if let Some(ref devices) = sandbox.gpu_devices {
1014 if !devices.is_empty() {
1015 let vis = devices.iter().map(|d| d.to_string()).collect::<Vec<_>>().join(",");
1016 std::env::set_var("CUDA_VISIBLE_DEVICES", &vis);
1017 std::env::set_var("ROCR_VISIBLE_DEVICES", &vis);
1018 }
1019 }
1021
1022 debug_assert!(!cmd.is_empty(), "cmd must not be empty");
1024 let argv_ptrs: Vec<*const libc::c_char> = cmd
1025 .iter()
1026 .map(|s| s.as_ptr())
1027 .chain(std::iter::once(std::ptr::null()))
1028 .collect();
1029
1030 if sandbox.chroot.is_some() {
1031 let mut exec_path = vec![0u8; libc::PATH_MAX as usize];
1037 let orig = cmd[0].as_bytes_with_nul();
1038 exec_path[..orig.len()].copy_from_slice(orig);
1039
1040 unsafe {
1041 libc::execvp(
1042 exec_path.as_ptr() as *const libc::c_char,
1043 argv_ptrs.as_ptr(),
1044 )
1045 };
1046 } else {
1047 unsafe { libc::execvp(argv_ptrs[0], argv_ptrs.as_ptr()) };
1048 }
1049
1050 fail!(format!("execvp '{}'", cmd[0].to_string_lossy()));
1052}
1053
#[cfg(test)]
mod tests {
    use super::*;
    use crate::sys::structs::{BPF_ALU, BPF_AND, BPF_JEQ, BPF_JMP, BPF_JSET, BPF_K};

    // ---- helpers -------------------------------------------------------------

    /// True when `nrs` contains the syscall number `nr`.
    fn has(nrs: &[u32], nr: i64) -> bool {
        nrs.contains(&(nr as u32))
    }

    /// True when `nrs` contains every number in `wanted`.
    fn has_all(nrs: &[u32], wanted: &[i64]) -> bool {
        wanted.iter().all(|&nr| has(nrs, nr))
    }

    /// True when `nrs` contains at least one number in `wanted`.
    fn has_any(nrs: &[u32], wanted: &[i64]) -> bool {
        wanted.iter().any(|&nr| has(nrs, nr))
    }

    /// True when `filters` holds a `JEQ` instruction comparing against `k`.
    fn has_jeq(filters: &[SockFilter], k: u32) -> bool {
        filters
            .iter()
            .any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K) && f.k == k)
    }

    /// Policy built with all defaults.
    fn default_policy() -> Sandbox {
        Sandbox::builder().build().unwrap()
    }

    /// Default policy with SysV IPC explicitly allowed.
    fn sysv_ipc_policy() -> Sandbox {
        Sandbox::builder()
            .extra_allow_syscalls(vec!["sysv_ipc".into()])
            .build()
            .unwrap()
    }

    /// Sends `val` through a fresh pipe and returns what comes out.
    fn pipe_roundtrip(val: u32) -> u32 {
        let pipes = PipePair::new().expect("pipe creation failed");
        write_u32_fd(pipes.notif_w.as_raw_fd(), val).expect("write failed");
        read_u32_fd(pipes.notif_r.as_raw_fd()).expect("read failed")
    }

    // ---- pipes ---------------------------------------------------------------

    #[test]
    fn test_pipe_pair_creation() {
        let pipes = PipePair::new().expect("pipe creation failed");
        let fds = [
            pipes.notif_r.as_raw_fd(),
            pipes.notif_w.as_raw_fd(),
            pipes.ready_r.as_raw_fd(),
            pipes.ready_w.as_raw_fd(),
        ];
        for fd in fds.iter() {
            assert!(*fd >= 0);
        }
        // All four descriptors must be distinct.
        for (idx, first) in fds.iter().enumerate() {
            for second in &fds[idx + 1..] {
                assert_ne!(first, second);
            }
        }
    }

    #[test]
    fn test_write_read_u32() {
        assert_eq!(pipe_roundtrip(42u32), 42u32);
    }

    #[test]
    fn test_write_read_u32_large() {
        assert_eq!(pipe_roundtrip(0xDEAD_BEEFu32), 0xDEAD_BEEFu32);
    }

    // ---- notif_syscalls ------------------------------------------------------

    #[test]
    fn test_notif_syscalls_always_has_clone() {
        let nrs = notif_syscalls(&default_policy(), None);
        assert!(has(&nrs, libc::SYS_clone));
        assert!(has(&nrs, libc::SYS_clone3));
        if let Some(vfork) = arch::sys_vfork() {
            assert!(has(&nrs, vfork));
        }
        // `fork` is only present when a policy callback is installed.
        if let Some(fork) = arch::sys_fork() {
            assert!(!has(&nrs, fork));
        }
    }

    #[test]
    fn test_notif_syscalls_fork_gated_on_policy_fn() {
        let Some(fork) = arch::sys_fork() else { return };
        let policy = Sandbox::builder()
            .policy_fn(|_event, _ctx| crate::policy_fn::Verdict::Allow)
            .build()
            .unwrap();
        assert!(has(&notif_syscalls(&policy, None), fork));
    }

    #[test]
    fn test_notif_syscalls_memory() {
        let policy = Sandbox::builder()
            .max_memory(crate::sandbox::ByteSize::mib(256))
            .extra_allow_syscalls(vec!["sysv_ipc".into()])
            .build()
            .unwrap();
        let nrs = notif_syscalls(&policy, None);
        assert!(has_all(
            &nrs,
            &[
                libc::SYS_mmap,
                libc::SYS_munmap,
                libc::SYS_brk,
                libc::SYS_mremap,
                libc::SYS_shmget,
            ],
        ));
    }

    #[test]
    fn test_notif_syscalls_memory_excludes_shmget_when_sysv_ipc_denied() {
        let policy = Sandbox::builder()
            .max_memory(crate::sandbox::ByteSize::mib(256))
            .build()
            .unwrap();
        let nrs = notif_syscalls(&policy, None);
        assert!(!has(&nrs, libc::SYS_shmget));
        assert!(has_all(&nrs, &[libc::SYS_mmap, libc::SYS_brk]));
    }

    #[test]
    fn test_notif_syscalls_net() {
        let policy = Sandbox::builder()
            .net_allow("example.com:443")
            .build()
            .unwrap();
        let nrs = notif_syscalls(&policy, None);
        assert!(has_all(
            &nrs,
            &[
                libc::SYS_connect,
                libc::SYS_sendto,
                libc::SYS_sendmsg,
                libc::SYS_sendmmsg,
            ],
        ));
    }

    #[test]
    fn test_notif_syscalls_net_deny() {
        let policy = Sandbox::builder().net_deny("10.0.0.0/8").build().unwrap();
        let nrs = notif_syscalls(&policy, None);
        assert!(has_all(&nrs, &[libc::SYS_connect, libc::SYS_sendto]));
    }

    #[test]
    fn test_notif_syscalls_sandbox_name_enables_hostname_virtualization() {
        let nrs = notif_syscalls(&default_policy(), Some("api.local"));
        assert!(has_all(&nrs, &[libc::SYS_uname, libc::SYS_openat]));
    }

    #[test]
    fn test_notif_syscalls_faccessat2() {
        let chrooted = Sandbox::builder().chroot("/tmp").build().unwrap();
        let nrs = notif_syscalls(&chrooted, None);
        assert!(has(&nrs, libc::SYS_faccessat));
        assert!(
            has(&nrs, arch::SYS_FACCESSAT2),
            "chroot notif filter must include SYS_faccessat2 (439)"
        );

        let with_workdir = Sandbox::builder().workdir("/tmp").build().unwrap();
        let nrs = notif_syscalls(&with_workdir, None);
        assert!(has(&nrs, libc::SYS_faccessat));
        assert!(
            has(&nrs, arch::SYS_FACCESSAT2),
            "COW notif filter must include SYS_faccessat2 (439)"
        );
    }

    // ---- blocklists ----------------------------------------------------------

    #[test]
    fn test_blocklist_syscall_numbers_default() {
        let nrs = blocklist_syscall_numbers(&default_policy());
        assert!(has_all(
            &nrs,
            &[libc::SYS_mount, libc::SYS_ptrace, libc::SYS_bpf],
        ));
        // SysV IPC is denied unless explicitly allowed.
        assert!(has_all(
            &nrs,
            &[
                libc::SYS_shmget,
                libc::SYS_shmat,
                libc::SYS_msgget,
                libc::SYS_semget,
            ],
        ));
        assert!(!nrs.is_empty());
    }

    #[test]
    fn test_blocklist_syscall_numbers_custom() {
        let policy = Sandbox::builder()
            .extra_deny_syscalls(vec!["mount".into(), "ptrace".into()])
            .build()
            .unwrap();
        let nrs = blocklist_syscall_numbers(&policy);
        assert!(has_all(
            &nrs,
            &[libc::SYS_mount, libc::SYS_ptrace, libc::SYS_shmget],
        ));
    }

    #[test]
    fn test_blocklist_syscall_numbers_custom_with_sysv_ipc_allowed() {
        let policy = Sandbox::builder()
            .extra_deny_syscalls(vec!["mount".into(), "ptrace".into()])
            .extra_allow_syscalls(vec!["sysv_ipc".into()])
            .build()
            .unwrap();
        let nrs = blocklist_syscall_numbers(&policy);
        assert!(has_all(
            &nrs,
            &[libc::SYS_mount, libc::SYS_ptrace, libc::SYS_bpf],
        ));
        assert!(!has(&nrs, libc::SYS_shmget));
    }

    #[test]
    fn test_blocklist_syscall_numbers_default_with_sysv_ipc_allowed() {
        let nrs = blocklist_syscall_numbers(&sysv_ipc_policy());
        assert!(has(&nrs, libc::SYS_mount));
        assert!(!has_any(
            &nrs,
            &[libc::SYS_shmget, libc::SYS_msgget, libc::SYS_semget],
        ));
    }

    #[test]
    fn test_no_supervisor_blocklist_includes_sysv_ipc_by_default() {
        let nrs = no_supervisor_blocklist_syscall_numbers(&default_policy());
        assert!(has_all(
            &nrs,
            &[libc::SYS_shmget, libc::SYS_msgget, libc::SYS_semget],
        ));
    }

    #[test]
    fn test_no_supervisor_blocklist_excludes_sysv_ipc_when_allowed() {
        let nrs = no_supervisor_blocklist_syscall_numbers(&sysv_ipc_policy());
        assert!(!has_any(
            &nrs,
            &[libc::SYS_shmget, libc::SYS_msgget, libc::SYS_semget],
        ));
    }

    // ---- arg_filters ---------------------------------------------------------

    #[test]
    fn test_arg_filters_has_clone_ioctl_prctl_socket() {
        let filters = arg_filters(&default_policy());

        // clone: syscall match plus the namespace-flag bit test.
        assert!(has_jeq(&filters, libc::SYS_clone as u32));
        assert!(filters
            .iter()
            .any(|f| f.code == (BPF_JMP | BPF_JSET | BPF_K) && f.k == CLONE_NS_FLAGS as u32));

        // ioctl: syscall match plus a sample of the denied request codes.
        assert!(has_jeq(&filters, libc::SYS_ioctl as u32));
        assert!(has_jeq(&filters, TIOCSTI as u32));
        assert!(has_jeq(&filters, TIOCLINUX as u32));
        assert!(has_jeq(&filters, SIOCGIFCONF as u32));
        assert!(has_jeq(&filters, SIOCETHTOOL as u32));

        // prctl: syscall match plus one denied option.
        assert!(has_jeq(&filters, libc::SYS_prctl as u32));
        assert!(has_jeq(&filters, PR_SET_DUMPABLE));
    }

    #[test]
    fn test_arg_filters_raw_sockets() {
        let filters = arg_filters(&default_policy());
        assert!(has_jeq(&filters, AF_INET));
        assert!(has_jeq(&filters, AF_INET6));
        assert!(filters
            .iter()
            .any(|f| f.code == (BPF_ALU | BPF_AND | BPF_K) && f.k == SOCK_TYPE_MASK));
        assert!(has_jeq(&filters, SOCK_RAW));
    }

    #[test]
    fn test_arg_filters_udp_denied_by_default() {
        let filters = arg_filters(&default_policy());
        assert!(has_jeq(&filters, SOCK_DGRAM));
    }

    // ---- syscall name resolution ---------------------------------------------

    #[test]
    fn test_syscall_name_to_nr_covers_defaults() {
        // Names that are expected not to resolve on the listed architectures.
        let expected_unresolved: &[&str] = &[
            #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
            "ioperm",
            #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
            "iopl",
        ];
        let mut skipped = 0;
        for name in DEFAULT_BLOCKLIST_SYSCALLS.iter() {
            if syscall_name_to_nr(name).is_some() {
                continue;
            }
            assert!(
                expected_unresolved.contains(name),
                "unexpected unresolved syscall: {}",
                name
            );
            skipped += 1;
        }
        assert_eq!(skipped, expected_unresolved.len());
    }
}