1use std::ffi::CString;
5use std::io;
6use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd};
7use std::path::PathBuf;
8
9use crate::policy::{FsIsolation, Policy};
10use crate::seccomp::bpf::{self, stmt, jump};
11use crate::sys::structs::{
12 AF_INET, AF_INET6, AF_NETLINK,
13 BPF_ABS, BPF_ALU, BPF_AND, BPF_JEQ, BPF_JSET, BPF_JMP, BPF_K, BPF_LD, BPF_RET, BPF_W,
14 CLONE_NS_FLAGS, DEFAULT_DENY_SYSCALLS, EPERM, NETLINK_SOCK_DIAG, SECCOMP_RET_ERRNO,
15 SOCK_DGRAM, SOCK_RAW, SOCK_TYPE_MASK, TIOCLINUX, TIOCSTI,
16 PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER,
17 OFFSET_ARGS0_LO, OFFSET_ARGS1_LO, OFFSET_ARGS2_LO, OFFSET_NR,
18 SockFilter,
19};
20
21pub struct PipePair {
27 pub notif_r: OwnedFd,
29 pub notif_w: OwnedFd,
31 pub ready_r: OwnedFd,
33 pub ready_w: OwnedFd,
35}
36
37impl PipePair {
38 pub fn new() -> io::Result<Self> {
40 let mut notif_fds = [0i32; 2];
41 let mut ready_fds = [0i32; 2];
42
43 let ret = unsafe { libc::pipe2(notif_fds.as_mut_ptr(), libc::O_CLOEXEC) };
45 if ret < 0 {
46 return Err(io::Error::last_os_error());
47 }
48
49 let ret = unsafe { libc::pipe2(ready_fds.as_mut_ptr(), libc::O_CLOEXEC) };
50 if ret < 0 {
51 unsafe {
53 libc::close(notif_fds[0]);
54 libc::close(notif_fds[1]);
55 }
56 return Err(io::Error::last_os_error());
57 }
58
59 Ok(PipePair {
61 notif_r: unsafe { OwnedFd::from_raw_fd(notif_fds[0]) },
62 notif_w: unsafe { OwnedFd::from_raw_fd(notif_fds[1]) },
63 ready_r: unsafe { OwnedFd::from_raw_fd(ready_fds[0]) },
64 ready_w: unsafe { OwnedFd::from_raw_fd(ready_fds[1]) },
65 })
66 }
67}
68
69pub(crate) fn write_u32_fd(fd: RawFd, val: u32) -> io::Result<()> {
75 let buf = val.to_le_bytes();
76 let mut written = 0usize;
77 while written < 4 {
78 let ret = unsafe {
79 libc::write(
80 fd,
81 buf[written..].as_ptr() as *const libc::c_void,
82 4 - written,
83 )
84 };
85 if ret < 0 {
86 return Err(io::Error::last_os_error());
87 }
88 written += ret as usize;
89 }
90 Ok(())
91}
92
93pub(crate) fn read_u32_fd(fd: RawFd) -> io::Result<u32> {
95 let mut buf = [0u8; 4];
96 let mut total = 0usize;
97 while total < 4 {
98 let ret = unsafe {
99 libc::read(
100 fd,
101 buf[total..].as_mut_ptr() as *mut libc::c_void,
102 4 - total,
103 )
104 };
105 if ret < 0 {
106 return Err(io::Error::last_os_error());
107 }
108 if ret == 0 {
109 return Err(io::Error::new(
110 io::ErrorKind::UnexpectedEof,
111 "pipe closed before 4 bytes read",
112 ));
113 }
114 total += ret as usize;
115 }
116 Ok(u32::from_le_bytes(buf))
117}
118
119pub fn syscall_name_to_nr(name: &str) -> Option<u32> {
128 let nr: i64 = match name {
129 "mount" => libc::SYS_mount,
130 "umount2" => libc::SYS_umount2,
131 "pivot_root" => libc::SYS_pivot_root,
132 "swapon" => libc::SYS_swapon,
133 "swapoff" => libc::SYS_swapoff,
134 "reboot" => libc::SYS_reboot,
135 "sethostname" => libc::SYS_sethostname,
136 "setdomainname" => libc::SYS_setdomainname,
137 "kexec_load" => libc::SYS_kexec_load,
138 "init_module" => libc::SYS_init_module,
139 "finit_module" => libc::SYS_finit_module,
140 "delete_module" => libc::SYS_delete_module,
141 "unshare" => libc::SYS_unshare,
142 "setns" => libc::SYS_setns,
143 "perf_event_open" => libc::SYS_perf_event_open,
144 "bpf" => libc::SYS_bpf,
145 "userfaultfd" => libc::SYS_userfaultfd,
146 "keyctl" => libc::SYS_keyctl,
147 "add_key" => libc::SYS_add_key,
148 "request_key" => libc::SYS_request_key,
149 "ptrace" => libc::SYS_ptrace,
150 "process_vm_readv" => libc::SYS_process_vm_readv,
151 "process_vm_writev" => libc::SYS_process_vm_writev,
152 "open_by_handle_at" => libc::SYS_open_by_handle_at,
153 "name_to_handle_at" => libc::SYS_name_to_handle_at,
154 "ioperm" => libc::SYS_ioperm,
155 "iopl" => libc::SYS_iopl,
156 "quotactl" => libc::SYS_quotactl,
157 "acct" => libc::SYS_acct,
158 "lookup_dcookie" => libc::SYS_lookup_dcookie,
159 "io_uring_setup" => libc::SYS_io_uring_setup,
161 "io_uring_enter" => libc::SYS_io_uring_enter,
162 "io_uring_register" => libc::SYS_io_uring_register,
163 "clone" => libc::SYS_clone,
165 "clone3" => libc::SYS_clone3,
166 "vfork" => libc::SYS_vfork,
167 "mmap" => libc::SYS_mmap,
168 "munmap" => libc::SYS_munmap,
169 "brk" => libc::SYS_brk,
170 "mremap" => libc::SYS_mremap,
171 "connect" => libc::SYS_connect,
172 "sendto" => libc::SYS_sendto,
173 "sendmsg" => libc::SYS_sendmsg,
174 "ioctl" => libc::SYS_ioctl,
175 "socket" => libc::SYS_socket,
176 "prctl" => libc::SYS_prctl,
177 "getrandom" => libc::SYS_getrandom,
178 "openat" => libc::SYS_openat,
179 "open" => libc::SYS_open,
180 "getdents64" => libc::SYS_getdents64,
181 "getdents" => libc::SYS_getdents,
182 "bind" => libc::SYS_bind,
183 "getsockname" => libc::SYS_getsockname,
184 "clock_gettime" => libc::SYS_clock_gettime,
185 "gettimeofday" => libc::SYS_gettimeofday,
186 "time" => libc::SYS_time,
187 "clock_nanosleep" => libc::SYS_clock_nanosleep,
188 "timerfd_settime" => libc::SYS_timerfd_settime,
189 "timer_settime" => libc::SYS_timer_settime,
190 "execve" => libc::SYS_execve,
191 "execveat" => libc::SYS_execveat,
192 "unlinkat" => libc::SYS_unlinkat,
194 "mkdirat" => libc::SYS_mkdirat,
195 "renameat2" => libc::SYS_renameat2,
196 "newfstatat" => libc::SYS_newfstatat,
197 "statx" => libc::SYS_statx,
198 "faccessat" => libc::SYS_faccessat,
199 "symlinkat" => libc::SYS_symlinkat,
200 "linkat" => libc::SYS_linkat,
201 "fchmodat" => libc::SYS_fchmodat,
202 "fchownat" => libc::SYS_fchownat,
203 "readlinkat" => libc::SYS_readlinkat,
204 "truncate" => libc::SYS_truncate,
205 "utimensat" => libc::SYS_utimensat,
206 "unlink" => libc::SYS_unlink,
207 "rmdir" => libc::SYS_rmdir,
208 "mkdir" => libc::SYS_mkdir,
209 "rename" => libc::SYS_rename,
210 "stat" => libc::SYS_stat,
211 "lstat" => libc::SYS_lstat,
212 "access" => libc::SYS_access,
213 "symlink" => libc::SYS_symlink,
214 "link" => libc::SYS_link,
215 "chmod" => libc::SYS_chmod,
216 "chown" => libc::SYS_chown,
217 "lchown" => libc::SYS_lchown,
218 "readlink" => libc::SYS_readlink,
219 "futimesat" => libc::SYS_futimesat,
220 "fork" => libc::SYS_fork,
221 _ => return None,
222 };
223 Some(nr as u32)
224}
225
226pub fn notif_syscalls(policy: &Policy) -> Vec<u32> {
232 let mut nrs = vec![
233 libc::SYS_clone as u32,
234 libc::SYS_clone3 as u32,
235 libc::SYS_vfork as u32,
236 ];
237
238 if policy.max_memory.is_some() {
239 nrs.push(libc::SYS_mmap as u32);
240 nrs.push(libc::SYS_munmap as u32);
241 nrs.push(libc::SYS_brk as u32);
242 nrs.push(libc::SYS_mremap as u32);
243 nrs.push(libc::SYS_shmget as u32);
244 }
245
246 if !policy.net_allow_hosts.is_empty() || policy.policy_fn.is_some() {
247 nrs.push(libc::SYS_connect as u32);
248 nrs.push(libc::SYS_sendto as u32);
249 nrs.push(libc::SYS_sendmsg as u32);
250 nrs.push(libc::SYS_bind as u32);
251 }
252
253 if policy.random_seed.is_some() {
254 nrs.push(libc::SYS_getrandom as u32);
255 nrs.push(libc::SYS_openat as u32);
257 }
258
259 if policy.time_start.is_some() {
260 nrs.extend_from_slice(&[
261 libc::SYS_clock_nanosleep as u32,
262 libc::SYS_timerfd_settime as u32,
263 libc::SYS_timer_settime as u32,
264 ]);
265 nrs.push(libc::SYS_openat as u32);
268 }
269
270 if policy.num_cpus.is_some() || policy.max_memory.is_some() || policy.isolate_pids || policy.port_remap {
272 nrs.push(libc::SYS_openat as u32);
273 }
274 if policy.num_cpus.is_some() {
276 nrs.push(libc::SYS_sched_getaffinity as u32);
277 }
278 if policy.isolate_pids || policy.deterministic_dirs {
279 nrs.extend_from_slice(&[
280 libc::SYS_getdents64 as u32,
281 libc::SYS_getdents as u32,
282 ]);
283 }
284 if policy.hostname.is_some() {
285 nrs.push(libc::SYS_uname as u32);
286 nrs.push(libc::SYS_openat as u32);
287 }
288
289 if policy.workdir.is_some() && policy.fs_isolation == FsIsolation::None {
291 nrs.extend_from_slice(&[
292 libc::SYS_openat as u32,
293 libc::SYS_unlinkat as u32,
294 libc::SYS_mkdirat as u32,
295 libc::SYS_renameat2 as u32,
296 libc::SYS_symlinkat as u32,
297 libc::SYS_linkat as u32,
298 libc::SYS_fchmodat as u32,
299 libc::SYS_fchownat as u32,
300 libc::SYS_truncate as u32,
301 libc::SYS_newfstatat as u32,
302 libc::SYS_statx as u32,
303 libc::SYS_faccessat as u32,
304 libc::SYS_readlinkat as u32,
305 libc::SYS_getdents64 as u32,
306 libc::SYS_getdents as u32,
307 ]);
308 }
309
310 if policy.chroot.is_some() {
312 nrs.extend_from_slice(&[
313 libc::SYS_openat as u32,
314 libc::SYS_execve as u32,
315 libc::SYS_execveat as u32,
316 libc::SYS_unlinkat as u32,
317 libc::SYS_mkdirat as u32,
318 libc::SYS_renameat2 as u32,
319 libc::SYS_symlinkat as u32,
320 libc::SYS_linkat as u32,
321 libc::SYS_fchmodat as u32,
322 libc::SYS_fchownat as u32,
323 libc::SYS_truncate as u32,
324 libc::SYS_newfstatat as u32,
325 libc::SYS_statx as u32,
326 libc::SYS_faccessat as u32,
327 libc::SYS_readlinkat as u32,
328 libc::SYS_getdents64 as u32,
329 libc::SYS_getdents as u32,
330 libc::SYS_chdir as u32,
331 libc::SYS_getcwd as u32,
332 libc::SYS_statfs as u32,
333 libc::SYS_utimensat as u32,
334 ]);
335 }
336
337 if policy.policy_fn.is_some() {
339 nrs.extend_from_slice(&[
340 libc::SYS_openat as u32,
341 libc::SYS_connect as u32,
342 libc::SYS_sendto as u32,
343 libc::SYS_bind as u32,
344 libc::SYS_execve as u32,
345 libc::SYS_execveat as u32,
346 ]);
347 }
348
349 if policy.port_remap {
351 nrs.extend_from_slice(&[
352 libc::SYS_bind as u32,
353 libc::SYS_getsockname as u32,
354 ]);
355 }
356
357 nrs.sort_unstable();
358 nrs.dedup();
359 nrs
360}
361
362pub fn deny_syscall_numbers(policy: &Policy) -> Vec<u32> {
367 if let Some(ref names) = policy.deny_syscalls {
368 names
369 .iter()
370 .filter_map(|n| syscall_name_to_nr(n))
371 .collect()
372 } else if policy.allow_syscalls.is_none() {
373 DEFAULT_DENY_SYSCALLS
374 .iter()
375 .filter_map(|n| syscall_name_to_nr(n))
376 .collect()
377 } else {
378 Vec::new()
380 }
381}
382
383pub fn arg_filters(policy: &Policy) -> Vec<SockFilter> {
393 let ret_errno = SECCOMP_RET_ERRNO | EPERM as u32;
394 let nr_clone = libc::SYS_clone as u32;
395 let nr_ioctl = libc::SYS_ioctl as u32;
396 let nr_prctl = libc::SYS_prctl as u32;
397 let nr_socket = libc::SYS_socket as u32;
398
399 let mut insns: Vec<SockFilter> = Vec::new();
400
401 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
409 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_clone, 0, 3));
410 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
411 insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, CLONE_NS_FLAGS as u32, 0, 1));
412 insns.push(stmt(BPF_RET | BPF_K, ret_errno));
413
414 let dangerous_ioctls: &[u32] = &[TIOCSTI as u32, TIOCLINUX as u32];
417 let n_ioctls = dangerous_ioctls.len();
418 let skip_count = (1 + n_ioctls * 2) as u8;
419 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
420 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_ioctl, 0, skip_count));
421 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
422 for &cmd in dangerous_ioctls {
423 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, cmd, 0, 1));
424 insns.push(stmt(BPF_RET | BPF_K, ret_errno));
425 }
426
427 let dangerous_prctl_ops: &[u32] = &[PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER];
430 let n_ops = dangerous_prctl_ops.len();
431 let skip_count = (1 + n_ops * 2) as u8;
432 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
433 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_prctl, 0, skip_count));
434 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
435 for &op in dangerous_prctl_ops {
436 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, op, 0, 1));
437 insns.push(stmt(BPF_RET | BPF_K, ret_errno));
438 }
439
440 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
450 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_socket, 0, 5));
451 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
452 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_NETLINK, 0, 3));
453 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS2_LO));
454 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, NETLINK_SOCK_DIAG, 0, 1));
455 insns.push(stmt(BPF_RET | BPF_K, ret_errno));
456
457 let mut blocked_types: Vec<u32> = Vec::new();
459 if policy.no_raw_sockets {
460 blocked_types.push(SOCK_RAW);
461 }
462 if policy.no_udp {
463 blocked_types.push(SOCK_DGRAM);
464 }
465
466 if !blocked_types.is_empty() {
467 let n = blocked_types.len();
468 let after_domain = 2 + n + 1;
470 let skip_all = (3 + after_domain) as u8;
472
473 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
474 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_socket, 0, skip_all));
475 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
477 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET, 1, 0));
479 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET6, 0, after_domain as u8));
481 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
483 insns.push(stmt(BPF_ALU | BPF_AND | BPF_K, SOCK_TYPE_MASK));
484 for (i, &sock_type) in blocked_types.iter().enumerate() {
486 let remaining = n - i - 1;
487 let jf: u8 = if remaining == 0 { 1 } else { 0 };
491 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, sock_type, remaining as u8, jf));
492 }
493 insns.push(stmt(BPF_RET | BPF_K, ret_errno));
495 }
496
497 insns
498}
499
500fn close_fds_above(min_fd: RawFd, keep: &[RawFd]) {
506 let fds_to_close: Vec<RawFd> = {
510 let dir = match std::fs::read_dir("/proc/self/fd") {
511 Ok(d) => d,
512 Err(_) => return,
513 };
514 dir.flatten()
515 .filter_map(|entry| {
516 entry.file_name().into_string().ok()
517 .and_then(|name| name.parse::<RawFd>().ok())
518 })
519 .filter(|&fd| fd > min_fd && !keep.contains(&fd))
520 .collect()
521 };
522 for fd in fds_to_close {
524 unsafe { libc::close(fd) };
525 }
526}
527
/// Directories describing the overlay mount that `confine_child` sets up.
#[derive(Debug, Clone)]
pub(crate) struct CowConfig {
    /// Mount point the overlay is mounted on.
    pub merged: PathBuf,
    /// Passed as the overlay `upperdir=` option.
    pub upper: PathBuf,
    /// Passed as the overlay `workdir=` option.
    pub work: PathBuf,
    /// Joined with `:` and passed as the overlay `lowerdir=` option.
    pub lowers: Vec<PathBuf>,
}
539
/// Writes single-entry ID maps for the current process: ID 0 is mapped to
/// `real_uid` / `real_gid`.
///
/// The files are written in the order `uid_map`, `setgroups` (set to `deny`),
/// `gid_map`. Every write is best effort; failures are ignored.
fn write_id_maps(real_uid: u32, real_gid: u32) {
    let steps = [
        ("/proc/self/uid_map", format!("0 {} 1\n", real_uid)),
        ("/proc/self/setgroups", String::from("deny\n")),
        ("/proc/self/gid_map", format!("0 {} 1\n", real_gid)),
    ];
    for (path, contents) in steps.iter() {
        let _ = std::fs::write(path, contents);
    }
}
548
549fn write_id_maps_overflow() {
552 let uid = unsafe { libc::getuid() };
553 let gid = unsafe { libc::getgid() };
554 write_id_maps(uid, gid);
555}
556
557pub(crate) fn confine_child(policy: &Policy, cmd: &[CString], pipes: &PipePair, cow_config: Option<&CowConfig>, nested: bool) -> ! {
566 macro_rules! fail {
568 ($msg:expr) => {{
569 let err = std::io::Error::last_os_error();
570 let _ = write!(std::io::stderr(), "sandlock child: {}: {}\n", $msg, err);
571 unsafe { libc::_exit(127) };
572 }};
573 }
574
575 use std::io::Write;
576
577 if unsafe { libc::setpgid(0, 0) } != 0 {
579 fail!("setpgid");
580 }
581
582 if unsafe { libc::isatty(0) } == 1 {
587 unsafe {
588 libc::signal(libc::SIGTTOU, libc::SIG_IGN);
589 libc::tcsetpgrp(0, libc::getpgrp());
590 libc::signal(libc::SIGTTOU, libc::SIG_DFL);
591 }
592 }
593
594 if unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) } != 0 {
596 fail!("prctl(PR_SET_PDEATHSIG)");
597 }
598
599 if unsafe { libc::getppid() } == 1 {
601 fail!("parent died before confinement");
602 }
603
604 if policy.no_randomize_memory {
606 const ADDR_NO_RANDOMIZE: u64 = 0x0040000;
607 if unsafe { libc::personality(ADDR_NO_RANDOMIZE as libc::c_ulong) } == -1 {
608 fail!("personality(ADDR_NO_RANDOMIZE)");
609 }
610 }
611
612 if let Some(ref cores) = policy.cpu_cores {
614 if !cores.is_empty() {
615 let mut set = unsafe { std::mem::zeroed::<libc::cpu_set_t>() };
616 unsafe { libc::CPU_ZERO(&mut set) };
617 for &core in cores {
618 unsafe { libc::CPU_SET(core as usize, &mut set) };
619 }
620 if unsafe {
621 libc::sched_setaffinity(
622 0,
623 std::mem::size_of::<libc::cpu_set_t>(),
624 &set,
625 )
626 } != 0
627 {
628 fail!("sched_setaffinity");
629 }
630 }
631 }
632
633 if policy.no_huge_pages {
635 if unsafe { libc::prctl(libc::PR_SET_THP_DISABLE, 1, 0, 0, 0) } != 0 {
636 fail!("prctl(PR_SET_THP_DISABLE)");
637 }
638 }
639
640 let real_uid = unsafe { libc::getuid() };
642 let real_gid = unsafe { libc::getgid() };
643
644 if policy.privileged && cow_config.is_none() {
646 if unsafe { libc::unshare(libc::CLONE_NEWUSER) } != 0 {
647 fail!("unshare(CLONE_NEWUSER)");
648 }
649 write_id_maps(real_uid, real_gid);
650 }
651
652 if let Some(ref cow) = cow_config {
654 if unsafe { libc::unshare(libc::CLONE_NEWUSER | libc::CLONE_NEWNS) } != 0 {
656 fail!("unshare(CLONE_NEWUSER | CLONE_NEWNS)");
657 }
658
659 write_id_maps_overflow();
661
662 let lowerdir = cow.lowers.iter()
664 .map(|p| p.display().to_string())
665 .collect::<Vec<_>>()
666 .join(":");
667 let opts = format!(
668 "lowerdir={},upperdir={},workdir={}",
669 lowerdir,
670 cow.upper.display(),
671 cow.work.display(),
672 );
673
674 let merged_cstr = match CString::new(cow.merged.to_str().unwrap_or("")) {
675 Ok(c) => c,
676 Err(_) => fail!("invalid merged path"),
677 };
678 let overlay_cstr = CString::new("overlay").unwrap();
679 let opts_cstr = match CString::new(opts) {
680 Ok(c) => c,
681 Err(_) => fail!("invalid overlay opts"),
682 };
683
684 let ret = unsafe {
685 libc::mount(
686 overlay_cstr.as_ptr(),
687 merged_cstr.as_ptr(),
688 overlay_cstr.as_ptr(),
689 0,
690 opts_cstr.as_ptr() as *const libc::c_void,
691 )
692 };
693 if ret != 0 {
694 fail!("mount overlay");
695 }
696 }
697
698 let effective_workdir = if let Some(ref workdir) = policy.workdir {
701 if let Some(ref chroot_root) = policy.chroot {
702 Some(chroot_root.join(workdir.strip_prefix("/").unwrap_or(workdir)))
704 } else {
705 Some(workdir.clone())
706 }
707 } else if let Some(ref chroot_root) = policy.chroot {
708 Some(chroot_root.clone())
710 } else {
711 None
712 };
713
714 if let Some(ref workdir) = effective_workdir {
715 let c_path = match CString::new(workdir.as_os_str().as_encoded_bytes()) {
716 Ok(p) => p,
717 Err(_) => fail!("invalid workdir path"),
718 };
719 if unsafe { libc::chdir(c_path.as_ptr()) } != 0 {
720 fail!("chdir");
721 }
722 }
723
724 if unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) } != 0 {
726 fail!("prctl(PR_SET_NO_NEW_PRIVS)");
727 }
728
729 if let Err(e) = crate::landlock::confine(policy) {
731 fail!(format!("landlock: {}", e));
732 }
733
734 let deny = deny_syscall_numbers(policy);
736 let args = arg_filters(policy);
737 let mut keep_fd: i32 = -1;
738
739 if nested {
740 let filter = bpf::assemble_filter(&[], &deny, &args);
743 if let Err(e) = bpf::install_deny_filter(&filter) {
744 fail!(format!("seccomp deny filter: {}", e));
745 }
746 if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), 0) {
748 fail!(format!("write nested signal: {}", e));
749 }
750 } else {
751 let notif = notif_syscalls(policy);
753 let filter = bpf::assemble_filter(¬if, &deny, &args);
754 let notif_fd = match bpf::install_filter(&filter) {
755 Ok(fd) => fd,
756 Err(e) => fail!(format!("seccomp install: {}", e)),
757 };
758 keep_fd = notif_fd.as_raw_fd();
759 if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), keep_fd as u32) {
760 fail!(format!("write notif fd: {}", e));
761 }
762 std::mem::forget(notif_fd);
763 }
764
765 crate::sandbox::CONFINED.store(true, std::sync::atomic::Ordering::Relaxed);
767
768 match read_u32_fd(pipes.ready_r.as_raw_fd()) {
770 Ok(_) => {}
771 Err(e) => fail!(format!("read ready signal: {}", e)),
772 }
773
774 if policy.close_fds {
776 if keep_fd >= 0 {
777 close_fds_above(2, &[keep_fd]);
778 } else {
779 close_fds_above(2, &[]);
780 }
781 }
782
783 if policy.clean_env {
785 for (key, _) in std::env::vars_os() {
787 std::env::remove_var(&key);
788 }
789 }
790 for (key, value) in &policy.env {
791 std::env::set_var(key, value);
792 }
793
794 if let Some(ref devices) = policy.gpu_devices {
796 if !devices.is_empty() {
797 let vis = devices.iter().map(|d| d.to_string()).collect::<Vec<_>>().join(",");
798 std::env::set_var("CUDA_VISIBLE_DEVICES", &vis);
799 std::env::set_var("ROCR_VISIBLE_DEVICES", &vis);
800 }
801 }
803
804 debug_assert!(!cmd.is_empty(), "cmd must not be empty");
806 let argv_ptrs: Vec<*const libc::c_char> = cmd
807 .iter()
808 .map(|s| s.as_ptr())
809 .chain(std::iter::once(std::ptr::null()))
810 .collect();
811
812 unsafe { libc::execvp(argv_ptrs[0], argv_ptrs.as_ptr()) };
813
814 fail!(format!("execvp '{}'", cmd[0].to_string_lossy()));
816}
817
#[cfg(test)]
mod tests {
    //! Unit tests for the pipe helpers, the syscall tables and the BPF
    //! argument filters defined in this file.
    use super::*;

    /// All four pipe descriptors are valid and pairwise distinct.
    #[test]
    fn test_pipe_pair_creation() {
        let pipes = PipePair::new().expect("pipe creation failed");
        assert!(pipes.notif_r.as_raw_fd() >= 0);
        assert!(pipes.notif_w.as_raw_fd() >= 0);
        assert!(pipes.ready_r.as_raw_fd() >= 0);
        assert!(pipes.ready_w.as_raw_fd() >= 0);
        let fds = [
            pipes.notif_r.as_raw_fd(),
            pipes.notif_w.as_raw_fd(),
            pipes.ready_r.as_raw_fd(),
            pipes.ready_w.as_raw_fd(),
        ];
        // Compare every unordered pair.
        for i in 0..4 {
            for j in (i + 1)..4 {
                assert_ne!(fds[i], fds[j]);
            }
        }
    }

    /// A small value survives a write/read round trip through a pipe.
    #[test]
    fn test_write_read_u32() {
        let pipes = PipePair::new().expect("pipe creation failed");
        let val = 42u32;
        write_u32_fd(pipes.notif_w.as_raw_fd(), val).expect("write failed");
        let got = read_u32_fd(pipes.notif_r.as_raw_fd()).expect("read failed");
        assert_eq!(got, val);
    }

    /// A value using all four bytes survives the round trip as well.
    #[test]
    fn test_write_read_u32_large() {
        let pipes = PipePair::new().expect("pipe creation failed");
        let val = 0xDEAD_BEEFu32;
        write_u32_fd(pipes.notif_w.as_raw_fd(), val).expect("write failed");
        let got = read_u32_fd(pipes.notif_r.as_raw_fd()).expect("read failed");
        assert_eq!(got, val);
    }

    /// The process-creation syscalls are present even for a default policy.
    #[test]
    fn test_notif_syscalls_always_has_clone() {
        let policy = Policy::builder().build().unwrap();
        let nrs = notif_syscalls(&policy);
        assert!(nrs.contains(&(libc::SYS_clone as u32)));
        assert!(nrs.contains(&(libc::SYS_clone3 as u32)));
        assert!(nrs.contains(&(libc::SYS_vfork as u32)));
    }

    /// Setting a memory limit adds the memory-management syscalls.
    #[test]
    fn test_notif_syscalls_memory() {
        let policy = Policy::builder()
            .max_memory(crate::policy::ByteSize::mib(256))
            .build()
            .unwrap();
        let nrs = notif_syscalls(&policy);
        assert!(nrs.contains(&(libc::SYS_mmap as u32)));
        assert!(nrs.contains(&(libc::SYS_munmap as u32)));
        assert!(nrs.contains(&(libc::SYS_brk as u32)));
        assert!(nrs.contains(&(libc::SYS_mremap as u32)));
        assert!(nrs.contains(&(libc::SYS_shmget as u32)));
    }

    /// Allow-listing a host adds the outbound network syscalls.
    #[test]
    fn test_notif_syscalls_net() {
        let policy = Policy::builder()
            .net_allow_host("example.com")
            .build()
            .unwrap();
        let nrs = notif_syscalls(&policy);
        assert!(nrs.contains(&(libc::SYS_connect as u32)));
        assert!(nrs.contains(&(libc::SYS_sendto as u32)));
        assert!(nrs.contains(&(libc::SYS_sendmsg as u32)));
    }

    /// With neither list configured, the default deny list is used.
    #[test]
    fn test_deny_syscall_numbers_default() {
        let policy = Policy::builder().build().unwrap();
        let nrs = deny_syscall_numbers(&policy);
        assert!(nrs.contains(&(libc::SYS_mount as u32)));
        assert!(nrs.contains(&(libc::SYS_ptrace as u32)));
        assert!(nrs.contains(&(libc::SYS_bpf as u32)));
        assert!(!nrs.is_empty());
    }

    /// An explicit deny list replaces the default one.
    #[test]
    fn test_deny_syscall_numbers_custom() {
        let policy = Policy::builder()
            .deny_syscalls(vec!["mount".into(), "ptrace".into()])
            .build()
            .unwrap();
        let nrs = deny_syscall_numbers(&policy);
        assert_eq!(nrs.len(), 2);
        assert!(nrs.contains(&(libc::SYS_mount as u32)));
        assert!(nrs.contains(&(libc::SYS_ptrace as u32)));
    }

    /// An allow list without a deny list yields an empty deny set.
    #[test]
    fn test_deny_syscall_numbers_empty_when_allow_set() {
        let policy = Policy::builder()
            .allow_syscalls(vec!["read".into(), "write".into()])
            .build()
            .unwrap();
        let nrs = deny_syscall_numbers(&policy);
        assert!(nrs.is_empty());
    }

    /// The unconditional sections (clone, ioctl, prctl, netlink socket) each
    /// contribute their characteristic compare instruction.
    #[test]
    fn test_arg_filters_has_clone_ioctl_prctl_socket() {
        use crate::sys::structs::{
            BPF_JEQ, BPF_JSET, BPF_JMP, BPF_K,
        };
        let policy = Policy::builder().build().unwrap();
        let filters = arg_filters(&policy);
        // clone section
        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
            && f.k == libc::SYS_clone as u32));
        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JSET | BPF_K)
            && f.k == CLONE_NS_FLAGS as u32));
        // ioctl section
        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
            && f.k == libc::SYS_ioctl as u32));
        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
            && f.k == TIOCSTI as u32));
        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
            && f.k == TIOCLINUX as u32));
        // prctl section
        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
            && f.k == libc::SYS_prctl as u32));
        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
            && f.k == PR_SET_DUMPABLE));
        // netlink socket section
        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
            && f.k == AF_NETLINK));
        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
            && f.k == NETLINK_SOCK_DIAG));
    }

    /// `no_raw_sockets` adds the domain checks, the type mask and the
    /// `SOCK_RAW` compare.
    #[test]
    fn test_arg_filters_raw_sockets() {
        use crate::sys::structs::{BPF_ALU, BPF_AND, BPF_JEQ, BPF_JMP, BPF_K};
        let policy = Policy::builder().no_raw_sockets(true).build().unwrap();
        let filters = arg_filters(&policy);
        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
            && f.k == AF_INET));
        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
            && f.k == AF_INET6));
        assert!(filters.iter().any(|f| f.code == (BPF_ALU | BPF_AND | BPF_K)
            && f.k == SOCK_TYPE_MASK));
        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
            && f.k == SOCK_RAW));
    }

    /// `no_udp` adds the `SOCK_DGRAM` compare.
    #[test]
    fn test_arg_filters_no_udp() {
        use crate::sys::structs::{BPF_JEQ, BPF_JMP, BPF_K};
        let policy = Policy::builder().no_udp(true).build().unwrap();
        let filters = arg_filters(&policy);
        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
            && f.k == SOCK_DGRAM));
    }

    /// Every default-deny name resolves to a number, with `nfsservctl` as the
    /// single expected exception.
    #[test]
    fn test_syscall_name_to_nr_covers_defaults() {
        let mut skipped = 0;
        for name in DEFAULT_DENY_SYSCALLS {
            match syscall_name_to_nr(name) {
                Some(_) => {}
                None => {
                    assert_eq!(*name, "nfsservctl", "unexpected unresolved syscall: {}", name);
                    skipped += 1;
                }
            }
        }
        // Exactly one name (nfsservctl) is expected to be unresolved.
        assert_eq!(skipped, 1);
    }
}