1use std::ffi::CString;
5use std::io;
6use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd};
7
8use crate::policy::{FsIsolation, Policy};
9use crate::seccomp::bpf::{self, stmt, jump};
10use crate::sys::structs::{
11 AF_INET, AF_INET6, AF_NETLINK,
12 BPF_ABS, BPF_ALU, BPF_AND, BPF_JEQ, BPF_JSET, BPF_JMP, BPF_K, BPF_LD, BPF_RET, BPF_W,
13 CLONE_NS_FLAGS, DEFAULT_DENY_SYSCALLS, EPERM, NETLINK_SOCK_DIAG, SECCOMP_RET_ERRNO,
14 SOCK_DGRAM, SOCK_RAW, SOCK_TYPE_MASK, TIOCLINUX, TIOCSTI,
15 PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER,
16 OFFSET_ARGS0_LO, OFFSET_ARGS1_LO, OFFSET_ARGS2_LO, OFFSET_NR,
17 SockFilter,
18};
19
20pub struct PipePair {
26 pub notif_r: OwnedFd,
28 pub notif_w: OwnedFd,
30 pub ready_r: OwnedFd,
32 pub ready_w: OwnedFd,
34}
35
36impl PipePair {
37 pub fn new() -> io::Result<Self> {
39 let mut notif_fds = [0i32; 2];
40 let mut ready_fds = [0i32; 2];
41
42 let ret = unsafe { libc::pipe2(notif_fds.as_mut_ptr(), libc::O_CLOEXEC) };
44 if ret < 0 {
45 return Err(io::Error::last_os_error());
46 }
47
48 let ret = unsafe { libc::pipe2(ready_fds.as_mut_ptr(), libc::O_CLOEXEC) };
49 if ret < 0 {
50 unsafe {
52 libc::close(notif_fds[0]);
53 libc::close(notif_fds[1]);
54 }
55 return Err(io::Error::last_os_error());
56 }
57
58 Ok(PipePair {
60 notif_r: unsafe { OwnedFd::from_raw_fd(notif_fds[0]) },
61 notif_w: unsafe { OwnedFd::from_raw_fd(notif_fds[1]) },
62 ready_r: unsafe { OwnedFd::from_raw_fd(ready_fds[0]) },
63 ready_w: unsafe { OwnedFd::from_raw_fd(ready_fds[1]) },
64 })
65 }
66}
67
/// Writes `val` to `fd` as four little-endian bytes.
///
/// The descriptor is only borrowed: it is still open when this returns.
///
/// # Errors
///
/// Returns the OS error from `write(2)`, `WriteZero` if the descriptor stops
/// accepting data before all four bytes are written, or `InvalidInput` for a
/// negative descriptor. Interrupted writes (`EINTR`) are retried.
pub(crate) fn write_u32_fd(fd: RawFd, val: u32) -> io::Result<()> {
    use std::io::Write;

    if fd < 0 {
        return Err(io::Error::new(
            io::ErrorKind::InvalidInput,
            "invalid file descriptor",
        ));
    }

    // SAFETY: `fd` is non-negative and the caller keeps it open for the
    // duration of the call. `ManuallyDrop` ensures this temporary `File` view
    // never closes a descriptor it does not own.
    let mut file = std::mem::ManuallyDrop::new(unsafe { std::fs::File::from_raw_fd(fd) });

    // `write_all` loops over short writes, retries on EINTR and turns a
    // zero-length write into an error instead of spinning forever.
    file.write_all(&val.to_le_bytes())
}
91
/// Reads four bytes from `fd` and decodes them as a little-endian `u32`.
///
/// The descriptor is only borrowed: it is still open when this returns.
///
/// # Errors
///
/// Returns the OS error from `read(2)`, `UnexpectedEof` if the stream ends
/// before four bytes arrive, or `InvalidInput` for a negative descriptor.
/// Interrupted reads (`EINTR`) are retried.
pub(crate) fn read_u32_fd(fd: RawFd) -> io::Result<u32> {
    use std::io::Read;

    if fd < 0 {
        return Err(io::Error::new(
            io::ErrorKind::InvalidInput,
            "invalid file descriptor",
        ));
    }

    // SAFETY: `fd` is non-negative and the caller keeps it open for the
    // duration of the call. `ManuallyDrop` ensures this temporary `File` view
    // never closes a descriptor it does not own.
    let mut file = std::mem::ManuallyDrop::new(unsafe { std::fs::File::from_raw_fd(fd) });

    let mut buf = [0u8; 4];
    let mut filled = 0usize;
    while filled < buf.len() {
        match file.read(&mut buf[filled..]) {
            Ok(0) => {
                return Err(io::Error::new(
                    io::ErrorKind::UnexpectedEof,
                    "pipe closed before 4 bytes read",
                ));
            }
            Ok(n) => filled += n,
            // A signal arrived before any data was transferred: just retry.
            Err(e) if e.kind() == io::ErrorKind::Interrupted => {}
            Err(e) => return Err(e),
        }
    }
    Ok(u32::from_le_bytes(buf))
}
117
118pub fn syscall_name_to_nr(name: &str) -> Option<u32> {
127 let nr: i64 = match name {
128 "mount" => libc::SYS_mount,
129 "umount2" => libc::SYS_umount2,
130 "pivot_root" => libc::SYS_pivot_root,
131 "swapon" => libc::SYS_swapon,
132 "swapoff" => libc::SYS_swapoff,
133 "reboot" => libc::SYS_reboot,
134 "sethostname" => libc::SYS_sethostname,
135 "setdomainname" => libc::SYS_setdomainname,
136 "kexec_load" => libc::SYS_kexec_load,
137 "init_module" => libc::SYS_init_module,
138 "finit_module" => libc::SYS_finit_module,
139 "delete_module" => libc::SYS_delete_module,
140 "unshare" => libc::SYS_unshare,
141 "setns" => libc::SYS_setns,
142 "perf_event_open" => libc::SYS_perf_event_open,
143 "bpf" => libc::SYS_bpf,
144 "userfaultfd" => libc::SYS_userfaultfd,
145 "keyctl" => libc::SYS_keyctl,
146 "add_key" => libc::SYS_add_key,
147 "request_key" => libc::SYS_request_key,
148 "ptrace" => libc::SYS_ptrace,
149 "process_vm_readv" => libc::SYS_process_vm_readv,
150 "process_vm_writev" => libc::SYS_process_vm_writev,
151 "open_by_handle_at" => libc::SYS_open_by_handle_at,
152 "name_to_handle_at" => libc::SYS_name_to_handle_at,
153 "ioperm" => libc::SYS_ioperm,
154 "iopl" => libc::SYS_iopl,
155 "quotactl" => libc::SYS_quotactl,
156 "acct" => libc::SYS_acct,
157 "lookup_dcookie" => libc::SYS_lookup_dcookie,
158 "io_uring_setup" => libc::SYS_io_uring_setup,
160 "io_uring_enter" => libc::SYS_io_uring_enter,
161 "io_uring_register" => libc::SYS_io_uring_register,
162 "clone" => libc::SYS_clone,
164 "clone3" => libc::SYS_clone3,
165 "vfork" => libc::SYS_vfork,
166 "mmap" => libc::SYS_mmap,
167 "munmap" => libc::SYS_munmap,
168 "brk" => libc::SYS_brk,
169 "mremap" => libc::SYS_mremap,
170 "connect" => libc::SYS_connect,
171 "sendto" => libc::SYS_sendto,
172 "sendmsg" => libc::SYS_sendmsg,
173 "ioctl" => libc::SYS_ioctl,
174 "socket" => libc::SYS_socket,
175 "prctl" => libc::SYS_prctl,
176 "getrandom" => libc::SYS_getrandom,
177 "openat" => libc::SYS_openat,
178 "open" => libc::SYS_open,
179 "getdents64" => libc::SYS_getdents64,
180 "getdents" => libc::SYS_getdents,
181 "bind" => libc::SYS_bind,
182 "getsockname" => libc::SYS_getsockname,
183 "clock_gettime" => libc::SYS_clock_gettime,
184 "gettimeofday" => libc::SYS_gettimeofday,
185 "time" => libc::SYS_time,
186 "clock_nanosleep" => libc::SYS_clock_nanosleep,
187 "timerfd_settime" => libc::SYS_timerfd_settime,
188 "timer_settime" => libc::SYS_timer_settime,
189 "execve" => libc::SYS_execve,
190 "execveat" => libc::SYS_execveat,
191 "unlinkat" => libc::SYS_unlinkat,
193 "mkdirat" => libc::SYS_mkdirat,
194 "renameat2" => libc::SYS_renameat2,
195 "newfstatat" => libc::SYS_newfstatat,
196 "statx" => libc::SYS_statx,
197 "faccessat" => libc::SYS_faccessat,
198 "symlinkat" => libc::SYS_symlinkat,
199 "linkat" => libc::SYS_linkat,
200 "fchmodat" => libc::SYS_fchmodat,
201 "fchownat" => libc::SYS_fchownat,
202 "readlinkat" => libc::SYS_readlinkat,
203 "truncate" => libc::SYS_truncate,
204 "utimensat" => libc::SYS_utimensat,
205 "unlink" => libc::SYS_unlink,
206 "rmdir" => libc::SYS_rmdir,
207 "mkdir" => libc::SYS_mkdir,
208 "rename" => libc::SYS_rename,
209 "stat" => libc::SYS_stat,
210 "lstat" => libc::SYS_lstat,
211 "access" => libc::SYS_access,
212 "symlink" => libc::SYS_symlink,
213 "link" => libc::SYS_link,
214 "chmod" => libc::SYS_chmod,
215 "chown" => libc::SYS_chown,
216 "lchown" => libc::SYS_lchown,
217 "readlink" => libc::SYS_readlink,
218 "futimesat" => libc::SYS_futimesat,
219 "fork" => libc::SYS_fork,
220 _ => return None,
221 };
222 Some(nr as u32)
223}
224
225pub fn notif_syscalls(policy: &Policy) -> Vec<u32> {
231 let mut nrs = vec![
232 libc::SYS_clone as u32,
233 libc::SYS_clone3 as u32,
234 libc::SYS_vfork as u32,
235 ];
236
237 if policy.max_memory.is_some() {
238 nrs.push(libc::SYS_mmap as u32);
239 nrs.push(libc::SYS_munmap as u32);
240 nrs.push(libc::SYS_brk as u32);
241 nrs.push(libc::SYS_mremap as u32);
242 nrs.push(libc::SYS_shmget as u32);
243 }
244
245 if !policy.net_allow_hosts.is_empty()
246 || policy.policy_fn.is_some()
247 || !policy.http_allow.is_empty()
248 || !policy.http_deny.is_empty()
249 {
250 nrs.push(libc::SYS_connect as u32);
251 nrs.push(libc::SYS_sendto as u32);
252 nrs.push(libc::SYS_sendmsg as u32);
253 nrs.push(libc::SYS_bind as u32);
254 }
255
256 if policy.random_seed.is_some() {
257 nrs.push(libc::SYS_getrandom as u32);
258 nrs.push(libc::SYS_openat as u32);
260 }
261
262 if policy.time_start.is_some() {
263 nrs.extend_from_slice(&[
264 libc::SYS_clock_nanosleep as u32,
265 libc::SYS_timerfd_settime as u32,
266 libc::SYS_timer_settime as u32,
267 ]);
268 nrs.push(libc::SYS_openat as u32);
271 }
272
273 nrs.push(libc::SYS_openat as u32);
275 nrs.extend_from_slice(&[
276 libc::SYS_getdents64 as u32,
277 libc::SYS_getdents as u32,
278 ]);
279 if policy.num_cpus.is_some() {
281 nrs.push(libc::SYS_sched_getaffinity as u32);
282 }
283 if policy.hostname.is_some() {
284 nrs.push(libc::SYS_uname as u32);
285 nrs.push(libc::SYS_openat as u32);
286 }
287
288 if policy.workdir.is_some() && policy.fs_isolation == FsIsolation::None {
290 nrs.extend_from_slice(&[
291 libc::SYS_openat as u32,
292 libc::SYS_unlinkat as u32,
293 libc::SYS_mkdirat as u32,
294 libc::SYS_renameat2 as u32,
295 libc::SYS_symlinkat as u32,
296 libc::SYS_linkat as u32,
297 libc::SYS_fchmodat as u32,
298 libc::SYS_fchownat as u32,
299 libc::SYS_truncate as u32,
300 libc::SYS_newfstatat as u32,
301 libc::SYS_statx as u32,
302 libc::SYS_faccessat as u32,
303 439u32, libc::SYS_readlinkat as u32,
305 libc::SYS_getdents64 as u32,
306 libc::SYS_getdents as u32,
307 ]);
308 }
309
310 if policy.chroot.is_some() {
312 nrs.extend_from_slice(&[
313 libc::SYS_openat as u32,
314 libc::SYS_open as u32, libc::SYS_execve as u32,
316 libc::SYS_execveat as u32,
317 libc::SYS_unlinkat as u32,
318 libc::SYS_mkdirat as u32,
319 libc::SYS_renameat2 as u32,
320 libc::SYS_symlinkat as u32,
321 libc::SYS_linkat as u32,
322 libc::SYS_fchmodat as u32,
323 libc::SYS_fchownat as u32,
324 libc::SYS_truncate as u32,
325 libc::SYS_newfstatat as u32,
326 libc::SYS_stat as u32, libc::SYS_lstat as u32, libc::SYS_statx as u32,
329 libc::SYS_faccessat as u32,
330 439u32, libc::SYS_access as u32, libc::SYS_readlinkat as u32,
333 libc::SYS_readlink as u32, libc::SYS_getdents64 as u32,
335 libc::SYS_getdents as u32,
336 libc::SYS_chdir as u32,
337 libc::SYS_getcwd as u32,
338 libc::SYS_statfs as u32,
339 libc::SYS_utimensat as u32,
340 libc::SYS_unlink as u32, libc::SYS_rmdir as u32, libc::SYS_mkdir as u32, libc::SYS_rename as u32, libc::SYS_symlink as u32, libc::SYS_link as u32, libc::SYS_chmod as u32, libc::SYS_chown as u32, libc::SYS_lchown as u32,
349 ]);
350 }
351
352 if !policy.fs_denied.is_empty() {
354 nrs.extend_from_slice(&[
355 libc::SYS_openat as u32,
356 libc::SYS_open as u32,
357 libc::SYS_execve as u32,
358 libc::SYS_execveat as u32,
359 ]);
360 }
361
362 if policy.policy_fn.is_some() {
364 nrs.extend_from_slice(&[
365 libc::SYS_openat as u32,
366 libc::SYS_connect as u32,
367 libc::SYS_sendto as u32,
368 libc::SYS_bind as u32,
369 libc::SYS_execve as u32,
370 libc::SYS_execveat as u32,
371 ]);
372 }
373
374 if policy.port_remap {
376 nrs.extend_from_slice(&[
377 libc::SYS_bind as u32,
378 libc::SYS_getsockname as u32,
379 ]);
380 }
381
382 nrs.sort_unstable();
383 nrs.dedup();
384 nrs
385}
386
387pub fn no_supervisor_deny_syscall_numbers() -> Vec<u32> {
389 use crate::sys::structs::NO_SUPERVISOR_DENY_SYSCALLS;
390 NO_SUPERVISOR_DENY_SYSCALLS
391 .iter()
392 .filter_map(|n| syscall_name_to_nr(n))
393 .collect()
394}
395
396pub fn deny_syscall_numbers(policy: &Policy) -> Vec<u32> {
401 if let Some(ref names) = policy.deny_syscalls {
402 names
403 .iter()
404 .filter_map(|n| syscall_name_to_nr(n))
405 .collect()
406 } else if policy.allow_syscalls.is_none() {
407 DEFAULT_DENY_SYSCALLS
408 .iter()
409 .filter_map(|n| syscall_name_to_nr(n))
410 .collect()
411 } else {
412 Vec::new()
414 }
415}
416
417pub fn arg_filters(policy: &Policy) -> Vec<SockFilter> {
427 let ret_errno = SECCOMP_RET_ERRNO | EPERM as u32;
428 let nr_clone = libc::SYS_clone as u32;
429 let nr_ioctl = libc::SYS_ioctl as u32;
430 let nr_prctl = libc::SYS_prctl as u32;
431 let nr_socket = libc::SYS_socket as u32;
432
433 let mut insns: Vec<SockFilter> = Vec::new();
434
435 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
443 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_clone, 0, 3));
444 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
445 insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, CLONE_NS_FLAGS as u32, 0, 1));
446 insns.push(stmt(BPF_RET | BPF_K, ret_errno));
447
448 let dangerous_ioctls: &[u32] = &[TIOCSTI as u32, TIOCLINUX as u32];
451 let n_ioctls = dangerous_ioctls.len();
452 let skip_count = (1 + n_ioctls * 2) as u8;
453 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
454 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_ioctl, 0, skip_count));
455 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
456 for &cmd in dangerous_ioctls {
457 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, cmd, 0, 1));
458 insns.push(stmt(BPF_RET | BPF_K, ret_errno));
459 }
460
461 let dangerous_prctl_ops: &[u32] = &[PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER];
464 let n_ops = dangerous_prctl_ops.len();
465 let skip_count = (1 + n_ops * 2) as u8;
466 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
467 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_prctl, 0, skip_count));
468 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
469 for &op in dangerous_prctl_ops {
470 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, op, 0, 1));
471 insns.push(stmt(BPF_RET | BPF_K, ret_errno));
472 }
473
474 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
484 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_socket, 0, 5));
485 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
486 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_NETLINK, 0, 3));
487 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS2_LO));
488 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, NETLINK_SOCK_DIAG, 0, 1));
489 insns.push(stmt(BPF_RET | BPF_K, ret_errno));
490
491 let mut blocked_types: Vec<u32> = Vec::new();
493 if policy.no_raw_sockets {
494 blocked_types.push(SOCK_RAW);
495 }
496 if policy.no_udp {
497 blocked_types.push(SOCK_DGRAM);
498 }
499
500 if !blocked_types.is_empty() {
501 let n = blocked_types.len();
502 let after_domain = 2 + n + 1;
504 let skip_all = (3 + after_domain) as u8;
506
507 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
508 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_socket, 0, skip_all));
509 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
511 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET, 1, 0));
513 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET6, 0, after_domain as u8));
515 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
517 insns.push(stmt(BPF_ALU | BPF_AND | BPF_K, SOCK_TYPE_MASK));
518 for (i, &sock_type) in blocked_types.iter().enumerate() {
520 let remaining = n - i - 1;
521 let jf: u8 = if remaining == 0 { 1 } else { 0 };
525 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, sock_type, remaining as u8, jf));
526 }
527 insns.push(stmt(BPF_RET | BPF_K, ret_errno));
529 }
530
531 insns
532}
533
534fn close_fds_above(min_fd: RawFd, keep: &[RawFd]) {
540 let fds_to_close: Vec<RawFd> = {
544 let dir = match std::fs::read_dir("/proc/self/fd") {
545 Ok(d) => d,
546 Err(_) => return,
547 };
548 dir.flatten()
549 .filter_map(|entry| {
550 entry.file_name().into_string().ok()
551 .and_then(|name| name.parse::<RawFd>().ok())
552 })
553 .filter(|&fd| fd > min_fd && !keep.contains(&fd))
554 .collect()
555 };
556 for fd in fds_to_close {
558 unsafe { libc::close(fd) };
559 }
560}
561
562pub(crate) use crate::cow::ChildMountConfig;
568
/// Writes single-entry uid/gid mappings for the current process.
///
/// `target_uid` is mapped to `real_uid` and `target_gid` to `real_gid`, each
/// with a range of one id. `setgroups` is set to `deny` between the two map
/// writes. All three writes are best-effort: their errors are ignored.
fn write_id_maps(real_uid: u32, real_gid: u32, target_uid: u32, target_gid: u32) {
    let uid_line = format!("{} {} 1\n", target_uid, real_uid);
    let gid_line = format!("{} {} 1\n", target_gid, real_gid);

    // Order matters: uid_map, then setgroups, then gid_map.
    let steps: [(&str, &str); 3] = [
        ("/proc/self/uid_map", uid_line.as_str()),
        ("/proc/self/setgroups", "deny\n"),
        ("/proc/self/gid_map", gid_line.as_str()),
    ];
    for &(path, contents) in &steps {
        let _ = std::fs::write(path, contents);
    }
}
578
579fn write_id_maps_overflow() {
582 let uid = unsafe { libc::getuid() };
583 let gid = unsafe { libc::getgid() };
584 write_id_maps(uid, gid, 0, 0);
585}
586
587pub(crate) fn confine_child(policy: &Policy, cmd: &[CString], pipes: &PipePair, cow_config: Option<&ChildMountConfig>, nested: bool) -> ! {
596 macro_rules! fail {
598 ($msg:expr) => {{
599 let err = std::io::Error::last_os_error();
600 let _ = write!(std::io::stderr(), "sandlock child: {}: {}\n", $msg, err);
601 unsafe { libc::_exit(127) };
602 }};
603 }
604
605 use std::io::Write;
606
607 if unsafe { libc::setpgid(0, 0) } != 0 {
609 fail!("setpgid");
610 }
611
612 if unsafe { libc::isatty(0) } == 1 {
617 unsafe {
618 libc::signal(libc::SIGTTOU, libc::SIG_IGN);
619 libc::tcsetpgrp(0, libc::getpgrp());
620 libc::signal(libc::SIGTTOU, libc::SIG_DFL);
621 }
622 }
623
624 if unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) } != 0 {
626 fail!("prctl(PR_SET_PDEATHSIG)");
627 }
628
629 if unsafe { libc::getppid() } == 1 {
631 fail!("parent died before confinement");
632 }
633
634 if policy.no_randomize_memory {
636 const ADDR_NO_RANDOMIZE: libc::c_ulong = 0x0040000;
637 let current = unsafe { libc::personality(0xffffffff) };
639 if current == -1 {
640 fail!("personality(query)");
641 }
642 if unsafe { libc::personality(current as libc::c_ulong | ADDR_NO_RANDOMIZE) } == -1 {
643 fail!("personality(ADDR_NO_RANDOMIZE)");
644 }
645 }
646
647 if let Some(ref cores) = policy.cpu_cores {
649 if !cores.is_empty() {
650 let mut set = unsafe { std::mem::zeroed::<libc::cpu_set_t>() };
651 unsafe { libc::CPU_ZERO(&mut set) };
652 for &core in cores {
653 unsafe { libc::CPU_SET(core as usize, &mut set) };
654 }
655 if unsafe {
656 libc::sched_setaffinity(
657 0,
658 std::mem::size_of::<libc::cpu_set_t>(),
659 &set,
660 )
661 } != 0
662 {
663 fail!("sched_setaffinity");
664 }
665 }
666 }
667
668 if policy.no_huge_pages {
670 if unsafe { libc::prctl(libc::PR_SET_THP_DISABLE, 1, 0, 0, 0) } != 0 {
671 fail!("prctl(PR_SET_THP_DISABLE)");
672 }
673 }
674
675 if policy.no_coredump {
677 let rlim = libc::rlimit { rlim_cur: 0, rlim_max: 0 };
683 if unsafe { libc::setrlimit(libc::RLIMIT_CORE, &rlim) } != 0 {
684 fail!("setrlimit(RLIMIT_CORE, 0)");
685 }
686 }
687
688 let real_uid = unsafe { libc::getuid() };
690 let real_gid = unsafe { libc::getgid() };
691
692 if let Some(target_uid) = policy.uid {
695 if cow_config.is_none() {
696 if unsafe { libc::unshare(libc::CLONE_NEWUSER) } != 0 {
697 fail!("unshare(CLONE_NEWUSER)");
698 }
699 write_id_maps(real_uid, real_gid, target_uid, target_uid);
700 }
701 }
702
703 if let Some(ref cow) = cow_config {
705 if unsafe { libc::unshare(libc::CLONE_NEWUSER | libc::CLONE_NEWNS) } != 0 {
707 fail!("unshare(CLONE_NEWUSER | CLONE_NEWNS)");
708 }
709
710 write_id_maps_overflow();
712
713 let lowerdir = cow.lowers.iter()
719 .map(|p| p.display().to_string())
720 .collect::<Vec<_>>()
721 .join(":");
722 let opts = format!(
723 "lowerdir={},upperdir={},workdir={}",
724 lowerdir,
725 cow.upper.display(),
726 cow.work.display(),
727 );
728
729 let mount_cstr = match CString::new(cow.mount_point.to_str().unwrap_or("")) {
730 Ok(c) => c,
731 Err(_) => fail!("invalid overlay mount point path"),
732 };
733 let overlay_cstr = CString::new("overlay").unwrap();
734 let opts_cstr = match CString::new(opts) {
735 Ok(c) => c,
736 Err(_) => fail!("invalid overlay opts"),
737 };
738
739 let ret = unsafe {
740 libc::mount(
741 overlay_cstr.as_ptr(),
742 mount_cstr.as_ptr(),
743 overlay_cstr.as_ptr(),
744 0,
745 opts_cstr.as_ptr() as *const libc::c_void,
746 )
747 };
748 if ret != 0 {
749 fail!("mount overlay");
750 }
751 }
752
753 let effective_cwd = if let Some(ref cwd) = policy.cwd {
756 if let Some(ref chroot_root) = policy.chroot {
757 Some(chroot_root.join(cwd.strip_prefix("/").unwrap_or(cwd)))
758 } else {
759 Some(cwd.clone())
760 }
761 } else if let Some(ref chroot_root) = policy.chroot {
762 Some(chroot_root.to_path_buf())
764 } else {
765 None
766 };
767
768 if let Some(ref cwd) = effective_cwd {
769 let c_path = match CString::new(cwd.as_os_str().as_encoded_bytes()) {
770 Ok(c) => c,
771 Err(_) => fail!("invalid cwd path"),
772 };
773 if unsafe { libc::chdir(c_path.as_ptr()) } != 0 {
774 fail!("chdir");
775 }
776 }
777
778 if unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) } != 0 {
780 fail!("prctl(PR_SET_NO_NEW_PRIVS)");
781 }
782
783 if let Err(e) = crate::landlock::confine(policy) {
785 fail!(format!("landlock: {}", e));
786 }
787
788 let deny = deny_syscall_numbers(policy);
790 let args = arg_filters(policy);
791 let mut keep_fd: i32 = -1;
792
793 if nested {
794 let filter = bpf::assemble_filter(&[], &deny, &args);
797 if let Err(e) = bpf::install_deny_filter(&filter) {
798 fail!(format!("seccomp deny filter: {}", e));
799 }
800 if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), 0) {
802 fail!(format!("write nested signal: {}", e));
803 }
804 } else {
805 let notif = notif_syscalls(policy);
807 let filter = bpf::assemble_filter(¬if, &deny, &args);
808 let notif_fd = match bpf::install_filter(&filter) {
809 Ok(fd) => fd,
810 Err(e) => fail!(format!("seccomp install: {}", e)),
811 };
812 keep_fd = notif_fd.as_raw_fd();
813 if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), keep_fd as u32) {
814 fail!(format!("write notif fd: {}", e));
815 }
816 std::mem::forget(notif_fd);
817 }
818
819 crate::sandbox::CONFINED.store(true, std::sync::atomic::Ordering::Relaxed);
821
822 match read_u32_fd(pipes.ready_r.as_raw_fd()) {
824 Ok(_) => {}
825 Err(e) => fail!(format!("read ready signal: {}", e)),
826 }
827
828 if keep_fd >= 0 {
830 close_fds_above(2, &[keep_fd]);
831 } else {
832 close_fds_above(2, &[]);
833 }
834
835 if policy.clean_env {
837 for (key, _) in std::env::vars_os() {
839 std::env::remove_var(&key);
840 }
841 }
842 for (key, value) in &policy.env {
843 std::env::set_var(key, value);
844 }
845
846 if let Some(ref devices) = policy.gpu_devices {
848 if !devices.is_empty() {
849 let vis = devices.iter().map(|d| d.to_string()).collect::<Vec<_>>().join(",");
850 std::env::set_var("CUDA_VISIBLE_DEVICES", &vis);
851 std::env::set_var("ROCR_VISIBLE_DEVICES", &vis);
852 }
853 }
855
856 debug_assert!(!cmd.is_empty(), "cmd must not be empty");
858 let argv_ptrs: Vec<*const libc::c_char> = cmd
859 .iter()
860 .map(|s| s.as_ptr())
861 .chain(std::iter::once(std::ptr::null()))
862 .collect();
863
864 if policy.chroot.is_some() {
865 let mut exec_path = vec![0u8; libc::PATH_MAX as usize];
871 let orig = cmd[0].as_bytes_with_nul();
872 exec_path[..orig.len()].copy_from_slice(orig);
873
874 unsafe {
875 libc::execvp(
876 exec_path.as_ptr() as *const libc::c_char,
877 argv_ptrs.as_ptr(),
878 )
879 };
880 } else {
881 unsafe { libc::execvp(argv_ptrs[0], argv_ptrs.as_ptr()) };
882 }
883
884 fail!(format!("execvp '{}'", cmd[0].to_string_lossy()));
886}
887
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that the syscall list `$nrs` contains every listed number.
    macro_rules! assert_intercepts {
        ($nrs:expr, $($sys:expr),+ $(,)?) => {
            $( assert!($nrs.contains(&($sys as u32))); )+
        };
    }

    /// Asserts that the program `$prog` has an instruction with the given
    /// opcode and `k` operand.
    macro_rules! assert_has_insn {
        ($prog:expr, $code:expr, $k:expr) => {
            assert!($prog
                .iter()
                .any(|insn| insn.code == ($code) && insn.k == $k))
        };
    }

    /// Sends `val` through a fresh notification pipe and reads it back.
    fn roundtrip(val: u32) -> u32 {
        let pipes = PipePair::new().expect("pipe creation failed");
        write_u32_fd(pipes.notif_w.as_raw_fd(), val).expect("write failed");
        read_u32_fd(pipes.notif_r.as_raw_fd()).expect("read failed")
    }

    #[test]
    fn test_pipe_pair_creation() {
        use std::collections::HashSet;

        let pipes = PipePair::new().expect("pipe creation failed");
        let fds = [
            pipes.notif_r.as_raw_fd(),
            pipes.notif_w.as_raw_fd(),
            pipes.ready_r.as_raw_fd(),
            pipes.ready_w.as_raw_fd(),
        ];
        // Every descriptor is valid...
        for fd in fds {
            assert!(fd >= 0);
        }
        // ...and all four are pairwise distinct.
        let unique: HashSet<_> = fds.iter().collect();
        assert_eq!(unique.len(), fds.len());
    }

    #[test]
    fn test_write_read_u32() {
        assert_eq!(roundtrip(42u32), 42u32);
    }

    #[test]
    fn test_write_read_u32_large() {
        assert_eq!(roundtrip(0xDEAD_BEEFu32), 0xDEAD_BEEFu32);
    }

    #[test]
    fn test_notif_syscalls_always_has_clone() {
        let policy = Policy::builder().build().unwrap();
        let nrs = notif_syscalls(&policy);
        assert_intercepts!(nrs, libc::SYS_clone, libc::SYS_clone3, libc::SYS_vfork);
    }

    #[test]
    fn test_notif_syscalls_memory() {
        let policy = Policy::builder()
            .max_memory(crate::policy::ByteSize::mib(256))
            .build()
            .unwrap();
        let nrs = notif_syscalls(&policy);
        assert_intercepts!(
            nrs,
            libc::SYS_mmap,
            libc::SYS_munmap,
            libc::SYS_brk,
            libc::SYS_mremap,
            libc::SYS_shmget,
        );
    }

    #[test]
    fn test_notif_syscalls_net() {
        let policy = Policy::builder()
            .net_allow_host("example.com")
            .build()
            .unwrap();
        let nrs = notif_syscalls(&policy);
        assert_intercepts!(nrs, libc::SYS_connect, libc::SYS_sendto, libc::SYS_sendmsg);
    }

    #[test]
    fn test_notif_syscalls_faccessat2() {
        const SYS_FACCESSAT2: u32 = 439;

        let cases = [
            (
                Policy::builder().chroot("/tmp").build().unwrap(),
                "chroot notif filter must include SYS_faccessat2 (439)",
            ),
            (
                Policy::builder().workdir("/tmp").build().unwrap(),
                "COW notif filter must include SYS_faccessat2 (439)",
            ),
        ];
        for (policy, why) in &cases {
            let nrs = notif_syscalls(policy);
            assert!(nrs.contains(&(libc::SYS_faccessat as u32)));
            assert!(nrs.contains(&SYS_FACCESSAT2), "{}", why);
        }
    }

    #[test]
    fn test_deny_syscall_numbers_default() {
        let policy = Policy::builder().build().unwrap();
        let nrs = deny_syscall_numbers(&policy);
        assert_intercepts!(nrs, libc::SYS_mount, libc::SYS_ptrace, libc::SYS_bpf);
        assert!(!nrs.is_empty());
    }

    #[test]
    fn test_deny_syscall_numbers_custom() {
        let policy = Policy::builder()
            .deny_syscalls(vec!["mount".into(), "ptrace".into()])
            .build()
            .unwrap();
        let nrs = deny_syscall_numbers(&policy);
        assert_eq!(nrs.len(), 2);
        assert_intercepts!(nrs, libc::SYS_mount, libc::SYS_ptrace);
    }

    #[test]
    fn test_deny_syscall_numbers_empty_when_allow_set() {
        let policy = Policy::builder()
            .allow_syscalls(vec!["read".into(), "write".into()])
            .build()
            .unwrap();
        assert!(deny_syscall_numbers(&policy).is_empty());
    }

    #[test]
    fn test_arg_filters_has_clone_ioctl_prctl_socket() {
        use crate::sys::structs::{BPF_JEQ, BPF_JMP, BPF_JSET, BPF_K};

        let policy = Policy::builder().build().unwrap();
        let filters = arg_filters(&policy);

        // clone namespace-flag check
        assert_has_insn!(filters, BPF_JMP | BPF_JEQ | BPF_K, libc::SYS_clone as u32);
        assert_has_insn!(filters, BPF_JMP | BPF_JSET | BPF_K, CLONE_NS_FLAGS as u32);
        // ioctl command checks
        assert_has_insn!(filters, BPF_JMP | BPF_JEQ | BPF_K, libc::SYS_ioctl as u32);
        assert_has_insn!(filters, BPF_JMP | BPF_JEQ | BPF_K, TIOCSTI as u32);
        assert_has_insn!(filters, BPF_JMP | BPF_JEQ | BPF_K, TIOCLINUX as u32);
        // prctl option checks
        assert_has_insn!(filters, BPF_JMP | BPF_JEQ | BPF_K, libc::SYS_prctl as u32);
        assert_has_insn!(filters, BPF_JMP | BPF_JEQ | BPF_K, PR_SET_DUMPABLE);
        // netlink sock_diag check
        assert_has_insn!(filters, BPF_JMP | BPF_JEQ | BPF_K, AF_NETLINK);
        assert_has_insn!(filters, BPF_JMP | BPF_JEQ | BPF_K, NETLINK_SOCK_DIAG);
    }

    #[test]
    fn test_arg_filters_raw_sockets() {
        use crate::sys::structs::{BPF_ALU, BPF_AND, BPF_JEQ, BPF_JMP, BPF_K};

        let policy = Policy::builder().no_raw_sockets(true).build().unwrap();
        let filters = arg_filters(&policy);

        assert_has_insn!(filters, BPF_JMP | BPF_JEQ | BPF_K, AF_INET);
        assert_has_insn!(filters, BPF_JMP | BPF_JEQ | BPF_K, AF_INET6);
        assert_has_insn!(filters, BPF_ALU | BPF_AND | BPF_K, SOCK_TYPE_MASK);
        assert_has_insn!(filters, BPF_JMP | BPF_JEQ | BPF_K, SOCK_RAW);
    }

    #[test]
    fn test_arg_filters_no_udp() {
        use crate::sys::structs::{BPF_JEQ, BPF_JMP, BPF_K};

        let policy = Policy::builder().no_udp(true).build().unwrap();
        let filters = arg_filters(&policy);

        assert_has_insn!(filters, BPF_JMP | BPF_JEQ | BPF_K, SOCK_DGRAM);
    }

    #[test]
    fn test_syscall_name_to_nr_covers_defaults() {
        // Exactly one default-deny name is expected to be unresolvable.
        let unresolved: Vec<&str> = DEFAULT_DENY_SYSCALLS
            .iter()
            .copied()
            .filter(|name| syscall_name_to_nr(name).is_none())
            .collect();
        assert_eq!(unresolved, ["nfsservctl"]);
    }
}