1use std::ffi::CString;
5use std::io;
6use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd};
7
8use crate::policy::{FsIsolation, Policy};
9use crate::seccomp::bpf::{self, stmt, jump};
10use crate::sys::structs::{
11 AF_INET, AF_INET6, AF_NETLINK,
12 BPF_ABS, BPF_ALU, BPF_AND, BPF_JEQ, BPF_JSET, BPF_JMP, BPF_K, BPF_LD, BPF_RET, BPF_W,
13 CLONE_NS_FLAGS, DEFAULT_DENY_SYSCALLS, EPERM,
14 SECCOMP_RET_ALLOW, SECCOMP_RET_ERRNO,
15 SOCK_DGRAM, SOCK_RAW, SOCK_TYPE_MASK, TIOCLINUX, TIOCSTI,
16 PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER,
17 OFFSET_ARGS0_LO, OFFSET_ARGS1_LO, OFFSET_ARGS2_LO, OFFSET_ARGS3_LO, OFFSET_NR,
18 SockFilter,
19};
20
/// Two pipes, each stored as an owned read end and an owned write end.
///
/// Both pipes are created by [`PipePair::new`] with `O_CLOEXEC`. Inside this
/// file, `confine_child` writes one `u32` to `notif_w` (the seccomp
/// notification fd number, or `0` when running nested) and then blocks on
/// `ready_r` until one `u32` arrives before it execs the command.
/// NOTE(review): the users of `notif_r` and `ready_w` are not visible in this
/// file — presumably the parent/supervisor side; confirm against the caller.
pub struct PipePair {
    /// Read end of the notification pipe.
    pub notif_r: OwnedFd,
    /// Write end of the notification pipe.
    pub notif_w: OwnedFd,
    /// Read end of the ready pipe.
    pub ready_r: OwnedFd,
    /// Write end of the ready pipe.
    pub ready_w: OwnedFd,
}
36
37impl PipePair {
38 pub fn new() -> io::Result<Self> {
40 let mut notif_fds = [0i32; 2];
41 let mut ready_fds = [0i32; 2];
42
43 let ret = unsafe { libc::pipe2(notif_fds.as_mut_ptr(), libc::O_CLOEXEC) };
45 if ret < 0 {
46 return Err(io::Error::last_os_error());
47 }
48
49 let ret = unsafe { libc::pipe2(ready_fds.as_mut_ptr(), libc::O_CLOEXEC) };
50 if ret < 0 {
51 unsafe {
53 libc::close(notif_fds[0]);
54 libc::close(notif_fds[1]);
55 }
56 return Err(io::Error::last_os_error());
57 }
58
59 Ok(PipePair {
61 notif_r: unsafe { OwnedFd::from_raw_fd(notif_fds[0]) },
62 notif_w: unsafe { OwnedFd::from_raw_fd(notif_fds[1]) },
63 ready_r: unsafe { OwnedFd::from_raw_fd(ready_fds[0]) },
64 ready_w: unsafe { OwnedFd::from_raw_fd(ready_fds[1]) },
65 })
66 }
67}
68
69pub(crate) fn write_u32_fd(fd: RawFd, val: u32) -> io::Result<()> {
75 let buf = val.to_le_bytes();
76 let mut written = 0usize;
77 while written < 4 {
78 let ret = unsafe {
79 libc::write(
80 fd,
81 buf[written..].as_ptr() as *const libc::c_void,
82 4 - written,
83 )
84 };
85 if ret < 0 {
86 return Err(io::Error::last_os_error());
87 }
88 written += ret as usize;
89 }
90 Ok(())
91}
92
93pub(crate) fn read_u32_fd(fd: RawFd) -> io::Result<u32> {
95 let mut buf = [0u8; 4];
96 let mut total = 0usize;
97 while total < 4 {
98 let ret = unsafe {
99 libc::read(
100 fd,
101 buf[total..].as_mut_ptr() as *mut libc::c_void,
102 4 - total,
103 )
104 };
105 if ret < 0 {
106 return Err(io::Error::last_os_error());
107 }
108 if ret == 0 {
109 return Err(io::Error::new(
110 io::ErrorKind::UnexpectedEof,
111 "pipe closed before 4 bytes read",
112 ));
113 }
114 total += ret as usize;
115 }
116 Ok(u32::from_le_bytes(buf))
117}
118
119pub fn syscall_name_to_nr(name: &str) -> Option<u32> {
128 let nr: i64 = match name {
129 "mount" => libc::SYS_mount,
130 "umount2" => libc::SYS_umount2,
131 "pivot_root" => libc::SYS_pivot_root,
132 "swapon" => libc::SYS_swapon,
133 "swapoff" => libc::SYS_swapoff,
134 "reboot" => libc::SYS_reboot,
135 "sethostname" => libc::SYS_sethostname,
136 "setdomainname" => libc::SYS_setdomainname,
137 "kexec_load" => libc::SYS_kexec_load,
138 "init_module" => libc::SYS_init_module,
139 "finit_module" => libc::SYS_finit_module,
140 "delete_module" => libc::SYS_delete_module,
141 "unshare" => libc::SYS_unshare,
142 "setns" => libc::SYS_setns,
143 "perf_event_open" => libc::SYS_perf_event_open,
144 "bpf" => libc::SYS_bpf,
145 "userfaultfd" => libc::SYS_userfaultfd,
146 "keyctl" => libc::SYS_keyctl,
147 "add_key" => libc::SYS_add_key,
148 "request_key" => libc::SYS_request_key,
149 "ptrace" => libc::SYS_ptrace,
150 "process_vm_readv" => libc::SYS_process_vm_readv,
151 "process_vm_writev" => libc::SYS_process_vm_writev,
152 "open_by_handle_at" => libc::SYS_open_by_handle_at,
153 "name_to_handle_at" => libc::SYS_name_to_handle_at,
154 "ioperm" => libc::SYS_ioperm,
155 "iopl" => libc::SYS_iopl,
156 "quotactl" => libc::SYS_quotactl,
157 "acct" => libc::SYS_acct,
158 "lookup_dcookie" => libc::SYS_lookup_dcookie,
159 "personality" => libc::SYS_personality,
161 "io_uring_setup" => libc::SYS_io_uring_setup,
162 "io_uring_enter" => libc::SYS_io_uring_enter,
163 "io_uring_register" => libc::SYS_io_uring_register,
164 "clone" => libc::SYS_clone,
166 "clone3" => libc::SYS_clone3,
167 "vfork" => libc::SYS_vfork,
168 "mmap" => libc::SYS_mmap,
169 "munmap" => libc::SYS_munmap,
170 "brk" => libc::SYS_brk,
171 "mremap" => libc::SYS_mremap,
172 "connect" => libc::SYS_connect,
173 "sendto" => libc::SYS_sendto,
174 "sendmsg" => libc::SYS_sendmsg,
175 "ioctl" => libc::SYS_ioctl,
176 "socket" => libc::SYS_socket,
177 "prctl" => libc::SYS_prctl,
178 "getrandom" => libc::SYS_getrandom,
179 "openat" => libc::SYS_openat,
180 "open" => libc::SYS_open,
181 "getdents64" => libc::SYS_getdents64,
182 "getdents" => libc::SYS_getdents,
183 "bind" => libc::SYS_bind,
184 "getsockname" => libc::SYS_getsockname,
185 "clock_gettime" => libc::SYS_clock_gettime,
186 "gettimeofday" => libc::SYS_gettimeofday,
187 "time" => libc::SYS_time,
188 "clock_nanosleep" => libc::SYS_clock_nanosleep,
189 "timerfd_settime" => libc::SYS_timerfd_settime,
190 "timer_settime" => libc::SYS_timer_settime,
191 "execve" => libc::SYS_execve,
192 "execveat" => libc::SYS_execveat,
193 "unlinkat" => libc::SYS_unlinkat,
195 "mkdirat" => libc::SYS_mkdirat,
196 "renameat2" => libc::SYS_renameat2,
197 "newfstatat" => libc::SYS_newfstatat,
198 "statx" => libc::SYS_statx,
199 "faccessat" => libc::SYS_faccessat,
200 "symlinkat" => libc::SYS_symlinkat,
201 "linkat" => libc::SYS_linkat,
202 "fchmodat" => libc::SYS_fchmodat,
203 "fchownat" => libc::SYS_fchownat,
204 "readlinkat" => libc::SYS_readlinkat,
205 "truncate" => libc::SYS_truncate,
206 "utimensat" => libc::SYS_utimensat,
207 "unlink" => libc::SYS_unlink,
208 "rmdir" => libc::SYS_rmdir,
209 "mkdir" => libc::SYS_mkdir,
210 "rename" => libc::SYS_rename,
211 "stat" => libc::SYS_stat,
212 "lstat" => libc::SYS_lstat,
213 "access" => libc::SYS_access,
214 "symlink" => libc::SYS_symlink,
215 "link" => libc::SYS_link,
216 "chmod" => libc::SYS_chmod,
217 "chown" => libc::SYS_chown,
218 "lchown" => libc::SYS_lchown,
219 "readlink" => libc::SYS_readlink,
220 "futimesat" => libc::SYS_futimesat,
221 "fork" => libc::SYS_fork,
222 _ => return None,
223 };
224 Some(nr as u32)
225}
226
227pub fn notif_syscalls(policy: &Policy) -> Vec<u32> {
233 let mut nrs = vec![
234 libc::SYS_clone as u32,
235 libc::SYS_clone3 as u32,
236 libc::SYS_vfork as u32,
237 libc::SYS_wait4 as u32,
238 libc::SYS_waitid as u32,
239 ];
240
241 if policy.max_memory.is_some() {
242 nrs.push(libc::SYS_mmap as u32);
243 nrs.push(libc::SYS_munmap as u32);
244 nrs.push(libc::SYS_brk as u32);
245 nrs.push(libc::SYS_mremap as u32);
246 nrs.push(libc::SYS_shmget as u32);
247 }
248
249 if !policy.net_allow_hosts.is_empty()
250 || policy.policy_fn.is_some()
251 || !policy.http_allow.is_empty()
252 || !policy.http_deny.is_empty()
253 {
254 nrs.push(libc::SYS_connect as u32);
255 nrs.push(libc::SYS_sendto as u32);
256 nrs.push(libc::SYS_sendmsg as u32);
257 nrs.push(libc::SYS_bind as u32);
258 }
259
260 if policy.random_seed.is_some() {
261 nrs.push(libc::SYS_getrandom as u32);
262 nrs.push(libc::SYS_openat as u32);
264 }
265
266 if policy.time_start.is_some() {
267 nrs.extend_from_slice(&[
268 libc::SYS_clock_nanosleep as u32,
269 libc::SYS_timerfd_settime as u32,
270 libc::SYS_timer_settime as u32,
271 ]);
272 nrs.push(libc::SYS_openat as u32);
275 }
276
277 nrs.push(libc::SYS_openat as u32);
279 nrs.extend_from_slice(&[
280 libc::SYS_getdents64 as u32,
281 libc::SYS_getdents as u32,
282 ]);
283 if policy.num_cpus.is_some() {
285 nrs.push(libc::SYS_sched_getaffinity as u32);
286 }
287 if policy.hostname.is_some() {
288 nrs.push(libc::SYS_uname as u32);
289 nrs.push(libc::SYS_openat as u32);
290 }
291
292 if policy.workdir.is_some() && policy.fs_isolation == FsIsolation::None {
294 nrs.extend_from_slice(&[
295 libc::SYS_openat as u32,
296 libc::SYS_open as u32,
297 libc::SYS_unlinkat as u32,
298 libc::SYS_unlink as u32,
299 libc::SYS_rmdir as u32,
300 libc::SYS_mkdirat as u32,
301 libc::SYS_mkdir as u32,
302 libc::SYS_renameat2 as u32,
303 libc::SYS_rename as u32,
304 libc::SYS_symlinkat as u32,
305 libc::SYS_symlink as u32,
306 libc::SYS_linkat as u32,
307 libc::SYS_link as u32,
308 libc::SYS_fchmodat as u32,
309 libc::SYS_chmod as u32,
310 libc::SYS_fchownat as u32,
311 libc::SYS_chown as u32,
312 libc::SYS_lchown as u32,
313 libc::SYS_truncate as u32,
314 libc::SYS_utimensat as u32,
315 libc::SYS_newfstatat as u32,
316 libc::SYS_stat as u32,
317 libc::SYS_lstat as u32,
318 libc::SYS_statx as u32,
319 libc::SYS_faccessat as u32,
320 439u32, libc::SYS_access as u32,
322 libc::SYS_readlinkat as u32,
323 libc::SYS_readlink as u32,
324 libc::SYS_getdents64 as u32,
325 libc::SYS_getdents as u32,
326 libc::SYS_chdir as u32,
327 ]);
328 }
329
330 if policy.chroot.is_some() {
332 nrs.extend_from_slice(&[
333 libc::SYS_openat as u32,
334 libc::SYS_open as u32, libc::SYS_execve as u32,
336 libc::SYS_execveat as u32,
337 libc::SYS_unlinkat as u32,
338 libc::SYS_mkdirat as u32,
339 libc::SYS_renameat2 as u32,
340 libc::SYS_symlinkat as u32,
341 libc::SYS_linkat as u32,
342 libc::SYS_fchmodat as u32,
343 libc::SYS_fchownat as u32,
344 libc::SYS_truncate as u32,
345 libc::SYS_newfstatat as u32,
346 libc::SYS_stat as u32, libc::SYS_lstat as u32, libc::SYS_statx as u32,
349 libc::SYS_faccessat as u32,
350 439u32, libc::SYS_access as u32, libc::SYS_readlinkat as u32,
353 libc::SYS_readlink as u32, libc::SYS_getdents64 as u32,
355 libc::SYS_getdents as u32,
356 libc::SYS_chdir as u32,
357 libc::SYS_getcwd as u32,
358 libc::SYS_statfs as u32,
359 libc::SYS_utimensat as u32,
360 libc::SYS_unlink as u32, libc::SYS_rmdir as u32, libc::SYS_mkdir as u32, libc::SYS_rename as u32, libc::SYS_symlink as u32, libc::SYS_link as u32, libc::SYS_chmod as u32, libc::SYS_chown as u32, libc::SYS_lchown as u32,
369 ]);
370 }
371
372 if !policy.fs_denied.is_empty() {
374 nrs.extend_from_slice(&[
375 libc::SYS_openat as u32,
376 libc::SYS_open as u32,
377 libc::SYS_execve as u32,
378 libc::SYS_execveat as u32,
379 libc::SYS_linkat as u32,
380 libc::SYS_link as u32,
381 libc::SYS_renameat2 as u32,
382 libc::SYS_rename as u32,
383 libc::SYS_symlinkat as u32,
384 libc::SYS_symlink as u32,
385 ]);
386 }
387
388 if policy.policy_fn.is_some() {
390 nrs.extend_from_slice(&[
391 libc::SYS_openat as u32,
392 libc::SYS_connect as u32,
393 libc::SYS_sendto as u32,
394 libc::SYS_bind as u32,
395 libc::SYS_execve as u32,
396 libc::SYS_execveat as u32,
397 ]);
398 }
399
400 if policy.port_remap {
402 nrs.extend_from_slice(&[
403 libc::SYS_bind as u32,
404 libc::SYS_getsockname as u32,
405 ]);
406 }
407
408 nrs.sort_unstable();
409 nrs.dedup();
410 nrs
411}
412
413pub fn no_supervisor_deny_syscall_numbers() -> Vec<u32> {
415 use crate::sys::structs::NO_SUPERVISOR_DENY_SYSCALLS;
416 NO_SUPERVISOR_DENY_SYSCALLS
417 .iter()
418 .filter_map(|n| syscall_name_to_nr(n))
419 .collect()
420}
421
422pub fn deny_syscall_numbers(policy: &Policy) -> Vec<u32> {
427 if let Some(ref names) = policy.deny_syscalls {
428 names
429 .iter()
430 .filter_map(|n| syscall_name_to_nr(n))
431 .collect()
432 } else if policy.allow_syscalls.is_none() {
433 DEFAULT_DENY_SYSCALLS
434 .iter()
435 .filter_map(|n| syscall_name_to_nr(n))
436 .collect()
437 } else {
438 Vec::new()
440 }
441}
442
443pub fn arg_filters(policy: &Policy) -> Vec<SockFilter> {
453 let ret_errno = SECCOMP_RET_ERRNO | EPERM as u32;
454 let nr_clone = libc::SYS_clone as u32;
455 let nr_ioctl = libc::SYS_ioctl as u32;
456 let nr_prctl = libc::SYS_prctl as u32;
457 let nr_socket = libc::SYS_socket as u32;
458
459 let mut insns: Vec<SockFilter> = Vec::new();
460
461 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
469 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_clone, 0, 3));
470 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
471 insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, CLONE_NS_FLAGS as u32, 0, 1));
472 insns.push(stmt(BPF_RET | BPF_K, ret_errno));
473
474 let dangerous_ioctls: &[u32] = &[TIOCSTI as u32, TIOCLINUX as u32];
477 let n_ioctls = dangerous_ioctls.len();
478 let skip_count = (1 + n_ioctls * 2) as u8;
479 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
480 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_ioctl, 0, skip_count));
481 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
482 for &cmd in dangerous_ioctls {
483 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, cmd, 0, 1));
484 insns.push(stmt(BPF_RET | BPF_K, ret_errno));
485 }
486
487 let dangerous_prctl_ops: &[u32] = &[PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER];
490 let n_ops = dangerous_prctl_ops.len();
491 let skip_count = (1 + n_ops * 2) as u8;
492 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
493 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_prctl, 0, skip_count));
494 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
495 for &op in dangerous_prctl_ops {
496 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, op, 0, 1));
497 insns.push(stmt(BPF_RET | BPF_K, ret_errno));
498 }
499
500 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
511 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_socket, 0, 3));
512 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
513 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_NETLINK, 0, 1));
514 insns.push(stmt(BPF_RET | BPF_K, ret_errno));
515
516 let mut blocked_types: Vec<u32> = Vec::new();
518 if policy.no_raw_sockets {
519 blocked_types.push(SOCK_RAW);
520 }
521 if policy.no_udp {
522 blocked_types.push(SOCK_DGRAM);
523 }
524
525 if !blocked_types.is_empty() {
526 let n = blocked_types.len();
527 let after_domain = 2 + n + 1;
529 let skip_all = (3 + after_domain) as u8;
531
532 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
533 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_socket, 0, skip_all));
534 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
536 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET, 1, 0));
538 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET6, 0, after_domain as u8));
540 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
542 insns.push(stmt(BPF_ALU | BPF_AND | BPF_K, SOCK_TYPE_MASK));
543 for (i, &sock_type) in blocked_types.iter().enumerate() {
545 let remaining = n - i - 1;
546 let jf: u8 = if remaining == 0 { 1 } else { 0 };
550 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, sock_type, remaining as u8, jf));
551 }
552 insns.push(stmt(BPF_RET | BPF_K, ret_errno));
554 }
555
556 {
565 let nr_wait4 = libc::SYS_wait4 as u32;
566 let wnohang_or_wnowait = (libc::WNOHANG | 0x0100_0000) as u32;
567 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
568 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_wait4, 0, 3));
569 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS2_LO));
570 insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, wnohang_or_wnowait, 0, 1));
571 insns.push(stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
572 }
573
574 {
577 let nr_waitid = libc::SYS_waitid as u32;
578 let wnohang_or_wnowait = (libc::WNOHANG | 0x0100_0000) as u32;
579 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
580 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_waitid, 0, 3));
581 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS3_LO));
582 insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, wnohang_or_wnowait, 0, 1));
583 insns.push(stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
584 }
585
586 insns
587}
588
589fn close_fds_above(min_fd: RawFd, keep: &[RawFd]) {
595 let fds_to_close: Vec<RawFd> = {
599 let dir = match std::fs::read_dir("/proc/self/fd") {
600 Ok(d) => d,
601 Err(_) => return,
602 };
603 dir.flatten()
604 .filter_map(|entry| {
605 entry.file_name().into_string().ok()
606 .and_then(|name| name.parse::<RawFd>().ok())
607 })
608 .filter(|&fd| fd > min_fd && !keep.contains(&fd))
609 .collect()
610 };
611 for fd in fds_to_close {
613 unsafe { libc::close(fd) };
614 }
615}
616
617pub(crate) use crate::cow::ChildMountConfig;
623
/// Writes single-entry uid and gid maps for the current process.
///
/// The files are written in this order: `/proc/self/uid_map`
/// (`target_uid real_uid 1`), `/proc/self/setgroups` (`deny`), and
/// `/proc/self/gid_map` (`target_gid real_gid 1`). Write errors are ignored.
fn write_id_maps(real_uid: u32, real_gid: u32, target_uid: u32, target_gid: u32) {
    let uid_line = format!("{} {} 1\n", target_uid, real_uid);
    let gid_line = format!("{} {} 1\n", target_gid, real_gid);

    let steps: [(&str, &str); 3] = [
        ("/proc/self/uid_map", uid_line.as_str()),
        ("/proc/self/setgroups", "deny\n"),
        ("/proc/self/gid_map", gid_line.as_str()),
    ];
    for (path, contents) in steps.iter() {
        let _ = std::fs::write(path, contents);
    }
}
633
634fn write_id_maps_overflow() {
637 let uid = unsafe { libc::getuid() };
638 let gid = unsafe { libc::getgid() };
639 write_id_maps(uid, gid, 0, 0);
640}
641
642pub(crate) fn confine_child(policy: &Policy, cmd: &[CString], pipes: &PipePair, cow_config: Option<&ChildMountConfig>, nested: bool) -> ! {
651 macro_rules! fail {
653 ($msg:expr) => {{
654 let err = std::io::Error::last_os_error();
655 let _ = write!(std::io::stderr(), "sandlock child: {}: {}\n", $msg, err);
656 unsafe { libc::_exit(127) };
657 }};
658 }
659
660 use std::io::Write;
661
662 if unsafe { libc::setpgid(0, 0) } != 0 {
664 fail!("setpgid");
665 }
666
667 if unsafe { libc::isatty(0) } == 1 {
672 unsafe {
673 libc::signal(libc::SIGTTOU, libc::SIG_IGN);
674 libc::tcsetpgrp(0, libc::getpgrp());
675 libc::signal(libc::SIGTTOU, libc::SIG_DFL);
676 }
677 }
678
679 if unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) } != 0 {
681 fail!("prctl(PR_SET_PDEATHSIG)");
682 }
683
684 if unsafe { libc::getppid() } == 1 {
686 fail!("parent died before confinement");
687 }
688
689 if policy.no_randomize_memory {
691 const ADDR_NO_RANDOMIZE: libc::c_ulong = 0x0040000;
692 let current = unsafe { libc::personality(0xffffffff) };
694 if current == -1 {
695 fail!("personality(query)");
696 }
697 if unsafe { libc::personality(current as libc::c_ulong | ADDR_NO_RANDOMIZE) } == -1 {
698 fail!("personality(ADDR_NO_RANDOMIZE)");
699 }
700 }
701
702 if let Some(ref cores) = policy.cpu_cores {
704 if !cores.is_empty() {
705 let mut set = unsafe { std::mem::zeroed::<libc::cpu_set_t>() };
706 unsafe { libc::CPU_ZERO(&mut set) };
707 for &core in cores {
708 unsafe { libc::CPU_SET(core as usize, &mut set) };
709 }
710 if unsafe {
711 libc::sched_setaffinity(
712 0,
713 std::mem::size_of::<libc::cpu_set_t>(),
714 &set,
715 )
716 } != 0
717 {
718 fail!("sched_setaffinity");
719 }
720 }
721 }
722
723 if policy.no_huge_pages {
725 if unsafe { libc::prctl(libc::PR_SET_THP_DISABLE, 1, 0, 0, 0) } != 0 {
726 fail!("prctl(PR_SET_THP_DISABLE)");
727 }
728 }
729
730 if policy.no_coredump {
732 let rlim = libc::rlimit { rlim_cur: 0, rlim_max: 0 };
738 if unsafe { libc::setrlimit(libc::RLIMIT_CORE, &rlim) } != 0 {
739 fail!("setrlimit(RLIMIT_CORE, 0)");
740 }
741 }
742
743 let real_uid = unsafe { libc::getuid() };
745 let real_gid = unsafe { libc::getgid() };
746
747 if let Some(target_uid) = policy.uid {
750 if cow_config.is_none() {
751 if unsafe { libc::unshare(libc::CLONE_NEWUSER) } != 0 {
752 fail!("unshare(CLONE_NEWUSER)");
753 }
754 write_id_maps(real_uid, real_gid, target_uid, target_uid);
755 }
756 }
757
758 if let Some(ref cow) = cow_config {
760 if unsafe { libc::unshare(libc::CLONE_NEWUSER | libc::CLONE_NEWNS) } != 0 {
762 fail!("unshare(CLONE_NEWUSER | CLONE_NEWNS)");
763 }
764
765 write_id_maps_overflow();
767
768 let lowerdir = cow.lowers.iter()
774 .map(|p| p.display().to_string())
775 .collect::<Vec<_>>()
776 .join(":");
777 let opts = format!(
778 "lowerdir={},upperdir={},workdir={}",
779 lowerdir,
780 cow.upper.display(),
781 cow.work.display(),
782 );
783
784 let mount_cstr = match CString::new(cow.mount_point.to_str().unwrap_or("")) {
785 Ok(c) => c,
786 Err(_) => fail!("invalid overlay mount point path"),
787 };
788 let overlay_cstr = CString::new("overlay").unwrap();
789 let opts_cstr = match CString::new(opts) {
790 Ok(c) => c,
791 Err(_) => fail!("invalid overlay opts"),
792 };
793
794 let ret = unsafe {
795 libc::mount(
796 overlay_cstr.as_ptr(),
797 mount_cstr.as_ptr(),
798 overlay_cstr.as_ptr(),
799 0,
800 opts_cstr.as_ptr() as *const libc::c_void,
801 )
802 };
803 if ret != 0 {
804 fail!("mount overlay");
805 }
806 }
807
808 let effective_cwd = if let Some(ref cwd) = policy.cwd {
811 if let Some(ref chroot_root) = policy.chroot {
812 Some(chroot_root.join(cwd.strip_prefix("/").unwrap_or(cwd)))
813 } else {
814 Some(cwd.clone())
815 }
816 } else if let Some(ref chroot_root) = policy.chroot {
817 Some(chroot_root.to_path_buf())
819 } else if let Some(ref workdir) = policy.workdir {
820 Some(workdir.clone())
822 } else {
823 None
824 };
825
826 if let Some(ref cwd) = effective_cwd {
827 let c_path = match CString::new(cwd.as_os_str().as_encoded_bytes()) {
828 Ok(c) => c,
829 Err(_) => fail!("invalid cwd path"),
830 };
831 if unsafe { libc::chdir(c_path.as_ptr()) } != 0 {
832 fail!("chdir");
833 }
834 }
835
836 if unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) } != 0 {
838 fail!("prctl(PR_SET_NO_NEW_PRIVS)");
839 }
840
841 if let Err(e) = crate::landlock::confine(policy) {
843 fail!(format!("landlock: {}", e));
844 }
845
846 let deny = deny_syscall_numbers(policy);
848 let args = arg_filters(policy);
849 let mut keep_fd: i32 = -1;
850
851 if nested {
852 let filter = bpf::assemble_filter(&[], &deny, &args);
855 if let Err(e) = bpf::install_deny_filter(&filter) {
856 fail!(format!("seccomp deny filter: {}", e));
857 }
858 if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), 0) {
860 fail!(format!("write nested signal: {}", e));
861 }
862 } else {
863 let notif = notif_syscalls(policy);
865 let filter = bpf::assemble_filter(¬if, &deny, &args);
866 let notif_fd = match bpf::install_filter(&filter) {
867 Ok(fd) => fd,
868 Err(e) => fail!(format!("seccomp install: {}", e)),
869 };
870 keep_fd = notif_fd.as_raw_fd();
871 if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), keep_fd as u32) {
872 fail!(format!("write notif fd: {}", e));
873 }
874 std::mem::forget(notif_fd);
875 }
876
877 crate::sandbox::CONFINED.store(true, std::sync::atomic::Ordering::Relaxed);
879
880 match read_u32_fd(pipes.ready_r.as_raw_fd()) {
882 Ok(_) => {}
883 Err(e) => fail!(format!("read ready signal: {}", e)),
884 }
885
886 if keep_fd >= 0 {
888 close_fds_above(2, &[keep_fd]);
889 } else {
890 close_fds_above(2, &[]);
891 }
892
893 if policy.clean_env {
895 for (key, _) in std::env::vars_os() {
897 std::env::remove_var(&key);
898 }
899 }
900 for (key, value) in &policy.env {
901 std::env::set_var(key, value);
902 }
903
904 if let Some(ref devices) = policy.gpu_devices {
906 if !devices.is_empty() {
907 let vis = devices.iter().map(|d| d.to_string()).collect::<Vec<_>>().join(",");
908 std::env::set_var("CUDA_VISIBLE_DEVICES", &vis);
909 std::env::set_var("ROCR_VISIBLE_DEVICES", &vis);
910 }
911 }
913
914 debug_assert!(!cmd.is_empty(), "cmd must not be empty");
916 let argv_ptrs: Vec<*const libc::c_char> = cmd
917 .iter()
918 .map(|s| s.as_ptr())
919 .chain(std::iter::once(std::ptr::null()))
920 .collect();
921
922 if policy.chroot.is_some() {
923 let mut exec_path = vec![0u8; libc::PATH_MAX as usize];
929 let orig = cmd[0].as_bytes_with_nul();
930 exec_path[..orig.len()].copy_from_slice(orig);
931
932 unsafe {
933 libc::execvp(
934 exec_path.as_ptr() as *const libc::c_char,
935 argv_ptrs.as_ptr(),
936 )
937 };
938 } else {
939 unsafe { libc::execvp(argv_ptrs[0], argv_ptrs.as_ptr()) };
940 }
941
942 fail!(format!("execvp '{}'", cmd[0].to_string_lossy()));
944}
945
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `$filters` contains an instruction with the given opcode
    /// and `k` operand.
    macro_rules! assert_has_insn {
        ($filters:expr, $code:expr, $k:expr) => {
            assert!($filters.iter().any(|f| f.code == ($code) && f.k == ($k)))
        };
    }

    /// Asserts that every syscall in `expected` appears in `nrs`.
    fn assert_contains_all(nrs: &[u32], expected: &[libc::c_long]) {
        for &nr in expected {
            assert!(nrs.contains(&(nr as u32)));
        }
    }

    /// Sends `val` through a fresh pipe and returns what comes out.
    fn pipe_roundtrip(val: u32) -> u32 {
        let pipes = PipePair::new().expect("pipe creation failed");
        write_u32_fd(pipes.notif_w.as_raw_fd(), val).expect("write failed");
        read_u32_fd(pipes.notif_r.as_raw_fd()).expect("read failed")
    }

    #[test]
    fn test_pipe_pair_creation() {
        let pipes = PipePair::new().expect("pipe creation failed");
        let fds = [
            pipes.notif_r.as_raw_fd(),
            pipes.notif_w.as_raw_fd(),
            pipes.ready_r.as_raw_fd(),
            pipes.ready_w.as_raw_fd(),
        ];
        // All four descriptors are valid ...
        assert!(fds.iter().all(|&fd| fd >= 0));
        // ... and pairwise distinct.
        for (i, a) in fds.iter().enumerate() {
            for b in &fds[i + 1..] {
                assert_ne!(a, b);
            }
        }
    }

    #[test]
    fn test_write_read_u32() {
        assert_eq!(pipe_roundtrip(42u32), 42u32);
    }

    #[test]
    fn test_write_read_u32_large() {
        assert_eq!(pipe_roundtrip(0xDEAD_BEEFu32), 0xDEAD_BEEFu32);
    }

    #[test]
    fn test_notif_syscalls_always_has_clone() {
        let policy = Policy::builder().build().unwrap();
        assert_contains_all(
            &notif_syscalls(&policy),
            &[libc::SYS_clone, libc::SYS_clone3, libc::SYS_vfork],
        );
    }

    #[test]
    fn test_notif_syscalls_memory() {
        let policy = Policy::builder()
            .max_memory(crate::policy::ByteSize::mib(256))
            .build()
            .unwrap();
        assert_contains_all(
            &notif_syscalls(&policy),
            &[
                libc::SYS_mmap,
                libc::SYS_munmap,
                libc::SYS_brk,
                libc::SYS_mremap,
                libc::SYS_shmget,
            ],
        );
    }

    #[test]
    fn test_notif_syscalls_net() {
        let policy = Policy::builder()
            .net_allow_host("example.com")
            .build()
            .unwrap();
        assert_contains_all(
            &notif_syscalls(&policy),
            &[libc::SYS_connect, libc::SYS_sendto, libc::SYS_sendmsg],
        );
    }

    #[test]
    fn test_notif_syscalls_faccessat2() {
        const SYS_FACCESSAT2: u32 = 439;

        // chroot policy
        let chroot_policy = Policy::builder().chroot("/tmp").build().unwrap();
        let chroot_nrs = notif_syscalls(&chroot_policy);
        assert!(chroot_nrs.contains(&(libc::SYS_faccessat as u32)));
        assert!(
            chroot_nrs.contains(&SYS_FACCESSAT2),
            "chroot notif filter must include SYS_faccessat2 (439)"
        );

        // workdir policy
        let workdir_policy = Policy::builder().workdir("/tmp").build().unwrap();
        let workdir_nrs = notif_syscalls(&workdir_policy);
        assert!(workdir_nrs.contains(&(libc::SYS_faccessat as u32)));
        assert!(
            workdir_nrs.contains(&SYS_FACCESSAT2),
            "COW notif filter must include SYS_faccessat2 (439)"
        );
    }

    #[test]
    fn test_deny_syscall_numbers_default() {
        let policy = Policy::builder().build().unwrap();
        let nrs = deny_syscall_numbers(&policy);
        assert_contains_all(&nrs, &[libc::SYS_mount, libc::SYS_ptrace, libc::SYS_bpf]);
        assert!(!nrs.is_empty());
    }

    #[test]
    fn test_deny_syscall_numbers_custom() {
        let policy = Policy::builder()
            .deny_syscalls(vec!["mount".into(), "ptrace".into()])
            .build()
            .unwrap();
        let nrs = deny_syscall_numbers(&policy);
        assert_eq!(nrs.len(), 2);
        assert_contains_all(&nrs, &[libc::SYS_mount, libc::SYS_ptrace]);
    }

    #[test]
    fn test_deny_syscall_numbers_empty_when_allow_set() {
        let policy = Policy::builder()
            .allow_syscalls(vec!["read".into(), "write".into()])
            .build()
            .unwrap();
        assert!(deny_syscall_numbers(&policy).is_empty());
    }

    #[test]
    fn test_arg_filters_has_clone_ioctl_prctl_socket() {
        use crate::sys::structs::{
            BPF_JEQ, BPF_JSET, BPF_JMP, BPF_K,
        };
        let policy = Policy::builder().build().unwrap();
        let filters = arg_filters(&policy);

        // clone section
        assert_has_insn!(filters, BPF_JMP | BPF_JEQ | BPF_K, libc::SYS_clone as u32);
        assert_has_insn!(filters, BPF_JMP | BPF_JSET | BPF_K, CLONE_NS_FLAGS as u32);
        // ioctl section
        assert_has_insn!(filters, BPF_JMP | BPF_JEQ | BPF_K, libc::SYS_ioctl as u32);
        assert_has_insn!(filters, BPF_JMP | BPF_JEQ | BPF_K, TIOCSTI as u32);
        assert_has_insn!(filters, BPF_JMP | BPF_JEQ | BPF_K, TIOCLINUX as u32);
        // prctl section
        assert_has_insn!(filters, BPF_JMP | BPF_JEQ | BPF_K, libc::SYS_prctl as u32);
        assert_has_insn!(filters, BPF_JMP | BPF_JEQ | BPF_K, PR_SET_DUMPABLE);
        // socket section
        assert_has_insn!(filters, BPF_JMP | BPF_JEQ | BPF_K, AF_NETLINK);
    }

    #[test]
    fn test_arg_filters_raw_sockets() {
        use crate::sys::structs::{BPF_ALU, BPF_AND, BPF_JEQ, BPF_JMP, BPF_K};
        let policy = Policy::builder().no_raw_sockets(true).build().unwrap();
        let filters = arg_filters(&policy);

        assert_has_insn!(filters, BPF_JMP | BPF_JEQ | BPF_K, AF_INET);
        assert_has_insn!(filters, BPF_JMP | BPF_JEQ | BPF_K, AF_INET6);
        assert_has_insn!(filters, BPF_ALU | BPF_AND | BPF_K, SOCK_TYPE_MASK);
        assert_has_insn!(filters, BPF_JMP | BPF_JEQ | BPF_K, SOCK_RAW);
    }

    #[test]
    fn test_arg_filters_no_udp() {
        use crate::sys::structs::{BPF_JEQ, BPF_JMP, BPF_K};
        let policy = Policy::builder().no_udp(true).build().unwrap();
        let filters = arg_filters(&policy);

        assert_has_insn!(filters, BPF_JMP | BPF_JEQ | BPF_K, SOCK_DGRAM);
    }

    #[test]
    fn test_syscall_name_to_nr_covers_defaults() {
        // Exactly one default name is expected to be unresolvable.
        let mut skipped = 0;
        for name in DEFAULT_DENY_SYSCALLS {
            if syscall_name_to_nr(name).is_some() {
                continue;
            }
            assert_eq!(*name, "nfsservctl", "unexpected unresolved syscall: {}", name);
            skipped += 1;
        }
        assert_eq!(skipped, 1);
    }
}