1use std::ffi::CString;
5use std::io;
6use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd};
7
8use crate::arch;
9use crate::policy::{FsIsolation, Policy};
10use crate::seccomp::bpf::{self, stmt, jump};
11use crate::sys::structs::{
12 AF_INET, AF_INET6,
13 BPF_ABS, BPF_ALU, BPF_AND, BPF_JEQ, BPF_JSET, BPF_JMP, BPF_K, BPF_LD, BPF_RET, BPF_W,
14 CLONE_NS_FLAGS, DEFAULT_DENY_SYSCALLS, EPERM,
15 SECCOMP_RET_ALLOW, SECCOMP_RET_ERRNO,
16 SIOCETHTOOL, SIOCGIFADDR, SIOCGIFBRDADDR, SIOCGIFCONF, SIOCGIFDSTADDR,
17 SIOCGIFFLAGS, SIOCGIFHWADDR, SIOCGIFINDEX, SIOCGIFNAME, SIOCGIFNETMASK,
18 SOCK_DGRAM, SOCK_RAW, SOCK_TYPE_MASK, TIOCLINUX, TIOCSTI,
19 PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER,
20 OFFSET_ARGS0_LO, OFFSET_ARGS1_LO, OFFSET_ARGS2_LO, OFFSET_ARGS3_LO, OFFSET_NR,
21 SockFilter,
22};
23
/// Two pipes used for the handshake between the parent and the child that is
/// being confined. All four ends are created close-on-exec by [`PipePair::new`].
pub struct PipePair {
    /// Read end of the notification pipe.
    pub notif_r: OwnedFd,
    /// Write end of the notification pipe. `confine_child` writes a single
    /// little-endian `u32` here: the seccomp notification fd number, or `0`
    /// when running nested.
    pub notif_w: OwnedFd,
    /// Read end of the ready pipe. `confine_child` blocks reading a `u32`
    /// from it before it proceeds to exec.
    pub ready_r: OwnedFd,
    /// Write end of the ready pipe.
    pub ready_w: OwnedFd,
}
39
40impl PipePair {
41 pub fn new() -> io::Result<Self> {
43 let mut notif_fds = [0i32; 2];
44 let mut ready_fds = [0i32; 2];
45
46 let ret = unsafe { libc::pipe2(notif_fds.as_mut_ptr(), libc::O_CLOEXEC) };
48 if ret < 0 {
49 return Err(io::Error::last_os_error());
50 }
51
52 let ret = unsafe { libc::pipe2(ready_fds.as_mut_ptr(), libc::O_CLOEXEC) };
53 if ret < 0 {
54 unsafe {
56 libc::close(notif_fds[0]);
57 libc::close(notif_fds[1]);
58 }
59 return Err(io::Error::last_os_error());
60 }
61
62 Ok(PipePair {
64 notif_r: unsafe { OwnedFd::from_raw_fd(notif_fds[0]) },
65 notif_w: unsafe { OwnedFd::from_raw_fd(notif_fds[1]) },
66 ready_r: unsafe { OwnedFd::from_raw_fd(ready_fds[0]) },
67 ready_w: unsafe { OwnedFd::from_raw_fd(ready_fds[1]) },
68 })
69 }
70}
71
72pub(crate) fn write_u32_fd(fd: RawFd, val: u32) -> io::Result<()> {
78 let buf = val.to_le_bytes();
79 let mut written = 0usize;
80 while written < 4 {
81 let ret = unsafe {
82 libc::write(
83 fd,
84 buf[written..].as_ptr() as *const libc::c_void,
85 4 - written,
86 )
87 };
88 if ret < 0 {
89 return Err(io::Error::last_os_error());
90 }
91 written += ret as usize;
92 }
93 Ok(())
94}
95
96pub(crate) fn read_u32_fd(fd: RawFd) -> io::Result<u32> {
98 let mut buf = [0u8; 4];
99 let mut total = 0usize;
100 while total < 4 {
101 let ret = unsafe {
102 libc::read(
103 fd,
104 buf[total..].as_mut_ptr() as *mut libc::c_void,
105 4 - total,
106 )
107 };
108 if ret < 0 {
109 return Err(io::Error::last_os_error());
110 }
111 if ret == 0 {
112 return Err(io::Error::new(
113 io::ErrorKind::UnexpectedEof,
114 "pipe closed before 4 bytes read",
115 ));
116 }
117 total += ret as usize;
118 }
119 Ok(u32::from_le_bytes(buf))
120}
121
122pub fn syscall_name_to_nr(name: &str) -> Option<u32> {
131 let nr: i64 = match name {
132 "mount" => libc::SYS_mount,
133 "umount2" => libc::SYS_umount2,
134 "pivot_root" => libc::SYS_pivot_root,
135 "swapon" => libc::SYS_swapon,
136 "swapoff" => libc::SYS_swapoff,
137 "reboot" => libc::SYS_reboot,
138 "sethostname" => libc::SYS_sethostname,
139 "setdomainname" => libc::SYS_setdomainname,
140 "kexec_load" => libc::SYS_kexec_load,
141 "init_module" => libc::SYS_init_module,
142 "finit_module" => libc::SYS_finit_module,
143 "delete_module" => libc::SYS_delete_module,
144 "unshare" => libc::SYS_unshare,
145 "setns" => libc::SYS_setns,
146 "perf_event_open" => libc::SYS_perf_event_open,
147 "bpf" => libc::SYS_bpf,
148 "userfaultfd" => libc::SYS_userfaultfd,
149 "keyctl" => libc::SYS_keyctl,
150 "add_key" => libc::SYS_add_key,
151 "request_key" => libc::SYS_request_key,
152 "ptrace" => libc::SYS_ptrace,
153 "process_vm_readv" => libc::SYS_process_vm_readv,
154 "process_vm_writev" => libc::SYS_process_vm_writev,
155 "open_by_handle_at" => libc::SYS_open_by_handle_at,
156 "name_to_handle_at" => libc::SYS_name_to_handle_at,
157 "ioperm" => arch::SYS_IOPERM?,
158 "iopl" => arch::SYS_IOPL?,
159 "quotactl" => libc::SYS_quotactl,
160 "acct" => libc::SYS_acct,
161 "lookup_dcookie" => libc::SYS_lookup_dcookie,
162 "personality" => libc::SYS_personality,
164 "io_uring_setup" => libc::SYS_io_uring_setup,
165 "io_uring_enter" => libc::SYS_io_uring_enter,
166 "io_uring_register" => libc::SYS_io_uring_register,
167 "clone" => libc::SYS_clone,
169 "clone3" => libc::SYS_clone3,
170 "vfork" => arch::SYS_VFORK?,
171 "mmap" => libc::SYS_mmap,
172 "munmap" => libc::SYS_munmap,
173 "brk" => libc::SYS_brk,
174 "mremap" => libc::SYS_mremap,
175 "connect" => libc::SYS_connect,
176 "sendto" => libc::SYS_sendto,
177 "sendmsg" => libc::SYS_sendmsg,
178 "ioctl" => libc::SYS_ioctl,
179 "socket" => libc::SYS_socket,
180 "prctl" => libc::SYS_prctl,
181 "getrandom" => libc::SYS_getrandom,
182 "openat" => libc::SYS_openat,
183 "open" => arch::SYS_OPEN?,
184 "getdents64" => libc::SYS_getdents64,
185 "getdents" => arch::SYS_GETDENTS?,
186 "bind" => libc::SYS_bind,
187 "getsockname" => libc::SYS_getsockname,
188 "clock_gettime" => libc::SYS_clock_gettime,
189 "gettimeofday" => libc::SYS_gettimeofday,
190 "time" => arch::SYS_TIME?,
191 "clock_nanosleep" => libc::SYS_clock_nanosleep,
192 "timerfd_settime" => libc::SYS_timerfd_settime,
193 "timer_settime" => libc::SYS_timer_settime,
194 "execve" => libc::SYS_execve,
195 "execveat" => libc::SYS_execveat,
196 "unlinkat" => libc::SYS_unlinkat,
198 "mkdirat" => libc::SYS_mkdirat,
199 "renameat2" => libc::SYS_renameat2,
200 "newfstatat" => libc::SYS_newfstatat,
201 "statx" => libc::SYS_statx,
202 "faccessat" => libc::SYS_faccessat,
203 "symlinkat" => libc::SYS_symlinkat,
204 "linkat" => libc::SYS_linkat,
205 "fchmodat" => libc::SYS_fchmodat,
206 "fchownat" => libc::SYS_fchownat,
207 "readlinkat" => libc::SYS_readlinkat,
208 "truncate" => libc::SYS_truncate,
209 "utimensat" => libc::SYS_utimensat,
210 "unlink" => arch::SYS_UNLINK?,
211 "rmdir" => arch::SYS_RMDIR?,
212 "mkdir" => arch::SYS_MKDIR?,
213 "rename" => arch::SYS_RENAME?,
214 "stat" => arch::SYS_STAT?,
215 "lstat" => arch::SYS_LSTAT?,
216 "access" => arch::SYS_ACCESS?,
217 "symlink" => arch::SYS_SYMLINK?,
218 "link" => arch::SYS_LINK?,
219 "chmod" => arch::SYS_CHMOD?,
220 "chown" => arch::SYS_CHOWN?,
221 "lchown" => arch::SYS_LCHOWN?,
222 "readlink" => arch::SYS_READLINK?,
223 "futimesat" => arch::SYS_FUTIMESAT?,
224 "fork" => arch::SYS_FORK?,
225 _ => return None,
226 };
227 Some(nr as u32)
228}
229
230pub fn notif_syscalls(policy: &Policy) -> Vec<u32> {
236 let mut nrs = vec![
237 libc::SYS_clone as u32,
238 libc::SYS_clone3 as u32,
239 libc::SYS_wait4 as u32,
240 libc::SYS_waitid as u32,
241 ];
242 arch::push_optional_syscall(&mut nrs, arch::SYS_VFORK);
243
244 if policy.max_memory.is_some() {
245 nrs.push(libc::SYS_mmap as u32);
246 nrs.push(libc::SYS_munmap as u32);
247 nrs.push(libc::SYS_brk as u32);
248 nrs.push(libc::SYS_mremap as u32);
249 nrs.push(libc::SYS_shmget as u32);
250 }
251
252 if policy.net_allow_hosts.is_some()
253 || policy.policy_fn.is_some()
254 || !policy.http_allow.is_empty()
255 || !policy.http_deny.is_empty()
256 {
257 nrs.push(libc::SYS_connect as u32);
258 nrs.push(libc::SYS_sendto as u32);
259 nrs.push(libc::SYS_sendmsg as u32);
260 nrs.push(libc::SYS_bind as u32);
261 }
262
263 if policy.random_seed.is_some() {
264 nrs.push(libc::SYS_getrandom as u32);
265 nrs.push(libc::SYS_openat as u32);
267 }
268
269 if policy.time_start.is_some() {
270 nrs.extend_from_slice(&[
271 libc::SYS_clock_nanosleep as u32,
272 libc::SYS_timerfd_settime as u32,
273 libc::SYS_timer_settime as u32,
274 ]);
275 nrs.push(libc::SYS_openat as u32);
278 }
279
280 nrs.push(libc::SYS_openat as u32);
282 nrs.push(libc::SYS_getdents64 as u32);
283 arch::push_optional_syscall(&mut nrs, arch::SYS_GETDENTS);
284
285 nrs.push(libc::SYS_socket as u32);
294 nrs.push(libc::SYS_bind as u32);
295 nrs.push(libc::SYS_getsockname as u32);
296 nrs.push(libc::SYS_recvfrom as u32);
297 nrs.push(libc::SYS_recvmsg as u32);
298 nrs.push(libc::SYS_close as u32);
299 if policy.num_cpus.is_some() {
301 nrs.push(libc::SYS_sched_getaffinity as u32);
302 }
303 if policy.hostname.is_some() {
304 nrs.push(libc::SYS_uname as u32);
305 nrs.push(libc::SYS_openat as u32);
306 }
307
308 if policy.workdir.is_some() && policy.fs_isolation == FsIsolation::None {
310 nrs.extend_from_slice(&[
311 libc::SYS_openat as u32,
312 libc::SYS_unlinkat as u32,
313 libc::SYS_mkdirat as u32,
314 libc::SYS_renameat2 as u32,
315 libc::SYS_symlinkat as u32,
316 libc::SYS_linkat as u32,
317 libc::SYS_fchmodat as u32,
318 libc::SYS_fchownat as u32,
319 libc::SYS_truncate as u32,
320 libc::SYS_utimensat as u32,
321 libc::SYS_newfstatat as u32,
322 libc::SYS_statx as u32,
323 libc::SYS_faccessat as u32,
324 439u32, libc::SYS_readlinkat as u32,
326 libc::SYS_getdents64 as u32,
327 libc::SYS_chdir as u32,
328 libc::SYS_getcwd as u32,
329 ]);
330 for nr in [
331 arch::SYS_OPEN, arch::SYS_UNLINK, arch::SYS_RMDIR, arch::SYS_MKDIR,
332 arch::SYS_RENAME, arch::SYS_SYMLINK, arch::SYS_LINK, arch::SYS_CHMOD,
333 arch::SYS_CHOWN, arch::SYS_LCHOWN, arch::SYS_STAT, arch::SYS_LSTAT,
334 arch::SYS_ACCESS, arch::SYS_READLINK, arch::SYS_GETDENTS,
335 ] {
336 arch::push_optional_syscall(&mut nrs, nr);
337 }
338 }
339
340 if policy.chroot.is_some() {
342 nrs.extend_from_slice(&[
343 libc::SYS_openat as u32,
344 libc::SYS_execve as u32,
345 libc::SYS_execveat as u32,
346 libc::SYS_unlinkat as u32,
347 libc::SYS_mkdirat as u32,
348 libc::SYS_renameat2 as u32,
349 libc::SYS_symlinkat as u32,
350 libc::SYS_linkat as u32,
351 libc::SYS_fchmodat as u32,
352 libc::SYS_fchownat as u32,
353 libc::SYS_truncate as u32,
354 libc::SYS_newfstatat as u32,
355 libc::SYS_statx as u32,
356 libc::SYS_faccessat as u32,
357 439u32, libc::SYS_readlinkat as u32,
359 libc::SYS_getdents64 as u32,
360 libc::SYS_chdir as u32,
361 libc::SYS_getcwd as u32,
362 libc::SYS_statfs as u32,
363 libc::SYS_utimensat as u32,
364 ]);
365 for nr in [
366 arch::SYS_OPEN, arch::SYS_STAT, arch::SYS_LSTAT, arch::SYS_ACCESS,
367 arch::SYS_READLINK, arch::SYS_GETDENTS, arch::SYS_UNLINK,
368 arch::SYS_RMDIR, arch::SYS_MKDIR, arch::SYS_RENAME,
369 arch::SYS_SYMLINK, arch::SYS_LINK, arch::SYS_CHMOD,
370 arch::SYS_CHOWN, arch::SYS_LCHOWN,
371 ] {
372 arch::push_optional_syscall(&mut nrs, nr);
373 }
374 }
375
376 if !policy.fs_denied.is_empty() {
378 nrs.extend_from_slice(&[
379 libc::SYS_openat as u32,
380 libc::SYS_execve as u32,
381 libc::SYS_execveat as u32,
382 libc::SYS_linkat as u32,
383 libc::SYS_renameat2 as u32,
384 libc::SYS_symlinkat as u32,
385 ]);
386 for nr in [arch::SYS_OPEN, arch::SYS_LINK, arch::SYS_RENAME, arch::SYS_SYMLINK] {
387 arch::push_optional_syscall(&mut nrs, nr);
388 }
389 }
390
391 if policy.policy_fn.is_some() {
393 nrs.extend_from_slice(&[
394 libc::SYS_openat as u32,
395 libc::SYS_connect as u32,
396 libc::SYS_sendto as u32,
397 libc::SYS_bind as u32,
398 libc::SYS_execve as u32,
399 libc::SYS_execveat as u32,
400 ]);
401 }
402
403 if policy.port_remap {
405 nrs.extend_from_slice(&[
406 libc::SYS_bind as u32,
407 libc::SYS_getsockname as u32,
408 ]);
409 }
410
411 nrs.sort_unstable();
412 nrs.dedup();
413 nrs
414}
415
416pub fn no_supervisor_deny_syscall_numbers() -> Vec<u32> {
418 use crate::sys::structs::NO_SUPERVISOR_DENY_SYSCALLS;
419 NO_SUPERVISOR_DENY_SYSCALLS
420 .iter()
421 .filter_map(|n| syscall_name_to_nr(n))
422 .collect()
423}
424
425pub fn deny_syscall_numbers(policy: &Policy) -> Vec<u32> {
430 if let Some(ref names) = policy.deny_syscalls {
431 names
432 .iter()
433 .filter_map(|n| syscall_name_to_nr(n))
434 .collect()
435 } else if policy.allow_syscalls.is_none() {
436 DEFAULT_DENY_SYSCALLS
437 .iter()
438 .filter_map(|n| syscall_name_to_nr(n))
439 .collect()
440 } else {
441 Vec::new()
443 }
444}
445
446pub fn arg_filters(policy: &Policy) -> Vec<SockFilter> {
455 let ret_errno = SECCOMP_RET_ERRNO | EPERM as u32;
456 let nr_clone = libc::SYS_clone as u32;
457 let nr_ioctl = libc::SYS_ioctl as u32;
458 let nr_prctl = libc::SYS_prctl as u32;
459 let nr_socket = libc::SYS_socket as u32;
460
461 let mut insns: Vec<SockFilter> = Vec::new();
462
463 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
471 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_clone, 0, 3));
472 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
473 insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, CLONE_NS_FLAGS as u32, 0, 1));
474 insns.push(stmt(BPF_RET | BPF_K, ret_errno));
475
476 let dangerous_ioctls: &[u32] = &[
482 TIOCSTI as u32,
483 TIOCLINUX as u32,
484 SIOCGIFNAME as u32,
485 SIOCGIFCONF as u32,
486 SIOCGIFFLAGS as u32,
487 SIOCGIFADDR as u32,
488 SIOCGIFDSTADDR as u32,
489 SIOCGIFBRDADDR as u32,
490 SIOCGIFNETMASK as u32,
491 SIOCGIFHWADDR as u32,
492 SIOCGIFINDEX as u32,
493 SIOCETHTOOL as u32,
494 ];
495 let n_ioctls = dangerous_ioctls.len();
496 let skip_count = (1 + n_ioctls * 2) as u8;
497 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
498 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_ioctl, 0, skip_count));
499 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
500 for &cmd in dangerous_ioctls {
501 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, cmd, 0, 1));
502 insns.push(stmt(BPF_RET | BPF_K, ret_errno));
503 }
504
505 let dangerous_prctl_ops: &[u32] = &[PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER];
508 let n_ops = dangerous_prctl_ops.len();
509 let skip_count = (1 + n_ops * 2) as u8;
510 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
511 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_prctl, 0, skip_count));
512 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
513 for &op in dangerous_prctl_ops {
514 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, op, 0, 1));
515 insns.push(stmt(BPF_RET | BPF_K, ret_errno));
516 }
517
518 let mut blocked_types: Vec<u32> = Vec::new();
520 if policy.no_raw_sockets {
521 blocked_types.push(SOCK_RAW);
522 }
523 if policy.no_udp {
524 blocked_types.push(SOCK_DGRAM);
525 }
526
527 if !blocked_types.is_empty() {
528 let n = blocked_types.len();
529 let after_domain = 2 + n + 1;
531 let skip_all = (3 + after_domain) as u8;
533
534 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
535 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_socket, 0, skip_all));
536 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
538 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET, 1, 0));
540 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET6, 0, after_domain as u8));
542 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
544 insns.push(stmt(BPF_ALU | BPF_AND | BPF_K, SOCK_TYPE_MASK));
545 for (i, &sock_type) in blocked_types.iter().enumerate() {
547 let remaining = n - i - 1;
548 let jf: u8 = if remaining == 0 { 1 } else { 0 };
552 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, sock_type, remaining as u8, jf));
553 }
554 insns.push(stmt(BPF_RET | BPF_K, ret_errno));
556 }
557
558 {
567 let nr_wait4 = libc::SYS_wait4 as u32;
568 let wnohang_or_wnowait = (libc::WNOHANG | 0x0100_0000) as u32;
569 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
570 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_wait4, 0, 3));
571 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS2_LO));
572 insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, wnohang_or_wnowait, 0, 1));
573 insns.push(stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
574 }
575
576 {
579 let nr_waitid = libc::SYS_waitid as u32;
580 let wnohang_or_wnowait = (libc::WNOHANG | 0x0100_0000) as u32;
581 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
582 insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_waitid, 0, 3));
583 insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS3_LO));
584 insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, wnohang_or_wnowait, 0, 1));
585 insns.push(stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
586 }
587
588 insns
589}
590
591fn close_fds_above(min_fd: RawFd, keep: &[RawFd]) {
597 let fds_to_close: Vec<RawFd> = {
601 let dir = match std::fs::read_dir("/proc/self/fd") {
602 Ok(d) => d,
603 Err(_) => return,
604 };
605 dir.flatten()
606 .filter_map(|entry| {
607 entry.file_name().into_string().ok()
608 .and_then(|name| name.parse::<RawFd>().ok())
609 })
610 .filter(|&fd| fd > min_fd && !keep.contains(&fd))
611 .collect()
612 };
613 for fd in fds_to_close {
615 unsafe { libc::close(fd) };
616 }
617}
618
619pub(crate) use crate::cow::ChildMountConfig;
625
/// Writes single-entry uid and gid mappings for the current process.
///
/// `real_uid` / `real_gid` are mapped to `target_uid` / `target_gid`, and
/// `/proc/self/setgroups` is set to `deny` between the two map writes. Every
/// write is best-effort: failures are ignored.
fn write_id_maps(real_uid: u32, real_gid: u32, target_uid: u32, target_gid: u32) {
    let uid_line = format!("{} {} 1\n", target_uid, real_uid);
    let gid_line = format!("{} {} 1\n", target_gid, real_gid);

    // Order is preserved from the original: uid_map, setgroups, gid_map.
    let writes: [(&str, &str); 3] = [
        ("/proc/self/uid_map", uid_line.as_str()),
        ("/proc/self/setgroups", "deny\n"),
        ("/proc/self/gid_map", gid_line.as_str()),
    ];
    for (path, contents) in writes {
        let _ = std::fs::write(path, contents);
    }
}
635
636fn write_id_maps_overflow() {
639 let uid = unsafe { libc::getuid() };
640 let gid = unsafe { libc::getgid() };
641 write_id_maps(uid, gid, 0, 0);
642}
643
644pub(crate) fn confine_child(policy: &Policy, cmd: &[CString], pipes: &PipePair, cow_config: Option<&ChildMountConfig>, nested: bool, keep_fds: &[RawFd]) -> ! {
653 macro_rules! fail {
655 ($msg:expr) => {{
656 let err = std::io::Error::last_os_error();
657 let _ = write!(std::io::stderr(), "sandlock child: {}: {}\n", $msg, err);
658 unsafe { libc::_exit(127) };
659 }};
660 }
661
662 use std::io::Write;
663
664 if unsafe { libc::setpgid(0, 0) } != 0 {
666 fail!("setpgid");
667 }
668
669 if unsafe { libc::isatty(0) } == 1 {
674 unsafe {
675 libc::signal(libc::SIGTTOU, libc::SIG_IGN);
676 libc::tcsetpgrp(0, libc::getpgrp());
677 libc::signal(libc::SIGTTOU, libc::SIG_DFL);
678 }
679 }
680
681 if unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) } != 0 {
683 fail!("prctl(PR_SET_PDEATHSIG)");
684 }
685
686 if unsafe { libc::getppid() } == 1 {
688 fail!("parent died before confinement");
689 }
690
691 if policy.no_randomize_memory {
693 const ADDR_NO_RANDOMIZE: libc::c_ulong = 0x0040000;
694 let current = unsafe { libc::personality(0xffffffff) };
696 if current == -1 {
697 fail!("personality(query)");
698 }
699 if unsafe { libc::personality(current as libc::c_ulong | ADDR_NO_RANDOMIZE) } == -1 {
700 fail!("personality(ADDR_NO_RANDOMIZE)");
701 }
702 }
703
704 if let Some(ref cores) = policy.cpu_cores {
706 if !cores.is_empty() {
707 let mut set = unsafe { std::mem::zeroed::<libc::cpu_set_t>() };
708 unsafe { libc::CPU_ZERO(&mut set) };
709 for &core in cores {
710 unsafe { libc::CPU_SET(core as usize, &mut set) };
711 }
712 if unsafe {
713 libc::sched_setaffinity(
714 0,
715 std::mem::size_of::<libc::cpu_set_t>(),
716 &set,
717 )
718 } != 0
719 {
720 fail!("sched_setaffinity");
721 }
722 }
723 }
724
725 if policy.no_huge_pages {
727 if unsafe { libc::prctl(libc::PR_SET_THP_DISABLE, 1, 0, 0, 0) } != 0 {
728 fail!("prctl(PR_SET_THP_DISABLE)");
729 }
730 }
731
732 if policy.no_coredump {
734 let rlim = libc::rlimit { rlim_cur: 0, rlim_max: 0 };
740 if unsafe { libc::setrlimit(libc::RLIMIT_CORE, &rlim) } != 0 {
741 fail!("setrlimit(RLIMIT_CORE, 0)");
742 }
743 }
744
745 let real_uid = unsafe { libc::getuid() };
747 let real_gid = unsafe { libc::getgid() };
748
749 if let Some(target_uid) = policy.uid {
752 if cow_config.is_none() {
753 if unsafe { libc::unshare(libc::CLONE_NEWUSER) } != 0 {
754 fail!("unshare(CLONE_NEWUSER)");
755 }
756 write_id_maps(real_uid, real_gid, target_uid, target_uid);
757 }
758 }
759
760 if let Some(ref cow) = cow_config {
762 if unsafe { libc::unshare(libc::CLONE_NEWUSER | libc::CLONE_NEWNS) } != 0 {
764 fail!("unshare(CLONE_NEWUSER | CLONE_NEWNS)");
765 }
766
767 write_id_maps_overflow();
769
770 let lowerdir = cow.lowers.iter()
776 .map(|p| p.display().to_string())
777 .collect::<Vec<_>>()
778 .join(":");
779 let opts = format!(
780 "lowerdir={},upperdir={},workdir={}",
781 lowerdir,
782 cow.upper.display(),
783 cow.work.display(),
784 );
785
786 let mount_cstr = match CString::new(cow.mount_point.to_str().unwrap_or("")) {
787 Ok(c) => c,
788 Err(_) => fail!("invalid overlay mount point path"),
789 };
790 let overlay_cstr = CString::new("overlay").unwrap();
791 let opts_cstr = match CString::new(opts) {
792 Ok(c) => c,
793 Err(_) => fail!("invalid overlay opts"),
794 };
795
796 let ret = unsafe {
797 libc::mount(
798 overlay_cstr.as_ptr(),
799 mount_cstr.as_ptr(),
800 overlay_cstr.as_ptr(),
801 0,
802 opts_cstr.as_ptr() as *const libc::c_void,
803 )
804 };
805 if ret != 0 {
806 fail!("mount overlay");
807 }
808 }
809
810 let effective_cwd = if let Some(ref cwd) = policy.cwd {
813 if let Some(ref chroot_root) = policy.chroot {
814 Some(chroot_root.join(cwd.strip_prefix("/").unwrap_or(cwd)))
815 } else {
816 Some(cwd.clone())
817 }
818 } else if let Some(ref chroot_root) = policy.chroot {
819 Some(chroot_root.to_path_buf())
821 } else if let Some(ref workdir) = policy.workdir {
822 Some(workdir.clone())
824 } else {
825 None
826 };
827
828 if let Some(ref cwd) = effective_cwd {
829 let c_path = match CString::new(cwd.as_os_str().as_encoded_bytes()) {
830 Ok(c) => c,
831 Err(_) => fail!("invalid cwd path"),
832 };
833 if unsafe { libc::chdir(c_path.as_ptr()) } != 0 {
834 fail!("chdir");
835 }
836 }
837
838 if unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) } != 0 {
840 fail!("prctl(PR_SET_NO_NEW_PRIVS)");
841 }
842
843 if let Err(e) = crate::landlock::confine(policy) {
845 fail!(format!("landlock: {}", e));
846 }
847
848 let deny = deny_syscall_numbers(policy);
850 let args = arg_filters(policy);
851 let mut keep_fd: i32 = -1;
852
853 if nested {
854 let filter = match bpf::assemble_filter(&[], &deny, &args) {
857 Ok(f) => f,
858 Err(e) => fail!(format!("seccomp assemble: {}", e)),
859 };
860 if let Err(e) = bpf::install_deny_filter(&filter) {
861 fail!(format!("seccomp deny filter: {}", e));
862 }
863 if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), 0) {
865 fail!(format!("write nested signal: {}", e));
866 }
867 } else {
868 let notif = notif_syscalls(policy);
870 let filter = match bpf::assemble_filter(¬if, &deny, &args) {
871 Ok(f) => f,
872 Err(e) => fail!(format!("seccomp assemble: {}", e)),
873 };
874 let notif_fd = match bpf::install_filter(&filter) {
875 Ok(fd) => fd,
876 Err(e) => fail!(format!("seccomp install: {}", e)),
877 };
878 keep_fd = notif_fd.as_raw_fd();
879 if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), keep_fd as u32) {
880 fail!(format!("write notif fd: {}", e));
881 }
882 std::mem::forget(notif_fd);
883 }
884
885 crate::sandbox::CONFINED.store(true, std::sync::atomic::Ordering::Relaxed);
887
888 match read_u32_fd(pipes.ready_r.as_raw_fd()) {
890 Ok(_) => {}
891 Err(e) => fail!(format!("read ready signal: {}", e)),
892 }
893
894 let mut fds_to_keep: Vec<RawFd> = keep_fds.to_vec();
896 if keep_fd >= 0 {
897 fds_to_keep.push(keep_fd);
898 }
899 close_fds_above(2, &fds_to_keep);
900
901 if policy.clean_env {
903 for (key, _) in std::env::vars_os() {
905 std::env::remove_var(&key);
906 }
907 }
908 for (key, value) in &policy.env {
909 std::env::set_var(key, value);
910 }
911
912 if let Some(ref devices) = policy.gpu_devices {
914 if !devices.is_empty() {
915 let vis = devices.iter().map(|d| d.to_string()).collect::<Vec<_>>().join(",");
916 std::env::set_var("CUDA_VISIBLE_DEVICES", &vis);
917 std::env::set_var("ROCR_VISIBLE_DEVICES", &vis);
918 }
919 }
921
922 debug_assert!(!cmd.is_empty(), "cmd must not be empty");
924 let argv_ptrs: Vec<*const libc::c_char> = cmd
925 .iter()
926 .map(|s| s.as_ptr())
927 .chain(std::iter::once(std::ptr::null()))
928 .collect();
929
930 if policy.chroot.is_some() {
931 let mut exec_path = vec![0u8; libc::PATH_MAX as usize];
937 let orig = cmd[0].as_bytes_with_nul();
938 exec_path[..orig.len()].copy_from_slice(orig);
939
940 unsafe {
941 libc::execvp(
942 exec_path.as_ptr() as *const libc::c_char,
943 argv_ptrs.as_ptr(),
944 )
945 };
946 } else {
947 unsafe { libc::execvp(argv_ptrs[0], argv_ptrs.as_ptr()) };
948 }
949
950 fail!(format!("execvp '{}'", cmd[0].to_string_lossy()));
952}
953
#[cfg(test)]
mod tests {
    use super::*;

    /// True when `filters` contains a `JEQ` jump comparing against `k`.
    fn has_jeq(filters: &[SockFilter], k: u32) -> bool {
        use crate::sys::structs::{BPF_JEQ, BPF_JMP, BPF_K};
        filters
            .iter()
            .any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K) && f.k == k)
    }

    /// Creates a pipe pair and round-trips `val` through the notification pipe.
    fn roundtrip(val: u32) -> u32 {
        let pipes = PipePair::new().expect("pipe creation failed");
        write_u32_fd(pipes.notif_w.as_raw_fd(), val).expect("write failed");
        read_u32_fd(pipes.notif_r.as_raw_fd()).expect("read failed")
    }

    #[test]
    fn test_pipe_pair_creation() {
        use std::collections::HashSet;

        let pipes = PipePair::new().expect("pipe creation failed");
        let fds = [
            pipes.notif_r.as_raw_fd(),
            pipes.notif_w.as_raw_fd(),
            pipes.ready_r.as_raw_fd(),
            pipes.ready_w.as_raw_fd(),
        ];
        // Every descriptor is valid...
        for fd in fds {
            assert!(fd >= 0);
        }
        // ...and all four are pairwise distinct.
        let distinct: HashSet<RawFd> = fds.iter().copied().collect();
        assert_eq!(distinct.len(), fds.len());
    }

    #[test]
    fn test_write_read_u32() {
        assert_eq!(roundtrip(42u32), 42u32);
    }

    #[test]
    fn test_write_read_u32_large() {
        assert_eq!(roundtrip(0xDEAD_BEEFu32), 0xDEAD_BEEFu32);
    }

    #[test]
    fn test_notif_syscalls_always_has_clone() {
        let policy = Policy::builder().build().unwrap();
        let nrs = notif_syscalls(&policy);
        for expected in [libc::SYS_clone as u32, libc::SYS_clone3 as u32] {
            assert!(nrs.contains(&expected));
        }
        if let Some(vfork) = arch::SYS_VFORK {
            assert!(nrs.contains(&(vfork as u32)));
        }
    }

    #[test]
    fn test_notif_syscalls_memory() {
        let policy = Policy::builder()
            .max_memory(crate::policy::ByteSize::mib(256))
            .build()
            .unwrap();
        let nrs = notif_syscalls(&policy);
        for expected in [
            libc::SYS_mmap as u32,
            libc::SYS_munmap as u32,
            libc::SYS_brk as u32,
            libc::SYS_mremap as u32,
            libc::SYS_shmget as u32,
        ] {
            assert!(nrs.contains(&expected));
        }
    }

    #[test]
    fn test_notif_syscalls_net() {
        let policy = Policy::builder()
            .net_allow_host("example.com")
            .build()
            .unwrap();
        let nrs = notif_syscalls(&policy);
        for expected in [
            libc::SYS_connect as u32,
            libc::SYS_sendto as u32,
            libc::SYS_sendmsg as u32,
        ] {
            assert!(nrs.contains(&expected));
        }
    }

    #[test]
    fn test_notif_syscalls_faccessat2() {
        const SYS_FACCESSAT2: u32 = 439;

        // Chroot mode.
        let chroot_policy = Policy::builder()
            .chroot("/tmp")
            .build()
            .unwrap();
        let chroot_nrs = notif_syscalls(&chroot_policy);
        assert!(chroot_nrs.contains(&(libc::SYS_faccessat as u32)));
        assert!(chroot_nrs.contains(&SYS_FACCESSAT2),
            "chroot notif filter must include SYS_faccessat2 (439)");

        // Workdir mode.
        let workdir_policy = Policy::builder()
            .workdir("/tmp")
            .build()
            .unwrap();
        let workdir_nrs = notif_syscalls(&workdir_policy);
        assert!(workdir_nrs.contains(&(libc::SYS_faccessat as u32)));
        assert!(workdir_nrs.contains(&SYS_FACCESSAT2),
            "COW notif filter must include SYS_faccessat2 (439)");
    }

    #[test]
    fn test_deny_syscall_numbers_default() {
        let policy = Policy::builder().build().unwrap();
        let nrs = deny_syscall_numbers(&policy);
        for expected in [
            libc::SYS_mount as u32,
            libc::SYS_ptrace as u32,
            libc::SYS_bpf as u32,
        ] {
            assert!(nrs.contains(&expected));
        }
        assert!(!nrs.is_empty());
    }

    #[test]
    fn test_deny_syscall_numbers_custom() {
        let policy = Policy::builder()
            .deny_syscalls(vec!["mount".into(), "ptrace".into()])
            .build()
            .unwrap();
        let nrs = deny_syscall_numbers(&policy);
        assert_eq!(nrs.len(), 2);
        assert!(nrs.contains(&(libc::SYS_mount as u32)));
        assert!(nrs.contains(&(libc::SYS_ptrace as u32)));
    }

    #[test]
    fn test_deny_syscall_numbers_empty_when_allow_set() {
        let policy = Policy::builder()
            .allow_syscalls(vec!["read".into(), "write".into()])
            .build()
            .unwrap();
        assert!(deny_syscall_numbers(&policy).is_empty());
    }

    #[test]
    fn test_arg_filters_has_clone_ioctl_prctl_socket() {
        use crate::sys::structs::{BPF_JSET, BPF_JMP, BPF_K};

        let policy = Policy::builder().build().unwrap();
        let filters = arg_filters(&policy);

        // clone section: syscall match plus the flag test.
        assert!(has_jeq(&filters, libc::SYS_clone as u32));
        assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JSET | BPF_K)
            && f.k == CLONE_NS_FLAGS as u32));

        // ioctl section: syscall match plus a sample of denied requests.
        assert!(has_jeq(&filters, libc::SYS_ioctl as u32));
        assert!(has_jeq(&filters, TIOCSTI as u32));
        assert!(has_jeq(&filters, TIOCLINUX as u32));
        assert!(has_jeq(&filters, SIOCGIFCONF as u32));
        assert!(has_jeq(&filters, SIOCETHTOOL as u32));

        // prctl section.
        assert!(has_jeq(&filters, libc::SYS_prctl as u32));
        assert!(has_jeq(&filters, PR_SET_DUMPABLE));
    }

    #[test]
    fn test_arg_filters_raw_sockets() {
        use crate::sys::structs::{BPF_ALU, BPF_AND, BPF_K};

        let policy = Policy::builder().no_raw_sockets(true).build().unwrap();
        let filters = arg_filters(&policy);

        assert!(has_jeq(&filters, AF_INET));
        assert!(has_jeq(&filters, AF_INET6));
        assert!(filters.iter().any(|f| f.code == (BPF_ALU | BPF_AND | BPF_K)
            && f.k == SOCK_TYPE_MASK));
        assert!(has_jeq(&filters, SOCK_RAW));
    }

    #[test]
    fn test_arg_filters_no_udp() {
        let policy = Policy::builder().no_udp(true).build().unwrap();
        let filters = arg_filters(&policy);
        assert!(has_jeq(&filters, SOCK_DGRAM));
    }

    #[test]
    fn test_syscall_name_to_nr_covers_defaults() {
        // Names in the default deny list that are expected not to resolve.
        let expected_unresolved: &[&str] = &[
            "nfsservctl",
            #[cfg(target_arch = "aarch64")]
            "ioperm",
            #[cfg(target_arch = "aarch64")]
            "iopl",
        ];

        let mut skipped = 0;
        for name in DEFAULT_DENY_SYSCALLS {
            if syscall_name_to_nr(name).is_some() {
                continue;
            }
            assert!(
                expected_unresolved.contains(name),
                "unexpected unresolved syscall: {}",
                name
            );
            skipped += 1;
        }
        assert_eq!(skipped, expected_unresolved.len());
    }
}