use std::ffi::CString;
use std::io;
use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd};
use crate::arch;
use crate::policy::{FsIsolation, Policy};
use crate::seccomp::bpf::{self, stmt, jump};
use crate::sys::structs::{
AF_INET, AF_INET6,
BPF_ABS, BPF_ALU, BPF_AND, BPF_JEQ, BPF_JSET, BPF_JMP, BPF_K, BPF_LD, BPF_RET, BPF_W,
CLONE_NS_FLAGS, DEFAULT_DENY_SYSCALLS, EPERM,
SECCOMP_RET_ALLOW, SECCOMP_RET_ERRNO,
SIOCETHTOOL, SIOCGIFADDR, SIOCGIFBRDADDR, SIOCGIFCONF, SIOCGIFDSTADDR,
SIOCGIFFLAGS, SIOCGIFHWADDR, SIOCGIFINDEX, SIOCGIFNAME, SIOCGIFNETMASK,
SOCK_DGRAM, SOCK_RAW, SOCK_TYPE_MASK, TIOCLINUX, TIOCSTI,
PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER,
OFFSET_ARGS0_LO, OFFSET_ARGS1_LO, OFFSET_ARGS2_LO, OFFSET_ARGS3_LO, OFFSET_NR,
SockFilter,
};
/// Two pipes used for the hand-shake between the sandboxed child and the
/// process that set it up. Both pipes are created close-on-exec by
/// `PipePair::new`.
///
/// Within this file the child writes to `notif_w` and blocks on `ready_r`
/// (see `confine_child`); the opposite ends are presumably consumed by the
/// parent/supervisor side — confirm against the caller.
pub struct PipePair {
/// Read end of the notification pipe.
pub notif_r: OwnedFd,
/// Write end of the notification pipe; the child sends a single `u32` here.
pub notif_w: OwnedFd,
/// Read end of the ready pipe; the child waits for a single `u32` here.
pub ready_r: OwnedFd,
/// Write end of the ready pipe.
pub ready_w: OwnedFd,
}
impl PipePair {
    /// Create both pipes with `O_CLOEXEC` set on every descriptor.
    ///
    /// # Errors
    ///
    /// Returns the OS error reported by `pipe2`. If the second pipe cannot be
    /// created, the first one is closed again before returning and the error
    /// describes the failing `pipe2` call (not the cleanup).
    pub fn new() -> io::Result<Self> {
        // Each pair is owned by `OwnedFd` as soon as it exists, so an early
        // return via `?` closes whatever was already opened. The error value
        // is built inside `cloexec_pipe` before any descriptor is dropped,
        // so `close()` cannot clobber the reported errno.
        let (notif_r, notif_w) = Self::cloexec_pipe()?;
        let (ready_r, ready_w) = Self::cloexec_pipe()?;
        Ok(PipePair {
            notif_r,
            notif_w,
            ready_r,
            ready_w,
        })
    }

    /// Open one close-on-exec pipe and return `(read_end, write_end)`.
    fn cloexec_pipe() -> io::Result<(OwnedFd, OwnedFd)> {
        let mut fds: [libc::c_int; 2] = [-1; 2];
        // SAFETY: `fds` is a valid, writable array of two c_ints, which is
        // exactly what pipe2 requires.
        let ret = unsafe { libc::pipe2(fds.as_mut_ptr(), libc::O_CLOEXEC) };
        if ret < 0 {
            return Err(io::Error::last_os_error());
        }
        // SAFETY: pipe2 succeeded, so both entries are freshly opened
        // descriptors that nothing else owns.
        Ok(unsafe { (OwnedFd::from_raw_fd(fds[0]), OwnedFd::from_raw_fd(fds[1])) })
    }
}
/// Write `val` to `fd` as four little-endian bytes, looping over short writes.
///
/// # Errors
///
/// Returns the OS error from `write(2)`. `EINTR` is retried transparently.
/// A `write` that reports zero bytes written yields `ErrorKind::WriteZero`
/// instead of spinning forever.
pub(crate) fn write_u32_fd(fd: RawFd, val: u32) -> io::Result<()> {
    let buf = val.to_le_bytes();
    let mut written = 0usize;
    while written < buf.len() {
        let pending = &buf[written..];
        // SAFETY: `pending` points at `pending.len()` initialized bytes that
        // stay alive for the duration of the call.
        let ret = unsafe {
            libc::write(
                fd,
                pending.as_ptr() as *const libc::c_void,
                pending.len(),
            )
        };
        if ret < 0 {
            let err = io::Error::last_os_error();
            if err.kind() == io::ErrorKind::Interrupted {
                // A signal arrived before any byte was transferred; retry.
                continue;
            }
            return Err(err);
        }
        if ret == 0 {
            return Err(io::Error::new(
                io::ErrorKind::WriteZero,
                "write returned 0 before 4 bytes written",
            ));
        }
        // `ret` is positive and at most `pending.len()`, so the cast is lossless.
        written += ret as usize;
    }
    Ok(())
}
/// Read exactly four bytes from `fd` and decode them as a little-endian `u32`.
///
/// # Errors
///
/// Returns the OS error from `read(2)`; `EINTR` is retried transparently.
/// If the peer closes the pipe before four bytes arrive, the result is
/// `ErrorKind::UnexpectedEof`.
pub(crate) fn read_u32_fd(fd: RawFd) -> io::Result<u32> {
    let mut buf = [0u8; 4];
    let mut total = 0usize;
    while total < buf.len() {
        let pending = &mut buf[total..];
        // SAFETY: `pending` is a writable region of exactly `pending.len()`
        // bytes that stays alive for the duration of the call.
        let ret = unsafe {
            libc::read(
                fd,
                pending.as_mut_ptr() as *mut libc::c_void,
                pending.len(),
            )
        };
        if ret < 0 {
            let err = io::Error::last_os_error();
            if err.kind() == io::ErrorKind::Interrupted {
                // A signal arrived before any byte was transferred; retry.
                continue;
            }
            return Err(err);
        }
        if ret == 0 {
            return Err(io::Error::new(
                io::ErrorKind::UnexpectedEof,
                "pipe closed before 4 bytes read",
            ));
        }
        // `ret` is positive and at most `pending.len()`, so the cast is lossless.
        total += ret as usize;
    }
    Ok(u32::from_le_bytes(buf))
}
/// Translate a syscall name into its number for the build target.
///
/// Returns `None` when `name` is not in the table below, or when the name
/// maps to an `arch::SYS_*` entry that is `None` (the `?` on those arms
/// propagates the `None`). The number is narrowed from `i64` to `u32` with
/// an `as` cast before being returned.
pub fn syscall_name_to_nr(name: &str) -> Option<u32> {
let nr: i64 = match name {
"mount" => libc::SYS_mount,
"umount2" => libc::SYS_umount2,
"pivot_root" => libc::SYS_pivot_root,
"swapon" => libc::SYS_swapon,
"swapoff" => libc::SYS_swapoff,
"reboot" => libc::SYS_reboot,
"sethostname" => libc::SYS_sethostname,
"setdomainname" => libc::SYS_setdomainname,
"kexec_load" => libc::SYS_kexec_load,
"init_module" => libc::SYS_init_module,
"finit_module" => libc::SYS_finit_module,
"delete_module" => libc::SYS_delete_module,
"unshare" => libc::SYS_unshare,
"setns" => libc::SYS_setns,
"perf_event_open" => libc::SYS_perf_event_open,
"bpf" => libc::SYS_bpf,
"userfaultfd" => libc::SYS_userfaultfd,
"keyctl" => libc::SYS_keyctl,
"add_key" => libc::SYS_add_key,
"request_key" => libc::SYS_request_key,
"ptrace" => libc::SYS_ptrace,
"process_vm_readv" => libc::SYS_process_vm_readv,
"process_vm_writev" => libc::SYS_process_vm_writev,
"open_by_handle_at" => libc::SYS_open_by_handle_at,
"name_to_handle_at" => libc::SYS_name_to_handle_at,
// `arch::SYS_*` entries are optional; `?` yields `None` when the entry is absent.
"ioperm" => arch::SYS_IOPERM?,
"iopl" => arch::SYS_IOPL?,
"quotactl" => libc::SYS_quotactl,
"acct" => libc::SYS_acct,
"lookup_dcookie" => libc::SYS_lookup_dcookie,
"personality" => libc::SYS_personality,
"io_uring_setup" => libc::SYS_io_uring_setup,
"io_uring_enter" => libc::SYS_io_uring_enter,
"io_uring_register" => libc::SYS_io_uring_register,
"clone" => libc::SYS_clone,
"clone3" => libc::SYS_clone3,
"vfork" => arch::SYS_VFORK?,
"mmap" => libc::SYS_mmap,
"munmap" => libc::SYS_munmap,
"brk" => libc::SYS_brk,
"mremap" => libc::SYS_mremap,
"connect" => libc::SYS_connect,
"sendto" => libc::SYS_sendto,
"sendmsg" => libc::SYS_sendmsg,
"ioctl" => libc::SYS_ioctl,
"socket" => libc::SYS_socket,
"prctl" => libc::SYS_prctl,
"getrandom" => libc::SYS_getrandom,
"openat" => libc::SYS_openat,
"open" => arch::SYS_OPEN?,
"getdents64" => libc::SYS_getdents64,
"getdents" => arch::SYS_GETDENTS?,
"bind" => libc::SYS_bind,
"getsockname" => libc::SYS_getsockname,
"clock_gettime" => libc::SYS_clock_gettime,
"gettimeofday" => libc::SYS_gettimeofday,
"time" => arch::SYS_TIME?,
"clock_nanosleep" => libc::SYS_clock_nanosleep,
"timerfd_settime" => libc::SYS_timerfd_settime,
"timer_settime" => libc::SYS_timer_settime,
"execve" => libc::SYS_execve,
"execveat" => libc::SYS_execveat,
"unlinkat" => libc::SYS_unlinkat,
"mkdirat" => libc::SYS_mkdirat,
"renameat2" => libc::SYS_renameat2,
"newfstatat" => libc::SYS_newfstatat,
"statx" => libc::SYS_statx,
"faccessat" => libc::SYS_faccessat,
"symlinkat" => libc::SYS_symlinkat,
"linkat" => libc::SYS_linkat,
"fchmodat" => libc::SYS_fchmodat,
"fchownat" => libc::SYS_fchownat,
"readlinkat" => libc::SYS_readlinkat,
"truncate" => libc::SYS_truncate,
"utimensat" => libc::SYS_utimensat,
// Path-based (non-`*at`) variants, all taken from the optional `arch` table.
"unlink" => arch::SYS_UNLINK?,
"rmdir" => arch::SYS_RMDIR?,
"mkdir" => arch::SYS_MKDIR?,
"rename" => arch::SYS_RENAME?,
"stat" => arch::SYS_STAT?,
"lstat" => arch::SYS_LSTAT?,
"access" => arch::SYS_ACCESS?,
"symlink" => arch::SYS_SYMLINK?,
"link" => arch::SYS_LINK?,
"chmod" => arch::SYS_CHMOD?,
"chown" => arch::SYS_CHOWN?,
"lchown" => arch::SYS_LCHOWN?,
"readlink" => arch::SYS_READLINK?,
"futimesat" => arch::SYS_FUTIMESAT?,
"fork" => arch::SYS_FORK?,
// Unknown name: nothing to resolve.
_ => return None,
};
Some(nr as u32)
}
/// Build the list of syscall numbers that should be routed to the seccomp
/// user-notification path for `policy`.
///
/// A fixed baseline is always present; each enabled policy feature adds the
/// syscalls it relies on. Groups may overlap — the result is sorted and
/// de-duplicated before it is returned.
pub fn notif_syscalls(policy: &Policy) -> Vec<u32> {
    // faccessat2 is listed by raw number rather than via a `libc::SYS_*` constant.
    const SYS_FACCESSAT2: u32 = 439;

    let mut out: Vec<u32> = Vec::new();

    // Process creation and reaping: always included.
    out.extend_from_slice(&[
        libc::SYS_clone as u32,
        libc::SYS_clone3 as u32,
        libc::SYS_wait4 as u32,
        libc::SYS_waitid as u32,
    ]);
    arch::push_optional_syscall(&mut out, arch::SYS_VFORK);

    // Memory limit.
    if policy.max_memory.is_some() {
        out.extend_from_slice(&[
            libc::SYS_mmap as u32,
            libc::SYS_munmap as u32,
            libc::SYS_brk as u32,
            libc::SYS_mremap as u32,
            libc::SYS_shmget as u32,
        ]);
    }

    // Outbound network filtering.
    let wants_net_hooks = policy.net_allow_hosts.is_some()
        || policy.policy_fn.is_some()
        || !policy.http_allow.is_empty()
        || !policy.http_deny.is_empty();
    if wants_net_hooks {
        out.extend_from_slice(&[
            libc::SYS_connect as u32,
            libc::SYS_sendto as u32,
            libc::SYS_sendmsg as u32,
            libc::SYS_bind as u32,
        ]);
    }

    // Seeded randomness.
    if policy.random_seed.is_some() {
        out.extend_from_slice(&[libc::SYS_getrandom as u32, libc::SYS_openat as u32]);
    }

    // Time start.
    if policy.time_start.is_some() {
        out.extend_from_slice(&[
            libc::SYS_clock_nanosleep as u32,
            libc::SYS_timerfd_settime as u32,
            libc::SYS_timer_settime as u32,
            libc::SYS_openat as u32,
        ]);
    }

    // Unconditional file, directory and socket baseline.
    out.extend_from_slice(&[libc::SYS_openat as u32, libc::SYS_getdents64 as u32]);
    arch::push_optional_syscall(&mut out, arch::SYS_GETDENTS);
    out.extend_from_slice(&[
        libc::SYS_socket as u32,
        libc::SYS_bind as u32,
        libc::SYS_getsockname as u32,
        libc::SYS_recvfrom as u32,
        libc::SYS_recvmsg as u32,
        libc::SYS_close as u32,
    ]);

    if policy.num_cpus.is_some() {
        out.push(libc::SYS_sched_getaffinity as u32);
    }

    if policy.hostname.is_some() {
        out.extend_from_slice(&[libc::SYS_uname as u32, libc::SYS_openat as u32]);
    }

    // Workdir without filesystem isolation.
    if policy.workdir.is_some() && policy.fs_isolation == FsIsolation::None {
        out.extend_from_slice(&[
            libc::SYS_openat as u32,
            libc::SYS_unlinkat as u32,
            libc::SYS_mkdirat as u32,
            libc::SYS_renameat2 as u32,
            libc::SYS_symlinkat as u32,
            libc::SYS_linkat as u32,
            libc::SYS_fchmodat as u32,
            libc::SYS_fchownat as u32,
            libc::SYS_truncate as u32,
            libc::SYS_utimensat as u32,
            libc::SYS_newfstatat as u32,
            libc::SYS_statx as u32,
            libc::SYS_faccessat as u32,
            SYS_FACCESSAT2,
            libc::SYS_readlinkat as u32,
            libc::SYS_getdents64 as u32,
            libc::SYS_chdir as u32,
            libc::SYS_getcwd as u32,
        ]);
        let legacy = [
            arch::SYS_OPEN,
            arch::SYS_UNLINK,
            arch::SYS_RMDIR,
            arch::SYS_MKDIR,
            arch::SYS_RENAME,
            arch::SYS_SYMLINK,
            arch::SYS_LINK,
            arch::SYS_CHMOD,
            arch::SYS_CHOWN,
            arch::SYS_LCHOWN,
            arch::SYS_STAT,
            arch::SYS_LSTAT,
            arch::SYS_ACCESS,
            arch::SYS_READLINK,
            arch::SYS_GETDENTS,
        ];
        legacy
            .into_iter()
            .for_each(|nr| arch::push_optional_syscall(&mut out, nr));
    }

    // Chroot.
    if policy.chroot.is_some() {
        out.extend_from_slice(&[
            libc::SYS_openat as u32,
            libc::SYS_execve as u32,
            libc::SYS_execveat as u32,
            libc::SYS_unlinkat as u32,
            libc::SYS_mkdirat as u32,
            libc::SYS_renameat2 as u32,
            libc::SYS_symlinkat as u32,
            libc::SYS_linkat as u32,
            libc::SYS_fchmodat as u32,
            libc::SYS_fchownat as u32,
            libc::SYS_truncate as u32,
            libc::SYS_newfstatat as u32,
            libc::SYS_statx as u32,
            libc::SYS_faccessat as u32,
            SYS_FACCESSAT2,
            libc::SYS_readlinkat as u32,
            libc::SYS_getdents64 as u32,
            libc::SYS_chdir as u32,
            libc::SYS_getcwd as u32,
            libc::SYS_statfs as u32,
            libc::SYS_utimensat as u32,
        ]);
        let legacy = [
            arch::SYS_OPEN,
            arch::SYS_STAT,
            arch::SYS_LSTAT,
            arch::SYS_ACCESS,
            arch::SYS_READLINK,
            arch::SYS_GETDENTS,
            arch::SYS_UNLINK,
            arch::SYS_RMDIR,
            arch::SYS_MKDIR,
            arch::SYS_RENAME,
            arch::SYS_SYMLINK,
            arch::SYS_LINK,
            arch::SYS_CHMOD,
            arch::SYS_CHOWN,
            arch::SYS_LCHOWN,
        ];
        legacy
            .into_iter()
            .for_each(|nr| arch::push_optional_syscall(&mut out, nr));
    }

    // Denied filesystem paths.
    if !policy.fs_denied.is_empty() {
        out.extend_from_slice(&[
            libc::SYS_openat as u32,
            libc::SYS_execve as u32,
            libc::SYS_execveat as u32,
            libc::SYS_linkat as u32,
            libc::SYS_renameat2 as u32,
            libc::SYS_symlinkat as u32,
        ]);
        let legacy = [
            arch::SYS_OPEN,
            arch::SYS_LINK,
            arch::SYS_RENAME,
            arch::SYS_SYMLINK,
        ];
        legacy
            .into_iter()
            .for_each(|nr| arch::push_optional_syscall(&mut out, nr));
    }

    // User-supplied policy callback.
    if policy.policy_fn.is_some() {
        out.extend_from_slice(&[
            libc::SYS_openat as u32,
            libc::SYS_connect as u32,
            libc::SYS_sendto as u32,
            libc::SYS_bind as u32,
            libc::SYS_execve as u32,
            libc::SYS_execveat as u32,
        ]);
    }

    // Port remapping.
    if policy.port_remap {
        out.extend_from_slice(&[libc::SYS_bind as u32, libc::SYS_getsockname as u32]);
    }

    // Collapse the overlap between groups into a sorted, unique list.
    out.sort_unstable();
    out.dedup();
    out
}
/// Resolve every name in `NO_SUPERVISOR_DENY_SYSCALLS` to a syscall number.
///
/// Names that `syscall_name_to_nr` cannot resolve are skipped silently.
pub fn no_supervisor_deny_syscall_numbers() -> Vec<u32> {
    use crate::sys::structs::NO_SUPERVISOR_DENY_SYSCALLS;

    let mut numbers = Vec::new();
    for name in NO_SUPERVISOR_DENY_SYSCALLS.iter() {
        if let Some(nr) = syscall_name_to_nr(name) {
            numbers.push(nr);
        }
    }
    numbers
}
pub fn deny_syscall_numbers(policy: &Policy) -> Vec<u32> {
if let Some(ref names) = policy.deny_syscalls {
names
.iter()
.filter_map(|n| syscall_name_to_nr(n))
.collect()
} else if policy.allow_syscalls.is_none() {
DEFAULT_DENY_SYSCALLS
.iter()
.filter_map(|n| syscall_name_to_nr(n))
.collect()
} else {
Vec::new()
}
}
/// Build the argument-inspecting part of the seccomp BPF program.
///
/// The returned instructions form a sequence of independent sections. Each
/// section reloads the syscall number, compares it against one syscall and,
/// on mismatch, jumps over the rest of that section so execution falls
/// through to the next one. The jump distances are instruction counts and
/// must be kept in sync with the instructions pushed after them.
///
/// Sections, in order: `clone` namespace flags, `ioctl` commands, `prctl`
/// operations, optional `socket` type blocking (only when the policy sets
/// `no_raw_sockets` and/or `no_udp`), then `wait4` and `waitid`.
pub fn arg_filters(policy: &Policy) -> Vec<SockFilter> {
// Return value used by every "deny" branch below: fail the syscall with EPERM.
let ret_errno = SECCOMP_RET_ERRNO | EPERM as u32;
let nr_clone = libc::SYS_clone as u32;
let nr_ioctl = libc::SYS_ioctl as u32;
let nr_prctl = libc::SYS_prctl as u32;
let nr_socket = libc::SYS_socket as u32;
let mut insns: Vec<SockFilter> = Vec::new();
// --- clone: EPERM when any bit of CLONE_NS_FLAGS is set in arg0 ---
insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
// Not clone: skip the 3 instructions of this section.
insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_clone, 0, 3));
insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
// No flag bit set: skip the RET.
insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, CLONE_NS_FLAGS as u32, 0, 1));
insns.push(stmt(BPF_RET | BPF_K, ret_errno));
// --- ioctl: EPERM when arg1 (the request) equals any listed command ---
let dangerous_ioctls: &[u32] = &[
TIOCSTI as u32,
TIOCLINUX as u32,
SIOCGIFNAME as u32,
SIOCGIFCONF as u32,
SIOCGIFFLAGS as u32,
SIOCGIFADDR as u32,
SIOCGIFDSTADDR as u32,
SIOCGIFBRDADDR as u32,
SIOCGIFNETMASK as u32,
SIOCGIFHWADDR as u32,
SIOCGIFINDEX as u32,
SIOCETHTOOL as u32,
];
let n_ioctls = dangerous_ioctls.len();
// Section length after the syscall-number test: 1 load + (JEQ, RET) per command.
let skip_count = (1 + n_ioctls * 2) as u8;
insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_ioctl, 0, skip_count));
insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
for &cmd in dangerous_ioctls {
// Match falls into the RET; mismatch skips it and tries the next command.
insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, cmd, 0, 1));
insns.push(stmt(BPF_RET | BPF_K, ret_errno));
}
// --- prctl: EPERM when arg0 (the option) equals any listed operation ---
let dangerous_prctl_ops: &[u32] = &[PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER];
let n_ops = dangerous_prctl_ops.len();
// Same layout as the ioctl section: 1 load + (JEQ, RET) per operation.
let skip_count = (1 + n_ops * 2) as u8;
insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_prctl, 0, skip_count));
insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
for &op in dangerous_prctl_ops {
insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, op, 0, 1));
insns.push(stmt(BPF_RET | BPF_K, ret_errno));
}
// --- socket: optionally EPERM for selected socket types on AF_INET/AF_INET6 ---
let mut blocked_types: Vec<u32> = Vec::new();
if policy.no_raw_sockets {
blocked_types.push(SOCK_RAW);
}
if policy.no_udp {
blocked_types.push(SOCK_DGRAM);
}
if !blocked_types.is_empty() {
let n = blocked_types.len();
// Instructions following the AF_INET6 test: load arg1, mask, n type tests, RET.
let after_domain = 2 + n + 1;
// Instructions following the syscall-number test: load arg0, two domain
// tests, then everything counted in `after_domain`.
let skip_all = (3 + after_domain) as u8;
insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_socket, 0, skip_all));
insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
// AF_INET: jump over the AF_INET6 test straight to the type check.
insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET, 1, 0));
// Neither AF_INET nor AF_INET6: leave the section.
insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET6, 0, after_domain as u8));
insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
// Reduce arg1 to the bits selected by SOCK_TYPE_MASK before comparing.
insns.push(stmt(BPF_ALU | BPF_AND | BPF_K, SOCK_TYPE_MASK));
for (i, &sock_type) in blocked_types.iter().enumerate() {
// A match jumps over the remaining type tests onto the shared RET;
// a mismatch on the last test jumps over that RET instead.
let remaining = n - i - 1;
let jf: u8 = if remaining == 0 { 1 } else { 0 };
insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, sock_type, remaining as u8, jf));
}
insns.push(stmt(BPF_RET | BPF_K, ret_errno));
}
// --- wait4: return ALLOW immediately when arg2 has WNOHANG or 0x0100_0000 set ---
{
let nr_wait4 = libc::SYS_wait4 as u32;
// 0x0100_0000 is presumably WNOWAIT (going by the variable name) — confirm.
let wnohang_or_wnowait = (libc::WNOHANG | 0x0100_0000) as u32;
insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_wait4, 0, 3));
insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS2_LO));
insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, wnohang_or_wnowait, 0, 1));
insns.push(stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
}
// --- waitid: same test as wait4, but the flags are read from arg3 ---
{
let nr_waitid = libc::SYS_waitid as u32;
let wnohang_or_wnowait = (libc::WNOHANG | 0x0100_0000) as u32;
insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_waitid, 0, 3));
insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS3_LO));
insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, wnohang_or_wnowait, 0, 1));
insns.push(stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
}
insns
}
/// Close every open descriptor numbered strictly above `min_fd`, except those
/// listed in `keep`.
///
/// The set of open descriptors is taken from `/proc/self/fd`. If that
/// directory cannot be read the function returns without closing anything.
/// Entries whose names are not valid descriptor numbers are ignored.
fn close_fds_above(min_fd: RawFd, keep: &[RawFd]) {
    let Ok(entries) = std::fs::read_dir("/proc/self/fd") else {
        return;
    };

    // Gather the candidates first and only close them once the directory
    // iterator (which is consumed by the loop) has been dropped.
    let mut doomed: Vec<RawFd> = Vec::new();
    for entry in entries {
        let Ok(entry) = entry else { continue };
        let Ok(name) = entry.file_name().into_string() else {
            continue;
        };
        let Ok(fd) = name.parse::<RawFd>() else {
            continue;
        };
        if fd <= min_fd || keep.contains(&fd) {
            continue;
        }
        doomed.push(fd);
    }

    for fd in doomed {
        // Result ignored on purpose: a number that is no longer open is harmless.
        unsafe { libc::close(fd) };
    }
}
pub(crate) use crate::cow::ChildMountConfig;
/// Write single-entry uid/gid maps for the current process.
///
/// Writes, in this order: `/proc/self/uid_map` (`target_uid real_uid 1`),
/// `/proc/self/setgroups` (`deny`), and `/proc/self/gid_map`
/// (`target_gid real_gid 1`). Every write is best-effort; failures are ignored.
fn write_id_maps(real_uid: u32, real_gid: u32, target_uid: u32, target_gid: u32) {
    let uid_line = format!("{} {} 1\n", target_uid, real_uid);
    let gid_line = format!("{} {} 1\n", target_gid, real_gid);

    // `setgroups` is written between the two maps, as in the sequence above.
    let steps: [(&str, &str); 3] = [
        ("/proc/self/uid_map", uid_line.as_str()),
        ("/proc/self/setgroups", "deny\n"),
        ("/proc/self/gid_map", gid_line.as_str()),
    ];
    for (path, contents) in steps {
        let _ = std::fs::write(path, contents);
    }
}
/// Map the uid/gid currently reported by `getuid()`/`getgid()` to 0/0 via
/// `write_id_maps`.
///
/// NOTE(review): if this runs after `unshare(CLONE_NEWUSER)` and before any
/// map has been written, `getuid()`/`getgid()` are expected to report the
/// overflow ID rather than the caller's original ID — confirm that this is
/// the intended "outer" ID for the map.
fn write_id_maps_overflow() {
    // SAFETY: getuid/getgid take no arguments and cannot fail.
    let (current_uid, current_gid) = unsafe { (libc::getuid(), libc::getgid()) };
    write_id_maps(current_uid, current_gid, 0, 0);
}
/// Apply `policy` to the current (child) process and then exec `cmd`.
///
/// This function never returns: it either replaces the process image via
/// `execvp`, or prints a `sandlock child: ...` diagnostic to stderr and
/// terminates with `_exit(127)`.
///
/// * `policy` — confinement settings to apply.
/// * `cmd` — program and arguments; `cmd[0]` is the program to exec and must exist.
/// * `pipes` — hand-shake pipes: one `u32` is written to `notif_w`, then one
///   `u32` is awaited on `ready_r` before exec.
/// * `cow_config` — when present, an overlay filesystem is mounted inside a
///   fresh user + mount namespace.
/// * `nested` — when true only the deny filter is installed and `0` is sent on
///   `notif_w`; otherwise the notification filter is installed and the number
///   of its listener descriptor is sent.
/// * `keep_fds` — descriptors above 2 that must survive the pre-exec cleanup.
pub(crate) fn confine_child(
    policy: &Policy,
    cmd: &[CString],
    pipes: &PipePair,
    cow_config: Option<&ChildMountConfig>,
    nested: bool,
    keep_fds: &[RawFd],
) -> ! {
    // Print `<msg>: <last OS error>` to stderr and terminate immediately.
    // `_exit` is used so no destructors or atexit handlers run.
    macro_rules! fail {
        ($msg:expr) => {{
            let err = std::io::Error::last_os_error();
            let _ = write!(std::io::stderr(), "sandlock child: {}: {}\n", $msg, err);
            unsafe { libc::_exit(127) };
        }};
    }
    use std::io::Write;

    // Become the leader of a new process group.
    if unsafe { libc::setpgid(0, 0) } != 0 {
        fail!("setpgid");
    }

    // If stdin is a terminal, make the new group its foreground group.
    // SIGTTOU is ignored around the call and reset to default afterwards.
    if unsafe { libc::isatty(0) } == 1 {
        unsafe {
            libc::signal(libc::SIGTTOU, libc::SIG_IGN);
            libc::tcsetpgrp(0, libc::getpgrp());
            libc::signal(libc::SIGTTOU, libc::SIG_DFL);
        }
    }

    // Ask for SIGKILL when the parent dies, then treat a parent pid of 1 as
    // "the parent already died before the request took effect".
    if unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) } != 0 {
        fail!("prctl(PR_SET_PDEATHSIG)");
    }
    if unsafe { libc::getppid() } == 1 {
        fail!("parent died before confinement");
    }

    // Optional: add ADDR_NO_RANDOMIZE to the current personality.
    if policy.no_randomize_memory {
        const ADDR_NO_RANDOMIZE: libc::c_ulong = 0x0040000;
        // 0xffffffff queries the current personality without changing it.
        let current = unsafe { libc::personality(0xffffffff) };
        if current == -1 {
            fail!("personality(query)");
        }
        if unsafe { libc::personality(current as libc::c_ulong | ADDR_NO_RANDOMIZE) } == -1 {
            fail!("personality(ADDR_NO_RANDOMIZE)");
        }
    }

    // Optional: pin to the requested CPU cores.
    if let Some(ref cores) = policy.cpu_cores {
        if !cores.is_empty() {
            let mut set = unsafe { std::mem::zeroed::<libc::cpu_set_t>() };
            unsafe { libc::CPU_ZERO(&mut set) };
            for &core in cores {
                unsafe { libc::CPU_SET(core as usize, &mut set) };
            }
            if unsafe {
                libc::sched_setaffinity(
                    0,
                    std::mem::size_of::<libc::cpu_set_t>(),
                    &set,
                )
            } != 0
            {
                fail!("sched_setaffinity");
            }
        }
    }

    // Optional: disable transparent huge pages.
    if policy.no_huge_pages {
        if unsafe { libc::prctl(libc::PR_SET_THP_DISABLE, 1, 0, 0, 0) } != 0 {
            fail!("prctl(PR_SET_THP_DISABLE)");
        }
    }

    // Optional: forbid core dumps.
    if policy.no_coredump {
        let rlim = libc::rlimit { rlim_cur: 0, rlim_max: 0 };
        if unsafe { libc::setrlimit(libc::RLIMIT_CORE, &rlim) } != 0 {
            fail!("setrlimit(RLIMIT_CORE, 0)");
        }
    }

    // Sample the IDs *before* any unshare(CLONE_NEWUSER): they are the
    // "outer" side of the uid/gid maps written below.
    let real_uid = unsafe { libc::getuid() };
    let real_gid = unsafe { libc::getgid() };

    // Optional: user namespace with a custom uid (only without an overlay;
    // the overlay branch below creates its own user namespace).
    if let Some(target_uid) = policy.uid {
        if cow_config.is_none() {
            if unsafe { libc::unshare(libc::CLONE_NEWUSER) } != 0 {
                fail!("unshare(CLONE_NEWUSER)");
            }
            // NOTE(review): `target_uid` is also passed as the target gid —
            // confirm this is intended.
            write_id_maps(real_uid, real_gid, target_uid, target_uid);
        }
    }

    // Optional: overlay mount inside a new user + mount namespace.
    if let Some(ref cow) = cow_config {
        if unsafe { libc::unshare(libc::CLONE_NEWUSER | libc::CLONE_NEWNS) } != 0 {
            fail!("unshare(CLONE_NEWUSER | CLONE_NEWNS)");
        }
        // Map the pre-unshare IDs to 0/0. Calling getuid()/getgid() here
        // would report the overflow ID (no map exists yet), which the kernel
        // does not accept as the outer ID of a single-line map.
        write_id_maps(real_uid, real_gid, 0, 0);

        let lowerdir = cow
            .lowers
            .iter()
            .map(|p| p.display().to_string())
            .collect::<Vec<_>>()
            .join(":");
        let opts = format!(
            "lowerdir={},upperdir={},workdir={}",
            lowerdir,
            cow.upper.display(),
            cow.work.display(),
        );
        let mount_cstr = match CString::new(cow.mount_point.to_str().unwrap_or("")) {
            Ok(c) => c,
            Err(_) => fail!("invalid overlay mount point path"),
        };
        let overlay_cstr = CString::new("overlay").unwrap();
        let opts_cstr = match CString::new(opts) {
            Ok(c) => c,
            Err(_) => fail!("invalid overlay opts"),
        };
        let ret = unsafe {
            libc::mount(
                overlay_cstr.as_ptr(),
                mount_cstr.as_ptr(),
                overlay_cstr.as_ptr(),
                0,
                opts_cstr.as_ptr() as *const libc::c_void,
            )
        };
        if ret != 0 {
            fail!("mount overlay");
        }
    }

    // Working directory: explicit cwd (re-rooted under the chroot when one is
    // set), else the chroot root, else the workdir, else unchanged.
    let effective_cwd = if let Some(ref cwd) = policy.cwd {
        if let Some(ref chroot_root) = policy.chroot {
            Some(chroot_root.join(cwd.strip_prefix("/").unwrap_or(cwd)))
        } else {
            Some(cwd.clone())
        }
    } else if let Some(ref chroot_root) = policy.chroot {
        Some(chroot_root.to_path_buf())
    } else if let Some(ref workdir) = policy.workdir {
        Some(workdir.clone())
    } else {
        None
    };
    if let Some(ref cwd) = effective_cwd {
        let c_path = match CString::new(cwd.as_os_str().as_encoded_bytes()) {
            Ok(c) => c,
            Err(_) => fail!("invalid cwd path"),
        };
        if unsafe { libc::chdir(c_path.as_ptr()) } != 0 {
            fail!("chdir");
        }
    }

    // no_new_privs is set before landlock and the seccomp filters are applied.
    if unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) } != 0 {
        fail!("prctl(PR_SET_NO_NEW_PRIVS)");
    }

    if let Err(e) = crate::landlock::confine(policy) {
        fail!(format!("landlock: {}", e));
    }

    // Seccomp: assemble and install the filter, then report on `notif_w`.
    let deny = deny_syscall_numbers(policy);
    let args = arg_filters(policy);
    // Number of the notification listener descriptor, or -1 in nested mode.
    let mut keep_fd: i32 = -1;
    if nested {
        // Nested: no notification list, deny filter only; `0` is sent.
        let filter = match bpf::assemble_filter(&[], &deny, &args) {
            Ok(f) => f,
            Err(e) => fail!(format!("seccomp assemble: {}", e)),
        };
        if let Err(e) = bpf::install_deny_filter(&filter) {
            fail!(format!("seccomp deny filter: {}", e));
        }
        if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), 0) {
            fail!(format!("write nested signal: {}", e));
        }
    } else {
        let notif = notif_syscalls(policy);
        let filter = match bpf::assemble_filter(&notif, &deny, &args) {
            Ok(f) => f,
            Err(e) => fail!(format!("seccomp assemble: {}", e)),
        };
        let notif_fd = match bpf::install_filter(&filter) {
            Ok(fd) => fd,
            Err(e) => fail!(format!("seccomp install: {}", e)),
        };
        keep_fd = notif_fd.as_raw_fd();
        if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), keep_fd as u32) {
            fail!(format!("write notif fd: {}", e));
        }
        // Leak the handle so the descriptor stays open; its number was just
        // sent over the pipe and it is kept across the cleanup below.
        std::mem::forget(notif_fd);
    }

    crate::sandbox::CONFINED.store(true, std::sync::atomic::Ordering::Relaxed);

    // Block until the other side signals readiness; the value itself is unused.
    match read_u32_fd(pipes.ready_r.as_raw_fd()) {
        Ok(_) => {}
        Err(e) => fail!(format!("read ready signal: {}", e)),
    }

    // Close everything above stderr except the explicitly kept descriptors.
    let mut fds_to_keep: Vec<RawFd> = keep_fds.to_vec();
    if keep_fd >= 0 {
        fds_to_keep.push(keep_fd);
    }
    close_fds_above(2, &fds_to_keep);

    // Environment: optionally start from empty, then apply policy overrides.
    if policy.clean_env {
        for (key, _) in std::env::vars_os() {
            std::env::remove_var(&key);
        }
    }
    for (key, value) in &policy.env {
        std::env::set_var(key, value);
    }
    if let Some(ref devices) = policy.gpu_devices {
        if !devices.is_empty() {
            let vis = devices.iter().map(|d| d.to_string()).collect::<Vec<_>>().join(",");
            std::env::set_var("CUDA_VISIBLE_DEVICES", &vis);
            std::env::set_var("ROCR_VISIBLE_DEVICES", &vis);
        }
    }

    debug_assert!(!cmd.is_empty(), "cmd must not be empty");
    // NULL-terminated argv borrowing from `cmd`.
    let argv_ptrs: Vec<*const libc::c_char> = cmd
        .iter()
        .map(|s| s.as_ptr())
        .chain(std::iter::once(std::ptr::null()))
        .collect();

    if policy.chroot.is_some() {
        // The program path is copied into a zero-filled PATH_MAX-sized buffer
        // and exec'd from there.
        // NOTE(review): presumably so the path can be rewritten in place by
        // the supervisor — confirm.
        let mut exec_path = vec![0u8; libc::PATH_MAX as usize];
        let orig = cmd[0].as_bytes_with_nul();
        exec_path[..orig.len()].copy_from_slice(orig);
        unsafe {
            libc::execvp(
                exec_path.as_ptr() as *const libc::c_char,
                argv_ptrs.as_ptr(),
            )
        };
    } else {
        unsafe { libc::execvp(argv_ptrs[0], argv_ptrs.as_ptr()) };
    }

    // execvp only returns on failure.
    fail!(format!("execvp '{}'", cmd[0].to_string_lossy()));
}
// Unit tests for the pipe helpers, the syscall lists and the BPF argument filters.
#[cfg(test)]
mod tests {
use super::*;
// All four pipe ends are valid and pairwise distinct descriptors.
#[test]
fn test_pipe_pair_creation() {
let pipes = PipePair::new().expect("pipe creation failed");
assert!(pipes.notif_r.as_raw_fd() >= 0);
assert!(pipes.notif_w.as_raw_fd() >= 0);
assert!(pipes.ready_r.as_raw_fd() >= 0);
assert!(pipes.ready_w.as_raw_fd() >= 0);
let fds = [
pipes.notif_r.as_raw_fd(),
pipes.notif_w.as_raw_fd(),
pipes.ready_r.as_raw_fd(),
pipes.ready_w.as_raw_fd(),
];
for i in 0..4 {
for j in (i + 1)..4 {
assert_ne!(fds[i], fds[j]);
}
}
}
// A small value survives a write/read round trip through a pipe.
#[test]
fn test_write_read_u32() {
let pipes = PipePair::new().expect("pipe creation failed");
let val = 42u32;
write_u32_fd(pipes.notif_w.as_raw_fd(), val).expect("write failed");
let got = read_u32_fd(pipes.notif_r.as_raw_fd()).expect("read failed");
assert_eq!(got, val);
}
// Same round trip with a value that uses all four bytes.
#[test]
fn test_write_read_u32_large() {
let pipes = PipePair::new().expect("pipe creation failed");
let val = 0xDEAD_BEEFu32;
write_u32_fd(pipes.notif_w.as_raw_fd(), val).expect("write failed");
let got = read_u32_fd(pipes.notif_r.as_raw_fd()).expect("read failed");
assert_eq!(got, val);
}
// clone/clone3 (and vfork where the arch table has it) are present for a default policy.
#[test]
fn test_notif_syscalls_always_has_clone() {
let policy = Policy::builder().build().unwrap();
let nrs = notif_syscalls(&policy);
assert!(nrs.contains(&(libc::SYS_clone as u32)));
assert!(nrs.contains(&(libc::SYS_clone3 as u32)));
if let Some(vfork) = arch::SYS_VFORK {
assert!(nrs.contains(&(vfork as u32)));
}
}
// Setting max_memory adds the memory-management syscalls.
#[test]
fn test_notif_syscalls_memory() {
let policy = Policy::builder()
.max_memory(crate::policy::ByteSize::mib(256))
.build()
.unwrap();
let nrs = notif_syscalls(&policy);
assert!(nrs.contains(&(libc::SYS_mmap as u32)));
assert!(nrs.contains(&(libc::SYS_munmap as u32)));
assert!(nrs.contains(&(libc::SYS_brk as u32)));
assert!(nrs.contains(&(libc::SYS_mremap as u32)));
assert!(nrs.contains(&(libc::SYS_shmget as u32)));
}
// Allowing a network host adds connect/sendto/sendmsg.
#[test]
fn test_notif_syscalls_net() {
let policy = Policy::builder()
.net_allow_host("example.com")
.build()
.unwrap();
let nrs = notif_syscalls(&policy);
assert!(nrs.contains(&(libc::SYS_connect as u32)));
assert!(nrs.contains(&(libc::SYS_sendto as u32)));
assert!(nrs.contains(&(libc::SYS_sendmsg as u32)));
}
// faccessat2 (raw number 439) is included for both the chroot and the workdir policies.
#[test]
fn test_notif_syscalls_faccessat2() {
const SYS_FACCESSAT2: u32 = 439;
let policy = Policy::builder()
.chroot("/tmp")
.build()
.unwrap();
let nrs = notif_syscalls(&policy);
assert!(nrs.contains(&(libc::SYS_faccessat as u32)));
assert!(nrs.contains(&SYS_FACCESSAT2),
"chroot notif filter must include SYS_faccessat2 (439)");
let policy = Policy::builder()
.workdir("/tmp")
.build()
.unwrap();
let nrs = notif_syscalls(&policy);
assert!(nrs.contains(&(libc::SYS_faccessat as u32)));
assert!(nrs.contains(&SYS_FACCESSAT2),
"COW notif filter must include SYS_faccessat2 (439)");
}
// With no explicit lists, the default deny list is used.
#[test]
fn test_deny_syscall_numbers_default() {
let policy = Policy::builder().build().unwrap();
let nrs = deny_syscall_numbers(&policy);
assert!(nrs.contains(&(libc::SYS_mount as u32)));
assert!(nrs.contains(&(libc::SYS_ptrace as u32)));
assert!(nrs.contains(&(libc::SYS_bpf as u32)));
assert!(!nrs.is_empty());
}
// An explicit deny list replaces the default one.
#[test]
fn test_deny_syscall_numbers_custom() {
let policy = Policy::builder()
.deny_syscalls(vec!["mount".into(), "ptrace".into()])
.build()
.unwrap();
let nrs = deny_syscall_numbers(&policy);
assert_eq!(nrs.len(), 2);
assert!(nrs.contains(&(libc::SYS_mount as u32)));
assert!(nrs.contains(&(libc::SYS_ptrace as u32)));
}
// With only an allow list, the deny list is empty.
#[test]
fn test_deny_syscall_numbers_empty_when_allow_set() {
let policy = Policy::builder()
.allow_syscalls(vec!["read".into(), "write".into()])
.build()
.unwrap();
let nrs = deny_syscall_numbers(&policy);
assert!(nrs.is_empty());
}
// The default filter contains the clone, ioctl and prctl comparisons.
#[test]
fn test_arg_filters_has_clone_ioctl_prctl_socket() {
use crate::sys::structs::{
BPF_JEQ, BPF_JSET, BPF_JMP, BPF_K,
};
let policy = Policy::builder().build().unwrap();
let filters = arg_filters(&policy);
assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
&& f.k == libc::SYS_clone as u32));
assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JSET | BPF_K)
&& f.k == CLONE_NS_FLAGS as u32));
assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
&& f.k == libc::SYS_ioctl as u32));
assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
&& f.k == TIOCSTI as u32));
assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
&& f.k == TIOCLINUX as u32));
assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
&& f.k == SIOCGIFCONF as u32));
assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
&& f.k == SIOCETHTOOL as u32));
assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
&& f.k == libc::SYS_prctl as u32));
assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
&& f.k == PR_SET_DUMPABLE));
}
// no_raw_sockets emits the domain tests, the type mask and the SOCK_RAW comparison.
#[test]
fn test_arg_filters_raw_sockets() {
use crate::sys::structs::{BPF_ALU, BPF_AND, BPF_JEQ, BPF_JMP, BPF_K};
let policy = Policy::builder().no_raw_sockets(true).build().unwrap();
let filters = arg_filters(&policy);
assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
&& f.k == AF_INET));
assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
&& f.k == AF_INET6));
assert!(filters.iter().any(|f| f.code == (BPF_ALU | BPF_AND | BPF_K)
&& f.k == SOCK_TYPE_MASK));
assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
&& f.k == SOCK_RAW));
}
// no_udp emits the SOCK_DGRAM comparison.
#[test]
fn test_arg_filters_no_udp() {
use crate::sys::structs::{BPF_JEQ, BPF_JMP, BPF_K};
let policy = Policy::builder().no_udp(true).build().unwrap();
let filters = arg_filters(&policy);
assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
&& f.k == SOCK_DGRAM));
}
// Every default-deny name resolves, except the names listed in
// `expected_unresolved` (two of them only on aarch64).
#[test]
fn test_syscall_name_to_nr_covers_defaults() {
let expected_unresolved: &[&str] = &[
"nfsservctl",
#[cfg(target_arch = "aarch64")]
"ioperm",
#[cfg(target_arch = "aarch64")]
"iopl",
];
let mut skipped = 0;
for name in DEFAULT_DENY_SYSCALLS {
match syscall_name_to_nr(name) {
Some(_) => {}
None => {
assert!(
expected_unresolved.contains(name),
"unexpected unresolved syscall: {}",
name
);
skipped += 1;
}
}
}
// Each expected-unresolved name must actually appear in the default list.
assert_eq!(skipped, expected_unresolved.len());
}
}