use std::{
option::Option,
os::fd::{BorrowedFd, FromRawFd, RawFd},
sync::{
atomic::{AtomicBool, Ordering},
Arc, RwLock,
},
thread,
};
use libc::{AF_ALG, AF_INET, AF_INET6, AF_NETLINK, AF_UNIX};
use libseccomp::{ScmpAction, ScmpArch, ScmpFilterContext, ScmpNotifResp, ScmpNotifRespFlags};
use libseccomp_sys::{const_scmp_filter_ctx, seccomp_load};
use nix::{
errno::Errno,
fcntl::OFlag,
poll::{poll, PollFd, PollFlags},
sched::{unshare, CloneFlags},
unistd::{Gid, Pid, Uid},
};
use crate::{
alert,
compat::seccomp_notif_resp,
config::*,
confine::{
confine_scmp, confine_scmp_accept4, confine_scmp_bind, confine_scmp_clone,
confine_scmp_clone3, confine_scmp_close, confine_scmp_close_range, confine_scmp_connect,
confine_scmp_execveat, confine_scmp_faccessat2, confine_scmp_fallocate,
confine_scmp_fchdir, confine_scmp_fchmod, confine_scmp_fchmodat, confine_scmp_fchmodat2,
confine_scmp_fchown, confine_scmp_fchownat, confine_scmp_fcntl, confine_scmp_fgetxattr,
confine_scmp_flistxattr, confine_scmp_fremovexattr, confine_scmp_fsetxattr,
confine_scmp_ftruncate, confine_scmp_getdents64, confine_scmp_getpeername,
confine_scmp_getsockname, confine_scmp_getsockopt, confine_scmp_getxattr,
confine_scmp_getxattrat, confine_scmp_inotify_add_watch, confine_scmp_ioctl_syd,
confine_scmp_kcmp, confine_scmp_linkat, confine_scmp_listxattr, confine_scmp_listxattrat,
confine_scmp_madvise, confine_scmp_memfd_create, confine_scmp_memfd_secret,
confine_scmp_mkdirat, confine_scmp_mknodat, confine_scmp_open, confine_scmp_openat,
confine_scmp_openat2, confine_scmp_pidfd_getfd, confine_scmp_pidfd_open,
confine_scmp_pidfd_send_signal, confine_scmp_pipe2, confine_scmp_prctl, confine_scmp_read,
confine_scmp_readlinkat, confine_scmp_recvmmsg, confine_scmp_recvmsg,
confine_scmp_removexattr, confine_scmp_removexattrat, confine_scmp_renameat2,
confine_scmp_sendfile, confine_scmp_sendmmsg, confine_scmp_sendmsg, confine_scmp_setid,
confine_scmp_setxattr, confine_scmp_setxattrat, confine_scmp_sigaction,
confine_scmp_socket, confine_scmp_socketpair, confine_scmp_splice, confine_scmp_statx,
confine_scmp_symlinkat, confine_scmp_tgkill, confine_scmp_tkill, confine_scmp_truncate,
confine_scmp_umask, confine_scmp_uname, confine_scmp_unlinkat, confine_scmp_unshare,
confine_scmp_utimensat, confine_scmp_write, confine_scmp_wx_syd, ScmpNotifReq, SydArch,
Sydcall, EIDRM, EOWNERDEAD, SIGCANCEL, SIGSETXID, SIGTIMER, X32_SYSCALL_BIT,
},
err::{err2no, SydJoinHandle, SydResult},
fd::SafeOwnedFd,
fs::{seccomp_notify_receive, seccomp_notify_respond},
hook::HandlerMap,
id::SydId,
info,
proc::proc_get_vma,
req::UNotifyEventRequest,
sandbox::{Options, Sandbox},
workers::{WorkerCache, WorkerData},
xfmt,
};
/// State carried by one emulator worker thread.
///
/// A worker receives seccomp user notifications from `fd`, dispatches each
/// one to the matching entry in `handlers`, and writes the response back to
/// the same descriptor. Cloning is cheap for the shared parts: they are all
/// behind `Arc`.
#[derive(Clone)]
pub(crate) struct Worker {
/// Raw seccomp notify file descriptor used to receive requests and send responses.
fd: RawFd,
/// Shared worker cache; consulted for `crypt_map` and used to notify the monitor.
cache: Arc<WorkerCache>,
/// Shared sandbox state handed to every request that is dispatched.
sandbox: Arc<RwLock<Sandbox>>,
/// Lookup table from system call to the handler that emulates it.
handlers: Arc<HandlerMap>,
/// When true, the worker polls with the `EMU_KEEP_ALIVE` timeout and exits
/// once no request arrives within it; otherwise it blocks indefinitely.
is_idle: bool,
/// Set when the worker closes the notify fd on entering ghost mode; checked
/// after every handled request to leave the worker loop.
should_exit: Arc<AtomicBool>,
/// Shared bookkeeping of total and busy worker counts.
worker_data: Arc<WorkerData>,
}
impl Worker {
pub(crate) fn new(
fd: RawFd,
cache: Arc<WorkerCache>,
sandbox: Arc<RwLock<Sandbox>>,
handlers: Arc<HandlerMap>,
is_idle: bool,
should_exit: Arc<AtomicBool>,
worker_data: Arc<WorkerData>,
) -> Self {
Self {
fd,
cache,
sandbox,
handlers,
is_idle,
should_exit,
worker_data,
}
}
#[expect(clippy::cognitive_complexity)]
pub(crate) fn try_spawn(
self,
ctx: Option<&ScmpFilterContext>,
) -> Result<SydJoinHandle<()>, Errno> {
let mut ctx = ctx.map(|ctx| ctx.as_ptr() as usize);
thread::Builder::new()
.name(SydId::get_name("syd_emu").to_string())
.stack_size(EMU_STACK_SIZE)
.spawn(move || {
let mut unshare_flags = CloneFlags::CLONE_FS | CloneFlags::CLONE_SYSVSEM;
let is_crypt = self.cache.crypt_map.is_some();
if !cfg!(feature = "kcov") && !is_crypt {
unshare_flags.insert(CloneFlags::CLONE_FILES);
}
if let Err(errno) = unshare(unshare_flags) {
alert!("ctx": "boot", "op": "unshare_emu_thread",
"msg": xfmt!("failed to unshare({unshare_flags:?}): {errno}"),
"err": errno as i32);
std::process::exit(101);
}
let timeout = if self.is_idle {
Some(EMU_KEEP_ALIVE)
} else {
None
};
let mut sentinel = Sentinel::new(&self);
loop {
if let Some(filter) = ctx {
if Sandbox::is_locked_once() {
if let Err(error) =
confine_scmp(ScmpAction::KillProcess, EMU_LOCK_SYSCALLS)
{
let errno = error.errno().unwrap_or(Errno::ENOSYS);
alert!("ctx": "boot", "op": "confine_emu_thread",
"msg": xfmt!("failed to confine: {error}"),
"err": errno as i32);
std::process::exit(101);
}
info!("ctx": "confine", "op": "confine_emu_thread",
"msg": "emulator thread confined");
let error = unsafe { seccomp_load(filter as const_scmp_filter_ctx) };
ctx = None;
if error != 0 {
let errno = Errno::from_raw(error.abs());
alert!("ctx": "boot", "op": "confine_emu_thread",
"msg": xfmt!("failed to confine: {error}"),
"err": errno as i32);
std::process::exit(101);
}
self.cache.notify_mon();
}
}
let request = match self.receive(timeout) {
Ok(request) => request,
Err(Errno::EBADF | Errno::EAGAIN) => {
self.worker_data.decrement_worker_total();
break;
}
Err(_) => {
continue;
}
};
sentinel.seccomp_id = Some(request.id);
self.worker_data.increment_worker_busy();
self.handle(request);
sentinel.seccomp_id = None;
self.worker_data.decrement_worker_busy();
if self.should_exit.load(Ordering::Acquire) {
break;
}
}
Ok(())
})
.map_err(|err| err2no(&err))
}
/// Wait for and read the next seccomp notification from the notify fd.
///
/// When `timeout` is `Some`, the descriptor is polled for `POLLIN` first
/// with that timeout; when it is `None`, the receive call is issued directly.
///
/// # Errors
///
/// * `EAGAIN` — the poll timed out without any event.
/// * `EBADF` — poll reported `POLLHUP`, or `POLLNVAL` (descriptor no longer
///   open), so the worker should stop.
/// * `EINTR` — poll reported `POLLERR`.
/// * Any error returned by `poll` itself or by `seccomp_notify_receive`.
///
/// # Panics
///
/// Panics if the kernel returns poll event bits unknown to `PollFlags`, or
/// if poll reports readiness without `POLLIN` and without any of the
/// conditions handled above.
fn receive(&self, timeout: Option<u16>) -> Result<ScmpNotifReq, Errno> {
    if let Some(timeout) = timeout {
        // SAFETY: The borrowed handle is only used for the poll call below
        // and does not outlive this block. A descriptor closed concurrently
        // is reported by poll as POLLNVAL, which is handled explicitly.
        let fd = unsafe { BorrowedFd::borrow_raw(self.fd) };
        let mut pollfd = [PollFd::new(fd, PollFlags::POLLIN)];

        if poll(&mut pollfd, timeout)? == 0 {
            // Timed out: nothing to receive.
            return Err(Errno::EAGAIN);
        }

        #[expect(clippy::disallowed_methods)]
        let revents = pollfd[0]
            .revents()
            .expect("BUG: Kernel returned unknown poll events, report a bug!");

        // POLLNVAL means the descriptor is not open (e.g. closed by another
        // thread sharing the fd table). Treat it like a hang-up instead of
        // letting it trip the assertion below as a bogus "kernel bug".
        if revents.intersects(PollFlags::POLLHUP | PollFlags::POLLNVAL) {
            return Err(Errno::EBADF);
        } else if revents.contains(PollFlags::POLLERR) {
            return Err(Errno::EINTR);
        }

        assert!(
            revents.contains(PollFlags::POLLIN),
            "BUG: Kernel didn't return POLLIN with `{revents:?}', report a bug!"
        );
    }

    seccomp_notify_receive(self.fd)
}
/// Dispatch one seccomp notification to its handler and send the response.
///
/// Two handler results are treated as in-band markers, recognised by an
/// all-zero `id`, `val` and `flags` together with a specific `error`:
///
/// * `EIDRM` — return without sending any response.
/// * `EOWNERDEAD` — log a warning, reply success to the request and enter
///   ghost mode.
///
/// In ghost mode (either just entered, or already reported by
/// `Sandbox::ghost_once()`, in which case the request is answered with
/// `ENOSYS` without calling the handler) the notify descriptor is closed,
/// `should_exit` is raised and the monitor is notified.
///
/// # Panics
///
/// Panics if no handler is registered for the request's system call.
#[expect(clippy::cognitive_complexity)]
fn handle(&self, mut req: ScmpNotifReq) {
    // An x86-64 request whose syscall number carries the x32 bit is really x32.
    let has_x32_bit = (req.data.syscall.as_raw_syscall() & X32_SYSCALL_BIT) != 0;
    if req.data.arch == ScmpArch::X8664 && has_x32_bit {
        req.data.arch = ScmpArch::X32;
    }

    let syscall = Sydcall::new(req.data.syscall, req.data.arch);
    let Some(handler) = self.handlers.get(&syscall) else {
        unreachable!("BUG: Missing hook for request {req:?}!");
    };

    let request = UNotifyEventRequest::new(
        req,
        syscall,
        self.fd,
        Arc::clone(&self.cache),
        Arc::clone(&self.sandbox),
    );

    let mut ghost = Sandbox::ghost_once();
    let mut response = if ghost {
        ScmpNotifResp::new_error(req.id, -libc::ENOSYS, ScmpNotifRespFlags::empty())
    } else {
        handler(request)
    };

    if !ghost {
        // Marker responses have every field but `error` zeroed.
        let is_marker = response.id == 0 && response.val == 0 && response.flags == 0;
        match response.error {
            // Handler asked for no reply at all.
            EIDRM if is_marker => return,
            // Handler asked to enter ghost mode: report it and reply success.
            EOWNERDEAD if is_marker => {
                #[expect(clippy::cast_possible_wrap)]
                let pid = Pid::from_raw(req.pid as libc::pid_t);
                let vma = proc_get_vma(pid, req.data.instr_pointer).ok();
                crate::warn!("ctx": "confine", "op": "enter_ghost_mode", "pid": req.pid,
                    "sys": syscall, "arch": SydArch::from(req.data.arch), "args": req.data.args,
                    "ip": req.data.instr_pointer, "src": vma);

                response.id = req.id;
                response.error = 0;
                response.val = 0;
                ghost = true;
            }
            _ => {}
        }
    }

    // Send the reply; a failure here is deliberately ignored.
    let raw_response = seccomp_notif_resp {
        id: response.id,
        val: response.val,
        error: response.error,
        flags: response.flags,
    };
    let _ = seccomp_notify_respond(self.fd, std::ptr::addr_of!(raw_response));

    if ghost {
        // Close the notify descriptor by taking ownership of it and dropping it.
        drop(unsafe { SafeOwnedFd::from_raw_fd(self.fd) });
        self.should_exit.store(true, Ordering::Release);
        self.cache.notify_mon();
    }
}
/// Build (but do not load) the seccomp filter that confines emulator threads.
///
/// The filter kills the process by default and on a bad architecture; every
/// rule added below is an allowance.
///
/// * `seccomp_fd` — seccomp notify descriptor, forwarded to the ioctl rule.
/// * `options` — sandbox options that relax or tighten individual rules.
/// * `is_crypt` — whether a crypt map is in use; keeps `CLONE_FILES` out of
///   the permitted `unshare` flags.
/// * `safe_kcapi` — enables the pipe2/splice/sendfile rules and adds
///   `AF_ALG` to the permitted socket domains.
/// * `transit_uids` / `transit_gids` — forwarded to the set-id rule when
///   safe setuid/setgid is enabled.
///
/// # Errors
///
/// Propagates any error from creating the context, setting its attributes,
/// or adding a rule.
#[expect(clippy::cognitive_complexity)]
pub(crate) fn prepare_confine(
seccomp_fd: RawFd,
options: Options,
is_crypt: bool,
safe_kcapi: bool,
transit_uids: &[(Uid, Uid)],
transit_gids: &[(Gid, Gid)],
) -> SydResult<ScmpFilterContext> {
// Default action: kill the process for anything not explicitly allowed.
let mut ctx = ScmpFilterContext::new(ScmpAction::KillProcess)?;
// Filter attributes: NNP on, SSB per option, no thread sync, kill on bad arch.
ctx.set_ctl_nnp(true)?;
ctx.set_ctl_ssb(options.allow_unsafe_exec_speculative())?;
ctx.set_ctl_tsync(false)?;
ctx.set_act_badarch(ScmpAction::KillProcess)?;
// Best effort: a failure to set the optimization level is ignored.
let _ = ctx.set_ctl_optimize(2);
confine_scmp_clone(&mut ctx)?;
confine_scmp_clone3(&mut ctx)?;
confine_scmp_madvise(&mut ctx)?;
// Derive the restriction switches used by the rules below.
let safe_setid =
options.intersects(Options::OPT_ALLOW_SAFE_SETUID | Options::OPT_ALLOW_SAFE_SETGID);
let restrict_cookie = !options.allow_unsafe_nocookie();
let restrict_mkbdev = !options.allow_unsafe_mkbdev();
let restrict_mkcdev = !options.allow_unsafe_mkcdev();
// Allow the listed syscall sets without argument checks.
// Names that cannot be resolved are logged and skipped.
for sysname in EMU_SYSCALLS
.iter()
.chain(ALLOC_SYSCALLS)
.chain(FUTEX_SYSCALLS)
.chain(GETID_SYSCALLS)
.chain(KCOV_SYSCALLS)
.chain(PROF_SYSCALLS)
.chain(VDSO_SYSCALLS)
{
match Sydcall::from_name(sysname) {
Ok(syscall) => {
ctx.add_rule(ScmpAction::Allow, syscall)?;
}
Err(_) => {
info!("ctx": "confine", "op": "allow_emu_syscall",
"msg": xfmt!("invalid or unsupported syscall {sysname}"));
}
}
}
// prctl operations: the base set, plus the set-id extras when enabled.
let prctl_ops = EMU_PRCTL_OPS.iter().chain(if safe_setid {
EMU_PRCTL_OPS_SAFESETID.iter()
} else {
[].iter()
});
// NOTE(review): 0x10000 is presumably a size limit for read/write — confirm
// against the confine_scmp_read/confine_scmp_write definitions.
confine_scmp_read(&mut ctx, 0x10000, restrict_cookie)?;
confine_scmp_write(
&mut ctx,
Some(0x10000),
false,
restrict_cookie,
)?;
// Per-syscall rules; most take the cookie restriction switch.
confine_scmp_close(&mut ctx, restrict_cookie)?;
confine_scmp_close_range(&mut ctx, restrict_cookie)?;
confine_scmp_execveat(&mut ctx, restrict_cookie)?;
confine_scmp_faccessat2(&mut ctx, restrict_cookie)?;
confine_scmp_fallocate(&mut ctx, restrict_cookie)?;
confine_scmp_fchdir(&mut ctx, restrict_cookie)?;
confine_scmp_fchmod(&mut ctx, restrict_cookie)?;
confine_scmp_fchmodat(&mut ctx, restrict_cookie)?;
confine_scmp_fchmodat2(&mut ctx, restrict_cookie)?;
confine_scmp_fchown(&mut ctx, restrict_cookie)?;
confine_scmp_fchownat(&mut ctx, restrict_cookie)?;
confine_scmp_fgetxattr(&mut ctx, restrict_cookie)?;
confine_scmp_flistxattr(&mut ctx, restrict_cookie)?;
confine_scmp_fremovexattr(&mut ctx, restrict_cookie)?;
confine_scmp_fsetxattr(&mut ctx, restrict_cookie)?;
confine_scmp_ftruncate(&mut ctx, restrict_cookie)?;
confine_scmp_getdents64(&mut ctx, restrict_cookie)?;
confine_scmp_ioctl_syd(&mut ctx, restrict_cookie, Some(seccomp_fd))?;
confine_scmp_inotify_add_watch(&mut ctx, restrict_cookie)?;
confine_scmp_kcmp(&mut ctx, restrict_cookie)?;
confine_scmp_linkat(&mut ctx, restrict_cookie)?;
confine_scmp_memfd_create(&mut ctx, restrict_cookie)?;
confine_scmp_memfd_secret(&mut ctx, restrict_cookie)?;
confine_scmp_mkdirat(&mut ctx, restrict_cookie)?;
confine_scmp_mknodat(&mut ctx, restrict_cookie, restrict_mkbdev, restrict_mkcdev)?;
confine_scmp_open(&mut ctx)?;
confine_scmp_openat(&mut ctx)?;
confine_scmp_openat2(&mut ctx, restrict_cookie)?;
confine_scmp_pidfd_getfd(&mut ctx, restrict_cookie)?;
confine_scmp_pidfd_open(&mut ctx, restrict_cookie)?;
confine_scmp_pidfd_send_signal(&mut ctx, restrict_cookie)?;
confine_scmp_prctl(&mut ctx, prctl_ops)?;
confine_scmp_readlinkat(&mut ctx, restrict_cookie)?;
confine_scmp_renameat2(&mut ctx, restrict_cookie, restrict_mkcdev)?;
confine_scmp_sigaction(&mut ctx)?;
confine_scmp_statx(&mut ctx, restrict_cookie)?;
confine_scmp_symlinkat(&mut ctx, restrict_cookie)?;
confine_scmp_truncate(&mut ctx, restrict_cookie)?;
confine_scmp_umask(&mut ctx, restrict_cookie)?;
confine_scmp_uname(&mut ctx, restrict_cookie)?;
confine_scmp_unlinkat(&mut ctx, restrict_cookie)?;
confine_scmp_utimensat(&mut ctx, restrict_cookie)?;
confine_scmp_wx_syd(&mut ctx)?;
// Extended attributes: use the *xattrat family when HAVE_XATTRAT is set,
// otherwise the path-based calls.
if *HAVE_XATTRAT {
confine_scmp_getxattrat(&mut ctx, restrict_cookie)?;
confine_scmp_listxattrat(&mut ctx, restrict_cookie)?;
confine_scmp_removexattrat(&mut ctx, restrict_cookie)?;
confine_scmp_setxattrat(&mut ctx, restrict_cookie)?;
} else {
confine_scmp_getxattr(&mut ctx, restrict_cookie)?;
confine_scmp_listxattr(&mut ctx, restrict_cookie)?;
confine_scmp_removexattr(&mut ctx, restrict_cookie)?;
confine_scmp_setxattr(&mut ctx, restrict_cookie)?;
}
// fcntl: unrestricted for kcov builds, otherwise limited to EMU_FCNTL_OPS.
if cfg!(feature = "kcov") {
for sysname in ["fcntl", "fcntl64"] {
if let Ok(syscall) = Sydcall::from_name(sysname) {
ctx.add_rule(ScmpAction::Allow, syscall)?;
}
}
} else {
confine_scmp_fcntl(&mut ctx, EMU_FCNTL_OPS)?;
}
// pipe2/splice/sendfile are only permitted with safe_kcapi or kcov builds.
if safe_kcapi || cfg!(feature = "kcov") {
confine_scmp_pipe2(&mut ctx, restrict_cookie, OFlag::O_CLOEXEC)?;
confine_scmp_splice(&mut ctx)?;
confine_scmp_sendfile(&mut ctx, restrict_cookie)?;
}
// Permit exactly the unshare flags computed the same way in `try_spawn`:
// CLONE_FILES is included only without kcov and without a crypt map.
let mut clone_flags = CloneFlags::CLONE_FS | CloneFlags::CLONE_SYSVSEM;
if !cfg!(feature = "kcov") && !is_crypt {
clone_flags.insert(CloneFlags::CLONE_FILES);
};
confine_scmp_unshare(&mut ctx, clone_flags)?;
// Socket domains: a fixed list (plus AF_ALG with safe_kcapi) unless
// unsupported sockets are allowed, in which case no list is passed.
let domains = if !options.allow_unsupp_socket() {
let mut domains = vec![AF_UNIX, AF_INET, AF_INET6, AF_NETLINK];
if safe_kcapi {
domains.push(AF_ALG);
}
Some(domains)
} else {
None
};
let restrict_socket = !options.allow_unsafe_socket();
confine_scmp_socket(
&mut ctx,
domains.as_deref(),
restrict_socket,
restrict_cookie,
)?;
confine_scmp_socketpair(
&mut ctx,
domains.as_deref(),
restrict_socket,
restrict_cookie,
)?;
// Remaining network syscalls.
confine_scmp_accept4(&mut ctx, restrict_cookie)?;
confine_scmp_bind(&mut ctx, restrict_cookie)?;
confine_scmp_connect(&mut ctx, restrict_cookie)?;
confine_scmp_recvmmsg(&mut ctx, restrict_cookie)?;
confine_scmp_recvmsg(&mut ctx, restrict_cookie)?;
confine_scmp_sendmmsg(&mut ctx, restrict_cookie)?;
confine_scmp_sendmsg(&mut ctx, restrict_cookie)?;
confine_scmp_getpeername(&mut ctx, restrict_cookie)?;
confine_scmp_getsockname(&mut ctx, restrict_cookie)?;
confine_scmp_getsockopt(&mut ctx, EMU_GETSOCKOPT_OPS, restrict_cookie)?;
// tkill/tgkill: only the listed signals; SIGSETXID is added with safe set-id.
if safe_setid {
let sigs = [SIGCANCEL, SIGTIMER, SIGSETXID];
confine_scmp_tkill(&mut ctx, &sigs)?;
confine_scmp_tgkill(&mut ctx, &sigs)?;
} else {
let sigs = [SIGCANCEL, SIGTIMER];
confine_scmp_tkill(&mut ctx, &sigs)?;
confine_scmp_tgkill(&mut ctx, &sigs)?;
}
// Set-id transitions, plus capget/capset, only when safe setuid/setgid is on.
let safe_setuid = options.allow_safe_setuid();
let safe_setgid = options.allow_safe_setgid();
if safe_setuid || safe_setgid {
confine_scmp_setid(
"emu",
&mut ctx,
safe_setuid,
safe_setgid,
transit_uids,
transit_gids,
)?;
for sysname in ["capget", "capset"] {
match Sydcall::from_name(sysname) {
Ok(syscall) => {
ctx.add_rule(ScmpAction::Allow, syscall)?;
}
Err(_) => {
info!("ctx": "confine", "op": "allow_emu_syscall",
"msg": xfmt!("invalid or unsupported syscall {sysname}"));
}
}
}
}
Ok(ctx)
}
}
/// Guard held by a worker thread for the duration of its loop.
///
/// Its `Drop` implementation acts only while the thread is panicking: it
/// denies the in-flight request (if any) and corrects the worker counters.
struct Sentinel<'a> {
/// Id of the seccomp notification currently being handled, if any.
seccomp_id: Option<u64>,
/// The worker this guard belongs to; provides the notify fd and counters.
worker_ref: &'a Worker,
}
impl<'a> Sentinel<'a> {
fn new(worker_ref: &'a Worker) -> Sentinel<'a> {
Self {
seccomp_id: None,
worker_ref,
}
}
#[expect(clippy::arithmetic_side_effects)]
fn deny_syscall(&self, seccomp_id: u64, errno: Errno) {
let response = seccomp_notif_resp {
id: seccomp_id,
val: 0,
error: -(errno as i32),
flags: 0,
};
let _ = seccomp_notify_respond(self.worker_ref.fd, std::ptr::addr_of!(response));
}
}
impl Drop for Sentinel<'_> {
    /// Clean up after a panicking worker thread; a no-op on normal exit.
    fn drop(&mut self) {
        // Regular loop exits do their own bookkeeping.
        if !thread::panicking() {
            return;
        }

        let counters = &self.worker_ref.worker_data;
        match self.seccomp_id {
            // Panicked mid-request: deny it with EACCES, then drop the
            // worker from both counters.
            Some(seccomp_id) => {
                self.deny_syscall(seccomp_id, Errno::EACCES);
                counters.decrement_both();
            }
            // Panicked between requests: only the total needs adjusting.
            None => {
                counters.decrement_worker_total();
            }
        }
    }
}