use std::collections::HashSet;
use std::io;
use std::net::IpAddr;
use std::os::unix::io::{AsRawFd, FromRawFd, OwnedFd, RawFd};
use std::sync::Arc;
use crate::error::NotifError;
use crate::arch;
use crate::sys::structs::{
SeccompNotif, SeccompNotifAddfd, SeccompNotifResp,
SECCOMP_ADDFD_FLAG_SEND, SECCOMP_IOCTL_NOTIF_ADDFD, SECCOMP_IOCTL_NOTIF_ID_VALID, SECCOMP_IOCTL_NOTIF_RECV,
SECCOMP_IOCTL_NOTIF_SEND, SECCOMP_IOCTL_NOTIF_SET_FLAGS,
SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP, SECCOMP_USER_NOTIF_FLAG_CONTINUE,
ENOMEM,
};
/// One-shot callback attached to a tracked fd injection.
///
/// In this file it is invoked by `send_response` with the value returned by
/// the ADDFD ioctl once the injection has succeeded.
pub struct OnInjectSuccess(pub Box<dyn FnOnce(i32) + Send + Sync>);

impl std::fmt::Debug for OnInjectSuccess {
    /// Emits a fixed placeholder, because the boxed closure has no useful
    /// textual representation.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "OnInjectSuccess(<callback>)")
    }
}

impl OnInjectSuccess {
    /// Boxes `f` so it can be carried inside a `NotifAction`.
    pub fn new<F>(f: F) -> Self
    where
        F: FnOnce(i32) + Send + Sync + 'static,
    {
        OnInjectSuccess(Box::new(f))
    }
}
/// Reply the supervisor gives to one seccomp user notification.
///
/// The per-variant notes describe how `send_response` in this file handles
/// each variant.
#[derive(Debug)]
pub enum NotifAction {
/// Answer with the `SECCOMP_USER_NOTIF_FLAG_CONTINUE` flag set.
Continue,
/// Answer with the response `error` field set to the negated value.
Errno(i32),
/// Install `srcfd` at `targetfd` through the ADDFD ioctl (flags 0), then
/// answer as `Continue`.
InjectFd { srcfd: RawFd, targetfd: i32 },
/// Install `srcfd` through the ADDFD ioctl with `SECCOMP_ADDFD_FLAG_SEND`;
/// if that ioctl fails, answer as `Continue` instead.
InjectFdSend { srcfd: OwnedFd, newfd_flags: u32 },
/// Same as `InjectFdSend`, but `on_success` is called with the ioctl's
/// return value when the injection succeeds.
InjectFdSendTracked {
srcfd: OwnedFd,
newfd_flags: u32,
on_success: OnInjectSuccess,
},
/// Answer with the given value and no error.
ReturnValue(i64),
/// Send no response at all from `send_response`.
Hold,
/// Call `killpg(pgid, sig)`, then answer with `ENOMEM` as the error.
Kill { sig: i32, pgid: i32 },
}
/// Network policy selector.
///
/// NOTE(review): enforcement is not visible in this file; the variant notes
/// only describe the data each variant carries.
#[derive(Debug, Clone)]
pub enum NetworkPolicy {
/// Carries no data.
Unrestricted,
/// Carries a set of IP addresses (presumably the permitted destinations —
/// confirm against the enforcing handler).
AllowList(HashSet<IpAddr>),
}
/// Returns `true` when any path argument of the notified syscall is denied.
///
/// The primary path is checked first; the secondary path (present for
/// rename/link style syscalls) is only resolved when the first one is not
/// denied. A path that cannot be resolved counts as "not denied".
pub(crate) fn is_path_denied_for_notif(
    policy_fn_state: &super::state::PolicyFnState,
    notif: &SeccompNotif,
    notif_fd: RawFd,
) -> bool {
    let denied = |candidate: Option<String>| {
        candidate.map_or(false, |p| is_denied_with_symlink_resolve(policy_fn_state, &p))
    };

    // `||` short-circuits, so the second path is read lazily.
    denied(resolve_path_for_notif(notif, notif_fd))
        || denied(resolve_second_path_for_notif(notif, notif_fd))
}
/// Checks `path` against the deny policy, both as given and after
/// `std::fs::canonicalize`.
///
/// A canonicalization failure is ignored; only the literal path counts then.
fn is_denied_with_symlink_resolve(
    policy_fn_state: &super::state::PolicyFnState,
    path: &str,
) -> bool {
    policy_fn_state.is_path_denied(path)
        || std::fs::canonicalize(path).map_or(false, |resolved| {
            policy_fn_state.is_path_denied(&resolved.to_string_lossy())
        })
}
/// Duplicates descriptor `target_fd` of task `pid` into this process.
///
/// Opens a pidfd for `pid` (with the `PIDFD_THREAD` flag) and then calls
/// `pidfd_getfd` on it. The temporary pidfd is closed when the function
/// returns.
///
/// # Errors
/// Returns the OS error of whichever of the two syscalls failed.
pub(crate) fn dup_fd_from_pid(pid: u32, target_fd: i32) -> Result<OwnedFd, io::Error> {
    const SYS_PIDFD_OPEN: i64 = 434;
    const SYS_PIDFD_GETFD: i64 = 438;
    const PIDFD_THREAD: i64 = libc::O_EXCL as i64;

    let raw_pidfd = unsafe { libc::syscall(SYS_PIDFD_OPEN, pid as i64, PIDFD_THREAD) };
    if raw_pidfd < 0 {
        return Err(io::Error::last_os_error());
    }
    // Take ownership right away so the pidfd is closed on every exit path.
    let pidfd = unsafe { OwnedFd::from_raw_fd(raw_pidfd as i32) };

    let duplicated = unsafe {
        libc::syscall(SYS_PIDFD_GETFD, pidfd.as_raw_fd() as i64, target_fd as i64, 0i64)
    };
    if duplicated < 0 {
        return Err(io::Error::last_os_error());
    }
    Ok(unsafe { OwnedFd::from_raw_fd(duplicated as i32) })
}
/// Policy flags and limits consulted by the notification supervisor.
///
/// NOTE(review): only a few fields are read in this file; the meaning of the
/// others is inferred from their names and should be confirmed against the
/// handlers that consume them.
pub struct NotifPolicy {
pub max_memory_bytes: u64,
pub max_processes: u32,
pub has_memory_limit: bool,
pub has_net_allowlist: bool,
/// Read by `handle_notification` and `maybe_patch_vdso` (together with
/// `has_time_start`) to decide whether vDSO patching is attempted.
pub has_random_seed: bool,
/// When set, `time_offset` is handed to the vDSO patcher.
pub has_time_start: bool,
/// Offset passed to the vDSO patcher when `has_time_start` is true.
pub time_offset: i64,
pub num_cpus: Option<u32>,
pub port_remap: bool,
pub cow_enabled: bool,
/// When `None`, `handle_notification` runs the deny-path pre-check before
/// dispatching path-taking syscalls.
pub chroot_root: Option<std::path::PathBuf>,
pub chroot_readable: Vec<std::path::PathBuf>,
pub chroot_writable: Vec<std::path::PathBuf>,
pub chroot_denied: Vec<std::path::PathBuf>,
pub chroot_mounts: Vec<(std::path::PathBuf, std::path::PathBuf)>,
pub deterministic_dirs: bool,
pub hostname: Option<String>,
pub has_http_acl: bool,
pub virtual_etc_hosts: Option<String>,
}
/// Receives the next notification from the listener `fd` via the
/// `SECCOMP_IOCTL_NOTIF_RECV` ioctl.
///
/// # Errors
/// Returns the OS error when the ioctl reports failure.
fn recv_notif(fd: RawFd) -> io::Result<SeccompNotif> {
    // The receive buffer starts out zero-filled.
    let mut notif: SeccompNotif = unsafe { std::mem::zeroed() };
    let rc = unsafe {
        libc::ioctl(
            fd,
            SECCOMP_IOCTL_NOTIF_RECV as libc::c_ulong,
            &mut notif as *mut SeccompNotif,
        )
    };
    if rc >= 0 {
        Ok(notif)
    } else {
        Err(io::Error::last_os_error())
    }
}
/// Answers notification `id` with the CONTINUE flag set and zero value/error.
fn respond_continue(fd: RawFd, id: u64) -> io::Result<()> {
    send_resp_raw(
        fd,
        &SeccompNotifResp {
            id,
            val: 0,
            error: 0,
            flags: SECCOMP_USER_NOTIF_FLAG_CONTINUE,
        },
    )
}
/// Answers notification `id` with an error.
///
/// `errno` is given as a positive number; the response carries its negation.
fn respond_errno(fd: RawFd, id: u64, errno: i32) -> io::Result<()> {
    send_resp_raw(
        fd,
        &SeccompNotifResp {
            id,
            val: 0,
            error: -errno,
            flags: 0,
        },
    )
}
/// Answers notification `id` with return value `val` and no error.
fn respond_value(fd: RawFd, id: u64, val: i64) -> io::Result<()> {
    send_resp_raw(
        fd,
        &SeccompNotifResp {
            id,
            val,
            error: 0,
            flags: 0,
        },
    )
}
fn inject_fd_and_send(fd: RawFd, id: u64, srcfd: RawFd, newfd_flags: u32) -> io::Result<i32> {
let addfd = SeccompNotifAddfd {
id,
flags: SECCOMP_ADDFD_FLAG_SEND,
srcfd: srcfd as u32,
newfd: 0, newfd_flags,
};
let ret = unsafe {
libc::ioctl(fd, SECCOMP_IOCTL_NOTIF_ADDFD as libc::c_ulong, &addfd as *const _)
};
if ret < 0 {
Err(io::Error::last_os_error())
} else {
Ok(ret as i32)
}
}
fn inject_fd(fd: RawFd, id: u64, srcfd: RawFd, targetfd: i32) -> io::Result<()> {
let addfd = SeccompNotifAddfd {
id,
flags: 0,
srcfd: srcfd as u32,
newfd: targetfd as u32,
newfd_flags: 0,
};
let ret = unsafe {
libc::ioctl(fd, SECCOMP_IOCTL_NOTIF_ADDFD as libc::c_ulong, &addfd as *const _)
};
if ret < 0 {
Err(io::Error::last_os_error())
} else {
Ok(())
}
}
/// Submits a prepared response through `SECCOMP_IOCTL_NOTIF_SEND`.
///
/// # Errors
/// Returns the OS error when the ioctl fails.
fn send_resp_raw(fd: RawFd, resp: &SeccompNotifResp) -> io::Result<()> {
    let rc = unsafe {
        libc::ioctl(
            fd,
            SECCOMP_IOCTL_NOTIF_SEND as libc::c_ulong,
            resp as *const SeccompNotifResp,
        )
    };
    if rc >= 0 {
        Ok(())
    } else {
        Err(io::Error::last_os_error())
    }
}
/// Asks the kernel, via `SECCOMP_IOCTL_NOTIF_ID_VALID`, whether notification
/// `id` is still valid on listener `fd`.
///
/// # Errors
/// Returns the OS error when the ioctl fails.
pub(crate) fn id_valid(fd: RawFd, id: u64) -> io::Result<()> {
    let rc = unsafe {
        libc::ioctl(
            fd,
            SECCOMP_IOCTL_NOTIF_ID_VALID as libc::c_ulong,
            &id as *const u64,
        )
    };
    match rc {
        n if n < 0 => Err(io::Error::last_os_error()),
        _ => Ok(()),
    }
}
/// Best-effort: asks the kernel to use synchronous wake-ups on the listener.
///
/// `SECCOMP_IOCTL_NOTIF_SET_FLAGS` takes the flag word itself as the ioctl
/// argument, not a pointer to it. Passing an address makes the kernel see
/// unknown flag bits and reject the call with `EINVAL`, so the flag would
/// never take effect. Failure is deliberately ignored, because kernels
/// without this ioctl simply keep the default wake-up behaviour.
fn try_set_sync_wakeup(fd: RawFd) {
    let flags = SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP as libc::c_ulong;
    // SAFETY: the ioctl takes a plain integer argument and does not touch
    // caller memory.
    let _ = unsafe { libc::ioctl(fd, SECCOMP_IOCTL_NOTIF_SET_FLAGS as libc::c_ulong, flags) };
}
fn read_child_mem_vm(pid: u32, addr: u64, len: usize) -> Result<Vec<u8>, NotifError> {
let mut buf = vec![0u8; len];
let local_iov = libc::iovec {
iov_base: buf.as_mut_ptr() as *mut libc::c_void,
iov_len: len,
};
let remote_iov = libc::iovec {
iov_base: addr as *mut libc::c_void,
iov_len: len,
};
let ret = unsafe {
libc::process_vm_readv(pid as i32, &local_iov, 1, &remote_iov, 1, 0)
};
if ret < 0 {
Err(NotifError::ChildMemoryRead(io::Error::last_os_error()))
} else {
buf.truncate(ret as usize);
Ok(buf)
}
}
fn write_child_mem_vm(pid: u32, addr: u64, data: &[u8]) -> Result<(), NotifError> {
let local_iov = libc::iovec {
iov_base: data.as_ptr() as *mut libc::c_void,
iov_len: data.len(),
};
let remote_iov = libc::iovec {
iov_base: addr as *mut libc::c_void,
iov_len: data.len(),
};
let ret = unsafe {
libc::process_vm_writev(pid as i32, &local_iov, 1, &remote_iov, 1, 0)
};
if ret < 0 {
Err(NotifError::ChildMemoryRead(io::Error::last_os_error()))
} else if (ret as usize) < data.len() {
Err(NotifError::ChildMemoryRead(io::Error::new(
io::ErrorKind::WriteZero,
format!("short write: {} of {} bytes", ret, data.len()),
)))
} else {
Ok(())
}
}
/// Reads child memory, validating the notification id before and after.
///
/// The id is checked with `id_valid` both before the copy and again
/// afterwards; data is only returned when both checks pass.
///
/// # Errors
/// `NotifError::Ioctl` when either validity check fails, or the error from
/// `read_child_mem_vm`.
pub(crate) fn read_child_mem(
    notif_fd: RawFd,
    id: u64,
    pid: u32,
    addr: u64,
    len: usize,
) -> Result<Vec<u8>, NotifError> {
    let still_valid = || id_valid(notif_fd, id).map_err(NotifError::Ioctl);

    still_valid()?;
    let bytes = read_child_mem_vm(pid, addr, len)?;
    still_valid()?;
    Ok(bytes)
}
/// Reads a NUL-terminated string from child memory, at most `max_len` bytes.
///
/// The string is fetched in chunks that never cross a 4096-byte boundary, so
/// a string ending just before the edge of a chunk is still readable.
///
/// # Returns
/// * `None` when `addr` is 0, `max_len` is 0, a read fails, a read makes no
///   progress, or the bytes are not valid UTF-8.
/// * `Some(s)` with the bytes before the first NUL, or the first `max_len`
///   bytes when no NUL was found within the limit.
pub(crate) fn read_child_cstr(
    notif_fd: RawFd,
    id: u64,
    pid: u32,
    addr: u64,
    max_len: usize,
) -> Option<String> {
    if addr == 0 || max_len == 0 {
        return None;
    }
    const PAGE_SIZE: u64 = 4096;
    let mut result = Vec::with_capacity(max_len.min(256));
    let mut cur = addr;
    while result.len() < max_len {
        let page_remaining = PAGE_SIZE - (cur % PAGE_SIZE);
        let remaining = max_len - result.len();
        let to_read = page_remaining.min(remaining as u64) as usize;
        let bytes = read_child_mem(notif_fd, id, pid, cur, to_read).ok()?;
        if let Some(nul) = bytes.iter().position(|&b| b == 0) {
            result.extend_from_slice(&bytes[..nul]);
            return String::from_utf8(result).ok();
        }
        if bytes.is_empty() {
            // No progress: stop rather than skip over unread memory.
            return None;
        }
        result.extend_from_slice(&bytes);
        // Advance by what was actually returned, so a short read cannot
        // leave a gap in the string; refuse to wrap around the address space.
        cur = cur.checked_add(bytes.len() as u64)?;
    }
    String::from_utf8(result).ok()
}
/// Writes to child memory, validating the notification id before and after.
///
/// # Errors
/// `NotifError::Ioctl` when either validity check fails, or the error from
/// `write_child_mem_vm`.
pub(crate) fn write_child_mem(
    notif_fd: RawFd,
    id: u64,
    pid: u32,
    addr: u64,
    data: &[u8],
) -> Result<(), NotifError> {
    let still_valid = || id_valid(notif_fd, id).map_err(NotifError::Ioctl);

    still_valid()?;
    write_child_mem_vm(pid, addr, data)?;
    still_valid()
}
/// Translates a `NotifAction` into the matching kernel response.
///
/// * `Continue`, `Errno`, `ReturnValue` send a single response.
/// * `InjectFd` installs the fd and then answers with CONTINUE; an injection
///   error is returned to the caller without sending a response.
/// * `InjectFdSend` / `InjectFdSendTracked` use the combined add-and-send
///   ioctl; if it fails, a CONTINUE response is sent instead. The tracked
///   variant runs its callback with the ioctl's return value on success.
/// * `Hold` sends nothing.
/// * `Kill` signals the process group and answers with `ENOMEM`.
///
/// # Errors
/// Returns the OS error of the ioctl that failed.
fn send_response(fd: RawFd, id: u64, action: NotifAction) -> io::Result<()> {
    match action {
        NotifAction::Continue => respond_continue(fd, id),
        NotifAction::Errno(errno) => respond_errno(fd, id, errno),
        NotifAction::InjectFd { srcfd, targetfd } => {
            inject_fd(fd, id, srcfd, targetfd)?;
            respond_continue(fd, id)
        }
        NotifAction::InjectFdSend { srcfd, newfd_flags } => {
            match inject_fd_and_send(fd, id, srcfd.as_raw_fd(), newfd_flags) {
                Ok(_new_fd) => Ok(()),
                Err(_) => respond_continue(fd, id),
            }
        }
        NotifAction::InjectFdSendTracked { srcfd, newfd_flags, on_success } => {
            match inject_fd_and_send(fd, id, srcfd.as_raw_fd(), newfd_flags) {
                Ok(new_fd) => {
                    (on_success.0)(new_fd);
                    Ok(())
                }
                Err(_) => respond_continue(fd, id),
            }
        }
        NotifAction::ReturnValue(val) => respond_value(fd, id, val),
        NotifAction::Hold => Ok(()),
        NotifAction::Kill { sig, pgid } => {
            // killpg with pgid 0 targets the caller's own process group and
            // pgid 1 maps to kill(-1), i.e. every process we may signal.
            // Neither is ever a sandbox group, so skip the signal for them.
            if pgid > 1 {
                // SAFETY: killpg has no memory-safety preconditions.
                unsafe { libc::killpg(pgid, sig) };
            }
            respond_errno(fd, id, ENOMEM)
        }
    }
}
/// Patches the vDSO of `pid` unless the same base address was patched last.
///
/// Does nothing when the vDSO base cannot be found. After a successful patch
/// the base is remembered in `procfs.vdso_patched_addr`; a failed patch
/// leaves the remembered address untouched.
fn maybe_patch_vdso(pid: i32, procfs: &mut super::state::ProcfsState, policy: &NotifPolicy) {
    if let Ok(base) = crate::vdso::find_vdso_base(pid) {
        if base != procfs.vdso_patched_addr {
            // The offset is only forwarded when a start time was configured.
            let time_offset = policy.has_time_start.then_some(policy.time_offset);
            if crate::vdso::patch(pid, time_offset, policy.has_random_seed).is_ok() {
                procfs.vdso_patched_addr = base;
            }
        }
    }
}
/// Maps a syscall number to the name used in policy events.
///
/// Numbers that are not in the table yield `"unknown"`.
fn syscall_name(nr: i64) -> &'static str {
    // Entries are scanned in order; `None` (syscall absent on this
    // architecture) never matches.
    let table: [(Option<i64>, &'static str); 16] = [
        (Some(libc::SYS_openat), "openat"),
        (Some(libc::SYS_connect), "connect"),
        (Some(libc::SYS_sendto), "sendto"),
        (Some(libc::SYS_sendmsg), "sendmsg"),
        (Some(libc::SYS_bind), "bind"),
        (Some(libc::SYS_clone), "clone"),
        (Some(libc::SYS_clone3), "clone3"),
        (arch::SYS_VFORK, "vfork"),
        (Some(libc::SYS_execve), "execve"),
        (Some(libc::SYS_execveat), "execveat"),
        (Some(libc::SYS_mmap), "mmap"),
        (Some(libc::SYS_munmap), "munmap"),
        (Some(libc::SYS_brk), "brk"),
        (Some(libc::SYS_getrandom), "getrandom"),
        (Some(libc::SYS_unlinkat), "unlinkat"),
        (Some(libc::SYS_mkdirat), "mkdirat"),
    ];
    table
        .iter()
        .find(|entry| entry.0 == Some(nr))
        .map_or("unknown", |&(_, name)| name)
}
/// Classifies a syscall number into a policy-event category.
///
/// Anything not listed as network, process or memory ends up as `File`,
/// which is also the fallback for unrecognised numbers.
fn syscall_category(nr: i64) -> crate::policy_fn::SyscallCategory {
    use crate::policy_fn::SyscallCategory;

    let file_calls = [
        libc::SYS_openat,
        libc::SYS_unlinkat,
        libc::SYS_mkdirat,
        libc::SYS_renameat2,
        libc::SYS_symlinkat,
        libc::SYS_linkat,
        libc::SYS_fchmodat,
        libc::SYS_fchownat,
        libc::SYS_truncate,
        libc::SYS_readlinkat,
        libc::SYS_newfstatat,
        libc::SYS_statx,
        libc::SYS_faccessat,
        libc::SYS_getdents64,
    ];
    let network_calls = [
        libc::SYS_connect,
        libc::SYS_sendto,
        libc::SYS_sendmsg,
        libc::SYS_bind,
        libc::SYS_getsockname,
    ];
    let process_calls = [
        libc::SYS_clone,
        libc::SYS_clone3,
        libc::SYS_execve,
        libc::SYS_execveat,
    ];
    let memory_calls = [
        libc::SYS_mmap,
        libc::SYS_munmap,
        libc::SYS_brk,
        libc::SYS_mremap,
    ];
    let this = Some(nr);

    if file_calls.contains(&nr) || this == arch::SYS_GETDENTS {
        SyscallCategory::File
    } else if network_calls.contains(&nr) {
        SyscallCategory::Network
    } else if process_calls.contains(&nr) || this == arch::SYS_VFORK {
        SyscallCategory::Process
    } else if memory_calls.contains(&nr) {
        SyscallCategory::Memory
    } else {
        SyscallCategory::File
    }
}
/// Reads the parent pid of `pid` from `/proc/<pid>/stat`.
///
/// Parsing starts after the last `)` so that a command name containing
/// spaces or parentheses cannot shift the fields. The second
/// whitespace-separated field after it is parsed as the parent pid.
///
/// # Returns
/// `None` when the file cannot be read or does not have the expected shape;
/// truncated content yields `None` rather than a panic.
fn read_ppid(pid: u32) -> Option<u32> {
    let stat = std::fs::read_to_string(format!("/proc/{}/stat", pid)).ok()?;
    let after_comm = stat.get(stat.rfind(')')? + 1..)?;
    // Field 0 after the command name is the state, field 1 the parent pid.
    after_comm.split_whitespace().nth(1)?.parse().ok()
}
fn read_path_for_event(notif: &SeccompNotif, addr: u64, notif_fd: RawFd) -> Option<String> {
if addr == 0 { return None; }
let bytes = read_child_mem(notif_fd, notif.id, notif.pid, addr, 256).ok()?;
let nul = bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len());
String::from_utf8(bytes[..nul].to_vec()).ok()
}
/// Lexically normalizes `path`: drops `.` components and folds `..` into the
/// preceding component, without touching the filesystem.
///
/// * Absolute paths never climb above `/` (`/..` stays `/`).
/// * Relative paths keep any leading `..` that cannot be folded (`../a`
///   stays `../a`); previously those were silently dropped, turning `../a`
///   into `a`.
/// * An empty result is rendered as `/` for absolute input and `.` otherwise.
fn normalize_path(path: &std::path::Path) -> String {
    use std::path::{Component, PathBuf};

    let absolute = path.is_absolute();
    let mut normalized = PathBuf::new();
    if absolute {
        normalized.push("/");
    }
    for component in path.components() {
        match component {
            Component::RootDir | Component::CurDir | Component::Prefix(_) => {}
            Component::ParentDir => {
                let can_fold = matches!(
                    normalized.components().next_back(),
                    Some(Component::Normal(_))
                );
                if can_fold {
                    normalized.pop();
                } else if !absolute {
                    // Nothing left to cancel against: keep the traversal.
                    normalized.push("..");
                }
            }
            Component::Normal(part) => normalized.push(part),
        }
    }
    if normalized.as_os_str().is_empty() {
        if absolute { "/".into() } else { ".".into() }
    } else {
        normalized.to_string_lossy().into_owned()
    }
}
/// Turns a (`dirfd`, `path`) pair from the notifying task into a normalized
/// path string.
///
/// Absolute paths are normalized directly. Relative paths are joined onto
/// the link target of `/proc/<pid>/cwd` (for `AT_FDCWD`) or
/// `/proc/<pid>/fd/<dirfd>`; `None` is returned when that link cannot be
/// read.
fn resolve_at_path_for_event(notif: &SeccompNotif, dirfd: i64, path: &str) -> Option<String> {
    let requested = std::path::Path::new(path);
    if requested.is_absolute() {
        return Some(normalize_path(requested));
    }

    // Only the low 32 bits of the raw syscall argument carry the fd.
    let fd = dirfd as i32;
    let link = if fd == libc::AT_FDCWD {
        format!("/proc/{}/cwd", notif.pid)
    } else {
        format!("/proc/{}/fd/{}", notif.pid, fd)
    };
    let base = std::fs::read_link(link).ok()?;
    Some(normalize_path(&base.join(requested)))
}
fn resolve_path_for_notif(notif: &SeccompNotif, notif_fd: RawFd) -> Option<String> {
let nr = notif.data.nr as i64;
match nr {
n if n == libc::SYS_openat => {
let path = read_path_for_event(notif, notif.data.args[1], notif_fd)?;
resolve_at_path_for_event(notif, notif.data.args[0] as i64, &path)
}
n if Some(n) == arch::SYS_OPEN || n == libc::SYS_execve => {
let path = read_path_for_event(notif, notif.data.args[0], notif_fd)?;
resolve_at_path_for_event(notif, libc::AT_FDCWD as i64, &path)
}
n if n == libc::SYS_execveat => {
let path = read_path_for_event(notif, notif.data.args[1], notif_fd)?;
resolve_at_path_for_event(notif, notif.data.args[0] as i64, &path)
}
n if n == libc::SYS_linkat => {
let path = read_path_for_event(notif, notif.data.args[1], notif_fd)?;
resolve_at_path_for_event(notif, notif.data.args[0] as i64, &path)
}
n if n == libc::SYS_renameat2 => {
let path = read_path_for_event(notif, notif.data.args[1], notif_fd)?;
resolve_at_path_for_event(notif, notif.data.args[0] as i64, &path)
}
n if n == libc::SYS_symlinkat => {
let target = read_path_for_event(notif, notif.data.args[0], notif_fd)?;
resolve_at_path_for_event(notif, libc::AT_FDCWD as i64, &target)
}
n if Some(n) == arch::SYS_LINK => {
let path = read_path_for_event(notif, notif.data.args[0], notif_fd)?;
resolve_at_path_for_event(notif, libc::AT_FDCWD as i64, &path)
}
n if Some(n) == arch::SYS_RENAME => {
let path = read_path_for_event(notif, notif.data.args[0], notif_fd)?;
resolve_at_path_for_event(notif, libc::AT_FDCWD as i64, &path)
}
n if Some(n) == arch::SYS_SYMLINK => {
let target = read_path_for_event(notif, notif.data.args[0], notif_fd)?;
resolve_at_path_for_event(notif, libc::AT_FDCWD as i64, &target)
}
_ => None,
}
}
/// Resolves the second path argument of two-path syscalls.
///
/// * `renameat2`, `linkat`: dirfd in arg 2, path in arg 3.
/// * `rename`, `link`: path in arg 1, resolved against the task's working
///   directory.
///
/// Every other syscall gives `None`.
fn resolve_second_path_for_notif(notif: &SeccompNotif, notif_fd: RawFd) -> Option<String> {
    let nr = notif.data.nr as i64;

    let takes_dirfd = nr == libc::SYS_renameat2 || nr == libc::SYS_linkat;
    let relative_to_cwd = [arch::SYS_RENAME, arch::SYS_LINK].contains(&Some(nr));

    let (dirfd, path_addr) = if takes_dirfd {
        (notif.data.args[2] as i64, notif.data.args[3])
    } else if relative_to_cwd {
        (libc::AT_FDCWD as i64, notif.data.args[1])
    } else {
        return None;
    };

    let path = read_path_for_event(notif, path_addr, notif_fd)?;
    resolve_at_path_for_event(notif, dirfd, &path)
}
/// Decodes the IP address and port of a socket address in the notifying
/// task's memory.
///
/// At most 128 bytes are read. Only `AF_INET` and `AF_INET6` are decoded.
/// For every other family both results are `None`; previously bytes 2..4 of
/// e.g. a unix-socket path were reported as a "port".
///
/// # Returns
/// `(ip, port)`: `ip` is `None` when the buffer is too short for the family,
/// `port` is `None` when it is zero.
fn read_sockaddr_for_event(notif: &SeccompNotif, addr: u64, len: usize, notif_fd: RawFd)
    -> (Option<std::net::IpAddr>, Option<u16>)
{
    if addr == 0 || len < 4 {
        return (None, None);
    }
    let bytes = match read_child_mem(notif_fd, notif.id, notif.pid, addr, len.min(128)) {
        Ok(b) => b,
        Err(_) => return (None, None),
    };
    if bytes.len() < 4 {
        return (None, None);
    }

    let family = u16::from_ne_bytes([bytes[0], bytes[1]]) as u32;
    let is_v4 = family == crate::sys::structs::AF_INET;
    let is_v6 = family == crate::sys::structs::AF_INET6;
    if !is_v4 && !is_v6 {
        // Bytes 2..4 are only a port for the IP families.
        return (None, None);
    }

    let ip = if is_v4 && bytes.len() >= 8 {
        Some(std::net::IpAddr::V4(std::net::Ipv4Addr::new(
            bytes[4], bytes[5], bytes[6], bytes[7],
        )))
    } else if is_v6 && bytes.len() >= 24 {
        let mut octets = [0u8; 16];
        octets.copy_from_slice(&bytes[8..24]);
        Some(std::net::IpAddr::V6(std::net::Ipv6Addr::from(octets)))
    } else {
        None
    };

    // The port is stored in network byte order.
    let port = u16::from_be_bytes([bytes[2], bytes[3]]);
    (ip, if port > 0 { Some(port) } else { None })
}
/// Reads up to 64 argv strings from the notifying task.
///
/// Walks the pointer array at `argv_ptr` until a NULL entry, an unreadable
/// string, or the 64-entry cap. A failed or short pointer read aborts with
/// `None` (a short read used to panic on the slice index).
///
/// # Returns
/// `None` when `argv_ptr` is 0, a pointer cannot be read, or no argument was
/// collected.
fn read_argv_for_event(notif: &SeccompNotif, argv_ptr: u64, notif_fd: RawFd) -> Option<Vec<String>> {
    const PTR_SIZE: usize = std::mem::size_of::<u64>();

    if argv_ptr == 0 {
        return None;
    }
    let mut args = Vec::new();
    for i in 0..64u64 {
        // Stop instead of wrapping if the array would run off the end of the
        // address space.
        let ptr_addr = match argv_ptr.checked_add(i * PTR_SIZE as u64) {
            Some(a) => a,
            None => break,
        };
        let ptr_bytes = read_child_mem(notif_fd, notif.id, notif.pid, ptr_addr, PTR_SIZE).ok()?;
        let raw: [u8; PTR_SIZE] = ptr_bytes.get(..PTR_SIZE)?.try_into().ok()?;
        let str_ptr = u64::from_ne_bytes(raw);
        if str_ptr == 0 {
            break;
        }
        match read_path_for_event(notif, str_ptr, notif_fd) {
            Some(s) => args.push(s),
            None => break,
        }
    }
    if args.is_empty() { None } else { Some(args) }
}
async fn emit_policy_event(
notif: &SeccompNotif,
action: &NotifAction,
policy_fn_state: &Arc<tokio::sync::Mutex<super::state::PolicyFnState>>,
notif_fd: RawFd,
) -> Option<crate::policy_fn::Verdict> {
let pfs = policy_fn_state.lock().await;
let tx = match pfs.event_tx.as_ref() {
Some(tx) => tx.clone(),
None => return None,
};
drop(pfs);
let nr = notif.data.nr as i64;
let denied = matches!(action, NotifAction::Errno(_));
let name = syscall_name(nr);
let category = syscall_category(nr);
let parent_pid = read_ppid(notif.pid);
let mut host = None;
let mut port = None;
let mut size = None;
let mut argv = None;
if nr == libc::SYS_execve || nr == libc::SYS_execveat {
let argv_ptr = if nr == libc::SYS_execveat {
notif.data.args[2]
} else {
notif.data.args[1]
};
argv = read_argv_for_event(notif, argv_ptr, notif_fd);
}
if nr == libc::SYS_connect || nr == libc::SYS_sendto || nr == libc::SYS_bind {
let addr_ptr = notif.data.args[1];
let addr_len = notif.data.args[2] as usize;
let (h, p) = read_sockaddr_for_event(notif, addr_ptr, addr_len, notif_fd);
host = h;
port = p;
}
if nr == libc::SYS_mmap {
size = Some(notif.data.args[1]);
}
let event = crate::policy_fn::SyscallEvent {
syscall: name.to_string(),
category,
pid: notif.pid,
parent_pid,
host,
port,
size,
argv,
denied,
};
let is_held = nr == libc::SYS_execve || nr == libc::SYS_execveat
|| nr == libc::SYS_connect || nr == libc::SYS_sendto
|| nr == libc::SYS_bind || nr == libc::SYS_openat;
if is_held {
let (gate_tx, gate_rx) = tokio::sync::oneshot::channel();
let _ = tx.send(crate::policy_fn::PolicyEvent {
event,
gate: Some(gate_tx),
});
match tokio::time::timeout(std::time::Duration::from_secs(5), gate_rx).await {
Ok(Ok(verdict)) => Some(verdict),
_ => None, }
} else {
let _ = tx.send(crate::policy_fn::PolicyEvent {
event,
gate: None,
});
None
}
}
async fn handle_notification(
notif: SeccompNotif,
ctx: &Arc<super::ctx::SupervisorCtx>,
dispatch_table: &super::dispatch::DispatchTable,
fd: RawFd,
) {
let policy = &ctx.policy;
crate::resource::register_child_if_new(ctx, notif.pid as i32).await;
if policy.has_time_start || policy.has_random_seed {
let mut pfs = ctx.procfs.lock().await;
maybe_patch_vdso(notif.pid as i32, &mut pfs, policy);
}
let mut action = {
let nr = notif.data.nr as i64;
let mut path_check_nrs = vec![
libc::SYS_openat, libc::SYS_execve, libc::SYS_execveat,
libc::SYS_linkat, libc::SYS_renameat2, libc::SYS_symlinkat,
];
path_check_nrs.extend([
arch::SYS_OPEN, arch::SYS_LINK, arch::SYS_RENAME, arch::SYS_SYMLINK,
].into_iter().flatten());
let should_precheck_denied = policy.chroot_root.is_none()
&& path_check_nrs.contains(&nr);
if should_precheck_denied {
let pfs = ctx.policy_fn.lock().await;
if is_path_denied_for_notif(&pfs, ¬if, fd) {
NotifAction::Errno(libc::EACCES)
} else {
drop(pfs);
dispatch_table.dispatch(notif, ctx, fd).await
}
} else {
dispatch_table.dispatch(notif, ctx, fd).await
}
};
if let Some(verdict) = emit_policy_event(¬if, &action, &ctx.policy_fn, fd).await {
use crate::policy_fn::Verdict;
match verdict {
Verdict::Deny => { action = NotifAction::Errno(libc::EPERM); }
Verdict::DenyWith(errno) => { action = NotifAction::Errno(errno); }
Verdict::Audit => { }
Verdict::Allow => {}
}
}
let nr = notif.data.nr as i64;
if matches!(action, NotifAction::Continue)
&& crate::sibling_freeze::requires_freeze_on_continue(nr)
{
if let Err(e) = crate::sibling_freeze::freeze_siblings_for_execve(notif.pid as i32) {
eprintln!(
"sandlock: argv-safety freeze failed for pid {}: {} \
— denying execve to preserve TOCTOU invariant",
notif.pid, e
);
action = NotifAction::Errno(libc::EPERM);
}
}
let _ = send_response(fd, notif.id, action);
}
/// Runs the notification loop for one listener fd until no more
/// notifications can be received.
///
/// A dedicated OS thread blocks in `recv_notif` and forwards notifications
/// over an unbounded channel; this task handles them one at a time. A
/// background task prunes the process index for the lifetime of the loop and
/// is aborted when it ends.
///
/// NOTE(review): the receiver thread uses the raw fd number and is never
/// joined, while `notif_fd` is dropped when this future completes or is
/// cancelled — confirm callers never cancel it while the thread is blocked.
pub async fn supervisor(
    notif_fd: OwnedFd,
    ctx: Arc<super::ctx::SupervisorCtx>,
) {
    let fd = notif_fd.as_raw_fd();
    let dispatch_table =
        Arc::new(super::dispatch::build_dispatch_table(&ctx.policy, &ctx.resource));
    try_set_sync_wakeup(fd);

    let (tx, mut rx) = tokio::sync::mpsc::unbounded_channel::<SeccompNotif>();
    std::thread::spawn(move || {
        // Ends on the first receive error or once the async side is gone.
        while let Ok(notif) = recv_notif(fd) {
            if tx.send(notif).is_err() {
                break;
            }
        }
    });

    let gc = tokio::spawn(process_index_gc(Arc::clone(&ctx.processes)));
    loop {
        match rx.recv().await {
            Some(notif) => handle_notification(notif, &ctx, &dispatch_table, fd).await,
            None => break,
        }
    }
    gc.abort();
}
/// Periodically prunes dead entries from the process index.
///
/// Sleeps 300 seconds between rounds and skips the prune when the index is
/// empty. Never returns; the owner is expected to abort the task.
async fn process_index_gc(processes: Arc<super::state::ProcessIndex>) {
    let period = std::time::Duration::from_secs(300);
    loop {
        tokio::time::sleep(period).await;
        if processes.len() != 0 {
            processes.prune_dead();
        }
    }
}
/// Spawns a task that unregisters `key` once `pidfd` becomes readable.
///
/// If the pidfd cannot be registered with the reactor, the entry is
/// unregistered right away. The result of the readiness wait is ignored.
pub(crate) fn spawn_pid_watcher(
    ctx: Arc<super::ctx::SupervisorCtx>,
    key: super::state::PidKey,
    pidfd: std::os::unix::io::OwnedFd,
) {
    tokio::spawn(async move {
        // Kept in a binding so the pidfd stays open until cleanup has run.
        let watcher = tokio::io::unix::AsyncFd::with_interest(
            pidfd,
            tokio::io::Interest::READABLE,
        )
        .ok();
        if let Some(async_fd) = watcher.as_ref() {
            let _ = async_fd.readable().await;
        }
        cleanup_pid(&ctx, key).await;
    });
}
/// Removes `key` from the supervisor's process index.
pub(crate) async fn cleanup_pid(ctx: &super::ctx::SupervisorCtx, key: super::state::PidKey) {
ctx.processes.unregister(key);
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Duplicates stderr into an owned descriptor, failing the test if `dup`
    /// does. Wrapping -1 in an `OwnedFd` would violate its safety contract.
    fn dup_stderr() -> OwnedFd {
        let raw = unsafe { libc::dup(2) };
        assert!(raw >= 0, "dup(2) failed: {}", std::io::Error::last_os_error());
        // SAFETY: `raw` is a freshly duplicated, valid descriptor we own.
        unsafe { OwnedFd::from_raw_fd(raw) }
    }

    /// Every `NotifAction` variant must be formattable with `{:?}`.
    #[test]
    fn test_notif_action_debug() {
        let _ = format!("{:?}", NotifAction::Continue);
        let _ = format!("{:?}", NotifAction::Errno(1));
        let _ = format!("{:?}", NotifAction::InjectFd { srcfd: 3, targetfd: 4 });
        let _ = format!(
            "{:?}",
            NotifAction::InjectFdSend { srcfd: dup_stderr(), newfd_flags: 0 }
        );
        let tracked = format!(
            "{:?}",
            NotifAction::InjectFdSendTracked {
                srcfd: dup_stderr(),
                newfd_flags: 0,
                on_success: OnInjectSuccess::new(|_| {}),
            }
        );
        assert!(tracked.contains("OnInjectSuccess(<callback>)"));
        let _ = format!("{:?}", NotifAction::ReturnValue(42));
        let _ = format!("{:?}", NotifAction::Hold);
        let _ = format!("{:?}", NotifAction::Kill { sig: 9, pgid: 1 });
    }

    /// A fresh `NetworkState` is unrestricted with no bound ports.
    #[test]
    fn test_network_state_new() {
        let ns = super::super::state::NetworkState::new();
        assert!(matches!(ns.network_policy, NetworkPolicy::Unrestricted));
        assert!(ns.port_map.bound_ports.is_empty());
    }

    /// `TimeRandomState::new(None, None)` leaves both optional parts unset.
    #[test]
    fn test_time_random_state_new() {
        let tr = super::super::state::TimeRandomState::new(None, None);
        assert!(tr.time_offset.is_none());
        assert!(tr.random_state.is_none());
    }

    /// A fresh `ResourceState` records the limits and starts with no usage.
    #[test]
    fn test_resource_state_new() {
        let rs = super::super::state::ResourceState::new(1024 * 1024, 10);
        assert_eq!(rs.mem_used, 0);
        assert_eq!(rs.max_memory_bytes, 1024 * 1024);
        assert_eq!(rs.max_processes, 10);
        assert!(!rs.hold_forks);
        assert!(rs.held_notif_ids.is_empty());
    }

    /// `read_child_mem_vm` can read this process's own memory.
    #[test]
    fn test_process_vm_readv_self() {
        let data: u64 = 0xDEADBEEF_CAFEBABE;
        let addr = &data as *const u64 as u64;
        let pid = std::process::id();
        let result = read_child_mem_vm(pid, addr, 8);
        assert!(result.is_ok());
        let bytes = result.unwrap();
        let read_val = u64::from_ne_bytes(bytes[..8].try_into().unwrap());
        assert_eq!(read_val, 0xDEADBEEF_CAFEBABE);
    }

    /// `write_child_mem_vm` can write this process's own memory.
    #[test]
    fn test_process_vm_writev_self() {
        let mut data: u64 = 0;
        let addr = &mut data as *mut u64 as u64;
        let pid = std::process::id();
        let payload = 0x1234567890ABCDEFu64.to_ne_bytes();
        let result = write_child_mem_vm(pid, addr, &payload);
        assert!(result.is_ok());
        assert_eq!(data, 0x1234567890ABCDEF);
    }
}