use std::{
fs::{metadata, set_permissions},
io::{Read, Seek, SeekFrom, Write},
os::{
fd::{AsFd, AsRawFd, RawFd},
unix::fs::PermissionsExt,
},
path::Path,
sync::LazyLock,
};
use libc::{
c_int, c_long, c_ulong, off64_t, siginfo_t, syscall, SYS_ioctl, SYS_kcmp, SYS_tgkill,
SYS_tkill, _IO, _IOR, _IOW, _IOWR,
};
use libseccomp::ScmpFilterContext;
use nix::{
errno::Errno,
fcntl::{OFlag, AT_FDCWD},
sched::CloneFlags,
sys::{
signal::{SigSet, Signal},
stat::Mode,
},
unistd::Pid,
NixPath,
};
use crate::{
compat::{
getdents64, seccomp_notif, seccomp_notif_addfd, seccomp_notif_resp, AddWatchFlags,
FallocateFlags,
},
config::*,
confine::{resolve_syscall, CLONE_NEWTIME},
cookie::{CookieIdx, SYSCOOKIE_POOL},
err::{err2no, scmp2no},
fd::SafeOwnedFd,
path::{XPath, XPathBuf, PATH_MAX},
proc::PROCMAP_QUERY,
retry::retry_on_eintr,
};
/// Reports whether `pid1` and `pid2` compare equal under `kcmp(2)` with the
/// `KCMP_VM` comparison type.
///
/// # Errors
///
/// Returns the `Errno` reported by the raw `kcmp` system call.
pub fn is_same_vm(pid1: Pid, pid2: Pid) -> Result<bool, Errno> {
    // Comparison type passed as the third kcmp(2) argument.
    const KCMP_VM: u64 = 1;

    // SAFETY: only plain integers are handed to the kernel; no memory is
    // shared with the system call.
    let ret = unsafe { syscall(SYS_kcmp, pid1.as_raw(), pid2.as_raw(), KCMP_VM) };

    // A zero result means "equal"; any other non-error value means "different".
    Errno::result(ret).map(|cmp| cmp == 0)
}
/// Adds (or updates) an inotify watch for `path` on the inotify instance `fd`
/// and returns the raw watch descriptor.
///
/// # Errors
///
/// Fails with the error from `with_nix_path` when `path` cannot be converted
/// to a C string, or with the `Errno` set by `inotify_add_watch(2)`.
pub fn inotify_add_watch<Fd: AsFd, P: ?Sized + NixPath>(
    fd: Fd,
    path: &P,
    mask: AddWatchFlags,
) -> Result<c_int, Errno> {
    let raw_fd = fd.as_fd().as_raw_fd();

    let ret = path.with_nix_path(|c_path| {
        // SAFETY: `c_path` is a NUL-terminated string that outlives the call
        // and `raw_fd` is borrowed from a live descriptor.
        unsafe { libc::inotify_add_watch(raw_fd, c_path.as_ptr(), mask.bits()) }
    })?;

    Errno::result(ret).map(|wd| wd as c_int)
}
/// Thin wrapper around `fallocate64(3)`: manipulates the byte range
/// `[off, off + len)` of the file behind `fd` according to `mode`.
///
/// # Errors
///
/// Returns the `Errno` set by the libc call.
pub fn fallocate64<Fd: AsFd>(
    fd: Fd,
    mode: FallocateFlags,
    off: off64_t,
    len: off64_t,
) -> Result<(), Errno> {
    let raw_fd = fd.as_fd().as_raw_fd();

    // SAFETY: only integers are passed; `raw_fd` is borrowed from a live
    // descriptor for the duration of the call.
    let ret = unsafe { libc::fallocate64(raw_fd, mode.bits(), off, len) };

    Errno::result(ret)?;
    Ok(())
}
/// Thin wrapper around `truncate64(3)`: sets the size of the file at `path`
/// to `len` bytes.
///
/// # Errors
///
/// Fails with the error from `with_nix_path` when `path` cannot be converted
/// to a C string, or with the `Errno` set by the libc call.
pub fn truncate64<P: ?Sized + NixPath>(path: &P, len: off64_t) -> Result<(), Errno> {
    let ret = path.with_nix_path(|c_path| {
        // SAFETY: `c_path` is a NUL-terminated string valid for the call.
        unsafe { libc::truncate64(c_path.as_ptr(), len) }
    })?;

    Errno::result(ret)?;
    Ok(())
}
/// Thin wrapper around `ftruncate64(3)`: sets the size of the file behind
/// `fd` to `len` bytes.
///
/// # Errors
///
/// Returns the `Errno` set by the libc call.
pub fn ftruncate64<Fd: AsFd>(fd: Fd, len: off64_t) -> Result<(), Errno> {
    let raw_fd = fd.as_fd().as_raw_fd();

    // SAFETY: only integers are passed; `raw_fd` is borrowed from a live
    // descriptor for the duration of the call.
    let ret = unsafe { libc::ftruncate64(raw_fd, len) };

    Errno::result(ret)?;
    Ok(())
}
/// C-layout extent record used as the element type of `fiemap::fm_extents`.
///
/// In the visible part of this file the type is never instantiated; it only
/// contributes to the layout of `fiemap`, whose size is encoded into
/// `FS_IOC_FIEMAP`.
///
/// NOTE(review): field names and widths appear to follow the Linux UAPI
/// `struct fiemap_extent` (linux/fiemap.h) — confirm before changing anything.
#[repr(C)]
#[derive(Debug, Clone, Copy)]
struct fiemap_extent {
fe_logical: u64,
fe_physical: u64,
fe_length: u64,
// Reserved padding; kept so the struct size stays fixed.
_fe_reserved64: [u64; 2],
fe_flags: u32,
// Reserved padding; kept so the struct size stays fixed.
_fe_reserved: [u32; 3],
}
/// C-layout header whose size is encoded into the `FS_IOC_FIEMAP` request
/// number.
///
/// The trailing `fm_extents` array has length zero, so it adds no bytes of
/// its own to `size_of::<fiemap>()`.
///
/// NOTE(review): field names and widths appear to follow the Linux UAPI
/// `struct fiemap` (linux/fiemap.h) — confirm before changing anything.
#[repr(C)]
struct fiemap {
fm_start: u64,
fm_length: u64,
fm_flags: u32,
fm_mapped_extents: u32,
fm_extent_count: u32,
_fm_reserved: u32,
// Zero-length stand-in for a C flexible array member.
fm_extents: [fiemap_extent; 0],
}
/// ioctl request number built with `_IOWR` from type byte `'f'`, command
/// number 11, and the size of `fiemap`.
pub const FS_IOC_FIEMAP: c_ulong = _IOWR::<fiemap>(b'f' as u32, 11) as c_ulong;
/// ioctl request number built with `_IO` from type byte `0x00` and command
/// number 2 (no argument size is encoded).
pub const FIGETBSZ: c_ulong = _IO(0x00, 2) as c_ulong;
/// C-layout per-destination record used as the element type of
/// `file_dedupe_range::info`.
///
/// In the visible part of this file the type is never instantiated; it only
/// contributes to the layout of `file_dedupe_range`.
///
/// NOTE(review): field names and widths appear to follow the Linux UAPI
/// `struct file_dedupe_range_info` (linux/fs.h) — confirm before changing.
#[repr(C)]
#[derive(Debug, Clone, Copy)]
struct file_dedupe_range_info {
dest_fd: i64,
dest_offset: u64,
bytes_deduped: u64,
status: i32,
reserved: u32,
}
/// C-layout header whose size is encoded into the `FIDEDUPERANGE` request
/// number.
///
/// The trailing `info` array has length zero, so it adds no bytes of its own
/// to `size_of::<file_dedupe_range>()`.
///
/// NOTE(review): field names and widths appear to follow the Linux UAPI
/// `struct file_dedupe_range` (linux/fs.h) — confirm before changing.
#[repr(C)]
struct file_dedupe_range {
src_offset: u64,
src_length: u64,
dest_count: u16,
reserved1: u16,
reserved2: u32,
// Zero-length stand-in for a C flexible array member.
info: [file_dedupe_range_info; 0],
}
/// ioctl request number built with `_IOWR` from type byte `0x94`, command
/// number 54, and the size of `file_dedupe_range`.
pub const FIDEDUPERANGE: c_ulong = _IOWR::<file_dedupe_range>(0x94, 54) as c_ulong;
/// C-layout record (one length byte followed by a 16-byte buffer) whose size
/// is encoded into the `FS_IOC_GETFSUUID` request number.
///
/// NOTE(review): appears to follow the Linux UAPI `struct fsuuid2`
/// (linux/fs.h) — confirm before changing.
#[repr(C)]
#[derive(Debug, Clone, Copy)]
struct fsuuid2 {
len: u8,
uuid: [u8; 16],
}
/// ioctl request number built with `_IOR` from type byte `0x15`, command
/// number 0, and the size of `fsuuid2`.
pub const FS_IOC_GETFSUUID: c_ulong = _IOR::<fsuuid2>(0x15, 0) as c_ulong;
/// C-layout record (one length byte followed by a 128-byte buffer) whose size
/// is encoded into the `FS_IOC_GETFSSYSFSPATH` request number.
///
/// NOTE(review): appears to follow the Linux UAPI `struct fs_sysfs_path`
/// (linux/fs.h) — confirm before changing.
#[repr(C)]
#[derive(Debug, Clone, Copy)]
struct fs_sysfs_path {
len: u8,
name: [u8; 128],
}
/// ioctl request number built with `_IOR` from type byte `0x15`, command
/// number 1, and the size of `fs_sysfs_path`.
pub const FS_IOC_GETFSSYSFSPATH: c_ulong = _IOR::<fs_sysfs_path>(0x15, 1) as c_ulong;
/// ioctl request number built with `_IO` from type byte `0x00` and command
/// number 1 (no argument size is encoded).
pub const FIBMAP: c_ulong = _IO(0x00, 1) as c_ulong;
/// Hard-coded ioctl request number `0x4B4D`.
/// NOTE(review): presumably the console `KDSETKEYCODE` request from
/// linux/kd.h — confirm.
pub const KDSETKEYCODE: c_ulong = 0x4B4D;
/// Hard-coded ioctl request number `0x4B4E`.
/// NOTE(review): presumably the console `KDSIGACCEPT` request from
/// linux/kd.h — confirm.
pub const KDSIGACCEPT: c_ulong = 0x4B4E;
/// C-layout record (five `u32` fields plus eight padding bytes) whose size is
/// encoded into the `FS_IOC_FSGETXATTR` and `FS_IOC_FSSETXATTR` request
/// numbers.
///
/// NOTE(review): appears to follow the Linux UAPI `struct fsxattr`
/// (linux/fs.h) — confirm before changing.
#[repr(C)]
#[derive(Debug, Clone, Copy)]
struct fsxattr {
fsx_xflags: u32,
fsx_extsize: u32,
fsx_nextents: u32,
fsx_projid: u32,
fsx_cowextsize: u32,
// Trailing padding; kept so the struct size stays fixed.
fsx_pad: [u8; 8],
}
/// ioctl request number built with `_IOR` from type byte `'X'`, command
/// number 31, and the size of `fsxattr`.
pub const FS_IOC_FSGETXATTR: c_ulong = _IOR::<fsxattr>(b'X' as u32, 31) as c_ulong;
/// ioctl request number built with `_IOW` from type byte `'X'`, command
/// number 32, and the size of `fsxattr`.
pub const FS_IOC_FSSETXATTR: c_ulong = _IOW::<fsxattr>(b'X' as u32, 32) as c_ulong;
/// ioctl request number built with `_IOW` from type byte `'f'`, command
/// number 2, and the size of `c_long` (so the value differs between targets
/// where `c_long` has a different width).
pub const FS_IOC_SETFLAGS: c_ulong = _IOW::<c_long>(b'f' as u32, 2) as c_ulong;
/// ioctl type byte shared by every seccomp notification request below:
/// ASCII `!`.
pub const SECCOMP_IOCTL_MAGIC: u32 = b'!' as u32;
/// Read/write request, command number 0, sized for `seccomp_notif`.
pub const SECCOMP_IOCTL_NOTIF_RECV: c_ulong =
_IOWR::<seccomp_notif>(SECCOMP_IOCTL_MAGIC, 0) as c_ulong;
/// Read/write request, command number 1, sized for `seccomp_notif_resp`.
pub const SECCOMP_IOCTL_NOTIF_SEND: c_ulong =
_IOWR::<seccomp_notif_resp>(SECCOMP_IOCTL_MAGIC, 1) as c_ulong;
/// Write request, command number 2, sized for a `u64`.
pub const SECCOMP_IOCTL_NOTIF_ID_VALID: c_ulong = _IOW::<u64>(SECCOMP_IOCTL_MAGIC, 2) as c_ulong;
/// Write request, command number 3, sized for `seccomp_notif_addfd`.
pub const SECCOMP_IOCTL_NOTIF_ADDFD: c_ulong =
_IOW::<seccomp_notif_addfd>(SECCOMP_IOCTL_MAGIC, 3) as c_ulong;
/// Write request, command number 4, sized for a `u64`.
pub const SECCOMP_IOCTL_NOTIF_SET_FLAGS: c_ulong = _IOW::<u64>(SECCOMP_IOCTL_MAGIC, 4) as c_ulong;
/// All five seccomp notification request numbers defined above, in command
/// number order.
pub(crate) const SECCOMP_IOCTL_NOTIF_LIST: &[c_ulong] = &[
SECCOMP_IOCTL_NOTIF_RECV,
SECCOMP_IOCTL_NOTIF_SEND,
SECCOMP_IOCTL_NOTIF_ID_VALID,
SECCOMP_IOCTL_NOTIF_ADDFD,
SECCOMP_IOCTL_NOTIF_SET_FLAGS,
];
/// Flag value (bit 0) intended for `seccomp_notify_set_flags`.
/// NOTE(review): presumably mirrors the kernel's
/// `SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP` — confirm against linux/seccomp.h.
pub(crate) const SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP: u32 = 1;
/// Exports the filter in `ctx` in libseccomp's PFC text format and returns it
/// as a `String` with a few well-known numbers replaced by symbolic names.
///
/// The export goes through an anonymous `O_TMPFILE` file created under
/// `/tmp`, which is then rewound and read back in full. Invalid UTF-8 in the
/// output is replaced lossily.
///
/// # Errors
///
/// Returns the `Errno` from opening the temporary file, the error mapped from
/// libseccomp (`EFAULT` when `scmp2no` yields nothing), or the I/O error
/// mapped through `err2no` when seeking or reading fails.
pub(crate) fn seccomp_export_pfc(ctx: &ScmpFilterContext) -> Result<String, Errno> {
    #[expect(clippy::disallowed_methods)]
    let mut tmp = nix::fcntl::openat(
        AT_FDCWD,
        "/tmp",
        OFlag::O_TMPFILE | OFlag::O_EXCL | OFlag::O_RDWR,
        Mode::empty(),
    )
    .map(SafeOwnedFd::from)?;

    ctx.export_pfc(&mut tmp)
        .map_err(|err| scmp2no(&err).unwrap_or(Errno::EFAULT))?;

    // Rewind and slurp whatever libseccomp wrote.
    tmp.seek(SeekFrom::Start(0)).map_err(|err| err2no(&err))?;
    let mut raw = Vec::new();
    tmp.read_to_end(&mut raw).map_err(|err| err2no(&err))?;

    // (needle, symbolic name) pairs, applied strictly in this order.
    // Numeric needles are the decimal rendering of the constant.
    let substitutions: [(String, &str); 7] = [
        ("0x7fc00000".to_string(), "NOTIFY"),
        (
            SECCOMP_IOCTL_NOTIF_RECV.to_string(),
            "SECCOMP_IOCTL_NOTIF_RECV",
        ),
        (
            SECCOMP_IOCTL_NOTIF_SEND.to_string(),
            "SECCOMP_IOCTL_NOTIF_SEND",
        ),
        (
            SECCOMP_IOCTL_NOTIF_ID_VALID.to_string(),
            "SECCOMP_IOCTL_NOTIF_ID_VALID",
        ),
        (
            SECCOMP_IOCTL_NOTIF_ADDFD.to_string(),
            "SECCOMP_IOCTL_NOTIF_ADDFD",
        ),
        (
            SECCOMP_IOCTL_NOTIF_SET_FLAGS.to_string(),
            "SECCOMP_IOCTL_NOTIF_SET_FLAGS",
        ),
        (PROCMAP_QUERY.to_string(), "PROCMAP_QUERY"),
    ];

    let mut pfc = String::from_utf8_lossy(&raw).into_owned();
    for (needle, symbol) in substitutions {
        pfc = pfc.replace(needle.as_str(), symbol);
    }

    Ok(pfc)
}
/// Issues `SECCOMP_IOCTL_NOTIF_SET_FLAGS` on the notification descriptor
/// `fd`, passing `flags` as the ioctl argument.
///
/// The call is retried through `retry_on_eintr`.
///
/// # Errors
///
/// Returns `ENOSYS` without touching the kernel when
/// `HAVE_SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP` is false; otherwise returns the
/// `Errno` reported by the ioctl.
pub(crate) fn seccomp_notify_set_flags(fd: RawFd, flags: u32) -> Result<(), Errno> {
    if *HAVE_SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP {
        let set_flags = || {
            // SAFETY: the ioctl argument is a plain integer; no memory is
            // shared with the kernel.
            let ret = unsafe {
                syscall(
                    SYS_ioctl,
                    fd,
                    SECCOMP_IOCTL_NOTIF_SET_FLAGS as c_ulong,
                    flags,
                )
            };
            Errno::result(ret)
        };

        retry_on_eintr(set_flags)?;
        Ok(())
    } else {
        Err(Errno::ENOSYS)
    }
}
/// Issues `SECCOMP_IOCTL_NOTIF_ID_VALID` on the notification descriptor `fd`
/// for the notification `id`.
///
/// The call is retried through `retry_on_eintr`.
///
/// # Errors
///
/// Returns the `Errno` reported by the ioctl.
pub(crate) fn seccomp_notify_id_valid(fd: RawFd, id: u64) -> Result<(), Errno> {
    let check_id = || {
        // SAFETY: `&id` points at a live `u64` owned by this stack frame for
        // the whole duration of the call.
        let ret = unsafe { syscall(SYS_ioctl, fd, SECCOMP_IOCTL_NOTIF_ID_VALID as c_ulong, &id) };
        Errno::result(ret)
    };

    retry_on_eintr(check_id)?;
    Ok(())
}
/// Issues `SECCOMP_IOCTL_NOTIF_SEND` on the notification descriptor `fd` with
/// `response` as the ioctl argument.
///
/// Three values from `SYSCOOKIE_POOL` are passed in the otherwise unused
/// trailing syscall argument slots. The call is retried through
/// `retry_on_eintr`.
///
/// `response` must point to a valid `seccomp_notif_resp` for the duration of
/// the call; this function does not check it.
///
/// # Errors
///
/// Returns the `Errno` reported by the ioctl.
pub(crate) fn seccomp_notify_respond(
    fd: RawFd,
    response: *const seccomp_notif_resp,
) -> Result<(), Errno> {
    let send = || {
        // SAFETY: the kernel only reads through `response`, which the caller
        // is required to keep valid; everything else is a plain integer.
        let ret = unsafe {
            syscall(
                SYS_ioctl,
                fd,
                SECCOMP_IOCTL_NOTIF_SEND as c_ulong,
                response,
                SYSCOOKIE_POOL.get(CookieIdx::SeccompIoctlNotifSendArg3),
                SYSCOOKIE_POOL.get(CookieIdx::SeccompIoctlNotifSendArg4),
                SYSCOOKIE_POOL.get(CookieIdx::SeccompIoctlNotifSendArg5),
            )
        };
        Errno::result(ret)
    };

    retry_on_eintr(send)?;
    Ok(())
}
/// Issues `SECCOMP_IOCTL_NOTIF_ADDFD` on the notification descriptor `fd`
/// with `addfd` as the ioctl argument and returns the ioctl's result as a
/// raw descriptor number.
///
/// Three values from `SYSCOOKIE_POOL` are passed in the otherwise unused
/// trailing syscall argument slots. The call is retried through
/// `retry_on_eintr`.
///
/// `addfd` must point to a valid `seccomp_notif_addfd` for the duration of
/// the call; this function does not check it.
///
/// # Errors
///
/// Returns the `Errno` reported by the ioctl.
pub(crate) fn seccomp_notify_addfd(
    fd: RawFd,
    addfd: *const seccomp_notif_addfd,
) -> Result<RawFd, Errno> {
    let add = || {
        // SAFETY: the kernel only reads through `addfd`, which the caller is
        // required to keep valid; everything else is a plain integer.
        let ret = unsafe {
            syscall(
                SYS_ioctl,
                fd,
                SECCOMP_IOCTL_NOTIF_ADDFD as c_ulong,
                addfd,
                SYSCOOKIE_POOL.get(CookieIdx::SeccompIoctlNotifAddfdArg3),
                SYSCOOKIE_POOL.get(CookieIdx::SeccompIoctlNotifAddfdArg4),
                SYSCOOKIE_POOL.get(CookieIdx::SeccompIoctlNotifAddfdArg5),
            )
        };
        Errno::result(ret)
    };

    let installed = retry_on_eintr(add)?;

    // The syscall result is a `c_long`; narrow it to the descriptor type.
    #[expect(clippy::cast_possible_truncation)]
    let installed_fd = installed as RawFd;
    Ok(installed_fd)
}
/// Extracts the access mode from `flags`, keeping only the `O_ACCMODE` bits
/// and `O_PATH`.
///
/// When every `O_ACCMODE` bit is set at once, those bits are replaced by
/// `O_RDWR`; any `O_PATH` bit is preserved either way.
pub fn oflag_accmode(flags: OFlag) -> OFlag {
    let access = flags & (OFlag::O_ACCMODE | OFlag::O_PATH);

    if !access.contains(OFlag::O_ACCMODE) {
        // An ordinary access mode (or none): return it untouched.
        return access;
    }

    // Strip the full access mask, then fold it to read-write.
    #[expect(clippy::arithmetic_side_effects)]
    let without_mask = access - OFlag::O_ACCMODE;
    without_mask | OFlag::O_RDWR
}
/// Returns `true` when `flags` has at least one bit of `O_NONBLOCK` or
/// `O_NDELAY` set.
pub fn oflag_nonblock(flags: OFlag) -> bool {
    let nonblocking = OFlag::O_NONBLOCK | OFlag::O_NDELAY;
    flags.intersects(nonblocking)
}
/// Lazily resolved number of the `process_mrelease` system call, or `None`
/// when `resolve_syscall` cannot provide one.
static SYS_PROCESS_MRELEASE: LazyLock<Option<c_long>> =
    LazyLock::new(|| resolve_syscall("process_mrelease"));

/// Invokes the `process_mrelease` system call on the process referred to by
/// `pid_fd`, with the flags argument set to zero.
///
/// # Errors
///
/// Returns `ENOSYS` when the syscall number could not be resolved, otherwise
/// the `Errno` reported by the system call.
pub fn process_mrelease<Fd: AsFd>(pid_fd: Fd) -> Result<(), Errno> {
    let Some(sysnum) = *SYS_PROCESS_MRELEASE else {
        return Err(Errno::ENOSYS);
    };

    // SAFETY: only integers are passed; the descriptor is borrowed from a
    // live handle for the duration of the call.
    let ret = unsafe { syscall(sysnum, pid_fd.as_fd().as_raw_fd(), 0) };

    Errno::result(ret)?;
    Ok(())
}
/// Sends signal `sig` to thread `tid` of thread group `tgid` using the raw
/// `tgkill` system call.
///
/// # Errors
///
/// Returns the `Errno` reported by the system call.
pub fn tgkill(tgid: Pid, tid: Pid, sig: i32) -> Result<(), Errno> {
    let (group, thread) = (tgid.as_raw(), tid.as_raw());

    // SAFETY: only plain integers are passed to the kernel.
    let ret = unsafe { syscall(SYS_tgkill, group, thread, sig) };

    Errno::result(ret)?;
    Ok(())
}
/// Sends signal `sig` to thread `tid` using the raw `tkill` system call.
///
/// # Errors
///
/// Returns the `Errno` reported by the system call.
pub fn tkill(tid: Pid, sig: i32) -> Result<(), Errno> {
    let thread = tid.as_raw();

    // SAFETY: only plain integers are passed to the kernel.
    let ret = unsafe { syscall(SYS_tkill, thread, sig) };

    Errno::result(ret)?;
    Ok(())
}
/// Forwards to `crate::compat::sigwaitinfo` with the raw signal set behind
/// `set` and the optional `info` output, then converts the integer result
/// into a `Result`.
///
/// # Errors
///
/// Returns the current `Errno` when the underlying call reports failure.
pub fn sigwaitinfo(set: &SigSet, info: Option<&mut siginfo_t>) -> Result<i32, Errno> {
    let raw_set = set.as_ref();
    let signo = crate::compat::sigwaitinfo(raw_set, info);
    Errno::result(signo)
}
/// Thin wrapper around `sigtimedwait(2)`: waits up to `timeout` for a signal
/// in `set` and returns the libc call's result.
///
/// When `info` is `Some`, its address is handed to libc; when it is `None`,
/// a null pointer is passed instead.
///
/// # Errors
///
/// Returns the `Errno` set by the libc call.
#[expect(clippy::disallowed_types)]
pub fn sigtimedwait(
    set: &SigSet,
    info: Option<&mut siginfo_t>,
    timeout: nix::sys::time::TimeSpec,
) -> Result<i32, Errno> {
    let info_ptr = info.map_or(std::ptr::null_mut(), |si| si as *mut siginfo_t);

    // SAFETY: `set` and `timeout` are live references for the duration of
    // the call, and `info_ptr` is either null or derived from a live
    // exclusive reference.
    let signo = unsafe { libc::sigtimedwait(set.as_ref(), info_ptr, timeout.as_ref()) };

    Errno::result(signo)
}
/// Calls [`sigtimedwait`] with a zero timeout (0 seconds, 0 nanoseconds).
///
/// # Errors
///
/// Propagates whatever [`sigtimedwait`] returns.
#[expect(clippy::disallowed_types)]
pub fn sigtimedpoll(set: &SigSet, info: Option<&mut siginfo_t>) -> Result<i32, Errno> {
    let no_wait = nix::sys::time::TimeSpec::new(0, 0);
    sigtimedwait(set, info, no_wait)
}
/// Adds `sig` to the calling thread's blocked-signal mask.
///
/// # Errors
///
/// Returns the `Errno` from `SigSet::thread_block`.
pub fn block_signal(sig: Signal) -> Result<(), Errno> {
    // A set holding just the one signal we want blocked.
    let mut only_sig = SigSet::empty();
    only_sig.add(sig);

    only_sig.thread_block()
}
/// Removes `sig` from the calling thread's blocked-signal mask.
///
/// # Errors
///
/// Returns the `Errno` from `SigSet::thread_unblock`.
pub fn unblock_signal(sig: Signal) -> Result<(), Errno> {
    // A set holding just the one signal we want unblocked.
    let mut only_sig = SigSet::empty();
    only_sig.add(sig);

    only_sig.thread_unblock()
}
/// Reads the target of the symbolic link `base`, resolved relative to the
/// directory descriptor `fd`, and returns it as an owned path.
///
/// The target is read into a `PATH_MAX`-byte stack buffer and then copied
/// into a heap buffer reserved with `try_reserve`.
///
/// # Errors
///
/// * The error from `with_nix_path` when `base` cannot be converted to a C
///   string.
/// * The `Errno` set by `readlinkat(2)`.
/// * `ENAMETOOLONG` when the target fills the whole buffer, because it may
///   then have been truncated.
/// * `ENOMEM` when the result buffer cannot be allocated.
pub fn readlinkat<Fd: AsFd, P: NixPath + ?Sized>(fd: Fd, base: &P) -> Result<XPathBuf, Errno> {
    let mut target = [0u8; PATH_MAX];

    let n = base.with_nix_path(|cstr| {
        // SAFETY: `cstr` is NUL-terminated and `target` is a writable buffer
        // of exactly `target.len()` bytes; both outlive the call.
        #[expect(clippy::cast_sign_loss)]
        Errno::result(unsafe {
            libc::readlinkat(
                fd.as_fd().as_raw_fd(),
                cstr.as_ptr(),
                target.as_mut_ptr().cast(),
                target.len(),
            )
        })
        .map(|n| n as usize)
    })??;

    // readlinkat(2) never returns more than the buffer size and truncates
    // silently, so a completely filled buffer is the only available signal
    // that the target did not fit. A strict `>` comparison could never fire.
    if n >= target.len() {
        return Err(Errno::ENAMETOOLONG);
    }

    let mut path = Vec::new();
    path.try_reserve(n).or(Err(Errno::ENOMEM))?;
    path.extend_from_slice(&target[..n]);
    Ok(path.into())
}
/// Reads the link target of `fd` itself by calling [`readlinkat`] with an
/// empty path.
///
/// # Errors
///
/// `ENOENT` from [`readlinkat`] is reported as `EINVAL`; every other error
/// is passed through unchanged.
pub fn readlinkfd<Fd: AsFd>(fd: Fd) -> Result<XPathBuf, Errno> {
    match readlinkat(fd, c"") {
        Err(Errno::ENOENT) => Err(Errno::EINVAL),
        outcome => outcome,
    }
}
/// Creates (or truncates) the file at `path` and writes all of `content`
/// into it.
///
/// # Errors
///
/// Returns the I/O error from creating the file or from writing to it.
#[expect(clippy::disallowed_methods)]
#[expect(clippy::disallowed_types)]
pub fn cat<P: AsRef<Path>, T: AsRef<[u8]>>(path: P, content: T) -> std::io::Result<()> {
    let bytes = content.as_ref();
    std::fs::File::create(path).and_then(|mut out| out.write_all(bytes))
}
/// Sets the mode of `path` to `0o700` (read, write and execute for the owner
/// only).
///
/// The current metadata is fetched first, so a missing path fails before any
/// permission change is attempted.
///
/// # Errors
///
/// Returns the I/O error from reading the metadata or from applying the new
/// permissions.
pub fn chmod_x<P: AsRef<Path>>(path: P) -> std::io::Result<()> {
    let target = path.as_ref();

    let mut perms = metadata(target)?.permissions();
    perms.set_mode(0o700);

    set_permissions(target, perms)
}
pub fn format_oflags(flags: OFlag) -> Vec<String> {
let count = flags.into_iter().count();
if count == 0 {
return vec![];
}
let mut fmt = Vec::with_capacity(count);
for flag in flags.iter() {
fmt.push(format_oflag(flag));
}
fmt
}
/// Renders `flag` as a short lowercase name derived from its `Debug` output.
///
/// The `Debug` text is expected to look like `OFlag(O_NAME)`; the text
/// between the `OFlag(O_` prefix and the closing `)` is returned in ASCII
/// lowercase. Any other shape yields `"?"`.
pub fn format_oflag(flag: OFlag) -> String {
    let debug = format!("{flag:?}");

    debug
        .strip_prefix("OFlag(O_")
        .and_then(|rest| rest.strip_suffix(')'))
        .map_or_else(|| "?".to_string(), str::to_ascii_lowercase)
}
/// Maps the namespace bits set in `flags` to short names.
///
/// Names are emitted in a fixed order — `user`, `mount`, `uts`, `ipc`, `pid`,
/// `net`, `cgroup`, `time` — regardless of how `flags` was built. Bits that
/// are not namespace flags are ignored.
pub fn format_clone_flags(flags: CloneFlags) -> Vec<&'static str> {
    if flags.is_empty() {
        return Vec::new();
    }

    // (flag, name) pairs in output order.
    let namespaces: [(CloneFlags, &'static str); 8] = [
        (CloneFlags::CLONE_NEWUSER, "user"),
        (CloneFlags::CLONE_NEWNS, "mount"),
        (CloneFlags::CLONE_NEWUTS, "uts"),
        (CloneFlags::CLONE_NEWIPC, "ipc"),
        (CloneFlags::CLONE_NEWPID, "pid"),
        (CloneFlags::CLONE_NEWNET, "net"),
        (CloneFlags::CLONE_NEWCGROUP, "cgroup"),
        (CLONE_NEWTIME, "time"),
    ];

    namespaces
        .into_iter()
        .filter(|(flag, _)| flags.contains(*flag))
        .map(|(_, name)| name)
        .collect()
}
/// Joins namespace names into an English phrase.
///
/// * no names → `"no namespaces"`
/// * one name → `"<a> namespace"`
/// * two names → `"<a> and <b> namespaces"`
/// * three or more → a comma-separated list whose last separator is
///   `", and "`, followed by `" namespaces"`
pub fn format_clone_names(clone_names: &[&str]) -> String {
    match clone_names {
        [] => "no namespaces".to_string(),
        [only] => format!("{only} namespace"),
        [first, second] => format!("{first} and {second} namespaces"),
        _ => {
            let listed = clone_names.join(", ");
            // Split at the last ", " so that only the final separator gains
            // the "and".
            match listed.rsplit_once(", ") {
                Some((head, tail)) => format!("{head}, and {tail} namespaces"),
                None => format!("{listed} namespaces"),
            }
        }
    }
}
/// Searches the directory `dir` for an entry matching `name` and returns the
/// entry's decorated name.
///
/// Each entry name gets a one-byte type marker appended before comparison:
/// `/` for directories, `@` for symlinks, `!` for block devices, `$` for
/// character devices, `|` for FIFOs and `~` for sockets; other entries are
/// left undecorated. An entry matches when its decorated name equals `name`,
/// or when `name` is a single byte and the decorated name ends with it.
///
/// Entries are read in batches with `getdents64(&dir, 128)`. Returns `None`
/// when the directory cannot be opened or as soon as `getdents64` returns an
/// error (NOTE(review): presumably this is also how end-of-directory is
/// signalled — confirm against `compat::getdents64`).
#[expect(clippy::disallowed_methods)]
#[expect(clippy::disallowed_types)]
pub fn grep(dir: &XPath, name: &[u8]) -> Option<XPathBuf> {
    let dir = std::fs::File::open(dir.as_path()).ok()?;
    let name = XPath::from_bytes(name);
    let wanted_is_single_byte = name.len() == 1;

    loop {
        let mut batch = getdents64(&dir, 128).ok()?;

        for entry in &mut batch {
            // Pick the type marker, if this kind of entry has one.
            let marker = if entry.is_dir() {
                Some(b'/')
            } else if entry.is_symlink() {
                Some(b'@')
            } else if entry.is_block_device() {
                Some(b'!')
            } else if entry.is_char_device() {
                Some(b'$')
            } else if entry.is_fifo() {
                Some(b'|')
            } else if entry.is_socket() {
                Some(b'~')
            } else {
                None
            };

            let mut candidate = XPathBuf::from(entry.name_bytes());
            if let Some(byte) = marker {
                candidate.append_byte(byte);
            }

            if *candidate == *name
                || (wanted_is_single_byte && candidate.ends_with(name.as_bytes()))
            {
                return Some(candidate);
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Offset of the base name inside a joined path: a parent of length one
    // contributes no extra separator, longer parents skip exactly one byte.
    #[test]
    fn test_base_offset_root_and_non_root() {
        fn off(parent_len: usize) -> usize {
            if parent_len > 1 {
                parent_len + 1
            } else {
                parent_len
            }
        }

        assert_eq!(off(1), 1, "root parent must not drop first byte");
        assert_eq!(off(5), 6, "non-root parent must skip one separator");
    }

    // O_RDONLY carries no bits, so adding it to an empty set changes nothing.
    #[test]
    fn test_oflag_rdonly_is_empty() {
        let flags = OFlag::empty() | OFlag::O_RDONLY;
        assert!(flags.is_empty());
    }

    // Table of (input, expected access mode) pairs.
    #[test]
    fn test_oflag_accmode() {
        let full_mask = OFlag::from_bits_retain(libc::O_ACCMODE);

        let cases = [
            (OFlag::empty(), OFlag::O_RDONLY),
            (OFlag::O_RDONLY, OFlag::O_RDONLY),
            (OFlag::O_WRONLY, OFlag::O_WRONLY),
            (OFlag::O_RDWR, OFlag::O_RDWR),
            (OFlag::O_PATH, OFlag::O_PATH),
            (OFlag::empty() | OFlag::O_APPEND, OFlag::O_RDONLY),
            (OFlag::O_RDONLY | OFlag::O_ASYNC, OFlag::O_RDONLY),
            (
                OFlag::O_WRONLY | OFlag::O_CREAT | OFlag::O_EXCL | OFlag::O_TRUNC,
                OFlag::O_WRONLY,
            ),
            (
                OFlag::O_RDWR | OFlag::O_CLOEXEC | OFlag::O_DIRECTORY,
                OFlag::O_RDWR,
            ),
            (full_mask, OFlag::O_RDWR),
            (full_mask | OFlag::O_CLOEXEC, OFlag::O_RDWR),
            (OFlag::O_PATH | OFlag::O_NOFOLLOW, OFlag::O_PATH),
        ];

        for (input, expected) in cases {
            assert_eq!(oflag_accmode(input), expected);
        }
    }

    #[test]
    fn test_oflag_nonblock_1() {
        for blocking in [OFlag::empty(), OFlag::O_RDONLY] {
            assert!(!oflag_nonblock(blocking));
        }
        for nonblocking in [
            OFlag::O_NONBLOCK,
            OFlag::O_NDELAY,
            OFlag::O_RDWR | OFlag::O_NONBLOCK,
        ] {
            assert!(oflag_nonblock(nonblocking));
        }
    }

    #[test]
    fn test_format_oflag_1() {
        assert_eq!(format_oflag(OFlag::O_RDWR), "rdwr");
    }

    #[test]
    fn test_format_oflag_2() {
        assert_eq!(format_oflag(OFlag::O_CREAT), "creat");
    }

    #[test]
    fn test_format_oflag_3() {
        assert_eq!(format_oflag(OFlag::O_CLOEXEC), "cloexec");
    }

    #[test]
    fn test_format_oflags_1() {
        assert!(format_oflags(OFlag::empty()).is_empty());
    }

    #[test]
    fn test_format_oflags_2() {
        let names = format_oflags(OFlag::O_RDWR | OFlag::O_CREAT);
        assert_eq!(names.len(), 2);
        for expected in ["rdwr", "creat"] {
            assert!(names.contains(&expected.to_string()));
        }
    }

    #[test]
    fn test_format_clone_flags_1() {
        assert!(format_clone_flags(CloneFlags::empty()).is_empty());
    }

    #[test]
    fn test_format_clone_flags_2() {
        assert_eq!(format_clone_flags(CloneFlags::CLONE_NEWUSER), vec!["user"]);
    }

    #[test]
    fn test_format_clone_flags_3() {
        let names = format_clone_flags(CloneFlags::CLONE_NEWNS | CloneFlags::CLONE_NEWPID);
        for expected in ["mount", "pid"] {
            assert!(names.contains(&expected));
        }
    }

    #[test]
    fn test_format_clone_flags_4() {
        let requested = CloneFlags::CLONE_NEWUSER
            | CloneFlags::CLONE_NEWUTS
            | CloneFlags::CLONE_NEWIPC
            | CloneFlags::CLONE_NEWNET
            | CloneFlags::CLONE_NEWCGROUP;
        let names = format_clone_flags(requested);
        for expected in ["user", "uts", "ipc", "net", "cgroup"] {
            assert!(names.contains(&expected));
        }
    }

    #[test]
    fn test_format_clone_names_1() {
        assert_eq!(format_clone_names(&[]), "no namespaces");
    }

    #[test]
    fn test_format_clone_names_2() {
        assert_eq!(format_clone_names(&["user"]), "user namespace");
    }

    #[test]
    fn test_format_clone_names_3() {
        assert_eq!(
            format_clone_names(&["user", "pid"]),
            "user and pid namespaces"
        );
    }

    #[test]
    fn test_format_clone_names_4() {
        let phrase = format_clone_names(&["user", "pid", "net"]);
        for fragment in ["user", "pid", "and net"] {
            assert!(phrase.contains(fragment));
        }
        assert!(phrase.ends_with("namespaces"));
    }

    // Blocking and then unblocking the same signal must both succeed.
    #[test]
    fn test_block_signal_1() {
        let sig = Signal::SIGUSR1;
        assert!(block_signal(sig).is_ok());
        assert!(unblock_signal(sig).is_ok());
    }
}