use std::any::Any;
use std::ffi::{CStr, CString, OsStr};
use std::os::fd::BorrowedFd;
use std::os::unix::ffi::OsStrExt;
use std::os::unix::fs::symlink;
use std::os::unix::io::RawFd;
use std::path::Path;
use std::str::FromStr;
use std::sync::Arc;
use std::{fs, mem, ptr};
use caps::{CapSet, CapsHashSet};
use libc::{c_char, setdomainname, uid_t};
use nix::fcntl;
use nix::fcntl::{open, OFlag};
use nix::mount::{mount, umount2, MntFlags, MsFlags};
use nix::sched::{unshare, CloneFlags};
use nix::sys::stat::{mknod, Mode, SFlag};
use nix::unistd::{chown, chroot, close, fchdir, pivot_root, sethostname, Gid, Uid};
use oci_spec::runtime::PosixRlimit;
use super::{Result, Syscall, SyscallError};
use crate::{capabilities, utils};
// Flag and attribute values used with mount_setattr(2).

/// `flags` bit for `mount_setattr(2)`: apply the change to the whole mount tree.
pub const AT_RECURSIVE: u32 = 0x0000_8000;
/// Mask covering the access-time attribute bits (it includes the
/// `MOUNT_ATTR_NOATIME` and `MOUNT_ATTR_STRICTATIME` bits below).
// The double underscore mirrors the kernel's spelling of this name, which
// the upper-case-globals lint would otherwise reject.
#[allow(non_upper_case_globals)]
pub const MOUNT_ATTR__ATIME: u64 = 0x0000_0070;
/// Attribute bit: read-only.
const MOUNT_ATTR_RDONLY: u64 = 0x0000_0001;
/// Attribute bit: nosuid.
const MOUNT_ATTR_NOSUID: u64 = 0x0000_0002;
/// Attribute bit: nodev.
const MOUNT_ATTR_NODEV: u64 = 0x0000_0004;
/// Attribute bit: noexec.
const MOUNT_ATTR_NOEXEC: u64 = 0x0000_0008;
/// Relatime has no bit of its own: its value is zero.
const MOUNT_ATTR_RELATIME: u64 = 0x0000_0000;
/// Attribute bit: noatime.
const MOUNT_ATTR_NOATIME: u64 = 0x0000_0010;
/// Attribute bit: strictatime.
const MOUNT_ATTR_STRICTATIME: u64 = 0x0000_0020;
/// Attribute bit: nodiratime.
const MOUNT_ATTR_NODIRATIME: u64 = 0x0000_0080;
/// Attribute bit: nosymfollow.
const MOUNT_ATTR_NOSYMFOLLOW: u64 = 0x0020_0000;
/// A recursive mount attribute option, as produced by parsing its option
/// string (see the `FromStr` implementation for `MountRecursive`).
///
/// Every variant carries `(bool, u64)`. The `u64` is the `MOUNT_ATTR_*` bit of
/// the attribute. The `bool` is `false` when the value was parsed from the
/// option spelled like the variant (e.g. "rro" for `Rdonly`) and `true` when
/// it was parsed from the opposite option (e.g. "rrw").
// NOTE(review): the bool presumably means "clear this attribute instead of
// setting it" — confirm against the code that consumes these values.
pub enum MountRecursive {
/// Parsed from "rro" (`false`) or "rrw" (`true`).
Rdonly(bool, u64),
/// Parsed from "rnosuid" (`false`) or "rsuid" (`true`).
Nosuid(bool, u64),
/// Parsed from "rnodev" (`false`) or "rdev" (`true`).
Nodev(bool, u64),
/// Parsed from "rnoexec" (`false`) or "rexec" (`true`).
Noexec(bool, u64),
/// Not produced by the `FromStr` implementation in this file.
Atime(bool, u64),
/// Parsed from "rrelatime" (`false`) or "rnorelatime" (`true`).
Relatime(bool, u64),
/// Parsed from "rnoatime" (`false`) or "ratime" (`true`).
Noatime(bool, u64),
/// Parsed from "rstrictatime" (`false`) or "rnostrictatime" (`true`).
StrictAtime(bool, u64),
/// Parsed from "rnodiratime" (`false`) or "rdiratime" (`true`).
NoDiratime(bool, u64),
/// Parsed from "rnosymfollow" (`false`) or "rsymfollow" (`true`).
Nosymfollow(bool, u64),
}
impl FromStr for MountRecursive {
type Err = SyscallError;
fn from_str(option: &str) -> std::result::Result<Self, Self::Err> {
match option {
"rro" => Ok(MountRecursive::Rdonly(false, MOUNT_ATTR_RDONLY)),
"rrw" => Ok(MountRecursive::Rdonly(true, MOUNT_ATTR_RDONLY)),
"rnosuid" => Ok(MountRecursive::Nosuid(false, MOUNT_ATTR_NOSUID)),
"rsuid" => Ok(MountRecursive::Nosuid(true, MOUNT_ATTR_NOSUID)),
"rnodev" => Ok(MountRecursive::Nodev(false, MOUNT_ATTR_NODEV)),
"rdev" => Ok(MountRecursive::Nodev(true, MOUNT_ATTR_NODEV)),
"rnoexec" => Ok(MountRecursive::Noexec(false, MOUNT_ATTR_NOEXEC)),
"rexec" => Ok(MountRecursive::Noexec(true, MOUNT_ATTR_NOEXEC)),
"rnodiratime" => Ok(MountRecursive::NoDiratime(false, MOUNT_ATTR_NODIRATIME)),
"rdiratime" => Ok(MountRecursive::NoDiratime(true, MOUNT_ATTR_NODIRATIME)),
"rrelatime" => Ok(MountRecursive::Relatime(false, MOUNT_ATTR_RELATIME)),
"rnorelatime" => Ok(MountRecursive::Relatime(true, MOUNT_ATTR_RELATIME)),
"rnoatime" => Ok(MountRecursive::Noatime(false, MOUNT_ATTR_NOATIME)),
"ratime" => Ok(MountRecursive::Noatime(true, MOUNT_ATTR_NOATIME)),
"rstrictatime" => Ok(MountRecursive::StrictAtime(false, MOUNT_ATTR_STRICTATIME)),
"rnostrictatime" => Ok(MountRecursive::StrictAtime(true, MOUNT_ATTR_STRICTATIME)),
"rnosymfollow" => Ok(MountRecursive::Nosymfollow(false, MOUNT_ATTR_NOSYMFOLLOW)),
"rsymfollow" => Ok(MountRecursive::Nosymfollow(true, MOUNT_ATTR_NOSYMFOLLOW)),
_ => Err(SyscallError::UnexpectedMountRecursiveOption(
option.to_string(),
)),
}
}
}
/// Argument structure handed to the `mount_setattr` system call.
///
/// The layout is `#[repr(C)]` because `LinuxSyscall::mount_setattr` passes a
/// pointer to this struct directly to `SYS_mount_setattr`.
// NOTE(review): field meanings below follow the `struct mount_attr`
// description in mount_setattr(2) — confirm against the man page.
#[repr(C)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MountAttr {
/// `MOUNT_ATTR_*` bits to set.
pub attr_set: u64,
/// `MOUNT_ATTR_*` bits to clear.
pub attr_clr: u64,
/// Mount propagation type to apply; `MountAttr::all` leaves it at 0.
pub propagation: u64,
/// User namespace file descriptor; `MountAttr::all` leaves it at 0.
pub userns_fd: u64,
}
impl MountAttr {
    /// Builds a `MountAttr` with every individual `MOUNT_ATTR_*` bit raised.
    ///
    /// `attr_set` holds the union of all single-attribute bits; `attr_clr`
    /// holds the same union plus the `MOUNT_ATTR__ATIME` mask. `propagation`
    /// and `userns_fd` are zero.
    pub fn all() -> Self {
        let every_attribute = [
            MOUNT_ATTR_RDONLY,
            MOUNT_ATTR_NOSUID,
            MOUNT_ATTR_NODEV,
            MOUNT_ATTR_NOEXEC,
            MOUNT_ATTR_NODIRATIME,
            MOUNT_ATTR_RELATIME,
            MOUNT_ATTR_NOATIME,
            MOUNT_ATTR_STRICTATIME,
            MOUNT_ATTR_NOSYMFOLLOW,
        ]
        .iter()
        .fold(0u64, |bits, &attr| bits | attr);

        Self {
            attr_set: every_attribute,
            attr_clr: every_attribute | MOUNT_ATTR__ATIME,
            propagation: 0,
            userns_fd: 0,
        }
    }
}
/// Stateless unit type carrying the Linux implementation of the `Syscall`
/// trait (see `impl Syscall for LinuxSyscall` in this file).
#[derive(Clone)]
pub struct LinuxSyscall;
impl LinuxSyscall {
    /// Converts a NUL-terminated C string into any type buildable from `&OsStr`.
    ///
    /// # Safety
    ///
    /// `p` must satisfy the requirements of [`CStr::from_ptr`]: non-null,
    /// pointing at a NUL-terminated string that stays valid for `'a`.
    unsafe fn from_raw_buf<'a, T>(p: *const c_char) -> T
    where
        T: From<&'a OsStr>,
    {
        let bytes = CStr::from_ptr(p).to_bytes();
        T::from(OsStr::from_bytes(bytes))
    }

    /// Extracts the user name (`pw_name`) from a `passwd` record.
    ///
    /// # Safety
    ///
    /// `passwd.pw_name` must be a valid pointer to a NUL-terminated string.
    unsafe fn passwd_to_user(passwd: libc::passwd) -> Arc<OsStr> {
        Self::from_raw_buf::<Arc<OsStr>>(passwd.pw_name)
    }

    /// Userspace stand-in for `close_range` with `CLOSE_RANGE_CLOEXEC`.
    ///
    /// Marks every open descriptor numbered `preserve_fds + 3` or higher as
    /// close-on-exec. Descriptors 0, 1 and 2 plus the next `preserve_fds`
    /// descriptors are left untouched.
    ///
    /// # Errors
    ///
    /// Fails only if the list of open descriptors cannot be obtained.
    fn emulate_close_range(preserve_fds: i32) -> Result<()> {
        let open_fds = Self::get_open_fds()?;
        let first_unpreserved = preserve_fds + 3;

        for fd in open_fds {
            if fd < first_unpreserved {
                continue;
            }
            // Best effort: a failure to flag an individual descriptor is ignored.
            let _ = fcntl::fcntl(fd, fcntl::F_SETFD(fcntl::FdFlag::FD_CLOEXEC));
        }
        Ok(())
    }

    /// Lists the file descriptors currently open in this process by reading
    /// the entries of `/proc/self/fd`.
    ///
    /// Directory entries that cannot be read, have no UTF-8 file name, or
    /// whose name does not parse as an `i32` are skipped.
    ///
    /// # Errors
    ///
    /// Fails when the procfs check fails or the directory cannot be read.
    fn get_open_fds() -> Result<Vec<i32>> {
        const PROCFS_FD_PATH: &str = "/proc/self/fd";

        utils::ensure_procfs(Path::new(PROCFS_FD_PATH)).map_err(|err| {
            tracing::error!(?err, "failed to ensure /proc is mounted");
            match err {
                utils::EnsureProcfsError::Nix(err) => SyscallError::Nix(err),
                utils::EnsureProcfsError::IO(err) => SyscallError::IO(err),
            }
        })?;

        let entries = fs::read_dir(PROCFS_FD_PATH).map_err(|err| {
            tracing::error!(?err, "failed to read /proc/self/fd");
            err
        })?;

        let fds = entries
            .filter_map(|entry| entry.ok())
            .filter_map(|entry| {
                let path = entry.path();
                let name = path.file_name()?;
                name.to_str()?.parse::<i32>().ok()
            })
            .collect();
        Ok(fds)
    }
}
impl Syscall for LinuxSyscall {
/// Returns `self` as a `&dyn Any` (e.g. so a caller holding a trait object
/// can attempt a downcast to the concrete type).
fn as_any(&self) -> &dyn Any {
self
}
/// Switches the root filesystem of the calling process to `path` using
/// `pivot_root`.
///
/// Steps, in order:
/// 1. open `path` as a directory descriptor,
/// 2. `pivot_root(path, path)`,
/// 3. mark `/` recursively as a slave mount,
/// 4. lazily detach the mount at `/`,
/// 5. `fchdir` to the descriptor from step 1 and close it.
///
/// # Errors
///
/// Returns the error of the first failing step; each failure is also logged.
/// The directory descriptor is closed on every exit path, including failures
/// of steps 2 to 5.
fn pivot_rootfs(&self, path: &Path) -> Result<()> {
    let newroot = open(
        path,
        OFlag::O_DIRECTORY | OFlag::O_RDONLY | OFlag::O_CLOEXEC,
        Mode::empty(),
    )
    .map_err(|errno| {
        tracing::error!(?errno, ?path, "failed to open the new root for pivot root");
        errno
    })?;

    // Everything that runs while `newroot` is open lives in this closure, so
    // the descriptor can be released no matter which step fails.
    let switch_root = || -> Result<()> {
        // Using the same path for `new_root` and `put_old` stacks the old root
        // on top of the new one at `/` (see NOTES in pivot_root(2)), so no
        // temporary directory for the old root is required.
        pivot_root(path, path).map_err(|errno| {
            tracing::error!(?errno, ?path, "failed to pivot root to");
            errno
        })?;

        // Make the old root a recursive slave mount before unmounting it.
        mount(
            None::<&str>,
            "/",
            None::<&str>,
            MsFlags::MS_SLAVE | MsFlags::MS_REC,
            None::<&str>,
        )
        .map_err(|errno| {
            tracing::error!(?errno, "failed to make original root directory rslave");
            errno
        })?;

        // Lazily detach the old root that is stacked on `/`.
        umount2("/", MntFlags::MNT_DETACH).map_err(|errno| {
            tracing::error!(?errno, "failed to unmount old root directory");
            errno
        })?;

        fchdir(newroot).map_err(|errno| {
            tracing::error!(?errno, ?newroot, "failed to change directory to new root");
            errno
        })?;
        Ok(())
    };

    if let Err(err) = switch_root() {
        // Best-effort cleanup; the original failure is the one worth reporting.
        let _ = close(newroot);
        return Err(err);
    }

    close(newroot).map_err(|errno| {
        tracing::error!(?errno, ?newroot, "failed to close new root directory");
        errno
    })?;
    Ok(())
}
/// Moves the calling thread into the namespace referred to by `rawfd`,
/// restricted to the namespace kind(s) in `nstype`, via `setns`.
///
/// # Errors
///
/// Returns the error reported by `setns`.
fn set_ns(&self, rawfd: i32, nstype: CloneFlags) -> Result<()> {
    // SAFETY: relies on the caller passing a descriptor that stays open for
    // the duration of this call; that cannot be checked from here.
    let namespace_fd = unsafe { BorrowedFd::borrow_raw(rawfd) };
    nix::sched::setns(namespace_fd, nstype).map_err(Into::into)
}
fn set_id(&self, uid: Uid, gid: Gid) -> Result<()> {
prctl::set_keep_capabilities(true).map_err(|errno| {
tracing::error!(?errno, "failed to set keep capabilities to true");
nix::errno::Errno::from_raw(errno)
})?;
if unsafe { libc::syscall(libc::SYS_setresgid, gid, gid, gid) } == -1 {
let err = nix::errno::Errno::last();
tracing::error!(
?err,
?gid,
"failed to set real, effective and saved set gid"
);
return Err(err.into());
}
if unsafe { libc::syscall(libc::SYS_setresuid, uid, uid, uid) } == -1 {
let err = nix::errno::Errno::last();
tracing::error!(
?err,
?uid,
"failed to set real, effective and saved set uid"
);
return Err(err.into());
}
if uid != Uid::from_raw(0) {
capabilities::reset_effective(self)?;
}
prctl::set_keep_capabilities(false).map_err(|errno| {
tracing::error!(?errno, "failed to set keep capabilities to false");
nix::errno::Errno::from_raw(errno)
})?;
Ok(())
}
/// Disassociates the parts of the execution context selected by `flags`
/// by calling `unshare`.
///
/// # Errors
///
/// Returns the error reported by `unshare`.
fn unshare(&self, flags: CloneFlags) -> Result<()> {
    unshare(flags).map_err(Into::into)
}
/// Makes the capability set `cset` of the current thread equal to `value`.
///
/// The bounding set is handled separately: the current bounding set is read
/// and every capability that is not in `value` is dropped one by one. All
/// other sets are assigned directly with `caps::set`.
///
/// # Errors
///
/// Returns the first error reported by the `caps` crate.
fn set_capability(&self, cset: CapSet, value: &CapsHashSet) -> Result<()> {
    if let CapSet::Bounding = cset {
        let current = caps::read(None, CapSet::Bounding)?;
        // Whatever is currently present but not requested has to go.
        current
            .difference(value)
            .try_for_each(|cap| caps::drop(None, CapSet::Bounding, *cap))?;
    } else {
        caps::set(None, cset, value)?;
    }
    Ok(())
}
/// Sets the system hostname to `hostname` via `sethostname`.
///
/// # Errors
///
/// Returns the error reported by `sethostname`.
fn set_hostname(&self, hostname: &str) -> Result<()> {
    sethostname(hostname).map_err(Into::into)
}
fn set_domainname(&self, domainname: &str) -> Result<()> {
let ptr = domainname.as_bytes().as_ptr() as *const c_char;
let len = domainname.len();
match unsafe { setdomainname(ptr, len) } {
0 => Ok(()),
-1 => Err(nix::Error::last()),
_ => Err(nix::Error::UnknownErrno),
}?;
Ok(())
}
fn set_rlimit(&self, rlimit: &PosixRlimit) -> Result<()> {
let rlim = &libc::rlimit {
rlim_cur: rlimit.soft(),
rlim_max: rlimit.hard(),
};
#[cfg(not(target_env = "musl"))]
let res = unsafe { libc::setrlimit(rlimit.typ() as u32, rlim) };
#[cfg(target_env = "musl")]
let res = unsafe { libc::setrlimit(rlimit.typ() as i32, rlim) };
match res {
0 => Ok(()),
-1 => Err(SyscallError::Nix(nix::Error::last())),
_ => Err(SyscallError::Nix(nix::Error::UnknownErrno)),
}?;
Ok(())
}
/// Looks up the user name for `uid` with `getpwuid_r`.
///
/// The lookup starts with a 2048-byte scratch buffer and doubles it for as
/// long as `getpwuid_r` reports `ERANGE`.
///
/// Returns `None` when no entry was produced, when the returned pointer is
/// not the record passed in, or when doubling the buffer size overflows.
fn get_pwuid(&self, uid: uid_t) -> Option<Arc<OsStr>> {
    let mut entry = unsafe { mem::zeroed::<libc::passwd>() };
    let mut scratch = vec![0; 2048];
    let mut found = ptr::null_mut::<libc::passwd>();

    while unsafe {
        libc::getpwuid_r(
            uid,
            &mut entry,
            scratch.as_mut_ptr(),
            scratch.len(),
            &mut found,
        )
    } == libc::ERANGE
    {
        let doubled = scratch.len().checked_mul(2)?;
        scratch.resize(doubled, 0);
    }

    // A null result means no entry (or an error); a result pointing anywhere
    // other than `entry` is not what this function handed in.
    if found.is_null() || found != &mut entry {
        return None;
    }

    Some(unsafe { Self::passwd_to_user(found.read()) })
}
/// Changes the root directory of the calling process to `path` via `chroot`.
///
/// # Errors
///
/// Returns the error reported by `chroot`.
fn chroot(&self, path: &Path) -> Result<()> {
    chroot(path).map_err(Into::into)
}
/// Forwards its arguments unchanged to `nix::mount::mount`.
///
/// # Errors
///
/// Returns the error reported by `mount`.
fn mount(
    &self,
    source: Option<&Path>,
    target: &Path,
    fstype: Option<&str>,
    flags: MsFlags,
    data: Option<&str>,
) -> Result<()> {
    mount(source, target, fstype, flags, data).map_err(Into::into)
}
/// Creates a symbolic link at `link` pointing to `original`.
///
/// # Errors
///
/// Returns the I/O error reported by `std::os::unix::fs::symlink`.
fn symlink(&self, original: &Path, link: &Path) -> Result<()> {
    symlink(original, link).map_err(Into::into)
}
/// Creates a filesystem node at `path` of type `kind` with permissions
/// `perm` and device number `dev` via `mknod`.
///
/// # Errors
///
/// Returns the error reported by `mknod`.
fn mknod(&self, path: &Path, kind: SFlag, perm: Mode, dev: u64) -> Result<()> {
    mknod(path, kind, perm, dev).map_err(Into::into)
}
/// Changes the owner and/or group of `path` via `chown`; `owner` and `group`
/// are forwarded unchanged.
///
/// # Errors
///
/// Returns the error reported by `chown`.
fn chown(&self, path: &Path, owner: Option<Uid>, group: Option<Gid>) -> Result<()> {
    chown(path, owner, group).map_err(Into::into)
}
/// Sets the supplementary group list to `groups` by invoking the raw
/// `setgroups` system call.
///
/// # Errors
///
/// Returns the current errno when the system call returns -1; the failure
/// is also logged.
fn set_groups(&self, groups: &[Gid]) -> Result<()> {
    let count = groups.len() as libc::size_t;
    // NOTE(review): reinterprets the `Gid` slice as raw `gid_t` values, which
    // assumes `Gid` has the same layout as `gid_t`.
    let raw_groups = groups.as_ptr() as *const libc::gid_t;

    let rc = unsafe { libc::syscall(libc::SYS_setgroups, count, raw_groups) };
    if rc != -1 {
        return Ok(());
    }

    let err = nix::errno::Errno::last();
    tracing::error!(?err, ?groups, "failed to set groups");
    Err(err.into())
}
/// Marks every descriptor numbered `3 + preserve_fds` or higher as
/// close-on-exec using `close_range` with `CLOSE_RANGE_CLOEXEC`.
///
/// When the system call fails with `ENOSYS` or `EINVAL` (per close_range(2),
/// a kernel without the call or without the flag), the same effect is
/// produced in userspace by `emulate_close_range`.
///
/// # Errors
///
/// Returns any other errno from the system call, `UnknownErrno` for an
/// unexpected return value, or the error of the userspace fallback.
#[tracing::instrument(skip(self))]
fn close_range(&self, preserve_fds: i32) -> Result<()> {
    let rc = unsafe {
        libc::syscall(
            libc::SYS_close_range,
            3 + preserve_fds,
            libc::c_int::MAX,
            libc::CLOSE_RANGE_CLOEXEC,
        )
    };

    match rc {
        0 => Ok(()),
        -1 => match nix::errno::Errno::last() {
            nix::errno::Errno::ENOSYS | nix::errno::Errno::EINVAL => {
                Self::emulate_close_range(preserve_fds)
            }
            e => Err(SyscallError::Nix(e)),
        },
        _ => Err(SyscallError::Nix(nix::errno::Errno::UnknownErrno)),
    }
}
fn mount_setattr(
&self,
dirfd: RawFd,
pathname: &Path,
flags: u32,
mount_attr: &MountAttr,
size: libc::size_t,
) -> Result<()> {
let path_c_string = pathname
.to_path_buf()
.to_str()
.map(CString::new)
.ok_or_else(|| {
tracing::error!(path = ?pathname, "failed to convert path to string");
nix::Error::EINVAL
})?
.map_err(|err| {
tracing::error!(path = ?pathname, ?err, "failed to convert path to string");
nix::Error::EINVAL
})?;
match unsafe {
libc::syscall(
libc::SYS_mount_setattr,
dirfd,
path_c_string.as_ptr(),
flags,
mount_attr as *const MountAttr,
size,
)
} {
0 => Ok(()),
-1 => Err(nix::Error::last()),
_ => Err(nix::Error::UnknownErrno),
}?;
Ok(())
}
fn set_io_priority(&self, class: i64, priority: i64) -> Result<()> {
let ioprio_who_progress: libc::c_int = 1;
let ioprio_who_pid = 0;
let iop = (class << 13) | priority;
match unsafe {
libc::syscall(
libc::SYS_ioprio_set,
ioprio_who_progress,
ioprio_who_pid,
iop as libc::c_ulong,
)
} {
0 => Ok(()),
-1 => Err(nix::Error::last()),
_ => Err(nix::Error::UnknownErrno),
}?;
Ok(())
}
/// Unmounts `target` with the given `flags` via `umount2`.
///
/// # Errors
///
/// Returns the error reported by `umount2`.
fn umount2(&self, target: &Path, flags: MntFlags) -> Result<()> {
    umount2(target, flags).map_err(Into::into)
}
}
#[cfg(test)]
mod tests {
    use std::fs;
    use std::os::unix::prelude::{AsRawFd, RawFd};

    use anyhow::{bail, Context, Result};
    use nix::{fcntl, sys, unistd};
    use serial_test::serial;

    use super::LinuxSyscall;
    use crate::syscall::Syscall;

    /// Opens `/dev/null` read-write without `O_CLOEXEC` and returns the raw fd.
    fn open_dev_null() -> Result<RawFd> {
        let fd = fcntl::open("/dev/null", fcntl::OFlag::O_RDWR, sys::stat::Mode::empty())?;
        Ok(fd)
    }

    /// Fails unless the close-on-exec flag is set on `fd`.
    fn ensure_cloexec(fd: RawFd) -> Result<()> {
        let fd_flag = fcntl::fcntl(fd, fcntl::F_GETFD)?;
        if (fd_flag & fcntl::FdFlag::FD_CLOEXEC.bits()) == 0 {
            bail!("CLOEXEC flag is not set correctly");
        }
        Ok(())
    }

    /// `get_open_fds` must report a freshly opened descriptor and stdio.
    #[test]
    #[serial]
    fn test_get_open_fds() -> Result<()> {
        let file = fs::File::open("/dev/null")?;
        let fd = file.as_raw_fd();
        let open_fds = LinuxSyscall::get_open_fds()?;

        if !open_fds.contains(&fd) {
            bail!("failed to find the opened dev null fds: {:?}", open_fds);
        }

        // The file only has to stay open until the listing has been taken.
        drop(file);

        let missing_stdio = [0, 1, 2]
            .iter()
            .any(|stdio_fd| !open_fds.contains(stdio_fd));
        if missing_stdio {
            bail!("failed to find the stdio fds: {:?}", open_fds);
        }
        Ok(())
    }

    /// The userspace fallback must flag descriptors above stdio as CLOEXEC.
    #[test]
    #[serial]
    fn test_close_range_userspace() -> Result<()> {
        let fd = open_dev_null()?;
        LinuxSyscall::emulate_close_range(0).context("failed to clean up the fds")?;
        ensure_cloexec(fd)?;
        unistd::close(fd)?;
        Ok(())
    }

    /// The trait method must flag descriptors above stdio as CLOEXEC.
    #[test]
    #[serial]
    fn test_close_range_native() -> Result<()> {
        let fd = open_dev_null()?;
        let syscall = LinuxSyscall {};
        syscall
            .close_range(0)
            .context("failed to clean up the fds")?;
        ensure_cloexec(fd)?;
        unistd::close(fd)?;
        Ok(())
    }
}