use std::ffi::c_int;
use std::num::NonZeroUsize;
use libc::SIGCHLD;
use nix::sys::{mman, resource};
use nix::unistd::Pid;
/// Errors that can occur while creating a new process with `clone3` or `clone`.
#[derive(Debug, thiserror::Error)]
pub enum CloneError {
/// The `clone3` syscall or the libc `clone` call returned `-1`; wraps the
/// errno read right after the call.
#[error("failed to clone process")]
Clone(#[source] nix::Error),
/// `sysconf(PAGE_SIZE)` failed while preparing the child stack.
#[error("failed to get system memory page size")]
PageSize(#[source] nix::Error),
/// `getrlimit(RLIMIT_STACK)` failed while sizing the child stack.
#[error("failed to get resource limit")]
ResourceLimit(#[source] nix::Error),
/// The stack size derived from `RLIMIT_STACK` is zero, so no stack can be
/// mapped.
#[error("the stack size is zero")]
ZeroStackSize,
/// The anonymous `mmap` for the child stack failed.
#[error("failed to allocate stack")]
StackAllocation(#[source] nix::Error),
/// `mprotect` failed while turning the lowest page of the child stack into
/// an inaccessible guard page.
#[error("failed to create stack guard page")]
GuardPage(#[source] nix::Error),
/// The raw `clone3` syscall returned a negative value other than `-1`;
/// carries that value.
#[error("unknown error code {0}")]
UnknownErrno(i32),
}
/// Callback executed inside the newly cloned process.
///
/// Its `i32` return value is used as the status of the new process. The
/// callback is a boxed `FnMut` because it is first lent to `clone3` by mutable
/// reference and, if `clone3` is unavailable, moved into the `clone` fallback,
/// which hands it to the new process through a raw heap pointer.
pub type CloneCb<'a> = Box<dyn FnMut() -> i32 + 'a>;
/// Clones a new process as a sibling of the caller and runs `cb` in it.
///
/// The clone is requested with `CLONE_PARENT`, so the new process gets the
/// caller's parent as its own parent. No exit signal is requested: per
/// clone(2), `clone3` rejects `CLONE_PARENT` combined with an exit signal.
///
/// # Errors
///
/// Propagates any [`CloneError`] reported while cloning.
pub fn container_clone_sibling(cb: CloneCb) -> Result<Pid, CloneError> {
    let sibling_flags = libc::CLONE_PARENT as u64;
    let exit_signal = None;
    clone_internal(cb, sibling_flags, exit_signal)
}
/// Clones a child process of the caller and runs `cb` in it.
///
/// No extra clone flags are set, and `SIGCHLD` is requested as the exit
/// signal of the new process.
///
/// # Errors
///
/// Propagates any [`CloneError`] reported while cloning.
pub fn container_clone(cb: CloneCb) -> Result<Pid, CloneError> {
    const NO_FLAGS: u64 = 0;
    let exit_signal = Some(SIGCHLD as u64);
    clone_internal(cb, NO_FLAGS, exit_signal)
}
fn clone_internal(
mut cb: CloneCb,
flags: u64,
exit_signal: Option<u64>,
) -> Result<Pid, CloneError> {
match clone3(&mut cb, flags, exit_signal) {
Ok(pid) => Ok(pid),
Err(CloneError::Clone(nix::Error::ENOSYS)) => {
tracing::debug!("clone3 is not supported, fallback to clone");
let pid = clone(cb, flags, exit_signal)?;
Ok(pid)
}
Err(err) => Err(err),
}
}
fn clone3(cb: &mut CloneCb, flags: u64, exit_signal: Option<u64>) -> Result<Pid, CloneError> {
#[repr(C)]
struct clone3_args {
flags: u64,
pidfd: u64,
child_tid: u64,
parent_tid: u64,
exit_signal: u64,
stack: u64,
stack_size: u64,
tls: u64,
set_tid: u64,
set_tid_size: u64,
cgroup: u64,
}
let mut args = clone3_args {
flags,
pidfd: 0,
child_tid: 0,
parent_tid: 0,
exit_signal: exit_signal.unwrap_or(0),
stack: 0,
stack_size: 0,
tls: 0,
set_tid: 0,
set_tid_size: 0,
cgroup: 0,
};
let args_ptr = &mut args as *mut clone3_args;
let args_size = std::mem::size_of::<clone3_args>();
match unsafe { libc::syscall(libc::SYS_clone3, args_ptr, args_size) } {
-1 => Err(CloneError::Clone(nix::Error::last())),
0 => {
std::process::exit(cb());
}
ret if ret >= 0 => Ok(Pid::from_raw(ret as i32)),
ret => Err(CloneError::UnknownErrno(ret as i32)),
}
}
fn clone(cb: CloneCb, flags: u64, exit_signal: Option<u64>) -> Result<Pid, CloneError> {
const DEFAULT_STACK_SIZE: usize = 8 * 1024 * 1024; const DEFAULT_PAGE_SIZE: usize = 4 * 1024;
let page_size = nix::unistd::sysconf(nix::unistd::SysconfVar::PAGE_SIZE)
.map_err(CloneError::PageSize)?
.map(|size| size as usize)
.unwrap_or(DEFAULT_PAGE_SIZE);
let (rlim_cur, _) =
resource::getrlimit(resource::Resource::RLIMIT_STACK).map_err(CloneError::ResourceLimit)?;
let default_stack_size = if rlim_cur != u64::MAX {
rlim_cur as usize
} else {
tracing::debug!(
"stack size returned by getrlimit() is unlimited, use DEFAULT_STACK_SIZE(8MB)"
);
DEFAULT_STACK_SIZE
};
let child_stack = unsafe {
mman::mmap_anonymous(
None,
NonZeroUsize::new(default_stack_size).ok_or(CloneError::ZeroStackSize)?,
mman::ProtFlags::PROT_READ | mman::ProtFlags::PROT_WRITE,
mman::MapFlags::MAP_PRIVATE | mman::MapFlags::MAP_STACK,
)
.map_err(CloneError::StackAllocation)?
};
unsafe {
mman::mprotect(child_stack, page_size, mman::ProtFlags::PROT_NONE)
.map_err(CloneError::GuardPage)?;
};
let child_stack_top = unsafe { child_stack.as_ptr().add(default_stack_size) };
let combined_flags = (flags | exit_signal.unwrap_or(0)) as c_int;
let data = Box::into_raw(Box::new(cb));
extern "C" fn main(data: *mut libc::c_void) -> libc::c_int {
unsafe { Box::from_raw(data as *mut CloneCb)() }
}
let ret = unsafe {
libc::clone(
main,
child_stack_top,
combined_flags,
data as *mut libc::c_void,
)
};
unsafe { drop(Box::from_raw(data)) };
match ret {
-1 => Err(CloneError::Clone(nix::Error::last())),
pid if ret > 0 => Ok(Pid::from_raw(pid)),
_ => unreachable!("clone returned a negative pid {ret}"),
}
}
// Tests for the process cloning helpers. They create real processes and reap
// them with `waitpid`.
#[cfg(test)]
mod test {
use anyhow::{Context, Result, bail};
use nix::sys::wait::{WaitStatus, waitpid};
use nix::unistd;
use super::*;
use crate::channel::channel;
// A cloned child whose callback returns 0 must be reported by `waitpid` as
// exited with status 0.
#[test]
fn test_container_fork() -> Result<()> {
let pid = container_clone(Box::new(|| 0))?;
match waitpid(pid, None).expect("wait pid failed.") {
WaitStatus::Exited(p, status) => {
assert_eq!(pid, p);
assert_eq!(status, 0);
Ok(())
}
_ => bail!("test failed"),
}
}
// A callback returning -1 is expected to show up as exit status 255 in the
// wait status.
#[test]
fn test_container_err_fork() -> Result<()> {
let pid = container_clone(Box::new(|| -1))?;
match waitpid(pid, None).expect("wait pid failed.") {
WaitStatus::Exited(p, status) => {
assert_eq!(pid, p);
assert_eq!(status, 255);
Ok(())
}
_ => bail!("test failed"),
}
}
// Forks a helper process that calls `container_clone_sibling` and sends the
// resulting pid back over a channel. The test process then waits on that pid
// directly, which only works if the clone really became its own child (a
// sibling of the helper), and finally reaps the helper itself.
#[test]
fn test_container_clone_sibling() -> Result<()> {
let (sender, receiver) = &mut channel::<i32>()?;
match unsafe { unistd::fork()? } {
unistd::ForkResult::Parent { child } => {
// Pid of the sibling process, as reported by the forked helper.
let sibling_process_pid = Pid::from_raw(
receiver
.recv()
.context("failed to receive the sibling pid from forked process")?,
);
receiver.close()?;
match waitpid(sibling_process_pid, None).expect("wait pid failed.") {
WaitStatus::Exited(p, status) => {
assert_eq!(sibling_process_pid, p);
assert_eq!(status, 0);
}
_ => bail!("failed to wait on the sibling process"),
}
match waitpid(child, None).expect("wait pid failed.") {
WaitStatus::Exited(p, status) => {
assert_eq!(child, p);
assert_eq!(status, 0);
}
_ => bail!("failed to wait on the forked process"),
}
}
unistd::ForkResult::Child => {
// Forked helper: create the sibling, report its pid, then exit so the
// rest of the test body does not run in this process.
let pid = container_clone_sibling(Box::new(|| 0))?;
sender.send(pid.as_raw())?;
sender.close()?;
std::process::exit(0);
}
};
Ok(())
}
// Checks the fallback to `clone`: a seccomp profile makes `clone3` fail with
// ENOSYS, after which `container_clone` must still produce a child that exits
// with status 0.
#[cfg(feature = "libseccomp")]
#[test]
fn test_clone_fallback() -> Result<()> {
use oci_spec::runtime::{
Arch, LinuxSeccompAction, LinuxSeccompBuilder, LinuxSyscallBuilder,
};
use crate::test_utils::TestCallbackError;
// Probes `clone3` with null arguments, which must fail. Returns true when
// the failure is anything other than ENOSYS, i.e. the syscall is reachable.
fn has_clone3() -> bool {
let res = unsafe { libc::syscall(libc::SYS_clone3, 0, 0) };
let err = (res == -1)
.then(std::io::Error::last_os_error)
.expect("probe syscall should not succeed");
err.raw_os_error() != Some(libc::ENOSYS)
}
// Seccomp rule: `clone3` returns ENOSYS; every other syscall is allowed.
let syscall = LinuxSyscallBuilder::default()
.names(vec![String::from("clone3")])
.action(LinuxSeccompAction::ScmpActErrno)
.errno_ret(libc::ENOSYS as u32)
.build()?;
let seccomp_profile = LinuxSeccompBuilder::default()
.default_action(LinuxSeccompAction::ScmpActAllow)
.architectures(vec![Arch::ScmpArchNative])
.syscalls(vec![syscall])
.build()?;
// The filter is installed inside a separate child process (via the project
// helper `test_in_child_process`).
crate::test_utils::test_in_child_process(|| {
// Best effort: the result of setting no_new_privs is deliberately ignored.
let _ = prctl::set_no_new_privileges(true);
crate::seccomp::initialize_seccomp(&seccomp_profile)
.expect("failed to initialize seccomp");
// Sanity check that the filter really hides `clone3` before testing the
// fallback.
if has_clone3() {
return Err(TestCallbackError::Custom(
"clone3 is not blocked by seccomp".into(),
));
}
let pid = container_clone(Box::new(|| 0)).map_err(|err| err.to_string())?;
match waitpid(pid, None).expect("wait pid failed.") {
WaitStatus::Exited(p, status) => {
assert_eq!(pid, p);
assert_eq!(status, 0);
}
status => {
return Err(TestCallbackError::Custom(format!(
"failed to wait on child process: {:?}",
status
)));
}
};
Ok(())
})?;
Ok(())
}
}