use std::collections::HashMap;
use std::fs;
use std::fs::File;
use std::os::fd::AsRawFd;
use std::path::PathBuf;
use nix::sys::wait::{WaitStatus, waitpid};
use nix::unistd::Pid;
use oci_spec::runtime::{Linux, LinuxNamespaceType};
#[cfg(feature = "libseccomp")]
use oci_spec::runtime::{SECCOMP_FD_NAME, VERSION as OCI_VERSION};
use crate::hooks;
use crate::network::network_device::dev_change_net_namespace;
use crate::process::args::{ContainerArgs, ContainerType};
use crate::process::fork::{self, CloneCb};
use crate::process::intel_rdt::setup_intel_rdt;
use crate::process::{channel, container_intermediate_process};
use crate::syscall::SyscallError;
use crate::user_ns::UserNamespaceConfig;
/// Errors produced while the main process drives the creation of a container.
#[derive(Debug, thiserror::Error)]
pub enum ProcessError {
    /// An error raised by one of the inter-process channels; displayed transparently.
    #[error(transparent)]
    Channel(#[from] channel::ChannelError),

    /// Writing `deny` to the `setgroups` file of the target process failed.
    #[error("failed to write deny to setgroups")]
    SetGroupsDeny(#[source] std::io::Error),

    /// An error raised by the user namespace helpers; displayed transparently.
    #[error(transparent)]
    UserNamespace(#[from] crate::user_ns::UserNamespaceError),

    /// An operation needed the container (and its state) but none was supplied.
    #[error("container state is required")]
    ContainerStateRequired,

    /// `waitpid` on the intermediate process returned an unexpected error.
    #[error("failed to wait for intermediate process")]
    WaitIntermediateProcess(#[source] nix::Error),

    /// An error raised while setting up Intel RDT; displayed transparently.
    #[error(transparent)]
    IntelRdt(#[from] crate::process::intel_rdt::IntelRdtError),

    /// Cloning the intermediate process failed.
    #[error("failed to create intermediate process")]
    IntermediateProcessFailed(#[source] fork::CloneError),

    /// An error raised by the seccomp listener (only with the `libseccomp` feature).
    #[cfg(feature = "libseccomp")]
    #[error("failed seccomp listener")]
    SeccompListener(#[from] crate::process::seccomp_listener::SeccompListenerError),

    /// An error raised while setting up network devices.
    #[error("failed setup network device")]
    Network(#[from] crate::network::NetworkError),

    /// A failed syscall that has no more specific variant.
    #[error("failed syscall")]
    SyscallOther(#[source] SyscallError),

    /// An error raised while running lifecycle hooks.
    #[error("failed hooks {0}")]
    Hooks(#[from] crate::hooks::HookError),

    /// Building an OCI state object failed; carries the builder's error message.
    #[error("failed to build OCI state: {0}")]
    OciStateBuild(String),
}
/// Result alias used by this module, with [`ProcessError`] as the error type.
type Result<T> = std::result::Result<T, ProcessError>;
/// Drives container creation from the main (calling) process.
///
/// The main process clones an intermediate process and then coordinates with
/// it, and with the init process whose pid it receives, over three channels
/// (`main`, `intermediate` and `init`). The steps run in this order:
///
/// 1. Clone the intermediate process (as a sibling when
///    `container_args.as_sibling` is set); it runs
///    `container_intermediate_process`.
/// 2. If a user namespace configuration is present, wait for the mapping
///    request, write the ID mappings for the intermediate pid and acknowledge.
/// 3. Receive the init pid, then handle Intel RDT, the optional pid file, the
///    `prestart`/`createRuntime` hooks, network devices and (with the
///    `libseccomp` feature) the seccomp listener.
/// 4. Wait for the init process to report ready, close the remaining channel
///    ends and reap the intermediate process.
///
/// # Returns
///
/// The pid of the init process, and the flag returned by `setup_intel_rdt`
/// (`false` when the spec has no Intel RDT section), which is stored here as
/// `need_to_clean_up_intel_rdt_subdirectory`.
///
/// # Errors
///
/// Returns a [`ProcessError`] when a channel operation, the clone, the ID
/// mapping, Intel RDT setup, a hook, network device setup, the seccomp
/// listener, or waiting for the intermediate process fails. A failure to
/// write the pid file is only logged as a warning.
pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bool)> {
    let (mut main_sender, mut main_receiver) = channel::main_channel()?;
    let mut inter_chan = channel::intermediate_channel()?;
    let mut init_chan = channel::init_channel()?;

    // Entry point of the intermediate process; its return value is the exit code.
    let cb: CloneCb = Box::new(|| {
        if let Err(ret) = prctl::set_name("youki:[1:INTER]") {
            tracing::error!(?ret, "failed to set name for child process");
            return ret;
        }

        match container_intermediate_process::container_intermediate_process(
            container_args,
            &mut inter_chan,
            &mut init_chan,
            &mut main_sender,
        ) {
            Ok(_) => 0,
            Err(err) => {
                tracing::error!("failed to run intermediate process {}", err);
                // Report the failure to the main process over the main channel.
                if let Err(e) = main_sender.send_error(err.to_string()) {
                    tracing::error!(
                        "error in sending intermediate error message {} to main: {}",
                        err,
                        e
                    )
                }
                -1
            }
        }
    });

    let container_clone_fn = if container_args.as_sibling {
        fork::container_clone_sibling
    } else {
        fork::container_clone
    };

    let intermediate_pid = container_clone_fn(cb).map_err(|err| {
        tracing::error!("failed to fork intermediate process: {}", err);
        ProcessError::IntermediateProcessFailed(err)
    })?;

    // The main process only receives on the main channel, so drop its sender.
    main_sender.close().map_err(|err| {
        tracing::error!("failed to close unused sender: {}", err);
        err
    })?;

    let (mut inter_sender, inter_receiver) = inter_chan;
    let (mut init_sender, init_receiver) = init_chan;

    // With a user namespace configured, the mappings are written from here on request.
    if let Some(config) = &container_args.user_ns_config {
        main_receiver.wait_for_mapping_request()?;
        setup_mapping(config, intermediate_pid)?;
        inter_sender.mapping_written()?;
    }

    // Nothing else is sent to the intermediate process after this point.
    inter_sender.close().map_err(|err| {
        tracing::error!("failed to close unused intermediate sender: {}", err);
        err
    })?;

    let init_pid = main_receiver.wait_for_intermediate_ready()?;

    let need_to_clean_up_intel_rdt_subdirectory = configure_intel_rdt(container_args, init_pid)?;

    // Writing the pid file is best effort: a failure is logged, not returned.
    if let Some(pid_file) = &container_args.pid_file {
        if let Err(err) = fs::write(pid_file, format!("{init_pid}")) {
            tracing::warn!("failed to write pid to file: {err}");
        }
    }

    run_create_hooks(
        container_args,
        init_pid,
        &mut main_receiver,
        &mut init_sender,
    )?;

    if let Some(linux) = container_args.spec.linux() {
        move_network_devices_to_container(linux, init_pid, &mut main_receiver, &mut init_sender)?;

        #[cfg(feature = "libseccomp")]
        if let Some(seccomp) = linux.seccomp() {
            sync_seccomp_listener(
                container_args,
                seccomp,
                init_pid,
                &mut init_sender,
                &mut main_receiver,
            )?;
        }
    }

    // Nothing else is sent to the init process after this point.
    init_sender.close().map_err(|err| {
        tracing::error!("failed to close unused init sender: {}", err);
        err
    })?;

    main_receiver.wait_for_init_ready().map_err(|err| {
        tracing::error!("failed to wait for init ready: {}", err);
        err
    })?;

    tracing::debug!("init pid is {:?}", init_pid);

    inter_receiver.close().map_err(|err| {
        tracing::error!("failed to close intermediate process receiver: {}", err);
        err
    })?;

    init_receiver.close().map_err(|err| {
        tracing::error!("failed to close init process receiver: {}", err);
        err
    })?;

    main_receiver.close().map_err(|err| {
        tracing::error!("failed to close main process receiver: {}", err);
        err
    })?;

    reap_intermediate_process(intermediate_pid)?;

    Ok((init_pid, need_to_clean_up_intel_rdt_subdirectory))
}

/// Applies the Intel RDT section of the spec, if any, to `init_pid`.
///
/// Returns the flag produced by `setup_intel_rdt`, or `false` when the spec
/// has no `linux` or no Intel RDT section.
fn configure_intel_rdt(container_args: &ContainerArgs, init_pid: Pid) -> Result<bool> {
    let Some(linux) = container_args.spec.linux() else {
        return Ok(false);
    };
    let Some(intel_rdt) = linux.intel_rdt() else {
        return Ok(false);
    };

    let container_id = container_args
        .container
        .as_ref()
        .map(|container| container.id());

    Ok(setup_intel_rdt(container_id, &init_pid, intel_rdt)?)
}

/// Runs the `prestart` and `createRuntime` hooks for an init container.
///
/// Does nothing unless the container is an init container and the spec
/// declares hooks. Otherwise it waits for the hook request on the main
/// channel, runs both hook lists (only when a container is available to
/// provide the state) and then signals `hook_done` on the init channel.
fn run_create_hooks(
    container_args: &ContainerArgs,
    init_pid: Pid,
    main_receiver: &mut channel::MainReceiver,
    init_sender: &mut channel::InitSender,
) -> Result<()> {
    if !matches!(container_args.container_type, ContainerType::InitContainer) {
        return Ok(());
    }
    let Some(spec_hooks) = container_args.spec.hooks() else {
        return Ok(());
    };

    main_receiver.wait_for_hook_request()?;

    if let Some(container_for_hooks) = &container_args.container {
        hooks::run_hooks(
            spec_hooks.prestart().as_ref(),
            Some(&container_for_hooks.state),
            None,
            Some(init_pid),
        )
        .map_err(|err| {
            tracing::error!("failed to run prestart hooks: {}", err);
            err
        })?;

        hooks::run_hooks(
            spec_hooks.create_runtime().as_ref(),
            Some(&container_for_hooks.state),
            None,
            Some(init_pid),
        )
        .map_err(|err| {
            tracing::error!("failed to run create runtime hooks: {}", err);
            err
        })?;
    }

    // Acknowledge the request even when no container state was available.
    init_sender.hook_done()?;
    Ok(())
}

/// Builds the OCI container process state and passes it to the seccomp listener sync.
///
/// The reported container status is `Creating` for an init container and
/// `Running` for a tenant container.
///
/// # Errors
///
/// Returns [`ProcessError::ContainerStateRequired`] when no container is
/// available, [`ProcessError::OciStateBuild`] when a state builder fails, and
/// the listener's error when the sync itself fails.
#[cfg(feature = "libseccomp")]
fn sync_seccomp_listener(
    container_args: &ContainerArgs,
    seccomp: &oci_spec::runtime::LinuxSeccomp,
    init_pid: Pid,
    init_sender: &mut channel::InitSender,
    main_receiver: &mut channel::MainReceiver,
) -> Result<()> {
    let container = container_args
        .container
        .as_ref()
        .ok_or(ProcessError::ContainerStateRequired)?;

    let oci_status = match container_args.container_type {
        ContainerType::InitContainer => oci_spec::runtime::ContainerState::Creating,
        ContainerType::TenantContainer { .. } => oci_spec::runtime::ContainerState::Running,
    };

    let oci_state = oci_spec::runtime::StateBuilder::default()
        .version(OCI_VERSION)
        .id(container.state.id.clone())
        .status(oci_status)
        .pid(init_pid.as_raw())
        .bundle(container.state.bundle.clone())
        .annotations(container.state.annotations.clone().unwrap_or_default())
        .build()
        .map_err(|e| ProcessError::OciStateBuild(e.to_string()))?;

    let state = oci_spec::runtime::ContainerProcessStateBuilder::default()
        .version(OCI_VERSION)
        .fds(vec![SECCOMP_FD_NAME.to_string()])
        .pid(init_pid.as_raw())
        .metadata(seccomp.listener_metadata().clone().unwrap_or_default())
        .state(oci_state)
        .build()
        .map_err(|e| ProcessError::OciStateBuild(e.to_string()))?;

    crate::process::seccomp_listener::sync_seccomp(seccomp, &state, init_sender, main_receiver)?;
    Ok(())
}

/// Waits for the intermediate process and logs how it terminated.
///
/// A non-zero exit status, termination by signal, and `ECHILD` (the process
/// was already reaped) are logged as warnings and are not treated as errors.
///
/// # Errors
///
/// Returns [`ProcessError::WaitIntermediateProcess`] for any other `waitpid` error.
fn reap_intermediate_process(intermediate_pid: Pid) -> Result<()> {
    match waitpid(intermediate_pid, None) {
        Ok(WaitStatus::Exited(_, 0)) => (),
        Ok(WaitStatus::Exited(_, s)) => {
            tracing::warn!("intermediate process failed with exit status: {s}");
        }
        Ok(WaitStatus::Signaled(_, sig, _)) => {
            tracing::warn!("intermediate process killed with signal: {sig}")
        }
        Ok(_) => (),
        Err(nix::errno::Errno::ECHILD) => {
            tracing::warn!("intermediate process already reaped");
        }
        Err(err) => return Err(ProcessError::WaitIntermediateProcess(err)),
    };
    Ok(())
}
fn setup_mapping(config: &UserNamespaceConfig, pid: Pid) -> Result<()> {
tracing::debug!("write mapping for pid {:?}", pid);
if !config.privileged {
std::fs::write(format!("/proc/{pid}/setgroups"), "deny")
.map_err(ProcessError::SetGroupsDeny)?;
}
config.write_uid_mapping(pid).map_err(|err| {
tracing::error!("failed to write uid mapping for pid {:?}: {}", pid, err);
err
})?;
config.write_gid_mapping(pid).map_err(|err| {
tracing::error!("failed to write gid mapping for pid {:?}: {}", pid, err);
err
})?;
Ok(())
}
/// Moves the network devices listed in the spec into the container's network namespace.
///
/// Returns early with `Ok(())` when the spec lists no network devices, has no
/// namespaces, or has no network namespace entry. Otherwise it waits for the
/// network-setup-ready message, opens the namespace (the configured path, or
/// `/proc/<init_pid>/ns/net` when none is configured), moves every device
/// with `dev_change_net_namespace`, and sends the resulting per-device
/// addresses to the init process.
///
/// # Errors
///
/// Returns a [`ProcessError`] when a channel operation fails, the namespace
/// file cannot be opened, or moving a device fails. The first failing device
/// aborts the remaining ones.
fn move_network_devices_to_container(
    linux: &Linux,
    init_pid: Pid,
    main_receiver: &mut channel::MainReceiver,
    init_sender: &mut channel::InitSender,
) -> Result<()> {
    let Some(devices) = linux
        .net_devices()
        .as_ref()
        .filter(|devs| !devs.is_empty())
    else {
        return Ok(());
    };

    let Some(namespaces) = linux.namespaces() else {
        return Ok(());
    };

    let Some(net_ns) = namespaces
        .iter()
        .find(|ns| ns.typ() == LinuxNamespaceType::Network)
    else {
        return Ok(());
    };

    main_receiver.wait_for_network_setup_ready()?;

    // Use the init process' own namespace when the spec gives no explicit path.
    let default_ns_path = PathBuf::from(format!("/proc/{}/ns/net", init_pid.as_raw()));
    let ns_path = net_ns.path().as_deref().unwrap_or(&default_ns_path);

    // `netns_file` must stay alive while its raw fd is in use below.
    let netns_file = match File::open(ns_path) {
        Ok(file) => file,
        Err(err) => {
            tracing::error!(
                "failed to open network namespace at {}: {}",
                ns_path.display(),
                err
            );
            return Err(ProcessError::Network(err.into()));
        }
    };
    let netns_fd = netns_file.as_raw_fd();

    let mut addrs_map: HashMap<String, Vec<crate::network::cidr::CidrAddress>> = HashMap::new();
    for (name, net_dev) in devices.iter() {
        let addrs = dev_change_net_namespace(name, netns_fd, net_dev).map_err(|err| {
            tracing::error!("failed to dev_change_net_namespace: {}", err);
            err
        })?;
        addrs_map.insert(name.clone(), addrs);
    }

    init_sender.move_network_device(addrs_map)?;
    Ok(())
}
#[cfg(test)]
mod tests {
    use std::fs;

    use anyhow::Result;
    use nix::sched::{CloneFlags, unshare};
    use nix::sys::wait::{WaitStatus, waitpid};
    use nix::unistd::{self, getgid, getuid};
    use oci_spec::runtime::{LinuxIdMapping, LinuxIdMappingBuilder};
    use serial_test::serial;

    use super::*;
    use crate::process::channel::{intermediate_channel, main_channel};
    use crate::user_ns::UserNamespaceIDMapper;

    /// Asserts that `line` starts with the `container_id host_id size` fields of `mapping`.
    ///
    /// The fields are compared as a whole, so an empty or truncated mapping
    /// file fails the assertion instead of being skipped.
    fn assert_mapping_written(line: &str, mapping: &LinuxIdMapping) {
        let expected = vec![
            mapping.container_id().to_string(),
            mapping.host_id().to_string(),
            mapping.size().to_string(),
        ];
        let actual: Vec<&str> = line.split_whitespace().take(expected.len()).collect();
        assert_eq!(
            actual, expected,
            "unexpected mapping file content: {line:?}"
        );
    }

    /// `setup_mapping` writes the uid mapping for a forked child in a new user namespace.
    #[test]
    #[serial]
    fn setup_uid_mapping_should_succeed() -> Result<()> {
        let uid_mapping = LinuxIdMappingBuilder::default()
            .host_id(getuid())
            .container_id(0u32)
            .size(1u32)
            .build()?;
        let uid_mappings = vec![uid_mapping];
        let tmp = tempfile::tempdir()?;
        let id_mapper = UserNamespaceIDMapper::new_test(tmp.path().to_path_buf());
        let ns_config = UserNamespaceConfig {
            uid_mappings: Some(uid_mappings),
            privileged: true,
            id_mapper: id_mapper.clone(),
            ..Default::default()
        };

        let (mut parent_sender, mut parent_receiver) = main_channel()?;
        let (mut child_sender, mut child_receiver) = intermediate_channel()?;
        match unsafe { unistd::fork()? } {
            unistd::ForkResult::Parent { child } => {
                // Close the parent's copies of the ends only the child uses.
                parent_sender.close()?;
                child_receiver.close()?;

                parent_receiver.wait_for_mapping_request()?;
                parent_receiver.close()?;

                id_mapper.ensure_uid_path(&child)?;
                setup_mapping(&ns_config, child)?;

                let line = fs::read_to_string(id_mapper.get_uid_path(&child))?;
                assert_mapping_written(&line, &uid_mapping);

                child_sender.mapping_written()?;
                child_sender.close()?;

                // Reap the child and make sure its side of the handshake succeeded.
                assert_eq!(waitpid(child, None)?, WaitStatus::Exited(child, 0));
            }
            unistd::ForkResult::Child => {
                // The child must never return into the forked copy of the test
                // harness, so its outcome is reported through the exit status.
                let mut run_child = || -> Result<()> {
                    prctl::set_dumpable(true).unwrap();
                    unshare(CloneFlags::CLONE_NEWUSER)?;
                    parent_sender.identifier_mapping_request()?;
                    parent_sender.close()?;
                    child_receiver.wait_for_mapping_ack()?;
                    child_receiver.close()?;
                    Ok(())
                };
                let exit_code = if run_child().is_ok() { 0 } else { 1 };
                std::process::exit(exit_code);
            }
        }
        Ok(())
    }

    /// `setup_mapping` writes the gid mapping and denies `setgroups` for an unprivileged config.
    #[test]
    #[serial]
    fn setup_gid_mapping_should_succeed() -> Result<()> {
        let gid_mapping = LinuxIdMappingBuilder::default()
            .host_id(getgid())
            .container_id(0u32)
            .size(1u32)
            .build()?;
        let gid_mappings = vec![gid_mapping];
        let tmp = tempfile::tempdir()?;
        let id_mapper = UserNamespaceIDMapper::new_test(tmp.path().to_path_buf());
        let ns_config = UserNamespaceConfig {
            gid_mappings: Some(gid_mappings),
            id_mapper: id_mapper.clone(),
            ..Default::default()
        };

        let (mut parent_sender, mut parent_receiver) = main_channel()?;
        let (mut child_sender, mut child_receiver) = intermediate_channel()?;
        match unsafe { unistd::fork()? } {
            unistd::ForkResult::Parent { child } => {
                // Close the parent's copies of the ends only the child uses.
                parent_sender.close()?;
                child_receiver.close()?;

                parent_receiver.wait_for_mapping_request()?;
                parent_receiver.close()?;

                id_mapper.ensure_gid_path(&child)?;
                setup_mapping(&ns_config, child)?;

                let line = fs::read_to_string(id_mapper.get_gid_path(&child))?;
                assert_mapping_written(&line, &gid_mapping);
                assert_eq!(
                    fs::read_to_string(format!("/proc/{}/setgroups", child.as_raw()))?,
                    "deny\n",
                );

                child_sender.mapping_written()?;
                child_sender.close()?;

                // Reap the child and make sure its side of the handshake succeeded.
                assert_eq!(waitpid(child, None)?, WaitStatus::Exited(child, 0));
            }
            unistd::ForkResult::Child => {
                // The child must never return into the forked copy of the test
                // harness, so its outcome is reported through the exit status.
                let mut run_child = || -> Result<()> {
                    prctl::set_dumpable(true).unwrap();
                    unshare(CloneFlags::CLONE_NEWUSER)?;
                    parent_sender.identifier_mapping_request()?;
                    parent_sender.close()?;
                    child_receiver.wait_for_mapping_ack()?;
                    child_receiver.close()?;
                    Ok(())
                };
                let exit_code = if run_child().is_ok() { 0 } else { 1 };
                std::process::exit(exit_code);
            }
        }
        Ok(())
    }
}