use super::{check_liveness, parse_user, read_state, ContainerStatus};
use pelagos::container::{Command, Namespace, Stdio};
use std::os::unix::io::AsRawFd;
use std::path::PathBuf;
use std::sync::{
atomic::{AtomicI32, Ordering},
Arc,
};
// Command-line arguments for the `exec` subcommand, consumed by `cmd_exec`.
//
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments into user-visible help text, which would change `--help` output.
#[derive(Debug, clap::Args)]
pub struct ExecArgs {
// Name of the container; used to look up its persisted state.
pub name: String,
// When set, the command is started through `spawn_interactive()`;
// otherwise it is spawned with inherited stdin/stdout/stderr.
#[clap(long, short = 'i')]
pub interactive: bool,
// Extra environment entries. `KEY=VALUE` sets the variable directly; a bare
// `KEY` copies the value from this process's environment if it is set there
// and is otherwise ignored.
#[clap(long = "env", short = 'e')]
pub env: Vec<String>,
// Working directory for the command inside the container. When the
// container has its own mount namespace the default is "/".
#[clap(long = "workdir", short = 'w')]
pub workdir: Option<String>,
// User specification handed to `parse_user`, which yields a UID and an
// optional GID to run the command as.
#[clap(long = "user", short = 'u')]
pub user: Option<String>,
// The command to run: first element is the executable, the rest are its
// arguments. At least one value is required; hyphen-prefixed values are
// accepted so flags can be passed through to the command.
#[clap(multiple_values = true, required = true, allow_hyphen_values = true)]
pub args: Vec<String>,
}
pub fn cmd_exec(args: ExecArgs) -> Result<(), Box<dyn std::error::Error>> {
let state = {
let deadline = std::time::Instant::now() + std::time::Duration::from_secs(2);
let mut s = read_state(&args.name)
.map_err(|e| format!("container '{}' not found: {}", args.name, e))?;
while s.pid == 0
&& s.status == ContainerStatus::Running
&& check_liveness(s.watcher_pid)
&& std::time::Instant::now() < deadline
{
std::thread::sleep(std::time::Duration::from_millis(50));
s = read_state(&args.name)
.map_err(|e| format!("container '{}' not found: {}", args.name, e))?;
}
s
};
if state.status != ContainerStatus::Running || !check_liveness(state.pid) {
return Err(format!("container '{}' is not running", args.name).into());
}
let pid = state.pid;
let ns_entries = discover_namespaces(pid)?;
let environ_pid = {
let children_path = format!("/proc/{}/task/{}/children", pid, pid);
std::fs::read_to_string(&children_path)
.ok()
.and_then(|s| s.split_whitespace().next()?.parse::<i32>().ok())
.unwrap_or(pid)
};
let container_env = read_proc_environ(environ_pid);
let exe = &args.args[0];
let rest = &args.args[1..];
let mut cmd = Command::new(exe).args(rest);
let is_rootless = unsafe { libc::getuid() } != 0;
let has_user_ns = ns_entries.iter().any(|(_, ns)| *ns == Namespace::USER);
let mut has_mount_ns = false;
let mut user_ns_path: Option<PathBuf> = None;
let mut late_ns_paths: Vec<(PathBuf, Namespace)> = Vec::new();
for (path, ns) in &ns_entries {
match *ns {
Namespace::MOUNT => has_mount_ns = true,
Namespace::USER => {
user_ns_path = Some(path.clone());
}
Namespace::PID => {
log::debug!("exec: skipping PID namespace join (host PID namespace limitation)");
}
_ if is_rootless && has_user_ns => {
late_ns_paths.push((path.clone(), *ns));
}
_ => {
cmd = cmd.with_namespace_join(path, *ns);
}
}
}
if user_ns_path.is_some() {
cmd = cmd.skip_rootless_user_ns();
}
let exec_workdir = args.workdir.clone();
if has_mount_ns {
let mnt_ns_path = format!("/proc/{}/ns/mnt", pid);
let mnt_ns_file = std::fs::File::open(&mnt_ns_path)
.map_err(|e| format!("open {}: {}", mnt_ns_path, e))?;
let mnt_ns_fd = mnt_ns_file.as_raw_fd();
let user_ns_file = user_ns_path
.as_ref()
.map(|p| std::fs::File::open(p).map_err(|e| format!("open {:?}: {}", p, e)))
.transpose()?;
let user_ns_fd = user_ns_file.as_ref().map(|f| f.as_raw_fd());
let late_ns_files: Vec<(std::fs::File, Namespace)> = late_ns_paths
.iter()
.map(|(p, ns)| {
std::fs::File::open(p)
.map(|f| (f, *ns))
.map_err(|e| format!("open {:?}: {}", p, e))
})
.collect::<Result<Vec<_>, _>>()?;
let late_ns_fds: Vec<(i32, Namespace)> = late_ns_files
.iter()
.map(|(f, ns)| (f.as_raw_fd(), *ns))
.collect();
let root_pid = find_root_pid(pid);
let root_path = format!("/proc/{}/root", root_pid);
let root_file =
std::fs::File::open(&root_path).map_err(|e| format!("open {}: {}", root_path, e))?;
let root_fd = root_file.as_raw_fd();
cmd = cmd.with_pre_exec(move || {
let _keep_mnt = &mnt_ns_file;
let _keep_root = &root_file;
let _keep_user = &user_ns_file;
let _keep_late = &late_ns_files;
unsafe {
if let Some(user_fd) = user_ns_fd {
if libc::setns(user_fd, libc::CLONE_NEWUSER) != 0 {
return Err(std::io::Error::last_os_error());
}
}
if libc::setns(mnt_ns_fd, libc::CLONE_NEWNS) != 0 {
return Err(std::io::Error::last_os_error());
}
if libc::fchdir(root_fd) != 0 {
return Err(std::io::Error::last_os_error());
}
let dot = std::ffi::CString::new(".").unwrap();
if libc::chroot(dot.as_ptr()) != 0 {
return Err(std::io::Error::last_os_error());
}
let target = exec_workdir.as_deref().unwrap_or("/");
let target_c = std::ffi::CString::new(target).unwrap();
if libc::chdir(target_c.as_ptr()) != 0 {
return Err(std::io::Error::last_os_error());
}
for (fd, _ns) in &late_ns_fds {
if libc::setns(*fd, 0) != 0 {
return Err(std::io::Error::last_os_error());
}
}
}
Ok(())
});
} else {
let root_pid = find_root_pid(pid);
cmd = cmd.with_chroot(format!("/proc/{}/root", root_pid));
if let Some(ref w) = exec_workdir {
cmd = cmd.with_cwd(w);
}
}
for (k, v) in &container_env {
cmd = cmd.env(k, v);
}
for e in &args.env {
if let Some((k, v)) = e.split_once('=') {
cmd = cmd.env(k, v);
} else if let Ok(v) = std::env::var(e) {
cmd = cmd.env(e, v);
}
}
if let Some(ref u) = args.user {
let (uid, gid) = parse_user(u)?;
let uid_map_path = format!("/proc/{}/uid_map", environ_pid);
if let Ok(uid_map) = std::fs::read_to_string(&uid_map_path) {
if !uid_in_ns_map(uid, &uid_map) {
return Err(format!(
"UID {} is not mapped in container '{}' user namespace\n\
uid_map: {}\n\
Hint: restart the container from a login shell rather than a \
'newgrp'/'sg' shell to enable subordinate UID mapping (newuidmap).",
uid,
args.name,
uid_map.split_whitespace().collect::<Vec<_>>().join(" ")
)
.into());
}
}
if let Some(g) = gid {
let gid_map_path = format!("/proc/{}/gid_map", environ_pid);
if let Ok(gid_map) = std::fs::read_to_string(&gid_map_path) {
if !uid_in_ns_map(g, &gid_map) {
return Err(format!(
"GID {} is not mapped in container '{}' user namespace\n\
gid_map: {}\n\
Hint: restart the container from a login shell rather than a \
'newgrp'/'sg' shell to enable subordinate GID mapping (newgidmap).",
g,
args.name,
gid_map.split_whitespace().collect::<Vec<_>>().join(" ")
)
.into());
}
}
cmd = cmd.with_gid(g);
}
cmd = cmd.with_uid(uid);
}
if args.interactive {
let session = cmd
.spawn_interactive()
.map_err(|e| format!("spawn_interactive failed: {}", e))?;
match session.run() {
Ok(status) => {
let code = status.code().unwrap_or(0);
std::process::exit(code);
}
Err(e) => Err(format!("interactive session failed: {}", e).into()),
}
} else {
cmd = cmd
.stdin(Stdio::Inherit)
.stdout(Stdio::Inherit)
.stderr(Stdio::Inherit);
let mut child = cmd
.spawn()
.map_err(|e| format!("exec spawn failed: {}", e))?;
let exit = child
.wait()
.map_err(|e| format!("exec wait failed: {}", e))?;
let code = exit.code().unwrap_or(1);
std::process::exit(code);
}
}
/// Runs `args` inside the container whose process is `pid` and waits for it.
///
/// The command's stdin, stdout and stderr are all set to `Stdio::Null`.
/// Every discovered namespace other than the mount namespace is joined via
/// `with_namespace_join`; the mount namespace, when present, is entered from
/// a pre-exec hook which then re-roots into the container and changes to `/`.
/// Without a mount namespace the command is simply chrooted into
/// `/proc/<root_pid>/root`.
///
/// Returns `Some(success)` with the command's success status, or `None` when
/// `args` is empty, `pid <= 0`, namespace discovery fails, a namespace/root
/// handle cannot be opened, or spawning/waiting fails.
pub fn exec_in_container(pid: i32, args: &[String]) -> Option<bool> {
    // NUL-terminated paths usable from the pre-exec hook without allocating.
    const DOT: &[u8] = b".\0";
    const SLASH: &[u8] = b"/\0";

    if pid <= 0 {
        return None;
    }
    let (program, rest) = args.split_first()?;
    let ns_entries = discover_namespaces(pid).ok()?;
    let mut cmd = Command::new(program).args(rest);
    cmd = cmd
        .stdin(Stdio::Null)
        .stdout(Stdio::Null)
        .stderr(Stdio::Null);
    let mut has_mount_ns = false;
    for (path, ns) in &ns_entries {
        if *ns == Namespace::MOUNT {
            has_mount_ns = true;
        } else {
            cmd = cmd.with_namespace_join(path, *ns);
        }
    }
    if has_mount_ns {
        let mnt_ns_path = format!("/proc/{}/ns/mnt", pid);
        let mnt_ns_file = std::fs::File::open(&mnt_ns_path).ok()?;
        let mnt_ns_fd = mnt_ns_file.as_raw_fd();
        let root_pid = find_root_pid(pid);
        let root_path = format!("/proc/{}/root", root_pid);
        let root_file = std::fs::File::open(&root_path).ok()?;
        let root_fd = root_file.as_raw_fd();
        cmd = cmd.with_pre_exec(move || {
            // Referencing the files moves them into the closure so the raw
            // descriptors used below stay open for as long as the hook lives.
            let _keep_mnt = &mnt_ns_file;
            let _keep_root = &root_file;
            // SAFETY: both descriptors are owned by `File`s captured by this
            // closure, and `DOT`/`SLASH` are static NUL-terminated literals.
            unsafe {
                if libc::setns(mnt_ns_fd, libc::CLONE_NEWNS) != 0 {
                    return Err(std::io::Error::last_os_error());
                }
                // Enter the container root through the pre-opened handle,
                // chroot to it, then reset the working directory to "/".
                if libc::fchdir(root_fd) != 0 {
                    return Err(std::io::Error::last_os_error());
                }
                if libc::chroot(DOT.as_ptr().cast()) != 0 {
                    return Err(std::io::Error::last_os_error());
                }
                if libc::chdir(SLASH.as_ptr().cast()) != 0 {
                    return Err(std::io::Error::last_os_error());
                }
            }
            Ok(())
        });
    } else {
        let root_pid = find_root_pid(pid);
        cmd = cmd.with_chroot(format!("/proc/{}/root", root_pid));
    }
    match cmd.spawn() {
        Ok(mut child) => child.wait().map(|s| s.success()).ok(),
        Err(_) => None,
    }
}
/// Runs `args` inside the container whose process is `pid`, publishing the
/// spawned child's PID through `child_pid_sink`, and waits for it.
///
/// The command's stdin, stdout and stderr are all set to `Stdio::Null`.
/// Every discovered namespace other than the mount namespace is joined via
/// `with_namespace_join`; the mount namespace, when present, is entered from
/// a pre-exec hook which then re-roots into the container and changes to `/`.
/// Without a mount namespace the command is simply chrooted into
/// `/proc/<root_pid>/root`.
///
/// The child's PID is stored into `child_pid_sink` (relaxed ordering)
/// immediately after a successful spawn and is not cleared afterwards.
///
/// Returns `Some(success)` with the command's success status, or `None` when
/// `args` is empty, `pid <= 0`, namespace discovery fails, a namespace/root
/// handle cannot be opened, or spawning/waiting fails.
pub fn exec_in_container_with_pid_sink(
    pid: i32,
    args: &[String],
    child_pid_sink: Arc<AtomicI32>,
) -> Option<bool> {
    // NUL-terminated paths usable from the pre-exec hook without allocating.
    const DOT: &[u8] = b".\0";
    const SLASH: &[u8] = b"/\0";

    if pid <= 0 {
        return None;
    }
    let (program, rest) = args.split_first()?;
    let ns_entries = discover_namespaces(pid).ok()?;
    let mut cmd = Command::new(program).args(rest);
    cmd = cmd
        .stdin(Stdio::Null)
        .stdout(Stdio::Null)
        .stderr(Stdio::Null);
    let mut has_mount_ns = false;
    for (path, ns) in &ns_entries {
        if *ns == Namespace::MOUNT {
            has_mount_ns = true;
        } else {
            cmd = cmd.with_namespace_join(path, *ns);
        }
    }
    if has_mount_ns {
        let mnt_ns_path = format!("/proc/{}/ns/mnt", pid);
        let mnt_ns_file = std::fs::File::open(&mnt_ns_path).ok()?;
        let mnt_ns_fd = mnt_ns_file.as_raw_fd();
        let root_pid = find_root_pid(pid);
        let root_path = format!("/proc/{}/root", root_pid);
        let root_file = std::fs::File::open(&root_path).ok()?;
        let root_fd = root_file.as_raw_fd();
        cmd = cmd.with_pre_exec(move || {
            // Referencing the files moves them into the closure so the raw
            // descriptors used below stay open for as long as the hook lives.
            let _keep_mnt = &mnt_ns_file;
            let _keep_root = &root_file;
            // SAFETY: both descriptors are owned by `File`s captured by this
            // closure, and `DOT`/`SLASH` are static NUL-terminated literals.
            unsafe {
                if libc::setns(mnt_ns_fd, libc::CLONE_NEWNS) != 0 {
                    return Err(std::io::Error::last_os_error());
                }
                // Enter the container root through the pre-opened handle,
                // chroot to it, then reset the working directory to "/".
                if libc::fchdir(root_fd) != 0 {
                    return Err(std::io::Error::last_os_error());
                }
                if libc::chroot(DOT.as_ptr().cast()) != 0 {
                    return Err(std::io::Error::last_os_error());
                }
                if libc::chdir(SLASH.as_ptr().cast()) != 0 {
                    return Err(std::io::Error::last_os_error());
                }
            }
            Ok(())
        });
    } else {
        let root_pid = find_root_pid(pid);
        cmd = cmd.with_chroot(format!("/proc/{}/root", root_pid));
    }
    match cmd.spawn() {
        Ok(mut child) => {
            // Publish the PID before blocking in `wait` so the holder of the
            // other `Arc` handle can observe it while the command runs.
            child_pid_sink.store(child.pid(), Ordering::Relaxed);
            child.wait().map(|s| s.success()).ok()
        }
        Err(_) => None,
    }
}
/// Picks the process whose `/proc/<pid>/root` should be used as the
/// container root.
///
/// Reads `/proc/<pid>/task/<pid>/children`; if it lists exactly one parsable
/// child PID, that child is returned. In every other case (file unreadable,
/// no children, several children) `pid` itself is returned.
fn find_root_pid(pid: i32) -> i32 {
    let children_file = format!("/proc/{}/task/{}/children", pid, pid);
    let listing = match std::fs::read_to_string(&children_file) {
        Ok(text) => text,
        Err(_) => return pid,
    };
    // Tokens that do not parse as an integer are ignored.
    let mut kids = listing
        .split_whitespace()
        .filter_map(|token| token.parse::<i32>().ok());
    match (kids.next(), kids.next()) {
        // Exactly one child: first slot filled, second exhausted.
        (Some(only_child), None) => only_child,
        _ => pid,
    }
}
/// Lists the namespaces of process `pid` that differ from the reference
/// process's namespaces.
///
/// For each of `mnt`, `uts`, `ipc`, `net`, `pid`, `user` and `cgroup`, the
/// inode of `/proc/<pid>/ns/<kind>` is compared with the inode of
/// `/proc/1/ns/<kind>` (falling back to `/proc/self/ns/<kind>` when the
/// former cannot be inspected). A kind is reported, together with its
/// `/proc/<pid>/ns/<kind>` path, when the inodes differ; kinds whose entries
/// cannot be inspected are skipped.
///
/// If no PID namespace was reported that way, `/proc/<pid>/ns/pid_for_children`
/// is compared against `/proc/1/ns/pid` and reported as the PID namespace
/// when both are readable and differ.
///
/// The current implementation always returns `Ok`.
pub fn discover_namespaces(
    pid: i32,
) -> Result<Vec<(PathBuf, Namespace)>, Box<dyn std::error::Error>> {
    /// Inode number of `path` (following symlinks), or `None` on any error.
    fn inode_of(path: &str) -> Option<u64> {
        use std::os::unix::fs::MetadataExt;
        std::fs::metadata(path).ok().map(|meta| meta.ino())
    }

    let kinds = [
        ("mnt", Namespace::MOUNT),
        ("uts", Namespace::UTS),
        ("ipc", Namespace::IPC),
        ("net", Namespace::NET),
        ("pid", Namespace::PID),
        ("user", Namespace::USER),
        ("cgroup", Namespace::CGROUP),
    ];

    let mut found = Vec::new();
    for &(label, flag) in kinds.iter() {
        let target_path = format!("/proc/{}/ns/{}", pid, label);
        let target_ino = match inode_of(&target_path) {
            Some(ino) => ino,
            None => continue,
        };
        // Reference inode: PID 1 when inspectable, otherwise this process.
        let reference_ino = inode_of(&format!("/proc/1/ns/{}", label))
            .or_else(|| inode_of(&format!("/proc/self/ns/{}", label)));
        match reference_ino {
            Some(reference) if reference != target_ino => {
                found.push((PathBuf::from(target_path), flag));
            }
            // Same namespace as the reference, or no reference available.
            _ => {}
        }
    }

    let has_pid_ns = found.iter().any(|(_, ns)| *ns == Namespace::PID);
    if !has_pid_ns {
        let children_ns_path = format!("/proc/{}/ns/pid_for_children", pid);
        let children_ino = inode_of(&children_ns_path);
        let init_ino = inode_of("/proc/1/ns/pid");
        let differs = match (children_ino, init_ino) {
            (Some(children), Some(init)) => children != init,
            _ => false,
        };
        if differs {
            found.push((PathBuf::from(children_ns_path), Namespace::PID));
        }
    }
    Ok(found)
}
/// Reads the environment of process `pid` from `/proc/<pid>/environ`.
///
/// The file is split on NUL bytes; each non-empty entry is decoded as UTF-8
/// (invalid sequences are replaced) and split at its first `=` into a
/// key/value pair. Entries without `=` are dropped. An unreadable file
/// yields an empty list.
fn read_proc_environ(pid: i32) -> Vec<(String, String)> {
    let raw = std::fs::read(format!("/proc/{}/environ", pid)).unwrap_or_default();
    let mut vars = Vec::new();
    for chunk in raw.split(|byte| *byte == 0) {
        if chunk.is_empty() {
            continue;
        }
        let text = String::from_utf8_lossy(chunk);
        if let Some((key, value)) = text.split_once('=') {
            vars.push((key.to_owned(), value.to_owned()));
        }
    }
    vars
}
/// Reports whether `uid` falls inside any range listed in `uid_map`.
///
/// Each line is read as three whitespace-separated columns; the first is the
/// range start, the third is the range length, and the second is skipped.
/// A line matches when `start <= uid < start + length` (the sum saturates at
/// `u32::MAX`). Lines whose first or third column is missing or not a `u32`
/// are ignored. The same check is used for GID maps.
fn uid_in_ns_map(uid: u32, uid_map: &str) -> bool {
    uid_map.lines().any(|entry| {
        let mut columns = entry.split_whitespace();
        let first = columns.next().and_then(|tok| tok.parse::<u32>().ok());
        // The middle column is not needed for the membership test.
        let _ = columns.next();
        let length = columns.next().and_then(|tok| tok.parse::<u32>().ok());
        match (first, length) {
            (Some(lo), Some(len)) => (lo..lo.saturating_add(len)).contains(&uid),
            _ => false,
        }
    })
}