use anyhow::{Context, Result};
use std::collections::{BTreeSet, HashMap};
use std::io::Write;
use std::os::unix::fs::PermissionsExt;
use std::path::{Path, PathBuf};
use std::sync::LazyLock;
/// Result of resolving the shared-library dependencies of one binary.
#[derive(Debug, Clone)]
pub(crate) struct SharedLibs {
/// Resolved libraries as `(guest_path, host_path)` pairs. The guest path is
/// an absolute host path with its leading `/` stripped; the host path is the
/// canonicalized file to read the library from.
pub found: Vec<(String, PathBuf)>,
/// Sonames that could not be located in any search location.
pub missing: Vec<MissingLib>,
/// Interpreter path recorded in the binary, if it has one.
pub interpreter: Option<String>,
}
/// A needed shared library that could not be found on the host.
#[derive(Debug, Clone)]
pub(crate) struct MissingLib {
/// The soname exactly as it was requested by the dependent ELF.
pub soname: String,
}
/// Soname -> path map parsed from `/etc/ld.so.cache`, built on first use.
/// The map is empty when the cache file cannot be read or parsed.
static LD_SO_CACHE: LazyLock<HashMap<String, PathBuf>> =
LazyLock::new(|| parse_ld_so_cache(Path::new("/etc/ld.so.cache")));
/// Magic that opens the new-format glibc cache header: the string
/// `glibc-ld.so.cache` immediately followed by the version `1.1`.
const LD_CACHE_MAGIC: &[u8; 20] = b"glibc-ld.so.cache1.1";
/// Size in bytes of the new-format header that precedes the entry table.
const LD_CACHE_HEADER_SIZE: usize = 48;
/// Size in bytes of one record in the entry table.
const LD_CACHE_ENTRY_SIZE: usize = 24;

/// Parses a glibc `ld.so.cache` file into a soname -> library path map.
///
/// Only the new-format section (introduced by [`LD_CACHE_MAGIC`]) is read.
/// The magic is searched for anywhere in the file because a compat-format
/// cache stores a legacy section in front of the new-format one.
///
/// String offsets stored in the entry table are relative to the start of the
/// new-format header, not to the start of the file, so they are resolved
/// against the slice that begins at the magic.
///
/// Entries are skipped when their offsets are out of range, their strings are
/// not NUL-terminated UTF-8, the path is not absolute, or the path is not an
/// existing regular file. The first entry seen for a soname wins.
///
/// Returns an empty map when the file cannot be read, the magic is absent, or
/// the header/entry table is truncated. This function never fails.
fn parse_ld_so_cache(path: &Path) -> HashMap<String, PathBuf> {
    let mut map = HashMap::new();
    let Ok(data) = std::fs::read(path) else {
        return map;
    };
    let Some(hdr) = data
        .windows(LD_CACHE_MAGIC.len())
        .position(|w| w == LD_CACHE_MAGIC)
    else {
        return map;
    };
    // Everything below indexes into `cache`, so all offsets are naturally
    // relative to the new-format header.
    let cache = &data[hdr..];
    if cache.len() < LD_CACHE_HEADER_SIZE {
        return map;
    }
    // Little-endian u32 at `at`, or None when the four bytes are out of range.
    let read_u32 = |at: usize| -> Option<usize> {
        let bytes = cache.get(at..at.checked_add(4)?)?;
        Some(u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as usize)
    };
    // The entry count follows the 20-byte magic.
    let Some(nlibs) = read_u32(LD_CACHE_MAGIC.len()) else {
        return map;
    };
    // Checked arithmetic: `nlibs` is untrusted and usize may be 32 bits wide.
    let table_end = nlibs
        .checked_mul(LD_CACHE_ENTRY_SIZE)
        .and_then(|table| table.checked_add(LD_CACHE_HEADER_SIZE));
    match table_end {
        Some(end) if end <= cache.len() => {}
        _ => return map,
    }
    for i in 0..nlibs {
        let off = LD_CACHE_HEADER_SIZE + i * LD_CACHE_ENTRY_SIZE;
        // Record layout: flags at +0, key (soname) offset at +4, value
        // (path) offset at +8.
        let (Some(key_off), Some(val_off)) = (read_u32(off + 4), read_u32(off + 8)) else {
            continue;
        };
        let Some(soname) = read_cstr(cache, key_off) else {
            continue;
        };
        let Some(path_str) = read_cstr(cache, val_off) else {
            continue;
        };
        if !path_str.starts_with('/') {
            continue;
        }
        let lib_path = PathBuf::from(path_str);
        if lib_path.is_file() {
            map.entry(soname.to_string()).or_insert(lib_path);
        }
    }
    map
}

/// Reads the NUL-terminated UTF-8 string that starts at `offset` in `data`.
///
/// Returns `None` when `offset` lies beyond the end of `data`, when no NUL
/// terminator follows, or when the bytes are not valid UTF-8.
fn read_cstr(data: &[u8], offset: usize) -> Option<&str> {
    let tail = data.get(offset..)?;
    let end = tail.iter().position(|&b| b == 0)?;
    std::str::from_utf8(&tail[..end]).ok()
}
/// Resolves the transitive shared-library dependencies of `binary` without
/// any extra interpreter-derived search directories.
pub(crate) fn resolve_shared_libs(binary: &Path) -> Result<SharedLibs> {
resolve_shared_libs_inner(binary, &[])
}
/// Same as [`resolve_shared_libs`], but additionally searches the directories
/// in `extra_interp_hints` when locating each soname.
fn resolve_shared_libs_with_extra_interp_hints(
binary: &Path,
extra_interp_hints: &[PathBuf],
) -> Result<SharedLibs> {
resolve_shared_libs_inner(binary, extra_interp_hints)
}
/// Walks the needed-library graph of `binary` breadth-first and resolves
/// every soname to a host file.
///
/// Results are memoized process-wide, keyed by the canonicalized binary path
/// together with `extra_interp_hints`.
///
/// A file that does not parse as ELF, or an ELF with neither needed
/// libraries nor a dynamic section, yields an empty `found`/`missing` result
/// rather than an error.
///
/// # Errors
///
/// Fails only when `binary` itself cannot be read.
#[tracing::instrument(skip_all, fields(binary = %binary.display(), extra_hints = extra_interp_hints.len()))]
fn resolve_shared_libs_inner(binary: &Path, extra_interp_hints: &[PathBuf]) -> Result<SharedLibs> {
type LibCache = LazyLock<std::sync::Mutex<HashMap<(PathBuf, Vec<PathBuf>), SharedLibs>>>;
static CACHE: LibCache = LazyLock::new(|| std::sync::Mutex::new(HashMap::new()));
// Fall back to the path as given when canonicalization fails.
let canon = std::fs::canonicalize(binary).unwrap_or_else(|_| binary.to_path_buf());
let cache_key = (canon.clone(), extra_interp_hints.to_vec());
// A poisoned mutex is treated as a cache miss.
if let Ok(cache) = CACHE.lock()
&& let Some(cached) = cache.get(&cache_key)
{
return Ok(cached.clone());
}
let data =
std::fs::read(binary).with_context(|| format!("read binary: {}", binary.display()))?;
let elf = match goblin::elf::Elf::parse(&data) {
Ok(e) => e,
Err(_) => {
// Not an ELF (or unparsable): nothing to resolve. Not cached.
return Ok(SharedLibs {
found: vec![],
missing: vec![],
interpreter: None,
});
}
};
let interpreter = elf.interpreter.map(|s| s.to_string());
if elf.libraries.is_empty() && elf.dynamic.is_none() {
return Ok(SharedLibs {
found: vec![],
missing: vec![],
interpreter,
});
}
let root_needed: Vec<String> = elf.libraries.iter().map(|s| s.to_string()).collect();
let root_search = elf_search_paths(&elf, binary);
// For a non-standard interpreter, also search its own directory and the
// sibling `lib`/`lib64` directories one level up.
let mut interp_search_dirs: Vec<PathBuf> = match interpreter {
Some(ref interp) if !is_standard_interpreter(interp) => {
let interp_path = Path::new(interp);
let mut dirs = Vec::new();
if let Some(parent) = interp_path.parent() {
dirs.push(parent.to_path_buf());
if let Some(grandparent) = parent.parent() {
dirs.push(grandparent.join("lib"));
dirs.push(grandparent.join("lib64"));
}
}
dirs
}
_ => Vec::new(),
};
for hint in extra_interp_hints {
if !interp_search_dirs.contains(hint) {
interp_search_dirs.push(hint.clone());
}
}
use rayon::prelude::*;
let mut found: Vec<(String, PathBuf)> = Vec::new();
let mut missing: Vec<MissingLib> = Vec::new();
// Sonames already handled; each soname is resolved once, using the search
// paths of whichever ELF requested it first.
let mut visited = std::collections::HashSet::new();
// Current BFS frontier: each soname paired with its requester's search paths.
let mut level: Vec<(String, ElfSearchPaths)> = root_needed
.iter()
.map(|s| (s.clone(), root_search.clone()))
.collect();
while !level.is_empty() {
// (soname, path as resolved, canonicalized path) for this level.
let mut resolved: Vec<(String, PathBuf, PathBuf)> = Vec::new();
for (soname, search_paths) in &level {
if !visited.insert(soname.clone()) {
continue;
}
if let Some(host_path) = resolve_soname(soname, search_paths, &interp_search_dirs) {
let canonical =
std::fs::canonicalize(&host_path).unwrap_or_else(|_| host_path.clone());
let canon_str = canonical.to_string_lossy();
let canon_guest = canon_str
.strip_prefix('/')
.unwrap_or(&canon_str)
.to_string();
found.push((canon_guest.clone(), canonical.clone()));
// When the library was reached through a different (non-canonical)
// path, record that path too, backed by the same canonical file.
let host_str = host_path.to_string_lossy();
let host_guest = host_str.strip_prefix('/').unwrap_or(&host_str).to_string();
if host_guest != canon_guest {
found.push((host_guest, canonical.clone()));
}
resolved.push((soname.clone(), host_path, canonical));
} else {
missing.push(MissingLib {
soname: soname.clone(),
});
}
}
// Parse the libraries resolved at this level in parallel to collect
// the next frontier. Unreadable or unparsable libraries contribute
// no further dependencies.
let next_deps: Vec<(String, ElfSearchPaths)> = resolved
.par_iter()
.flat_map(|(_, _, canonical)| {
let Ok(lib_data) = std::fs::read(canonical) else {
return Vec::new();
};
let Ok(lib_elf) = goblin::elf::Elf::parse(&lib_data) else {
return Vec::new();
};
let lib_search = elf_search_paths(&lib_elf, canonical);
lib_elf
.libraries
.iter()
.map(|name| (name.to_string(), lib_search.clone()))
.collect::<Vec<_>>()
})
.collect();
level = next_deps
.into_iter()
.filter(|(soname, _)| !visited.contains(soname))
.collect();
}
let result = SharedLibs {
found,
missing,
interpreter,
};
// Best-effort store; a poisoned mutex simply skips caching.
if let Ok(mut cache) = CACHE.lock() {
cache.insert(cache_key, result.clone());
}
Ok(result)
}
/// Library search directories taken from one ELF's dynamic section, with
/// dynamic string tokens already expanded.
#[derive(Debug, Clone, Default)]
struct ElfSearchPaths {
/// Directories from the ELF's rpath entries; left empty when the ELF has
/// runpath entries.
rpath: Vec<PathBuf>,
/// Directories from the ELF's runpath entries.
runpath: Vec<PathBuf>,
}
/// Builds the expanded rpath/runpath directory lists for `elf`, which was
/// loaded from `binary`.
///
/// `$ORIGIN`, `$LIB` and `$PLATFORM` (bare and `${...}` spellings) are
/// substituted with the canonicalized parent directory of `binary`,
/// `lib64`/`lib` depending on the ELF class, and the build target's
/// architecture name respectively. When the parent directory cannot be
/// canonicalized, `$ORIGIN` expands to the empty string.
///
/// Runpath entries take precedence: if any are present the rpath list is
/// left empty. With neither, both lists are empty.
fn elf_search_paths(elf: &goblin::elf::Elf, binary: &Path) -> ElfSearchPaths {
    let origin_dir = match binary.parent().map(std::fs::canonicalize) {
        Some(Ok(dir)) => dir,
        _ => PathBuf::new(),
    };
    let origin_lossy = origin_dir.to_string_lossy();
    let origin: &str = &origin_lossy;
    let lib_dir: &str = if elf.is_64 { "lib64" } else { "lib" };
    let platform: &str = std::env::consts::ARCH;

    // Applied in this order to every path segment.
    let substitutions: [(&str, &str); 6] = [
        ("$ORIGIN", origin),
        ("${ORIGIN}", origin),
        ("$LIB", lib_dir),
        ("${LIB}", lib_dir),
        ("$PLATFORM", platform),
        ("${PLATFORM}", platform),
    ];

    // Splits every entry on ':', drops empty segments, expands tokens.
    let expand_all = |entries: &[&str]| -> Vec<PathBuf> {
        entries
            .iter()
            .flat_map(|entry| entry.split(':'))
            .filter(|segment| !segment.is_empty())
            .map(|segment| {
                let expanded = substitutions
                    .iter()
                    .fold(segment.to_string(), |acc, &(token, value)| {
                        acc.replace(token, value)
                    });
                PathBuf::from(expanded)
            })
            .collect()
    };

    if !elf.runpaths.is_empty() {
        ElfSearchPaths {
            rpath: Vec::new(),
            runpath: expand_all(&elf.runpaths),
        }
    } else if !elf.rpaths.is_empty() {
        ElfSearchPaths {
            rpath: expand_all(&elf.rpaths),
            runpath: Vec::new(),
        }
    } else {
        ElfSearchPaths::default()
    }
}
/// Interpreter paths that are treated as the system's standard dynamic
/// loaders.
const STANDARD_INTERPRETERS: &[&str] = &[
    "/lib/ld-linux.so.2",
    "/lib/ld-linux-aarch64.so.1",
    "/lib/ld-linux-armhf.so.3",
    "/lib64/ld-linux-x86-64.so.2",
    "/lib/ld-musl-x86_64.so.1",
    "/lib/ld-musl-aarch64.so.1",
    "/libexec/ld-elf.so.1",
];

/// Reports whether `interp` names one of [`STANDARD_INTERPRETERS`].
///
/// A literal match is accepted without touching the filesystem. Otherwise
/// `interp` is canonicalized and compared against the canonical form of
/// every listed interpreter that exists on this host, so symlinked aliases
/// are recognized. A path that cannot be canonicalized is not standard.
fn is_standard_interpreter(interp: &str) -> bool {
    if STANDARD_INTERPRETERS.iter().any(|known| *known == interp) {
        return true;
    }
    match std::fs::canonicalize(Path::new(interp)) {
        Err(_) => false,
        Ok(resolved) => STANDARD_INTERPRETERS
            .iter()
            .filter_map(|known| std::fs::canonicalize(known).ok())
            .any(|known_resolved| known_resolved == resolved),
    }
}
/// Fallback directories probed, in order, when a soname is not found through
/// any other search location.
const DEFAULT_LIB_PATHS: &[&str] = &[
"/lib",
"/usr/lib",
"/lib64",
"/usr/lib64",
"/usr/local/lib",
"/usr/local/lib64",
"/lib/x86_64-linux-gnu",
"/usr/lib/x86_64-linux-gnu",
"/lib/aarch64-linux-gnu",
"/usr/lib/aarch64-linux-gnu",
];
/// Directories listed in the `LD_LIBRARY_PATH` environment variable, read
/// once on first use. Empty segments are dropped; an unset or non-UTF-8
/// variable yields an empty list.
static LD_LIBRARY_PATH_DIRS: LazyLock<Vec<PathBuf>> = LazyLock::new(|| {
    let Ok(raw) = std::env::var("LD_LIBRARY_PATH") else {
        return Vec::new();
    };
    let mut dirs = Vec::new();
    for segment in raw.split(':') {
        if !segment.is_empty() {
            dirs.push(PathBuf::from(segment));
        }
    }
    dirs
});
/// Locates the file that provides `soname`.
///
/// Search order, first hit wins:
/// 1. the requesting ELF's rpath directories,
/// 2. `LD_LIBRARY_PATH` directories,
/// 3. the requesting ELF's runpath directories,
/// 4. `interp_hints`,
/// 5. the parsed `ld.so.cache` map,
/// 6. [`DEFAULT_LIB_PATHS`].
///
/// Returns `None` when no location has a regular file for `soname`.
fn resolve_soname(
    soname: &str,
    elf_paths: &ElfSearchPaths,
    interp_hints: &[PathBuf],
) -> Option<PathBuf> {
    // `dir/soname` if it is a regular file.
    let probe = |dir: &Path| -> Option<PathBuf> {
        let candidate = dir.join(soname);
        candidate.is_file().then_some(candidate)
    };

    elf_paths
        .rpath
        .iter()
        .find_map(|dir| probe(dir.as_path()))
        .or_else(|| {
            LD_LIBRARY_PATH_DIRS
                .iter()
                .find_map(|dir| probe(dir.as_path()))
        })
        .or_else(|| {
            elf_paths
                .runpath
                .iter()
                .find_map(|dir| probe(dir.as_path()))
        })
        .or_else(|| interp_hints.iter().find_map(|dir| probe(dir.as_path())))
        .or_else(|| LD_SO_CACHE.get(soname).cloned())
        .or_else(|| {
            DEFAULT_LIB_PATHS
                .iter()
                .find_map(|dir| probe(Path::new(dir)))
        })
}
/// The four identification bytes that open every ELF file.
const ELF_MAGIC: &[u8; 4] = b"\x7fELF";

/// Reports whether the file at `path` starts with [`ELF_MAGIC`].
///
/// Any I/O failure (missing file, unreadable, shorter than four bytes) is
/// reported as `false`.
fn is_elf(path: &Path) -> bool {
    use std::io::Read;

    let Ok(mut file) = std::fs::File::open(path) else {
        return false;
    };
    let mut header = [0u8; 4];
    match file.read_exact(&mut header) {
        Ok(()) => header == *ELF_MAGIC,
        Err(_) => false,
    }
}
fn write_entry(archive: &mut Vec<u8>, name: &str, data: &[u8], mode: u32) -> Result<()> {
let builder = cpio::newc::Builder::new(name).mode(mode).nlink(1);
let mut writer = builder.write(archive as &mut dyn Write, data.len() as u32);
writer
.write_all(data)
.with_context(|| format!("write cpio entry '{name}'"))?;
writer.finish().context("finish cpio entry")?;
Ok(())
}
fn write_symlink_entry(archive: &mut Vec<u8>, name: &str, target: &str) -> Result<()> {
let target_bytes = target.as_bytes();
let builder = cpio::newc::Builder::new(name).mode(0o120777).nlink(1);
let mut writer = builder.write(archive as &mut dyn Write, target_bytes.len() as u32);
writer
.write_all(target_bytes)
.with_context(|| format!("write cpio symlink '{name}' -> '{target}'"))?;
writer.finish().context("finish cpio symlink entry")?;
Ok(())
}
/// Section names matched by the predicate that `strip_debug_sections` passes
/// to the ELF rewriter. Besides the `.debug_*` sections this also lists
/// `.comment`.
const DEBUG_SECTIONS: &[&[u8]] = &[
b".debug_info",
b".debug_abbrev",
b".debug_line",
b".debug_line_str",
b".debug_str",
b".debug_ranges",
b".debug_aranges",
b".debug_frame",
b".debug_loc",
b".debug_loclists",
b".debug_rnglists",
b".debug_str_offsets",
b".debug_addr",
b".debug_pubtypes",
b".debug_pubnames",
b".debug_types",
b".debug_macro",
b".debug_macinfo",
b".comment",
];
/// Reads the binary at `path` and returns its bytes with debug sections
/// stripped.
///
/// When `path` is this process's own executable and it has been deleted from
/// disk, `/proc/self/exe` is tried as a second source. The first source that
/// can be read decides the result: its stripped bytes, or — if stripping
/// fails — its unmodified bytes plus a warning.
///
/// # Errors
///
/// Fails with a read error for `path` when no source could be read.
fn strip_debug(path: &Path) -> Result<Vec<u8>> {
    let proc_fallback = is_deleted_self(path).then_some(Path::new("/proc/self/exe"));
    for candidate in std::iter::once(path).chain(proc_fallback) {
        let Ok(raw) = std::fs::read(candidate) else {
            continue;
        };
        let bytes = match strip_debug_sections(&raw) {
            Ok(stripped) => stripped,
            Err(e) => {
                tracing::warn!(
                    binary = %candidate.display(),
                    error = %e,
                    "strip_debug_sections failed, using unstripped binary"
                );
                raw
            }
        };
        return Ok(bytes);
    }
    // Nothing was readable; repeat the read so the caller gets the real error.
    std::fs::read(path).with_context(|| format!("read binary: {}", path.display()))
}
/// Runs the crate's ELF rewriter over `data` with a predicate that matches
/// every section name listed in `DEBUG_SECTIONS`.
fn strip_debug_sections(data: &[u8]) -> std::result::Result<Vec<u8>, object::build::Error> {
crate::elf_strip::rewrite(data, |name| DEBUG_SECTIONS.contains(&name))
}
/// Reports whether `path` is this process's own executable and that
/// executable has since been deleted from disk.
///
/// The check reads the `/proc/self/exe` link: a target that ends in
/// ` (deleted)` and whose remainder equals `path` is a match. Exactly one
/// ` (deleted)` suffix is removed, so a binary whose real file name itself
/// ends in ` (deleted)` is still compared correctly.
///
/// Returns `false` when the link cannot be read.
fn is_deleted_self(path: &Path) -> bool {
    let Ok(target) = std::fs::read_link("/proc/self/exe") else {
        return false;
    };
    let target = target.to_string_lossy();
    target
        .strip_suffix(" (deleted)")
        .is_some_and(|live| live == path.to_string_lossy().as_ref())
}
/// Inserts every ancestor directory of `guest_path` into `dirs`, from the
/// outermost down to the immediate parent. The final path component itself
/// is not inserted, and a path with no directory part adds nothing.
fn register_parent_dirs(dirs: &mut BTreeSet<String>, guest_path: &str) {
    let Some(parent) = Path::new(guest_path).parent() else {
        return;
    };
    let mut prefix = PathBuf::new();
    dirs.extend(parent.components().map(|part| {
        prefix.push(part);
        prefix.to_string_lossy().into_owned()
    }));
}
/// Builds the base cpio (newc) archive: directories, the payload as `init`,
/// optional busybox, extra binaries, included files, and every shared
/// library (plus interpreter) those ELF files need.
///
/// * `payload` - binary written to the archive as `init` after debug
///   stripping.
/// * `extra_binaries` - `(archive_name, host_path)` pairs, debug-stripped and
///   written with mode `0o100755`.
/// * `include_files` - `(archive_path, host_path)` pairs copied verbatim with
///   the host file's mode. Archive paths must not contain `..` or start with
///   `.ktstr_`, and the host path must be a regular file.
/// * `busybox` - when true, adds a `bin` directory and `bin/busybox`.
///
/// The returned archive has no cpio trailer; `build_suffix` writes it.
///
/// # Errors
///
/// Fails on invalid include paths, unreadable inputs, an included ELF with
/// missing shared libraries, or any cpio write failure. Missing libraries for
/// the payload or extra binaries are not an error here.
#[tracing::instrument(skip_all, fields(payload = %payload.display(), includes = include_files.len()))]
pub fn build_initramfs_base(
payload: &Path,
extra_binaries: &[(&str, &Path)],
include_files: &[(&str, &Path)],
busybox: bool,
) -> Result<Vec<u8>> {
// Validate include entries up front and remember each host file's mode.
let mut validated_includes: Vec<(&str, &Path, u32)> = Vec::with_capacity(include_files.len());
for (archive_path, host_path) in include_files {
if Path::new(archive_path)
.components()
.any(|c| matches!(c, std::path::Component::ParentDir))
{
anyhow::bail!("include_files archive path contains '..': {}", archive_path);
}
if archive_path.starts_with(".ktstr_") {
anyhow::bail!(
"include_files archive path must not start with '.ktstr_': {}",
archive_path
);
}
let meta = std::fs::metadata(host_path).with_context(|| {
format!(
"stat include file '{}': {}",
archive_path,
host_path.display()
)
})?;
if !meta.file_type().is_file() {
anyhow::bail!(
"include_files entry '{}' is not a regular file: {}",
archive_path,
host_path.display()
);
}
validated_includes.push((archive_path, host_path, meta.permissions().mode()));
}
let binary = {
let _s = tracing::debug_span!("strip_debug").entered();
strip_debug(payload).with_context(|| format!("strip/read binary: {}", payload.display()))?
};
let mut archive = Vec::new();
// Directory entries to emit; a BTreeSet so parents sort before children.
let mut dirs = BTreeSet::new();
// (guest path, host path) for every library/interpreter to pack.
let mut shared_libs: Vec<(String, PathBuf)> = Vec::new();
// Every file whose dependencies must be resolved: payload, extra binaries,
// and any included file that is an ELF.
let mut all_binaries: Vec<&Path> = std::iter::once(payload)
.chain(extra_binaries.iter().map(|(_, p)| *p))
.collect();
let mut include_elf_paths: Vec<&Path> = Vec::new();
for (_, host_path) in include_files {
if is_elf(host_path) {
include_elf_paths.push(host_path);
all_binaries.push(host_path);
}
}
let _s_resolve = tracing::debug_span!("resolve_all_libs", count = all_binaries.len()).entered();
for path in &all_binaries {
let _s_one =
tracing::debug_span!("resolve_shared_libs", binary = %path.display()).entered();
let result = resolve_shared_libs(path)
.with_context(|| format!("resolve libs for {}", path.display()))?;
drop(_s_one);
// Missing libraries are fatal only for included ELF files.
if !result.missing.is_empty() && include_elf_paths.contains(path) {
let names: Vec<&str> = result.missing.iter().map(|m| m.soname.as_str()).collect();
anyhow::bail!(
"{}: missing shared libraries: {}",
path.display(),
names.join(", ")
);
}
tracing::debug!(
binary = %path.display(),
interpreter = ?result.interpreter,
is_include = include_elf_paths.contains(path),
"resolved interpreter for binary"
);
if let Some(ref interp) = result.interpreter {
let interp_path = Path::new(interp);
let is_standard = is_standard_interpreter(interp);
tracing::debug!(
interp = %interp_path.display(),
exists = interp_path.is_file(),
is_standard,
"interpreter details"
);
if interp_path.is_file() {
// Pack the interpreter under its canonical path ...
let canonical = std::fs::canonicalize(interp_path)
.unwrap_or_else(|_| interp_path.to_path_buf());
let canon_str = canonical.to_string_lossy();
let guest = canon_str
.strip_prefix('/')
.unwrap_or(&canon_str)
.to_string();
register_parent_dirs(&mut dirs, &guest);
tracing::debug!(
canonical_guest = %guest,
canonical_host = %canonical.display(),
"packing interpreter canonical path"
);
shared_libs.push((guest.clone(), canonical.clone()));
// ... and also under the path recorded in the binary, if it differs.
let orig_guest = interp.strip_prefix('/').unwrap_or(interp).to_string();
if orig_guest != guest {
tracing::debug!(
orig_guest = %orig_guest,
canonical_guest = %guest,
"packing interpreter original (non-canonical) path"
);
register_parent_dirs(&mut dirs, &orig_guest);
shared_libs.push((orig_guest, canonical));
} else {
tracing::debug!("interpreter original path matches canonical, no alias needed");
}
// A non-standard interpreter may itself need libraries; resolve them
// with its own directory and sibling lib/lib64 as extra hints.
// Resolution failures here are ignored.
if !is_standard_interpreter(interp) {
let mut interp_hints: Vec<PathBuf> = Vec::new();
if let Some(parent) = interp_path.parent() {
interp_hints.push(parent.to_path_buf());
if let Some(grandparent) = parent.parent() {
interp_hints.push(grandparent.join("lib"));
interp_hints.push(grandparent.join("lib64"));
}
}
if let Ok(interp_result) =
resolve_shared_libs_with_extra_interp_hints(interp_path, &interp_hints)
{
for (g, h) in interp_result.found {
register_parent_dirs(&mut dirs, &g);
shared_libs.push((g, h));
}
}
}
}
}
for (guest_path, host_path) in result.found {
register_parent_dirs(&mut dirs, &guest_path);
shared_libs.push((guest_path, host_path));
}
}
// Collapse duplicate guest paths so each archive path is written once.
let pre_dedup_count = shared_libs.len();
shared_libs.sort_by(|a, b| a.0.cmp(&b.0));
shared_libs.dedup_by(|a, b| a.0 == b.0);
tracing::debug!(
pre_dedup = pre_dedup_count,
post_dedup = shared_libs.len(),
removed = pre_dedup_count - shared_libs.len(),
"shared_libs dedup"
);
if busybox {
dirs.insert("bin".to_string());
}
for (archive_path, _, _) in &validated_includes {
register_parent_dirs(&mut dirs, archive_path);
}
for (name, _) in extra_binaries {
register_parent_dirs(&mut dirs, name);
}
drop(_s_resolve);
tracing::debug!(
shared_libs_count = shared_libs.len(),
dirs_count = dirs.len(),
dirs = ?dirs,
shared_libs_guests = ?shared_libs.iter().map(|(g, _)| g.as_str()).collect::<Vec<_>>(),
"pre-write archive contents"
);
let _s_write = tracing::debug_span!("write_cpio").entered();
// Directories first, then files.
for dir in &dirs {
write_entry(&mut archive, dir, &[], 0o40755)?;
}
write_entry(&mut archive, "init", &binary, 0o100755)?;
if busybox {
write_entry(&mut archive, "bin/busybox", crate::BUSYBOX, 0o100755)?;
}
for (name, path) in extra_binaries {
let data = strip_debug(path)
.with_context(|| format!("strip/read extra binary '{}': {}", name, path.display()))?;
write_entry(&mut archive, name, &data, 0o100755)?;
}
for (archive_path, host_path, mode) in &validated_includes {
let data = std::fs::read(host_path).with_context(|| {
format!(
"read include file '{}': {}",
archive_path,
host_path.display()
)
})?;
write_entry(&mut archive, archive_path, &data, *mode)?;
}
{
// Canonical host file -> first guest path it was written under. Later
// guest paths backed by the same file become symlinks to that first copy.
let mut written_files: HashMap<PathBuf, String> = HashMap::new();
for (guest_path, host_path) in &shared_libs {
let canonical = std::fs::canonicalize(host_path).unwrap_or_else(|_| host_path.clone());
if let Some(first_guest) = written_files.get(&canonical) {
let target = format!("/{first_guest}");
write_symlink_entry(&mut archive, guest_path, &target)?;
} else {
let data = std::fs::read(host_path).with_context(|| {
format!("read shared lib '{}': {}", guest_path, host_path.display())
})?;
write_entry(&mut archive, guest_path, &data, 0o100755)?;
written_files.insert(canonical, guest_path.clone());
}
}
}
// Empty marker file written last.
write_entry(&mut archive, ".ktstr_init_ok", &[], 0o100644)?;
drop(_s_write);
Ok(archive)
}
/// Per-run inputs that `build_suffix` turns into archive entries.
#[derive(Default)]
pub struct SuffixParams<'a> {
/// Lines of the `args` entry (always written, even when empty).
pub args: &'a [String],
/// Lines of the `sched_args` entry; omitted when empty.
pub sched_args: &'a [String],
/// Lines of the `sched_enable` entry; omitted when empty.
pub sched_enable: &'a [String],
/// Lines of the `sched_disable` entry; omitted when empty.
pub sched_disable: &'a [String],
/// Contents of the `exec_cmd` entry, if any.
pub exec_cmd: Option<&'a str>,
/// `(scheduler name, args)` pairs; each non-empty args list is written as a
/// `sched_args` file under that scheduler's staged archive directory.
pub staged_sched_args: &'a [(String, Vec<String>)],
/// Contents of the `workload_root_cgroup` entry, if any.
pub workload_root_cgroup: Option<&'a str>,
/// Contents of the `scheduler_cgroup_parent` entry, if any.
pub scheduler_cgroup_parent: Option<&'a str>,
}
/// Builds the archive tail that follows a base archive of `base_len` bytes:
/// the per-run entries from `params`, the cpio trailer, and zero padding so
/// that base plus suffix is a multiple of 512 bytes.
///
/// List-valued parameters are written one item per line. `args` is always
/// written; every other entry is written only when it has content.
///
/// # Errors
///
/// Fails when any entry or the trailer cannot be written.
pub fn build_suffix(base_len: usize, params: &SuffixParams<'_>) -> Result<Vec<u8>> {
    const REGULAR: u32 = 0o100644;
    const EXECUTABLE: u32 = 0o100755;

    let mut out = Vec::new();
    write_entry(&mut out, "args", params.args.join("\n").as_bytes(), REGULAR)?;

    // Line-list entries, skipped when the list is empty.
    let line_entries: [(&str, &[String], u32); 3] = [
        ("sched_args", params.sched_args, REGULAR),
        ("sched_enable", params.sched_enable, EXECUTABLE),
        ("sched_disable", params.sched_disable, EXECUTABLE),
    ];
    for (entry_name, lines, mode) in line_entries {
        if lines.is_empty() {
            continue;
        }
        write_entry(&mut out, entry_name, lines.join("\n").as_bytes(), mode)?;
    }

    // Single-value entries, skipped when unset.
    let text_entries: [(&str, Option<&str>); 3] = [
        ("exec_cmd", params.exec_cmd),
        ("workload_root_cgroup", params.workload_root_cgroup),
        ("scheduler_cgroup_parent", params.scheduler_cgroup_parent),
    ];
    for (entry_name, value) in text_entries {
        if let Some(text) = value {
            write_entry(&mut out, entry_name, text.as_bytes(), REGULAR)?;
        }
    }

    for (name, args) in params.staged_sched_args {
        if args.is_empty() {
            continue;
        }
        let staged_dir = crate::test_support::staged::staged_scheduler_archive_dir(name);
        let archive_path = format!("{staged_dir}/sched_args");
        write_entry(&mut out, &archive_path, args.join("\n").as_bytes(), REGULAR)?;
    }

    cpio::newc::trailer(&mut out as &mut dyn Write).context("write cpio trailer")?;

    // Pad so the combined archive length is a multiple of 512.
    let total = base_len + out.len();
    let pad = match total % 512 {
        0 => 0,
        remainder => 512 - remainder,
    };
    out.resize(out.len() + pad, 0u8);
    Ok(out)
}
/// Architecture tag embedded in shared-memory segment names.
#[cfg(target_arch = "x86_64")]
pub(crate) const SHM_ARCH_TAG: &str = "x86_64";
/// Architecture tag embedded in shared-memory segment names.
#[cfg(target_arch = "aarch64")]
pub(crate) const SHM_ARCH_TAG: &str = "aarch64";

/// Name of the shared-memory segment holding the base archive for
/// `content_hash`: `/ktstr-base-<arch>-<hash as 16 lowercase hex digits>`.
pub(crate) fn shm_segment_name(content_hash: u64) -> String {
    let hash_hex = format!("{content_hash:016x}");
    ["/ktstr-base-", SHM_ARCH_TAG, "-", hash_hex.as_str()].concat()
}
/// A memory mapping of a shared-memory segment together with the file
/// descriptor it was mapped from. Dropping it unmaps the region and then
/// unlocks the descriptor.
pub(crate) struct MappedShm {
/// Start of the mapped region.
ptr: *const u8,
/// Length of the mapped region in bytes.
len: usize,
/// Descriptor the region was mapped from; unlocked on drop.
fd: std::os::fd::OwnedFd,
}
// NOTE(review): no safety argument is written down for these impls.
// Presumably they rely on the mapping being read-only for the lifetime of
// the value (`shm_load_base` maps with PROT_READ) — confirm.
unsafe impl Send for MappedShm {}
unsafe impl Sync for MappedShm {}
impl AsRef<[u8]> for MappedShm {
/// Views the mapped region as a byte slice.
fn as_ref(&self) -> &[u8] {
// SAFETY (as far as this file shows): `ptr`/`len` describe the mapping,
// which is only unmapped in `Drop`, so it outlives the `&self` borrow.
unsafe { std::slice::from_raw_parts(self.ptr, self.len) }
}
}
impl Drop for MappedShm {
fn drop(&mut self) {
// The munmap result is ignored; nothing useful can be done on failure here.
unsafe {
libc::munmap(self.ptr as *mut libc::c_void, self.len);
}
let _ = rustix::fs::flock(&self.fd, rustix::fs::FlockOperation::Unlock);
}
}
/// Opens the base-archive segment for `content_hash` read-only, takes a
/// shared lock on it and maps its full size.
///
/// Returns `None` when the segment cannot be opened, locked or stat'ed, when
/// its size is not positive, or when mapping fails. On success the shared
/// lock stays held until the returned [`MappedShm`] is dropped.
pub(crate) fn shm_load_base(content_hash: u64) -> Option<MappedShm> {
use std::os::fd::AsRawFd;
let name = shm_segment_name(content_hash);
let fd = rustix::shm::open(
name.as_str(),
rustix::shm::OFlags::RDONLY,
rustix::fs::Mode::empty(),
)
.ok()?;
rustix::fs::flock(&fd, rustix::fs::FlockOperation::LockShared).ok()?;
let stat = rustix::fs::fstat(&fd).ok()?;
// An empty (or nonsensically sized) segment is treated as absent.
if stat.st_size <= 0 {
let _ = rustix::fs::flock(&fd, rustix::fs::FlockOperation::Unlock);
return None;
}
let len = stat.st_size as usize;
// Read-only shared mapping of the whole segment at a kernel-chosen address.
let ptr = unsafe {
libc::mmap(
std::ptr::null_mut(),
len,
libc::PROT_READ,
libc::MAP_SHARED,
fd.as_raw_fd(),
0,
)
};
if ptr == libc::MAP_FAILED {
let _ = rustix::fs::flock(&fd, rustix::fs::FlockOperation::Unlock);
return None;
}
Some(MappedShm {
ptr: ptr as *const u8,
len,
fd,
})
}
/// Creates (or opens) the shared-memory segment `name`, takes an exclusive
/// lock on it, resizes it to `data.len()` and copies `data` into it.
///
/// The lock is released before returning on every path after it is taken.
///
/// # Errors
///
/// Fails when the segment cannot be opened, locked, resized or mapped. On a
/// resize or map failure the segment is unlinked (an unlink failure is only
/// logged).
fn shm_store(name: &str, data: &[u8]) -> Result<()> {
use std::os::fd::AsRawFd;
let fd = rustix::shm::open(
name,
rustix::shm::OFlags::CREATE | rustix::shm::OFlags::RDWR,
rustix::fs::Mode::from_raw_mode(0o644),
)
.map_err(|e| anyhow::anyhow!("shm_open: {e}"))?;
// Logged before the lock call, which may block.
tracing::info!(
segment = name,
data_len = data.len(),
"shm_store: waiting for LOCK_EX"
);
rustix::fs::flock(&fd, rustix::fs::FlockOperation::LockExclusive)
.map_err(|e| anyhow::anyhow!("flock: {e}"))?;
let raw_fd = fd.as_raw_fd();
unsafe {
if libc::ftruncate(raw_fd, data.len() as libc::off_t) != 0 {
// Capture errno before any further call can overwrite it.
let err = std::io::Error::last_os_error();
if let Err(e) = rustix::shm::unlink(name) {
tracing::warn!(
err = %e,
segment = name,
"shm_unlink failed on ftruncate error path"
);
}
let _ = rustix::fs::flock(&fd, rustix::fs::FlockOperation::Unlock);
anyhow::bail!("ftruncate: {err}");
}
// Writable shared mapping of the whole (just resized) segment.
let ptr = libc::mmap(
std::ptr::null_mut(),
data.len(),
libc::PROT_WRITE,
libc::MAP_SHARED,
raw_fd,
0,
);
if ptr == libc::MAP_FAILED {
let err = std::io::Error::last_os_error();
if let Err(e) = rustix::shm::unlink(name) {
tracing::warn!(
err = %e,
segment = name,
"shm_unlink failed on mmap error path"
);
}
let _ = rustix::fs::flock(&fd, rustix::fs::FlockOperation::Unlock);
anyhow::bail!("mmap: {err}");
}
// The mapping is exactly `data.len()` bytes and distinct from `data`.
std::ptr::copy_nonoverlapping(data.as_ptr(), ptr as *mut u8, data.len());
libc::munmap(ptr, data.len());
}
let _ = rustix::fs::flock(&fd, rustix::fs::FlockOperation::Unlock);
Ok(())
}
/// Stores `data` in the base-archive segment named after `content_hash`.
pub(crate) fn shm_store_base(content_hash: u64, data: &[u8]) -> Result<()> {
shm_store(&shm_segment_name(content_hash), data)
}
/// Test-only helper: unlinks the base-archive segment for `content_hash`,
/// ignoring any error.
#[cfg(test)]
pub(crate) fn shm_unlink_base(content_hash: u64) {
let _ = rustix::shm::unlink(shm_segment_name(content_hash).as_str());
}
/// Name of the shared-memory segment holding the LZ4 data for
/// `content_hash`: `/ktstr-lz4-<arch>-<hash as 16 lowercase hex digits>`.
fn shm_lz4_segment_name(content_hash: u64) -> String {
    let hash_hex = format!("{content_hash:016x}");
    ["/ktstr-lz4-", SHM_ARCH_TAG, "-", hash_hex.as_str()].concat()
}
/// Opens the LZ4 segment for `content_hash` read-only and takes a shared
/// lock on it.
///
/// Returns the locked descriptor and the segment size in bytes, or `None`
/// when the segment cannot be opened, locked or stat'ed, or when its size is
/// not positive. On success the shared lock is still held by the returned
/// descriptor.
pub(crate) fn shm_open_lz4(content_hash: u64) -> Option<(std::os::fd::OwnedFd, usize)> {
    use rustix::fs::FlockOperation;

    let segment = shm_lz4_segment_name(content_hash);
    let fd = rustix::shm::open(
        segment.as_str(),
        rustix::shm::OFlags::RDONLY,
        rustix::fs::Mode::empty(),
    )
    .ok()?;
    rustix::fs::flock(&fd, FlockOperation::LockShared).ok()?;
    let size = rustix::fs::fstat(&fd).ok()?.st_size;
    if size > 0 {
        return Some((fd, size as usize));
    }
    // Empty segment: release the lock explicitly and report it as absent.
    let _ = rustix::fs::flock(&fd, FlockOperation::Unlock);
    None
}
/// Stores `data` in the LZ4 segment named after `content_hash`.
pub(crate) fn shm_store_lz4(content_hash: u64, data: &[u8]) -> Result<()> {
shm_store(&shm_lz4_segment_name(content_hash), data)
}
/// Owns the descriptor behind a mapping created by `cow_overlay` and unlocks
/// it when dropped.
pub(crate) struct CowOverlayGuard {
/// Descriptor the overlay was mapped from.
fd: std::os::fd::OwnedFd,
}
impl CowOverlayGuard {
/// Wraps `fd` so that it is unlocked on drop.
fn new(fd: std::os::fd::OwnedFd) -> Self {
Self { fd }
}
}
impl Drop for CowOverlayGuard {
fn drop(&mut self) {
// Best effort: an unlock failure is ignored.
let _ = rustix::fs::flock(&self.fd, rustix::fs::FlockOperation::Unlock);
}
}
/// Maps the first `len` bytes of `shm_fd` privately, readable and writable,
/// at exactly `host_addr`, pre-populating the pages.
///
/// Returns a guard that unlocks `shm_fd` when dropped, or `None` (after
/// unlocking `shm_fd`) when the mapping fails.
///
/// # Safety
///
/// The mapping is requested with `MAP_FIXED`, which replaces whatever is
/// currently mapped in `host_addr..host_addr + len`. NOTE(review): the exact
/// caller contract is not stated in this file; presumably the caller must own
/// that address range and hold no live references into it — confirm.
pub(crate) unsafe fn cow_overlay(
host_addr: *mut u8,
len: usize,
shm_fd: std::os::fd::OwnedFd,
) -> Option<CowOverlayGuard> {
use std::os::fd::AsRawFd;
let ptr = unsafe {
libc::mmap(
host_addr as *mut libc::c_void,
len,
libc::PROT_READ | libc::PROT_WRITE,
libc::MAP_PRIVATE | libc::MAP_FIXED | libc::MAP_POPULATE,
shm_fd.as_raw_fd(),
0,
)
};
if ptr == libc::MAP_FAILED {
let _ = rustix::fs::flock(&shm_fd, rustix::fs::FlockOperation::Unlock);
return None;
}
Some(CowOverlayGuard::new(shm_fd))
}
/// Unlocks `fd` (ignoring any error) and then closes it by dropping it.
pub(crate) fn shm_close_fd(fd: std::os::fd::OwnedFd) {
let _ = rustix::fs::flock(&fd, rustix::fs::FlockOperation::Unlock);
}
pub fn load_initramfs_parts(
guest_mem: &vm_memory::GuestMemoryMmap,
parts: &[&[u8]],
load_addr: u64,
) -> Result<(u64, u32)> {
use vm_memory::{Bytes, GuestAddress};
let mut offset = 0u64;
for part in parts {
guest_mem
.write_slice(part, GuestAddress(load_addr + offset))
.context("write initramfs part to guest memory")?;
offset += part.len() as u64;
}
Ok((load_addr, offset as u32))
}
/// Magic number `0x184C2102` in little-endian byte order; written at the
/// start of the output of `lz4_legacy_compress`.
pub(crate) const LZ4_LEGACY_MAGIC: [u8; 4] = 0x184C2102u32.to_le_bytes();
/// Size of each independently compressed input chunk (8 MiB).
const LZ4_CHUNK_SIZE: usize = 8 << 20;
/// Compresses `data` into a stream that starts with [`LZ4_LEGACY_MAGIC`],
/// followed by one record per `LZ4_CHUNK_SIZE` slice of the input: the
/// compressed length as a little-endian `u32`, then the compressed block.
///
/// Chunks are compressed in parallel; records are emitted in input order.
/// Empty input produces just the magic.
pub(crate) fn lz4_legacy_compress(data: &[u8]) -> Vec<u8> {
    use rayon::prelude::*;

    let blocks: Vec<Vec<u8>> = data
        .par_chunks(LZ4_CHUNK_SIZE)
        .map(|chunk| lz4_flex::block::compress(chunk))
        .collect();

    // Each record is a 4-byte length prefix plus the block itself.
    let records_len: usize = blocks.iter().map(|block| block.len() + 4).sum();
    let mut framed = Vec::with_capacity(LZ4_LEGACY_MAGIC.len() + records_len);
    framed.extend_from_slice(&LZ4_LEGACY_MAGIC);
    for block in blocks {
        let len_prefix = (block.len() as u32).to_le_bytes();
        framed.extend_from_slice(&len_prefix);
        framed.extend(block);
    }
    framed
}
// Unit tests for this module live in the sibling file `initramfs_tests.rs`.
#[cfg(test)]
#[path = "initramfs_tests.rs"]
mod tests;