ktstr 0.4.14 - Docs.rs

//! Minimal initramfs (cpio newc format) creation via the `cpio` crate.
//! Packs the test binary as `/init` along with scheduler binaries,
//! shared libraries, optional busybox, and user-provided include files
//! into a cpio archive for use as Linux initrd.
//! Init setup is handled by Rust code in `vmm::rust_init`.
use anyhow::{Context, Result};
use std::collections::{BTreeSet, HashMap};
use std::io::Write;
use std::os::unix::fs::PermissionsExt;
use std::path::{Path, PathBuf};
use std::sync::LazyLock;

/// Result of shared library resolution for a binary.
///
/// Produced by [`resolve_shared_libs`]. For a file that does not parse
/// as ELF every field is empty / `None`; for an ELF without a dynamic
/// section only `interpreter` may be populated.
#[derive(Debug, Clone)]
pub(crate) struct SharedLibs {
    /// Resolved `(guest_path, host_path)` pairs.
    ///
    /// `guest_path` is the path inside the archive (leading `/`
    /// stripped) and `host_path` is the canonicalized file on the host.
    /// A library whose resolved path differs from its canonical path
    /// (e.g. it was reached through a symlink) is listed twice — once
    /// per guest path — with both entries pointing at the same
    /// canonical host file.
    pub found: Vec<(String, PathBuf)>,
    /// Library sonames that could not be resolved to a host path.
    pub missing: Vec<MissingLib>,
    /// The binary's PT_INTERP path, if present (e.g. `/lib64/ld-linux-x86-64.so.2`).
    pub interpreter: Option<String>,
}

/// A shared library dependency that could not be resolved.
///
/// Collected in [`SharedLibs::missing`] for every DT_NEEDED soname
/// that no search location yielded a file for.
#[derive(Debug, Clone)]
pub(crate) struct MissingLib {
    /// The soname (e.g. `libssl.so.1.1`), exactly as it appears in the
    /// DT_NEEDED entry that requested it.
    pub soname: String,
}

/// Parsed soname-to-path mappings from `/etc/ld.so.cache`.
///
/// The binary cache is the authoritative lookup used by `ld-linux.so`.
/// It contains entries for every library indexed by `ldconfig`, including
/// libraries in directories added via `ldconfig /path` that may not
/// appear in the text-based `/etc/ld.so.conf` files. Parsing the cache
/// catches libraries the conf-based directory scan misses.
///
/// Format (glibc new format, `glibc-ld.so.cache1.1`):
///   - 48-byte header: magic[20] + nlibs[4] + len_strings[4] + flags[4] + unused[16]
///   - nlibs entries of 24 bytes: flags[4] + key[4] + value[4] + osversion[4] + hwcap[8]
///   - String table: key/value are byte offsets measured from the start
///     of the new-format header (the magic). They coincide with file
///     offsets only when no legacy `ld.so-1.7.0` section precedes the
///     header.
static LD_SO_CACHE: LazyLock<HashMap<String, PathBuf>> =
    LazyLock::new(|| parse_ld_so_cache(Path::new("/etc/ld.so.cache")));

/// Magic bytes at the start of the glibc new-format `ld.so.cache`.
const LD_CACHE_MAGIC: &[u8; 20] = b"glibc-ld.so.cache1.1";
/// Header size: magic(20) + nlibs(4) + len_strings(4) + flags(4) + unused(16).
const LD_CACHE_HEADER_SIZE: usize = 48;
/// Per-entry size: flags(4) + key(4) + value(4) + osversion(4) + hwcap(8).
const LD_CACHE_ENTRY_SIZE: usize = 24;

/// Parse the binary `/etc/ld.so.cache` file into a soname->path map.
///
/// Scans for the new-format magic because some systems prepend the
/// old format (`ld.so-1.7.0`) before the new-format section. String
/// offsets stored in the entries are relative to that magic, so they
/// are rebased onto the header position before indexing into the file.
///
/// Parsing is best-effort: an unreadable file, a missing magic, a
/// truncated entry table, or an individual entry with out-of-range or
/// non-UTF-8 strings never panics — the file (or entry) is skipped and
/// whatever was collected so far is returned. When a soname appears
/// more than once, the first entry wins.
fn parse_ld_so_cache(path: &Path) -> HashMap<String, PathBuf> {
    let mut map = HashMap::new();
    let Ok(data) = std::fs::read(path) else {
        return map;
    };
    // Scan for new-format magic. Usually at offset 0, but old-format
    // systems prepend the legacy section.
    let Some(hdr) = data
        .windows(LD_CACHE_MAGIC.len())
        .position(|w| w == LD_CACHE_MAGIC)
    else {
        return map;
    };
    let Some(nlibs) = read_u32_le(&data, hdr + LD_CACHE_MAGIC.len()) else {
        return map;
    };
    // Slice out the whole entry table up front; a corrupt `nlibs` that
    // overflows or runs past the end of the file yields `None` here.
    let entries_start = hdr + LD_CACHE_HEADER_SIZE;
    let Some(entries) = nlibs
        .checked_mul(LD_CACHE_ENTRY_SIZE)
        .and_then(|len| entries_start.checked_add(len))
        .and_then(|entries_end| data.get(entries_start..entries_end))
    else {
        return map;
    };
    for entry in entries.chunks_exact(LD_CACHE_ENTRY_SIZE) {
        let (Some(key_rel), Some(val_rel)) = (read_u32_le(entry, 4), read_u32_le(entry, 8)) else {
            continue;
        };
        // key and value are offsets from the new-format header, not
        // from the start of the file.
        let (Some(key_off), Some(val_off)) = (hdr.checked_add(key_rel), hdr.checked_add(val_rel))
        else {
            continue;
        };
        let (Some(soname), Some(path_str)) = (read_cstr(&data, key_off), read_cstr(&data, val_off))
        else {
            continue;
        };
        // Only accept absolute paths that exist as files.
        if path_str.starts_with('/') {
            let p = PathBuf::from(path_str);
            if p.is_file() {
                map.entry(soname.to_string()).or_insert(p);
            }
        }
    }
    map
}

/// Read a little-endian `u32` from `data` at `offset`, widened to
/// `usize`. Returns `None` when fewer than four bytes are available.
fn read_u32_le(data: &[u8], offset: usize) -> Option<usize> {
    let end = offset.checked_add(4)?;
    let b = data.get(offset..end)?;
    Some(u32::from_le_bytes([b[0], b[1], b[2], b[3]]) as usize)
}

/// Read a null-terminated C string from `data` at `offset`.
///
/// Returns `None` when `offset` lies outside `data`, when no NUL
/// terminator follows it, or when the bytes are not valid UTF-8.
fn read_cstr(data: &[u8], offset: usize) -> Option<&str> {
    let tail = data.get(offset..)?;
    let end = tail.iter().position(|&b| b == 0)?;
    std::str::from_utf8(&tail[..end]).ok()
}

/// Resolve shared library dependencies for a dynamically-linked ELF binary.
/// Parses the ELF dynamic section to read DT_NEEDED entries, then resolves
/// each soname to a host path matching the host dynamic linker's search
/// order: DT_RPATH (legacy; only when DT_RUNPATH is absent) →
/// LD_LIBRARY_PATH → DT_RUNPATH → interp-relative hints → /etc/ld.so.cache
/// → default library paths. A separate /etc/ld.so.conf walk is omitted
/// because ldconfig ingests conf paths into ld.so.cache. When the binary
/// uses a non-standard PT_INTERP, the interpreter's parent and sibling
/// lib dirs feed the interp hints and are propagated to transitive deps.
/// Walks transitive deps via level-parallel BFS. Returns empty result
/// for static binaries or non-ELF files.
pub(crate) fn resolve_shared_libs(binary: &Path) -> Result<SharedLibs> {
    resolve_shared_libs_inner(binary, &[])
}

/// Variant of [`resolve_shared_libs`] that adds caller-provided
/// interp-relative hint directories to the BFS, in addition to those
/// derived from the binary's own PT_INTERP.
///
/// Intended for walking the interpreter itself. The linker carries no
/// PT_INTERP, so its auto-derived hint set is empty; without the extra
/// hints a toolchain-local chain such as interp→libA→libB through
/// `/opt/toolchain/lib` would drop out of the BFS as soon as libA has
/// to be resolved.
fn resolve_shared_libs_with_extra_interp_hints(
    binary: &Path,
    extra_interp_hints: &[PathBuf],
) -> Result<SharedLibs> {
    // Straight hand-off: the shared implementation merges the extras
    // with whatever it derives from `binary`.
    resolve_shared_libs_inner(binary, extra_interp_hints)
}

/// Shared implementation behind [`resolve_shared_libs`] and
/// [`resolve_shared_libs_with_extra_interp_hints`].
///
/// Reads `binary`, parses it as ELF, and walks its DT_NEEDED closure
/// breadth-first, resolving each soname with [`resolve_soname`].
/// Completed walks are memoized in a function-local static map keyed
/// by the canonicalized binary path plus `extra_interp_hints`.
///
/// Returns an empty [`SharedLibs`] without an interpreter when the
/// file does not parse as ELF, and an empty one carrying only the
/// interpreter when the ELF has neither needed libraries nor a dynamic
/// section. Neither early return is inserted into the cache.
///
/// # Errors
///
/// Fails only when there is no cached result and `binary` cannot be
/// read from disk.
#[tracing::instrument(skip_all, fields(binary = %binary.display(), extra_hints = extra_interp_hints.len()))]
fn resolve_shared_libs_inner(binary: &Path, extra_interp_hints: &[PathBuf]) -> Result<SharedLibs> {
    // Cache results by canonical path AND extra-hint set — avoids
    // re-resolving the same binary across concurrent initramfs builds
    // (nextest parallelism). Hints are part of the key so a second
    // call on the same binary with different hint sets does not return
    // the prior result.
    type LibCache = LazyLock<std::sync::Mutex<HashMap<(PathBuf, Vec<PathBuf>), SharedLibs>>>;
    static CACHE: LibCache = LazyLock::new(|| std::sync::Mutex::new(HashMap::new()));

    // Fall back to the path as given when canonicalization fails, so
    // the lookup below still has a usable key.
    let canon = std::fs::canonicalize(binary).unwrap_or_else(|_| binary.to_path_buf());
    let cache_key = (canon.clone(), extra_interp_hints.to_vec());
    // A poisoned mutex is treated as a cache miss rather than an error.
    if let Ok(cache) = CACHE.lock()
        && let Some(cached) = cache.get(&cache_key)
    {
        return Ok(cached.clone());
    }

    let data =
        std::fs::read(binary).with_context(|| format!("read binary: {}", binary.display()))?;
    let elf = match goblin::elf::Elf::parse(&data) {
        Ok(e) => e,
        Err(_) => {
            // Not a valid ELF (or 32-bit) — treat as static/non-dynamic.
            return Ok(SharedLibs {
                found: vec![],
                missing: vec![],
                interpreter: None,
            });
        }
    };

    let interpreter = elf.interpreter.map(|s| s.to_string());

    if elf.libraries.is_empty() && elf.dynamic.is_none() {
        // No dynamic section — static binary.
        return Ok(SharedLibs {
            found: vec![],
            missing: vec![],
            interpreter,
        });
    }

    // Extract DT_NEEDED, DT_RUNPATH, and DT_RPATH from the root binary.
    let root_needed: Vec<String> = elf.libraries.iter().map(|s| s.to_string()).collect();
    let root_search = elf_search_paths(&elf, binary);

    // When the binary uses a non-standard interpreter (custom toolchain),
    // collect the interpreter's parent dir and sibling lib dirs. These
    // are passed to resolve_soname as `interp_hints` alongside the
    // RPATH/RUNPATH split so the custom environment's libs are
    // consulted before system libs (ld.so.cache and the default
    // paths). LD_LIBRARY_PATH still overrides them per the
    // resolve_soname order contract.
    // Without this, the system libc gets resolved first, causing version
    // mismatches when the custom ld.so loads a libc that requires GLIBC
    // symbols the custom ld.so doesn't provide.
    let mut interp_search_dirs: Vec<PathBuf> = match interpreter {
        Some(ref interp) if !is_standard_interpreter(interp) => {
            let interp_path = Path::new(interp);
            let mut dirs = Vec::new();
            if let Some(parent) = interp_path.parent() {
                dirs.push(parent.to_path_buf());
                // Sibling lib dirs: e.g. for /opt/toolchain/lib64/ld.so,
                // parent is lib64, so siblings are at parent.parent()/lib
                // and parent.parent()/lib64.
                if let Some(grandparent) = parent.parent() {
                    dirs.push(grandparent.join("lib"));
                    dirs.push(grandparent.join("lib64"));
                }
            }
            dirs
        }
        _ => Vec::new(),
    };

    // Caller-supplied hints (used when walking the linker itself: the
    // linker has no PT_INTERP of its own, so the match arm above
    // produces an empty set and the caller passes the toolchain dirs
    // computed at the call site). Extras are appended after the
    // auto-derived dirs so a binary's own PT_INTERP-derived hints take
    // precedence; resolve_soname iterates in order.
    for hint in extra_interp_hints {
        if !interp_search_dirs.contains(hint) {
            interp_search_dirs.push(hint.clone());
        }
    }

    // Resolve the full transitive closure via level-parallel BFS.
    // Each level's file reads (read + ELF parse) run in parallel via
    // rayon. Soname resolution (resolve_soname) is cheap (cache lookups
    // + stat calls), so it stays sequential per level.
    use rayon::prelude::*;

    let mut found: Vec<(String, PathBuf)> = Vec::new();
    let mut missing: Vec<MissingLib> = Vec::new();
    // Sonames already handled (resolved or reported missing); each
    // soname is resolved at most once, using the search paths of the
    // first requester encountered.
    let mut visited = std::collections::HashSet::new();

    // Current level: (soname, search_paths_from_parent)
    let mut level: Vec<(String, ElfSearchPaths)> = root_needed
        .iter()
        .map(|s| (s.clone(), root_search.clone()))
        .collect();

    while !level.is_empty() {
        // Phase 1: resolve sonames to host paths (sequential, cheap).
        // Tuple layout: (soname, path as resolved, canonical path).
        let mut resolved: Vec<(String, PathBuf, PathBuf)> = Vec::new();
        for (soname, search_paths) in &level {
            if !visited.insert(soname.clone()) {
                continue;
            }
            if let Some(host_path) = resolve_soname(soname, search_paths, &interp_search_dirs) {
                let canonical =
                    std::fs::canonicalize(&host_path).unwrap_or_else(|_| host_path.clone());
                // Guest paths are archive-relative: drop the leading '/'.
                let canon_str = canonical.to_string_lossy();
                let canon_guest = canon_str
                    .strip_prefix('/')
                    .unwrap_or(&canon_str)
                    .to_string();
                found.push((canon_guest.clone(), canonical.clone()));

                // Also add the non-canonical path if it differs, so the
                // guest dynamic linker can find the lib via either path.
                let host_str = host_path.to_string_lossy();
                let host_guest = host_str.strip_prefix('/').unwrap_or(&host_str).to_string();
                if host_guest != canon_guest {
                    found.push((host_guest, canonical.clone()));
                }

                resolved.push((soname.clone(), host_path, canonical));
            } else {
                missing.push(MissingLib {
                    soname: soname.clone(),
                });
            }
        }

        // Phase 2: read + parse resolved libs in parallel to discover
        // their DT_NEEDED entries and search paths. The interp-relative
        // dirs apply uniformly to every resolve_soname call (via the
        // top-level `interp_search_dirs` slice), so transitive deps
        // don't need them threaded through per-level.
        // A library that cannot be read or parsed simply contributes
        // no further dependencies.
        let next_deps: Vec<(String, ElfSearchPaths)> = resolved
            .par_iter()
            .flat_map(|(_, _, canonical)| {
                let Ok(lib_data) = std::fs::read(canonical) else {
                    return Vec::new();
                };
                let Ok(lib_elf) = goblin::elf::Elf::parse(&lib_data) else {
                    return Vec::new();
                };
                let lib_search = elf_search_paths(&lib_elf, canonical);
                lib_elf
                    .libraries
                    .iter()
                    .map(|name| (name.to_string(), lib_search.clone()))
                    .collect::<Vec<_>>()
            })
            .collect();

        // Build next level from discovered deps, skipping already-visited.
        level = next_deps
            .into_iter()
            .filter(|(soname, _)| !visited.contains(soname))
            .collect();
    }

    let result = SharedLibs {
        found,
        missing,
        interpreter,
    };

    // Best-effort memoization; a poisoned lock just skips the insert.
    if let Ok(mut cache) = CACHE.lock() {
        cache.insert(cache_key, result.clone());
    }

    Ok(result)
}

/// DT_RPATH / DT_RUNPATH directories for a single binary.
///
/// glibc's `ld.so` treats these differently:
/// - **DT_RUNPATH** (modern): consulted AFTER `LD_LIBRARY_PATH`.
/// - **DT_RPATH** (legacy): consulted BEFORE `LD_LIBRARY_PATH`, but
///   only when `DT_RUNPATH` is absent (DT_RUNPATH presence causes the
///   loader to ignore DT_RPATH entirely).
///
/// Collapsing these into a single list (as the prior code did)
/// silently demoted legacy DT_RPATH binaries to DT_RUNPATH order,
/// which can produce different library resolution than the real
/// dynamic linker.
///
/// At most one of the two lists is non-empty for values built by
/// [`elf_search_paths`]; the `Default` value (both empty) represents a
/// binary that carries neither tag.
#[derive(Debug, Clone, Default)]
struct ElfSearchPaths {
    /// DT_RPATH directories, with dynamic tokens expanded. Non-empty
    /// only when the binary has DT_RPATH and no DT_RUNPATH (glibc
    /// ignores DT_RPATH when DT_RUNPATH is present).
    /// Searched first by [`resolve_soname`].
    rpath: Vec<PathBuf>,
    /// DT_RUNPATH directories, with dynamic tokens expanded.
    /// Searched by [`resolve_soname`] after `LD_LIBRARY_PATH`.
    runpath: Vec<PathBuf>,
}

/// Collect the DT_RUNPATH / DT_RPATH search directories of `elf`,
/// expanding the dynamic string tokens in each entry:
/// - `$ORIGIN` / `${ORIGIN}`: canonicalized parent directory of `binary`
///   (empty when there is no parent or it cannot be canonicalized)
/// - `$LIB` / `${LIB}`: `lib64` for 64-bit ELF, `lib` otherwise
/// - `$PLATFORM` / `${PLATFORM}`: the architecture this crate was built
///   for (`std::env::consts::ARCH`)
///
/// The two tag kinds are reported separately so `resolve_soname` can
/// apply glibc's ordering rules; see [`ElfSearchPaths`]. DT_RUNPATH
/// wins outright: when it is present DT_RPATH is not reported at all.
fn elf_search_paths(elf: &goblin::elf::Elf, binary: &Path) -> ElfSearchPaths {
    let origin_dir = match binary.parent().map(std::fs::canonicalize) {
        Some(Ok(dir)) => dir,
        _ => PathBuf::new(),
    };
    let origin_text = origin_dir.to_string_lossy();
    let lib_dir_name = if elf.is_64 { "lib64" } else { "lib" };
    let platform_name = std::env::consts::ARCH;

    // Token substitution for one path entry. The replacements run in a
    // fixed order: bare form first, braced form second, per token.
    let substitute = |entry: &str| -> PathBuf {
        let mut text = entry.to_string();
        for (token, value) in [
            ("$ORIGIN", &*origin_text),
            ("${ORIGIN}", &*origin_text),
            ("$LIB", lib_dir_name),
            ("${LIB}", lib_dir_name),
            ("$PLATFORM", platform_name),
            ("${PLATFORM}", platform_name),
        ] {
            text = text.replace(token, value);
        }
        PathBuf::from(text)
    };

    // Each tag value is a colon-separated list; empty segments are dropped.
    let expand_all = |tag_values: &[&str]| -> Vec<PathBuf> {
        tag_values
            .iter()
            .flat_map(|raw| raw.split(':'))
            .filter(|segment| !segment.is_empty())
            .map(&substitute)
            .collect()
    };

    if !elf.runpaths.is_empty() {
        // Modern: DT_RUNPATH is honored and overrides DT_RPATH completely.
        ElfSearchPaths {
            rpath: Vec::new(),
            runpath: expand_all(elf.runpaths.as_slice()),
        }
    } else if !elf.rpaths.is_empty() {
        // Legacy: only DT_RPATH. Searched before LD_LIBRARY_PATH.
        ElfSearchPaths {
            rpath: expand_all(elf.rpaths.as_slice()),
            runpath: Vec::new(),
        }
    } else {
        ElfSearchPaths::default()
    }
}

/// Paths at which a stock system dynamic linker is expected to live.
/// A binary whose PT_INTERP is, or canonicalizes to the same file as,
/// one of these uses the standard linker and does not need the
/// interpreter packed separately.
const STANDARD_INTERPRETERS: &[&str] = &[
    "/lib/ld-linux.so.2",
    "/lib/ld-linux-aarch64.so.1",
    "/lib/ld-linux-armhf.so.3",
    "/lib64/ld-linux-x86-64.so.2",
    "/lib/ld-musl-x86_64.so.1",
    "/lib/ld-musl-aarch64.so.1",
    "/libexec/ld-elf.so.1",
];

/// Report whether `interp` names a standard system linker.
///
/// A literal match against [`STANDARD_INTERPRETERS`] is tried first.
/// Failing that, `interp` is canonicalized and compared with the
/// canonical form of every well-known path that exists on this host,
/// which catches symlinks (e.g. `/opt/toolchain/lib/ld-linux-x86-64.so.2`
/// symlinking to `/lib64/ld-linux-x86-64.so.2`). An `interp` that
/// cannot be canonicalized is treated as non-standard.
fn is_standard_interpreter(interp: &str) -> bool {
    // Cheap string comparison covers the common case without syscalls.
    if STANDARD_INTERPRETERS.iter().any(|known| *known == interp) {
        return true;
    }
    match std::fs::canonicalize(Path::new(interp)) {
        Err(_) => false,
        Ok(resolved) => STANDARD_INTERPRETERS
            .iter()
            .filter_map(|known| std::fs::canonicalize(known).ok())
            .any(|known_canon| known_canon == resolved),
    }
}

/// Default library search paths used by the dynamic linker.
///
/// Probed in the listed order as the final fallback of
/// [`resolve_soname`], after every other lookup source has missed.
/// Covers the generic `lib` / `lib64` locations plus the
/// `x86_64-linux-gnu` and `aarch64-linux-gnu` triplet directories.
const DEFAULT_LIB_PATHS: &[&str] = &[
    "/lib",
    "/usr/lib",
    "/lib64",
    "/usr/lib64",
    "/usr/local/lib",
    "/usr/local/lib64",
    "/lib/x86_64-linux-gnu",
    "/usr/lib/x86_64-linux-gnu",
    "/lib/aarch64-linux-gnu",
    "/usr/lib/aarch64-linux-gnu",
];

/// Directories from the `LD_LIBRARY_PATH` environment variable, parsed
/// once on first access. Empty when the variable is unset or empty.
///
/// The value is read as an `OsString` and split with
/// [`std::env::split_paths`], so a directory whose name is not valid
/// UTF-8 is kept rather than causing the whole variable to be ignored.
/// Empty segments (`a::b`, leading/trailing `:`) are dropped.
static LD_LIBRARY_PATH_DIRS: LazyLock<Vec<PathBuf>> = LazyLock::new(|| {
    std::env::var_os("LD_LIBRARY_PATH")
        .map(|raw| {
            std::env::split_paths(&raw)
                .filter(|dir| !dir.as_os_str().is_empty())
                .collect()
        })
        .unwrap_or_default()
});

/// Map a soname to a file on the host, or `None` if nothing matches.
///
/// Lookup order mirrors the host dynamic linker (ld.so):
///   1. DT_RPATH (ONLY if DT_RUNPATH is absent — legacy order)
///   2. LD_LIBRARY_PATH
///   3. DT_RUNPATH (modern; ignored when DT_RPATH was used above)
///   4. interp-relative hints (custom toolchain support, not part of
///      glibc; treated as "RUNPATH-adjacent" to keep LD_LIBRARY_PATH
///      able to override them)
///   5. /etc/ld.so.cache (binary cache from ldconfig — already
///      covers everything in /etc/ld.so.conf, so no separate
///      conf-walk step)
///   6. Default library paths (/lib, /usr/lib, etc.)
///
/// This follows glibc ld.so(8): DT_RPATH outranks `LD_LIBRARY_PATH`
/// when it is the binary's only rpath-style entry, whereas DT_RUNPATH
/// is consulted only AFTER `LD_LIBRARY_PATH` so an admin override
/// still wins.
///
/// Directory-based steps return the first `<dir>/<soname>` that is a
/// regular file; the cache step returns the recorded path as-is.
fn resolve_soname(
    soname: &str,
    elf_paths: &ElfSearchPaths,
    interp_hints: &[PathBuf],
) -> Option<PathBuf> {
    // First `<dir>/<soname>` in `dirs` that exists as a file.
    let probe = |dirs: &[PathBuf]| -> Option<PathBuf> {
        dirs.iter()
            .map(|dir| dir.join(soname))
            .find(|candidate| candidate.is_file())
    };

    // Each `or_else` stage runs only when every earlier stage missed,
    // so the lazily-initialized statics are touched no earlier than
    // their step requires.
    //
    // 1. DT_RPATH (legacy). Non-empty only when DT_RUNPATH is absent
    //    per `elf_search_paths`.
    probe(elf_paths.rpath.as_slice())
        // 2. LD_LIBRARY_PATH.
        .or_else(|| probe(LD_LIBRARY_PATH_DIRS.as_slice()))
        // 3. DT_RUNPATH (modern).
        .or_else(|| probe(elf_paths.runpath.as_slice()))
        // 4. Interp-relative hints for non-standard dynamic linkers.
        //    Not a glibc concept — ktstr uses these to keep custom
        //    toolchain libs resolvable without requiring the user to
        //    set LD_LIBRARY_PATH.
        .or_else(|| probe(interp_hints))
        // 5. ld.so.cache — the real dynamic linker's primary lookup.
        //    Catches libraries in directories added via
        //    `ldconfig /path` that don't appear in ld.so.conf.
        .or_else(|| LD_SO_CACHE.get(soname).cloned())
        // 6. Default paths. (No ld.so.conf step: ldconfig already
        //    ingests conf paths into ld.so.cache.)
        .or_else(|| {
            DEFAULT_LIB_PATHS
                .iter()
                .map(|dir| Path::new(dir).join(soname))
                .find(|candidate| candidate.is_file())
        })
}

/// The four identification bytes every ELF file starts with: `\x7fELF`.
const ELF_MAGIC: &[u8; 4] = b"\x7fELF";

/// Report whether the file at `path` begins with the ELF magic.
///
/// Any failure to open the file or to read four bytes from it (missing
/// file, directory, file shorter than four bytes) counts as "not ELF".
fn is_elf(path: &Path) -> bool {
    use std::io::Read;

    let Ok(mut file) = std::fs::File::open(path) else {
        return false;
    };
    let mut header = [0u8; 4];
    match file.read_exact(&mut header) {
        Ok(()) => header == *ELF_MAGIC,
        Err(_) => false,
    }
}

/// Write one entry (file or directory) into the cpio archive.
fn write_entry(archive: &mut Vec<u8>, name: &str, data: &[u8], mode: u32) -> Result<()> {
    let builder = cpio::newc::Builder::new(name).mode(mode).nlink(1);
    let mut writer = builder.write(archive as &mut dyn Write, data.len() as u32);
    writer
        .write_all(data)
        .with_context(|| format!("write cpio entry '{name}'"))?;
    writer.finish().context("finish cpio entry")?;
    Ok(())
}

/// Write a cpio symlink entry. `name` is the symlink path, `target` is the
/// absolute path it points to. Mode is S_IFLNK | 0777 = 0o120777.
fn write_symlink_entry(archive: &mut Vec<u8>, name: &str, target: &str) -> Result<()> {
    let target_bytes = target.as_bytes();
    let builder = cpio::newc::Builder::new(name).mode(0o120777).nlink(1);
    let mut writer = builder.write(archive as &mut dyn Write, target_bytes.len() as u32);
    writer
        .write_all(target_bytes)
        .with_context(|| format!("write cpio symlink '{name}' -> '{target}'"))?;
    writer.finish().context("finish cpio symlink entry")?;
    Ok(())
}

/// Section names removed during debug stripping. These contain debug
/// info, compiler metadata, and profiling data that inflate the binary
/// but are not needed inside the VM.
///
/// Entries are byte strings because `strip_debug_sections` matches
/// them against section names with an exact `contains` check; a
/// section is removed only when its full name appears here.
const DEBUG_SECTIONS: &[&[u8]] = &[
    b".debug_info",
    b".debug_abbrev",
    b".debug_line",
    b".debug_line_str",
    b".debug_str",
    b".debug_ranges",
    b".debug_aranges",
    b".debug_frame",
    b".debug_loc",
    b".debug_loclists",
    b".debug_rnglists",
    b".debug_str_offsets",
    b".debug_addr",
    b".debug_pubtypes",
    b".debug_pubnames",
    b".debug_types",
    b".debug_macro",
    b".debug_macinfo",
    // Not a `.debug_*` section, but listed for removal as well.
    b".comment",
];

/// Load the binary at `path` with its debug sections removed, to keep
/// the initramfs small. Debug info can be 10-50x the loadable segment
/// size and is not needed inside the VM. The rewrite is delegated to
/// `strip_debug_sections`; if that fails, the bytes are returned
/// unstripped rather than treated as an error.
///
/// When `path` is the running executable and it has been deleted
/// (e.g. by `cargo llvm-cov`), `/proc/self/exe` is tried as a second
/// source, since it stays readable for as long as the process lives.
///
/// # Errors
///
/// Fails only when no candidate source could be read; the error
/// reported is the one from reading `path` itself.
fn strip_debug(path: &Path) -> Result<Vec<u8>> {
    // `/proc/self/exe` is only a candidate when `path` is our own,
    // already-deleted executable (cargo llvm-cov deletes binaries
    // after instrumenting).
    let proc_fallback = is_deleted_self(path).then_some(Path::new("/proc/self/exe"));

    for candidate in std::iter::once(path).chain(proc_fallback) {
        let Ok(raw) = std::fs::read(candidate) else {
            continue;
        };
        // The first readable source decides the outcome.
        return Ok(match strip_debug_sections(&raw) {
            Ok(stripped) => stripped,
            // object crate failed to parse/write — return unstripped.
            Err(_) => raw,
        });
    }

    std::fs::read(path).with_context(|| format!("read binary: {}", path.display()))
}

/// Remove debug sections from ELF data using the shared
/// [`crate::elf_strip::rewrite`] primitive. Thin filter — delete
/// sections whose name is in the explicit [`DEBUG_SECTIONS`] list.
///
/// `data` is the complete ELF image; the rewritten image is returned
/// as a new buffer. Any error from `rewrite` is passed through
/// unchanged.
fn strip_debug_sections(data: &[u8]) -> std::result::Result<Vec<u8>, object::build::Error> {
    // The predicate is an exact, whole-name match against the list.
    crate::elf_strip::rewrite(data, |name| DEBUG_SECTIONS.contains(&name))
}

/// Check if `path` is the current executable and has been deleted.
///
/// Reads the `/proc/self/exe` symlink and returns `true` only when its
/// target is exactly `path` followed by the ` (deleted)` marker.
/// Exactly one marker is removed before comparing, so an executable
/// whose real file name itself ends in ` (deleted)` is still matched
/// correctly. Returns `false` when the link cannot be read.
fn is_deleted_self(path: &Path) -> bool {
    let Ok(target) = std::fs::read_link("/proc/self/exe") else {
        return false;
    };
    let link_text = target.to_string_lossy();
    let wanted = path.to_string_lossy();
    link_text
        .strip_suffix(" (deleted)")
        .is_some_and(|original| original == &*wanted)
}

// The paragraph below documents `build_initramfs_base`, which is
// defined after this helper. It is kept as a plain comment so rustdoc
// does not attach it to `register_parent_dirs`.
//
// Build the base cpio archive: /init binary, extra binaries, and shared
// libraries. Does NOT include /args, trailer, or 512-byte padding. The
// returned bytes are a valid cpio prefix that `build_suffix` can complete
// with per-invocation args.
//
// The test binary is packed as `/init` (the kernel's rdinit entry point).
// Init setup (mounts, scheduler start, etc.) is handled by the Rust init
// code in `vmm::rust_init`, which runs when the binary detects PID 1.
//
// When `busybox` is true, embeds busybox at `bin/busybox` for shell mode.

/// Expand `guest_path`'s parent into every ancestor directory component
/// and insert each into `dirs`. No-op when the path has no parent
/// directory components (e.g. top-level files like `init`). The
/// component walk produces every intermediate directory, e.g.
/// `include-files/sub/f` registers `include-files` and
/// `include-files/sub`.
///
/// Directory names are stored via a lossy UTF-8 conversion of the
/// accumulated path.
fn register_parent_dirs(dirs: &mut BTreeSet<String>, guest_path: &str) {
    let Some(parent) = Path::new(guest_path).parent() else {
        return;
    };
    // Grow the prefix one component at a time, recording each stage.
    let mut prefix = PathBuf::new();
    dirs.extend(parent.components().map(|component| {
        prefix.push(component);
        prefix.to_string_lossy().into_owned()
    }));
}

/// Build the base cpio archive: the `init` binary, optional busybox,
/// extra binaries, include files, and the shared libraries they need.
/// The result does NOT contain `/args`, the cpio trailer, or the
/// 512-byte padding; `build_suffix` appends those per invocation.
///
/// `payload` is packed as `init`. When `busybox` is true, busybox is
/// embedded at `bin/busybox`.
///
/// `include_files` adds files verbatim to the archive (no strip_debug).
/// Each entry is `(archive_path, host_path)`. ELF files get shared library
/// resolution; non-ELF files are copied as-is. Symlinks are followed to
/// their target; the target must be a regular file (FIFOs, device nodes,
/// and sockets are rejected). Archive paths must not contain `..`
/// components and must not start with `.ktstr_`. Callers expand
/// directories into individual file entries before calling this function
/// (see `cli::resolve_include_files`).
///
/// The init binary is `strip_debug`'d before being written into the
/// archive. Extras are stripped the same way.
///
/// Entries are written in this order: directories, `init`, optional
/// `bin/busybox`, extra binaries, include files, shared libraries, and
/// finally the `.ktstr_init_ok` sentinel.
///
/// # Errors
///
/// Fails when an include path is rejected by validation or cannot be
/// stat'ed or read, when stripping/reading a binary fails, when shared
/// library resolution fails, when an include-file ELF has unresolved
/// shared libraries, or when writing an archive entry fails.
#[tracing::instrument(skip_all, fields(payload = %payload.display(), includes = include_files.len()))]
pub fn build_initramfs_base(
    payload: &Path,
    extra_binaries: &[(&str, &Path)],
    include_files: &[(&str, &Path)],
    busybox: bool,
) -> Result<Vec<u8>> {
    // Validate include_files and collect metadata (reused in the write
    // loop to avoid a second stat syscall per file). Each tuple is
    // (archive_path, host_path, st_mode).
    let mut validated_includes: Vec<(&str, &Path, u32)> = Vec::with_capacity(include_files.len());
    for (archive_path, host_path) in include_files {
        // Reject path traversal.
        if Path::new(archive_path)
            .components()
            .any(|c| matches!(c, std::path::Component::ParentDir))
        {
            anyhow::bail!("include_files archive path contains '..': {}", archive_path);
        }
        // Reject paths that collide with internal sentinel files.
        if archive_path.starts_with(".ktstr_") {
            anyhow::bail!(
                "include_files archive path must not start with '.ktstr_': {}",
                archive_path
            );
        }
        // Follow symlinks: include_files entries are explicitly
        // specified by the test author, so a symlink to a regular
        // file is intentional (e.g. package-manager symlinks in
        // /usr/local/bin). The is_file() check below catches
        // non-regular targets (directories, FIFOs, devices).
        let meta = std::fs::metadata(host_path).with_context(|| {
            format!(
                "stat include file '{}': {}",
                archive_path,
                host_path.display()
            )
        })?;
        // Reject non-regular files (FIFOs, device nodes, sockets block or
        // produce garbage).
        if !meta.file_type().is_file() {
            anyhow::bail!(
                "include_files entry '{}' is not a regular file: {}",
                archive_path,
                host_path.display()
            );
        }
        validated_includes.push((archive_path, host_path, meta.permissions().mode()));
    }

    // Stripped init binary bytes; the span is scoped to the strip step.
    let binary = {
        let _s = tracing::debug_span!("strip_debug").entered();
        strip_debug(payload).with_context(|| format!("strip/read binary: {}", payload.display()))?
    };
    let mut archive = Vec::new();

    // Collect directory entries needed for shared libraries and includes.
    let mut dirs = BTreeSet::new();

    // Resolve shared library dependencies for init binary and extras.
    // `shared_libs` holds (guest_path, host_path) pairs.
    let mut shared_libs: Vec<(String, PathBuf)> = Vec::new();
    let mut all_binaries: Vec<&Path> = std::iter::once(payload)
        .chain(extra_binaries.iter().map(|(_, p)| *p))
        .collect();

    // ELF files from include_files join the shared lib resolution chain.
    // `include_elf_paths` remembers which binaries came from include_files
    // so unresolved libraries can be made fatal for them below.
    let mut include_elf_paths: Vec<&Path> = Vec::new();
    for (_, host_path) in include_files {
        if is_elf(host_path) {
            include_elf_paths.push(host_path);
            all_binaries.push(host_path);
        }
    }

    // Span covering the whole resolution phase; dropped explicitly once
    // all directories have been registered.
    let _s_resolve = tracing::debug_span!("resolve_all_libs", count = all_binaries.len()).entered();
    for path in &all_binaries {
        let _s_one =
            tracing::debug_span!("resolve_shared_libs", binary = %path.display()).entered();
        let result = resolve_shared_libs(path)
            .with_context(|| format!("resolve libs for {}", path.display()))?;
        drop(_s_one);

        // Include-file ELFs must have all shared libs resolvable.
        if !result.missing.is_empty() && include_elf_paths.contains(path) {
            let names: Vec<&str> = result.missing.iter().map(|m| m.soname.as_str()).collect();
            anyhow::bail!(
                "{}: missing shared libraries: {}",
                path.display(),
                names.join(", ")
            );
        }

        // Pack PT_INTERP (dynamic linker) into the initramfs. The
        // interpreter is not a DT_NEEDED entry and won't appear in the
        // resolved shared libs, so it must be added explicitly.
        // For non-standard interpreters, also resolve their own deps.
        tracing::debug!(
            binary = %path.display(),
            interpreter = ?result.interpreter,
            is_include = include_elf_paths.contains(path),
            "resolved interpreter for binary"
        );
        if let Some(ref interp) = result.interpreter {
            let interp_path = Path::new(interp);
            let is_standard = is_standard_interpreter(interp);
            tracing::debug!(
                interp = %interp_path.display(),
                exists = interp_path.is_file(),
                is_standard,
                "interpreter details"
            );
            // An interpreter that does not exist on the host is skipped.
            if interp_path.is_file() {
                // Fall back to the literal PT_INTERP path when
                // canonicalization fails.
                let canonical = std::fs::canonicalize(interp_path)
                    .unwrap_or_else(|_| interp_path.to_path_buf());
                let canon_str = canonical.to_string_lossy();
                // Guest paths are archive-relative: drop the leading '/'.
                let guest = canon_str
                    .strip_prefix('/')
                    .unwrap_or(&canon_str)
                    .to_string();
                register_parent_dirs(&mut dirs, &guest);
                tracing::debug!(
                    canonical_guest = %guest,
                    canonical_host = %canonical.display(),
                    "packing interpreter canonical path"
                );
                shared_libs.push((guest.clone(), canonical.clone()));

                // Also add the non-canonical path if it differs.
                let orig_guest = interp.strip_prefix('/').unwrap_or(interp).to_string();
                if orig_guest != guest {
                    tracing::debug!(
                        orig_guest = %orig_guest,
                        canonical_guest = %guest,
                        "packing interpreter original (non-canonical) path"
                    );
                    register_parent_dirs(&mut dirs, &orig_guest);
                    shared_libs.push((orig_guest, canonical));
                } else {
                    tracing::debug!("interpreter original path matches canonical, no alias needed");
                }

                // Non-standard interpreters may have their own shared lib
                // deps (custom toolchain linkers alongside their libs).
                // The linker has no PT_INTERP itself, so calling
                // resolve_shared_libs without extra hints would BFS its
                // DT_NEEDED only against system search paths and miss
                // toolchain-local libs (and their transitive deps). Feed
                // the linker's parent and sibling lib dirs in as extra
                // interp-relative hints so the linker's own libA→libB
                // chain is discovered against the same toolchain dirs
                // the parent binary's BFS already used.
                if !is_standard_interpreter(interp) {
                    let mut interp_hints: Vec<PathBuf> = Vec::new();
                    if let Some(parent) = interp_path.parent() {
                        interp_hints.push(parent.to_path_buf());
                        if let Some(grandparent) = parent.parent() {
                            interp_hints.push(grandparent.join("lib"));
                            interp_hints.push(grandparent.join("lib64"));
                        }
                    }
                    // A resolution error here is ignored: the
                    // interpreter's own deps are simply not added.
                    if let Ok(interp_result) =
                        resolve_shared_libs_with_extra_interp_hints(interp_path, &interp_hints)
                    {
                        for (g, h) in interp_result.found {
                            register_parent_dirs(&mut dirs, &g);
                            shared_libs.push((g, h));
                        }
                    }
                }
            }
        }

        for (guest_path, host_path) in result.found {
            register_parent_dirs(&mut dirs, &guest_path);
            shared_libs.push((guest_path, host_path));
        }
    }
    // Sort by guest path so duplicates become adjacent, then keep one
    // entry per guest path.
    let pre_dedup_count = shared_libs.len();
    shared_libs.sort_by(|a, b| a.0.cmp(&b.0));
    shared_libs.dedup_by(|a, b| a.0 == b.0);
    tracing::debug!(
        pre_dedup = pre_dedup_count,
        post_dedup = shared_libs.len(),
        removed = pre_dedup_count - shared_libs.len(),
        "shared_libs dedup"
    );

    // Busybox needs bin/ directory.
    if busybox {
        dirs.insert("bin".to_string());
    }
    // Include files need their parent directories in the cpio archive.
    // The component walk produces all ancestors (e.g. "include-files/sub/f"
    // yields "include-files" and "include-files/sub").
    for (archive_path, _, _) in &validated_includes {
        register_parent_dirs(&mut dirs, archive_path);
    }
    // Extras with a path component (e.g. "bin/ktstr-jemalloc-probe")
    // need their parent directory registered too, otherwise the
    // kernel's cpio extractor sees the file before the directory
    // entry exists and silently drops it. The `("scheduler",
    // path)` case writes to archive root and has no parent to
    // register, so `register_parent_dirs` no-ops on it.
    for (name, _) in extra_binaries {
        register_parent_dirs(&mut dirs, name);
    }

    drop(_s_resolve);

    tracing::debug!(
        shared_libs_count = shared_libs.len(),
        dirs_count = dirs.len(),
        dirs = ?dirs,
        shared_libs_guests = ?shared_libs.iter().map(|(g, _)| g.as_str()).collect::<Vec<_>>(),
        "pre-write archive contents"
    );

    let _s_write = tracing::debug_span!("write_cpio").entered();
    // Directory entries. `dirs` is a BTreeSet, so a parent directory
    // sorts (and is written) before anything beneath it.
    for dir in &dirs {
        write_entry(&mut archive, dir, &[], 0o40755)?;
    }

    // Test binary as /init — the Rust init code detects PID 1 and performs
    // all setup (mounts, scheduler, etc.) before running the test function.
    write_entry(&mut archive, "init", &binary, 0o100755)?;

    // Shell mode: embed busybox.
    if busybox {
        write_entry(&mut archive, "bin/busybox", crate::BUSYBOX, 0o100755)?;
    }

    // Extra binaries (stripped to reduce initramfs size)
    for (name, path) in extra_binaries {
        let data = strip_debug(path)
            .with_context(|| format!("strip/read extra binary '{}': {}", name, path.display()))?;
        write_entry(&mut archive, name, &data, 0o100755)?;
    }

    // Include files: copied verbatim, preserving original content and
    // debug symbols. No strip_debug — included files are user-provided
    // and may be non-ELF. The mode recorded during validation is reused.
    for (archive_path, host_path, mode) in &validated_includes {
        let data = std::fs::read(host_path).with_context(|| {
            format!(
                "read include file '{}': {}",
                archive_path,
                host_path.display()
            )
        })?;
        write_entry(&mut archive, archive_path, &data, *mode)?;
    }

    // Shared libraries — write each canonical host file once as a regular
    // file, then write subsequent guest paths that map to the same host
    // file as cpio symlinks. This avoids duplicating large libraries in
    // the initramfs (e.g. libc appearing under both lib64/ and usr/lib64/).
    {
        // canonical host path -> first guest_path written for this file
        let mut written_files: HashMap<PathBuf, String> = HashMap::new();
        for (guest_path, host_path) in &shared_libs {
            let canonical = std::fs::canonicalize(host_path).unwrap_or_else(|_| host_path.clone());
            if let Some(first_guest) = written_files.get(&canonical) {
                // Already written — emit a symlink to the first guest path.
                // The target is absolute within the guest filesystem.
                let target = format!("/{first_guest}");
                write_symlink_entry(&mut archive, guest_path, &target)?;
            } else {
                let data = std::fs::read(host_path).with_context(|| {
                    format!("read shared lib '{}': {}", guest_path, host_path.display())
                })?;
                write_entry(&mut archive, guest_path, &data, 0o100755)?;
                written_files.insert(canonical, guest_path.clone());
            }
        }
    }

    // Sentinel: last entry before the suffix. The guest init checks for
    // this file to detect incomplete initramfs extraction.
    write_entry(&mut archive, ".ktstr_init_ok", &[], 0o100644)?;

    drop(_s_write);

    Ok(archive)
}

/// Per-invocation inputs that turn a cached base archive into a
/// complete initramfs. Borrows all slices so callers can build a
/// `SuffixParams` without copying `Vec<String>` fields.
///
/// `#[derive(Default)]` yields empty slices and `exec_cmd: None`.
/// `build_suffix` always writes the `args` entry and skips each of the
/// other entries when its field is empty / `None`.
#[derive(Default)]
pub struct SuffixParams<'a> {
    /// `/args` contents — one entry per line (joined with `\n`).
    pub args: &'a [String],
    /// `/sched_args` contents, or empty to skip the entry.
    pub sched_args: &'a [String],
    /// `/sched_enable` shell-script lines for kernel-built
    /// schedulers, or empty to skip the entry.
    pub sched_enable: &'a [String],
    /// `/sched_disable` shell-script lines, or empty to skip.
    pub sched_disable: &'a [String],
    /// `/exec_cmd` contents when `--exec` is used; `None` otherwise.
    pub exec_cmd: Option<&'a str>,
}

/// Produce the bytes that complete a base archive of `base_len` bytes.
///
/// The suffix holds, in order: the `args` entry (always), then
/// `sched_args`, `sched_enable`, `sched_disable` and `exec_cmd` (each
/// only when the matching `params` field is non-empty / `Some`), the
/// cpio trailer, and zero padding so that `base_len` plus the suffix
/// length is a multiple of 512. Multi-line entries are joined with `\n`.
/// The returned Vec is typically ~200 bytes.
///
/// # Errors
///
/// Propagates failures from writing an entry or the cpio trailer.
pub fn build_suffix(base_len: usize, params: &SuffixParams<'_>) -> Result<Vec<u8>> {
    let mut out = Vec::new();

    // `args` is written unconditionally, even when there are no args.
    write_entry(&mut out, "args", params.args.join("\n").as_bytes(), 0o100644)?;

    // Optional line-based entries: (name, lines, mode). The enable/disable
    // scripts for kernel-built schedulers are written executable.
    let optional_entries: [(&str, &[String], u32); 3] = [
        ("sched_args", params.sched_args, 0o100644),
        ("sched_enable", params.sched_enable, 0o100755),
        ("sched_disable", params.sched_disable, 0o100755),
    ];
    for &(name, lines, mode) in optional_entries.iter() {
        if !lines.is_empty() {
            write_entry(&mut out, name, lines.join("\n").as_bytes(), mode)?;
        }
    }

    if let Some(cmd) = params.exec_cmd {
        write_entry(&mut out, "exec_cmd", cmd.as_bytes(), 0o100644)?;
    }

    // Trailer
    cpio::newc::trailer(&mut out as &mut dyn Write).context("write cpio trailer")?;

    // Pad base + suffix up to the next 512-byte boundary (initramfs
    // convention); nothing is added when already aligned.
    let total = base_len + out.len();
    let pad = match total % 512 {
        0 => 0,
        rem => 512 - rem,
    };
    out.resize(out.len() + pad, 0u8);

    Ok(out)
}

// ---------------------------------------------------------------------------
// POSIX shared-memory cache for base initramfs
// ---------------------------------------------------------------------------

/// Architecture tag baked into every shm segment name.
///
/// Hosts where ktstr binaries for different architectures share
/// `/dev/shm` would otherwise risk one binary picking up a cache written
/// by the other. The tag is chosen at compile time, so a given binary
/// always emits exactly one value.
#[cfg(target_arch = "x86_64")]
pub(crate) const SHM_ARCH_TAG: &str = "x86_64";
#[cfg(target_arch = "aarch64")]
pub(crate) const SHM_ARCH_TAG: &str = "aarch64";

/// Name of the shm segment that caches the base initramfs for
/// `content_hash`.
///
/// The name embeds [`SHM_ARCH_TAG`] and the hash as 16 zero-padded
/// lowercase hex digits, so each distinct hash (payload + scheduler
/// binaries) maps to its own segment and an x86_64 binary never mmaps a
/// segment written by an aarch64 binary on the same host.
pub(crate) fn shm_segment_name(content_hash: u64) -> String {
    let segment = format!("/ktstr-base-{SHM_ARCH_TAG}-{content_hash:016x}");
    segment
}

/// Read-only mmap of a POSIX shared-memory segment. The mapping stays
/// live until the struct is dropped, so callers can borrow the bytes
/// without copying the entire base archive.
///
/// Holds the shared flock (`LOCK_SH`) for the lifetime of the mapping
/// so that a concurrent writer cannot `ftruncate` the segment beneath
/// us, which would cause `SIGBUS` on access to the truncated pages.
///
/// # Drop under `panic = "abort"`
///
/// Cargo.toml release profile sets `panic = "abort"`, so a panic in
/// the release binary aborts the process without unwinding — Drop
/// impls are **not run**. For `MappedShm` this means `munmap` and
/// the `flock` unlock are skipped on the abort path. Neither is a
/// resource leak in practice:
///
/// - **mmap**: the kernel reclaims all mappings on process exit.
/// - **flock**: the lock is released when the fd is closed, which
///   the kernel does on process exit.
///
/// The **SHM segment itself**, however, persists in `/dev/shm` until
/// something calls `shm_unlink` on it. An aborted process never
/// reaches its normal unlink sites, and `libc::atexit` handlers do
/// not run on `abort()` either — so the segment leaks *from this
/// run's perspective*. Cleanup is deferred: the next ktstr run
/// sweeps orphans via [`crate::vmm::cleanup_stale_shm`], which
/// non-blockingly `LOCK_EX`'s each stale entry and unlinks it when
/// no other process holds a lock.
///
/// The resulting accumulation is bounded in the common case:
/// repeated aborts during iterative local development trigger the
/// next-run sweep, and /dev/shm is typically sized at ~50% of RAM
/// (tmpfs default) which can hold many orphans before the tmpfs
/// fills up.
/// The pathological case is **one-shot CI jobs** that panic-abort
/// and are never re-run: the segment remains until host reboot or
/// manual `rm /dev/shm/ktstr-*` cleanup. An `atexit`-based cleanup
/// would not help (`abort()` bypasses atexit); a signal handler
/// that runs before the abort could, but would have to be
/// async-signal-safe and cannot reliably walk the tracked-segment
/// set under arbitrary signal arrival. Accepting the bounded leak
/// is the design tradeoff from the panic-strategy decision
/// (panic=abort with audited threads).
pub(crate) struct MappedShm {
    /// Start address of the mapping, as returned by `mmap` in
    /// `shm_load_base`.
    ptr: *const u8,
    /// Length of the mapping in bytes (the segment size observed via
    /// `fstat` when it was mapped).
    len: usize,
    /// Descriptor of the segment; `LOCK_SH` is held on it until drop.
    fd: std::os::fd::OwnedFd,
}

// SAFETY (applies to both impls below): the raw pointer field is what
// prevents the auto traits. The mmap is MAP_SHARED|PROT_READ over a shm
// segment whose contents are held stable for the mapping's lifetime by
// a shared flock retained in `fd`. The pointer and length are valid for
// the lifetime of the mapping, and the type only ever hands out shared
// `&[u8]` views of it.
unsafe impl Send for MappedShm {}
unsafe impl Sync for MappedShm {}

impl AsRef<[u8]> for MappedShm {
    /// Borrow the mapped segment as a byte slice without copying it.
    fn as_ref(&self) -> &[u8] {
        let (base, byte_len) = (self.ptr, self.len);
        // SAFETY: `base`/`byte_len` come from a successful mmap in
        // shm_load_base. The shared flock retained in self.fd keeps a
        // cooperating writer from ftruncate-ing the segment while the
        // mapping exists, so the whole range stays readable for as long
        // as `self` is borrowed.
        unsafe { std::slice::from_raw_parts(base, byte_len) }
    }
}

impl Drop for MappedShm {
    /// Tear down the mapping, then release the shared flock.
    fn drop(&mut self) {
        use rustix::fs::{flock, FlockOperation};

        // Note: under `panic = "abort"` this Drop is skipped entirely.
        // See `MappedShm`'s type doc for the /dev/shm accumulation
        // discussion and the `cleanup_stale_shm` recovery path.
        let mapping = self.ptr.cast_mut().cast::<libc::c_void>();

        // SAFETY: `mapping`/`self.len` describe the region returned by
        // mmap when this value was built. The mapping is released first
        // so LOCK_SH still covers it right up to this point.
        unsafe {
            libc::munmap(mapping, self.len);
        }

        // Best-effort explicit unlock; the OwnedFd's own drop closes the
        // descriptor once this function returns.
        let _ = flock(&self.fd, FlockOperation::Unlock);
    }
}

/// Map the cached base initramfs for `content_hash` read-only.
///
/// Opens the POSIX shared-memory segment named by `shm_segment_name`,
/// takes a shared flock (`LOCK_SH`), and mmaps the whole segment. The
/// returned `MappedShm` borrows the data without copying and keeps the
/// lock for as long as it lives. Returns `None` on a cache miss, an
/// empty segment, or any syscall error.
///
/// A concurrent writer calls `ftruncate` under `LOCK_EX` in `shm_store`
/// (and in `shm_write_and_release`, which also `ftruncate`s to 0 on mmap
/// failure); holding `LOCK_SH` for the mapping's lifetime prevents
/// either writer from truncating the segment out from under us, which
/// would turn any access to the truncated pages into `SIGBUS`.
///
/// Note: `flock` is advisory — it only protects against other
/// processes that also call `flock`. A process that writes the
/// segment without taking `LOCK_EX` (e.g. `rm /dev/shm/…` + recreate
/// by an unrelated tool) bypasses this scheme. All callers within
/// this crate cooperate, which is the closed-world guarantee we
/// rely on.
pub(crate) fn shm_load_base(content_hash: u64) -> Option<MappedShm> {
    use rustix::fs::{flock, fstat, FlockOperation};
    use std::os::fd::AsRawFd;

    let segment = shm_segment_name(content_hash);
    let fd = rustix::shm::open(
        segment.as_str(),
        rustix::shm::OFlags::RDONLY,
        rustix::fs::Mode::empty(),
    )
    .ok()?;

    // Shared lock — blocks until any concurrent writer releases
    // LOCK_EX. It stays held until MappedShm::drop. On the `?` early
    // returns the fd is simply dropped, which closes the descriptor.
    flock(&fd, FlockOperation::LockShared).ok()?;

    let size = fstat(&fd).ok()?.st_size;
    if size <= 0 {
        // Empty segment: treat as a miss.
        let _ = flock(&fd, FlockOperation::Unlock);
        return None;
    }
    let len = size as usize;

    // SAFETY: mmap only reads the raw fd value and does not take
    // ownership; the OwnedFd lives on in `fd` and is closed after
    // `MappedShm::drop` has unmapped the region.
    let mapped = unsafe {
        libc::mmap(
            std::ptr::null_mut(),
            len,
            libc::PROT_READ,
            libc::MAP_SHARED,
            fd.as_raw_fd(),
            0,
        )
    };

    if mapped == libc::MAP_FAILED {
        let _ = flock(&fd, FlockOperation::Unlock);
        return None;
    }

    Some(MappedShm {
        ptr: mapped as *const u8,
        len,
        fd,
    })
}

/// Write `data` to a POSIX SHM segment identified by `name`.
///
/// Creates (or opens existing) the segment with `O_CREAT | O_RDWR`,
/// takes an exclusive flock, `ftruncate`s to `data.len()`, `mmap`s
/// `PROT_WRITE | MAP_SHARED`, copies, and cleans up.
///
/// Concurrency: `LOCK_EX` blocks while any reader holds `LOCK_SH` on
/// the same segment (e.g. a live `MappedShm` or `CowOverlayGuard`).
/// The writer thus waits for in-flight VMs before truncating — which
/// is what prevents the `SIGBUS` class of bug addressed by the
/// reader-side flock lifetime in `shm_load_base` and `cow_overlay`.
///
/// Writes are content-addressed at the caller: callers hash `data`
/// to form the segment name. When two callers write the same content
/// to the same hash, the payload length and bytes are identical, so
/// the second `ftruncate(same_len)` is a no-op on page contents and
/// the second memcpy writes the same bytes. A third-party caller
/// that writes DIFFERENT data to an already-used hash (e.g. the
/// rename-test pattern) will overwrite — the store does not enforce
/// idempotence itself.
fn shm_store(name: &str, data: &[u8]) -> Result<()> {
    use std::os::fd::AsRawFd;

    let fd = rustix::shm::open(
        name,
        rustix::shm::OFlags::CREATE | rustix::shm::OFlags::RDWR,
        rustix::fs::Mode::from_raw_mode(0o644),
    )
    .map_err(|e| anyhow::anyhow!("shm_open: {e}"))?;

    // Surfaces the wait explicitly: with readers holding LOCK_SH
    // for VM lifetime, concurrent test runs can block here for
    // seconds. Without this, the user sees silent hang.
    tracing::info!(
        segment = name,
        data_len = data.len(),
        "shm_store: waiting for LOCK_EX"
    );
    // Post-flock error paths (ftruncate/mmap failure) call shm_unlink
    // before the OwnedFd drop so a half-initialized segment (empty /
    // corrupt / wrong size) does not persist in /dev/shm where the
    // next caller would read it. Those paths hold LOCK_EX, so the
    // segment is either newly created by this call or being
    // exclusively rewritten — destroying it is safe. Unlinking may
    // also remove a pre-existing valid segment that the caller had
    // just opened then failed to rewrite; content-addressing makes
    // recovery cheap — the next writer re-creates with the same hash.
    //
    // Pre-flock (shm_open succeeded, flock failed) does NOT unlink:
    // the segment may be a pre-existing valid one that a peer writer
    // holds LOCK_EX on, and we must not destroy another process's
    // cache entry on our own flock error.
    //
    // Success path does NOT unlink — the segment IS the cache. The
    // OwnedFd's drop handles close() on every path.
    rustix::fs::flock(&fd, rustix::fs::FlockOperation::LockExclusive)
        .map_err(|e| anyhow::anyhow!("flock: {e}"))?;

    let raw_fd = fd.as_raw_fd();
    unsafe {
        if libc::ftruncate(raw_fd, data.len() as libc::off_t) != 0 {
            let err = std::io::Error::last_os_error();
            if let Err(e) = rustix::shm::unlink(name) {
                tracing::warn!(
                    err = %e,
                    segment = name,
                    "shm_unlink failed on ftruncate error path"
                );
            }
            let _ = rustix::fs::flock(&fd, rustix::fs::FlockOperation::Unlock);
            anyhow::bail!("ftruncate: {err}");
        }

        let ptr = libc::mmap(
            std::ptr::null_mut(),
            data.len(),
            libc::PROT_WRITE,
            libc::MAP_SHARED,
            raw_fd,
            0,
        );
        if ptr == libc::MAP_FAILED {
            let err = std::io::Error::last_os_error();
            if let Err(e) = rustix::shm::unlink(name) {
                tracing::warn!(
                    err = %e,
                    segment = name,
                    "shm_unlink failed on mmap error path"
                );
            }
            let _ = rustix::fs::flock(&fd, rustix::fs::FlockOperation::Unlock);
            anyhow::bail!("mmap: {err}");
        }

        std::ptr::copy_nonoverlapping(data.as_ptr(), ptr as *mut u8, data.len());
        libc::munmap(ptr, data.len());
    }
    let _ = rustix::fs::flock(&fd, rustix::fs::FlockOperation::Unlock);
    // fd drops here → close(fd).
    Ok(())
}

/// Store `data` in the base-initramfs shm segment derived from
/// `content_hash` (see `shm_segment_name`). Errors from `shm_store` are
/// returned unchanged.
pub(crate) fn shm_store_base(content_hash: u64, data: &[u8]) -> Result<()> {
    let segment = shm_segment_name(content_hash);
    shm_store(segment.as_str(), data)
}

/// Test helper — unlink the base-initramfs shared-memory segment that
/// belongs to `content_hash`. Failures (e.g. the segment does not exist)
/// are ignored.
#[cfg(test)]
pub(crate) fn shm_unlink_base(content_hash: u64) {
    let segment = shm_segment_name(content_hash);
    let _ = rustix::shm::unlink(segment.as_str());
}

// ---------------------------------------------------------------------------
// Compressed SHM cache — stores LZ4-compressed base for COW overlay into
// guest RAM
// ---------------------------------------------------------------------------

/// Name of the shm segment holding the LZ4-compressed base initramfs for
/// `content_hash`.
///
/// The `lz4` prefix keeps these segments apart from ones written by
/// previous compression formats (zstd, gzip), and the target arch tag
/// keeps cross-arch caches separate on hosts where both ktstr binaries
/// share `/dev/shm`. The hash is rendered as 16 zero-padded hex digits.
fn shm_lz4_segment_name(content_hash: u64) -> String {
    let segment = format!("/ktstr-lz4-{SHM_ARCH_TAG}-{content_hash:016x}");
    segment
}

/// Open the LZ4-compressed segment for `content_hash` and report its
/// size.
///
/// On success the returned `OwnedFd` has a shared flock (`LOCK_SH`)
/// held; dropping it (via [`shm_close_fd`] or scope exit) releases the
/// lock and closes the descriptor. Returns `None` on a miss, an empty
/// segment, or any syscall error.
pub(crate) fn shm_open_lz4(content_hash: u64) -> Option<(std::os::fd::OwnedFd, usize)> {
    use rustix::fs::{flock, fstat, FlockOperation};

    let segment = shm_lz4_segment_name(content_hash);
    let fd = rustix::shm::open(
        segment.as_str(),
        rustix::shm::OFlags::RDONLY,
        rustix::fs::Mode::empty(),
    )
    .ok()?;
    flock(&fd, FlockOperation::LockShared).ok()?;

    match fstat(&fd).ok()?.st_size {
        size if size > 0 => Some((fd, size as usize)),
        _ => {
            // Empty segment: treat as a miss.
            let _ = flock(&fd, FlockOperation::Unlock);
            None
        }
    }
}

/// Write compressed initramfs bytes to the LZ4 shm segment derived from
/// `content_hash`. Errors from `shm_store` are returned unchanged.
pub(crate) fn shm_store_lz4(content_hash: u64, data: &[u8]) -> Result<()> {
    let segment = shm_lz4_segment_name(content_hash);
    shm_store(segment.as_str(), data)
}

/// RAII guard for a live COW-overlay mapping.
///
/// A COW overlay is `MAP_PRIVATE | MAP_FIXED` onto guest memory from
/// a SHM segment fd. `MAP_PRIVATE` pages are lazily read from the
/// backing file on first access; if the SHM segment is truncated or
/// unlinked-with-retruncate between `mmap` and the guest's first
/// read, the access SIGBUSes (see Linux `filemap_fault` against
/// `i_size`). Holding the fd with `LOCK_SH` for the mapping's
/// lifetime blocks any cooperating writer from taking `LOCK_EX` and
/// `ftruncate`ing the segment until after the mapping is torn down.
///
/// Drop order: the guard only unlocks (`LOCK_UN`) and closes the
/// descriptor; it does not unmap anything. The MAP_FIXED region itself
/// is owned by the caller's VA reservation (e.g. `ReservationGuard` in
/// the VMM) and is munmapped when that reservation drops — which must
/// happen BEFORE this guard drops, so the lock protects the mapping
/// right up until tear-down.
pub(crate) struct CowOverlayGuard {
    /// SHM segment descriptor on which `LOCK_SH` is held.
    fd: std::os::fd::OwnedFd,
}

impl CowOverlayGuard {
    fn new(fd: std::os::fd::OwnedFd) -> Self {
        Self { fd }
    }
}

impl Drop for CowOverlayGuard {
    /// Release the shared flock; the descriptor closes afterwards.
    fn drop(&mut self) {
        use rustix::fs::{flock, FlockOperation};

        // fd was obtained via shm_open in the COW overlay path. LOCK_SH
        // is released explicitly so cooperating writers waiting on
        // LOCK_EX observe ordering with the VM's reads. A failed unlock
        // is ignored; the OwnedFd's own drop closes the descriptor after
        // this function returns.
        let _ = flock(&self.fd, FlockOperation::Unlock);
    }
}

/// Map `len` bytes of `shm_fd` over `host_addr` as a private
/// copy-on-write overlay (`MAP_PRIVATE | MAP_FIXED | MAP_POPULATE`).
///
/// The guest sees the SHM content, while its writes go to private
/// anonymous pages. `MAP_POPULATE` pre-faults the pages so the initial
/// accesses skip the filemap fault path; the lock guard still protects
/// against truncate of pages that may be refaulted from the page cache
/// (MAP_POPULATE alone is not sufficient — truncate invalidates the
/// page cache via `unmap_mapping_range`).
///
/// On success, returns `Some(CowOverlayGuard)` — the guard owns
/// `shm_fd` and holds `LOCK_SH` for the mapping's lifetime. The
/// caller MUST keep the guard alive for as long as the MAP_FIXED
/// mapping is in use (typically the VM lifetime) and drop the guard
/// AFTER the VA region is munmapped.
///
/// On failure, returns `None` and CLOSES `shm_fd` (releasing
/// `LOCK_SH`) so the caller does not need to clean it up.
///
/// # Safety
///
/// The caller MUST have validated that the entire range
/// `[host_addr, host_addr + len)` lies within one contiguous guest
/// memory region. `MAP_FIXED` unmaps whatever is already present
/// across the full range and replaces it with the new mapping; if
/// `len` extends past the region, `MAP_FIXED` silently corrupts
/// unrelated host mappings (kernel-defined behaviour at the mmap
/// layer) and violates Rust's aliasing invariants at the process
/// level. The caller is also responsible for ensuring `shm_fd` is a
/// valid, open file descriptor with `LOCK_SH` already held (the
/// guard inherits both).
pub(crate) unsafe fn cow_overlay(
    host_addr: *mut u8,
    len: usize,
    shm_fd: std::os::fd::OwnedFd,
) -> Option<CowOverlayGuard> {
    use std::os::fd::AsRawFd;

    let protection = libc::PROT_READ | libc::PROT_WRITE;
    let map_flags = libc::MAP_PRIVATE | libc::MAP_FIXED | libc::MAP_POPULATE;

    // SAFETY: caller guarantees [host_addr, host_addr + len) is
    // entirely within a single valid guest memory region and shm_fd
    // is a valid fd holding LOCK_SH. See function-level docs.
    let mapped = unsafe {
        libc::mmap(
            host_addr.cast::<libc::c_void>(),
            len,
            protection,
            map_flags,
            shm_fd.as_raw_fd(),
            0,
        )
    };

    if mapped != libc::MAP_FAILED {
        return Some(CowOverlayGuard::new(shm_fd));
    }

    // Failure path: unlock explicitly, then let `shm_fd` drop at scope
    // exit, which closes the descriptor. The caller has no cleanup
    // responsibility on the None branch.
    let _ = rustix::fs::flock(&shm_fd, rustix::fs::FlockOperation::Unlock);
    None
}

/// Close a SHM fd and release its shared flock.
pub(crate) fn shm_close_fd(fd: std::os::fd::OwnedFd) {
    // Explicit flock-unlock so a cooperating writer waiting on LOCK_EX
    // observes ordering with our earlier reads (LOCK_SH); the OwnedFd
    // drop at scope exit then closes the descriptor.
    let _ = rustix::fs::flock(&fd, rustix::fs::FlockOperation::Unlock);
}

/// Write one or more byte slices sequentially into guest memory as a
/// single contiguous initramfs. Passing a one-element `parts` slice
/// matches the "single blob" caller; multi-element slices avoid the
/// copy into a monolithic `Vec` that the split base/suffix production
/// path would otherwise need. Returns (address, total_size) for
/// boot_params.
pub fn load_initramfs_parts(
    guest_mem: &vm_memory::GuestMemoryMmap,
    parts: &[&[u8]],
    load_addr: u64,
) -> Result<(u64, u32)> {
    use vm_memory::{Bytes, GuestAddress};
    let mut offset = 0u64;
    for part in parts {
        guest_mem
            .write_slice(part, GuestAddress(load_addr + offset))
            .context("write initramfs part to guest memory")?;
        offset += part.len() as u64;
    }
    Ok((load_addr, offset as u32))
}

/// LZ4 legacy format magic number (`0x184C2102` little-endian).
/// This is the format the kernel's initramfs decompressor expects
/// (CONFIG_RD_LZ4 / lib/decompress_unlz4.c).
///
/// Stored as the four little-endian bytes of the magic value;
/// [`lz4_legacy_compress`] writes them at the very start of its output.
pub(crate) const LZ4_LEGACY_MAGIC: [u8; 4] = 0x184C2102u32.to_le_bytes();

/// Maximum uncompressed chunk size for LZ4 legacy format.
/// Must match `LZ4_DEFAULT_UNCOMPRESSED_CHUNK_SIZE` in the kernel
/// (lib/decompress_unlz4.c: `8 << 20`).
///
/// `8 << 20` bytes is 8 MiB; [`lz4_legacy_compress`] splits its input
/// into chunks of at most this many bytes before compressing each one.
const LZ4_CHUNK_SIZE: usize = 8 << 20;

/// Encode `data` as an LZ4 legacy frame, the layout consumed by the
/// kernel's initramfs decompressor:
///   [4-byte magic] ([4-byte compressed_size LE] [compressed block])*
///
/// The input is cut into `LZ4_CHUNK_SIZE` (8MB) pieces which rayon
/// compresses in parallel; the resulting blocks are then written out
/// in their original order. Empty input yields just the magic.
pub(crate) fn lz4_legacy_compress(data: &[u8]) -> Vec<u8> {
    use rayon::prelude::*;

    /// Width of the little-endian length prefix in front of each block.
    const SIZE_PREFIX_LEN: usize = 4;

    // Parallel phase: one independently compressed block per chunk.
    let blocks: Vec<Vec<u8>> = data
        .par_chunks(LZ4_CHUNK_SIZE)
        .map(lz4_flex::block::compress)
        .collect();

    // Sequential phase: size the output exactly, then emit the magic
    // followed by each (length, block) pair.
    let framed_blocks_len: usize = blocks
        .iter()
        .map(|block| SIZE_PREFIX_LEN + block.len())
        .sum();
    let mut framed = Vec::with_capacity(LZ4_LEGACY_MAGIC.len() + framed_blocks_len);
    framed.extend_from_slice(&LZ4_LEGACY_MAGIC);
    for block in &blocks {
        let size_le = (block.len() as u32).to_le_bytes();
        framed.extend_from_slice(&size_le);
        framed.extend_from_slice(block);
    }
    framed
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Test-only shorthand for [`build_suffix`] exposing just the two
    /// argument lists that tests usually vary. Every other
    /// [`SuffixParams`] field is left at its default, which keeps
    /// assertions focused on the args- and sched-args-only suffix
    /// shape.
    fn build_suffix_args(
        base_len: usize,
        args: &[String],
        sched_args: &[String],
    ) -> Result<Vec<u8>> {
        let params = SuffixParams {
            args,
            sched_args,
            ..Default::default()
        };
        build_suffix(base_len, &params)
    }

    /// Test-only helper returning a complete cpio newc archive: the
    /// output of [`build_initramfs_base`] immediately followed by the
    /// output of [`build_suffix_args`]. Production callers keep base
    /// and suffix separate so the parts can be streamed into guest
    /// memory without an intermediate `Vec`; the joined form exists
    /// only for round-trip archive-shape assertions in tests.
    fn build_initramfs(
        payload: &Path,
        extra_binaries: &[(&str, &Path)],
        args: &[String],
    ) -> Result<Vec<u8>> {
        let base = build_initramfs_base(payload, extra_binaries, &[], false)?;
        // The suffix is built against the base length, so the base
        // must exist first.
        let suffix = build_suffix_args(base.len(), args, &[])?;
        Ok([&base[..], &suffix[..]].concat())
    }

    /// Collect the names of all entries in a newc archive, stopping at
    /// the trailer entry or at the first header that fails to parse.
    /// Used for test assertions.
    fn cpio_entry_names(archive: &[u8]) -> Vec<String> {
        let mut collected = Vec::new();
        let mut rest: &[u8] = archive;
        loop {
            let Ok(reader) = cpio::newc::Reader::new(rest) else {
                break;
            };
            let header = reader.entry();
            if header.is_trailer() {
                break;
            }
            collected.push(header.name().to_string());
            // Skip the entry's data and continue with what follows it.
            rest = reader.finish().unwrap();
        }
        collected
    }

    /// Collect `(name, size, mode, inode)` for every entry in a newc
    /// archive, stopping at the trailer entry or at the first header
    /// that fails to parse. Used for diagnostics in tests.
    fn cpio_entries(archive: &[u8]) -> Vec<(String, u32, u32, u32)> {
        let mut collected = Vec::new();
        let mut rest: &[u8] = archive;
        while let Ok(reader) = cpio::newc::Reader::new(rest) {
            let header = reader.entry();
            if header.is_trailer() {
                break;
            }
            collected.push((
                header.name().to_string(),
                header.file_size(),
                header.mode(),
                header.ino(),
            ));
            // Skip the entry's data and continue with what follows it.
            rest = reader.finish().unwrap();
        }
        collected
    }

    #[test]
    fn cpio_header_format() {
        // A written entry must begin with the six-byte newc magic.
        const NEWC_MAGIC: &[u8; 6] = b"070701";
        let mut buf = Vec::new();
        write_entry(&mut buf, "test", b"hello", 0o100644).unwrap();
        assert_eq!(&buf[..6], NEWC_MAGIC);
    }

    #[test]
    fn cpio_trailer() {
        // One regular entry followed by the trailer: the trailer name
        // must show up in the raw archive bytes.
        let mut buf = Vec::new();
        write_entry(&mut buf, "test", b"data", 0o100755).unwrap();
        cpio::newc::trailer(&mut buf as &mut dyn std::io::Write).unwrap();
        let text = String::from_utf8_lossy(&buf);
        assert!(text.contains("TRAILER!!!"));
    }

    #[test]
    fn build_initramfs_has_init() {
        // Archive built from the running test binary with no extras
        // and no args.
        let current_exe = crate::resolve_current_exe().unwrap();
        let archive = build_initramfs(&current_exe, &[], &[]).unwrap();
        let text = String::from_utf8_lossy(&archive);
        assert!(text.contains("init"), "should contain init entry");
        assert!(text.contains("TRAILER!!!"));
    }

    #[test]
    fn build_initramfs_base_is_valid_cpio() {
        let current_exe = crate::resolve_current_exe().unwrap();
        let base = build_initramfs_base(&current_exe, &[], &[], false).unwrap();
        // The base must start with the newc magic.
        assert_eq!(&base[..6], b"070701");
        // Base is NOT 512-aligned on its own; only base+suffix is.
        // The complete archive can therefore never be shorter.
        let complete = build_initramfs(&current_exe, &[], &[]).unwrap();
        assert!(base.len() <= complete.len());
    }

    #[test]
    fn build_initramfs_padded() {
        // The complete archive length must be a multiple of 512 bytes.
        let current_exe = crate::resolve_current_exe().unwrap();
        let archive_len = build_initramfs(&current_exe, &[], &[]).unwrap().len();
        assert_eq!(archive_len % 512, 0);
    }

    #[test]
    fn initramfs_nonexistent_file() {
        // A payload path that does not exist must surface as an error.
        let missing_payload = Path::new("/nonexistent");
        assert!(build_initramfs(missing_payload, &[], &[]).is_err());
    }

    #[test]
    fn initramfs_nonexistent_extra_binary() {
        // Valid payload, but the extra binary's host path is missing:
        // the build must fail.
        let current_exe = crate::resolve_current_exe().unwrap();
        let extras = [("bad", Path::new("/nonexistent"))];
        let outcome = build_initramfs(&current_exe, &extras, &[]);
        assert!(outcome.is_err());
    }

    #[test]
    fn initramfs_with_args() {
        // Passing args must leave the string "args" somewhere in the
        // archive bytes.
        let current_exe = crate::resolve_current_exe().unwrap();
        let cli_args: Vec<String> = ["run", "--json", "scenario"]
            .into_iter()
            .map(String::from)
            .collect();
        let archive = build_initramfs(&current_exe, &[], &cli_args).unwrap();
        let text = String::from_utf8_lossy(&archive);
        assert!(text.contains("args"));
    }

    #[test]
    fn initramfs_empty_args() {
        // With no args at all the archive must still be padded to a
        // multiple of 512 bytes.
        let current_exe = crate::resolve_current_exe().unwrap();
        let archive = build_initramfs(&current_exe, &[], &[]).unwrap();
        let remainder = archive.len() % 512;
        assert_eq!(remainder, 0);
    }

    // -- base + suffix split tests --

    #[test]
    fn suffix_adds_args_and_trailer() {
        let current_exe = crate::resolve_current_exe().unwrap();
        let base = build_initramfs_base(&current_exe, &[], &[], false).unwrap();
        let cli_args: Vec<String> = vec![String::from("run"), String::from("--json")];
        let suffix = build_suffix_args(base.len(), &cli_args, &[]).unwrap();

        // The suffix alone carries both the args entry and the trailer.
        let text = String::from_utf8_lossy(&suffix);
        assert!(text.contains("args"), "suffix should contain args entry");
        assert!(text.contains("TRAILER!!!"), "suffix should contain trailer");

        // Alignment is a property of base and suffix together.
        let combined_len = base.len() + suffix.len();
        assert_eq!(
            combined_len % 512,
            0,
            "base+suffix should be 512-byte aligned"
        );
    }

    #[test]
    fn split_matches_monolithic() {
        let current_exe = crate::resolve_current_exe().unwrap();
        let cli_args: Vec<String> = ["run", "--json", "scenario"]
            .into_iter()
            .map(String::from)
            .collect();

        // One-shot archive from the test helper.
        let monolithic = build_initramfs(&current_exe, &[], &cli_args).unwrap();

        // Same inputs, base and suffix built separately and joined by hand.
        let base = build_initramfs_base(&current_exe, &[], &[], false).unwrap();
        let suffix = build_suffix_args(base.len(), &cli_args, &[]).unwrap();
        let split = [&base[..], &suffix[..]].concat();

        assert_eq!(
            monolithic, split,
            "split path should produce identical output"
        );
    }

    #[test]
    fn suffix_different_args_differ() {
        let current_exe = crate::resolve_current_exe().unwrap();
        let base_len = build_initramfs_base(&current_exe, &[], &[], false)
            .unwrap()
            .len();
        // Two suffixes built against the same base length, differing
        // only in their single argument.
        let first = build_suffix_args(base_len, &[String::from("a")], &[]).unwrap();
        let second = build_suffix_args(base_len, &[String::from("b")], &[]).unwrap();
        assert_ne!(
            first, second,
            "different args should produce different suffixes"
        );
    }

    #[test]
    fn suffix_empty_args() {
        let current_exe = crate::resolve_current_exe().unwrap();
        let base = build_initramfs_base(&current_exe, &[], &[], false).unwrap();
        let suffix = build_suffix_args(base.len(), &[], &[]).unwrap();

        // Even an argument-less suffix pads base+suffix to 512 bytes
        // and ends the archive with a trailer.
        let combined_len = base.len() + suffix.len();
        assert_eq!(combined_len % 512, 0);
        let text = String::from_utf8_lossy(&suffix);
        assert!(text.contains("TRAILER!!!"));
    }

    #[test]
    fn suffix_with_sched_enable() {
        let current_exe = crate::resolve_current_exe().unwrap();
        let base = build_initramfs_base(&current_exe, &[], &[], false).unwrap();
        let sched_enable = vec![String::from("echo 1 > /sys/kernel/sched_ext/enable")];
        let params = SuffixParams {
            sched_enable: &sched_enable,
            ..Default::default()
        };
        let suffix = build_suffix(base.len(), &params).unwrap();

        // Parse the joined archive and locate the sched_enable entry.
        let archive = [&base[..], &suffix[..]].concat();
        let entries = cpio_entries(&archive);
        let (_, size, mode, _) = entries
            .iter()
            .find(|(name, ..)| name == "sched_enable")
            .expect("sched_enable entry missing");

        assert_eq!(
            *size as usize,
            sched_enable[0].len(),
            "sched_enable size should match joined content length",
        );
        // 0o100755 = S_IFREG | 0o755 (executable — it's a shell script).
        assert_eq!(*mode, 0o100755, "sched_enable must be executable");
    }

    #[test]
    fn suffix_with_sched_disable() {
        let current_exe = crate::resolve_current_exe().unwrap();
        let base = build_initramfs_base(&current_exe, &[], &[], false).unwrap();
        let sched_disable = vec![String::from("echo 0 > /sys/kernel/sched_ext/enable")];
        let params = SuffixParams {
            sched_disable: &sched_disable,
            ..Default::default()
        };
        let suffix = build_suffix(base.len(), &params).unwrap();

        // Parse the joined archive and locate the sched_disable entry.
        let archive = [&base[..], &suffix[..]].concat();
        let entries = cpio_entries(&archive);
        let (_, size, mode, _) = entries
            .iter()
            .find(|(name, ..)| name == "sched_disable")
            .expect("sched_disable entry missing");

        // Entry size equals the single command's length; mode is
        // S_IFREG | 0o755.
        assert_eq!(*size as usize, sched_disable[0].len());
        assert_eq!(*mode, 0o100755, "sched_disable must be executable");
    }

    #[test]
    fn suffix_with_exec_cmd() {
        let current_exe = crate::resolve_current_exe().unwrap();
        let base = build_initramfs_base(&current_exe, &[], &[], false).unwrap();
        let command = "/usr/bin/stress-ng --cpu 1 --timeout 5s";
        let params = SuffixParams {
            exec_cmd: Some(command),
            ..Default::default()
        };
        let suffix = build_suffix(base.len(), &params).unwrap();

        // Parse the joined archive and locate the exec_cmd entry.
        let archive = [&base[..], &suffix[..]].concat();
        let entries = cpio_entries(&archive);
        let (_, size, mode, _) = entries
            .iter()
            .find(|(name, ..)| name == "exec_cmd")
            .expect("exec_cmd entry missing");

        assert_eq!(*size as usize, command.len());
        // 0o100644 = S_IFREG | 0o644 (non-executable data file — read by init, not exec'd).
        assert_eq!(*mode, 0o100644, "exec_cmd must be a plain data file");
    }

    #[test]
    fn suffix_omits_empty_optional_entries() {
        // Exercises the is_empty() / Option::None guards in
        // build_suffix: with default params (empty sched_enable, empty
        // sched_disable, None exec_cmd) the archive must not contain
        // zero-length cpio entries under those names.
        let current_exe = crate::resolve_current_exe().unwrap();
        let base = build_initramfs_base(&current_exe, &[], &[], false).unwrap();
        let suffix = build_suffix(base.len(), &SuffixParams::default()).unwrap();
        let archive = [&base[..], &suffix[..]].concat();
        let names = cpio_entry_names(&archive);
        assert!(names.iter().all(|n| n != "sched_enable"));
        assert!(names.iter().all(|n| n != "sched_disable"));
        assert!(names.iter().all(|n| n != "exec_cmd"));
    }

    #[test]
    fn try_cow_overlay_rejects_cross_region_span() {
        // try_cow_overlay's bounds check depends on
        // GuestMemoryMmap::get_slice returning an error whenever the
        // requested range would cross a region boundary. This test
        // pins that behaviour using two non-contiguous regions: a
        // range that begins in region A and runs past its end must be
        // refused. Should that ever stop holding (e.g. vm-memory
        // adopting multi-region get_slices semantics here),
        // try_cow_overlay's MAP_FIXED would silently clobber whatever
        // host mapping sits between the regions.
        use vm_memory::{GuestAddress, GuestMemory};

        const REGION_LEN: usize = 64 * 1024;
        let a_base: u64 = 0;
        // Region B starts at 1 MiB, leaving a hole after region A.
        let b_base: u64 = 1 << 20;
        let layout = [
            (GuestAddress(a_base), REGION_LEN),
            (GuestAddress(b_base), REGION_LEN),
        ];
        let mem = vm_memory::GuestMemoryMmap::<()>::from_ranges(&layout).unwrap();

        // Case 1: exactly region A. Must be accepted.
        let whole_region = mem.get_slice(GuestAddress(a_base), REGION_LEN);
        assert!(whole_region.is_ok(), "full-region slice must succeed");

        // Case 2: starts halfway into region A and is a full region
        // long, so it runs well past A's end. Must be refused. This is
        // the exact shape of the hazardous cow_overlay case.
        let mid_a = a_base + (REGION_LEN as u64 / 2);
        let overrun = mem.get_slice(GuestAddress(mid_a), REGION_LEN);
        assert!(overrun.is_err(), "cross-boundary slice must fail");

        // Case 3: starts at a GPA inside the hole between the regions,
        // so no region covers the start address. Must be refused.
        let in_gap = (a_base + REGION_LEN as u64) + 0x1000;
        let from_gap = mem.get_slice(GuestAddress(in_gap), 4);
        assert!(from_gap.is_err(), "gap-start slice must fail");
    }

    #[test]
    fn try_cow_overlay_preserves_adjacent_region_bytes() {
        // Application-layer view of the invariant: when the bounds
        // check rejects a request, mmap(MAP_FIXED) is never invoked,
        // so bytes outside the validated range stay untouched. The
        // test seeds region B with known bytes, runs the same bounds
        // check try_cow_overlay uses, confirms the rejection, and then
        // verifies region B still holds the seeded bytes.
        use vm_memory::{Bytes, GuestAddress, GuestMemory};

        const REGION_LEN: usize = 64 * 1024;
        let a_base: u64 = 0;
        let b_base: u64 = 1 << 20;
        let layout = [
            (GuestAddress(a_base), REGION_LEN),
            (GuestAddress(b_base), REGION_LEN),
        ];
        let mem = vm_memory::GuestMemoryMmap::<()>::from_ranges(&layout).unwrap();

        // Fill region B with a recognisable repeating byte pattern.
        let pattern: Vec<u8> = (0..REGION_LEN).map(|i| (i & 0xff) as u8).collect();
        mem.write_slice(&pattern, GuestAddress(b_base)).unwrap();

        // Oversized COW request: begins at the start of region A and
        // its length reaches all the way to the end of region B.
        let request_addr = a_base;
        let request_len = (b_base + REGION_LEN as u64) as usize;

        // Same check as try_cow_overlay: on failure it returns early
        // and never reaches cow_overlay.
        let bounds_check = mem.get_slice(GuestAddress(request_addr), request_len);
        assert!(
            bounds_check.is_err(),
            "oversized overlay must be rejected before MAP_FIXED"
        );

        // Region B must still contain exactly what was seeded.
        let mut observed = vec![0u8; REGION_LEN];
        mem.read_slice(&mut observed, GuestAddress(b_base)).unwrap();
        assert_eq!(
            observed, pattern,
            "region B must be untouched when bounds check rejects cow_overlay"
        );
    }

    #[test]
    fn load_initramfs_parts_sequential() {
        use vm_memory::{Bytes, GuestAddress};

        // Two parts with distinct fill bytes so their placement can be
        // told apart after loading.
        let head = vec![0xAAu8; 4096];
        let tail = vec![0xBBu8; 512];
        let layout = [(GuestAddress(0), 16 << 20)];
        let mem = vm_memory::GuestMemoryMmap::<()>::from_ranges(&layout).unwrap();

        let (addr, size) =
            load_initramfs_parts(&mem, &[head.as_slice(), tail.as_slice()], 0x200000).unwrap();
        assert_eq!(addr, 0x200000);
        assert_eq!(size, 4608);

        // Read everything back: the parts must sit back to back, in order.
        let mut readback = vec![0u8; 4608];
        mem.read_slice(&mut readback, GuestAddress(0x200000))
            .unwrap();
        let (got_head, got_tail) = readback.split_at(4096);
        assert_eq!(got_head, &head[..]);
        assert_eq!(got_tail, &tail[..]);
    }

    // -- shared lib resolution tests --

    #[test]
    fn resolve_shared_libs_nonexistent_returns_error() {
        // A file that does not exist cannot be read, so resolution
        // must report an error.
        let missing = Path::new("/nonexistent/binary");
        assert!(resolve_shared_libs(missing).is_err());
    }

    #[test]
    fn resolve_shared_libs_non_elf_returns_empty() {
        // Use a private temporary directory instead of a fixed name in
        // the shared temp dir: the path is unique per run, so
        // concurrent test processes cannot collide, and the directory
        // is removed when `dir` drops — including when an assertion
        // below panics.
        let dir = tempfile::TempDir::new().unwrap();
        let non_elf = dir.path().join("ktstr-test-resolve-nonelf");
        std::fs::write(&non_elf, b"not an elf").unwrap();

        // A readable file that is not an ELF resolves to nothing found
        // and nothing missing, rather than an error.
        let result = resolve_shared_libs(&non_elf).unwrap();
        assert!(result.found.is_empty());
        assert!(result.missing.is_empty());
    }

    #[test]
    fn resolve_shared_libs_dynamic_binary() {
        // Nothing to check on hosts without /bin/sh.
        let sh = Path::new("/bin/sh");
        if !sh.exists() {
            return;
        }
        let shared = resolve_shared_libs(sh).unwrap();
        // An empty result carries no dependencies to inspect.
        if shared.found.is_empty() {
            return;
        }
        let depends_on_libc = shared.found.iter().any(|(guest, _)| guest.contains("libc"));
        assert!(
            depends_on_libc,
            "dynamic binary should depend on libc: {:?}",
            shared.found
        );
        for (guest, _) in &shared.found {
            assert!(
                !guest.starts_with('/'),
                "guest path should be relative: {guest}"
            );
        }
    }

    #[test]
    fn elf_dynamic_needed_extracts_sonames() {
        let sh = Path::new("/bin/sh");
        // Only meaningful when /bin/sh exists and is an ELF file.
        if !(sh.exists() && is_elf(sh)) {
            skip!("/bin/sh not ELF");
        }
        let bytes = std::fs::read(sh).unwrap();
        let parsed = goblin::elf::Elf::parse(&bytes).unwrap();
        // The parsed ELF's `libraries` list must mention libc.
        let needed = &parsed.libraries;
        assert!(
            needed.iter().any(|soname| soname.contains("libc")),
            "/bin/sh should need libc: {:?}",
            needed
        );
    }

    #[test]
    fn resolve_soname_finds_libc() {
        // Default search paths and no interp hints: libc.so.6 must
        // still resolve to a regular file.
        let search_paths = ElfSearchPaths::default();
        let resolved = resolve_soname("libc.so.6", &search_paths, &[]);
        assert!(
            resolved.is_some(),
            "should resolve libc.so.6 via default paths"
        );
        assert!(resolved.unwrap().is_file());
    }

    /// Regression test for a glibc-divergence bug in
    /// [`resolve_soname`]: DT_RPATH must be searched BEFORE DT_RUNPATH
    /// (the LD_LIBRARY_PATH step sits between them), and DT_RUNPATH
    /// must be searched BEFORE the interp-relative hints.
    ///
    /// Two distinct directories, one used as `rpath` and one as
    /// `runpath`, each hold a different "library" file under the same
    /// soname. The resolved path must be the `rpath` copy, matching
    /// glibc's "DT_RPATH before LD_LIBRARY_PATH, LD_LIBRARY_PATH
    /// before DT_RUNPATH" ordering.
    #[test]
    fn resolve_soname_rpath_beats_runpath_when_both_present() {
        let scratch = tempfile::TempDir::new().unwrap();
        let rpath_dir = scratch.path().join("rpath");
        let runpath_dir = scratch.path().join("runpath");
        std::fs::create_dir_all(&rpath_dir).unwrap();
        std::fs::create_dir_all(&runpath_dir).unwrap();

        let soname = "libktstrfake-rpath-beats-runpath.so.1";
        std::fs::write(rpath_dir.join(soname), b"rpath-copy").unwrap();
        std::fs::write(runpath_dir.join(soname), b"runpath-copy").unwrap();

        let expected = rpath_dir.join(soname);
        let search_paths = ElfSearchPaths {
            rpath: vec![rpath_dir],
            runpath: vec![runpath_dir],
        };
        let resolved = resolve_soname(soname, &search_paths, &[]).expect("should resolve");
        assert_eq!(
            resolved, expected,
            "DT_RPATH must be preferred over DT_RUNPATH when both are \
             populated (the LD_LIBRARY_PATH step separates them)"
        );
    }

    /// Regression test: DT_RUNPATH takes precedence over the
    /// interp-hint fallback. Otherwise a binary with a perfectly good
    /// DT_RUNPATH could pick up a wrong copy from the dynamic linker's
    /// directory.
    #[test]
    fn resolve_soname_runpath_beats_interp_hints() {
        let scratch = tempfile::TempDir::new().unwrap();
        let runpath_dir = scratch.path().join("runpath");
        let interp_dir = scratch.path().join("interp");
        std::fs::create_dir_all(&runpath_dir).unwrap();
        std::fs::create_dir_all(&interp_dir).unwrap();

        // Same soname in both directories, with different contents.
        let soname = "libktstrfake-runpath-beats-interp.so.1";
        std::fs::write(runpath_dir.join(soname), b"runpath-copy").unwrap();
        std::fs::write(interp_dir.join(soname), b"interp-copy").unwrap();

        let expected = runpath_dir.join(soname);
        let search_paths = ElfSearchPaths {
            rpath: Vec::new(),
            runpath: vec![runpath_dir],
        };
        let interp_hints = std::slice::from_ref(&interp_dir);
        let resolved =
            resolve_soname(soname, &search_paths, interp_hints).expect("should resolve");
        assert_eq!(
            resolved, expected,
            "DT_RUNPATH must be searched before interp-relative hints"
        );
    }

    /// Legacy-binary case: [`elf_search_paths`] fills `rpath` and
    /// leaves `runpath` empty (the binary has DT_RPATH and no
    /// DT_RUNPATH). DT_RPATH must then resolve the soname before the
    /// interp-relative hints are consulted. Guards the "rpath precedes
    /// interp_hints" ordering in [`resolve_soname`] for the
    /// legacy-only-RPATH case.
    #[test]
    fn resolve_soname_rpath_only_wins_when_runpath_empty() {
        let scratch = tempfile::TempDir::new().unwrap();
        let rpath_dir = scratch.path().join("rpath-legacy");
        let interp_dir = scratch.path().join("interp");
        std::fs::create_dir_all(&rpath_dir).unwrap();
        std::fs::create_dir_all(&interp_dir).unwrap();

        // Same soname in both directories, with different contents.
        let soname = "libktstrfake-rpath-legacy.so.1";
        std::fs::write(rpath_dir.join(soname), b"rpath-copy").unwrap();
        std::fs::write(interp_dir.join(soname), b"interp-copy").unwrap();

        let expected = rpath_dir.join(soname);
        let search_paths = ElfSearchPaths {
            rpath: vec![rpath_dir],
            runpath: Vec::new(),
        };
        let interp_hints = std::slice::from_ref(&interp_dir);
        let resolved =
            resolve_soname(soname, &search_paths, interp_hints).expect("should resolve");
        assert_eq!(
            resolved, expected,
            "legacy binary with DT_RPATH (no DT_RUNPATH) must resolve \
             via DT_RPATH, not interp hints"
        );
    }

    #[test]
    fn suffix_with_sched_args() {
        let current_exe = crate::resolve_current_exe().unwrap();
        let base = build_initramfs_base(&current_exe, &[], &[], false).unwrap();
        let scheduler_args: Vec<String> =
            vec![String::from("--enable-borrow"), String::from("--llc")];
        let suffix = build_suffix_args(base.len(), &[], &scheduler_args).unwrap();

        // Non-empty sched_args must produce a sched_args entry, and the
        // suffix still ends with a trailer and pads to 512 bytes.
        let text = String::from_utf8_lossy(&suffix);
        assert!(
            text.contains("sched_args"),
            "suffix should contain sched_args entry"
        );
        assert!(text.contains("TRAILER!!!"));
        let combined_len = base.len() + suffix.len();
        assert_eq!(combined_len % 512, 0);
    }

    #[test]
    fn suffix_without_sched_args_omits_entry() {
        let current_exe = crate::resolve_current_exe().unwrap();
        let base_len = build_initramfs_base(&current_exe, &[], &[], false)
            .unwrap()
            .len();
        let suffix = build_suffix_args(base_len, &[], &[]).unwrap();
        // With no scheduler args the name must not appear in the suffix.
        let text = String::from_utf8_lossy(&suffix);
        let mentions_sched_args = text.contains("sched_args");
        assert!(
            !mentions_sched_args,
            "empty sched_args should not produce entry"
        );
    }

    #[test]
    fn shm_segment_name_format() {
        // The name carries the fixed prefix and the hash rendered in
        // lowercase hex.
        let segment = shm_segment_name(0xDEADBEEF);
        assert!(segment.starts_with("/ktstr-base-"));
        assert!(segment.contains("deadbeef"));
    }

    #[test]
    fn is_deleted_self_returns_false_for_nonexistent() {
        // A path that does not exist is not reported as deleted-self.
        let missing = Path::new("/nonexistent/binary");
        assert!(!is_deleted_self(missing));
    }

    #[test]
    fn is_deleted_self_returns_false_for_current() {
        // The running test binary has not been deleted, so the check
        // must come back false.
        let current_exe = crate::resolve_current_exe().unwrap();
        let deleted = is_deleted_self(&current_exe);
        assert!(!deleted);
    }

    #[test]
    fn shm_store_load_unlink_roundtrip() {
        let hash = 0xABCD_EF01_2345_6789u64;
        let payload = vec![0x42u8; 1024];

        // Store, then load: the mapping must expose the stored bytes.
        shm_store_base(hash, &payload).unwrap();
        let mapped = shm_load_base(hash);
        assert!(mapped.is_some());
        assert_eq!(mapped.unwrap().as_ref(), &payload[..]);

        // Unlink: a later load must find nothing.
        shm_unlink_base(hash);
        let after_unlink = shm_load_base(hash);
        assert!(after_unlink.is_none());
    }

    #[test]
    fn shm_load_nonexistent_returns_none() {
        let hash = 0xFFFF_FFFF_FFFF_FFFFu64;
        // Remove any leftover segment first so the load below really
        // targets a missing one.
        shm_unlink_base(hash);
        let loaded = shm_load_base(hash);
        assert!(loaded.is_none());
    }

    #[test]
    fn shm_store_last_writer_wins_even_with_size_change() {
        // Pins the actual semantics: shm_store reuses the segment
        // name, so a second store with a different size replaces the
        // first. Idempotent writes (same content_hash → same contents)
        // depend on callers deriving the hash from the real content;
        // the payloads here are deliberately different sizes to show
        // the writer does NOT assume the old name's size is still
        // valid.
        let hash = 0x1234_5678_9ABC_DEF0u64;
        let first = vec![0x11u8; 64];
        let second = vec![0x22u8; 128];
        shm_store_base(hash, &first).unwrap();
        shm_store_base(hash, &second).unwrap();

        // Only the second payload may be visible.
        let mapped = shm_load_base(hash);
        assert!(mapped.is_some());
        assert_eq!(mapped.unwrap().as_ref(), &second[..]);
        shm_unlink_base(hash);
    }

    #[test]
    fn shm_segment_name_unique_per_hash() {
        // Different hashes give different names that share the prefix.
        let name_zero = shm_segment_name(0);
        let name_one = shm_segment_name(1);
        assert_ne!(name_zero, name_one);
        assert!(name_zero.starts_with("/ktstr-base-"));
        assert!(name_one.starts_with("/ktstr-base-"));
    }

    #[test]
    fn shm_unlink_nonexistent_is_noop() {
        // The only requirement is that this call does not panic.
        let unused_hash = 0xDEAD_DEAD_DEAD_DEADu64;
        shm_unlink_base(unused_hash);
    }

    #[test]
    fn mapped_shm_send_sync() {
        // Compile-time check only: this fails to build unless
        // MappedShm is both Send and Sync.
        fn require_send_sync<T>()
        where
            T: Send + Sync,
        {
        }
        require_send_sync::<MappedShm>();
    }

    #[test]
    fn shm_load_base_holds_lock_until_drop() {
        use rustix::fs::{FlockOperation, flock};

        // Invariant under test: while a MappedShm is alive, the SHM
        // segment's flock is held in LOCK_SH, so a concurrent writer
        // attempting LOCK_EX | LOCK_NB must get EWOULDBLOCK. After the
        // MappedShm is dropped the lock is gone and the same attempt
        // must succeed.
        //
        // This is the core invariant — if it regresses, shm_store's
        // ftruncate can race with a live reader and cause SIGBUS on
        // the mapped pages.
        let hash = 0xD0D0_BEEF_F00D_BA5Eu64;
        shm_unlink_base(hash); // clean any stale segment
        shm_store_base(hash, &vec![0x55u8; 256]).unwrap();
        let mapping = shm_load_base(hash).expect("load must succeed");

        let name = shm_segment_name(hash);
        // Opens an independent read-only descriptor on the same segment.
        let open_read_only = || {
            rustix::shm::open(
                name.as_str(),
                rustix::shm::OFlags::RDONLY,
                rustix::fs::Mode::empty(),
            )
        };

        // While the mapping is alive: the exclusive, non-blocking lock
        // attempt on a second fd must be refused with EWOULDBLOCK.
        let contender = open_read_only().expect("second shm_open must succeed");
        let err = flock(&contender, FlockOperation::NonBlockingLockExclusive);
        assert!(
            matches!(err, Err(e) if e == rustix::io::Errno::WOULDBLOCK),
            "LOCK_EX|LOCK_NB must be blocked by the live reader's LOCK_SH (got {err:?})",
        );
        drop(contender);

        // Dropping the mapping releases its lock.
        drop(mapping);

        // With the mapping gone: the same attempt on a fresh fd must
        // now succeed.
        let successor = open_read_only().expect("third shm_open must succeed");
        flock(&successor, FlockOperation::NonBlockingLockExclusive)
            .expect("LOCK_EX|LOCK_NB must succeed after the MappedShm is dropped");
        flock(&successor, FlockOperation::Unlock).ok();
        drop(successor);
        shm_unlink_base(hash);
    }

    #[test]
    fn strip_debug_current_exe() {
        let current_exe = crate::resolve_current_exe().unwrap();
        let stripped = strip_debug(&current_exe).unwrap();
        assert!(!stripped.is_empty());
        // The output must still start with the ELF magic (0x7f 'E' 'L' 'F').
        assert_eq!(&stripped[..4], b"\x7fELF");
    }

    #[test]
    fn strip_debug_nonexistent_fails() {
        // A path that does not exist must surface as an error.
        let missing = Path::new("/nonexistent/binary");
        assert!(strip_debug(missing).is_err());
    }

    #[test]
    fn build_initramfs_base_contains_init() {
        let exe = crate::resolve_current_exe().unwrap();
        let base = build_initramfs_base(&exe, &[], &[], false).unwrap();
        // Match against parsed cpio entry names rather than searching the
        // raw archive bytes: the archive embeds the test binary, whose own
        // string data contains "init", so a substring search always passes.
        let names = cpio_entry_names(&base);
        assert!(
            names.iter().any(|n| n == "init"),
            "base should contain init entry: {:?}",
            names
        );
    }

    #[test]
    fn build_initramfs_base_includes_extra_shared_libs() {
        let exe = crate::resolve_current_exe().unwrap();
        // Pack the scheduler binary as an extra; its shared library
        // dependencies should be pulled into the archive with it.
        let scheduler = crate::test_support::require_binary("scx-ktstr");
        let extras: Vec<(&str, &Path)> = vec![("scheduler", scheduler.as_path())];
        let archive = build_initramfs_base(&exe, &extras, &[], false).unwrap();
        let text = String::from_utf8_lossy(&archive);
        let has_libelf = text.contains("lib64/libelf");
        assert!(
            has_libelf,
            "initramfs with scx-ktstr extra should contain libelf; \
             resolved libs: {:?}",
            resolve_shared_libs(scheduler.as_path()).unwrap().found
        );
    }

    #[test]
    fn load_initramfs_to_memory() {
        use vm_memory::{Bytes, GuestAddress};

        // 16 MiB of guest memory starting at address 0.
        let guest_mem =
            vm_memory::GuestMemoryMmap::<()>::from_ranges(&[(GuestAddress(0), 16 << 20)]).unwrap();

        // Load a single 4 KiB part at 2 MiB and check the reported
        // placement.
        let data = vec![0xAA; 4096];
        let (addr, size) = load_initramfs_parts(&guest_mem, &[&data], 0x200000).unwrap();
        assert_eq!(addr, 0x200000);
        assert_eq!(size, 4096);

        // Read the bytes back out of guest memory; they must match.
        let mut readback = vec![0u8; 4096];
        guest_mem
            .read_slice(&mut readback, GuestAddress(0x200000))
            .unwrap();
        assert_eq!(readback, data);
    }

    // -- include_files and busybox tests --

    #[test]
    fn busybox_with_include_files() {
        let exe = crate::resolve_current_exe().unwrap();
        // Use a per-test TempDir: it gets a unique name on every
        // invocation and is removed on Drop. A fixed
        // `/tmp/ktstr-test-include-busybox` path used to collide across
        // parallel nextest runs and leak after test panics.
        let scratch = tempfile::TempDir::new().unwrap();
        let payload = scratch.path().join("included");
        std::fs::write(&payload, b"hello").unwrap();
        let includes: Vec<(&str, &Path)> = vec![("include-files/test.txt", payload.as_path())];
        let archive = build_initramfs_base(&exe, &[], &includes, true).unwrap();
        let names = cpio_entry_names(&archive);
        let has_busybox = names.iter().any(|n| n == "bin/busybox");
        assert!(
            has_busybox,
            "busybox=true should have bin/busybox entry: {:?}",
            names
        );
    }

    #[test]
    fn include_files_no_busybox_when_empty() {
        // With busybox disabled and nothing included, the archive must
        // not carry a busybox entry.
        let exe = crate::resolve_current_exe().unwrap();
        let archive = build_initramfs_base(&exe, &[], &[], false).unwrap();
        let names = cpio_entry_names(&archive);
        let busybox_absent = names.iter().all(|n| n != "bin/busybox");
        assert!(
            busybox_absent,
            "busybox=false should not have bin/busybox entry: {:?}",
            names
        );
    }

    #[test]
    fn include_files_preserves_mode() {
        let tmp_dir = tempfile::TempDir::new().unwrap();
        let tmp = tmp_dir.path().join("script");
        std::fs::write(&tmp, b"script content").unwrap();
        // Set executable mode.
        std::fs::set_permissions(&tmp, std::fs::Permissions::from_mode(0o100755)).unwrap();

        let exe = crate::resolve_current_exe().unwrap();
        let includes: Vec<(&str, &Path)> = vec![("include-files/run.sh", tmp.as_path())];
        let base = build_initramfs_base(&exe, &[], &includes, true).unwrap();

        // Look the entry up by its parsed cpio name. A substring search
        // over the raw archive cannot tell the entry apart from the same
        // literal embedded in the test binary packed into the archive.
        let entries = cpio_entries(&base);
        let entry = entries
            .iter()
            .find(|(name, _, _, _)| name == "include-files/run.sh");
        assert!(entry.is_some(), "include path should appear in cpio");

        // The point of this test: the host permission bits must survive
        // into the archive entry.
        let mode = entry.unwrap().2;
        assert_eq!(
            mode & 0o777,
            0o755,
            "include file should keep its host permission bits, got {:o}",
            mode
        );
    }

    #[test]
    fn include_files_elf_gets_shared_libs() {
        // /bin/sh is a dynamic ELF on most systems; skip where it is
        // missing or is not an ELF at all.
        let shell = Path::new("/bin/sh");
        if !shell.exists() {
            skip!("/bin/sh not found");
        }
        if !is_elf(shell) {
            skip!("/bin/sh is not ELF");
        }
        let exe = crate::resolve_current_exe().unwrap();
        let includes: Vec<(&str, &Path)> = vec![("include-files/sh", shell)];
        let archive = build_initramfs_base(&exe, &[], &includes, true).unwrap();
        let text = String::from_utf8_lossy(&archive);
        // A dynamic ELF should drag its shared libraries into the
        // archive; nothing to check when none were resolved.
        let shared = resolve_shared_libs(shell).unwrap();
        if shared.found.is_empty() {
            return;
        }
        let any_present = shared
            .found
            .iter()
            .any(|(guest, _)| text.contains(guest.as_str()));
        assert!(
            any_present,
            "include ELF shared libs should appear in archive: {:?}",
            shared.found
        );
    }

    #[test]
    fn include_files_non_elf_no_shared_libs() {
        let tmp_dir = tempfile::TempDir::new().unwrap();
        let tmp = tmp_dir.path().join("hello.sh");
        std::fs::write(&tmp, b"#!/bin/sh\necho hello\n").unwrap();
        let exe = crate::resolve_current_exe().unwrap();
        let includes: Vec<(&str, &Path)> = vec![("include-files/hello.sh", tmp.as_path())];
        // Should not fail (ELF parsing skipped for non-ELF).
        let base = build_initramfs_base(&exe, &[], &includes, true).unwrap();
        // Match the parsed entry name exactly. The raw archive also holds
        // the test binary, which contains this same path literal, so a
        // byte-level substring search would pass even without the entry.
        let names = cpio_entry_names(&base);
        assert!(
            names.iter().any(|n| n == "include-files/hello.sh"),
            "non-ELF include should be archived under its archive path: {:?}",
            names
        );
    }

    #[test]
    fn include_files_adds_directory_entries() {
        let tmp_dir = tempfile::TempDir::new().unwrap();
        let tmp = tmp_dir.path().join("file.txt");
        std::fs::write(&tmp, b"data").unwrap();
        let exe = crate::resolve_current_exe().unwrap();
        let includes: Vec<(&str, &Path)> =
            vec![("include-files/subdir/nested/file.txt", tmp.as_path())];
        let base = build_initramfs_base(&exe, &[], &includes, true).unwrap();

        // Every ancestor of the include path, plus `bin` for busybox,
        // must exist as its own entry. Compare against parsed entry names:
        // a substring search over the raw bytes is satisfied by the file's
        // own path (and "bin" by almost anything), so it proves nothing
        // about the directory entries.
        let names = cpio_entry_names(&base);
        for dir in [
            "include-files",
            "include-files/subdir",
            "include-files/subdir/nested",
            "bin",
        ] {
            assert!(
                names.iter().any(|n| n == dir),
                "missing directory entry '{}': {:?}",
                dir,
                names
            );
        }
    }

    #[test]
    fn is_elf_detects_elf_binary() {
        // The running test executable is expected to be an ELF image.
        let current = crate::resolve_current_exe().unwrap();
        let detected = is_elf(&current);
        assert!(detected, "test binary should be ELF");
    }

    #[test]
    fn is_elf_rejects_non_elf() {
        // Plain text long enough to hold a magic, but with the wrong bytes.
        let scratch = tempfile::TempDir::new().unwrap();
        let text_file = scratch.path().join("not-elf");
        std::fs::write(&text_file, b"not an elf file").unwrap();
        let detected = is_elf(&text_file);
        assert!(!detected);
    }

    #[test]
    fn is_elf_rejects_short_file() {
        // Two bytes cannot hold the four-byte ELF magic.
        let scratch = tempfile::TempDir::new().unwrap();
        let tiny = scratch.path().join("short-elf");
        std::fs::write(&tiny, b"ab").unwrap();
        let detected = is_elf(&tiny);
        assert!(!detected);
    }

    #[test]
    fn is_elf_nonexistent_returns_false() {
        // A missing file is reported as "not ELF" rather than an error.
        let missing = Path::new("/nonexistent/file");
        assert!(!is_elf(missing));
    }

    #[test]
    fn include_files_rejects_path_traversal() {
        let scratch = tempfile::TempDir::new().unwrap();
        let payload = scratch.path().join("payload");
        std::fs::write(&payload, b"data").unwrap();
        let exe = crate::resolve_current_exe().unwrap();
        // An archive path with a `..` component must be refused.
        let includes: Vec<(&str, &Path)> =
            vec![("include-files/../etc/passwd", payload.as_path())];
        let outcome = build_initramfs_base(&exe, &[], &includes, true);
        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        let mentions_traversal = message.contains("..");
        assert!(
            mentions_traversal,
            "error should mention path traversal: {message}"
        );
    }

    #[test]
    fn include_files_rejects_fifo() {
        let scratch = tempfile::TempDir::new().unwrap();
        let fifo_path = scratch.path().join("fifo");
        // Create a FIFO at that path via mkfifo(3).
        let fifo_cstr = std::ffi::CString::new(fifo_path.to_str().unwrap()).unwrap();
        // SAFETY: `fifo_cstr` is a valid NUL-terminated C string that
        // stays alive for the duration of the call.
        let status = unsafe { libc::mkfifo(fifo_cstr.as_ptr(), 0o644) };
        assert_eq!(
            status,
            0,
            "ktstr: mkfifo({}) failed -- test infrastructure broken",
            fifo_path.display(),
        );
        let exe = crate::resolve_current_exe().unwrap();
        let includes: Vec<(&str, &Path)> = vec![("include-files/pipe", fifo_path.as_path())];
        // Only regular files may be included; a FIFO must be refused.
        let outcome = build_initramfs_base(&exe, &[], &includes, true);
        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        let rejected = message.contains("not a regular file");
        assert!(rejected, "error should reject FIFO: {message}");
    }

    #[test]
    fn include_files_rejects_directory() {
        let scratch = tempfile::TempDir::new().unwrap();
        let dir = scratch.path().join("mydir");
        std::fs::create_dir(&dir).unwrap();
        let exe = crate::resolve_current_exe().unwrap();
        let includes: Vec<(&str, &Path)> = vec![("include-files/mydir", dir.as_path())];
        // Only regular files may be included; a directory must be refused.
        let outcome = build_initramfs_base(&exe, &[], &includes, true);
        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        let rejected = message.contains("not a regular file");
        assert!(rejected, "error should reject directory: {message}");
    }

    #[test]
    fn busybox_independent_of_include_files() {
        let exe = crate::resolve_current_exe().unwrap();
        // Request busybox while passing no include files at all.
        let archive = build_initramfs_base(&exe, &[], &[], true).unwrap();
        let names = cpio_entry_names(&archive);
        let has_busybox = names.iter().any(|n| n == "bin/busybox");
        assert!(
            has_busybox,
            "busybox=true should have bin/busybox entry even without includes: {:?}",
            names
        );
    }

    // -- ld.so.cache parsing tests --

    #[test]
    fn parse_ld_so_cache_finds_libc() {
        let cache = parse_ld_so_cache(Path::new("/etc/ld.so.cache"));
        // Every glibc system indexes libc.so.6 in its ld.so.cache.
        let libc = cache.get("libc.so.6");
        assert!(
            libc.is_some(),
            "ld.so.cache should contain libc.so.6: found {} entries",
            cache.len(),
        );
        // The cached path must point at a file that exists on the host.
        let libc_path = libc.unwrap();
        assert!(
            libc_path.is_file(),
            "cached libc path should exist: {}",
            libc_path.display()
        );
    }

    #[test]
    fn parse_ld_so_cache_nonexistent_returns_empty() {
        // A missing cache file yields an empty map rather than a failure.
        let missing = Path::new("/nonexistent/ld.so.cache");
        let parsed = parse_ld_so_cache(missing);
        assert!(parsed.is_empty());
    }

    #[test]
    fn parse_ld_so_cache_bad_magic_returns_empty() {
        // Content that does not start with the cache magic parses to an
        // empty map.
        let scratch = tempfile::TempDir::new().unwrap();
        let bogus = scratch.path().join("ldcache");
        std::fs::write(&bogus, b"not a valid cache file").unwrap();
        let parsed = parse_ld_so_cache(&bogus);
        assert!(parsed.is_empty());
    }

    #[test]
    fn parse_ld_so_cache_truncated_returns_empty() {
        let scratch = tempfile::TempDir::new().unwrap();
        let cache_file = scratch.path().join("ldcache");
        // Correct magic followed by only ten zero bytes: too short to
        // hold the full header.
        let mut bytes = LD_CACHE_MAGIC.to_vec();
        bytes.resize(bytes.len() + 10, 0u8);
        std::fs::write(&cache_file, &bytes).unwrap();
        let parsed = parse_ld_so_cache(&cache_file);
        assert!(parsed.is_empty());
    }

    #[test]
    fn ld_so_cache_consistent_with_resolve_soname() {
        // With default search paths and no extra directories,
        // resolve_soname should still locate libc.so.6.
        let resolved = resolve_soname("libc.so.6", &ElfSearchPaths::default(), &[]);
        assert!(
            resolved.is_some(),
            "resolve_soname should find libc.so.6 (cache or paths)"
        );
        let libc_path = resolved.unwrap();
        assert!(libc_path.is_file());
    }

    #[test]
    fn no_duplicate_cpio_entries() {
        let exe = crate::resolve_current_exe().unwrap();
        let archive = build_initramfs_base(&exe, &[], &[], false).unwrap();
        let entries = cpio_entries(&archive);
        // Collect every entry whose name was already seen earlier in the
        // archive; the list must end up empty.
        let mut seen = std::collections::HashSet::new();
        let duplicates: Vec<_> = entries
            .iter()
            .filter_map(|(name, size, mode, ino)| {
                let first_time = seen.insert(name.clone());
                (!first_time).then(|| (name.clone(), *size, *mode, *ino))
            })
            .collect();
        assert!(
            duplicates.is_empty(),
            "archive contains duplicate entries: {:?}",
            duplicates
        );
    }

    #[test]
    fn no_duplicate_entries_with_include_files() {
        let exe = crate::resolve_current_exe().unwrap();
        // Stage three dummy libraries on the host. They are archived
        // under a deeply nested path mimicking custom linker library
        // directories.
        let scratch = tempfile::TempDir::new().unwrap();
        let lib_data = vec![0xCCu8; 4096];
        let host_libs = ["libcustom1.so", "libcustom2.so", "libcustom3.so"].map(|file_name| {
            let host_path = scratch.path().join(file_name);
            std::fs::write(&host_path, &lib_data).unwrap();
            host_path
        });

        let includes: Vec<(&str, &Path)> = vec![
            (
                "usr/local/custom/platform/lib/libcustom1.so",
                host_libs[0].as_path(),
            ),
            (
                "usr/local/custom/platform/lib/libcustom2.so",
                host_libs[1].as_path(),
            ),
            (
                "usr/local/custom/platform/lib/libcustom3.so",
                host_libs[2].as_path(),
            ),
        ];

        let archive = build_initramfs_base(&exe, &[], &includes, false).unwrap();
        let entries = cpio_entries(&archive);
        let entry_names: Vec<&str> = entries.iter().map(|(n, _, _, _)| n.as_str()).collect();

        // Every include file must be present...
        for (archive_path, _) in &includes {
            assert!(
                entry_names.contains(archive_path),
                "missing include file entry '{}'; archive entries: {:?}",
                archive_path,
                entry_names
            );
        }

        // ...and carry the full payload size.
        for (archive_path, _) in &includes {
            let entry = entries.iter().find(|(n, _, _, _)| n == archive_path);
            let size_ok = entry.is_some_and(|(_, size, _, _)| *size == lib_data.len() as u32);
            assert!(
                size_ok,
                "include file '{}' has wrong size: {:?}",
                archive_path, entry
            );
        }

        // Each ancestor of the nested path needs its own directory entry.
        for dir in [
            "usr",
            "usr/local",
            "usr/local/custom",
            "usr/local/custom/platform",
            "usr/local/custom/platform/lib",
        ] {
            assert!(entry_names.contains(&dir), "missing '{}' dir entry", dir);
        }

        // The containing directory must be emitted before its files.
        let dir_pos = entries
            .iter()
            .position(|(n, _, _, _)| n == "usr/local/custom/platform/lib")
            .unwrap();
        for (archive_path, _) in &includes {
            let file_pos = entries
                .iter()
                .position(|(n, _, _, _)| n == *archive_path)
                .unwrap();
            assert!(
                dir_pos < file_pos,
                "directory entry must precede file '{}': dir at {}, file at {}",
                archive_path,
                dir_pos,
                file_pos
            );
        }

        // Finally, no name may occur twice.
        let mut seen = std::collections::HashSet::new();
        let duplicates: Vec<String> = entries
            .iter()
            .filter_map(|(name, _, _, _)| {
                let first_time = seen.insert(name.clone());
                (!first_time).then(|| name.clone())
            })
            .collect();
        assert!(
            duplicates.is_empty(),
            "duplicate entries in archive: {:?}",
            duplicates
        );
    }

    #[test]
    fn include_elf_shared_libs_all_present_in_archive() {
        // Include /bin/sh (resolved through any symlink) and require each
        // of its shared libraries to be archived with a non-zero size.
        let sh_path = Path::new("/bin/sh");
        let sh_resolved = match std::fs::canonicalize(sh_path) {
            Ok(real) => real,
            Err(_) => sh_path.to_path_buf(),
        };
        let sh = sh_resolved.as_path();
        if !(sh.exists() && is_elf(sh)) {
            skip!("/bin/sh not available or not ELF");
        }
        let exe = crate::resolve_current_exe().unwrap();
        let includes: Vec<(&str, &Path)> = vec![("include-files/sh", sh)];
        let archive = build_initramfs_base(&exe, &[], &includes, false).unwrap();
        let entries = cpio_entries(&archive);

        // Index entries by name -> (size, mode, ino) for quick lookup.
        let mut entry_map: std::collections::HashMap<&str, (u32, u32, u32)> =
            std::collections::HashMap::new();
        for (name, size, mode, ino) in &entries {
            entry_map.insert(name.as_str(), (*size, *mode, *ino));
        }

        let shared = resolve_shared_libs(sh).unwrap();
        for (guest_path, _host_path) in &shared.found {
            let hit = entry_map.get(guest_path.as_str());
            assert!(
                hit.is_some(),
                "shared lib '{}' missing from archive; entries: {:?}",
                guest_path,
                entries
                    .iter()
                    .map(|(n, _, _, _)| n.as_str())
                    .collect::<Vec<_>>()
            );
            let (size, _, _) = *hit.unwrap();
            assert!(
                size > 0,
                "shared lib '{}' has zero size in archive",
                guest_path
            );
        }

        // The included binary itself must be there too.
        let include_present = entry_map.contains_key("include-files/sh");
        assert!(include_present, "include file itself missing from archive");
    }

    #[test]
    fn all_inode_zero_entries_have_nlink_one() {
        // Check that all entries use ino=0 and nlink=1, so the kernel
        // initramfs unpacker never enters the hardlink path.
        let exe = crate::resolve_current_exe().unwrap();
        let base = build_initramfs_base(&exe, &[], &[], false).unwrap();
        let mut remaining: &[u8] = base.as_slice();
        // Count inspected entries: the loop below ends quietly on the
        // first header that fails to parse, so without a count a
        // malformed archive would pass this test without checking anything.
        let mut checked = 0usize;
        while let Ok(reader) = cpio::newc::Reader::new(remaining) {
            if reader.entry().is_trailer() {
                break;
            }
            let name = reader.entry().name().to_string();
            let ino = reader.entry().ino();
            let nlink = reader.entry().nlink();
            assert_eq!(
                ino, 0,
                "entry '{}' has non-zero inode {}: risk of kernel hardlink confusion",
                name, ino
            );
            assert_eq!(
                nlink, 1,
                "entry '{}' has nlink {}: kernel only hardlinks when nlink >= 2",
                name, nlink
            );
            checked += 1;
            remaining = reader.finish().unwrap();
        }
        assert!(
            checked > 0,
            "no cpio entries were parsed; the inode/nlink checks never ran"
        );
    }

    #[test]
    fn lz4_legacy_compress_format() {
        let input = vec![0xAAu8; 4096];
        let framed = lz4_legacy_compress(&input);
        // The stream opens with the LZ4 legacy magic.
        assert_eq!(
            &framed[..4],
            &LZ4_LEGACY_MAGIC,
            "output must start with LZ4 legacy magic 0x184C2102"
        );
        // Right after the magic comes the first chunk's compressed size
        // as a 4-byte little-endian integer.
        let size_field: [u8; 4] = framed[4..8].try_into().unwrap();
        let chunk_size = u32::from_le_bytes(size_field) as usize;
        assert!(
            (1..input.len()).contains(&chunk_size),
            "compressed chunk should be non-empty and smaller than input: {}",
            chunk_size
        );
        // The chunk must decompress back to the original bytes.
        let chunk = &framed[8..8 + chunk_size];
        let restored = lz4_flex::block::decompress(chunk, input.len())
            .expect("lz4 block decompress failed");
        assert_eq!(restored, input);
    }

    #[test]
    fn lz4_legacy_compress_large_input_splits_chunks() {
        // Anything larger than LZ4_CHUNK_SIZE (8MB) has to be emitted as
        // more than one chunk.
        let data = vec![0xBBu8; LZ4_CHUNK_SIZE + 1024];
        let framed = lz4_legacy_compress(&data);
        assert_eq!(&framed[..4], &LZ4_LEGACY_MAGIC);

        // Walk the chunk list after the magic, decompressing each chunk
        // and counting them; expect at least two.
        let mut cursor = 4;
        let mut chunk_count = 0;
        let mut restored = Vec::new();
        loop {
            if cursor + 4 > framed.len() {
                break;
            }
            let size_field = framed[cursor..cursor + 4].try_into().unwrap();
            let chunk_len = u32::from_le_bytes(size_field) as usize;
            if chunk_len == 0 {
                break;
            }
            cursor += 4;
            // Each chunk expands to at most LZ4_CHUNK_SIZE bytes; the
            // last one covers whatever input is left.
            let expected_len = (data.len() - restored.len()).min(LZ4_CHUNK_SIZE);
            let block =
                lz4_flex::block::decompress(&framed[cursor..cursor + chunk_len], expected_len)
                    .expect("lz4 block decompress failed");
            restored.extend_from_slice(&block);
            cursor += chunk_len;
            chunk_count += 1;
        }
        assert!(
            chunk_count >= 2,
            "input > 8MB should produce >= 2 chunks, got {}",
            chunk_count
        );
        assert_eq!(restored, data);
    }

    #[test]
    fn lz4_legacy_compress_empty_input() {
        // With nothing to compress the output is the bare magic and no
        // chunk records.
        let framed = lz4_legacy_compress(&[]);
        assert_eq!(framed, LZ4_LEGACY_MAGIC);
    }

    /// Build a synthetic cpio archive from generated data for LZ4 tests.
    /// Uses generic paths to avoid banned terms.
    ///
    /// `total_size` is an approximate target in bytes: 256KB
    /// pseudo-random `lib/test_NNNN.so` entries are added while they
    /// fit, then one `data/fill.bin` entry takes up the remaining space,
    /// followed by the trailer and zero padding to a 512-byte boundary.
    /// The content is deterministic, so repeated calls with the same
    /// `total_size` return identical archives.
    fn build_synthetic_cpio(total_size: usize) -> Vec<u8> {
        /// Fill `buf` with bytes from a simple LCG so the payload is
        /// reproducible yet high-entropy.
        fn fill_pseudo_random(state: &mut u64, buf: &mut [u8]) {
            for byte in buf {
                *state = state.wrapping_mul(6364136223846793005).wrapping_add(1);
                *byte = (*state >> 33) as u8;
            }
        }

        let mut archive = Vec::new();
        // Directory entries.
        write_entry(&mut archive, "lib", &[], 0o40755).unwrap();
        write_entry(&mut archive, "data", &[], 0o40755).unwrap();

        // Fill with generated binary data to reach target size.
        let mut rng_state = 0x12345678u64;
        let entry_size = 256 * 1024; // 256KB per entry
        let mut entry_num = 0;
        while archive.len() + entry_size < total_size {
            let mut payload = vec![0u8; entry_size];
            fill_pseudo_random(&mut rng_state, &mut payload);
            let name = format!("lib/test_{entry_num:04}.so");
            write_entry(&mut archive, &name, &payload, 0o100755).unwrap();
            entry_num += 1;
        }

        // Pad remaining space with a data file.
        if archive.len() < total_size {
            // Leave ~200 bytes of room for the entry header. Saturate
            // instead of subtracting directly: when fewer than 200 bytes
            // remain a plain subtraction underflows (a panic in debug
            // builds); in that case the fill entry is simply empty.
            let remaining = total_size
                .saturating_sub(archive.len())
                .saturating_sub(200);
            let mut payload = vec![0u8; remaining];
            fill_pseudo_random(&mut rng_state, &mut payload);
            write_entry(&mut archive, "data/fill.bin", &payload, 0o100644).unwrap();
        }

        // Trailer and padding.
        cpio::newc::trailer(&mut archive as &mut dyn std::io::Write).unwrap();
        let pad = (512 - (archive.len() % 512)) % 512;
        archive.extend(std::iter::repeat_n(0u8, pad));
        archive
    }

    /// Simulate the kernel's unlz4() decompression loop (non-fill path).
    /// This mirrors lib/decompress_unlz4.c behavior:
    ///   1. Read and validate 4-byte magic (0x184C2102)
    ///   2. Loop: read 4-byte LE chunk size, decompress chunk, advance
    ///   3. Handle concatenated magic (re-encounter mid-stream)
    ///   4. Terminate on size < 4 or size == 0
    ///
    /// Returns the concatenated decompressed output. Returns `Err` with
    /// a description when the input is too short for the magic, the
    /// magic is wrong, a chunk header claims more bytes than the input
    /// holds, or a chunk fails to decode.
    fn simulate_kernel_unlz4(input: &[u8]) -> Result<Vec<u8>, String> {
        const UNCOMP_CHUNK_SIZE: usize = 8 << 20; // LZ4_DEFAULT_UNCOMPRESSED_CHUNK_SIZE

        if input.len() < 4 {
            return Err("input too short for magic".into());
        }

        let mut inp = 0usize; // current position
        let mut size = input.len() as isize; // remaining bytes

        // Read and validate magic.
        let magic = u32::from_le_bytes(input[inp..inp + 4].try_into().unwrap());
        if magic != 0x184C2102 {
            return Err(format!("invalid header: 0x{magic:08X}"));
        }
        inp += 4;
        size -= 4;

        let mut output = Vec::new();

        loop {
            if size < 4 {
                // End of input — clean exit.
                break;
            }

            // `size >= 4` here, and `size` always equals
            // `input.len() - inp`, so this 4-byte read is in bounds.
            let chunksize = u32::from_le_bytes(input[inp..inp + 4].try_into().unwrap()) as usize;

            // Handle concatenated magic mid-stream.
            if chunksize == 0x184C2102 {
                inp += 4;
                size -= 4;
                continue;
            }

            // Zero chunk size — end of stream.
            if chunksize == 0 {
                break;
            }

            inp += 4;
            size -= 4;

            // Kernel: LZ4_decompress_safe(inp, outp, chunksize, dest_len)
            // dest_len = uncomp_chunksize (8MB max output)
            //
            // A truncated or corrupt stream can declare a chunk larger
            // than what is left; report that as an error rather than
            // panicking on an out-of-range slice.
            let chunk_data = inp
                .checked_add(chunksize)
                .and_then(|end| input.get(inp..end))
                .ok_or_else(|| {
                    format!("data corrupted: chunk of {chunksize} bytes exceeds remaining input")
                })?;
            let decompressed = lz4_flex::block::decompress(chunk_data, UNCOMP_CHUNK_SIZE)
                .map_err(|e| format!("LZ4_decompress_safe failed: {e}"))?;

            output.extend_from_slice(&decompressed);

            size -= chunksize as isize;
            if size == 0 {
                break;
            } else if size < 0 {
                // Kept to mirror the kernel's check; the bounds check
                // above already rejects oversized chunks.
                return Err("data corrupted: size went negative".into());
            }
            inp += chunksize;
        }

        Ok(output)
    }

    /// Compress synthetic cpio data and feed it through the simulated
    /// kernel unlz4() loop; the output must equal the input. Covers one
    /// archive that fits in a single chunk and one that needs several.
    #[test]
    fn lz4_legacy_kernel_unlz4_roundtrip() {
        // ~1MB: below the 8MB chunk size, so a single chunk.
        let small = build_synthetic_cpio(1 << 20);
        let small_roundtrip = simulate_kernel_unlz4(&lz4_legacy_compress(&small))
            .expect("kernel unlz4 simulation failed on small input");
        assert_eq!(small_roundtrip, small);

        // ~10MB: above 8MB, which forces chunk splitting.
        let large = build_synthetic_cpio(10 << 20);
        let large_roundtrip = simulate_kernel_unlz4(&lz4_legacy_compress(&large))
            .expect("kernel unlz4 simulation failed on multi-chunk input");
        assert_eq!(large_roundtrip, large);
    }

    /// Two independently compressed LZ4 legacy streams (base + suffix)
    /// placed back to back must decode, via the kernel unlz4
    /// simulation, to the concatenation of their inputs. This is the
    /// layout produced when base and suffix are compressed separately.
    #[test]
    fn lz4_legacy_kernel_unlz4_concatenated() {
        let base = build_synthetic_cpio(2 << 20); // ~2MB
        let suffix_data = b"arg1\narg2\narg3\n";

        let lz4_base = lz4_legacy_compress(&base);
        let lz4_suffix = lz4_legacy_compress(suffix_data);

        // Glue the two compressed streams together, base first.
        let combined = [&lz4_base[..], &lz4_suffix[..]].concat();

        let decompressed = simulate_kernel_unlz4(&combined)
            .expect("kernel unlz4 simulation failed on concatenated streams");

        // The expected plaintext is simply base followed by suffix.
        let expected = [&base[..], &suffix_data[..]].concat();
        assert_eq!(decompressed, expected);
    }

    /// Check lz4_flex block output is decompressible by the C lz4
    /// library (same decompressor as the kernel's LZ4_decompress_safe).
    /// Uses synthetic cpio data with generic paths.
    ///
    /// Skipped when the `lz4` CLI cannot be spawned.
    #[test]
    fn lz4_legacy_compress_c_compat() {
        let lz4_check = std::process::Command::new("lz4").arg("--version").output();
        if lz4_check.is_err() {
            skip!("lz4 CLI not found");
        }

        let data = build_synthetic_cpio(2 << 20); // ~2MB
        let compressed = lz4_legacy_compress(&data);

        // Per-test tempdir: unique per invocation and removed on Drop,
        // even when an assertion below panics. Fixed file names under
        // the shared temp dir collide across parallel test runs and are
        // left behind after a failure.
        let tmp_dir = tempfile::TempDir::new().unwrap();
        let compressed_path = tmp_dir.path().join("compat.lz4");
        let decompressed_path = tmp_dir.path().join("compat.bin");
        std::fs::write(&compressed_path, &compressed).unwrap();

        let output = std::process::Command::new("lz4")
            .args(["-d", "-f", "--no-frame-crc"])
            .arg(&compressed_path)
            .arg(&decompressed_path)
            .output()
            .expect("lz4 -d failed to execute");

        assert!(
            output.status.success(),
            "lz4 -d failed: stderr={}",
            String::from_utf8_lossy(&output.stderr),
        );

        let result = std::fs::read(&decompressed_path).unwrap();
        assert_eq!(result.len(), data.len(), "decompressed size mismatch");
        assert_eq!(&result[..], &data[..], "decompressed content mismatch");
    }

    /// Cross-check our legacy-format framing against the reference
    /// `lz4` CLI in both directions: output of `lz4 -l` must decode
    /// through the kernel unlz4 simulation, and output of our encoder
    /// must decode with `lz4 -d`.
    ///
    /// Skipped when the `lz4` CLI cannot be spawned.
    #[test]
    fn lz4_legacy_reference_cross_compat() {
        let lz4_check = std::process::Command::new("lz4").arg("--version").output();
        if lz4_check.is_err() {
            skip!("lz4 CLI not found");
        }

        let data = build_synthetic_cpio(2 << 20);

        // Per-test tempdir: unique per invocation and removed on Drop,
        // even when an assertion below panics. Fixed file names under
        // the shared temp dir collide across parallel test runs and are
        // left behind after a failure.
        let tmp_dir = tempfile::TempDir::new().unwrap();

        // Compress with `lz4 -l` (reference legacy mode).
        let input_path = tmp_dir.path().join("ref-input.bin");
        let ref_path = tmp_dir.path().join("ref.lz4");
        std::fs::write(&input_path, &data).unwrap();

        let ref_output = std::process::Command::new("lz4")
            .args(["-l", "-f"])
            .arg(&input_path)
            .arg(&ref_path)
            .output()
            .expect("lz4 -l failed to execute");

        assert!(
            ref_output.status.success(),
            "lz4 -l failed: stderr={}",
            String::from_utf8_lossy(&ref_output.stderr),
        );

        // Decompress reference output through our kernel simulation.
        let ref_compressed = std::fs::read(&ref_path).unwrap();
        let ref_decompressed = simulate_kernel_unlz4(&ref_compressed)
            .expect("kernel unlz4 simulation failed on lz4 -l output");
        assert_eq!(
            ref_decompressed, data,
            "reference lz4 -l roundtrip mismatch"
        );

        // Also compress with our encoder, decompress with lz4 -d.
        let our_compressed = lz4_legacy_compress(&data);
        let our_lz4_path = tmp_dir.path().join("ref-ours.lz4");
        let our_decompressed_path = tmp_dir.path().join("ref-ours.bin");
        std::fs::write(&our_lz4_path, &our_compressed).unwrap();

        let our_output = std::process::Command::new("lz4")
            .args(["-d", "-f", "--no-frame-crc"])
            .arg(&our_lz4_path)
            .arg(&our_decompressed_path)
            .output()
            .expect("lz4 -d on our output failed to execute");

        assert!(
            our_output.status.success(),
            "lz4 -d on our output failed: stderr={}",
            String::from_utf8_lossy(&our_output.stderr),
        );

        let our_result = std::fs::read(&our_decompressed_path).unwrap();
        assert_eq!(our_result, data, "our lz4 output cross-compat mismatch");
    }
}