ktstr 0.6.0 - Docs.rs

//! Guest-only typed senders for the host-bound bulk TLV stream.
//!
//! Every function in this module is callable ONLY from inside a
//! ktstr guest VM. Host-context invocations log a `tracing::warn!`
//! and no-op.
//!
//! Each function frames its payload with the corresponding
//! [`super::wire::MsgType`] so call sites do not pass raw u32 ids.
//! The frame format is the [`super::wire::ShmMessage`] header +
//! payload described on the [`super::wire`] module doc.
//!
//! # Backpressure
//!
//! The bulk channel uses the kernel virtio_console TX path: a full
//! virtqueue blocks the writer until the host's `add_used` rate
//! catches up. Callers that cannot block (panic hook, signal
//! handlers, anything called from a critical section) MUST write
//! directly to COM2 (`/dev/ttyS1`) — the 16550 UART PIO path
//! commits synchronously inside `KVM_RUN` and never blocks the
//! guest on host backpressure. The panic hook in
//! [`super::rust_init`] follows this discipline.

use crate::sync::MutexExt;
use crate::vmm::wire::{
    KERNEL_OP_REPLY_MAX, KernelOpReplyPayload, KernelOpRequestPayload, KernelOpRequestResult,
    LifecyclePhase, MSG_TYPE_KERNEL_OP_REPLY, MSG_TYPE_SNAPSHOT_REPLY, MsgType,
    SNAPSHOT_REASON_MAX, SNAPSHOT_STATUS_ERR, SNAPSHOT_STATUS_OK, SNAPSHOT_TAG_MAX, ShmMessage,
    SnapshotReplyPayload, SnapshotRequestPayload, SnapshotRequestResult,
};
use zerocopy::{FromBytes, IntoBytes};

/// Mutex serializing guest-side bulk-port writes. Every guest writer
/// (`write_msg`) takes this lock before submitting bytes to
/// `/dev/vport0p1`, so the in-stream order of bytes on port 1 stays
/// `[header][payload]` regardless of which producer (step executor,
/// sched-exit-mon, profraw flusher) emitted the frame.
pub static GUEST_WRITE_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

// ---------------------------------------------------------------------------
// is_guest detection
// ---------------------------------------------------------------------------

/// Detect whether the current process is running inside a ktstr guest
/// VM, by looking for the `KTSTR_GUEST=1` token on `/proc/cmdline`.
///
/// PID is NOT a reliable signal: the guest test code runs as forked
/// children of init (PID 1), not as PID 1 itself. The guest kernel
/// command line, populated by the host VMM, is the unique fingerprint.
///
/// The result is cached in a `OnceLock` — `/proc/cmdline` is read at
/// most once per process. False on the host (no cmdline match) and
/// false on any non-Linux platform that lacks `/proc/cmdline` (read
/// fails).
///
/// In test builds, the `IS_GUEST_TEST_OVERRIDE` thread-local takes
/// precedence over the `OnceLock`-cached natural detection; the
/// `OnceLock` is consulted only when no override is set on the
/// calling thread.
pub fn is_guest() -> bool {
    #[cfg(test)]
    {
        // Test-only override: tests run on the host but need to
        // exercise the guest-only path (write_msg). The override is
        // thread-local so parallel tests don't fight over it.
        if let Some(v) = IS_GUEST_TEST_OVERRIDE.with(|c| c.get()) {
            return v;
        }
    }
    static IS_GUEST: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
    *IS_GUEST.get_or_init(|| {
        std::fs::read_to_string("/proc/cmdline")
            .ok()
            .is_some_and(|c| c.split_whitespace().any(|tok| tok == "KTSTR_GUEST=1"))
    })
}

// Test-only thread-local override for `is_guest`. `None` means
// "consult /proc/cmdline"; `Some(b)` pins the result for the
// current thread. Per-thread so parallel tests cannot interfere.
#[cfg(test)]
thread_local! {
    static IS_GUEST_TEST_OVERRIDE: std::cell::Cell<Option<bool>> = const { std::cell::Cell::new(None) };
}

/// RAII guard that overrides [`is_guest`] for the duration of its
/// scope on the current thread, and restores the previous value on
/// drop. Avoids leaking override state across tests sharing a thread
/// (e.g. via test-runner thread pools).
///
/// `pub(crate)` so other test modules in the crate can use the
/// fixture when they need to exercise guest-only paths.
#[cfg(test)]
pub(crate) struct IsGuestOverrideGuard {
    prev: Option<bool>,
}

#[cfg(test)]
impl IsGuestOverrideGuard {
    pub(crate) fn new(value: bool) -> Self {
        let prev = IS_GUEST_TEST_OVERRIDE.with(|c| c.replace(Some(value)));
        Self { prev }
    }
}

#[cfg(test)]
impl Drop for IsGuestOverrideGuard {
    fn drop(&mut self) {
        let prev = self.prev;
        IS_GUEST_TEST_OVERRIDE.with(|c| c.set(prev));
    }
}

// ---------------------------------------------------------------------------
// Bulk-port writer (guest → host TLV)
// ---------------------------------------------------------------------------

/// Reject a call to a guest-only entry point when invoked from host
/// context. Returns `true` if the caller may proceed (we're inside a
/// guest VM); `false` after emitting a `tracing::warn!` that names the
/// caller and the message type, so a host-side caller surfaces in the
/// log instead of silently no-op'ing.
fn assert_guest_context(fn_name: &str, msg_type: u32) -> bool {
    if !is_guest() {
        tracing::warn!(
            msg_type = msg_type,
            "guest_comms::{fn_name} called from host context"
        );
        return false;
    }
    true
}

/// Cached `/dev/vport0p1` writer. Opened lazily on the first
/// successful `write_to_bulk_port` call after the kernel's
/// virtio_console driver creates the device node (post multiport
/// handshake). `OnceLock<Option<...>>` so repeated open failures
/// (port not yet ready) do not pin the slot to None permanently —
/// instead we re-attempt until `try_open_bulk_port` succeeds, then
/// cache the file handle for the rest of the process.
static BULK_PORT_FD: std::sync::OnceLock<std::sync::Mutex<Option<std::fs::File>>> =
    std::sync::OnceLock::new();

/// Try to open `/dev/vport0p1` for writing. Returns None when the
/// device is not yet present — the kernel virtio_console driver
/// creates it only after the host emits PORT_OPEN on the c_ivq for
/// port 1 and the kernel's `find_port_by_id` resolves the
/// `/sys/class/virtio-ports/vport0p1` entry.
///
/// Open mode: read+write, blocking. O_RDWR is required because the
/// kernel's `port_fops_open` (drivers/char/virtio_console.c) sets
/// `guest_connected = true` on the first open and returns EBUSY on
/// any subsequent open of the same port. A write-only open would
/// block a later read-only open needed by `request_snapshot`'s
/// reply reader. The port-2 stats relay already uses O_RDWR
/// (rust_init.rs `start_sched_stats_relay`).
fn try_open_bulk_port() -> Option<std::fs::File> {
    std::fs::OpenOptions::new()
        .read(true)
        .write(true)
        .open("/dev/vport0p1")
        .ok()
}

/// Write a TLV-framed message to the host through the bulk channel
/// (virtio-console port 1, `/dev/vport0p1`). The frame format is
/// 16-byte [`ShmMessage`] header + `payload.len()` bytes; the host
/// parses the same byte stream via [`super::host_comms::parse_tlv_stream`].
///
/// Returns `true` when the frame was fully written, `false` when the
/// bulk port is not yet open (multiport handshake still in flight),
/// the writev failed, or the call originated from host context. The
/// existing fire-and-forget callers (Exit, TestResult, PayloadMetrics,
/// Profraw, Stimulus, RawPayloadOutput, SchedExit, ScenarioStart,
/// ScenarioEnd, SnapshotRequest) discard the return at statement
/// position — only [`send_sys_rdy`]'s retry loop in `ktstr_guest_init`
/// observes it.
///
/// Backpressure: the kernel's virtio_console TX path (`hvc_push` /
/// `port_fops_write`) blocks the writer until the host's
/// `add_used` rate catches up. There is no drop path; callers that
/// cannot block (panic hook, signal handlers, anything called from
/// a critical section) MUST write directly to COM2 (`/dev/ttyS1`).
///
/// `assert_guest_context` rejects host-context invocations with a
/// `tracing::warn` so a host-side caller surfaces in the log instead
/// of silently no-op'ing.
fn write_msg(msg_type: u32, payload: &[u8]) -> bool {
    if !assert_guest_context("write_msg", msg_type) {
        return false;
    }
    let _guard = GUEST_WRITE_LOCK.lock_unpoisoned();
    write_to_bulk_port(msg_type, payload)
}

/// Try to write a TLV-framed message to `/dev/vport0p1`. Returns
/// true when the message was fully written, false when the bulk
/// port is not yet available or the write failed.
///
/// Lazy-open semantics: the multiport handshake completes
/// asynchronously during kernel virtio_console init, so the device
/// node may appear any time after the first `write_msg` call. We
/// retry the open on every call until it succeeds; once cached,
/// subsequent writes go through the cached `File`.
///
/// Submission shape: header and payload are submitted together via
/// `writev(2)` with two `iovec` slices, avoiding a per-call concat
/// allocation. The host's [`super::bulk::HostAssembler`] tolerates
/// partial frames in the byte stream, so any per-iovec virtqueue
/// submissions reassemble correctly.
fn write_to_bulk_port(msg_type: u32, payload: &[u8]) -> bool {
    let slot = BULK_PORT_FD.get_or_init(|| std::sync::Mutex::new(None));
    let mut guard = slot.lock_unpoisoned();
    if guard.is_none() {
        match try_open_bulk_port() {
            Some(f) => *guard = Some(f),
            None => return false,
        }
    }
    let f = guard.as_mut().expect("bulk port handle just installed");
    let Ok(length_u32) = u32::try_from(payload.len()) else {
        tracing::warn!(
            len = payload.len(),
            msg_type,
            "write_to_bulk_port: payload exceeds u32::MAX; dropping"
        );
        return false;
    };
    let msg = ShmMessage {
        msg_type,
        length: length_u32,
        crc32: crc32fast::hash(payload),
        _pad: 0,
    };
    let header_bytes = msg.as_bytes();
    let total = header_bytes.len() + payload.len();
    let fd = std::os::unix::io::AsRawFd::as_raw_fd(f);
    let mut iovs = [
        std::io::IoSlice::new(header_bytes),
        std::io::IoSlice::new(payload),
    ];
    let mut bufs: &mut [std::io::IoSlice<'_>] = &mut iovs[..];
    let mut written: usize = 0;
    while !bufs.is_empty() {
        // SAFETY: `bufs` is a non-empty slice of `IoSlice<'_>`, which
        // is `#[repr(transparent)]` over `libc::iovec` on unix targets.
        // Casting `*const IoSlice` to `*const libc::iovec` is sound.
        // `fd` is a borrowed raw fd from the cached `File`; the
        // `File` outlives the syscall because `guard` keeps it owned.
        let r = unsafe {
            libc::writev(
                fd,
                bufs.as_ptr() as *const libc::iovec,
                bufs.len() as libc::c_int,
            )
        };
        if r < 0 {
            let err = std::io::Error::last_os_error();
            if err.kind() == std::io::ErrorKind::Interrupted {
                continue;
            }
            tracing::warn!(
                %err,
                msg_type,
                len = payload.len(),
                "write_to_bulk_port: writev failed"
            );
            // Drop the cached handle so the next call retries the open
            // (the device may have transiently closed during a guest
            // reset path).
            *guard = None;
            return false;
        }
        if r == 0 {
            // `writev` returning 0 with no error is unexpected for a
            // char device; treat as an EOF-like failure.
            tracing::warn!(
                msg_type,
                len = payload.len(),
                written,
                total,
                "write_to_bulk_port: writev returned 0"
            );
            *guard = None;
            return false;
        }
        let n = r as usize;
        written += n;
        std::io::IoSlice::advance_slices(&mut bufs, n);
    }
    debug_assert_eq!(written, total);
    true
}

// ---------------------------------------------------------------------------
// Typed senders
// ---------------------------------------------------------------------------

/// Send the guest exit code to the host. Payload: 4-byte LE i32.
///
/// Frames the exit code with [`MsgType::Exit`] and routes through
/// the bulk port. The host's `collect_results` reads the latest
/// `Exit` entry to override the BSP run-loop sentinel.
pub fn send_exit(code: i32) {
    write_msg(MsgType::Exit.wire_value(), &code.to_le_bytes());
}

/// Send a test result to the host. Payload: postcard-encoded
/// [`crate::assert::AssertResult`].
///
/// Frames with [`MsgType::TestResult`]. Guest and host both use
/// `postcard` so layout never diverges; the host's
/// `crate::test_support::output::parse_assert_result_from_drain`
/// decodes with the same library.
///
/// Required: `result` MUST round-trip through postcard without
/// erroring — every field is owned `String` / `bool` / nested
/// `serde::Serialize` derives, so the only failure path is OOM
/// during the `Vec<u8>` allocation, which the surrounding eprintln
/// guards against silent loss.
pub fn send_test_result(result: &crate::assert::AssertResult) {
    match postcard::to_stdvec(result) {
        Ok(bytes) => {
            if bytes.len() > crate::vmm::bulk::MAX_BULK_FRAME_PAYLOAD as usize {
                tracing::error!(
                    size = bytes.len(),
                    max = crate::vmm::bulk::MAX_BULK_FRAME_PAYLOAD,
                    "AssertResult exceeds bulk port frame limit, sending truncated verdict"
                );
                let truncated =
                    crate::assert::AssertResult::fail(crate::assert::AssertDetail::new(
                        crate::assert::DetailKind::Other,
                        format!(
                            "AssertResult postcard size {} exceeded bulk port limit {}; \
                             original details dropped",
                            bytes.len(),
                            crate::vmm::bulk::MAX_BULK_FRAME_PAYLOAD,
                        ),
                    ));
                if let Ok(small) = postcard::to_stdvec(&truncated) {
                    write_msg(MsgType::TestResult.wire_value(), &small);
                }
            } else {
                write_msg(MsgType::TestResult.wire_value(), &bytes);
            }
        }
        Err(e) => {
            eprintln!("ktstr: postcard-encode AssertResult for bulk-port emit: {e}");
        }
    }
}

/// Send per-payload-invocation metrics to the host. Payload:
/// postcard-encoded [`crate::test_support::PayloadMetrics`].
///
/// Frames with [`MsgType::PayloadMetrics`].
pub fn send_payload_metrics(metrics: &crate::test_support::PayloadMetrics) {
    match postcard::to_stdvec(metrics) {
        Ok(bytes) => {
            write_msg(MsgType::PayloadMetrics.wire_value(), &bytes);
        }
        Err(e) => {
            eprintln!("ktstr: postcard-encode PayloadMetrics for bulk-port emit: {e}");
        }
    }
}

/// Send a coverage profraw blob to the host. Payload: raw `.profraw`
/// bytes produced by `__llvm_profile_get_data`.
///
/// Frames with [`MsgType::Profraw`].
pub fn send_profraw(buf: &[u8]) {
    write_msg(MsgType::Profraw.wire_value(), buf);
}

/// Send a stimulus event from the guest step executor.
///
/// Payload: byte-serialised [`crate::vmm::wire::StimulusPayload`]
/// (24 bytes, `IntoBytes`-derived). Frames with
/// [`MsgType::Stimulus`].
pub fn send_stimulus(payload: &[u8]) {
    write_msg(MsgType::Stimulus.wire_value(), payload);
}

/// Send raw stdout/stderr from an LlmExtract payload. Payload:
/// postcard-encoded [`crate::test_support::RawPayloadOutput`].
///
/// Frames with [`MsgType::RawPayloadOutput`].
pub(crate) fn send_raw_payload_output(raw: &crate::test_support::RawPayloadOutput) {
    match postcard::to_stdvec(raw) {
        Ok(bytes) => {
            write_msg(MsgType::RawPayloadOutput.wire_value(), &bytes);
        }
        Err(e) => {
            eprintln!("ktstr: postcard-encode RawPayloadOutput for bulk-port emit: {e}");
        }
    }
}

/// Send a scheduler-process exit notification. Payload: 4-byte LE i32
/// containing the scheduler's exit code.
///
/// Frames with [`MsgType::SchedExit`]. The host's freeze coordinator
/// promotes a SchedExit message into the run-wide kill flag so the
/// test ends promptly instead of waiting for the watchdog.
pub fn send_sched_exit(code: i32) {
    write_msg(MsgType::SchedExit.wire_value(), &code.to_le_bytes());
}

/// Send a scenario-start marker.
///
/// `MSG_TYPE_SCENARIO_START` is load-bearing: the host's freeze
/// coordinator gates the entire periodic-capture pipeline on the
/// first CRC-valid arrival (stamps `scenario_start_ns`, which the
/// capture loop reads as the anchor for boundary computation). A
/// silent loss here means `periodic_fired` stays at 0 regardless
/// of how many boundaries the workload should have crossed — the
/// failure mode the sibling-Claude mitosis report surfaced.
///
/// `send_sys_rdy` already retries until the bulk-port multiport
/// handshake completes, so by Phase 5 the port is normally
/// already open. The retry here is belt-and-braces for the rare
/// case where the cached File handle was invalidated between
/// `send_sys_rdy` and this call (process restart, fd close from
/// an unrelated path) — a fresh `try_open_bulk_port` on each
/// retry recovers transparently. 5 retries × 100 ms = 500 ms
/// total budget, an order of magnitude under the periodic
/// capture's typical inter-boundary spacing so retries don't
/// shift downstream timing measurably.
pub fn send_scenario_start() {
    for attempt in 0..5 {
        if write_msg(MsgType::ScenarioStart.wire_value(), &[]) {
            return;
        }
        if attempt + 1 < 5 {
            std::thread::sleep(std::time::Duration::from_millis(100));
        }
    }
    tracing::warn!(
        "send_scenario_start: 5 retries failed — bulk port write never \
         succeeded; periodic captures will see scenario_anchor=0 and \
         silently 0-fire"
    );
}

/// Send a scenario-end marker. Payload: 8-byte LE u64 elapsed
/// milliseconds since scenario start.
pub fn send_scenario_end(elapsed_ms: u64) {
    write_msg(MsgType::ScenarioEnd.wire_value(), &elapsed_ms.to_le_bytes());
}

pub fn send_scenario_pause() {
    write_msg(MsgType::ScenarioPause.wire_value(), &[]);
}

pub fn send_scenario_resume() {
    write_msg(MsgType::ScenarioResume.wire_value(), &[]);
}

/// Send the boot-complete signal to the host. Payload: empty.
/// Returns `true` when the frame was fully written, `false` when the
/// bulk port is not yet open (the multiport handshake completes
/// asynchronously during kernel virtio_console init, so
/// `/dev/vport0p1` may not exist on the first call after
/// `mount_filesystems()` returns) or the write failed.
///
/// Frames an empty payload with [`MsgType::SysRdy`] and routes
/// through the bulk port. The host's freeze coordinator promotes
/// a CRC-valid SYS_RDY frame into the monitor's boot-complete
/// eventfd, releasing the monitor's pre-sample epoll wait. Called
/// from the guest's `ktstr_guest_init` after `mount_filesystems`
/// completes, so the host's first sample observes a fully-booted
/// guest with `setup_per_cpu_areas` and KASLR randomization
/// already done.
///
/// The boolean return lets the caller retry on transient
/// not-yet-open failures: the multiport handshake completes
/// independently of `mount_filesystems`'s devtmpfs mount, so a
/// single call right after the mount can race the handshake. The
/// retry loop in `ktstr_guest_init` polls until success or budget
/// exhaustion, ensuring the host eventually observes the signal
/// rather than silently dropping the boot-complete event.
pub fn send_sys_rdy() -> bool {
    write_msg(MsgType::SysRdy.wire_value(), &[])
}

/// Send the typed [`crate::vmm::wire::KernAddrs`] payload to the
/// host so the monitor can translate kernel virtual addresses
/// without walking guest page tables. Called from
/// `ktstr_guest_init` after `mount_filesystems` and before
/// `send_sys_rdy`.
///
/// The wire layout, the per-field encoding (including the +1
/// bias on present-bit slots), and the host-side decode contract
/// all live on the typed struct — see
/// [`crate::vmm::wire::KernAddrs`] for the full reference. This
/// helper is a thin transport wrapper that delegates to
/// [`crate::vmm::wire::KernAddrs::to_payload`] and ships the
/// bytes through the host_comms TLV channel.
///
/// The runtime `_text` KVA in the payload powers the
/// cross-architecture virt-KASLR derive at
/// `src/vmm/freeze_coord/dispatch.rs`'s KERN_ADDRS arm:
/// `virt_kaslr = _text_runtime - _text_link`, where the link-time
/// KVA comes from the host's vmlinux parse
/// (`KernelSymbols::kernel_text_kva` at `src/monitor/symbols.rs`).
/// `_text` is defined in `vmlinux.lds.S` on every Linux build so
/// the derivation works on both x86_64 and aarch64.
///
/// Two independent paths feed the same
/// `Arc<AtomicU64> kern_virt_kaslr` on the host: this guest-side
/// derivation (cross-arch), and the BSP-side
/// `KVM_GET_MSRS(MSR_LSTAR)` readback
/// (`src/vmm/x86_64/msr_kaslr::read_and_derive`, x86_64-only).
/// Either is sufficient on x86_64; on aarch64 only the guest
/// channel and the `nokaslr` cmdline gate participate.
pub fn send_kern_addrs(addrs: &super::wire::KernAddrs) -> bool {
    let payload = addrs.to_payload();
    write_msg(super::wire::MSG_TYPE_KERN_ADDRS, &payload)
}

/// Read the runtime virtual address of `_text` (the kernel image
/// start symbol) from `/proc/kallsyms`.
///
/// Returns `Some(kva)` when the symbol is present AND the address
/// is non-zero (kallsyms masks addresses to `0000000000000000`
/// when `kernel.kptr_restrict >= 1` and the reader lacks
/// `CAP_SYSLOG`). `rust_init` runs as PID 1 with all caps including
/// `CAP_SYSLOG`, so the read sees real addresses regardless of the
/// `kptr_restrict` sysctl default.
///
/// The kernel writes the post-relocation KVA into the symbol table
/// via `handle_relocations` in `arch/x86/boot/compressed/misc.c`
/// (x86_64) and via the kallsyms relocation pass in
/// `init/main.c::__init` (aarch64) before userspace boots, so by
/// the time guest userland can read `/proc/kallsyms` the entry
/// already reflects the runtime virt-KASLR slide. `_text` is
/// defined in `vmlinux.lds.S` on every Linux build, so this
/// returns a meaningful value on both x86_64 and aarch64 — and on
/// any other architecture ktstr might target in future.
pub fn read_kernel_text_from_kallsyms() -> Option<u64> {
    read_kallsyms_symbol_kva("_text", &["T", "t"])
}

/// Read the runtime virtual address of `page_offset_base` (the
/// CONFIG_RANDOMIZE_MEMORY direct-map slide global) from
/// `/proc/kallsyms`. Companion to [`read_kernel_text_from_kallsyms`].
///
/// `page_offset_base` is declared `unsigned long page_offset_base
/// __ro_after_init` at `arch/x86/kernel/head64.c:63` (kernel-PhD
/// confirmed); type letter is `D`/`d` (writable data section, even
/// though `__ro_after_init` lives in `.data..ro_after_init` which
/// the linker places in the RO image post-mark_rodata_ro). This
/// reader returns the RUNTIME KVA of the symbol; the VALUE stored
/// at that KVA — the direct-map base produced by
/// `kernel_randomize_memory` at boot — must be read separately by
/// the host via `text_kva_to_pa_with_base(kva, START_KERNEL_MAP,
/// phys_base)` followed by `read_u64`. Returns `None` when the
/// symbol is absent (CONFIG_RANDOMIZE_MEMORY=n, arm64) OR when
/// kptr_restrict masks the address to 0 (kernel-PhD: default is 0,
/// ktstr-init runs as PID 1 with CAP_SYSLOG so this is rare).
pub fn read_kernel_page_offset_base_from_kallsyms() -> Option<u64> {
    read_kallsyms_symbol_kva("page_offset_base", &["D", "d"])
}

/// Shared `/proc/kallsyms` symbol-KVA reader. Both
/// [`read_kernel_text_from_kallsyms`] (type `T`/`t`) and
/// [`read_kernel_page_offset_base_from_kallsyms`] (type `D`/`d`)
/// dispatch through here. The 16-char hex format is fixed-width per
/// `kernel/kallsyms.c::s_show` on 64-bit kernels (kernel-PhD
/// confirmed); leading zeros are not suppressed. Returns `None` on
/// (a) `/proc/kallsyms` unreadable, (b) symbol absent, (c)
/// `addr == 0` (kptr_restrict elevated or symbol stripped — caller
/// MUST treat as "not readable", not as a legitimate zero KVA).
fn read_kallsyms_symbol_kva(name: &str, allowed_types: &[&str]) -> Option<u64> {
    let kallsyms = std::fs::read_to_string("/proc/kallsyms").ok()?;
    for line in kallsyms.lines() {
        let mut parts = line.split_ascii_whitespace();
        let addr = parts.next()?;
        let typ = parts.next()?;
        let sym = parts.next()?;
        if sym == name && allowed_types.contains(&typ) {
            let kva = u64::from_str_radix(addr, 16).ok()?;
            if kva != 0 {
                return Some(kva);
            }
        }
    }
    None
}

/// Derive the KASLR physical displacement from `/proc/iomem`.
///
/// On both x86_64 and aarch64 the kernel registers a "Kernel code"
/// resource in `/proc/iomem` whose start address is the physical
/// load address of `_text`. The KASLR offset is the difference
/// between this runtime PA and the default (non-KASLR) load PA.
///
/// x86_64: default load PA = `LOAD_PHYSICAL_ADDR` (0x100_0000,
/// CONFIG_PHYSICAL_START). `phys_base = code_pa - 0x100_0000`.
///
/// aarch64: default load PA = DRAM base (`System RAM` start from
/// iomem) + `TEXT_OFFSET`. `TEXT_OFFSET` is 0 on kernels since
/// v5.8 (commit 2b5fcc5), so `phys_base = code_pa - ram_start`.
/// Older kernels with `TEXT_OFFSET = 0x80000` (or randomized via
/// `CONFIG_ARM64_RANDOMIZE_TEXT_OFFSET`) would produce a biased
/// value; ktstr.kconfig targets 6.x where `TEXT_OFFSET = 0`.
pub fn read_phys_base_from_iomem() -> Option<u64> {
    let iomem = std::fs::read_to_string("/proc/iomem").ok()?;
    #[cfg(target_arch = "x86_64")]
    {
        for line in iomem.lines() {
            let line = line.trim();
            if line.ends_with(": Kernel code") {
                let range = line.split(':').next()?.trim();
                let start = range.split('-').next()?.trim();
                let phys_load = u64::from_str_radix(start, 16).ok()?;
                return Some(phys_load.wrapping_sub(0x100_0000));
            }
        }
        None
    }
    #[cfg(target_arch = "aarch64")]
    {
        // First "System RAM" entry = lowest-addressed DRAM region.
        // KERNEL_LOAD_ADDR == DRAM_START by construction in our VMM,
        // so the kernel always loads at this base.
        let mut ram_start: Option<u64> = None;
        let mut code_start: Option<u64> = None;
        for line in iomem.lines() {
            let line = line.trim();
            if ram_start.is_none() && line.ends_with(": System RAM") {
                let range = line.split(':').next()?.trim();
                let start = range.split('-').next()?.trim();
                ram_start = Some(u64::from_str_radix(start, 16).ok()?);
            }
            if line.ends_with(": Kernel code") {
                let range = line.split(':').next()?.trim();
                let start = range.split('-').next()?.trim();
                code_start = Some(u64::from_str_radix(start, 16).ok()?);
            }
        }
        Some(code_start?.wrapping_sub(ram_start?))
    }
}

/// Send a stdout chunk to the host. Payload: opaque UTF-8 bytes.
///
/// Frames with [`MsgType::Stdout`]. Replaces the prior COM2
/// stdout redirect: the guest's stdout pipe forwarder (set up in
/// `redirect_stdio_to_bulk_port`) reads chunks from the pipe
/// read-end and feeds them through this sender. The host
/// concatenates chunks in arrival order to reconstruct the
/// stream. Each chunk SHOULD fit comfortably under
/// [`crate::vmm::bulk::MAX_BULK_FRAME_PAYLOAD`]; oversized chunks
/// are rejected by `write_to_bulk_port`'s `u32::try_from` length
/// guard plus the host-side per-frame cap and are logged.
///
/// Required: caller MUST split chunks at sub-cap boundaries. The
/// pipe forwarder uses 4 KiB reads which is well under the cap.
///
/// Optional: a not-yet-open bulk port returns `false` and the
/// chunk is dropped. The forwarder thread continues reading the
/// pipe — early-init bytes (before the multiport handshake
/// completes) are lost, mirroring the existing COM2 fallback's
/// "first bytes may not reach the host" caveat.
pub fn send_stdout_chunk(buf: &[u8]) -> bool {
    write_msg(MsgType::Stdout.wire_value(), buf)
}

/// Send a stderr chunk to the host. Payload: opaque UTF-8 bytes.
///
/// Frames with [`MsgType::Stderr`]. Same chunked semantics as
/// [`send_stdout_chunk`].
pub fn send_stderr_chunk(buf: &[u8]) -> bool {
    write_msg(MsgType::Stderr.wire_value(), buf)
}

/// Send a scheduler-log chunk to the host. Payload: opaque UTF-8
/// bytes from the scheduler child process's captured log.
///
/// Frames with [`MsgType::SchedLog`]. The host concatenates
/// chunks in arrival order and the embedded `SCHED_OUTPUT_START` /
/// `SCHED_OUTPUT_END` delimiters travel verbatim inside the chunk
/// bytes, so the existing `parse_sched_output` walker (verifier
/// module) keeps slicing the log without changes. Replaces the
/// prior COM2 dump path in `dump_sched_output`.
///
/// Required: caller chunks at sub-cap boundaries; same constraint
/// as [`send_stdout_chunk`].
#[allow(dead_code)]
pub fn send_sched_log(buf: &[u8]) {
    write_msg(MsgType::SchedLog.wire_value(), buf);
}

/// Send a lifecycle phase event to the host. Payload: 1-byte
/// [`LifecyclePhase`] discriminant followed by a UTF-8 reason
/// suffix (only `SchedulerNotAttached` populates `reason`; every
/// other phase passes `""`).
///
/// Frames with [`MsgType::Lifecycle`]. Replaces the prior
/// `KTSTR_INIT_STARTED` / `KTSTR_PAYLOAD_STARTING` /
/// `SCHEDULER_DIED` / `SCHEDULER_NOT_ATTACHED` COM2 sentinel
/// strings. Host classifies init failure stages by walking the
/// per-VM lifecycle bucket instead of substring-matching on COM2
/// output.
///
/// Required: phase wire value MUST be in 1..=4. The 0 byte is
/// reserved as the host-side "unknown" sentinel and is rejected
/// by [`LifecyclePhase::from_wire`].
pub fn send_lifecycle(phase: LifecyclePhase, reason: &str) {
    let mut buf = Vec::with_capacity(1 + reason.len());
    buf.push(phase.wire_value());
    buf.extend_from_slice(reason.as_bytes());
    write_msg(MsgType::Lifecycle.wire_value(), &buf);
}

/// Send a shell-exec exit code to the host. Payload: 4-byte LE
/// i32 carrying the exec'd process's exit code.
///
/// Frames with [`MsgType::ExecExit`]. Replaces the prior COM2
/// `KTSTR_EXEC_EXIT=N` sentinel line emitted by `cargo ktstr
/// shell --exec <cmd>`.
pub fn send_exec_exit(code: i32) {
    write_msg(MsgType::ExecExit.wire_value(), &code.to_le_bytes());
}

/// Send a kernel ring-buffer dump to the host. Payload: opaque
/// UTF-8 bytes from `rmesg::logs_raw`.
///
/// Frames with [`MsgType::Dmesg`]. Sent on the
/// initramfs-extraction failure path so the host sees the kernel
/// OOM messages without scraping COM2.
#[allow(dead_code)]
pub fn send_dmesg(buf: &[u8]) {
    write_msg(MsgType::Dmesg.wire_value(), buf);
}

/// Send a probe-pipeline JSON output chunk to the host. Payload:
/// opaque UTF-8 bytes from the probe output stream.
///
/// Frames with [`MsgType::ProbeOutput`]. Replaces the prior COM2
/// ProbeDrain path so probe output and scheduler-log dumps stop
/// interleaving on the same serial port.
///
/// Required: caller chunks at sub-cap boundaries; same constraint
/// as [`send_stdout_chunk`].
#[allow(dead_code)]
pub fn send_probe_output(buf: &[u8]) {
    write_msg(MsgType::ProbeOutput.wire_value(), buf);
}

// ---------------------------------------------------------------------------
// Snapshot request (guest → host) + reply read-back
// ---------------------------------------------------------------------------

/// Monotonic guest-side request id counter. Bumped by every call to
/// [`request_snapshot`] before publishing the request frame.
/// `AtomicU32` so concurrent requests from different guest threads do
/// not produce duplicate ids. Wraparound past `u32::MAX` is
/// theoretically possible after billions of requests; the host's
/// reply pairing tolerates it because the comparison is equality
/// against the issuer's most-recent value, not a monotonicity check.
static SNAPSHOT_REQUEST_COUNTER: std::sync::atomic::AtomicU32 =
    std::sync::atomic::AtomicU32::new(1);

/// Mutex serialising guest-side request/reply RPCs over the
/// port-1 transport — both [`request_snapshot`] and
/// [`request_kernel_op`] take it before publishing. Without it two
/// guest threads issuing concurrent requests would interleave their
/// TX writes and race for each other's replies on the shared read fd
/// (only one open is permitted per port, so the snapshot reader and
/// the kernel-op reader share the same `BULK_PORT_FD` handle). The
/// freeze coordinator's `on_demand_in_flight` latch already collapses
/// doorbell floods to one capture per thaw on the host side; this
/// lock keeps the guest-side request/reply pairing well-defined for
/// every RPC kind too.
static SNAPSHOT_REQUEST_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

/// Monotonic guest-side request id counter for
/// [`request_kernel_op`]. Kept separate from
/// [`SNAPSHOT_REQUEST_COUNTER`] so the two RPC kinds have independent
/// id sequences — the reply's [`MsgType`] distinguishes which counter
/// the id pairs against, but separate counters keep on-the-wire ids
/// monotonic per request kind which simplifies host-side logs.
static KERNEL_OP_REQUEST_COUNTER: std::sync::atomic::AtomicU32 =
    std::sync::atomic::AtomicU32::new(1);

/// Cached read-side handle on `/dev/vport0p1`. Reused across snapshot
/// requests so the kernel's port-1 read queue refills only once per
/// guest process. `OnceLock<Option<File>>` so a not-yet-ready open
/// (multiport handshake still in flight) does not pin the slot to
/// None — the next call retries.
/// Number of fast-poll iterations at the start of
/// [`bounded_read_exact`] before escalating to the slow-poll cadence.
/// Four iterations of 100µs gives ~400µs of fast-path coverage,
/// enough to absorb a host reply that lands in the virtqueue while
/// the guest is still entering `ppoll`, without burning more than
/// a hundred microseconds of cumulative wake-up budget.
const SNAPSHOT_FAST_POLL_ITERS: u32 = 4;
/// Per-iteration ppoll timeout for the first
/// [`SNAPSHOT_FAST_POLL_ITERS`] iterations (100µs). Sub-millisecond
/// granularity is the reason this path uses `ppoll` rather than
/// `poll(2)` (which only takes millisecond timeouts).
const SNAPSHOT_FAST_POLL_INTERVAL: std::time::Duration = std::time::Duration::from_micros(100);
/// Per-iteration ppoll timeout after the fast-poll preamble (5ms).
/// Bounds the worst-case extra latency when virtio_console's
/// `port_fops_poll` does not deliver an early wake, while keeping
/// vCPU-thread wake-up cost low across the full snapshot deadline.
const SNAPSHOT_SLOW_POLL_INTERVAL: std::time::Duration = std::time::Duration::from_millis(5);

/// Read exactly `buf.len()` bytes from `f`, bounded by `deadline`.
/// Uses `ppoll(POLLIN)` between reads to wait without blocking past
/// the deadline. Returns `ErrorKind::TimedOut` when the deadline
/// expires before the read completes.
///
/// Each `ppoll` call's timeout is capped at an adaptive interval, not
/// the full remaining deadline:
///
/// * The first [`SNAPSHOT_FAST_POLL_ITERS`] iterations use a
///   [`SNAPSHOT_FAST_POLL_INTERVAL`] timeout (100µs). On the common
///   path the host's reply is already buffered in the virtqueue by
///   the time the guest enters `ppoll`, so a sub-millisecond bound
///   keeps wake-up latency low without burning CPU on the vCPU
///   thread.
/// * Subsequent iterations escalate to [`SNAPSHOT_SLOW_POLL_INTERVAL`]
///   (5ms), bounding the per-iteration wakeup cost while still
///   guaranteeing prompt deadline checks across the outer loop.
///
/// Each interval is further capped against the remaining deadline so
/// the loop never overshoots the caller's timeout.
fn bounded_read_exact(
    f: &mut std::fs::File,
    buf: &mut [u8],
    deadline: std::time::Instant,
) -> std::io::Result<()> {
    use std::io::Read;
    use std::os::unix::io::AsRawFd;
    let fd = f.as_raw_fd();
    let mut filled = 0usize;
    let mut iter: u32 = 0;
    while filled < buf.len() {
        let now = std::time::Instant::now();
        if now >= deadline {
            return Err(std::io::Error::new(
                std::io::ErrorKind::TimedOut,
                format!(
                    "snapshot reply deadline elapsed after reading {filled} of {} header/payload bytes",
                    buf.len()
                ),
            ));
        }
        let remaining = deadline - now;
        let interval = if iter < SNAPSHOT_FAST_POLL_ITERS {
            SNAPSHOT_FAST_POLL_INTERVAL
        } else {
            SNAPSHOT_SLOW_POLL_INTERVAL
        };
        // Cap the per-iteration sleep at min(interval, remaining) so
        // the last iteration before the deadline does not overshoot.
        let slice = remaining.min(interval);
        let ts = libc::timespec {
            tv_sec: slice.as_secs() as libc::time_t,
            tv_nsec: slice.subsec_nanos() as libc::c_long,
        };
        let mut pfd = libc::pollfd {
            fd,
            events: libc::POLLIN,
            revents: 0,
        };
        // SAFETY: pfd is a valid &mut to a single pollfd; nfds is 1.
        // `ts` is a local timespec passed by const pointer. sigmask
        // is null so the caller's signal mask applies unchanged.
        // Every poll outcome (ready, timeout, EINTR, error) loops
        // back to the read attempt; EINTR is harmless because the
        // outer loop re-evaluates the deadline on every iteration.
        let pr = unsafe { libc::ppoll(&mut pfd, 1, &ts, std::ptr::null()) };
        iter = iter.saturating_add(1);
        if pr < 0 {
            let err = std::io::Error::last_os_error();
            if err.kind() == std::io::ErrorKind::Interrupted {
                continue;
            }
            return Err(err);
        }
        if pr == 0 {
            // ppoll timeout — re-check deadline at the loop head.
            continue;
        }
        match f.read(&mut buf[filled..]) {
            Ok(0) => {
                return Err(std::io::Error::new(
                    std::io::ErrorKind::UnexpectedEof,
                    format!(
                        "snapshot reply read returned 0 after {filled} of {} bytes",
                        buf.len()
                    ),
                ));
            }
            Ok(n) => {
                filled += n;
            }
            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(())
}

/// Read a single TLV frame (16-byte header + payload bytes) from
/// `/dev/vport0p1`. Returns the parsed message type and payload on
/// success.
///
/// Reads the header with `bounded_read_exact`, decodes the length, then
/// reads the payload with `bounded_read_exact`. On any I/O failure
/// (premature EOF, EINTR, etc.) the cached handle is dropped so a
/// subsequent call retries the open.
///
/// `max_payload_size` caps the payload allocation against a hostile
/// or corrupted host that frames an oversized length. Callers pass
/// the upper bound of any payload they expect to read on this
/// transport (e.g. `size_of::<SnapshotReplyPayload>()` for snapshot
/// replies, [`KERNEL_OP_REPLY_MAX`] for postcard-encoded kernel-op
/// replies); a length above the cap is rejected with `InvalidData`
/// BEFORE the `vec![0u8; length]` allocation so a forged
/// `length = u32::MAX` cannot OOM the guest's PID 1 init.
fn read_bulk_port_frame(
    f: &mut std::fs::File,
    max_payload_size: usize,
    deadline: std::time::Instant,
) -> std::io::Result<(u32, Vec<u8>)> {
    let mut header = [0u8; std::mem::size_of::<ShmMessage>()];
    bounded_read_exact(f, &mut header, deadline)?;
    let msg = ShmMessage::read_from_bytes(&header).map_err(|_| {
        std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            "ShmMessage::read_from_bytes failed (header underflow)",
        )
    })?;
    let length = msg.length as usize;
    if length > max_payload_size {
        return Err(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            format!(
                "TLV length {length} exceeds max payload {max_payload_size} for port-1 RX; \
                 rejecting before allocation to avoid guest OOM"
            ),
        ));
    }
    let mut payload = vec![0u8; length];
    if length > 0 {
        bounded_read_exact(f, &mut payload, deadline)?;
    }
    let computed = crc32fast::hash(&payload);
    if computed != msg.crc32 {
        return Err(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            format!(
                "TLV CRC mismatch: header crc=0x{:08x} computed=0x{computed:08x} length={length}",
                msg.crc32
            ),
        ));
    }
    Ok((msg.msg_type, payload))
}

/// Request a host-driven snapshot. Publishes a snapshot request via
/// the virtio-console port-1 TLV stream and blocks reading port 1 RX
/// until a matching [`MsgType::SnapshotReply`] arrives (or `timeout`
/// elapses).
///
/// `kind` selects the dispatch path on the host:
/// [`crate::vmm::wire::SNAPSHOT_KIND_CAPTURE`] for a capture-now
/// request, [`crate::vmm::wire::SNAPSHOT_KIND_WATCH`] for a
/// hardware-watchpoint registration.
///
/// `tag` is copied into the request payload's tag buffer up to
/// [`SNAPSHOT_TAG_MAX`] bytes. Longer tags are truncated.
///
/// Returns one of [`SnapshotRequestResult`] variants. The serialised
/// guest lock ensures only one in-flight request per process — this
/// matches the host coordinator's `on_demand_in_flight` invariant.
pub fn request_snapshot(
    kind: u32,
    tag: &str,
    timeout: std::time::Duration,
) -> SnapshotRequestResult {
    if !is_guest() {
        return SnapshotRequestResult::TransportError {
            reason: "request_snapshot called from host context (virtio-console port 1 \
                     is reachable only from inside the guest)"
                .into(),
        };
    }
    let _guard = SNAPSHOT_REQUEST_LOCK.lock_unpoisoned();
    // Allocate a request id. Skip 0 so the wait loop's `reply.request_id
    // == request_id` check cannot accidentally match a zero-initialised
    // reply payload from an earlier protocol version.
    let mut request_id = SNAPSHOT_REQUEST_COUNTER.fetch_add(1, std::sync::atomic::Ordering::AcqRel);
    if request_id == 0 {
        request_id = SNAPSHOT_REQUEST_COUNTER.fetch_add(1, std::sync::atomic::Ordering::AcqRel);
    }
    // Build the request payload.
    let tag_bytes = tag.as_bytes();
    let tag_len = tag_bytes.len().min(SNAPSHOT_TAG_MAX);
    let mut tag_buf = [0u8; SNAPSHOT_TAG_MAX];
    tag_buf[..tag_len].copy_from_slice(&tag_bytes[..tag_len]);
    let payload = SnapshotRequestPayload {
        request_id,
        kind,
        tag: tag_buf,
    };
    // Send via the existing port-1 TX writer. `write_msg` already
    // takes `GUEST_WRITE_LOCK` internally, so this serialises with
    // every other guest TLV producer.
    let bytes = payload.as_bytes();
    write_msg(MsgType::SnapshotRequest.wire_value(), bytes);
    // Read replies from the same O_RDWR fd used for writes.
    // The kernel's port_fops_open allows only one concurrent open
    // per port (EBUSY on second open), so a separate read-only
    // open would fail. The write fd is opened O_RDWR by
    // try_open_bulk_port.
    let read_slot = BULK_PORT_FD.get_or_init(|| std::sync::Mutex::new(None));
    let mut read_guard = read_slot.lock_unpoisoned();
    if read_guard.is_none() {
        match try_open_bulk_port() {
            Some(f) => *read_guard = Some(f),
            None => {
                return SnapshotRequestResult::TransportError {
                    reason: "/dev/vport0p1 not yet open \
                             (multiport handshake still in flight)"
                        .into(),
                };
            }
        }
    }
    let f = read_guard
        .as_mut()
        .expect("bulk port handle just installed");
    // Read TLV reply frames until we observe one whose payload
    // request_id matches ours. Frames addressed to other request ids
    // (none in current protocol — the host only writes replies in
    // response to a specific request) or unknown msg_types are
    // logged + dropped.
    let deadline = std::time::Instant::now() + timeout;
    loop {
        let now = std::time::Instant::now();
        if now >= deadline {
            return SnapshotRequestResult::TransportError {
                reason: format!(
                    "host did not deliver matching snapshot reply within {timeout:?} \
                     (request_id={request_id}, kind={kind})"
                ),
            };
        }
        let frame =
            match read_bulk_port_frame(f, std::mem::size_of::<SnapshotReplyPayload>(), deadline) {
                Ok(frame) => frame,
                Err(e) if e.kind() == std::io::ErrorKind::TimedOut => {
                    return SnapshotRequestResult::TransportError {
                        reason: format!(
                            "snapshot reply deadline elapsed before frame complete \
                         (request_id={request_id}, kind={kind}): {e}"
                        ),
                    };
                }
                Err(e) => {
                    // I/O error on the read fd — drop the cached
                    // handle so the next call retries the open and
                    // surface the failure to the caller.
                    *read_guard = None;
                    return SnapshotRequestResult::TransportError {
                        reason: format!(
                            "snapshot reply read failed (request_id={request_id}): {e}"
                        ),
                    };
                }
            };
        let (msg_type, frame_payload) = frame;
        if msg_type != MSG_TYPE_SNAPSHOT_REPLY {
            tracing::warn!(
                msg_type,
                len = frame_payload.len(),
                request_id,
                "request_snapshot: ignoring unexpected TLV on port 1 RX (only \
                 SnapshotReply is expected on this transport in current protocol)"
            );
            continue;
        }
        if frame_payload.len() != std::mem::size_of::<SnapshotReplyPayload>() {
            tracing::warn!(
                request_id,
                got = frame_payload.len(),
                want = std::mem::size_of::<SnapshotReplyPayload>(),
                "request_snapshot: malformed reply payload size; ignoring"
            );
            continue;
        }
        let reply = match SnapshotReplyPayload::read_from_bytes(&frame_payload) {
            Ok(r) => r,
            Err(_) => {
                tracing::warn!(
                    request_id,
                    "request_snapshot: SnapshotReplyPayload::read_from_bytes failed; ignoring"
                );
                continue;
            }
        };
        if reply.request_id != request_id {
            tracing::warn!(
                expected = request_id,
                got = reply.request_id,
                "request_snapshot: stale reply id (likely a leftover from a prior \
                 request that timed out on the guest side); ignoring"
            );
            continue;
        }
        return match reply.status {
            SNAPSHOT_STATUS_OK => SnapshotRequestResult::Ok,
            SNAPSHOT_STATUS_ERR => {
                let len = reply
                    .reason
                    .iter()
                    .position(|&b| b == 0)
                    .unwrap_or(SNAPSHOT_REASON_MAX);
                let reason = String::from_utf8_lossy(&reply.reason[..len]).to_string();
                SnapshotRequestResult::HostError { reason }
            }
            other => SnapshotRequestResult::TransportError {
                reason: format!(
                    "host reply with unknown status {other} \
                     (expected OK={SNAPSHOT_STATUS_OK} or ERR={SNAPSHOT_STATUS_ERR})"
                ),
            },
        };
    }
}

/// Request a host-driven kernel-memory op (`Op::WriteKernel{Hot,Cold}`
/// / `Op::ReadKernel{Hot,Cold}`). Publishes a postcard-encoded
/// [`KernelOpRequestPayload`] via the virtio-console port-1 TLV
/// stream and blocks reading port 1 RX until a matching
/// [`MsgType::KernelOpReply`] arrives (or `timeout` elapses).
///
/// The supplied `request` carries the full op intent — mode
/// (hot/cold), direction (write/read), tag (for read replies and
/// diagnostics), and the ordered batch of `(target, value)` entries.
/// The function stamps a fresh `request_id` into the payload before
/// publishing (overriding whatever the caller put there) so the
/// reply pairing stays well-defined; the returned reply mirrors that
/// id back in [`KernelOpReplyPayload::request_id`].
///
/// Returns one of [`KernelOpRequestResult`] variants. Distinct from
/// [`SnapshotRequestResult`]: the "host completed but op failed"
/// carrier is [`KernelOpReplyPayload::success`] = false +
/// [`KernelOpReplyPayload::reason`], not a separate enum arm,
/// because postcard-encoded replies can carry per-entry result data
/// (e.g. read values) that an enum arm would erase.
///
/// Shares [`SNAPSHOT_REQUEST_LOCK`] with [`request_snapshot`]: only
/// one in-flight guest→host RPC per process, regardless of kind —
/// the shared `BULK_PORT_FD` read handle cannot safely demux two
/// concurrent reply streams.
///
/// **Throughput note.** This helper holds the `BULK_PORT_FD` slot
/// lock for the entire reply-wait loop (up to `timeout`, default
/// 30 s for the cold-path freeze-rendezvous round-trip). Concurrent
/// guest writers (`write_msg` callers — stimulus producers, scenario
/// lifecycle events) on the same port-1 transport BLOCK on the
/// shared slot lock until the reply lands. Deadlock potential is
/// zero (the `GUEST_WRITE_LOCK` and `SNAPSHOT_REQUEST_LOCK` are
/// acquired in independent orders by independent paths, and no
/// path holds both simultaneously), but a long-running cold-op
/// rendezvous serializes against unrelated TX traffic during its
/// reply wait.
pub fn request_kernel_op(
    request: KernelOpRequestPayload,
    timeout: std::time::Duration,
) -> KernelOpRequestResult {
    if !is_guest() {
        return KernelOpRequestResult::TransportError {
            reason: "request_kernel_op called from host context (virtio-console port 1 \
                     is reachable only from inside the guest)"
                .into(),
        };
    }
    let _guard = SNAPSHOT_REQUEST_LOCK.lock_unpoisoned();
    // Allocate a request id. Skip 0 so the wait loop's `reply.request_id
    // == request_id` check cannot accidentally match a zero-initialised
    // reply payload from an earlier protocol version.
    let mut request_id =
        KERNEL_OP_REQUEST_COUNTER.fetch_add(1, std::sync::atomic::Ordering::AcqRel);
    if request_id == 0 {
        request_id = KERNEL_OP_REQUEST_COUNTER.fetch_add(1, std::sync::atomic::Ordering::AcqRel);
    }
    // Stamp the freshly-allocated id into a clone of the request
    // payload (the caller's `request_id` field is overwritten — the
    // function owns id allocation per the doc contract).
    let stamped = KernelOpRequestPayload {
        request_id,
        ..request
    };
    let payload_bytes = match postcard::to_allocvec(&stamped) {
        Ok(b) => b,
        Err(e) => {
            return KernelOpRequestResult::TransportError {
                reason: format!(
                    "request_kernel_op: postcard encode failed (request_id={request_id}): {e}"
                ),
            };
        }
    };
    // Send via the existing port-1 TX writer. `write_msg` already
    // takes `GUEST_WRITE_LOCK` internally, so this serialises with
    // every other guest TLV producer.
    write_msg(MsgType::KernelOpRequest.wire_value(), &payload_bytes);
    // Read replies from the same O_RDWR fd used for writes. See
    // `request_snapshot` for the bulk-port handle lifecycle notes;
    // both helpers share `BULK_PORT_FD`.
    let read_slot = BULK_PORT_FD.get_or_init(|| std::sync::Mutex::new(None));
    let mut read_guard = read_slot.lock_unpoisoned();
    if read_guard.is_none() {
        match try_open_bulk_port() {
            Some(f) => *read_guard = Some(f),
            None => {
                return KernelOpRequestResult::TransportError {
                    reason: "/dev/vport0p1 not yet open \
                             (multiport handshake still in flight)"
                        .into(),
                };
            }
        }
    }
    let f = read_guard
        .as_mut()
        .expect("bulk port handle just installed");
    let deadline = std::time::Instant::now() + timeout;
    loop {
        let now = std::time::Instant::now();
        if now >= deadline {
            return KernelOpRequestResult::TransportError {
                reason: format!(
                    "host did not deliver matching kernel-op reply within {timeout:?} \
                     (request_id={request_id})"
                ),
            };
        }
        let frame = match read_bulk_port_frame(f, KERNEL_OP_REPLY_MAX, deadline) {
            Ok(frame) => frame,
            Err(e) if e.kind() == std::io::ErrorKind::TimedOut => {
                return KernelOpRequestResult::TransportError {
                    reason: format!(
                        "kernel-op reply deadline elapsed before frame complete \
                         (request_id={request_id}): {e}"
                    ),
                };
            }
            Err(e) => {
                *read_guard = None;
                return KernelOpRequestResult::TransportError {
                    reason: format!("kernel-op reply read failed (request_id={request_id}): {e}"),
                };
            }
        };
        let (msg_type, frame_payload) = frame;
        if msg_type != MSG_TYPE_KERNEL_OP_REPLY {
            tracing::warn!(
                msg_type,
                len = frame_payload.len(),
                request_id,
                "request_kernel_op: ignoring non-KernelOpReply TLV on port 1 RX (likely a \
                 stale snapshot reply from a prior request that timed out on the guest side)"
            );
            continue;
        }
        let reply: KernelOpReplyPayload = match postcard::from_bytes(&frame_payload) {
            Ok(r) => r,
            Err(e) => {
                tracing::warn!(
                    request_id,
                    error = %e,
                    "request_kernel_op: postcard decode failed; ignoring"
                );
                continue;
            }
        };
        if reply.request_id != request_id {
            tracing::warn!(
                expected = request_id,
                got = reply.request_id,
                "request_kernel_op: stale reply id (likely a leftover from a prior \
                 request that timed out on the guest side); ignoring"
            );
            continue;
        }
        return KernelOpRequestResult::Ok(reply);
    }
}

#[cfg(test)]
mod tests {
    //! Unit coverage for the typed sender wrappers.
    //!
    //! Every guest_comms helper routes through `write_msg`
    //! which gates on `is_guest()`. The host-context check
    //! rejects every call from these tests — verifying that gate
    //! holds is the safest unit-test scope: it confirms the wrappers
    //! do not accidentally write to a host process's memory.
    //!
    //! End-to-end transport (guest → bulk port → host drain → TLV
    //! parse) is exercised by the integration test suite under
    //! `tests/`.

    use super::*;

    /// `send_exit` from host context must be a no-op (no panic).
    #[test]
    fn send_exit_from_host_context_is_noop() {
        let _g = IsGuestOverrideGuard::new(false);
        send_exit(0);
        send_exit(-1);
    }

    /// `send_test_result` from host context is a no-op.
    #[test]
    fn send_test_result_from_host_context_is_noop() {
        let _g = IsGuestOverrideGuard::new(false);
        send_test_result(&crate::assert::AssertResult::pass());
    }

    /// `send_payload_metrics` from host context is a no-op.
    #[test]
    fn send_payload_metrics_from_host_context_is_noop() {
        let _g = IsGuestOverrideGuard::new(false);
        let pm = crate::test_support::PayloadMetrics {
            payload_index: 0,
            metrics: vec![],
            exit_code: 0,
        };
        send_payload_metrics(&pm);
    }

    /// `send_profraw` from host context is a no-op.
    #[test]
    fn send_profraw_from_host_context_is_noop() {
        let _g = IsGuestOverrideGuard::new(false);
        send_profraw(b"\x01\x02\x03");
    }

    /// `send_stimulus` from host context is a no-op.
    #[test]
    fn send_stimulus_from_host_context_is_noop() {
        let _g = IsGuestOverrideGuard::new(false);
        send_stimulus(&[0u8; 24]);
    }

    /// `send_raw_payload_output` from host context is a no-op.
    #[test]
    fn send_raw_payload_output_from_host_context_is_noop() {
        let _g = IsGuestOverrideGuard::new(false);
        let raw = crate::test_support::RawPayloadOutput {
            payload_index: 0,
            stdout: String::new(),
            stderr: String::new(),
            hint: None,
            metric_hints: vec![],
            metric_bounds: None,
        };
        send_raw_payload_output(&raw);
    }

    /// `send_sched_exit` from host context is a no-op.
    #[test]
    fn send_sched_exit_from_host_context_is_noop() {
        let _g = IsGuestOverrideGuard::new(false);
        send_sched_exit(0);
        send_sched_exit(-1);
    }

    /// `send_scenario_start` from host context is a no-op.
    #[test]
    fn send_scenario_start_from_host_context_is_noop() {
        let _g = IsGuestOverrideGuard::new(false);
        send_scenario_start();
    }

    /// `send_scenario_end` from host context is a no-op.
    #[test]
    fn send_scenario_end_from_host_context_is_noop() {
        let _g = IsGuestOverrideGuard::new(false);
        send_scenario_end(0);
        send_scenario_end(u64::MAX);
    }

    /// `send_sys_rdy` from host context returns false (no-op +
    /// failure indicator for the retry caller).
    #[test]
    fn send_sys_rdy_from_host_context_returns_false() {
        let _g = IsGuestOverrideGuard::new(false);
        assert!(
            !send_sys_rdy(),
            "host-context call must return false so the guest's \
             retry loop can distinguish 'wrote' from 'noop'"
        );
    }

    /// `send_stdout_chunk` from host context returns false
    /// (no-op + failure indicator), mirroring `send_sys_rdy`.
    #[test]
    fn send_stdout_chunk_from_host_context_returns_false() {
        let _g = IsGuestOverrideGuard::new(false);
        assert!(!send_stdout_chunk(b"hello"));
    }

    /// `send_stderr_chunk` from host context returns false.
    #[test]
    fn send_stderr_chunk_from_host_context_returns_false() {
        let _g = IsGuestOverrideGuard::new(false);
        assert!(!send_stderr_chunk(b"oops"));
    }

    /// `send_sched_log` from host context is a no-op.
    #[test]
    fn send_sched_log_from_host_context_is_noop() {
        let _g = IsGuestOverrideGuard::new(false);
        send_sched_log(b"---SCHED_OUTPUT_START---\n");
    }

    /// `send_lifecycle` from host context is a no-op for every
    /// phase, including the reason-bearing variant.
    #[test]
    fn send_lifecycle_from_host_context_is_noop() {
        let _g = IsGuestOverrideGuard::new(false);
        send_lifecycle(LifecyclePhase::InitStarted, "");
        send_lifecycle(LifecyclePhase::PayloadStarting, "");
        send_lifecycle(LifecyclePhase::SchedulerDied, "");
        send_lifecycle(LifecyclePhase::SchedulerNotAttached, "verifier rejected");
    }

    /// `send_exec_exit` from host context is a no-op.
    #[test]
    fn send_exec_exit_from_host_context_is_noop() {
        let _g = IsGuestOverrideGuard::new(false);
        send_exec_exit(0);
        send_exec_exit(-1);
    }

    /// `send_dmesg` from host context is a no-op.
    #[test]
    fn send_dmesg_from_host_context_is_noop() {
        let _g = IsGuestOverrideGuard::new(false);
        send_dmesg(b"[    0.000000] Linux version 6.16.0\n");
    }

    /// `send_probe_output` from host context is a no-op.
    #[test]
    fn send_probe_output_from_host_context_is_noop() {
        let _g = IsGuestOverrideGuard::new(false);
        send_probe_output(b"{}\n");
    }

    /// `request_snapshot` from host context returns `TransportError`.
    #[test]
    fn request_snapshot_from_host_context_returns_transport_error() {
        let _g = IsGuestOverrideGuard::new(false);
        let r = request_snapshot(0, "tag", std::time::Duration::from_millis(0));
        match r {
            SnapshotRequestResult::TransportError { .. } => {}
            other => panic!("expected TransportError from host context, got {other:?}"),
        }
    }

    /// `request_kernel_op` from host context returns
    /// `TransportError` (mirrors `request_snapshot`'s host-context
    /// gate). The virtio-console port-1 transport is reachable only
    /// from inside the guest; a host-context call must not silently
    /// no-op or panic.
    #[test]
    fn request_kernel_op_from_host_context_returns_transport_error() {
        let _g = IsGuestOverrideGuard::new(false);
        let request = crate::vmm::wire::KernelOpRequestPayload {
            request_id: 0,
            mode: crate::vmm::wire::KernelOpMode::Hot,
            direction: crate::vmm::wire::KernelOpDirection::Write,
            tag: String::new(),
            entries: vec![],
        };
        let r = request_kernel_op(request, std::time::Duration::from_millis(0));
        match r {
            crate::vmm::wire::KernelOpRequestResult::TransportError { .. } => {}
            other => panic!("expected TransportError from host context, got {other:?}"),
        }
    }

    /// `read_bulk_port_frame` rejects a payload whose `length`
    /// exceeds the caller-supplied `max_payload_size` cap. Pins
    /// the parameterized cap introduced for the kernel-op reply
    /// path — a callers passes its own limit and the function
    /// must honour it, NOT the old hardcoded
    /// `size_of::<SnapshotReplyPayload>()` value.
    #[test]
    fn read_bulk_port_frame_respects_caller_supplied_cap() {
        use std::os::unix::io::FromRawFd;
        let mut fds = [0i32; 2];
        // SAFETY: standard pipe(2) call; fds is a valid &mut to a
        // 2-element i32 array. Returning <0 indicates failure.
        let r = unsafe { libc::pipe(fds.as_mut_ptr()) };
        assert_eq!(r, 0, "pipe(2) failed: {}", std::io::Error::last_os_error());
        // SAFETY: pipe(2) just returned the fds; both are open and
        // owned by this scope. From_raw_fd takes ownership so the
        // File closes them on drop.
        let mut read_end = unsafe { std::fs::File::from_raw_fd(fds[0]) };
        let mut write_end = unsafe { std::fs::File::from_raw_fd(fds[1]) };

        // Frame a header with length = 200 but cap at 100. The
        // function must reject WITHOUT reading the (forged) payload.
        let header = ShmMessage {
            msg_type: MSG_TYPE_KERNEL_OP_REPLY,
            length: 200,
            crc32: 0,
            _pad: 0,
        };
        use std::io::Write;
        write_end
            .write_all(header.as_bytes())
            .expect("write forged header");
        drop(write_end);

        let deadline = std::time::Instant::now() + std::time::Duration::from_secs(5);
        let err = read_bulk_port_frame(&mut read_end, 100, deadline)
            .expect_err("cap=100 must reject length=200");
        assert_eq!(err.kind(), std::io::ErrorKind::InvalidData);
        let msg = err.to_string();
        assert!(
            msg.contains("exceeds max payload 100"),
            "error must cite the caller-supplied cap, got: {msg}"
        );
    }

    /// `read_bulk_port_frame` must reject a header whose `length`
    /// exceeds `size_of::<SnapshotReplyPayload>()` BEFORE allocating
    /// the payload buffer. A hostile or corrupted host could otherwise
    /// frame `length = u32::MAX` and cause `vec![0u8; u32::MAX]` to
    /// OOM the guest's PID 1 init, panicking the kernel.
    #[test]
    fn read_bulk_port_frame_rejects_oversized_length_before_alloc() {
        use std::os::unix::io::FromRawFd;
        // Build a pipe, write a forged 16-byte header with
        // length = u32::MAX, then call read_bulk_port_frame on the
        // read side. The function must return InvalidData without
        // attempting to read or allocate the (huge) payload.
        let mut fds = [0i32; 2];
        // SAFETY: standard pipe(2) call; fds is a valid &mut to a
        // 2-element i32 array. Returning <0 indicates failure.
        let r = unsafe { libc::pipe(fds.as_mut_ptr()) };
        assert_eq!(r, 0, "pipe(2) failed: {}", std::io::Error::last_os_error());
        // SAFETY: pipe(2) just returned the fds; both are open and
        // owned by this scope. From_raw_fd takes ownership so the
        // File closes them on drop.
        let mut read_end = unsafe { std::fs::File::from_raw_fd(fds[0]) };
        let mut write_end = unsafe { std::fs::File::from_raw_fd(fds[1]) };

        let header = ShmMessage {
            msg_type: MSG_TYPE_SNAPSHOT_REPLY,
            length: u32::MAX,
            crc32: 0,
            _pad: 0,
        };
        use std::io::Write;
        write_end
            .write_all(header.as_bytes())
            .expect("write forged header");
        // Drop the writer so the reader observes EOF after the
        // header rather than blocking forever on the missing payload.
        drop(write_end);

        let deadline = std::time::Instant::now() + std::time::Duration::from_secs(5);
        let err = read_bulk_port_frame(
            &mut read_end,
            std::mem::size_of::<SnapshotReplyPayload>(),
            deadline,
        )
        .expect_err("oversized length must be rejected");
        assert_eq!(err.kind(), std::io::ErrorKind::InvalidData);
        let msg = err.to_string();
        assert!(
            msg.contains("exceeds max payload"),
            "error must explain the cap, got: {msg}"
        );
    }

    /// `read_bulk_port_frame` must accept a length that exactly
    /// matches `size_of::<SnapshotReplyPayload>()` — the cap is an
    /// upper bound, not a strict-less-than check. This pins the
    /// boundary so a future tightening of the cap would force a
    /// deliberate test update rather than silently breaking the
    /// snapshot-reply path.
    #[test]
    fn read_bulk_port_frame_accepts_exact_max_payload() {
        use std::os::unix::io::FromRawFd;
        let mut fds = [0i32; 2];
        // SAFETY: pipe(2) on a freshly-zeroed 2-element i32 array.
        let r = unsafe { libc::pipe(fds.as_mut_ptr()) };
        assert_eq!(r, 0, "pipe(2) failed: {}", std::io::Error::last_os_error());
        // SAFETY: pipe just returned both fds; ownership transfers
        // to the File handles which close on drop.
        let mut read_end = unsafe { std::fs::File::from_raw_fd(fds[0]) };
        let mut write_end = unsafe { std::fs::File::from_raw_fd(fds[1]) };

        let payload = vec![0u8; std::mem::size_of::<SnapshotReplyPayload>()];
        let header = ShmMessage {
            msg_type: MSG_TYPE_SNAPSHOT_REPLY,
            length: payload.len() as u32,
            crc32: crc32fast::hash(&payload),
            _pad: 0,
        };
        use std::io::Write;
        write_end.write_all(header.as_bytes()).expect("header");
        write_end.write_all(&payload).expect("payload");
        drop(write_end);

        let deadline = std::time::Instant::now() + std::time::Duration::from_secs(5);
        let (msg_type, body) = read_bulk_port_frame(
            &mut read_end,
            std::mem::size_of::<SnapshotReplyPayload>(),
            deadline,
        )
        .expect("exact-size payload must succeed");
        assert_eq!(msg_type, MSG_TYPE_SNAPSHOT_REPLY);
        assert_eq!(body.len(), std::mem::size_of::<SnapshotReplyPayload>());
    }

    #[test]
    fn is_guest_override_round_trips_through_thread_local() {
        // Toggling override should affect is_guest() result.
        {
            let _g = IsGuestOverrideGuard::new(false);
            assert!(!is_guest());
        }
        {
            let _g = IsGuestOverrideGuard::new(true);
            assert!(is_guest());
        }
    }

    #[test]
    fn is_guest_override_guards_nest_correctly() {
        let _outer = IsGuestOverrideGuard::new(true);
        assert!(is_guest());
        {
            let _inner = IsGuestOverrideGuard::new(false);
            assert!(!is_guest());
        }
        // Inner dropped — outer's value is restored.
        assert!(is_guest());
    }
}