ktstr 0.14.0 - Docs.rs

use anyhow::{Context, Result};
use kvm_bindings::{
    KVM_CAP_HALT_POLL, KVM_CAP_SPLIT_IRQCHIP, KVM_CAP_X2APIC_API, KVM_CAP_X86_DISABLE_EXITS,
    KVM_CLOCK_TSC_STABLE, KVM_IRQ_ROUTING_MSI, KVM_PIT_SPEAKER_DUMMY,
    KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK, KVM_X2APIC_API_USE_32BIT_IDS,
    KVM_X86_DISABLE_EXITS_HLT, KVM_X86_DISABLE_EXITS_PAUSE, KvmIrqRouting, kvm_enable_cap,
    kvm_irq_routing, kvm_irq_routing_entry, kvm_irq_routing_entry__bindgen_ty_1,
    kvm_irq_routing_msi, kvm_irq_routing_msi__bindgen_ty_1, kvm_pit_config,
};
use kvm_ioctls::{Cap, Kvm, VcpuFd, VmFd};
use std::mem::ManuallyDrop;
use std::sync::Arc;
use vm_memory::{GuestAddress, GuestMemoryMmap};

use super::ioapic::{IOAPIC_BASE, IOAPIC_SIZE, Ioapic, MsiRoute};
use super::topology::{apic_id, generate_cpuid, max_apic_id};
use crate::vmm::numa_mem::{NumaMemoryLayout, ReservationGuard};
use crate::vmm::pi_mutex::PiMutex;
use crate::vmm::topology::Topology;

/// Physical address where the kernel is loaded.
pub(crate) const KERNEL_LOAD_ADDR: u64 = 0x100000; // 1 MB

/// Physical address of boot parameters (zero page).
pub(crate) const BOOT_PARAMS_ADDR: u64 = 0x7000;

/// Physical address of the kernel command line.
pub(crate) const CMDLINE_ADDR: u64 = 0x20000;

/// Maximum command line length.
pub(crate) const CMDLINE_MAX: usize = 4096;

// ---- Memory layout constants shared by boot.rs and acpi.rs ----

/// Start of the Extended BIOS Data Area (640K - 1K = 0x9FC00).
pub(crate) const EBDA_START: u64 = 0x9FC00;

/// Start of high memory (1 MB). Same value as `KERNEL_LOAD_ADDR`.
pub(crate) const HIMEM_START: u64 = 0x10_0000;

/// Start of PCI MMIO gap (3 GB). Memory below this is usable RAM.
pub(crate) const MMIO_GAP_START: u64 = 0xC000_0000;

/// End of PCI MMIO gap (4 GB). Memory above this resumes as RAM.
pub(crate) const MMIO_GAP_END: u64 = 0x1_0000_0000;

/// Virtio-console MMIO base: start of the MMIO gap.
pub(crate) const VIRTIO_CONSOLE_MMIO_BASE: u64 = MMIO_GAP_START;

/// Virtio-block MMIO base: one page above virtio-console.
/// Each virtio-mmio device occupies `VIRTIO_MMIO_SIZE = 0x1000`.
pub(crate) const VIRTIO_BLK_MMIO_BASE: u64 = MMIO_GAP_START + 0x1000;

/// Virtio-net MMIO base: one page above virtio-blk.
/// Each virtio-mmio device occupies `VIRTIO_MMIO_SIZE = 0x1000`.
pub(crate) const VIRTIO_NET_MMIO_BASE: u64 = MMIO_GAP_START + 0x2000;

/// GSI for virtio-console. On the in-kernel-irqchip path the in-kernel
/// IOAPIC routes this GSI; on split-irqchip (max APIC ID > 254) the
/// userspace IOAPIC translates the guest's RTE for it into an MSI route.
pub(crate) const VIRTIO_CONSOLE_IRQ: u32 = 5;

/// GSI for virtio-block. Routed via the in-kernel IOAPIC (max APIC ID
/// <= 254) or the userspace IOAPIC (split-irqchip, max APIC ID > 254).
/// The IOAPIC's 24-line cap leaves ample free slots after COM1=4, COM2=3,
/// virtio-console=5, virtio-blk=6.
pub(crate) const VIRTIO_BLK_IRQ: u32 = 6;

/// GSI for virtio-net. Routed via the in-kernel IOAPIC (max APIC ID
/// <= 254) or the userspace IOAPIC (split-irqchip, max APIC ID > 254);
/// well within the IOAPIC's 24-line cap.
pub(crate) const VIRTIO_NET_IRQ: u32 = 7;

/// E820 memory type: usable RAM.
pub(crate) const E820_RAM: u32 = 1;

/// Offset from code32_start to 64-bit entry point in bzImage.
pub(crate) const STARTUP64_OFFSET: u64 = 0x200;

/// TSS address — same as Firecracker/libkrun.
const KVM_TSS_ADDRESS: u64 = 0xfffb_d000;

/// Identity map address — placed immediately after the 3-page TSS region
/// (0xfffb_d000 + 0x3000 = 0xfffc_0000).
/// KVM requires this to be set before creating vCPUs on x86_64.
const KVM_IDENTITY_MAP_ADDRESS: u64 = KVM_TSS_ADDRESS + 3 * 4096;

/// IOAPIC supports 24 input pins (IRQ 0-23). Passed as `args[0]` when
/// enabling `KVM_CAP_SPLIT_IRQCHIP`.
const NUM_IOAPIC_PINS: u64 = 24;

/// APIC IDs above this require x2APIC mode (8-bit xAPIC limit). A topology
/// whose max APIC ID exceeds this value is built with the split IRQ chip.
pub(crate) const MAX_XAPIC_ID: u32 = 254;

/// Per-VM halt poll interval (nanoseconds) for non-performance_mode VMs,
/// applied via `KVM_CAP_HALT_POLL`.
/// Matches the x86 kernel default (KVM_HALT_POLL_NS_DEFAULT in
/// arch/x86/include/asm/kvm_host.h). For overcommitted topologies (more
/// vCPUs than online host CPUs) the constructor passes 0 instead of this
/// value, because halt polling wastes host CPU time there.
const HALT_POLL_NS: u64 = 200_000;

/// KVM capabilities the host must provide — the same 14 Firecracker checks.
/// VM construction probes each one with `check_extension` and fails with an
/// error naming the first capability that is missing.
const REQUIRED_CAPS: &[Cap] = &[
    Cap::Irqchip,
    Cap::Ioeventfd,
    Cap::Irqfd,
    Cap::UserMemory,
    Cap::SetTssAddr,
    Cap::Pit2,
    Cap::PitState2,
    Cap::AdjustClock,
    Cap::Debugregs,
    Cap::MpState,
    Cap::VcpuEvents,
    Cap::Xcrs,
    Cap::Xsave,
    Cap::ExtCpuid,
];

/// A KVM virtual machine with configured topology.
///
/// `kvm`, `vm_fd` and `guest_mem` are wrapped in `ManuallyDrop` because the
/// `Drop` impl releases them explicitly, in a fixed order, rather than
/// relying on field declaration order.
#[allow(dead_code)] // configuration fields read conditionally; reservation held for RAII drop
pub struct KtstrKvm {
    /// Handle to `/dev/kvm`. Released last by the `Drop` impl.
    pub kvm: ManuallyDrop<Kvm>,
    /// VM file descriptor. Released by the `Drop` impl after the vCPU fds
    /// and before guest memory.
    pub vm_fd: ManuallyDrop<VmFd>,
    /// vCPU fds in creation order, i.e. indexed by cpu_id. Each vCPU's KVM
    /// vcpu_id is its APIC ID, which can differ from its index here.
    pub vcpus: Vec<VcpuFd>,
    /// Guest memory. In deferred mode this is a 4 KiB placeholder until
    /// `allocate_and_register_memory()` replaces it.
    pub guest_mem: ManuallyDrop<GuestMemoryMmap>,
    /// Topology the VM was constructed with.
    pub topology: Topology,
    /// Per-node GPA layout used by ACPI SRAT/HMAT generation. `None`
    /// in deferred mode before `allocate_and_register_memory()`.
    pub(crate) numa_layout: Option<NumaMemoryLayout>,
    /// Whether KVM supports the immediate_exit mechanism (KVM_CAP_IMMEDIATE_EXIT).
    pub has_immediate_exit: bool,
    /// Split IRQ chip mode: LAPIC in kernel, PIC/IOAPIC emulated in userspace.
    /// Enabled when any APIC ID exceeds the 8-bit xAPIC limit (254).
    pub(crate) split_irqchip: bool,
    /// Userspace IOAPIC device, present only on the split-irqchip path.
    /// The run loops wrap it in an [`IoapicHandle`] (device + raw VM fd) to
    /// service IOAPIC MMIO and reprogram MSI routes; `None` when the max
    /// APIC ID is <= 254, in which case the in-kernel IOAPIC is used.
    pub(crate) ioapic: Option<Arc<PiMutex<Ioapic>>>,
    /// Whether hugepages were requested at construction time.
    /// Stored so deferred memory allocation uses the same backing.
    use_hugepages: bool,
    /// Performance mode flag. Stored so deferred memory allocation
    /// can check hugepage availability fresh when memory_mib was
    /// unknown at construction time.
    performance_mode: bool,
    /// Owns the VA reservation for per-node MAP_FIXED mmaps.
    /// Drop munmaps the entire reservation. `None` until guest memory has
    /// been allocated.
    _reservation: Option<ReservationGuard>,
    /// RAII guards for COW-overlayed initramfs segments. Each guard
    /// holds the lz4 SHM fd with `LOCK_SH`; dropping it releases the
    /// flock and closes the fd. Must drop AFTER `_reservation` so the
    /// COW VMAs are torn down (via the reservation's munmap) before
    /// the flock is released — otherwise a concurrent writer could
    /// take `LOCK_EX` and truncate the segment while the guest still
    /// holds pages that fault through the backing file.
    pub(crate) cow_overlay_guards: Vec<crate::vmm::initramfs::CowOverlayGuard>,
}

impl Drop for KtstrKvm {
    // Ordered teardown: vCPU fds → VM fd → guest memory →
    // VA reservation → COW flock guards → /dev/kvm.
    //
    // Closing VmFd triggers kvm_destroy_vm which calls
    // mmu_notifier_unregister (synchronous SRCU wait). All
    // KVM references to this process's page tables are removed
    // before the guest memory munmap fires, preventing stale
    // mmu_notifier callbacks from racing with the unmap.
    //
    // Only the three `ManuallyDrop::drop` calls need `unsafe`; everything
    // else is ordinary safe code and is kept outside the unsafe blocks.
    fn drop(&mut self) {
        // 1. vCPU fds go first, while the VM fd is still open.
        drop(std::mem::take(&mut self.vcpus));

        // 2. VM fd.
        // SAFETY: `vm_fd` is dropped nowhere else, `drop` runs at most once
        // per value, and the field is not touched again below.
        unsafe { ManuallyDrop::drop(&mut self.vm_fd) };

        // 3. Guest memory, only after the VM fd is closed.
        // SAFETY: `guest_mem` always holds a live value here (the only other
        // code that releases it, `allocate_and_register_memory`, installs a
        // replacement), and the field is not touched again below.
        unsafe { ManuallyDrop::drop(&mut self.guest_mem) };

        // 4. VA reservation (munmaps the per-node mappings), then
        // 5. the COW flock guards, which must outlive the reservation.
        drop(self._reservation.take());
        drop(std::mem::take(&mut self.cow_overlay_guards));

        // 6. /dev/kvm last.
        // SAFETY: `kvm` is dropped nowhere else and is not used after this
        // point; this is the final statement of `drop`.
        unsafe { ManuallyDrop::drop(&mut self.kvm) };
    }
}

impl KtstrKvm {
    /// Build a KVM VM for `topo` with `memory_mib` mebibytes of guest
    /// memory allocated up front and no hugepage request.
    pub fn new(topo: Topology, memory_mib: u32, performance_mode: bool) -> Result<Self> {
        let use_hugepages = false;
        Self::new_inner(topo, Some(memory_mib), use_hugepages, performance_mode)
    }

    /// Build a KVM VM for `topo` with `memory_mib` mebibytes of guest
    /// memory allocated up front, requesting hugepage backing.
    pub fn new_with_hugepages(
        topo: Topology,
        memory_mib: u32,
        performance_mode: bool,
    ) -> Result<Self> {
        let eager_memory = Some(memory_mib);
        Self::new_inner(topo, eager_memory, true, performance_mode)
    }

    /// Build a KVM VM whose guest memory is allocated later.
    ///
    /// Everything that does not depend on the guest memory size is set up
    /// here: /dev/kvm, VM fd, TSS, identity map, IRQ chip, vCPUs, and CPUID.
    /// Call `allocate_and_register_memory` afterwards to provide the real
    /// memory; `use_hugepages` and `performance_mode` are remembered for it.
    pub fn new_deferred(
        topo: Topology,
        use_hugepages: bool,
        performance_mode: bool,
    ) -> Result<Self> {
        let memory_not_sized_yet: Option<u32> = None;
        Self::new_inner(topo, memory_not_sized_yet, use_hugepages, performance_mode)
    }

    /// Allocate guest memory and register it with KVM.
    ///
    /// Should be called exactly once on a VM created with
    /// `new_deferred`; calling twice unconditionally replaces the
    /// backing memory. Replaces the placeholder guest memory with a
    /// real allocation of `memory_mib` mebibytes and sets
    /// `numa_layout` to the computed per-node GPA layout. Re-checks
    /// hugepage availability when performance_mode is set, since
    /// memory_mib was unknown at construction time and `use_hugepages`
    /// may have been false.
    ///
    /// # Errors
    ///
    /// Propagates any error from computing the NUMA layout or from
    /// allocating/registering the memory; `self` is left unchanged in
    /// that case because nothing is replaced until both have succeeded.
    pub fn allocate_and_register_memory(&mut self, memory_mib: u32) -> Result<()> {
        let layout = NumaMemoryLayout::compute(
            &self.topology,
            memory_mib,
            0,
            Some((MMIO_GAP_START, MMIO_GAP_END)),
        )?;
        let alloc =
            layout.allocate_and_register(&self.vm_fd, self.use_hugepages, self.performance_mode)?;
        // Swap the new memory in and release the previous value (the
        // placeholder on the first call). `mem::replace` plus
        // `ManuallyDrop::into_inner` does this without `unsafe` and without
        // ever leaving an already-dropped value in `self.guest_mem`.
        let incoming = ManuallyDrop::new(alloc.guest_mem);
        let previous = std::mem::replace(&mut self.guest_mem, incoming);
        drop(ManuallyDrop::into_inner(previous));
        // Assigning here drops any earlier reservation, after the memory
        // that lived inside it has been released above.
        self._reservation = Some(alloc.reservation);
        self.numa_layout = Some(layout);
        Ok(())
    }

    /// Shared constructor behind `new`, `new_with_hugepages` and
    /// `new_deferred`.
    ///
    /// In order: opens /dev/kvm and checks `REQUIRED_CAPS`; creates the VM;
    /// sets the TSS and identity-map addresses; sets up the IRQ chip (split
    /// irqchip + x2APIC API when the max APIC ID exceeds `MAX_XAPIC_ID`,
    /// otherwise the full in-kernel irqchip + PIT); applies either the
    /// performance_mode exit tuning or the halt-poll interval; allocates and
    /// registers guest memory (or installs a 4 KiB placeholder when
    /// `memory_mib` is `None`); validates the RAM top and the vCPU limits
    /// against the host; creates one vCPU per topology CPU with its own
    /// CPUID; and, in performance_mode, probes TSC stability.
    ///
    /// # Errors
    ///
    /// Host-capability limits (RAM top above MAXPHYADDR, too many vCPUs, APIC
    /// ID above the host's max vcpu id) are returned as `ResourceContention`.
    /// Several KVM setup calls are routed through
    /// `map_transient_to_contention`; other failures are returned with
    /// context. Failures to apply the optional tuning caps or to probe the
    /// clock only print a warning to stderr.
    fn new_inner(
        topo: Topology,
        memory_mib: Option<u32>,
        use_hugepages: bool,
        performance_mode: bool,
    ) -> Result<Self> {
        let kvm = Kvm::new().context("open /dev/kvm")?;

        // Check required capabilities (Firecracker pattern)
        for &cap in REQUIRED_CAPS {
            anyhow::ensure!(
                kvm.check_extension(cap),
                "KVM missing required capability: {:?}",
                cap
            );
        }

        // Optional capability: recorded for the caller, never required.
        let has_immediate_exit = kvm.check_extension(Cap::ImmediateExit);

        let vm_fd = crate::vmm::create_vm_with_retry(&kvm)?;

        // TSS (required on x86_64 before creating vCPUs). Maps
        // transient host errnos (ENOMEM, EBUSY) into
        // ResourceContention so the macro SKIPs cleanly instead of
        // panicking under host-resource pressure.
        vm_fd
            .set_tss_address(KVM_TSS_ADDRESS as usize)
            .map_err(|e| crate::vmm::map_transient_to_contention(e, "set TSS"))?;

        // Identity map — immediately after the 3-page TSS region
        // (KVM_TSS_ADDRESS + 3 * 4096).
        // Must be set before creating vCPUs.
        vm_fd
            .set_identity_map_address(KVM_IDENTITY_MAP_ADDRESS)
            .map_err(|e| crate::vmm::map_transient_to_contention(e, "set identity map address"))?;

        // Determine whether any APIC ID exceeds the 8-bit xAPIC limit.
        // If so, use split IRQ chip (LAPIC-only in kernel) + x2APIC API.
        let max_apic_id = max_apic_id(&topo);
        let split_irqchip = max_apic_id > MAX_XAPIC_ID;

        if split_irqchip {
            // Split IRQ chip: only LAPIC is emulated in kernel.
            // PIC and IOAPIC are not created — userspace handles them.
            let mut cap = kvm_enable_cap {
                cap: KVM_CAP_SPLIT_IRQCHIP,
                ..Default::default()
            };
            cap.args[0] = NUM_IOAPIC_PINS;
            // KVM_CAP_SPLIT_IRQCHIP allocates the in-kernel LAPIC
            // tables and sets up the userspace IRQ routing slots.
            // ENOMEM under host pressure is transient — route through
            // the contention classifier so the macro SKIPs cleanly.
            vm_fd
                .enable_cap(&cap)
                .map_err(|e| crate::vmm::map_transient_to_contention(e, "enable split IRQ chip"))?;

            // Enable x2APIC API for 32-bit destination IDs and correct
            // broadcast behavior with APIC IDs > 254.
            let mut cap = kvm_enable_cap {
                cap: KVM_CAP_X2APIC_API,
                ..Default::default()
            };
            cap.args[0] =
                (KVM_X2APIC_API_USE_32BIT_IDS | KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) as u64;
            vm_fd.enable_cap(&cap).context("enable x2APIC API")?;
        } else {
            // Full IRQ chip (PIC + IOAPIC + LAPIC) — must exist before KVM_CREATE_VCPU
            vm_fd
                .create_irq_chip()
                .map_err(|e| crate::vmm::map_transient_to_contention(e, "create IRQ chip"))?;

            // PIT (timer) with dummy speaker port.
            // Only created with full IRQ chip — PIT routes through the in-kernel
            // IOAPIC (IRQ 0 -> GSI 2). With split IRQ chip there is no in-kernel
            // IOAPIC, so PIT creation fails.
            let pit_config = kvm_pit_config {
                flags: KVM_PIT_SPEAKER_DUMMY,
                ..Default::default()
            };
            vm_fd
                .create_pit2(pit_config)
                .map_err(|e| crate::vmm::map_transient_to_contention(e, "create PIT"))?;
        }

        // Userspace IOAPIC device for the split-irqchip path (no in-kernel
        // IOAPIC there). `None` when the max APIC ID is <= 254. The run loops
        // build an IoapicHandle around this to translate guest RTE writes into
        // MSI routes; see `super::ioapic` and `IoapicHandle`.
        let ioapic = split_irqchip.then(|| Arc::new(PiMutex::new(Ioapic::new())));

        // Disable PAUSE and HLT VM exits in performance mode.
        // Two separate enable_cap calls: kvm_disable_exits() uses |=
        // (additive), so multiple calls accumulate. Separate calls
        // ensure PAUSE succeeds unconditionally even if HLT is rejected.
        //
        // PAUSE: reduces vmexit overhead during guest spinlocks.
        //        Unconditionally allowed by KVM.
        // HLT:   eliminates the most frequent exit type during boot/idle.
        //        BSP shutdown uses I8042 reset (port 0x64, 0xFE via
        //        reboot=k) and VcpuExit::Shutdown, not VcpuExit::Hlt.
        //        KVM blocks HLT disable when mitigate_smt_rsb is active
        //        (host has X86_BUG_SMT_RSB and cpu_smt_possible()).
        //
        // Both are best-effort: a failure only prints a warning.
        if performance_mode {
            let mut cap = kvm_enable_cap {
                cap: KVM_CAP_X86_DISABLE_EXITS,
                ..Default::default()
            };

            // 1. PAUSE — always allowed.
            cap.args[0] = KVM_X86_DISABLE_EXITS_PAUSE as u64;
            if let Err(e) = vm_fd.enable_cap(&cap) {
                eprintln!(
                    "performance_mode: WARNING: \
                     KVM_CAP_X86_DISABLE_EXITS (PAUSE) not supported: {e}"
                );
            }

            // 2. HLT — may fail on mitigate_smt_rsb hosts.
            cap.args[0] = KVM_X86_DISABLE_EXITS_HLT as u64;
            if let Err(e) = vm_fd.enable_cap(&cap) {
                eprintln!(
                    "performance_mode: WARNING: \
                     KVM_CAP_X86_DISABLE_EXITS (HLT) rejected: {e}"
                );
            }
        }

        // Set per-VM halt poll interval. Skipped in performance_mode:
        // KVM_HINTS_REALTIME enables guest haltpoll cpuidle, which writes
        // MSR_KVM_POLL_CONTROL=0 per-vCPU (arch_haltpoll_enable →
        // kvm_disable_host_haltpoll), disabling host halt polling via
        // kvm_arch_no_poll(). KVM_CAP_HALT_POLL is redundant there.
        //
        // When vCPUs exceed online host CPUs (overcommit), halt polling
        // wastes host CPU time — disable it. A failed sysconf (<= 0) is
        // treated the same as overcommit: polling is disabled.
        if !performance_mode {
            let host_cpus = unsafe { libc::sysconf(libc::_SC_NPROCESSORS_ONLN) };
            let poll_ns: u64 = if host_cpus > 0 && topo.total_cpus() <= host_cpus as u32 {
                HALT_POLL_NS
            } else {
                0
            };
            let mut cap = kvm_enable_cap {
                cap: KVM_CAP_HALT_POLL,
                ..Default::default()
            };
            cap.args[0] = poll_ns;
            if let Err(e) = vm_fd.enable_cap(&cap) {
                eprintln!(
                    "kvm: WARNING: KVM_CAP_HALT_POLL not supported ({e}), using kernel default"
                );
            }
        }

        // Eager mode computes the NUMA layout and registers real memory now;
        // deferred mode installs a 4 KiB placeholder that is not registered
        // with KVM here.
        //
        // NOTE(review): on the early-return paths below (`?` and the explicit
        // `return Err`), locals drop in reverse declaration order, so
        // `reservation` and `guest_mem` are released before `vm_fd` — the
        // opposite of the order the `Drop` impl documents. Confirm that is
        // acceptable for these error paths.
        let (guest_mem, numa_layout, reservation) = match memory_mib {
            Some(mb) => {
                let layout =
                    NumaMemoryLayout::compute(&topo, mb, 0, Some((MMIO_GAP_START, MMIO_GAP_END)))?;
                let alloc =
                    layout.allocate_and_register(&vm_fd, use_hugepages, performance_mode)?;
                (alloc.guest_mem, Some(layout), Some(alloc.reservation))
            }
            None => {
                let placeholder = GuestMemoryMmap::<()>::from_ranges(&[(GuestAddress(0), 4096)])
                    .context("allocate placeholder guest memory")?;
                (placeholder, None, None)
            }
        };

        // Fetch host CPUID once, reuse for all vCPUs (Firecracker pattern).
        let base_cpuid = kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .context("get_supported_cpuid")?;

        // Reject (rather than silently truncate) a VM whose RAM relocates
        // above the guest's addressable physical space. generate_cpuid leaves
        // CPUID leaf 0x8000_0008 EAX = the host's MAXPHYADDR (it patches only
        // ECX), so the guest's phys-addr width is the host's. If the relocated
        // RAM top exceeds 1<<phys_bits, the guest kernel silently caps
        // last_pfn at max_arch_pfn (e820__end_ram_pfn) and boots with less RAM
        // than advertised — a silent data drop. Surface it as a
        // host-capability skip, same class as the max_vcpus check below.
        //
        // Reference note: qemu likewise rejects (its phys-bits-too-low
        // hard-fail), never truncates. cloud-hypervisor instead caps the
        // guest MAXPHYADDR at 46 and sizes the MMIO/device area from it (migration
        // portability); ktstr has no migration, so it exposes the host's
        // MAXPHYADDR and rejects. libkrun and firecracker also leave the
        // host's MAXPHYADDR but lack this RAM bound, so the check is a
        // correct superset. Bounds the RAM top only; widen to max(RAM, MMIO)
        // top if a high MMIO window above RAM is ever added.
        //
        // Skipped in deferred mode (no layout yet). Falls back to 36 bits
        // when the host CPUID has no leaf 0x8000_0008.
        if let Some(layout) = &numa_layout {
            let phys_bits = base_cpuid
                .as_slice()
                .iter()
                .find(|e| e.function == 0x8000_0008)
                .map(|e| e.eax & 0xff)
                .unwrap_or(36);
            if let Some(top) = layout.ram_top_exceeds_phys_bits(phys_bits) {
                return Err(anyhow::Error::new(
                    crate::vmm::host_topology::ResourceContention {
                        reason: format!(
                            "guest RAM top {top:#x} exceeds the guest MAXPHYADDR \
                             (1<<{phys_bits}); this host's physical-address \
                             width cannot back a VM this large without the guest \
                             silently truncating RAM"
                        ),
                    },
                ));
            }
        }

        // Create vCPUs with topology-specific CPUID. KVM_CREATE_VCPU
        // allocates per-vCPU kernel memory (struct kvm_vcpu, kvm_run
        // page, posted-interrupt descriptor); EMFILE / ENOMEM here is
        // host-resource pressure, not a test fault — route through
        // the contention classifier so the macro SKIPs cleanly.
        let total = topo.total_cpus();
        // A topology wider than the host's KVM_CAP_MAX_VCPUS cannot run
        // here; surface it as a clean skip (a host-capability limit, same
        // SKIP class as overcommit) before the per-vCPU create loop, rather
        // than a mid-loop create_vcpu errno. KVM_CAP_MAX_VCPUS is
        // host-dependent (commonly 1024; CONFIG_KVM_MAX_NR_VCPUS,
        // arch/x86/kvm/Kconfig).
        let max_vcpus = kvm.get_max_vcpus();
        if total as usize > max_vcpus {
            return Err(anyhow::Error::new(
                crate::vmm::host_topology::ResourceContention {
                    reason: format!(
                        "topology requires {total} vCPUs but this host's \
                     KVM_CAP_MAX_VCPUS is {max_vcpus}; cannot run a VM this wide"
                    ),
                },
            ));
        }
        // The vcpu_id passed below is apic_id(topo, cpu_id), whose sparse
        // range can exceed the vCPU count; KVM_CREATE_VCPU requires the id be
        // < KVM_CAP_MAX_VCPU_ID. Skip cleanly if the host's cap is too low,
        // same class as the max_vcpus check above.
        // `max_apic_id` is the u32 already bound above for the split-irqchip
        // decision; reuse it.
        let max_vcpu_id = kvm.get_max_vcpu_id();
        if (max_apic_id as usize) >= max_vcpu_id {
            return Err(anyhow::Error::new(
                crate::vmm::host_topology::ResourceContention {
                    reason: format!(
                        "topology's max APIC ID {max_apic_id} (the KVM vcpu_id) is \
                         >= this host's KVM_CAP_MAX_VCPU_ID {max_vcpu_id}; cannot \
                         create a vCPU at that ID"
                    ),
                },
            ));
        }
        let mut vcpus = Vec::with_capacity(total as usize);
        for cpu_id in 0..total {
            // vcpu_id = apic_id, not cpu_id: KVM hardwires the in-kernel LAPIC
            // x2apic_id to vcpu_id (arch/x86/kvm/lapic.c kvm_apic_set_x2apic_id,
            // read-only), and an MSI/IPI dest plus the guest's read_apic_id()
            // resolve against it. The CPUID/MADT advertise the sparse apic_id,
            // so vcpu_id must equal it or sparse APIC IDs are unrouteable. The
            // vcpus Vec stays indexed by cpu_id (push order); only the KVM
            // vcpu_id changes (a no-op for dense topologies where apic==cpu_id).
            let aid = apic_id(&topo, cpu_id);
            let vcpu = vm_fd.create_vcpu(aid as u64).map_err(|e| {
                crate::vmm::map_transient_to_contention(
                    e,
                    format!("create vCPU cpu_id={cpu_id} apic_id={aid}"),
                )
            })?;

            // Per-vCPU CPUID derived from the shared host CPUID fetched above.
            let cpuid_entries =
                generate_cpuid(base_cpuid.as_slice(), &topo, cpu_id, performance_mode);
            let cpuid = kvm_bindings::CpuId::from_entries(&cpuid_entries).context("build CpuId")?;
            vcpu.set_cpuid2(&cpuid)
                .with_context(|| format!("set CPUID for vCPU {cpu_id}"))?;

            vcpus.push(vcpu);
        }

        // Check TSC stability via KVM_GET_CLOCK. An unstable TSC
        // (missing KVM_CLOCK_TSC_STABLE) means kvmclock falls back to
        // host-side timekeeping per-vCPU, adding overhead to
        // clock_gettime and degrading timer accuracy. Common in nested
        // virtualization where the L0 hypervisor does not expose
        // constant TSC to L1.
        //
        // Only checked in performance_mode: non-perf tests use binary
        // pass/fail (cpuset, starvation) where timing precision doesn't
        // affect results.
        //
        // A get→set→get roundtrip is required: use_master_clock
        // starts false and is only evaluated by
        // pvclock_update_vm_gtod_copy(). That function is called by
        // kvm_vm_ioctl_set_clock() but NOT by kvm_vm_ioctl_get_clock()
        // or vCPU creation. Without the set_clock() call, get_clock()
        // always returns flags=0 regardless of actual TSC stability.
        //
        // Flags must be cleared before set_clock(): get_clock() may
        // set KVM_CLOCK_REALTIME, and set_clock() applies a realtime
        // adjustment when that flag is present (x86.c:7209-7215),
        // double-counting elapsed time. KVM_CLOCK_TSC_STABLE and
        // KVM_CLOCK_HOST_TSC are output-only and ignored by set_clock().
        //
        // The probe is advisory: every failure path only prints a warning
        // and construction still succeeds.
        if performance_mode {
            match vm_fd.get_clock() {
                Ok(clock) => {
                    let mut set_data = clock;
                    set_data.flags = 0;
                    if let Err(e) = vm_fd.set_clock(&set_data) {
                        eprintln!(
                            "performance_mode: WARNING: KVM_SET_CLOCK failed ({e}), \
                             cannot check TSC stability"
                        );
                    } else {
                        match vm_fd.get_clock() {
                            Ok(clock2) => {
                                if clock2.flags & KVM_CLOCK_TSC_STABLE == 0 {
                                    eprintln!(
                                        "performance_mode: WARNING: TSC not stable \
                                         (KVM_CLOCK_TSC_STABLE not set), \
                                         timing measurements may have higher variance \
                                         (nested virt?)."
                                    );
                                }
                            }
                            Err(e) => {
                                eprintln!(
                                    "performance_mode: WARNING: KVM_GET_CLOCK failed ({e}), \
                                     cannot check TSC stability"
                                );
                            }
                        }
                    }
                }
                Err(e) => {
                    eprintln!(
                        "performance_mode: WARNING: KVM_GET_CLOCK failed ({e}), \
                         cannot check TSC stability"
                    );
                }
            }
        }

        // Hand everything to the struct; from here on teardown order is
        // governed by the `Drop` impl.
        Ok(KtstrKvm {
            kvm: ManuallyDrop::new(kvm),
            vm_fd: ManuallyDrop::new(vm_fd),
            vcpus,
            guest_mem: ManuallyDrop::new(guest_mem),
            topology: topo,
            numa_layout,
            has_immediate_exit,
            split_irqchip,
            ioapic,
            use_hugepages,
            performance_mode,
            _reservation: reservation,
            cow_overlay_guards: Vec::new(),
        })
    }
}

/// Issue `KVM_GET_CLOCK` directly with `libc::ioctl` on a raw VM fd.
///
/// Counterpart to the safe-wrapper probe done at boot. The freeze
/// coordinator (see [`crate::vmm::freeze_coord`]) needs it for the freeze
/// rendezvous save/restore: its `freeze_and_capture` and `thaw_and_barrier`
/// sibling closures cannot borrow `&vm.vm_fd` (vm is consumed by a
/// downstream closure in the same scope), so they work from the raw fd
/// (Copy) cached at coord-thread spawn time.
///
/// Equivalent to `kvm_ioctls::VmFd::get_clock`: same ioctl number
/// (`KVM_GET_CLOCK = KVMIO | 0x7c`), same `kvm_clock_data` payload, same
/// error mapping. On the host side the ioctl path
/// (`arch/x86/kvm/x86.c kvm_vm_ioctl_get_clock` → `get_kvmclock`) is a pure
/// seqcount read with no lock acquisition. Saving and restoring the clock
/// keeps the guest's post-resume kvm_clock view at the parked-state value
/// instead of the freeze-advanced host monotonic. The planned per-vCPU
/// `KVM_KVMCLOCK_CTRL` emit at freeze entry is complementary: it sets
/// `PVCLOCK_GUEST_STOPPED` so the guest's soft-lockup watchdog
/// (`pvclock_touch_watchdogs` in `arch/x86/kernel/pvclock.c`) skips the
/// freeze interval and does not fire on long freezes.
///
/// # Errors
///
/// Returns `std::io::Error::last_os_error()` when the ioctl returns a
/// negative value.
pub(crate) fn kvm_get_clock_via_raw_fd(
    vm_fd: i32,
) -> std::io::Result<kvm_bindings::kvm_clock_data> {
    // The request number below is KVMIO | 0x7c built with ioctl_ior_nr!
    // (kvm-ioctls 0.24.0, kvm_ioctls.rs:109) and has the payload size baked
    // in as `_IOC_SIZE` = 0x30 = 48 bytes: 8 (clock) + 4 (flags) + 4 (pad0)
    // + 8 (realtime) + 8 (host_tsc) + 4*4 (pad). Should kvm-bindings ever
    // change the struct size, the hard-coded number would no longer match
    // what the kernel expects and the call would fail at runtime, so the
    // size is pinned at compile time instead.
    const _: () = assert!(std::mem::size_of::<kvm_bindings::kvm_clock_data>() == 48);
    const KVM_GET_CLOCK_IOCTL: libc::c_ulong = 0x8030_ae7c;

    let mut data = kvm_bindings::kvm_clock_data::default();
    let out_ptr: *mut kvm_bindings::kvm_clock_data = &mut data;
    // SAFETY: `vm_fd` is a valid kvm_vmfd — the caller is the freeze
    // coordinator, which took it from vm.vm_fd.as_raw_fd() at
    // closure-definition time, and the fd stays alive for the duration of
    // `run_vm`. `kvm_clock_data` is `#[repr(C)]` POD and the kernel writes
    // <= sizeof::<kvm_clock_data>() bytes through `out_ptr`.
    let status = unsafe { libc::ioctl(vm_fd, KVM_GET_CLOCK_IOCTL, out_ptr) };
    if status < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(data)
}

/// Call `KVM_SET_CLOCK` via a raw VM fd (libc::ioctl direct).
/// Sibling of [`kvm_get_clock_via_raw_fd`] for the restore-side of
/// the freeze rendezvous kvm_clock save/restore. Mirrors
/// `kvm_ioctls::VmFd::set_clock`. The underlying ioctl path
/// (`arch/x86/kvm/x86.c kvm_vm_ioctl_set_clock`) takes the
/// `pvclock_sc` seqcount write side, recomputes
/// `master_kernel_ns`, sets `ka->kvmclock_offset = data.clock -
/// now_raw_ns`, then queues `KVM_REQ_CLOCK_UPDATE` on every vCPU
/// (processed at the next KVM_RUN entry per-vCPU).
///
/// Caller MUST clear `flags` to 0 before calling (per the
/// boot-time precedent above) — leaving `KVM_CLOCK_REALTIME` in
/// flags causes the kernel to apply a realtime adjustment that
/// double-counts elapsed time.
///
/// # Errors
///
/// Returns `std::io::Error::last_os_error()` when the ioctl returns a
/// negative value.
pub(crate) fn kvm_set_clock_via_raw_fd(
    vm_fd: i32,
    clock: &kvm_bindings::kvm_clock_data,
) -> std::io::Result<()> {
    // KVMIO | 0x7b, ioctl_iow_nr! per kvm-ioctls 0.24.0
    // kvm_ioctls.rs:106. The request number encodes the payload size
    // (`_IOC_SIZE` = 0x30 = 48 bytes), so pin the struct size at compile
    // time here too: if kvm-bindings ever changes `kvm_clock_data`, this
    // hard-coded number would stop matching what the kernel expects.
    const _: () = assert!(std::mem::size_of::<kvm_bindings::kvm_clock_data>() == 48);
    const KVM_SET_CLOCK_IOCTL: libc::c_ulong = 0x4030_ae7b;
    // SAFETY: `vm_fd` is a valid kvm_vmfd (see SAFETY note on
    // [`kvm_get_clock_via_raw_fd`]). The kernel reads exactly
    // sizeof::<kvm_clock_data>() bytes from the pointer; the
    // payload is `#[repr(C)]` POD.
    let rc = unsafe {
        libc::ioctl(
            vm_fd,
            KVM_SET_CLOCK_IOCTL,
            clock as *const kvm_bindings::kvm_clock_data,
        )
    };
    if rc < 0 {
        Err(std::io::Error::last_os_error())
    } else {
        Ok(())
    }
}

/// Issue `KVM_SET_GSI_ROUTING` directly with `libc::ioctl` on a raw VM fd.
///
/// Companion to [`kvm_set_clock_via_raw_fd`] for the userspace-IOAPIC path
/// (split-irqchip, max APIC ID > 254): an AP run loop holds only a cached
/// Copy raw vm fd, not `&vm.vm_fd`, and has to reprogram the device MSI
/// routes whenever the guest writes the IOAPIC redirection table.
/// Equivalent to `kvm_ioctls::VmFd::set_gsi_routing` — same ioctl, same
/// `kvm_irq_routing` FAM payload.
///
/// `KVM_SET_GSI_ROUTING` replaces the whole table (virt/kvm/irqchip.c
/// `kvm_set_irq_routing`, under `kvm->irq_lock` plus an SRCU grace period),
/// so `routing` must contain the COMPLETE route set, not a delta.
///
/// # Errors
///
/// Returns `std::io::Error::last_os_error()` when the ioctl returns a
/// negative value.
pub(crate) fn kvm_set_gsi_routing_via_raw_fd(
    vm_fd: i32,
    routing: &KvmIrqRouting,
) -> std::io::Result<()> {
    // Request number: ioctl_iow_nr!(KVM_SET_GSI_ROUTING, KVMIO=0xAE, 0x6a,
    // kvm_irq_routing) = _IOW(0xAE, 0x6a, size_of::<kvm_irq_routing>())
    // = 0x4008_AE6A. Only the FAM HEADER size is encoded (nr:u32 +
    // flags:u32 = 8); the kernel reads `nr` entries past the pointer.
    const _: () = assert!(std::mem::size_of::<kvm_irq_routing>() == 8);
    const KVM_SET_GSI_ROUTING_IOCTL: libc::c_ulong = 0x4008_AE6A;

    let table: *const kvm_irq_routing = routing.as_fam_struct_ref();
    // SAFETY: `vm_fd` is a live kvm vm fd (cached from vm.vm_fd.as_raw_fd(),
    // valid for the run loop's lifetime). `table` comes from
    // `as_fam_struct_ref()` and points at a `kvm_irq_routing` whose `nr`
    // matches the entries the kernel reads.
    let status = unsafe { libc::ioctl(vm_fd, KVM_SET_GSI_ROUTING_IOCTL, table) };
    match status {
        failed if failed < 0 => Err(std::io::Error::last_os_error()),
        _ => Ok(()),
    }
}

/// Translate the IOAPIC's `(gsi, MsiRoute)` set into a
/// `KVM_SET_GSI_ROUTING` table.
///
/// Every output entry is a `KVM_IRQ_ROUTING_MSI` route carrying the
/// extended-destination MSI the IOAPIC translated the guest's RTE into;
/// `flags` and all padding are zero. An empty `routes` slice yields a table
/// with no entries.
///
/// # Errors
///
/// Fails when the `kvm_irq_routing` FAM wrapper cannot be allocated for
/// `routes.len()` entries.
fn build_device_msi_routing(routes: &[(u32, MsiRoute)]) -> Result<KvmIrqRouting> {
    let route_count = routes.len();
    let mut table = KvmIrqRouting::new(route_count).map_err(|e| {
        anyhow::anyhow!(
            "allocate kvm_irq_routing for {} routes: {e:?}",
            routes.len()
        )
    })?;
    // The table was allocated with exactly `route_count` entries, so pairing
    // slots with routes one-to-one fills every slot.
    for (slot, (gsi, route)) in table.as_mut_slice().iter_mut().zip(routes) {
        let msi = kvm_irq_routing_msi {
            address_lo: route.address_lo,
            address_hi: route.address_hi,
            data: route.data,
            __bindgen_anon_1: kvm_irq_routing_msi__bindgen_ty_1 { pad: 0 },
        };
        *slot = kvm_irq_routing_entry {
            gsi: *gsi,
            type_: KVM_IRQ_ROUTING_MSI,
            flags: 0,
            pad: 0,
            u: kvm_irq_routing_entry__bindgen_ty_1 { msi },
        };
    }
    Ok(table)
}

/// Owns the userspace IOAPIC device plus the cached raw VM fd needed to
/// reprogram KVM's MSI routing table. Cloned (via `Arc`) into each AP run
/// loop on the split-irqchip path; `None` on the in-kernel-irqchip path
/// (topologies whose max APIC ID is <= `MAX_XAPIC_ID`), where the kernel
/// IOAPIC delivers device IRQs directly.
pub(crate) struct IoapicHandle {
    /// The userspace IOAPIC register file. Locked for every MMIO read/write
    /// and EOI; released before any routing install is attempted.
    ioapic: Arc<PiMutex<Ioapic>>,
    /// Raw VM fd handed to the `KVM_SET_GSI_ROUTING` installer by
    /// `mmio_write`.
    vm_fd_raw: i32,
    /// Count of failed `KVM_SET_GSI_ROUTING` installs. A failure leaves a
    /// guest-programmed device IRQ unrouted — it will not deliver and the
    /// device hangs on first use. Bumped on the `mmio_write` path (inside
    /// `mmio_write_with`, when the installer returns an error), read at
    /// teardown (`routing_failures`) so a hung-device test reports the count
    /// instead of an opaque timeout.
    routing_failures: std::sync::atomic::AtomicU64,
    /// The route set most recently installed via `KVM_SET_GSI_ROUTING`
    /// (`None` until the first successful install).
    /// `mmio_write` skips the install ioctl (which waits an SRCU grace
    /// period) when the freshly-computed `gsi_routes()` set is byte-identical
    /// — the guest programs each 64-bit RTE as two 32-bit MMIO writes, and
    /// the high-word write of a still-masked entry yields the same
    /// `is_masked`-filtered route set as before it, so roughly half the
    /// per-RTE installs are redundant. Guarded by its own mutex, not the
    /// `ioapic` lock: the compare + ioctl + cache update run as one critical
    /// section so the cache can never diverge from KVM's actual routing table
    /// under concurrent IOAPIC programming, while the `ioapic` lock is
    /// released first so a slow install never stalls another vCPU's IOAPIC
    /// MMIO access.
    ///
    /// This dedup is intentionally stronger than the reference userspace
    /// IOAPICs: qemu, cloud-hypervisor, and libkrun all re-issue
    /// `KVM_SET_GSI_ROUTING` on every redtbl write (qemu's change-counting
    /// dedup applies only to its PCI-MSI path, not the IOAPIC). Skipping an
    /// unchanged table is safe because the ioctl is a whole-table replace
    /// KVM applies idempotently -- an identical re-install only burns an
    /// SRCU grace period (virt/kvm/irqchip.c `kvm_set_irq_routing` has no
    /// unchanged-table early-out).
    last_installed: PiMutex<Option<Vec<(u32, MsiRoute)>>>,
}

impl IoapicHandle {
    /// Bundle the shared `ioapic` with the cached raw VM fd. The handle
    /// starts with a zero routing-failure count and no cached route set, so
    /// the first route-changing write always installs.
    pub(crate) fn new(ioapic: Arc<PiMutex<Ioapic>>, vm_fd_raw: i32) -> Self {
        use std::sync::atomic::AtomicU64;

        Self {
            ioapic,
            vm_fd_raw,
            routing_failures: AtomicU64::new(0),
            last_installed: PiMutex::new(None),
        }
    }

    /// Service a guest MMIO read of the IOAPIC window by forwarding it to
    /// the device under the `ioapic` lock.
    pub(crate) fn mmio_read(&self, offset: u64, data: &mut [u8]) {
        self.ioapic.lock().mmio_read(offset, data);
    }

    /// Service a guest MMIO write of the IOAPIC window. When the write
    /// changed a redirection entry, the full MSI routing table is rebuilt
    /// and installed — except when it is byte-identical to the last install,
    /// in which case the (SRCU-grace-period) ioctl is skipped (see the
    /// `last_installed` cache).
    ///
    /// # Locking
    ///
    /// The route snapshot is taken under the `ioapic` lock, which is then
    /// released; the compare + ioctl + cache update run under the separate
    /// `last_installed` lock. A slow install therefore never stalls another
    /// vCPU's IOAPIC MMIO access (the `ioapic` lock is free during the
    /// ioctl), and the cache stays consistent with KVM's table — installs
    /// serialize on `last_installed` (one whole-table replace at a time) and
    /// the cache is updated in the same critical section as each install.
    ///
    /// # Divergence from reference VMMs
    ///
    /// Cross-vCPU atomicity is traded for that latency, diverging from every
    /// reference userspace IOAPIC: qemu (one BQL across redtbl-write →
    /// route-commit), cloud-hypervisor (one device mutex across the install),
    /// and libkrun (device mutex held across a synchronous worker-hop install)
    /// are all single-lock atomic and accept the peer-vCPU IOAPIC-MMIO stall
    /// for the SRCU-grace-period ioctl; here the `ioapic` lock is released
    /// first to keep that ioctl off the vCPU blocking budget.
    ///
    /// The resulting window — two vCPUs racing the IOAPIC, an older snapshot
    /// installing after a newer one and leaving KVM's table briefly stale —
    /// is unreachable for a spec-compliant guest: Linux serializes every
    /// IOAPIC RTE program under one global `ioapic_lock` (the
    /// `ioapic_write_entry`, `ioapic_set_affinity`, and `eoi_ioapic_pin`
    /// paths in arch/x86/kernel/apic/io_apic.c), so only one vCPU programs
    /// the IOAPIC at a time. A guest that races its own IOAPIC can
    /// transiently install a stale-but-valid whole-table replace built only
    /// from its own programmed routes; it self-corrects on the next RTE write
    /// (which re-snapshots the current register file) and can only mis-route
    /// its own device IRQs to its own APICs — no host memory is touched and
    /// no unprogrammed route is installable.
    ///
    /// # Errors
    ///
    /// Propagates whatever [`Self::mmio_write_with`] returns when driven with
    /// the real `KVM_SET_GSI_ROUTING` installer; that seam exists only so a
    /// host-side test can inject a counting/failing installer.
    pub(crate) fn mmio_write(&self, offset: u64, data: &[u8]) -> Result<()> {
        // Copy the fd out so the installer closure does not borrow `self`.
        let vm_fd = self.vm_fd_raw;
        let install_via_kvm =
            move |table: &KvmIrqRouting| kvm_set_gsi_routing_via_raw_fd(vm_fd, table);
        self.mmio_write_with(offset, data, install_via_kvm)
    }

    /// [`Self::mmio_write`] with the routing install injected as `install`,
    /// so a host-side test can drive the dedup + cache-on-success logic with
    /// a counting/failing closure instead of a live KVM fd. The production
    /// caller passes the real `KVM_SET_GSI_ROUTING` installer.
    ///
    /// `install` runs at most once per call — only when the write changed a
    /// route AND the set differs from `last_installed` — hence `FnOnce`. (No
    /// reference VMM exposes such a seam or unit-tests this path; they
    /// re-install unconditionally — see the `last_installed` divergence
    /// note.)
    ///
    /// # Errors
    ///
    /// Fails when the routing table cannot be built, or when `install`
    /// fails; the latter also bumps `routing_failures` and leaves the cache
    /// untouched.
    fn mmio_write_with(
        &self,
        offset: u64,
        data: &[u8],
        install: impl FnOnce(&KvmIrqRouting) -> std::io::Result<()>,
    ) -> Result<()> {
        use std::sync::atomic::Ordering;

        // Apply the write and, if it touched a route, snapshot the full route
        // set. The `ioapic` guard drops at the end of this block, before any
        // install work starts.
        let snapshot = {
            let mut regs = self.ioapic.lock();
            regs.mmio_write(offset, data).then(|| regs.gsi_routes())
        };
        let Some(routes) = snapshot else {
            // The write did not change a redirection entry.
            return Ok(());
        };

        // Held across compare + install + cache update so the cache always
        // reflects the table KVM actually has.
        let mut cached = self.last_installed.lock();
        if cached.as_deref() == Some(routes.as_slice()) {
            // Unchanged from the last successful install — skip the
            // whole-table replace and its SRCU grace period.
            return Ok(());
        }

        let table = build_device_msi_routing(&routes)?;
        match install(&table) {
            Ok(()) => {
                // Cache only after a successful install so a failed ioctl
                // never makes a later identical attempt skip a needed retry.
                *cached = Some(routes);
                Ok(())
            }
            Err(e) => {
                self.routing_failures.fetch_add(1, Ordering::Relaxed);
                Err(anyhow::anyhow!("KVM_SET_GSI_ROUTING: {e}"))
            }
        }
    }

    /// Service a `KVM_EXIT_IOAPIC_EOI` for `vector` (clears remote-IRR on a
    /// matching level entry; a no-op for the edge device pins of v0).
    pub(crate) fn eoi(&self, vector: u8) {
        // v0 is edge-only, so end_of_interrupt's returned pending-pins Vec is
        // always empty (edge entries never set remote-IRR, and this IOAPIC is a
        // register-file + MSI translator that never *services* a pin, so
        // remote-IRR is never set at all). The Vec is intentionally dropped;
        // the debug_assert is a tripwire so that adding a level-triggered
        // device WITHOUT completing level re-injection fails loudly in tests
        // instead of silently dropping the re-assert and wedging the line.
        //
        // The lock guard is a temporary of this statement, so it is released
        // before the assertion runs.
        let needs_reinjection = self.ioapic.lock().end_of_interrupt(vector);
        debug_assert!(
            needs_reinjection.is_empty(),
            "v0 IOAPIC is edge-only but EOI returned {} pin(s) needing level \
             re-injection — a level-triggered device was added without \
             completing level support (re-injection is dropped here)",
            needs_reinjection.len()
        );
    }

    /// Number of failed `KVM_SET_GSI_ROUTING` installs since VM start. Read at
    /// teardown to surface routing failures into the result (a nonzero count
    /// explains a device that never delivered IRQs).
    pub(crate) fn routing_failures(&self) -> u64 {
        use std::sync::atomic::Ordering;

        self.routing_failures.load(Ordering::Relaxed)
    }

    /// Translate a guest-physical `addr` into an offset within the IOAPIC
    /// MMIO window, or `None` when `addr` lies outside
    /// `[IOAPIC_BASE, IOAPIC_BASE + IOAPIC_SIZE)`. Lets the run-loop
    /// dispatcher route an MMIO exit without importing the device's
    /// base/size constants.
    pub(crate) fn in_range(&self, addr: u64) -> Option<u64> {
        let window = IOAPIC_BASE..IOAPIC_BASE + IOAPIC_SIZE;
        if window.contains(&addr) {
            // Only subtract once `addr >= IOAPIC_BASE` is established.
            Some(addr - IOAPIC_BASE)
        } else {
            None
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// `IoapicHandle` dedups a redundant GSI-routing install (skips the
    /// ioctl when the route set is byte-identical to the last successful
    /// install) AND caches only on success (a failed install must not poison
    /// a later identical retry). Drives the real `mmio_write_with` dedup path
    /// with an injected counting/failing installer — no live KVM fd. No
    /// reference VMM unit-tests this path (they re-install unconditionally).
    #[test]
    fn ioapic_handle_dedups_install_and_caches_on_success_only() {
        use crate::vmm::x86_64::ioapic::{IOREGSEL, IOWIN, REG_REDTBL_BASE};
        use std::cell::Cell;

        // The fd is never used: every install goes through the injected
        // closure below, so -1 is a safe placeholder.
        let handle = IoapicHandle::new(std::sync::Arc::new(PiMutex::new(Ioapic::new())), -1);
        // Pin 6's low-dword redtbl register (entry i lo = REG_REDTBL_BASE + 2i).
        let lo_reg = (REG_REDTBL_BASE + 2 * 6) as u8;
        let installs = Cell::new(0u32);

        // Drive one IOAPIC MMIO write through the dedup path with an injected
        // installer that bumps `installs` and returns Ok/Err per `ok`.
        let step = |off: u64, data: &[u8], ok: bool| -> Result<()> {
            handle.mmio_write_with(off, data, |_routing| {
                installs.set(installs.get() + 1);
                if ok {
                    Ok(())
                } else {
                    Err(std::io::Error::other("injected install failure"))
                }
            })
        };

        // Program pin 6's RTE: select the lo reg (not a route change → no
        // install), then write vector 0x40 with the mask bit clear (an
        // unmasked route) → route change → install #1.
        step(IOREGSEL, &[lo_reg], true).unwrap();
        step(IOWIN, &0x40u32.to_le_bytes(), true).unwrap();
        assert_eq!(
            installs.get(),
            1,
            "programming an unmasked RTE installs once"
        );

        // Rewrite the identical lo dword: the register file reports dirty, but
        // the route set is byte-identical → dedup SKIPS the install ioctl.
        step(IOREGSEL, &[lo_reg], true).unwrap();
        step(IOWIN, &0x40u32.to_le_bytes(), true).unwrap();
        assert_eq!(
            installs.get(),
            1,
            "a redundant RTE rewrite must dedup (no second install)"
        );

        // Cache-on-success-only: change the vector (0x50) with a FAILING
        // installer → the install is attempted (count 2) but errors, so
        // `last_installed` must NOT be updated to the 0x50 route set.
        step(IOREGSEL, &[lo_reg], true).unwrap();
        assert!(
            step(IOWIN, &0x50u32.to_le_bytes(), false).is_err(),
            "an injected install failure propagates as an error"
        );
        assert_eq!(installs.get(), 2, "the changed RTE attempts an install");
        assert_eq!(
            handle.routing_failures(),
            1,
            "the failed install is counted"
        );

        // Retry the SAME changed RTE with a succeeding installer. Because the
        // failed install did not cache 0x50, this must install AGAIN (count 3)
        // rather than dedup-skip — proving a failed install never wedges a
        // device behind a poisoned cache.
        step(IOREGSEL, &[lo_reg], true).unwrap();
        step(IOWIN, &0x50u32.to_le_bytes(), true).unwrap();
        assert_eq!(
            installs.get(),
            3,
            "a failed install must not poison the cache — the identical retry re-installs"
        );
    }

    /// `build_device_msi_routing` emits one `KVM_IRQ_ROUTING_MSI` entry per
    /// input route, in input order, copying the GSI and the three MSI words
    /// verbatim and leaving `flags` zero.
    #[test]
    fn build_device_msi_routing_lays_out_fam_entries() {
        let routes = vec![
            (
                4u32,
                MsiRoute {
                    address_lo: 0xFEE0_1004,
                    address_hi: 0x0000_0100,
                    data: 0x0000_8030,
                },
            ),
            (
                6u32,
                MsiRoute {
                    address_lo: 0xFEE0_2000,
                    address_hi: 0x0000_0000,
                    data: 0x0000_0040,
                },
            ),
        ];
        let mut routing = build_device_msi_routing(&routes).expect("build routing");
        let entries = routing.as_mut_slice();
        assert_eq!(entries.len(), 2, "one FAM entry per route");
        for (i, (gsi, msi)) in routes.iter().enumerate() {
            let e = &entries[i];
            assert_eq!(e.gsi, *gsi, "entry {i} gsi");
            assert_eq!(e.type_, KVM_IRQ_ROUTING_MSI, "entry {i} type is MSI");
            assert_eq!(e.flags, 0, "entry {i} flags");
            // SAFETY: every entry was built as the `.msi` union variant above.
            let m = unsafe { e.u.msi };
            assert_eq!(m.address_lo, msi.address_lo, "entry {i} address_lo");
            assert_eq!(m.address_hi, msi.address_hi, "entry {i} address_hi");
            assert_eq!(m.data, msi.data, "entry {i} data");
        }
    }

    /// An empty route set builds successfully and produces a table with no
    /// entries.
    #[test]
    fn build_device_msi_routing_empty_is_valid() {
        // All-masked IOAPIC -> empty route set -> nr=0 table (the re-mask-all
        // case the kernel reviewer verified: FamStructWrapper::new(0) yields a
        // valid header-only kvm_irq_routing{nr:0}).
        let mut routing = build_device_msi_routing(&[]).expect("empty routing");
        assert_eq!(routing.as_mut_slice().len(), 0, "no entries for empty set");
    }
    // Trait imports for the KVM-backed tests below: `AsRawFd` for
    // `vm.vm_fd.as_raw_fd()`, `GuestMemory` for `vm.guest_mem.iter()`.
    use std::os::fd::AsRawFd;
    use vm_memory::GuestMemory;

    /// 1 LLC x 2 cores x 1 thread creates a VM with 2 vCPUs.
    #[test]
    fn create_vm_basic() {
        let topo = Topology {
            llcs: 1,
            cores_per_llc: 2,
            threads_per_core: 1,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let vm = KtstrKvm::new(topo, 128, false);
        assert!(vm.is_ok(), "VM creation failed: {:?}", vm.err());
        let vm = vm.unwrap();
        assert_eq!(vm.vcpus.len(), 2);
    }

    /// 2 LLCs x 2 cores x 2 threads creates a VM with 8 vCPUs.
    #[test]
    fn create_vm_multi_llc() {
        let topo = Topology {
            llcs: 2,
            cores_per_llc: 2,
            threads_per_core: 2,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let vm = KtstrKvm::new(topo, 256, false);
        assert!(vm.is_ok(), "multi-LLC VM creation failed: {:?}", vm.err());
        let vm = vm.unwrap();
        assert_eq!(vm.vcpus.len(), 8);
    }

    /// The minimal 1 x 1 x 1 topology creates a VM with a single vCPU.
    #[test]
    fn create_vm_single_cpu() {
        let topo = Topology {
            llcs: 1,
            cores_per_llc: 1,
            threads_per_core: 1,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let vm = KtstrKvm::new(topo, 64, false);
        assert!(vm.is_ok());
        assert_eq!(vm.unwrap().vcpus.len(), 1);
    }

    /// 4 LLCs x 4 cores x 2 threads creates a VM with 32 vCPUs.
    #[test]
    fn create_vm_large_topology() {
        let topo = Topology {
            llcs: 4,
            cores_per_llc: 4,
            threads_per_core: 2,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let vm = KtstrKvm::new(topo, 512, false);
        assert!(vm.is_ok(), "large topology failed: {:?}", vm.err());
        assert_eq!(vm.unwrap().vcpus.len(), 32);
    }

    /// A non-power-of-two topology (3 LLCs x 3 cores x 1 thread) creates a
    /// VM with 9 vCPUs.
    #[test]
    fn create_vm_odd_topology() {
        let topo = Topology {
            llcs: 3,
            cores_per_llc: 3,
            threads_per_core: 1,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let vm = KtstrKvm::new(topo, 128, false);
        assert!(vm.is_ok(), "odd topology failed: {:?}", vm.err());
        assert_eq!(vm.unwrap().vcpus.len(), 9);
    }

    /// The guest memory regions sum to the requested size: a memory argument
    /// of 256 must yield `256 << 20` bytes.
    #[test]
    fn memory_size_correct() {
        use vm_memory::GuestMemoryRegion;
        let topo = Topology {
            llcs: 1,
            cores_per_llc: 1,
            threads_per_core: 1,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let vm = KtstrKvm::new(topo, 256, false).unwrap();
        let total: u64 = vm.guest_mem.iter().map(|r| r.len()).sum();
        assert_eq!(total, 256 << 20);
    }

    /// Pins `KVM_TSS_ADDRESS` to the value the test name attributes to
    /// Firecracker.
    #[test]
    fn tss_address_matches_firecracker() {
        assert_eq!(KVM_TSS_ADDRESS, 0xfffb_d000);
    }

    /// The identity map sits three 4 KiB pages above the TSS address.
    #[test]
    fn identity_map_follows_tss() {
        assert_eq!(KVM_IDENTITY_MAP_ADDRESS, KVM_TSS_ADDRESS + 3 * 4096);
        assert_eq!(KVM_IDENTITY_MAP_ADDRESS, 0xfffc_0000);
    }

    /// `REQUIRED_CAPS` lists at least 14 capabilities (the emptiness check is
    /// implied by the length check and kept as a clearer first failure).
    #[test]
    fn required_caps_non_empty() {
        assert!(!REQUIRED_CAPS.is_empty());
        assert!(REQUIRED_CAPS.len() >= 14);
    }

    /// A topology whose max APIC ID is within `MAX_XAPIC_ID` must not select
    /// the split irqchip.
    #[test]
    fn small_topology_uses_full_irqchip() {
        let topo = Topology {
            llcs: 2,
            cores_per_llc: 4,
            threads_per_core: 2,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        // max APIC ID = apic_id(15) = 1<<3 | 3<<1 | 1 = 15, well under 254
        assert!(max_apic_id(&topo) <= MAX_XAPIC_ID);
        let vm = KtstrKvm::new(topo, 256, false).unwrap();
        assert!(!vm.split_irqchip, "small topology should use full IRQ chip");
    }

    /// A topology whose max APIC ID exceeds `MAX_XAPIC_ID` must select the
    /// split irqchip. VM creation is skipped (not failed) on hosts that
    /// reject it.
    #[test]
    fn large_topology_uses_split_irqchip() {
        // 15 LLCs x 8 cores x 2 threads = 240 vCPUs is NOT enough:
        // max APIC ID = apic_id(239) = 14<<4 | 7<<1 | 1 = 239, under 254.
        // So use 14 LLCs x 9 cores x 2 threads = 252 vCPUs instead:
        // core_bits = bits_needed(9) = 4, thread_bits = 1, core_shift = 5
        // max APIC ID = apic_id(251) = 13<<5 | 8<<1 | 1 = 433
        let topo = Topology {
            llcs: 14,
            cores_per_llc: 9,
            threads_per_core: 2,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        assert!(
            max_apic_id(&topo) > MAX_XAPIC_ID,
            "max APIC ID {} should exceed {}",
            max_apic_id(&topo),
            MAX_XAPIC_ID,
        );
        let vm = match KtstrKvm::new(topo, 4096, false) {
            Ok(v) => v,
            Err(e) => {
                // Some hosts reject 252-vCPU VMs (EEXIST from
                // KVM_CREATE_VCPU when split irqchip + x2APIC
                // interact with host KVM limitations). The APIC ID
                // assertion above validates the split irqchip logic;
                // skip the VM creation test on those hosts.
                skip!("large_topology VM creation: {e:#}");
            }
        };
        assert!(vm.split_irqchip, "large topology should use split IRQ chip");
        assert_eq!(vm.vcpus.len(), 252);
    }

    /// Two topologies on the low side of the xAPIC limit (max APIC IDs 127
    /// and 239) must both keep the full in-kernel irqchip.
    #[test]
    fn split_irqchip_boundary() {
        // Neither topology below crosses the boundary; both stay at or under
        // MAX_XAPIC_ID, so neither may select the split irqchip.
        // 8 LLCs x 8 cores x 2 threads: core_shift = 4, max APIC ID = 7<<4 | 7<<1 | 1 = 127
        let small = Topology {
            llcs: 8,
            cores_per_llc: 8,
            threads_per_core: 2,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        assert!(
            max_apic_id(&small) <= MAX_XAPIC_ID,
            "8l/8c/2t max APIC ID {} should be <= 254",
            max_apic_id(&small),
        );
        let vm = KtstrKvm::new(small, 2048, false).unwrap();
        assert!(!vm.split_irqchip);

        // 15 LLCs x 8 cores x 2 threads: core_shift = 4, max APIC ID = 14<<4 | 7<<1 | 1 = 239
        let still_small = Topology {
            llcs: 15,
            cores_per_llc: 8,
            threads_per_core: 2,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        assert!(
            max_apic_id(&still_small) <= MAX_XAPIC_ID,
            "15l/8c/2t max APIC ID {} should be <= 254",
            max_apic_id(&still_small),
        );
        let vm = KtstrKvm::new(still_small, 4096, false).unwrap();
        assert!(!vm.split_irqchip);
    }

    /// The single-vCPU helper VM reports the immediate-exit capability.
    #[test]
    fn immediate_exit_cap_detected() {
        use crate::vmm::x86_64::test_helpers::single_vcpu_kvm;
        let vm = single_vcpu_kvm();
        // KVM_CAP_IMMEDIATE_EXIT is available since Linux 4.12.
        assert!(vm.has_immediate_exit);
    }

    /// VM creation succeeds with the performance-mode flag set.
    #[test]
    fn performance_mode_succeeds() {
        let topo = Topology {
            llcs: 1,
            cores_per_llc: 2,
            threads_per_core: 1,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let vm = KtstrKvm::new(topo, 128, true);
        assert!(
            vm.is_ok(),
            "performance_mode VM creation failed: {:?}",
            vm.err()
        );
    }

    /// The performance-mode flag does not change how many vCPUs are created
    /// for the same topology.
    #[test]
    fn performance_mode_does_not_affect_vcpu_count() {
        let topo = Topology {
            llcs: 2,
            cores_per_llc: 2,
            threads_per_core: 2,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let vm_normal = KtstrKvm::new(topo, 256, false).unwrap();
        let vm_perf = KtstrKvm::new(topo, 256, true).unwrap();
        assert_eq!(vm_normal.vcpus.len(), vm_perf.vcpus.len());
    }

    /// Pins `HALT_POLL_NS` at 200_000.
    #[test]
    fn halt_poll_ns_constant() {
        assert_eq!(HALT_POLL_NS, 200_000);
    }

    /// VM creation succeeds with the performance-mode flag clear (the
    /// halt-poll path, per the test name and failure message).
    #[test]
    fn non_perf_mode_succeeds_with_halt_poll() {
        let topo = Topology {
            llcs: 1,
            cores_per_llc: 2,
            threads_per_core: 1,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let vm = KtstrKvm::new(topo, 128, false);
        assert!(
            vm.is_ok(),
            "non-perf VM with halt poll failed: {:?}",
            vm.err()
        );
    }

    /// Pins the ABI value of `KVM_X86_DISABLE_EXITS_HLT`.
    #[test]
    fn disable_exits_hlt_bit_value() {
        // KVM_X86_DISABLE_EXITS_HLT is bit 1 (value 2) in the kernel ABI.
        assert_eq!(KVM_X86_DISABLE_EXITS_HLT, 2);
    }

    /// The PAUSE and HLT disable-exit flags are distinct and share no bits.
    #[test]
    fn disable_exits_pause_and_hlt_no_overlap() {
        assert_ne!(
            KVM_X86_DISABLE_EXITS_PAUSE, KVM_X86_DISABLE_EXITS_HLT,
            "PAUSE and HLT bits must be distinct"
        );
        assert_eq!(
            KVM_X86_DISABLE_EXITS_PAUSE & KVM_X86_DISABLE_EXITS_HLT,
            0,
            "PAUSE and HLT bits must not overlap"
        );
    }

    /// A get → set (flags cleared) → get kvm_clock roundtrip through the safe
    /// wrappers succeeds on a performance-mode VM. The TSC-stable flag itself
    /// is deliberately not asserted.
    #[test]
    fn tsc_stability_check_roundtrip() {
        // Check the get→set→get roundtrip succeeds with
        // performance_mode=true (which enables the TSC check).
        let topo = Topology {
            llcs: 1,
            cores_per_llc: 2,
            threads_per_core: 1,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let vm = KtstrKvm::new(topo, 64, true).unwrap();
        let clock = vm.vm_fd.get_clock().unwrap();
        let mut set_data = clock;
        set_data.flags = 0;
        vm.vm_fd.set_clock(&set_data).unwrap();
        let clock2 = vm.vm_fd.get_clock().unwrap();
        // On bare-metal with invariant TSC, KVM_CLOCK_TSC_STABLE
        // should be set after the roundtrip forces
        // pvclock_update_vm_gtod_copy. In nested virt it may not be.
        // Either way, the roundtrip must not fail.
        let _ = clock2.flags & KVM_CLOCK_TSC_STABLE;
    }

    /// `kvm_clock_data::default()` has every field, including padding, zeroed.
    #[test]
    fn kvm_clock_data_default_is_zeroed() {
        let clock = kvm_bindings::kvm_clock_data::default();
        assert_eq!(clock.clock, 0);
        assert_eq!(clock.flags, 0);
        assert_eq!(clock.pad0, 0);
        assert_eq!(clock.realtime, 0);
        assert_eq!(clock.host_tsc, 0);
        assert_eq!(clock.pad, [0u32; 4]);
    }

    /// `kvm_clock_data` is 48 bytes, the size baked into the hand-encoded
    /// clock ioctl numbers.
    #[test]
    fn kvm_clock_data_size_matches_ioctl_encoding() {
        // The hand-encoded `_IOC_SIZE = 0x30 = 48` in the
        // KVM_GET_CLOCK / KVM_SET_CLOCK ioctl-number constants in
        // this file presumes this exact size. A compile-time
        // `const _: () = assert!(...)` next to the first constant
        // guards builds; this runtime check is a belt-and-
        // suspenders guard against a future split of kvm-bindings
        // that drops the compile-time assert.
        assert_eq!(std::mem::size_of::<kvm_bindings::kvm_clock_data>(), 48);
    }

    /// The raw-fd `KVM_GET_CLOCK` path reads the same clock as the safe
    /// `kvm_ioctls` wrapper: the later raw read is not earlier than the safe
    /// read and is within one second of it.
    #[test]
    fn raw_fd_get_clock_matches_safe_wrapper() {
        // Cross-check: the hand-encoded ioctl number 0x8030_ae7c
        // hits the same kernel path as kvm_ioctls::VmFd::get_clock.
        // If the number were wrong, libc::ioctl would fail with
        // ENOTTY (errno 25; the call itself returns -1), which
        // surfaces as Err and the `expect` on the raw read below
        // would panic. If the number aimed at a different ioctl, the
        // returned clock value would not advance monotonically and
        // the safe-vs-raw comparison would diverge dramatically.
        let topo = Topology {
            llcs: 1,
            cores_per_llc: 1,
            threads_per_core: 1,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let vm = KtstrKvm::new(topo, 64, false).unwrap();
        let raw_fd = vm.vm_fd.as_raw_fd();
        let via_safe = vm.vm_fd.get_clock().expect("safe GET_CLOCK");
        let via_raw = super::kvm_get_clock_via_raw_fd(raw_fd).expect("raw GET_CLOCK");
        // Both reads hit the same in-kernel pvclock via separate
        // seqcount reads; the later one must be >= the earlier
        // (kvm_clock is monotonic non-decreasing).
        assert!(
            via_raw.clock >= via_safe.clock,
            "raw-fd GET regressed below safe GET (raw={}, safe={}) — ioctl number drift",
            via_raw.clock,
            via_safe.clock,
        );
        // < 1 second drift means we are reading the same ioctl,
        // not some unrelated kernel time source. The subtraction cannot
        // underflow: the assertion above established raw >= safe.
        assert!(
            via_raw.clock - via_safe.clock < 1_000_000_000,
            "raw-fd vs safe GET differ by >1s (raw={}, safe={}) — likely different kernel state",
            via_raw.clock,
            via_safe.clock,
        );
    }

    /// A raw-fd get → set (flags cleared) → get roundtrip succeeds and the
    /// clock read back is not earlier than the value that was set.
    #[test]
    fn raw_fd_set_clock_roundtrip_with_flags_zero() {
        let topo = Topology {
            llcs: 1,
            cores_per_llc: 1,
            threads_per_core: 1,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let vm = KtstrKvm::new(topo, 64, false).unwrap();
        let raw_fd = vm.vm_fd.as_raw_fd();
        let mut clock = super::kvm_get_clock_via_raw_fd(raw_fd).expect("raw GET_CLOCK");
        // `kvm_set_clock_via_raw_fd` requires the caller to zero `flags`.
        clock.flags = 0;
        super::kvm_set_clock_via_raw_fd(raw_fd, &clock).expect("raw SET_CLOCK");
        let after = super::kvm_get_clock_via_raw_fd(raw_fd).expect("raw GET_CLOCK after");
        assert!(after.clock >= clock.clock);
    }

    /// Performance-mode VM creation succeeds whether or not the host accepts
    /// the HLT disable-exit request.
    #[test]
    fn performance_mode_with_hlt_disable_succeeds() {
        // performance_mode issues two separate enable_cap calls:
        // PAUSE (always succeeds) then HLT (may be rejected by
        // mitigate_smt_rsb). Either way, VM creation must succeed.
        let topo = Topology {
            llcs: 1,
            cores_per_llc: 2,
            threads_per_core: 1,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let vm = KtstrKvm::new(topo, 128, true);
        assert!(
            vm.is_ok(),
            "performance_mode with HLT disable failed: {:?}",
            vm.err()
        );
    }
}