// ktstr 0.6.0 - Docs.rs

use anyhow::{Context, Result};
use kvm_bindings::{
    KVM_CAP_HALT_POLL, KVM_CAP_SPLIT_IRQCHIP, KVM_CAP_X2APIC_API, KVM_CAP_X86_DISABLE_EXITS,
    KVM_CLOCK_TSC_STABLE, KVM_PIT_SPEAKER_DUMMY, KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK,
    KVM_X2APIC_API_USE_32BIT_IDS, KVM_X86_DISABLE_EXITS_HLT, KVM_X86_DISABLE_EXITS_PAUSE,
    kvm_enable_cap, kvm_pit_config,
};
use kvm_ioctls::{Cap, Kvm, VcpuFd, VmFd};
use std::mem::ManuallyDrop;
use vm_memory::{GuestAddress, GuestMemoryMmap};

use super::topology::{generate_cpuid, max_apic_id};
use crate::vmm::numa_mem::{NumaMemoryLayout, ReservationGuard};
use crate::vmm::topology::Topology;

/// Physical address where the kernel is loaded (1 MB; same value as
/// `HIMEM_START`).
pub(crate) const KERNEL_LOAD_ADDR: u64 = 0x100000; // 1 MB

/// Physical address of boot parameters (zero page).
pub(crate) const BOOT_PARAMS_ADDR: u64 = 0x7000;

/// Physical address of the kernel command line.
pub(crate) const CMDLINE_ADDR: u64 = 0x20000;

/// Maximum command line length.
/// NOTE(review): presumably counted in bytes — confirm whether the
/// terminating NUL is included against the code that writes the cmdline.
pub(crate) const CMDLINE_MAX: usize = 4096;

// ---- Memory layout constants shared by boot.rs and acpi.rs ----

/// Start of the Extended BIOS Data Area (640K - 1K = 0x9FC00).
/// NOTE(review): presumably also the upper bound of usable low RAM
/// in the E820 map — confirm against boot.rs.
pub(crate) const EBDA_START: u64 = 0x9FC00;

/// Start of high memory (1 MB).
pub(crate) const HIMEM_START: u64 = 0x10_0000;

/// Start of PCI MMIO gap (3 GB). Memory below this is usable RAM.
pub(crate) const MMIO_GAP_START: u64 = 0xC000_0000;

/// End of PCI MMIO gap (4 GB). Memory above this resumes as RAM.
pub(crate) const MMIO_GAP_END: u64 = 0x1_0000_0000;

/// Virtio-console MMIO base: start of the MMIO gap.
pub(crate) const VIRTIO_CONSOLE_MMIO_BASE: u64 = MMIO_GAP_START;

/// Virtio-block MMIO base: one page (0x1000) above virtio-console.
/// Each virtio-mmio device occupies `VIRTIO_MMIO_SIZE = 0x1000`.
pub(crate) const VIRTIO_BLK_MMIO_BASE: u64 = MMIO_GAP_START + 0x1000;

/// Virtio-net MMIO base: one page (0x1000) above virtio-blk.
/// Each virtio-mmio device occupies `VIRTIO_MMIO_SIZE = 0x1000`.
pub(crate) const VIRTIO_NET_MMIO_BASE: u64 = MMIO_GAP_START + 0x2000;

/// IRQ for virtio-console (GSI 5, routed through the IOAPIC).
/// Available with the full IRQ chip. With the split IRQ chip (no
/// in-kernel IOAPIC), MSI would be needed; not supported for now.
pub(crate) const VIRTIO_CONSOLE_IRQ: u32 = 5;

/// IRQ for virtio-block (GSI 6, full IRQ chip). Same constraints as
/// virtio-console — split-irqchip not supported. IOAPIC GSI ≤23
/// limit leaves ample free slots for additional virtio devices
/// after COM1=4, COM2=3, virtio-console=5, virtio-blk=6.
pub(crate) const VIRTIO_BLK_IRQ: u32 = 6;

/// IRQ for virtio-net (GSI 7, full IRQ chip). Same constraints as
/// virtio-blk — split-irqchip not supported. Still well within the
/// IOAPIC's 24-line cap.
pub(crate) const VIRTIO_NET_IRQ: u32 = 7;

/// E820 memory type: usable RAM.
pub(crate) const E820_RAM: u32 = 1;

/// Offset from code32_start to 64-bit entry point in bzImage.
pub(crate) const STARTUP64_OFFSET: u64 = 0x200;

/// TSS address — same as Firecracker/libkrun.
const KVM_TSS_ADDRESS: u64 = 0xfffb_d000;

/// Identity map address — placed immediately after the 3-page TSS region
/// (`KVM_TSS_ADDRESS + 3 * 4096 = 0xfffc_0000`).
/// KVM requires this to be set before creating vCPUs on x86_64.
const KVM_IDENTITY_MAP_ADDRESS: u64 = KVM_TSS_ADDRESS + 3 * 4096;

/// IOAPIC supports 24 input pins (IRQ 0-23). Passed as `args[0]` when
/// enabling `KVM_CAP_SPLIT_IRQCHIP`.
const NUM_IOAPIC_PINS: u64 = 24;

/// APIC IDs above this require x2APIC mode (8-bit xAPIC limit). A
/// topology whose maximum APIC ID exceeds this value selects the split
/// IRQ chip.
const MAX_XAPIC_ID: u32 = 254;

/// Per-VM halt poll interval (nanoseconds) for non-performance_mode VMs.
/// Matches the x86 kernel default (KVM_HALT_POLL_NS_DEFAULT in
/// arch/x86/include/asm/kvm_host.h). For overcommitted topologies
/// (more vCPUs than online host CPUs) the VM setup passes 0 instead of
/// this value, because halt polling there wastes host CPU time.
const HALT_POLL_NS: u64 = 200_000;

/// Required KVM capabilities — Firecracker checks these 14.
///
/// Each entry is probed with `check_extension` during VM construction;
/// a missing capability aborts construction with an error naming it.
const REQUIRED_CAPS: &[Cap] = &[
    Cap::Irqchip,
    Cap::Ioeventfd,
    Cap::Irqfd,
    Cap::UserMemory,
    Cap::SetTssAddr,
    Cap::Pit2,
    Cap::PitState2,
    Cap::AdjustClock,
    Cap::Debugregs,
    Cap::MpState,
    Cap::VcpuEvents,
    Cap::Xcrs,
    Cap::Xsave,
    Cap::ExtCpuid,
];

/// A KVM virtual machine with configured topology.
///
/// `kvm`, `vm_fd` and `guest_mem` are wrapped in `ManuallyDrop` because
/// the `Drop` impl releases them explicitly, in a fixed order, instead
/// of relying on field declaration order.
#[allow(dead_code)] // configuration fields read conditionally; reservation held for RAII drop
pub struct KtstrKvm {
    /// Handle to `/dev/kvm`. Released last during teardown.
    pub kvm: ManuallyDrop<Kvm>,
    /// VM file descriptor. Released after the vCPU fds and before the
    /// guest memory during teardown.
    pub vm_fd: ManuallyDrop<VmFd>,
    /// vCPU file descriptors, one per CPU in `topology`, pushed in
    /// ascending vCPU-id order.
    pub vcpus: Vec<VcpuFd>,
    /// Guest memory mapping. In deferred mode this is a 4096-byte
    /// placeholder until `allocate_and_register_memory()` replaces it.
    pub guest_mem: ManuallyDrop<GuestMemoryMmap>,
    /// Topology the VM was constructed with.
    pub topology: Topology,
    /// Per-node GPA layout used by ACPI SRAT/HMAT generation. `None`
    /// in deferred mode before `allocate_and_register_memory()`.
    pub(crate) numa_layout: Option<NumaMemoryLayout>,
    /// Whether KVM supports the immediate_exit mechanism (KVM_CAP_IMMEDIATE_EXIT).
    pub has_immediate_exit: bool,
    /// Split IRQ chip mode: LAPIC in kernel, PIC/IOAPIC emulated in userspace.
    /// Enabled when any APIC ID exceeds the 8-bit xAPIC limit (254).
    pub(crate) split_irqchip: bool,
    /// Whether hugepages were requested at construction time.
    /// Stored so deferred memory allocation uses the same backing.
    use_hugepages: bool,
    /// Performance mode flag. Stored so deferred memory allocation
    /// can check hugepage availability fresh when memory_mib was
    /// unknown at construction time.
    performance_mode: bool,
    /// Owns the VA reservation for per-node MAP_FIXED mmaps.
    /// Drop munmaps the entire reservation. `None` in deferred mode
    /// until memory is allocated.
    _reservation: Option<ReservationGuard>,
    /// RAII guards for COW-overlayed initramfs segments. Each guard
    /// holds the lz4 SHM fd with `LOCK_SH`; dropping it releases the
    /// flock and closes the fd. Must drop AFTER `_reservation` so the
    /// COW VMAs are torn down (via the reservation's munmap) before
    /// the flock is released — otherwise a concurrent writer could
    /// take `LOCK_EX` and truncate the segment while the guest still
    /// holds pages that fault through the backing file.
    pub(crate) cow_overlay_guards: Vec<crate::vmm::initramfs::CowOverlayGuard>,
}

impl Drop for KtstrKvm {
    /// Ordered teardown: vCPU fds → VM fd → guest memory →
    /// VA reservation → COW flock guards → /dev/kvm.
    ///
    /// Closing VmFd triggers kvm_destroy_vm which calls
    /// mmu_notifier_unregister (synchronous SRCU wait). All
    /// KVM references to this process's page tables are removed
    /// before the guest memory munmap fires, preventing stale
    /// mmu_notifier callbacks from racing with the unmap.
    ///
    /// The COW flock guards are released only after the reservation
    /// has been unmapped, as required by the `cow_overlay_guards`
    /// field contract.
    fn drop(&mut self) {
        // vCPU fds go first; leaves an empty Vec behind for the
        // implicit field drop that follows this function.
        drop(std::mem::take(&mut self.vcpus));

        // SAFETY: `vm_fd` holds a live value (set at construction and
        // never dropped elsewhere), `Drop::drop` runs at most once, and
        // the field is not touched again after this call.
        unsafe { ManuallyDrop::drop(&mut self.vm_fd) };

        // SAFETY: `guest_mem` always holds a live value here — it is
        // initialized at construction, and the only other place that
        // releases it installs a fresh mapping in the same step. It is
        // not accessed after this call.
        unsafe { ManuallyDrop::drop(&mut self.guest_mem) };

        // Unmap the VA reservation before releasing the COW flocks.
        drop(self._reservation.take());
        drop(std::mem::take(&mut self.cow_overlay_guards));

        // SAFETY: `kvm` holds a live value (set at construction and
        // never dropped elsewhere) and is not accessed after this call.
        unsafe { ManuallyDrop::drop(&mut self.kvm) };
    }
}

impl KtstrKvm {
    /// Build a KVM VM for `topo` with `memory_mib` MiB of guest memory
    /// allocated immediately and no hugepage request.
    pub fn new(topo: Topology, memory_mib: u32, performance_mode: bool) -> Result<Self> {
        let use_hugepages = false;
        let eager_memory = Some(memory_mib);
        Self::new_inner(topo, eager_memory, use_hugepages, performance_mode)
    }

    /// Build a KVM VM for `topo` with `memory_mib` MiB of guest memory
    /// allocated immediately, requesting hugepage backing.
    pub fn new_with_hugepages(
        topo: Topology,
        memory_mib: u32,
        performance_mode: bool,
    ) -> Result<Self> {
        let use_hugepages = true;
        let eager_memory = Some(memory_mib);
        Self::new_inner(topo, eager_memory, use_hugepages, performance_mode)
    }

    /// Build a KVM VM whose guest memory is allocated later.
    ///
    /// Everything that does not depend on the guest memory size is set
    /// up here: /dev/kvm, VM fd, TSS, identity map, IRQ chip, vCPUs and
    /// CPUID. Call `allocate_and_register_memory` afterwards to install
    /// the real guest memory; `use_hugepages` and `performance_mode`
    /// are remembered for that call.
    pub fn new_deferred(
        topo: Topology,
        use_hugepages: bool,
        performance_mode: bool,
    ) -> Result<Self> {
        let no_memory_yet: Option<u32> = None;
        Self::new_inner(topo, no_memory_yet, use_hugepages, performance_mode)
    }

    /// Allocate guest memory and register it with KVM.
    ///
    /// Should be called exactly once on a VM created with
    /// `new_deferred`; calling twice unconditionally replaces the
    /// backing memory. Replaces the placeholder guest memory with a
    /// real allocation of `memory_mib` mebibytes and sets
    /// `numa_layout` to the computed per-node GPA layout. Re-checks
    /// hugepage availability when performance_mode is set, since
    /// memory_mib was unknown at construction time and `use_hugepages`
    /// may have been false.
    ///
    /// # Errors
    ///
    /// Returns the error from computing the NUMA layout or from
    /// allocating/registering the memory. On error the VM's existing
    /// guest memory, reservation and layout are left untouched.
    pub fn allocate_and_register_memory(&mut self, memory_mib: u32) -> Result<()> {
        let layout = NumaMemoryLayout::compute(&self.topology, memory_mib, 0)?;
        let alloc =
            layout.allocate_and_register(&self.vm_fd, self.use_hugepages, self.performance_mode)?;

        // Install the new mapping first and take ownership of the old
        // one, so `self.guest_mem` holds a live value at every point —
        // even if dropping the old mapping were to unwind. No `unsafe`
        // is needed: `into_inner` hands back the old value for a normal
        // drop.
        let previous_mem =
            std::mem::replace(&mut self.guest_mem, ManuallyDrop::new(alloc.guest_mem));
        drop(ManuallyDrop::into_inner(previous_mem));

        // The previous reservation (if any) is released only after the
        // previous guest memory, matching the teardown order in `Drop`.
        drop(self._reservation.replace(alloc.reservation));

        self.numa_layout = Some(layout);
        Ok(())
    }

    /// Shared constructor behind `new`, `new_with_hugepages` and
    /// `new_deferred`.
    ///
    /// Steps, in order: open `/dev/kvm` and verify `REQUIRED_CAPS`;
    /// create the VM fd; set the TSS and identity-map addresses; set up
    /// the IRQ chip (split IRQ chip + x2APIC API when the topology's
    /// max APIC ID exceeds `MAX_XAPIC_ID`, otherwise full IRQ chip +
    /// PIT); apply the performance-mode exit settings or the halt-poll
    /// interval; allocate guest memory (a 4096-byte placeholder when
    /// `memory_mib` is `None`); create one vCPU per topology CPU and
    /// set its CPUID; and, in performance mode, probe TSC stability.
    ///
    /// Failures of the optional tuning steps (disable-exits, halt poll,
    /// clock probe) only print a warning to stderr. Every other failure
    /// is returned as an error.
    fn new_inner(
        topo: Topology,
        memory_mib: Option<u32>,
        use_hugepages: bool,
        performance_mode: bool,
    ) -> Result<Self> {
        let kvm = Kvm::new().context("open /dev/kvm")?;

        // Check required capabilities (Firecracker pattern)
        for &cap in REQUIRED_CAPS {
            anyhow::ensure!(
                kvm.check_extension(cap),
                "KVM missing required capability: {:?}",
                cap
            );
        }

        // Optional capability: recorded for later use, never fatal.
        let has_immediate_exit = kvm.check_extension(Cap::ImmediateExit);

        let vm_fd = crate::vmm::create_vm_with_retry(&kvm)?;

        // TSS (required on x86_64 before creating vCPUs). Maps
        // transient host errnos (ENOMEM, EBUSY) into
        // ResourceContention so the macro SKIPs cleanly instead of
        // panicking under host-resource pressure.
        vm_fd
            .set_tss_address(KVM_TSS_ADDRESS as usize)
            .map_err(|e| crate::vmm::map_transient_to_contention(e, "set TSS"))?;

        // Identity map — placed immediately after the 3-page TSS region
        // (KVM_TSS_ADDRESS + 3 * 4096).
        // Must be set before creating vCPUs.
        vm_fd
            .set_identity_map_address(KVM_IDENTITY_MAP_ADDRESS)
            .map_err(|e| crate::vmm::map_transient_to_contention(e, "set identity map address"))?;

        // Determine whether any APIC ID exceeds the 8-bit xAPIC limit.
        // If so, use split IRQ chip (LAPIC-only in kernel) + x2APIC API.
        let max_apic_id = max_apic_id(&topo);
        let split_irqchip = max_apic_id > MAX_XAPIC_ID;

        if split_irqchip {
            // Split IRQ chip: only LAPIC is emulated in kernel.
            // PIC and IOAPIC are not created — userspace handles them.
            let mut cap = kvm_enable_cap {
                cap: KVM_CAP_SPLIT_IRQCHIP,
                ..Default::default()
            };
            cap.args[0] = NUM_IOAPIC_PINS;
            // KVM_CAP_SPLIT_IRQCHIP allocates the in-kernel LAPIC
            // tables and sets up the userspace IRQ routing slots.
            // ENOMEM under host pressure is transient — route through
            // the contention classifier so the macro SKIPs cleanly.
            vm_fd
                .enable_cap(&cap)
                .map_err(|e| crate::vmm::map_transient_to_contention(e, "enable split IRQ chip"))?;

            // Enable x2APIC API for 32-bit destination IDs and correct
            // broadcast behavior with APIC IDs > 254.
            let mut cap = kvm_enable_cap {
                cap: KVM_CAP_X2APIC_API,
                ..Default::default()
            };
            cap.args[0] =
                (KVM_X2APIC_API_USE_32BIT_IDS | KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) as u64;
            // NOTE(review): unlike the neighbouring ioctls, this error
            // is not routed through map_transient_to_contention —
            // confirm that is intentional.
            vm_fd.enable_cap(&cap).context("enable x2APIC API")?;
        } else {
            // Full IRQ chip (PIC + IOAPIC + LAPIC) — must exist before KVM_CREATE_VCPU
            vm_fd
                .create_irq_chip()
                .map_err(|e| crate::vmm::map_transient_to_contention(e, "create IRQ chip"))?;

            // PIT (timer) with dummy speaker port.
            // Only created with full IRQ chip — PIT routes through the in-kernel
            // IOAPIC (IRQ 0 -> GSI 2). With split IRQ chip there is no in-kernel
            // IOAPIC, so PIT creation fails.
            let pit_config = kvm_pit_config {
                flags: KVM_PIT_SPEAKER_DUMMY,
                ..Default::default()
            };
            vm_fd
                .create_pit2(pit_config)
                .map_err(|e| crate::vmm::map_transient_to_contention(e, "create PIT"))?;
        }

        // Disable PAUSE and HLT VM exits in performance mode.
        // Two separate enable_cap calls: kvm_disable_exits() uses |=
        // (additive), so multiple calls accumulate. Separate calls
        // ensure PAUSE succeeds unconditionally even if HLT is rejected.
        //
        // PAUSE: reduces vmexit overhead during guest spinlocks.
        //        Unconditionally allowed by KVM.
        // HLT:   eliminates the most frequent exit type during boot/idle.
        //        BSP shutdown uses I8042 reset (port 0x64, 0xFE via
        //        reboot=k) and VcpuExit::Shutdown, not VcpuExit::Hlt.
        //        KVM blocks HLT disable when mitigate_smt_rsb is active
        //        (host has X86_BUG_SMT_RSB and cpu_smt_possible()).
        //
        // Both calls are best-effort: a failure is reported on stderr
        // and construction continues.
        if performance_mode {
            let mut cap = kvm_enable_cap {
                cap: KVM_CAP_X86_DISABLE_EXITS,
                ..Default::default()
            };

            // 1. PAUSE — always allowed.
            cap.args[0] = KVM_X86_DISABLE_EXITS_PAUSE as u64;
            if let Err(e) = vm_fd.enable_cap(&cap) {
                eprintln!(
                    "performance_mode: WARNING: \
                     KVM_CAP_X86_DISABLE_EXITS (PAUSE) not supported: {e}"
                );
            }

            // 2. HLT — may fail on mitigate_smt_rsb hosts.
            cap.args[0] = KVM_X86_DISABLE_EXITS_HLT as u64;
            if let Err(e) = vm_fd.enable_cap(&cap) {
                eprintln!(
                    "performance_mode: WARNING: \
                     KVM_CAP_X86_DISABLE_EXITS (HLT) rejected: {e}"
                );
            }
        }

        // Set per-VM halt poll interval. Skipped in performance_mode:
        // KVM_HINTS_REALTIME enables guest haltpoll cpuidle, which writes
        // MSR_KVM_POLL_CONTROL=0 per-vCPU (arch_haltpoll_enable →
        // kvm_disable_host_haltpoll), disabling host halt polling via
        // kvm_arch_no_poll(). KVM_CAP_HALT_POLL is redundant there.
        //
        // When vCPUs exceed online host CPUs (overcommit), halt polling
        // wastes host CPU time — disable it. The interval is also set to
        // 0 when sysconf fails to report a positive CPU count.
        if !performance_mode {
            let host_cpus = unsafe { libc::sysconf(libc::_SC_NPROCESSORS_ONLN) };
            let poll_ns: u64 = if host_cpus > 0 && topo.total_cpus() <= host_cpus as u32 {
                HALT_POLL_NS
            } else {
                0
            };
            let mut cap = kvm_enable_cap {
                cap: KVM_CAP_HALT_POLL,
                ..Default::default()
            };
            cap.args[0] = poll_ns;
            if let Err(e) = vm_fd.enable_cap(&cap) {
                eprintln!(
                    "kvm: WARNING: KVM_CAP_HALT_POLL not supported ({e}), using kernel default"
                );
            }
        }

        // Eager mode (`Some`): compute the NUMA layout, then allocate and
        // register the real guest memory now. Deferred mode (`None`):
        // install a 4096-byte placeholder with no layout and no
        // reservation; `allocate_and_register_memory` replaces it later.
        let (guest_mem, numa_layout, reservation) = match memory_mib {
            Some(mb) => {
                let layout = NumaMemoryLayout::compute(&topo, mb, 0)?;
                let alloc =
                    layout.allocate_and_register(&vm_fd, use_hugepages, performance_mode)?;
                (alloc.guest_mem, Some(layout), Some(alloc.reservation))
            }
            None => {
                let placeholder = GuestMemoryMmap::<()>::from_ranges(&[(GuestAddress(0), 4096)])
                    .context("allocate placeholder guest memory")?;
                (placeholder, None, None)
            }
        };

        // Fetch host CPUID once, reuse for all vCPUs (Firecracker pattern).
        let base_cpuid = kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .context("get_supported_cpuid")?;

        // Create vCPUs with topology-specific CPUID. KVM_CREATE_VCPU
        // allocates per-vCPU kernel memory (struct kvm_vcpu, kvm_run
        // page, posted-interrupt descriptor); EMFILE / ENOMEM here is
        // host-resource pressure, not a test fault — route through
        // the contention classifier so the macro SKIPs cleanly.
        let total = topo.total_cpus();
        let mut vcpus = Vec::with_capacity(total as usize);
        for cpu_id in 0..total {
            let vcpu = vm_fd.create_vcpu(cpu_id as u64).map_err(|e| {
                crate::vmm::map_transient_to_contention(e, format!("create vCPU {cpu_id}"))
            })?;

            let cpuid_entries =
                generate_cpuid(base_cpuid.as_slice(), &topo, cpu_id, performance_mode);
            let cpuid = kvm_bindings::CpuId::from_entries(&cpuid_entries).context("build CpuId")?;
            vcpu.set_cpuid2(&cpuid)
                .with_context(|| format!("set CPUID for vCPU {cpu_id}"))?;

            vcpus.push(vcpu);
        }

        // Check TSC stability via KVM_GET_CLOCK. An unstable TSC
        // (missing KVM_CLOCK_TSC_STABLE) means kvmclock falls back to
        // host-side timekeeping per-vCPU, adding overhead to
        // clock_gettime and degrading timer accuracy. Common in nested
        // virtualization where the L0 hypervisor does not expose
        // constant TSC to L1.
        //
        // Only checked in performance_mode: non-perf tests use binary
        // pass/fail (cpuset, starvation) where timing precision doesn't
        // affect results.
        //
        // A get→set→get roundtrip is required: use_master_clock
        // starts false and is only evaluated by
        // pvclock_update_vm_gtod_copy(). That function is called by
        // kvm_vm_ioctl_set_clock() but NOT by kvm_vm_ioctl_get_clock()
        // or vCPU creation. Without the set_clock() call, get_clock()
        // always returns flags=0 regardless of actual TSC stability.
        //
        // Flags must be cleared before set_clock(): get_clock() may
        // set KVM_CLOCK_REALTIME, and set_clock() applies a realtime
        // adjustment when that flag is present (x86.c:7209-7215),
        // double-counting elapsed time. KVM_CLOCK_TSC_STABLE and
        // KVM_CLOCK_HOST_TSC are output-only and ignored by set_clock().
        //
        // The probe is diagnostic only: every failure path below prints
        // a warning and construction still succeeds.
        if performance_mode {
            match vm_fd.get_clock() {
                Ok(clock) => {
                    let mut set_data = clock;
                    set_data.flags = 0;
                    if let Err(e) = vm_fd.set_clock(&set_data) {
                        eprintln!(
                            "performance_mode: WARNING: KVM_SET_CLOCK failed ({e}), \
                             cannot check TSC stability"
                        );
                    } else {
                        match vm_fd.get_clock() {
                            Ok(clock2) => {
                                if clock2.flags & KVM_CLOCK_TSC_STABLE == 0 {
                                    eprintln!(
                                        "performance_mode: WARNING: TSC not stable \
                                         (KVM_CLOCK_TSC_STABLE not set), \
                                         timing measurements may have higher variance \
                                         (nested virt?)."
                                    );
                                }
                            }
                            Err(e) => {
                                eprintln!(
                                    "performance_mode: WARNING: KVM_GET_CLOCK failed ({e}), \
                                     cannot check TSC stability"
                                );
                            }
                        }
                    }
                }
                Err(e) => {
                    eprintln!(
                        "performance_mode: WARNING: KVM_GET_CLOCK failed ({e}), \
                         cannot check TSC stability"
                    );
                }
            }
        }

        Ok(KtstrKvm {
            kvm: ManuallyDrop::new(kvm),
            vm_fd: ManuallyDrop::new(vm_fd),
            vcpus,
            guest_mem: ManuallyDrop::new(guest_mem),
            topology: topo,
            numa_layout,
            has_immediate_exit,
            split_irqchip,
            use_hugepages,
            performance_mode,
            _reservation: reservation,
            cow_overlay_guards: Vec::new(),
        })
    }
}

/// Issue `KVM_GET_CLOCK` on a raw VM fd through `libc::ioctl`.
///
/// Raw-fd counterpart of the boot-time probe that goes through the
/// safe wrapper. The freeze coordinator (see
/// [`crate::vmm::freeze_coord`]) uses it for the freeze rendezvous
/// save/restore: its `freeze_and_capture` + `thaw_and_barrier` sibling
/// closures can't borrow `&vm.vm_fd` (vm is consumed by a downstream
/// closure in the same scope), so they work from the raw fd (Copy)
/// cached at coord-thread spawn time.
///
/// Mirrors `kvm_ioctls::VmFd::get_clock` — same ioctl number
/// (`KVM_GET_CLOCK = KVMIO | 0x7c`), same `kvm_clock_data` payload,
/// same error mapping. The kernel side
/// (`arch/x86/kvm/x86.c kvm_vm_ioctl_get_clock` → `get_kvmclock`)
/// is a pure seqcount read with no lock acquisition.
///
/// The save/restore pairing keeps the guest's post-resume kvm_clock
/// view at the parked-state value rather than the freeze-advanced host
/// monotonic. The planned per-vCPU `KVM_KVMCLOCK_CTRL` emit at freeze
/// entry is complementary — it sets `PVCLOCK_GUEST_STOPPED` so the
/// guest's soft-lockup watchdog (`pvclock_touch_watchdogs` in
/// `arch/x86/kernel/pvclock.c`) skips the freeze interval and does not
/// fire on long freezes.
///
/// Returns the clock data on success, or the OS error reported for the
/// failed ioctl.
pub(crate) fn kvm_get_clock_via_raw_fd(
    vm_fd: i32,
) -> std::io::Result<kvm_bindings::kvm_clock_data> {
    // The request number is KVMIO | 0x7c built with ioctl_ior_nr! (per
    // kvm-ioctls 0.24.0 kvm_ioctls.rs:109) and has the payload size
    // baked in: 8 (clock) + 4 (flags) + 4 (pad0) + 8 (realtime) +
    // 8 (host_tsc) + 4*4 (pad) = 48 bytes, i.e. `_IOC_SIZE` 0x30. Should
    // kvm-bindings ever change the struct size, the hard-coded number
    // would no longer match what the kernel expects and the call would
    // only fail at runtime — so pin the size at compile time instead.
    const _: () = assert!(std::mem::size_of::<kvm_bindings::kvm_clock_data>() == 48);
    const KVM_GET_CLOCK_IOCTL: libc::c_ulong = 0x8030_ae7c;

    let mut data = kvm_bindings::kvm_clock_data::default();
    let out_ptr: *mut kvm_bindings::kvm_clock_data = &mut data;
    // SAFETY: `vm_fd` is a valid kvm_vmfd (caller is the freeze
    // coordinator, which got the fd from vm.vm_fd.as_raw_fd() at
    // closure-definition time and the fd is alive for the
    // duration of `run_vm`). `kvm_clock_data` is `#[repr(C)]`
    // POD; the kernel writes <= sizeof::<kvm_clock_data>() bytes.
    let status = unsafe { libc::ioctl(vm_fd, KVM_GET_CLOCK_IOCTL, out_ptr) };
    if status < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(data)
}

/// Call `KVM_SET_CLOCK` via a raw VM fd (libc::ioctl direct).
/// Sibling of [`kvm_get_clock_via_raw_fd`] for the restore-side of
/// the freeze rendezvous kvm_clock save/restore. Mirrors
/// `kvm_ioctls::VmFd::set_clock`. The underlying ioctl path
/// (`arch/x86/kvm/x86.c kvm_vm_ioctl_set_clock`) takes the
/// `pvclock_sc` seqcount write side, recomputes
/// `master_kernel_ns`, sets `ka->kvmclock_offset = data.clock -
/// now_raw_ns`, then queues `KVM_REQ_CLOCK_UPDATE` on every vCPU
/// (processed at the next KVM_RUN entry per-vCPU).
///
/// Caller MUST clear `flags` to 0 before calling (per the
/// boot-time precedent above) — leaving `KVM_CLOCK_REALTIME` in
/// flags causes the kernel to apply a realtime adjustment that
/// double-counts elapsed time.
///
/// Returns `Ok(())` on success, or the OS error reported for the
/// failed ioctl.
pub(crate) fn kvm_set_clock_via_raw_fd(
    vm_fd: i32,
    clock: &kvm_bindings::kvm_clock_data,
) -> std::io::Result<()> {
    // KVMIO | 0x7b, ioctl_iow_nr! per kvm-ioctls 0.24.0
    // kvm_ioctls.rs:106. The request number encodes the payload size
    // (`_IOC_SIZE` = 0x30 = 48 bytes), exactly like the GET variant, so
    // guard the struct size at compile time here as well: a size change
    // in kvm-bindings must break the build rather than fail at runtime.
    const _: () = assert!(std::mem::size_of::<kvm_bindings::kvm_clock_data>() == 48);
    const KVM_SET_CLOCK_IOCTL: libc::c_ulong = 0x4030_ae7b;
    // SAFETY: `vm_fd` is a valid kvm_vmfd (see SAFETY note on
    // [`kvm_get_clock_via_raw_fd`]). The kernel reads exactly
    // sizeof::<kvm_clock_data>() bytes from the pointer; the
    // payload is `#[repr(C)]` POD.
    let rc = unsafe {
        libc::ioctl(
            vm_fd,
            KVM_SET_CLOCK_IOCTL,
            clock as *const kvm_bindings::kvm_clock_data,
        )
    };
    if rc < 0 {
        Err(std::io::Error::last_os_error())
    } else {
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::os::fd::AsRawFd;
    use vm_memory::GuestMemory;

    /// A 1 LLC x 2 cores x 1 thread topology yields a VM with two vCPUs.
    #[test]
    fn create_vm_basic() {
        let two_core = Topology {
            llcs: 1,
            cores_per_llc: 2,
            threads_per_core: 1,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let created = KtstrKvm::new(two_core, 128, false);
        assert!(created.is_ok(), "VM creation failed: {:?}", created.err());
        let machine = created.unwrap();
        assert_eq!(machine.vcpus.len(), 2);
    }

    /// 2 LLCs x 2 cores x 2 threads yields eight vCPUs.
    #[test]
    fn create_vm_multi_llc() {
        let dual_llc = Topology {
            llcs: 2,
            cores_per_llc: 2,
            threads_per_core: 2,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let created = KtstrKvm::new(dual_llc, 256, false);
        assert!(
            created.is_ok(),
            "multi-LLC VM creation failed: {:?}",
            created.err()
        );
        let machine = created.unwrap();
        assert_eq!(machine.vcpus.len(), 8);
    }

    /// The minimal 1 x 1 x 1 topology yields a VM with exactly one vCPU.
    #[test]
    fn create_vm_single_cpu() {
        let topo = Topology {
            llcs: 1,
            cores_per_llc: 1,
            threads_per_core: 1,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let vm = KtstrKvm::new(topo, 64, false);
        // Report the underlying error on failure, like the sibling
        // creation tests do, instead of a bare "assertion failed".
        assert!(vm.is_ok(), "single-CPU VM creation failed: {:?}", vm.err());
        assert_eq!(vm.unwrap().vcpus.len(), 1);
    }

    /// 4 LLCs x 4 cores x 2 threads yields thirty-two vCPUs.
    #[test]
    fn create_vm_large_topology() {
        let wide = Topology {
            llcs: 4,
            cores_per_llc: 4,
            threads_per_core: 2,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let created = KtstrKvm::new(wide, 512, false);
        assert!(
            created.is_ok(),
            "large topology failed: {:?}",
            created.err()
        );
        let machine = created.unwrap();
        assert_eq!(machine.vcpus.len(), 32);
    }

    /// Non-power-of-two dimensions (3 LLCs x 3 cores) yield nine vCPUs.
    #[test]
    fn create_vm_odd_topology() {
        let three_by_three = Topology {
            llcs: 3,
            cores_per_llc: 3,
            threads_per_core: 1,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let created = KtstrKvm::new(three_by_three, 128, false);
        assert!(
            created.is_ok(),
            "odd topology failed: {:?}",
            created.err()
        );
        let machine = created.unwrap();
        assert_eq!(machine.vcpus.len(), 9);
    }

    /// The guest memory regions of a 256 MiB VM add up to 256 MiB.
    #[test]
    fn memory_size_correct() {
        use vm_memory::GuestMemoryRegion;
        let single_cpu = Topology {
            llcs: 1,
            cores_per_llc: 1,
            threads_per_core: 1,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let machine = KtstrKvm::new(single_cpu, 256, false).unwrap();
        let mut mapped_bytes: u64 = 0;
        for region in machine.guest_mem.iter() {
            mapped_bytes += region.len();
        }
        assert_eq!(mapped_bytes, 256 << 20);
    }

    /// Pins the TSS address to the value documented as Firecracker's.
    #[test]
    fn tss_address_matches_firecracker() {
        let expected: u64 = 0xfffb_d000;
        assert_eq!(KVM_TSS_ADDRESS, expected);
    }

    /// The identity map sits three 4 KiB pages above the TSS address.
    #[test]
    fn identity_map_follows_tss() {
        let three_pages: u64 = 3 * 4096;
        assert_eq!(KVM_IDENTITY_MAP_ADDRESS, KVM_TSS_ADDRESS + three_pages);
        let absolute: u64 = 0xfffc_0000;
        assert_eq!(KVM_IDENTITY_MAP_ADDRESS, absolute);
    }

    /// Guards against the required-capability list being emptied or
    /// shrunk below the 14 entries it is documented to contain. The
    /// second assertion subsumes the first.
    #[test]
    fn required_caps_non_empty() {
        assert!(!REQUIRED_CAPS.is_empty());
        assert!(REQUIRED_CAPS.len() >= 14);
    }

    /// A 16-vCPU topology stays below the xAPIC limit and therefore
    /// gets the full in-kernel IRQ chip.
    #[test]
    fn small_topology_uses_full_irqchip() {
        let topo = Topology {
            llcs: 2,
            cores_per_llc: 4,
            threads_per_core: 2,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        // Highest APIC ID is apic_id(15) = 1<<3 | 3<<1 | 1 = 15 — far
        // below the 254 cutoff.
        assert!(max_apic_id(&topo) <= MAX_XAPIC_ID);
        let machine = KtstrKvm::new(topo, 256, false).unwrap();
        let uses_split = machine.split_irqchip;
        assert!(!uses_split, "small topology should use full IRQ chip");
    }

    /// A topology whose highest APIC ID exceeds the xAPIC limit must
    /// select the split IRQ chip.
    #[test]
    fn large_topology_uses_split_irqchip() {
        // 15 LLCs x 8 cores x 2 threads (240 vCPUs) is not enough:
        // apic_id(239) = 14<<4 | 7<<1 | 1 = 239, still under 254.
        // 14 LLCs x 9 cores x 2 threads (252 vCPUs) is: with
        // core_bits = bits_needed(9) = 4 and thread_bits = 1 the core
        // shift is 5, so apic_id(251) = 13<<5 | 8<<1 | 1 = 433.
        let topo = Topology {
            llcs: 14,
            cores_per_llc: 9,
            threads_per_core: 2,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let peak_apic_id = max_apic_id(&topo);
        assert!(
            peak_apic_id > MAX_XAPIC_ID,
            "max APIC ID {} should exceed {}",
            peak_apic_id,
            MAX_XAPIC_ID,
        );
        let machine = match KtstrKvm::new(topo, 4096, false) {
            Ok(built) => built,
            Err(e) => {
                // Not every host accepts a 252-vCPU VM (EEXIST from
                // KVM_CREATE_VCPU when split irqchip + x2APIC run into
                // host KVM limitations). The APIC ID assertion above
                // already covers the selection logic, so skip the
                // creation check on such hosts.
                skip!("large_topology VM creation: {e:#}");
            }
        };
        let uses_split = machine.split_irqchip;
        assert!(uses_split, "large topology should use split IRQ chip");
        assert_eq!(machine.vcpus.len(), 252);
    }

    /// Topologies whose highest APIC ID stays at or below the xAPIC
    /// limit must keep the full IRQ chip.
    #[test]
    fn split_irqchip_boundary() {
        // Both topologies below sit under the 254 cutoff (max APIC IDs
        // 127 and 239); neither is exactly at the boundary.
        // 8 LLCs x 8 cores x 2 threads: core_shift = 4, max APIC ID = 7<<4 | 7<<1 | 1 = 127
        let small = Topology {
            llcs: 8,
            cores_per_llc: 8,
            threads_per_core: 2,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        assert!(
            max_apic_id(&small) <= MAX_XAPIC_ID,
            "8l/8c/2t max APIC ID {} should be <= 254",
            max_apic_id(&small),
        );
        let vm = KtstrKvm::new(small, 2048, false).unwrap();
        assert!(!vm.split_irqchip);

        // 15 LLCs x 8 cores x 2 threads: core_shift = 4, max APIC ID = 14<<4 | 7<<1 | 1 = 239
        let still_small = Topology {
            llcs: 15,
            cores_per_llc: 8,
            threads_per_core: 2,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        assert!(
            max_apic_id(&still_small) <= MAX_XAPIC_ID,
            "15l/8c/2t max APIC ID {} should be <= 254",
            max_apic_id(&still_small),
        );
        let vm = KtstrKvm::new(still_small, 4096, false).unwrap();
        assert!(!vm.split_irqchip);
    }

    /// The VM built by the shared `single_vcpu_kvm` test helper must
    /// report the immediate_exit capability as present.
    #[test]
    fn immediate_exit_cap_detected() {
        use crate::vmm::x86_64::test_helpers::single_vcpu_kvm;
        let vm = single_vcpu_kvm();
        // KVM_CAP_IMMEDIATE_EXIT is available since Linux 4.12.
        assert!(vm.has_immediate_exit);
    }

    /// VM construction must succeed with performance_mode enabled.
    #[test]
    fn performance_mode_succeeds() {
        let two_core = Topology {
            llcs: 1,
            cores_per_llc: 2,
            threads_per_core: 1,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let created = KtstrKvm::new(two_core, 128, true);
        assert!(
            created.is_ok(),
            "performance_mode VM creation failed: {:?}",
            created.err()
        );
    }

    #[test]
    fn performance_mode_does_not_affect_vcpu_count() {
        // Build the same 2 LLC x 2 core x 2 thread topology with the
        // performance flag off and then on; the number of vCPUs held
        // by the two VMs must match.
        let topo = Topology {
            threads_per_core: 2,
            cores_per_llc: 2,
            llcs: 2,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let build = |performance_mode: bool| KtstrKvm::new(topo, 256, performance_mode).unwrap();
        // Both VMs are kept alive until the end of the test.
        let vm_normal = build(false);
        let vm_perf = build(true);
        let normal_count = vm_normal.vcpus.len();
        let perf_count = vm_perf.vcpus.len();
        assert_eq!(normal_count, perf_count);
    }

    #[test]
    fn halt_poll_ns_constant() {
        // Pins the constant so an accidental edit is caught. Going by
        // its name the unit is nanoseconds, i.e. 200 microseconds.
        let expected = 200_000;
        assert_eq!(HALT_POLL_NS, expected);
    }

    #[test]
    fn non_perf_mode_succeeds_with_halt_poll() {
        // Same small topology as the performance-mode smoke test, but
        // with performance_mode = false. Per the test name this is the
        // path that configures halt polling; only success of the
        // constructor is checked.
        let two_core_topology = Topology {
            numa_nodes: 1,
            llcs: 1,
            cores_per_llc: 2,
            threads_per_core: 1,
            nodes: None,
            distances: None,
        };
        let outcome = KtstrKvm::new(two_core_topology, 128, false);
        assert!(
            outcome.is_ok(),
            "non-perf VM with halt poll failed: {:?}",
            outcome.err()
        );
    }

    #[test]
    fn disable_exits_hlt_bit_value() {
        // The HLT exit-disable flag is expected to be bit 1 of the
        // mask, i.e. the value 0b10.
        let bit_one_mask = 0b10;
        assert_eq!(KVM_X86_DISABLE_EXITS_HLT, bit_one_mask);
    }

    #[test]
    fn disable_exits_pause_and_hlt_no_overlap() {
        // The two exit-disable flags must be different values and must
        // not share any set bit, so they can be requested separately.
        let (pause, hlt) = (KVM_X86_DISABLE_EXITS_PAUSE, KVM_X86_DISABLE_EXITS_HLT);
        assert_ne!(pause, hlt, "PAUSE and HLT bits must be distinct");
        let shared_bits = pause & hlt;
        assert_eq!(shared_bits, 0, "PAUSE and HLT bits must not overlap");
    }

    #[test]
    fn tsc_stability_check_roundtrip() {
        // Exercise get -> set (flags cleared) -> get on a VM created
        // with performance_mode = true. Every call must succeed; the
        // resulting flags are read but deliberately not asserted.
        let vm = KtstrKvm::new(
            Topology {
                llcs: 1,
                cores_per_llc: 2,
                threads_per_core: 1,
                numa_nodes: 1,
                nodes: None,
                distances: None,
            },
            64,
            true,
        )
        .unwrap();
        let vm_fd = &vm.vm_fd;

        // Read the clock, clear the flags word, and write it back.
        let mut zero_flags = vm_fd.get_clock().unwrap();
        zero_flags.flags = 0;
        vm_fd.set_clock(&zero_flags).unwrap();

        // Per the original note, KVM_CLOCK_TSC_STABLE should be set
        // here on bare metal with an invariant TSC but may be absent
        // under nested virtualization, hence no assertion on it.
        let reread = vm_fd.get_clock().unwrap();
        let _ = reread.flags & KVM_CLOCK_TSC_STABLE;
    }

    #[test]
    fn kvm_clock_data_default_is_zeroed() {
        let clock = kvm_bindings::kvm_clock_data::default();
        assert_eq!(clock.clock, 0);
        assert_eq!(clock.flags, 0);
        assert_eq!(clock.pad0, 0);
        assert_eq!(clock.realtime, 0);
        assert_eq!(clock.host_tsc, 0);
        assert_eq!(clock.pad, [0u32; 4]);
    }

    #[test]
    fn kvm_clock_data_size_matches_ioctl_encoding() {
        // The KVM_GET_CLOCK / KVM_SET_CLOCK request numbers used by
        // this file carry a hand-encoded size field of 0x30 (48), so
        // the struct must be exactly that many bytes. The original
        // note says a compile-time assert next to the first constant
        // already guards this; the runtime check is a second line of
        // defence in case that assert is ever lost.
        let encoded_size = 0x30;
        let actual_size = std::mem::size_of::<kvm_bindings::kvm_clock_data>();
        assert_eq!(actual_size, encoded_size);
    }

    #[test]
    fn raw_fd_get_clock_matches_safe_wrapper() {
        // Cross-check: the hand-encoded KVM_GET_CLOCK request number
        // (0x8030_ae7c) must reach the same kernel path as
        // kvm_ioctls::VmFd::get_clock.
        //
        // If the number were not a valid request for the VM fd, the
        // kernel would reject the call with ENOTTY (errno 25; errno 22
        // is EINVAL, a different failure). The raw helper is expected
        // to report a failed ioctl as Err, so the `.expect` below
        // would panic before any assertion runs. If the number aimed
        // at a different ioctl, the value read back would not track
        // kvm_clock and one of the two comparisons below would fail.
        let topo = Topology {
            llcs: 1,
            cores_per_llc: 1,
            threads_per_core: 1,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let vm = KtstrKvm::new(topo, 64, false).unwrap();
        let raw_fd = vm.vm_fd.as_raw_fd();
        // Order matters: the safe read happens first, the raw read second.
        let via_safe = vm.vm_fd.get_clock().expect("safe GET_CLOCK");
        let via_raw = super::kvm_get_clock_via_raw_fd(raw_fd).expect("raw GET_CLOCK");
        // Both reads observe the same VM clock, which is expected to be
        // monotonic non-decreasing, so the later read must not be
        // smaller than the earlier one.
        assert!(
            via_raw.clock >= via_safe.clock,
            "raw-fd GET regressed below safe GET (raw={}, safe={}) — ioctl number drift",
            via_raw.clock,
            via_safe.clock,
        );
        // Less than one second (1e9 clock units, treated as ns) between
        // the two reads means both came from the same clock, not some
        // unrelated kernel time source. The subtraction cannot
        // underflow because of the assertion above.
        const MAX_DRIFT_NS: u64 = 1_000_000_000;
        let drift_ns = via_raw.clock - via_safe.clock;
        assert!(
            drift_ns < MAX_DRIFT_NS,
            "raw-fd vs safe GET differ by >1s (raw={}, safe={}) — likely different kernel state",
            via_raw.clock,
            via_safe.clock,
        );
    }

    #[test]
    fn raw_fd_set_clock_roundtrip_with_flags_zero() {
        use super::{kvm_get_clock_via_raw_fd, kvm_set_clock_via_raw_fd};

        // Round-trip the VM clock through the raw-fd helpers only:
        // read it, clear the flags word, write it back, read again.
        // Every call must succeed and the clock must not go backwards.
        let vm = KtstrKvm::new(
            Topology {
                llcs: 1,
                cores_per_llc: 1,
                threads_per_core: 1,
                numa_nodes: 1,
                nodes: None,
                distances: None,
            },
            64,
            false,
        )
        .unwrap();
        let raw_fd = vm.vm_fd.as_raw_fd();

        let mut clock = kvm_get_clock_via_raw_fd(raw_fd).expect("raw GET_CLOCK");
        clock.flags = 0;
        kvm_set_clock_via_raw_fd(raw_fd, &clock).expect("raw SET_CLOCK");

        let after = kvm_get_clock_via_raw_fd(raw_fd).expect("raw GET_CLOCK after");
        assert!(after.clock >= clock.clock);
    }

    #[test]
    fn performance_mode_with_hlt_disable_succeeds() {
        // Per the original note, performance_mode issues two separate
        // enable_cap calls: PAUSE first, then HLT, and the HLT one may
        // be rejected by the host (mitigate_smt_rsb). Whether or not
        // that happens, constructing the VM must still return Ok.
        let hlt_topology = Topology {
            numa_nodes: 1,
            llcs: 1,
            cores_per_llc: 2,
            threads_per_core: 1,
            nodes: None,
            distances: None,
        };
        let result = KtstrKvm::new(hlt_topology, 128, true);
        let created = result.is_ok();
        assert!(
            created,
            "performance_mode with HLT disable failed: {:?}",
            result.err()
        );
    }
}