supermachine 0.7.69

//! KVM (Linux/x86_64) implementation of the portable [`crate::hypervisor`]
//! backend contract — the sibling of `crate::hvf`. Increment 1: VM + in-kernel
//! irqchip lifecycle, guest-memory mapping, vCPU creation + CPUID, and the
//! run/exit primitive (`step()` → [`VcpuExit`]).
//!
//! The seam's register model is aarch64-architectural; on x86 those methods
//! (`CoreReg`/`SysReg`) are vestigial — the x86 orchestration sets RIP/CR*/EFER
//! on the concrete [`KvmVcpu`] via [`KvmVcpu::enter_long_mode`] and friends, and
//! drives the loop through `step()`. Snapshot (increment 6) and cross-thread
//! force-exit (increment 5) build on that primitive; both are implemented below.

use std::cell::RefCell;
use std::collections::HashMap;
use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, AtomicUsize, Ordering};
use std::sync::{Arc, Mutex, Once, OnceLock};

use kvm_bindings::{
    kvm_clock_data, kvm_debugregs, kvm_fpu, kvm_irqchip, kvm_lapic_state, kvm_mp_state,
    kvm_msr_entry, kvm_pit_state2, kvm_regs, kvm_segment, kvm_sregs, kvm_userspace_memory_region,
    kvm_vcpu_events, kvm_xcrs, CpuId, Msrs, KVM_MAX_CPUID_ENTRIES, KVM_MEM_READONLY,
    KVM_MP_STATE_UNINITIALIZED,
};
use kvm_ioctls::{IoEventAddress, Kvm, NoDatamatch, VcpuExit as KvmExit, VcpuFd, VmFd};
use vmm_sys_util::eventfd::EventFd;

use crate::hypervisor::{CoreReg, HypervisorVcpu, HypervisorVm, SysReg, VcpuExit, VcpuHandle};

pub mod run;

/// Backend error: a flat message wrapping the underlying KVM errno (the
/// orchestration only ever surfaces these as strings).
///
/// The single field is the fully formatted, human-readable message. The `From`
/// impls in this module prefix it with the error's origin (`kvm:`, `kvm io:`,
/// `kvm boot:`); `Display` prints it verbatim.
#[derive(Debug)]
pub struct KvmError(String);

impl KvmError {
    fn unsupported(what: &str) -> Self {
        KvmError(format!("kvm: unsupported ({what})"))
    }
}

impl From<kvm_ioctls::Error> for KvmError {
    fn from(e: kvm_ioctls::Error) -> Self {
        KvmError(format!("kvm: {e}"))
    }
}

impl From<std::io::Error> for KvmError {
    fn from(e: std::io::Error) -> Self {
        KvmError(format!("kvm io: {e}"))
    }
}

impl From<crate::arch::x86_64::boot::BootError> for KvmError {
    fn from(e: crate::arch::x86_64::boot::BootError) -> Self {
        KvmError(format!("kvm boot: {e}"))
    }
}

impl std::fmt::Display for KvmError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.0)
    }
}

// Marker impl: the derived `Debug` and the `Display` impl above supply
// everything `Error` requires. No method is overridden, so the trait's
// defaults apply.
impl std::error::Error for KvmError {}

impl crate::hypervisor::BackendError for KvmError {
    /// Wrap an arbitrary message as a backend error. The text is stored as-is;
    /// no `kvm:` prefix is added.
    fn other(msg: &str) -> Self {
        Self(msg.to_owned())
    }
}

/// A VM + its in-kernel interrupt controller. Owns the `/dev/kvm` handle and
/// the VM fd; tracks memory slots so a region can be unmapped by gpa.
pub struct KvmVm {
    /// The `/dev/kvm` handle the VM was created from. Not read by the code in
    /// view after construction; presumably kept only so the handle stays open
    /// for the VM's lifetime.
    _kvm: Kvm,
    // `Arc` so the seam's `irq_line()` can hand a cloneable, 'static IRQ-raise
    // closure to device threads (they call `set_irq_line` on this fd). All
    // `self.vm.*` calls deref through the Arc unchanged.
    vm: Arc<VmFd>,
    /// Next unused memory-slot number; every `map_ram` call takes one.
    next_slot: AtomicU32,
    /// Next per-VM vCPU index; every `create_vcpu` call takes one.
    next_vcpu: AtomicU64,
    /// Host-supported CPUID with the CET bits masked off in `create()`;
    /// applied to each vCPU this VM creates.
    supported_cpuid: CpuId,
    /// gpa → slot, so `unmap_ram` can find the slot to zero.
    slots: Mutex<HashMap<u64, u32>>,
}

impl HypervisorVm for KvmVm {
    type Error = KvmError;
    type Vcpu = KvmVcpu;

    /// Open KVM, create a VM with an in-kernel irqchip, and cache the
    /// host-supported CPUID (with the CET bits masked) for later vCPU creation.
    ///
    /// # Errors
    /// Any failing KVM call is returned as a [`KvmError`].
    fn create() -> Result<Self, KvmError> {
        let kvm = Kvm::new()?;
        let vm = kvm.create_vm()?;
        // x86 prerequisite for the in-kernel irqchip: the TSS region is set
        // before KVM_CREATE_IRQCHIP. No identity-map address is configured
        // here, so whatever KVM uses by default for that page applies.
        // NOTE(review): confirm that default never overlaps guest RAM for the
        // memory layouts this backend maps.
        vm.set_tss_address(0xfffb_d000)?;
        vm.create_irq_chip()?;
        let mut supported_cpuid = kvm.get_supported_cpuid(KVM_MAX_CPUID_ENTRIES)?;
        // Mask CET in leaf 7/0: ECX[7]=SHSTK (user shadow stack), EDX[20]=IBT
        // (indirect-branch tracking). We don't set up CET state, so a kernel
        // that enables it would fault all userspace immediately (no-endbr64
        // entry → #CP, RET → shadow-stack mismatch). Proven necessary by the
        // kvm-boot spike.
        for e in supported_cpuid.as_mut_slice() {
            if e.function == 7 && e.index == 0 {
                e.ecx &= !(1 << 7);
                e.edx &= !(1 << 20);
            }
        }
        Ok(KvmVm {
            _kvm: kvm,
            vm: Arc::new(vm),
            next_slot: AtomicU32::new(0),
            next_vcpu: AtomicU64::new(0),
            supported_cpuid,
            slots: Mutex::new(HashMap::new()),
        })
    }

    /// Map `len` bytes of host memory at `host_ptr` into the guest at `gpa`,
    /// using a fresh memory slot. The region is registered read-only unless
    /// `prot` contains `prot::WRITE`. On success the gpa → slot pairing is
    /// recorded so `unmap_ram` can delete the slot later.
    ///
    /// # Safety
    /// The caller must keep `host_ptr..host_ptr + len` valid for as long as the
    /// region stays mapped into the VM.
    ///
    /// # Errors
    /// Returns the KVM error if the region is rejected. The slot number taken
    /// for the attempt is not handed out again.
    unsafe fn map_ram(
        &self,
        host_ptr: *mut u8,
        gpa: u64,
        len: usize,
        prot: u64,
    ) -> Result<(), KvmError> {
        let slot = self.next_slot.fetch_add(1, Ordering::Relaxed);
        let mut flags = 0u32;
        if prot & crate::hypervisor::prot::WRITE == 0 {
            flags |= KVM_MEM_READONLY;
        }
        let region = kvm_userspace_memory_region {
            slot,
            guest_phys_addr: gpa,
            memory_size: len as u64,
            userspace_addr: host_ptr as u64,
            flags,
        };
        // SAFETY: caller guarantees host_ptr stays valid for the VM's lifetime.
        unsafe { self.vm.set_user_memory_region(region)? };
        self.slots.lock().unwrap().insert(gpa, slot);
        Ok(())
    }

    /// Delete the memory slot that was mapped at `gpa`. A `gpa` that was never
    /// mapped (or was already unmapped) is a no-op. `_len` is unused: the slot
    /// is looked up by its base address alone.
    ///
    /// The gpa → slot entry is dropped only after KVM has accepted the
    /// deletion, so a failed call leaves the bookkeeping intact and the unmap
    /// can be retried.
    ///
    /// # Safety
    /// The caller must guarantee no vCPU is accessing the range.
    ///
    /// # Errors
    /// Returns the KVM error if the slot deletion is rejected.
    unsafe fn unmap_ram(&self, gpa: u64, _len: usize) -> Result<(), KvmError> {
        let mut slots = self.slots.lock().unwrap();
        let Some(&slot) = slots.get(&gpa) else {
            return Ok(());
        };
        // memory_size = 0 deletes the slot.
        let region = kvm_userspace_memory_region {
            slot,
            guest_phys_addr: gpa,
            memory_size: 0,
            userspace_addr: 0,
            flags: 0,
        };
        // SAFETY: caller guarantees no vCPU is accessing this range.
        unsafe { self.vm.set_user_memory_region(region)? };
        // Forget the slot only now that the kernel has actually released it.
        slots.remove(&gpa);
        Ok(())
    }

    /// Create the next vCPU of this VM, apply the cached CPUID, and register
    /// its cross-thread control block in the process-global registry.
    ///
    /// # Errors
    /// Returns the KVM error if vCPU creation or `set_cpuid2` fails.
    fn create_vcpu(&self) -> Result<KvmVcpu, KvmError> {
        // Per-VM KVM vCPU index (the APIC id) — must be 0..N within this VM.
        let kvm_index = self.next_vcpu.fetch_add(1, Ordering::Relaxed);
        let mut vcpu = self.vm.create_vcpu(kvm_index)?;
        vcpu.set_cpuid2(&self.supported_cpuid)?;
        // Register the cross-thread control block under a process-global id +
        // ensure the SIGUSR1 handler is installed before this vCPU can run.
        install_force_exit_signal();
        let id = NEXT_REG_ID.fetch_add(1, Ordering::Relaxed);
        // Publish the immediate_exit flag pointer NOW (main thread, before the
        // run thread is spawned). The kvm_run page is an mmap on the vCPU fd
        // with a process-stable address, so a pointer captured here is valid for
        // the lifetime of the fd regardless of which thread later runs it. This
        // closes the start()/teardown race: a force_exit issued before the run
        // thread binds itself (tid==0) still gates guest entry via this flag.
        let immediate_exit_ptr = std::ptr::addr_of_mut!(vcpu.get_kvm_run().immediate_exit) as usize;
        let reg = Arc::new(VcpuReg {
            tid: AtomicU64::new(0),
            exit: AtomicBool::new(false),
            immediate_exit_ptr: AtomicUsize::new(immediate_exit_ptr),
        });
        registry().lock().unwrap().insert(id, reg.clone());
        Ok(KvmVcpu {
            vcpu: RefCell::new(vcpu),
            id,
            reg,
            bound: AtomicBool::new(false),
        })
    }

    /// Drive interrupt line `intid` of the in-kernel irqchip to `level`.
    fn set_irq(&self, intid: u32, level: bool) -> Result<(), KvmError> {
        // The in-kernel irqchip (IOAPIC/PIC) is per-VM on KVM; drive the GSI line.
        self.vm.set_irq_line(intid, level)?;
        Ok(())
    }

    /// Return a cloneable, `'static` closure that raises/lowers an interrupt
    /// line on this VM. Errors from the underlying call are discarded, since
    /// the closure has no way to report them.
    fn irq_line(&self) -> Arc<dyn Fn(u32, bool) + Send + Sync> {
        // Clone the shared VM-fd handle into the closure so device threads can
        // drive the per-VM in-kernel irqchip after the borrow ends.
        let vm = Arc::clone(&self.vm);
        Arc::new(move |intid, level| {
            let _ = vm.set_irq_line(intid, level);
        })
    }

    /// Serialize the in-kernel device state into one blob: PIT, the three
    /// irqchips, then the clock, each as a length-prefixed POD value.
    fn capture_intc(&self) -> Result<Vec<u8>, KvmError> {
        // Serialize PIT + 3 irqchips (PIC master/slave + IOAPIC) + kvmclock as
        // length-prefixed POD blobs. (The full snapshot path in `kvm::run` has
        // its own inline serialization for SMSNAP05; this standalone blob is the
        // seam form the backend-agnostic snapshot pipeline consumes.)
        let s = self.capture_devices()?;
        let mut out = Vec::new();
        push_pod(&mut out, &s.pit);
        for chip in &s.irqchips {
            push_pod(&mut out, chip);
        }
        push_pod(&mut out, &s.clock);
        Ok(out)
    }

    /// Decode a blob produced by `capture_intc` (same field order) and apply it
    /// to the in-kernel devices.
    ///
    /// # Errors
    /// Fails if the blob is truncated, a field's stored size does not match,
    /// or KVM rejects the restored state.
    fn restore_intc(&self, blob: &[u8]) -> Result<(), KvmError> {
        let mut p = 0usize;
        let pit = read_pod(blob, &mut p)?;
        let irqchips = [
            read_pod(blob, &mut p)?,
            read_pod(blob, &mut p)?,
            read_pod(blob, &mut p)?,
        ];
        let clock = read_pod(blob, &mut p)?;
        self.restore_devices(&KvmDeviceState {
            pit,
            irqchips,
            clock,
        })
    }

    /// Build the DAX mapper for this VM; it shares ownership of the VM.
    fn dax_mapper(self: &Arc<Self>) -> Arc<dyn crate::fuse::HvfMapper> {
        // The KVM mapper retains a shared VM handle for memslot map/unmap.
        crate::kvm::run::kvm_dax_mapper(Arc::clone(self))
    }

    /// Host `CLOCK_MONOTONIC` reading in nanoseconds (wrapping arithmetic).
    fn host_monotonic_ticks() -> u64 {
        // CLOCK_MONOTONIC in nanoseconds. Currently informational on KVM: the
        // guest clock travels in the VM-global device blob (kvm_clock_data via
        // capture_intc/restore_intc) and KVM re-anchors the TSC on KVM_SET_CLOCK,
        // so capture_clock_ref / restore_clock ignore this value. Kept correct
        // for when vmm::snapshot drives both backends through one container.
        let mut ts = libc::timespec {
            tv_sec: 0,
            tv_nsec: 0,
        };
        // SAFETY: writes into a local timespec; CLOCK_MONOTONIC is always present.
        unsafe { libc::clock_gettime(libc::CLOCK_MONOTONIC, &mut ts) };
        (ts.tv_sec as u64)
            .wrapping_mul(1_000_000_000)
            .wrapping_add(ts.tv_nsec as u64)
    }

    /// Load a Linux guest into `mem` and point `vcpu` (the BSP) at its entry.
    ///
    /// # Errors
    /// Fails if boot setup rejects the inputs or the registers cannot be
    /// applied to the vCPU.
    fn boot_linux(
        &self,
        vcpu: &KvmVcpu,
        mem: &mut [u8],
        cfg: &crate::hypervisor::LinuxBootConfig,
    ) -> Result<(), KvmError> {
        // x86: write kernel + initrd + boot_params + GDT into guest RAM and
        // compute the long-mode entry registers, then apply them to the BSP. The
        // FDT field is unused (x86 has no device tree; the cmdline carries the
        // virtio-mmio device list). SMP (MP table) is the caller's concern.
        use crate::arch::x86_64::boot::{self, BootConfig};
        let bcfg = BootConfig {
            mem_size: cfg.ram_size,
            cmdline: cfg.cmdline,
            bzimage: cfg.kernel,
            initrd: cfg.initrd,
        };
        let regs = boot::setup_boot(mem, &bcfg)?;
        vcpu.apply_boot_regs(&regs)
    }
}

/// Append `v` to `out` as a length-prefixed blob: a little-endian `u32`
/// holding `size_of::<T>()`, followed by the value's in-memory bytes. Used by
/// [`KvmVm::capture_intc`].
fn push_pod<T>(out: &mut Vec<u8>, v: &T) {
    let size = std::mem::size_of::<T>();
    let prefix = (size as u32).to_le_bytes();
    out.extend_from_slice(&prefix);
    let first_byte = (v as *const T).cast::<u8>();
    // SAFETY: `v` is a live reference, so `size_of::<T>()` bytes starting at it
    // are readable; callers pass `repr(C)` POD structs from kvm-bindings.
    let payload = unsafe { std::slice::from_raw_parts(first_byte, size) };
    out.extend_from_slice(payload);
}

/// Read a length-prefixed POD value written by [`push_pod`], validating the
/// stored length matches `size_of::<T>()` (rejects a truncated / mismatched blob).
///
/// `p` is the read cursor into `b`. On success it is advanced past the prefix
/// and the payload; on any error it is left exactly where it was, so a failed
/// read never leaves the cursor in the middle of a field.
///
/// # Errors
/// * `"intc blob truncated (length prefix)"` — fewer than 4 bytes remain at `p`.
/// * `"intc blob field size mismatch"` — the stored length is not
///   `size_of::<T>()`, or the payload runs past the end of `b`.
fn read_pod<T: Copy>(b: &[u8], p: &mut usize) -> Result<T, KvmError> {
    let start = *p;
    // `checked_add` + `get` keep an out-of-range cursor an error instead of an
    // arithmetic overflow or an indexing panic.
    let prefix = start
        .checked_add(4)
        .and_then(|end| b.get(start..end))
        .ok_or_else(|| KvmError("intc blob truncated (length prefix)".to_string()))?;
    let len = u32::from_le_bytes([prefix[0], prefix[1], prefix[2], prefix[3]]) as usize;
    // Cannot overflow: `start + 4 <= b.len()` was established above.
    let body = start + 4;
    let payload = body
        .checked_add(len)
        .filter(|_| len == std::mem::size_of::<T>())
        .and_then(|end| b.get(body..end))
        .ok_or_else(|| KvmError("intc blob field size mismatch".to_string()))?;
    // SAFETY: `payload` is exactly `size_of::<T>()` readable bytes (checked
    // above); `read_unaligned` places no alignment requirement on the source.
    // Callers only instantiate `T` with kvm-bindings POD structs, for which any
    // byte pattern is a valid value.
    let value = unsafe { std::ptr::read_unaligned(payload.as_ptr().cast::<T>()) };
    *p = body + len;
    Ok(value)
}

/// Stream counterpart of [`push_pod`]: emit `v` to `w` as a little-endian
/// `u32` byte count followed by the value's in-memory bytes. Used by
/// `HypervisorVcpu::write_snapshot_state` (byte-identical to the SMSNAP per-vCPU
/// blob encoding).
///
/// # Errors
/// Propagates the first write failure; the prefix may already have been
/// written when the payload write fails.
fn write_pod_to<T>(w: &mut dyn std::io::Write, v: &T) -> std::io::Result<()> {
    let size = std::mem::size_of::<T>();
    let prefix = (size as u32).to_le_bytes();
    w.write_all(&prefix)?;
    let first_byte = (v as *const T).cast::<u8>();
    // SAFETY: `v` is a live reference, so `size_of::<T>()` bytes starting at it
    // are readable; callers pass kvm-bindings POD structs.
    let payload = unsafe { std::slice::from_raw_parts(first_byte, size) };
    w.write_all(payload)
}

/// Stream form of [`read_pod`]: read a length-prefixed POD value from a `Read`,
/// validating the stored length matches `size_of::<T>()`.
///
/// # Errors
/// * `InvalidData` (`"snapshot-state field size mismatch"`) when the stored
///   length differs from `size_of::<T>()`.
/// * Whatever `read_exact` reports (e.g. `UnexpectedEof`) when the stream ends
///   inside the prefix or the payload.
fn read_pod_from<T: Copy>(r: &mut dyn std::io::Read) -> std::io::Result<T> {
    let mut lb = [0u8; 4];
    r.read_exact(&mut lb)?;
    let len = u32::from_le_bytes(lb) as usize;
    if len != std::mem::size_of::<T>() {
        return Err(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            "snapshot-state field size mismatch",
        ));
    }
    // Zero the storage up front: a `&mut [u8]` must only ever cover
    // initialised bytes, and the reader is handed exactly such a slice below.
    let mut v = std::mem::MaybeUninit::<T>::zeroed();
    // SAFETY: len == size_of::<T>() (checked), so the slice covers exactly the
    // storage of `v`, every byte of which was initialised by `zeroed()`.
    let buf = unsafe { std::slice::from_raw_parts_mut(v.as_mut_ptr() as *mut u8, len) };
    r.read_exact(buf)?;
    // SAFETY: all bytes of `v` are initialised (zeroed, then overwritten by the
    // read). Callers only instantiate `T` with kvm-bindings POD structs, for
    // which any byte pattern is a valid value.
    Ok(unsafe { v.assume_init() })
}

impl KvmVm {
    /// In-kernel PIT (8254) — gives the kernel IRQ0 timekeeping so it gets past
    /// timer calibration. Call once after `create()` for the boot path. (Kept
    /// separate from `create` because the seam's `HypervisorVm::create` is the
    /// portable contract; the PIT is an x86-boot specific.)
    pub fn create_pit(&self) -> Result<(), KvmError> {
        self.vm
            .create_pit2(kvm_bindings::kvm_pit_config::default())?;
        Ok(())
    }

    /// Drive a legacy interrupt line on the in-kernel irqchip. The run loop
    /// mirrors a device's `irq_line()` level here after each access to deliver
    /// serial RX/THRE (IRQ4) and virtio used-buffer (IRQ5) interrupts.
    pub fn set_irq_line(&self, irq: u32, level: bool) -> Result<(), KvmError> {
        self.vm.set_irq_line(irq, level)?;
        Ok(())
    }

    /// Register an MMIO doorbell (ioeventfd): a guest write to `addr` signals
    /// `fd` *in the kernel* instead of exiting to userspace. A device thread
    /// waits on `fd` and services the queue — removing the per-notification vCPU
    /// round-trip (the dominant virtio cost). `NoDatamatch` triggers on any
    /// write to the address.
    pub fn register_mmio_ioevent(&self, fd: &EventFd, addr: u64) -> Result<(), KvmError> {
        self.vm
            .register_ioevent(fd, &IoEventAddress::Mmio(addr), NoDatamatch)?;
        Ok(())
    }

    /// Register an irqfd: writing `fd` injects `gsi` directly via the in-kernel
    /// irqchip, with no `set_irq_line` ioctl. The device thread uses this to
    /// raise the virtio used-buffer interrupt off the vCPU path.
    pub fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> Result<(), KvmError> {
        self.vm.register_irqfd(fd, gsi)?;
        Ok(())
    }

    /// Capture the in-kernel device state for a snapshot: the PIT (8254), the
    /// interrupt controllers (PIC master/slave + IOAPIC), and the KVM clock.
    /// The per-vCPU LAPIC is captured with each vCPU (see [`KvmSnapshotState`]).
    pub fn capture_devices(&self) -> Result<KvmDeviceState, KvmError> {
        let pit = self.vm.get_pit2()?;
        let mut irqchips = [
            kvm_irqchip::default(),
            kvm_irqchip::default(),
            kvm_irqchip::default(),
        ];
        for (i, chip) in irqchips.iter_mut().enumerate() {
            chip.chip_id = i as u32; // 0=PIC master, 1=PIC slave, 2=IOAPIC
            self.vm.get_irqchip(chip)?;
        }
        let clock = self.vm.get_clock()?;
        Ok(KvmDeviceState {
            pit,
            irqchips,
            clock,
        })
    }

    /// Restore the in-kernel device state captured by [`capture_devices`].
    pub fn restore_devices(&self, s: &KvmDeviceState) -> Result<(), KvmError> {
        self.vm.set_pit2(&s.pit)?;
        for chip in &s.irqchips {
            self.vm.set_irqchip(chip)?;
        }
        // Reset the clock flags so KVM takes our value as the new base rather
        // than expecting realtime/host-TSC-stable semantics across the restore.
        let mut clock = s.clock;
        clock.flags = 0;
        self.vm.set_clock(&clock)?;
        Ok(())
    }
}

/// In-kernel device state for a VM snapshot (PIT + interrupt controllers +
/// clock). Captured/restored via [`KvmVm::capture_devices`] /
/// [`KvmVm::restore_devices`].
/// `Clone` so the in-place reset path can cache the snapshot's intc/timer
/// baseline and re-apply it each reset.
#[derive(Clone)]
pub struct KvmDeviceState {
    /// In-kernel PIT (8254) state.
    pit: kvm_pit_state2,
    /// Interrupt-controller state, indexed by KVM chip id: 0 = PIC master,
    /// 1 = PIC slave, 2 = IOAPIC.
    irqchips: [kvm_irqchip; 3],
    /// KVM clock reading; `restore_devices` zeroes its `flags` before
    /// applying it.
    clock: kvm_clock_data,
}

/// Per-vCPU cross-thread control block, shared between the vCPU's own thread
/// (which runs it) and any thread holding a [`KvmVcpuHandle`] for it. Lives in
/// the global [`registry`] keyed by vCPU id.
///
/// Every field is an atomic, so the block is shared through a plain `Arc` and
/// needs no lock of its own.
struct VcpuReg {
    /// The OS thread currently running this vCPU (`pthread_t`, 0 = not yet
    /// bound). Stored by the vCPU thread on its first `step()`.
    tid: AtomicU64,
    /// Set by `force_exit`; observed by the run loop via [`KvmVcpu::should_exit`]
    /// so the stop intent is never lost even if the SIGUSR1 lands in the window
    /// between two `KVM_RUN`s (a spinning guest is broken out by the signal;
    /// this flag covers the rest). Cleared again with [`KvmVcpu::clear_exit`].
    exit: AtomicBool,
    /// Address of this vCPU's `kvm_run.immediate_exit` byte (0 = not yet
    /// published). Published at vCPU *creation* (on the main thread, before the
    /// run thread is spawned) — NOT in `bind_thread` — so `force_exit` can gate
    /// guest re-entry even in the window before the run thread has bound itself
    /// (the `tid == 0` window). Writing 1 here makes the vCPU's *next* `KVM_RUN`
    /// return `EINTR` at guest entry without executing an instruction; combined
    /// with the SIGUSR1 (which breaks a vCPU already blocked *inside* KVM_RUN),
    /// a single `force_exit` is race-free in every thread state — no re-kicking.
    immediate_exit_ptr: AtomicUsize,
}

/// Globally-unique vCPU identity for the registry/handle. Distinct from the
/// per-VM KVM vCPU index (the APIC id, 0..N within one VM) — that index is NOT
/// unique across VMs, so it can't key a process-global map (two VMs would both
/// claim id 0 and force-exit each other's thread).
///
/// `create_vcpu` takes the next value with `fetch_add(1)` for every vCPU it
/// creates, across all VMs in the process.
static NEXT_REG_ID: AtomicU64 = AtomicU64::new(0);

/// Process-wide map from global vCPU id to that vCPU's control block, created
/// empty on first use. A handle is `Copy` and carries only the id, so the state
/// it acts on (thread + exit flag) is looked up here instead of being stored in
/// the handle.
fn registry() -> &'static Mutex<HashMap<u64, Arc<VcpuReg>>> {
    type Table = Mutex<HashMap<u64, Arc<VcpuReg>>>;
    static TABLE: OnceLock<Table> = OnceLock::new();
    TABLE.get_or_init(Table::default)
}

/// No-op SIGUSR1 handler. Its only purpose is to exist (so the default
/// terminate action doesn't fire) without `SA_RESTART`, so a SIGUSR1 delivered
/// during `KVM_RUN` makes the ioctl return `EINTR` (→ `VcpuExit::Intr`).
///
/// Installed by `install_force_exit_signal`; the body is intentionally empty.
extern "C" fn sigusr1_noop(_sig: libc::c_int) {}

/// Register [`sigusr1_noop`] as the process's SIGUSR1 handler. Only the first
/// call does any work; later calls return immediately. The result of
/// `sigaction` is not inspected.
fn install_force_exit_signal() {
    static INSTALLED: Once = Once::new();
    INSTALLED.call_once(|| {
        // SAFETY: `action` is a zero-initialised local `sigaction` that is
        // fully set up before being passed by reference; the handler is a
        // valid `extern "C"` function that does nothing.
        unsafe {
            let mut action: libc::sigaction = std::mem::zeroed();
            libc::sigemptyset(&mut action.sa_mask);
            // Deliberately NOT SA_RESTART → KVM_RUN returns EINTR.
            action.sa_flags = 0;
            action.sa_sigaction = sigusr1_noop as usize;
            libc::sigaction(libc::SIGUSR1, &action, std::ptr::null_mut());
        }
    });
}

/// Cross-thread force-exit token: kicks a vCPU out of `KVM_RUN` from another
/// thread. The signal (SIGUSR1, no `SA_RESTART`) makes a *blocked* `KVM_RUN`
/// return `EINTR`; the registry's `exit` flag carries the intent so it survives
/// the gap between two runs. Foundational for quiesce/snapshot, the multi-vCPU
/// coordinator, and an interruptible run loop.
#[derive(Clone, Copy)]
pub struct KvmVcpuHandle {
    /// Process-global registry id of the target vCPU (allocated from
    /// `NEXT_REG_ID`), not the per-VM KVM vCPU index.
    vcpu_id: u64,
}

impl VcpuHandle for KvmVcpuHandle {
    /// Ask every vCPU named by `handles` to leave (or not enter) `KVM_RUN`.
    /// Handles whose vCPU is no longer in the registry are skipped. The
    /// registry lock is held for the whole call.
    ///
    /// Per vCPU, in this order: record the stop intent in `exit`, set the
    /// `immediate_exit` byte so the next `KVM_RUN` bails out at entry, then —
    /// if a run thread has bound itself — send it SIGUSR1.
    fn force_exit(handles: &[Self]) {
        install_force_exit_signal();
        let table = registry().lock().unwrap();
        let targets = handles
            .iter()
            .filter_map(|handle| table.get(&handle.vcpu_id));
        for target in targets {
            target.exit.store(true, Ordering::SeqCst);

            // Gate guest re-entry BEFORE signalling: with the byte set, the
            // vCPU's next KVM_RUN returns EINTR at entry without running an
            // instruction. That handles a vCPU that is not yet bound
            // (tid == 0) or is between two runs.
            let gate = target.immediate_exit_ptr.load(Ordering::SeqCst) as *mut u8;
            if !gate.is_null() {
                // SAFETY: the address was published at vCPU creation and points
                // into the kvm_run page, which stays mapped for the life of the
                // vCPU fd.
                unsafe { std::ptr::write_volatile(gate, 1u8) };
            }

            let thread = target.tid.load(Ordering::SeqCst);
            if thread == 0 {
                continue;
            }
            // A vCPU already blocked INSIDE KVM_RUN (idle HLT with the
            // in-kernel irqchip) does not re-check immediate_exit, so it needs
            // the signal to wake up. If the thread is not in KVM_RUN this is
            // harmless: SIGUSR1 is thread-blocked there and stays pending.
            //
            // SAFETY: signalling a live thread with an installed handler; a
            // stale tid would target a since-exited thread, but a vCPU's
            // registry entry is removed on its Drop before the thread can be
            // reused, so `thread` refers to this vCPU.
            unsafe {
                libc::pthread_kill(thread as libc::pthread_t, libc::SIGUSR1);
            }
        }
    }
}

/// The MSRs to snapshot — the resume-critical ones that are NOT already
/// covered by REGS/SREGS (EFER, APIC_BASE, FS/GS_BASE live in sregs). These
/// are the syscall/sysenter fast-path bases + TSC; all get/set cleanly (unlike
/// the full KVM_GET_MSR_INDEX_LIST, which contains entries that error on set).
///
/// The length of this list is also what `restore_snapshot_locked` expects
/// `set_msrs` to report as written; any other count is treated as an error.
const SNAPSHOT_MSRS: &[u32] = &[
    0x0000_0010, // IA32_TSC
    0x0000_0174, // IA32_SYSENTER_CS
    0x0000_0175, // IA32_SYSENTER_ESP
    0x0000_0176, // IA32_SYSENTER_EIP
    0xc000_0081, // STAR
    0xc000_0082, // LSTAR
    0xc000_0083, // CSTAR
    0xc000_0084, // SYSCALL_MASK (SFMASK)
    0xc000_0102, // KERNEL_GS_BASE
    0xc000_0103, // TSC_AUX
];

/// Captured vCPU state for snapshot/restore. The full per-vCPU CPU state KVM
/// exposes: GP regs, system regs, FP/SSE (fxsave via FPU — kvm-ioctls 0.23 has
/// no SET_XSAVE so AVX state isn't restored), extended control regs, the local
/// APIC, pending events, debug regs, run state, and the resume-critical MSRs.
/// (Guest RAM + device state are captured separately by the run layer.)
///
/// `Clone` so the in-place reset path can hand each vCPU thread an owned copy of
/// its snapshot-baseline state to re-apply on reset (POD register blobs + a
/// `Msrs` FAM wrapper, all cloneable).
///
/// `write_snapshot_state` serializes the POD fields in a fixed order (regs,
/// sregs, fpu, xcrs, events, mp_state, debug_regs, lapic), then the MSR
/// entries.
#[derive(Clone)]
pub struct KvmSnapshotState {
    /// General-purpose registers.
    regs: kvm_regs,
    /// System registers (segments, control registers, EFER, …).
    sregs: kvm_sregs,
    /// FP/SSE state.
    fpu: kvm_fpu,
    /// Extended control registers.
    xcrs: kvm_xcrs,
    /// Pending vCPU events.
    events: kvm_vcpu_events,
    /// Run state of the vCPU.
    mp_state: kvm_mp_state,
    /// Debug registers.
    debug_regs: kvm_debugregs,
    /// Local APIC state.
    lapic: kvm_lapic_state,
    /// The resume-critical MSR values (see `SNAPSHOT_MSRS`).
    msrs: Msrs,
}

/// A single vCPU. `RefCell` because `VcpuFd::run` needs `&mut` (it returns an
/// exit borrowing the shared `kvm_run` page) while the seam's `step` is `&self`;
/// vCPUs are thread-bound, so this is never contended.
///
/// Dropping a `KvmVcpu` removes its entry from the global registry.
pub struct KvmVcpu {
    /// The KVM vCPU fd; borrowed mutably for the duration of each `step()`.
    vcpu: RefCell<VcpuFd>,
    /// Process-global registry key for this vCPU (also carried by its
    /// [`KvmVcpuHandle`]); not the per-VM KVM vCPU index.
    id: u64,
    /// Cross-thread control block; the same `Arc` is stored in the registry
    /// under `id`.
    reg: Arc<VcpuReg>,
    /// Set once `bind_thread` has run (idempotent guard).
    bound: AtomicBool,
}

impl Drop for KvmVcpu {
    /// Unregister this vCPU from the global registry so a later `force_exit`
    /// on a stale handle finds nothing, and a recycled tid can never be
    /// signalled as though it still ran this vCPU.
    fn drop(&mut self) {
        let mut table = registry().lock().unwrap();
        table.remove(&self.id);
    }
}

impl HypervisorVcpu for KvmVcpu {
    type Error = KvmError;
    type Handle = KvmVcpuHandle;
    type SnapshotState = KvmSnapshotState;

    fn exit_token(&self) -> KvmVcpuHandle {
        KvmVcpuHandle { vcpu_id: self.id }
    }

    fn capture_snapshot(&self) -> Result<KvmSnapshotState, KvmError> {
        self.capture_snapshot_locked(&self.vcpu.borrow())
    }

    fn restore_snapshot(&self, s: &KvmSnapshotState) -> Result<(), KvmError> {
        self.restore_snapshot_locked(&self.vcpu.borrow(), s)
    }

    fn capture_clock_ref(_state: &Self::SnapshotState, _host_now: u64) -> u64 {
        // On KVM the guest clock travels in the VM-global device blob
        // (kvm_clock_data, captured via capture_intc), and KVM re-anchors the
        // guest TSC on KVM_SET_CLOCK at restore — so there is no per-vCPU clock
        // reference to persist. Sentinel 0; restore_clock is a matching no-op.
        0
    }

    fn restore_clock(&self, _captured_ref: u64, _host_now: u64) -> Result<u64, KvmError> {
        // No-op on KVM: the guest clock is re-anchored by restoring the device
        // blob (restore_intc -> KVM_SET_CLOCK), not per vCPU. See capture_clock_ref.
        Ok(0)
    }

    fn write_snapshot_state(
        s: &KvmSnapshotState,
        w: &mut dyn std::io::Write,
    ) -> std::io::Result<()> {
        // Byte layout matches the inline per-vCPU encoding the SMSNAP container
        // already uses (length-prefixed POD blobs in field order, then the MSR
        // index/data pairs), so routing the snapshot pipeline through this seam
        // method leaves the on-disk format unchanged.
        write_pod_to(w, &s.regs)?;
        write_pod_to(w, &s.sregs)?;
        write_pod_to(w, &s.fpu)?;
        write_pod_to(w, &s.xcrs)?;
        write_pod_to(w, &s.events)?;
        write_pod_to(w, &s.mp_state)?;
        write_pod_to(w, &s.debug_regs)?;
        write_pod_to(w, &s.lapic)?;
        let entries = s.msrs.as_slice();
        w.write_all(&(entries.len() as u32).to_le_bytes())?;
        for e in entries {
            w.write_all(&e.index.to_le_bytes())?;
            w.write_all(&e.data.to_le_bytes())?;
        }
        Ok(())
    }

    fn read_snapshot_state(r: &mut dyn std::io::Read) -> std::io::Result<KvmSnapshotState> {
        let regs = read_pod_from(r)?;
        let sregs = read_pod_from(r)?;
        let fpu = read_pod_from(r)?;
        let xcrs = read_pod_from(r)?;
        let events = read_pod_from(r)?;
        let mp_state = read_pod_from(r)?;
        let debug_regs = read_pod_from(r)?;
        let lapic = read_pod_from(r)?;
        let mut lb = [0u8; 4];
        r.read_exact(&mut lb)?;
        let nmsr = u32::from_le_bytes(lb);
        // Cap the pre-allocation: a corrupt/huge count must not OOM here. A bogus
        // count then fails fast on the first truncated `read_exact` below.
        let mut entries = Vec::with_capacity((nmsr as usize).min(4096));
        for _ in 0..nmsr {
            let mut ib = [0u8; 4];
            r.read_exact(&mut ib)?;
            let mut db = [0u8; 8];
            r.read_exact(&mut db)?;
            entries.push(kvm_msr_entry {
                index: u32::from_le_bytes(ib),
                data: u64::from_le_bytes(db),
                ..Default::default()
            });
        }
        let msrs = Msrs::from_entries(&entries).map_err(|e| {
            std::io::Error::new(std::io::ErrorKind::InvalidData, format!("msrs: {e:?}"))
        })?;
        Ok(KvmSnapshotState {
            regs,
            sregs,
            fpu,
            xcrs,
            events,
            mp_state,
            debug_regs,
            lapic,
            msrs,
        })
    }

    // aarch64 register model — not used by x86 orchestration (it uses the
    // inherent x86 register methods below).
    fn get_core(&self, _reg: CoreReg) -> Result<u64, KvmError> {
        Err(KvmError::unsupported("aarch64 CoreReg on x86"))
    }
    fn set_core(&self, _reg: CoreReg, _value: u64) -> Result<(), KvmError> {
        Err(KvmError::unsupported("aarch64 CoreReg on x86"))
    }
    fn get_sys(&self, _reg: SysReg) -> Result<u64, KvmError> {
        Err(KvmError::unsupported("aarch64 SysReg on x86"))
    }
    fn set_sys(&self, _reg: SysReg, _value: u64) -> Result<(), KvmError> {
        Err(KvmError::unsupported("aarch64 SysReg on x86"))
    }

    fn step(&self) -> Result<VcpuExit, KvmError> {
        // Bind this vCPU to the running OS thread (idempotent) so a handle can
        // force-exit it race-free. Must run on the thread that will call run().
        self.bind_thread()?;
        let mut vcpu = self.vcpu.borrow_mut();
        // A force-exit SIGUSR1 makes KVM_RUN fail with EINTR (it does not set a
        // successful KVM_EXIT_INTR). Surface both as Canceled.
        let exit = match vcpu.run() {
            Ok(e) => e,
            Err(e) if e.errno() == libc::EINTR => return Ok(VcpuExit::Canceled),
            Err(e) => return Err(e.into()),
        };
        Ok(match exit {
            KvmExit::IoOut(port, data) => VcpuExit::Io {
                port,
                write: true,
                size: data.len() as u8,
                data: pack_u32(data),
            },
            KvmExit::IoIn(port, data) => VcpuExit::Io {
                port,
                write: false,
                size: data.len() as u8,
                data: 0,
            },
            KvmExit::MmioWrite(addr, data) => VcpuExit::Mmio {
                phys_addr: addr,
                write: true,
                len: data.len() as u8,
                data: pack_u64(data),
            },
            KvmExit::MmioRead(addr, data) => VcpuExit::Mmio {
                phys_addr: addr,
                write: false,
                len: data.len() as u8,
                data: 0,
            },
            KvmExit::Hlt | KvmExit::Shutdown => VcpuExit::Halt,
            KvmExit::Intr => VcpuExit::Canceled,
            // Unmodeled exits (debug, internal-error, …). The raw discriminant
            // isn't exposed by kvm-ioctls' enum; 0 is the "unknown" sentinel.
            _ => VcpuExit::Unknown(0),
        })
    }
}

impl KvmVcpu {
    /// Reports whether a [`KvmVcpuHandle::force_exit`] has requested that this
    /// vCPU stop. The run loop consults this after an `Intr` exit (and may do
    /// so between iterations), so a stop request is honored even when the
    /// signal raced the run.
    pub fn should_exit(&self) -> bool {
        let stop_flag = &self.reg.exit;
        stop_flag.load(Ordering::SeqCst)
    }

    /// Drops any pending force-exit request, e.g. before re-running a vCPU
    /// that was stopped for a snapshot.
    pub fn clear_exit(&self) {
        let stop_flag = &self.reg.exit;
        stop_flag.store(false, Ordering::SeqCst);
    }

    /// Re-apply a captured [`KvmSnapshotState`] using an already-held vcpu fd
    /// borrow — the restore analog of [`Self::capture_snapshot_locked`]. The
    /// in-place reset path (`run_vcpu`) holds `self.vcpu.borrow_mut()` across
    /// the whole run loop, so it cannot call [`restore_snapshot`] (which takes
    /// its own borrow → RefCell double-borrow panic); it re-applies the vCPU's
    /// baseline registers through this on its own owning thread.
    ///
    /// # Errors
    /// Returns the first failing KVM setter's error, or a `set_msrs wrote …`
    /// error when KVM reports a number of written MSRs different from
    /// `SNAPSHOT_MSRS.len()`. State applied before the failure is not rolled
    /// back.
    pub(crate) fn restore_snapshot_locked(
        &self,
        v: &VcpuFd,
        s: &KvmSnapshotState,
    ) -> Result<(), KvmError> {
        // SREGS before REGS (paging/segments must be in place); the rest are
        // order-independent.
        v.set_sregs(&s.sregs)?;
        v.set_regs(&s.regs)?;
        v.set_fpu(&s.fpu)?;
        v.set_xcrs(&s.xcrs)?;
        v.set_lapic(&s.lapic)?;
        // set_msrs returns how many entries were written; a short count means
        // an MSR was not applied, so it is treated as a failure.
        let set = v.set_msrs(&s.msrs)?;
        if set != SNAPSHOT_MSRS.len() {
            return Err(KvmError(format!(
                "kvm: set_msrs wrote {set}/{} entries",
                SNAPSHOT_MSRS.len()
            )));
        }
        v.set_vcpu_events(&s.events)?;
        v.set_debug_regs(&s.debug_regs)?;
        v.set_mp_state(s.mp_state)?;
        Ok(())
    }

    /// As the `HypervisorVcpu::capture_snapshot` trait method, but using an
    /// already-borrowed `VcpuFd` — for callers that hold the vCPU's `RefCell`
    /// borrow (the live-snapshot pause inside the run loop, where a second
    /// `self.vcpu.borrow()` would panic against the active `borrow_mut`).
    ///
    /// Reads every MSR listed in `SNAPSHOT_MSRS` first, then the general,
    /// special, FPU, XCR, event, MP-state, debug and LAPIC state, in that
    /// order.
    ///
    /// # Errors
    /// Returns the first failing KVM getter's error, a `msrs fam` error if
    /// the MSR request list cannot be built, or a `get_msrs read …` error
    /// when KVM returns fewer entries than were requested.
    pub(crate) fn capture_snapshot_locked(&self, v: &VcpuFd) -> Result<KvmSnapshotState, KvmError> {
        // Request list: one zero-valued entry per MSR index to be read.
        let entries: Vec<kvm_msr_entry> = SNAPSHOT_MSRS
            .iter()
            .map(|&index| kvm_msr_entry {
                index,
                data: 0,
                ..Default::default()
            })
            .collect();
        let mut msrs =
            Msrs::from_entries(&entries).map_err(|e| KvmError(format!("kvm: msrs fam: {e:?}")))?;
        // get_msrs returns how many entries were filled in; anything short of
        // the full list is treated as a failure.
        let got = v.get_msrs(&mut msrs)?;
        if got != SNAPSHOT_MSRS.len() {
            return Err(KvmError(format!(
                "kvm: get_msrs read {got}/{} entries",
                SNAPSHOT_MSRS.len()
            )));
        }
        Ok(KvmSnapshotState {
            regs: v.get_regs()?,
            sregs: v.get_sregs()?,
            fpu: v.get_fpu()?,
            xcrs: v.get_xcrs()?,
            events: v.get_vcpu_events()?,
            mp_state: v.get_mp_state()?,
            debug_regs: v.get_debug_regs()?,
            lapic: v.get_lapic()?,
            msrs,
        })
    }

    /// Bind this vCPU to the *calling* OS thread for race-free force-exit.
    /// Idempotent; must be called on the thread that will drive `run()`/`step()`
    /// (step() calls it automatically; the KVM-native run loop calls it once up
    /// front). Two things happen:
    ///   1. SIGUSR1 is blocked at the thread level, so a force-exit signal sent
    ///      while we are *not* in KVM_RUN stays pending (instead of running the
    ///      no-op handler and being lost).
    ///   2. KVM is told to unblock all signals during guest execution
    ///      (KVM_SET_SIGNAL_MASK, empty set), so that pending SIGUSR1 fires the
    ///      instant we enter the guest → KVM_RUN returns EINTR. Together these
    ///      close the race where the signal lands between two runs.
    pub fn bind_thread(&self) -> Result<(), KvmError> {
        if self.bound.swap(true, Ordering::SeqCst) {
            return Ok(());
        }
        install_force_exit_signal();
        // 1. Block SIGUSR1 on this thread.
        // SAFETY: standard pthread signal-mask manipulation on the current thread.
        unsafe {
            let mut set: libc::sigset_t = std::mem::zeroed();
            libc::sigemptyset(&mut set);
            libc::sigaddset(&mut set, libc::SIGUSR1);
            libc::pthread_sigmask(libc::SIG_BLOCK, &set, std::ptr::null_mut());
        }
        // 2. KVM_SET_SIGNAL_MASK with an empty sigset → no signals blocked
        //    during guest execution, so the pending SIGUSR1 is delivered on
        //    guest entry.
        self.set_kvm_signal_mask_empty()?;
        // 3. Record the thread so a handle can target it.
        let tid = unsafe { libc::pthread_self() } as u64;
        self.reg.tid.store(tid, Ordering::SeqCst);
        Ok(())
    }

    /// Issue `KVM_SET_SIGNAL_MASK` with an empty signal set on this vCPU fd.
    /// kvm-ioctls 0.23 has no wrapper, so this is the raw ioctl. The argument
    /// is `struct kvm_signal_mask { __u32 len; __u8 sigset[len]; }`; an 8-byte
    /// (64-signal) all-zero sigset means "block nothing during guest mode".
    fn set_kvm_signal_mask_empty(&self) -> Result<(), KvmError> {
        use std::os::unix::io::AsRawFd;
        // _IOW(KVMIO=0xAE, 0x8b, struct kvm_signal_mask) with size field = 4.
        const KVM_SET_SIGNAL_MASK: libc::c_ulong = 0x4004_ae8b;
        let mut buf = [0u8; 4 + 8];
        buf[0..4].copy_from_slice(&8u32.to_le_bytes()); // len = 8 sigset bytes
        let fd = self.vcpu.borrow().as_raw_fd();
        // SAFETY: `fd` is this vCPU's KVM fd; `buf` is a valid kvm_signal_mask.
        let r = unsafe { libc::ioctl(fd, KVM_SET_SIGNAL_MASK, buf.as_ptr()) };
        if r != 0 {
            return Err(KvmError::from(std::io::Error::last_os_error()));
        }
        Ok(())
    }

    /// Put the vCPU into 64-bit long mode: CS becomes a 64-bit code segment,
    /// DS/ES/FS/GS/SS share one data segment (all with base 0 and limit
    /// `0xffff_ffff`), paging is switched on with `cr3` as the page-table
    /// root, and execution starts at `entry`. The caller must already have
    /// written the page tables into guest RAM.
    ///
    /// # Errors
    /// Propagates any failure reading or writing the vCPU's SREGS/REGS.
    pub fn enter_long_mode(&self, entry: u64, cr3: u64) -> Result<(), KvmError> {
        const CR0_PG_PE: u64 = 0x8000_0001; // PG | PE
        const CR4_PAE: u64 = 0x0000_0020; // PAE
        const EFER_LME_LMA: u64 = 0x0000_0500; // LME | LMA
        const RFLAGS_RESERVED_ONE: u64 = 0x2; // reserved-1 bit

        // Shared segment template; `long` selects a 64-bit segment
        // (l = 1, db = 0) over a 32-bit-default one (l = 0, db = 1).
        let flat = |selector: u16, type_: u8, long: bool| kvm_segment {
            base: 0,
            limit: 0xffff_ffff,
            selector,
            type_,
            present: 1,
            dpl: 0,
            db: u8::from(!long),
            s: 1,
            l: u8::from(long),
            g: 1,
            ..Default::default()
        };

        let fd = self.vcpu.borrow();

        let mut special = fd.get_sregs()?;
        special.cs = flat(0x08, 0b1011, true); // code: execute/read/accessed
        let data_seg = flat(0x10, 0b0011, false); // data: read/write/accessed
        for slot in [
            &mut special.ds,
            &mut special.es,
            &mut special.fs,
            &mut special.gs,
            &mut special.ss,
        ] {
            *slot = data_seg;
        }
        special.cr0 = CR0_PG_PE;
        special.cr3 = cr3;
        special.cr4 = CR4_PAE;
        special.efer = EFER_LME_LMA;
        fd.set_sregs(&special)?;

        let mut general = fd.get_regs()?;
        general.rip = entry;
        general.rflags = RFLAGS_RESERVED_ONE;
        fd.set_regs(&general)?;
        Ok(())
    }

    /// Park a secondary CPU (AP) in the wait-for-SIPI state by setting its MP
    /// state to `UNINITIALIZED`. Only the BSP (vCPU 0) boots from the kernel
    /// entry; APs stay parked until the kernel sends INIT-SIPI-SIPI through
    /// the in-kernel LAPIC, which KVM services.
    ///
    /// # Errors
    /// Propagates a failing `set_mp_state`.
    pub fn park_for_sipi(&self) -> Result<(), KvmError> {
        let fd = self.vcpu.borrow();
        fd.set_mp_state(kvm_mp_state {
            mp_state: KVM_MP_STATE_UNINITIALIZED,
        })?;
        Ok(())
    }

    /// Load the long-mode register state produced by the x86 boot protocol
    /// ([`crate::arch::x86_64::boot::setup_boot`]) into this vCPU: CS from
    /// `b.cs`, every data segment register from `b.ds`, the GDT pointer,
    /// CR0/CR3/CR4/EFER, and RIP/RSI/RFLAGS (RSI = `boot_params`). The caller
    /// must have run `setup_boot` over this VM's guest RAM first — it writes
    /// the page tables, GDT, and zero page that these registers reference.
    ///
    /// # Errors
    /// Propagates any failure reading or writing the vCPU's SREGS/REGS.
    pub fn apply_boot_regs(&self, b: &crate::arch::x86_64::boot::BootRegs) -> Result<(), KvmError> {
        let fd = self.vcpu.borrow();

        let mut special = fd.get_sregs()?;
        special.cs = seg_to_kvm(&b.cs);
        // One translated data descriptor is shared by all data segment registers.
        let flat_data = seg_to_kvm(&b.ds);
        for slot in [
            &mut special.ds,
            &mut special.es,
            &mut special.ss,
            &mut special.fs,
            &mut special.gs,
        ] {
            *slot = flat_data;
        }
        special.gdt.base = b.gdt_base;
        special.gdt.limit = b.gdt_limit;
        special.cr0 = b.cr0;
        special.cr3 = b.cr3;
        special.cr4 = b.cr4;
        special.efer = b.efer;
        fd.set_sregs(&special)?;

        let mut general = fd.get_regs()?;
        general.rip = b.rip;
        general.rsi = b.rsi;
        general.rflags = b.rflags;
        fd.set_regs(&general)?;
        Ok(())
    }
}

/// Convert a boot-protocol [`Segment`](crate::arch::x86_64::boot::Segment)
/// into the equivalent `kvm_segment`. Fields the boot protocol does not carry
/// keep their `Default` value.
fn seg_to_kvm(s: &crate::arch::x86_64::boot::Segment) -> kvm_segment {
    // Descriptor attribute bits first …
    let attributes = kvm_segment {
        type_: s.type_,
        present: s.present,
        dpl: s.dpl,
        s: s.s,
        l: s.l,
        db: s.db,
        g: s.g,
        ..Default::default()
    };
    // … then the addressing fields layered on top.
    kvm_segment {
        base: s.base,
        limit: s.limit,
        selector: s.selector,
        ..attributes
    }
}

/// Pack up to the first four bytes of `data` into a `u32`, little-endian
/// (byte 0 is the least-significant byte). Missing high bytes are zero and any
/// bytes past the fourth are ignored.
fn pack_u32(data: &[u8]) -> u32 {
    data.iter()
        .take(4)
        .enumerate()
        .fold(0u32, |acc, (idx, &byte)| acc | (u32::from(byte) << (8 * idx)))
}

/// Pack up to the first eight bytes of `data` into a `u64`, little-endian
/// (byte 0 is the least-significant byte). Missing high bytes are zero and any
/// bytes past the eighth are ignored.
fn pack_u64(data: &[u8]) -> u64 {
    data.iter()
        .take(8)
        .enumerate()
        .fold(0u64, |acc, (idx, &byte)| acc | (u64::from(byte) << (8 * idx)))
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::hypervisor::prot;

    /// Size of the guest RAM window the KVM-backed tests map at GPA 0 (2 MiB).
    const MEM: usize = 0x20_0000;
    /// GPA of the PML4 written by [`GuestRam::with_stub`]; passed as `cr3`.
    const PML4_GPA: u64 = 0x1000;
    /// `jmp $` (EB FE) — spins forever and never VM-exits on its own.
    const SPIN_STUB: [u8; 2] = [0xeb, 0xfe];

    /// Anonymous host mapping used as guest RAM by the tests. Unmapped on
    /// drop, so the mapping is released even when an assertion panics.
    struct GuestRam {
        host: *mut u8,
    }

    impl GuestRam {
        /// Map `MEM` bytes, write the identity page tables
        /// (PML4[0]->PDPT, PDPT[0]->PD, PD[0]->2MiB page at 0) and copy `stub`
        /// to GPA 0. `stub` may be empty.
        fn with_stub(stub: &[u8]) -> Self {
            assert!(
                stub.len() <= PML4_GPA as usize,
                "stub would overwrite the page tables"
            );
            // SAFETY: anonymous private mapping with no address hint; the
            // result is checked against MAP_FAILED before it is used.
            let raw = unsafe {
                libc::mmap(
                    std::ptr::null_mut(),
                    MEM,
                    libc::PROT_READ | libc::PROT_WRITE,
                    libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_NORESERVE,
                    -1,
                    0,
                )
            };
            assert!(raw != libc::MAP_FAILED, "mmap");
            let ram = GuestRam {
                host: raw as *mut u8,
            };
            // present|rw on the two upper levels, present|rw|ps on the leaf.
            ram.put_u64(PML4_GPA, 0x2000 | 0x3);
            ram.put_u64(0x2000, 0x3000 | 0x3);
            ram.put_u64(0x3000, 0x83);
            // SAFETY: `stub` fits below the page tables (asserted above), so
            // the destination lies inside the mapping; the ranges are disjoint.
            unsafe { std::ptr::copy_nonoverlapping(stub.as_ptr(), ram.host, stub.len()) };
            ram
        }

        /// Store `v` little-endian at guest-physical address `gpa`.
        fn put_u64(&self, gpa: u64, v: u64) {
            assert!(
                gpa % 8 == 0 && gpa as usize + 8 <= MEM,
                "gpa out of range or unaligned"
            );
            // SAFETY: in bounds and 8-byte aligned (asserted above; the
            // mapping itself is page-aligned).
            unsafe { std::ptr::write(self.host.add(gpa as usize) as *mut u64, v.to_le()) };
        }

        /// Host address of guest-physical address 0.
        fn as_ptr(&self) -> *mut u8 {
            self.host
        }

        /// Register this mapping as the VM's RAM at GPA 0.
        fn map_into(&self, vm: &KvmVm) {
            // SAFETY: the mapping is `MEM` bytes long; every test declares its
            // `GuestRam` after the VM, so it is dropped before the VM is.
            unsafe { vm.map_ram(self.host, 0, MEM, prot::RWX).expect("map_ram") };
        }
    }

    impl Drop for GuestRam {
        fn drop(&mut self) {
            // SAFETY: `host`/`MEM` are exactly what mmap returned / was asked for.
            unsafe { libc::munmap(self.host as *mut libc::c_void, MEM) };
        }
    }

    /// Create a VM whose RAM holds the identity page tables plus `stub` at
    /// GPA 0. Bind the result as `let (vm, ram) = …` so the RAM is dropped
    /// before the VM.
    fn vm_with_ram(stub: &[u8]) -> (KvmVm, GuestRam) {
        let vm = KvmVm::create().expect("create VM");
        let ram = GuestRam::with_stub(stub);
        ram.map_into(&vm);
        (vm, ram)
    }

    /// Create a vCPU and bring it up in long mode at GPA 0 on the fixture's
    /// page tables.
    fn long_mode_vcpu(vm: &KvmVm) -> KvmVcpu {
        let vcpu = vm.create_vcpu().expect("create_vcpu");
        vcpu.enter_long_mode(0x0, PML4_GPA).expect("enter_long_mode");
        vcpu
    }

    /// Step `vcpu` until it is canceled (`"canceled"`) or `step` fails
    /// (`"err"`); every other exit is ignored.
    fn spin_until_stopped(vcpu: &KvmVcpu) -> &'static str {
        loop {
            match vcpu.step() {
                Ok(VcpuExit::Canceled) => return "canceled",
                Ok(_) => continue,
                Err(_) => return "err",
            }
        }
    }

    /// The length-prefixed POD (de)serialization behind `capture_intc` /
    /// `restore_intc` must round-trip and reject corrupt/truncated blobs (pure
    /// logic — no `/dev/kvm` needed).
    #[test]
    fn pod_blob_round_trips_and_rejects_corruption() {
        let mut buf = Vec::new();
        push_pod(&mut buf, &0x1122_3344_5566_7788u64);
        push_pod(&mut buf, &[0xAAu8; 8]);

        // Round-trip in order.
        let mut p = 0;
        let a: u64 = read_pod(&buf, &mut p).unwrap();
        let b: [u8; 8] = read_pod(&buf, &mut p).unwrap();
        assert_eq!(a, 0x1122_3344_5566_7788);
        assert_eq!(b, [0xAA; 8]);
        assert_eq!(p, buf.len(), "consumed the whole blob");

        // Truncated length prefix.
        let mut p = 0;
        assert!(read_pod::<u64>(&buf[..2], &mut p).is_err());

        // Stored len (8, the u64's) != size_of::<u32>() → size mismatch.
        let mut p = 0;
        assert!(read_pod::<u32>(&buf, &mut p).is_err());

        // Length prefix claims 8 payload bytes but only 4 follow.
        let mut bad = Vec::new();
        bad.extend_from_slice(&8u32.to_le_bytes());
        bad.extend_from_slice(&[1, 2, 3, 4]);
        let mut p = 0;
        assert!(read_pod::<u64>(&bad, &mut p).is_err());
    }

    /// Increment 1, through the seam: create a KVM VM + irqchip, map RAM, bring
    /// a vCPU up in long mode, run a stub that `out`s 'K' to 0x3f8, and catch it
    /// as `VcpuExit::Io`. The standalone `spikes/kvm-floor` proved the raw
    /// mechanics; this proves the `HypervisorVm`/`HypervisorVcpu` binding.
    #[test]
    fn floor_long_mode_io_exit_through_seam() {
        // mov dx,0x3f8 ; mov al,'K' ; out dx,al ; jmp $
        let stub = [0x66u8, 0xba, 0xf8, 0x03, 0xb0, b'K', 0xee, 0xeb, 0xfe];
        let (vm, _ram) = vm_with_ram(&stub);
        let vcpu = long_mode_vcpu(&vm);

        match vcpu.step().expect("step") {
            VcpuExit::Io {
                port, write, data, ..
            } => {
                assert_eq!(port, 0x3f8, "serial port");
                assert!(write, "OUT direction");
                assert_eq!(data & 0xff, u32::from(b'K'), "serial byte");
            }
            other => panic!("unexpected exit: {other:?}"),
        }
    }

    /// The boot-protocol glue: build `BootRegs` from the x86 boot setup and
    /// apply them to a real KVM vCPU, then read the SREGS/REGS back and confirm
    /// the core is configured for long-mode kernel entry (paging on, EFER.LMA,
    /// flat __BOOT_CS, GDT pointer, RIP=load+0x200, RSI=boot_params).
    #[test]
    fn apply_boot_regs_configures_long_mode_entry() {
        use crate::arch::x86_64::boot::{setup_boot, BootConfig};

        let vm = KvmVm::create().expect("create VM");
        let vcpu = vm.create_vcpu().expect("create_vcpu");

        // Minimal fake bzImage: valid setup header span + a little pm-kernel.
        let setup_sects = 4u8;
        let pm_off = (setup_sects as usize + 1) * 512;
        let mut bz = vec![0u8; pm_off + 512];
        bz[0x1f1] = setup_sects;

        let mem_size = 2 * 1024 * 1024;
        let mut mem = vec![0u8; mem_size];
        let cfg = BootConfig {
            mem_size,
            cmdline: "console=ttyS0",
            bzimage: &bz,
            initrd: None,
        };
        let regs = setup_boot(&mut mem, &cfg).expect("setup_boot");
        vcpu.apply_boot_regs(&regs).expect("apply_boot_regs");

        let sregs = vcpu.vcpu.borrow().get_sregs().expect("get_sregs");
        assert_eq!(sregs.cr0, 0x8000_0001, "PG|PE");
        assert_eq!(sregs.cr4, 0x20, "PAE");
        assert_eq!(sregs.efer & 0x500, 0x500, "LME|LMA active");
        assert_eq!(sregs.cr3, 0x1000, "PML4 root");
        assert_eq!(sregs.cs.selector, 0x10, "__BOOT_CS");
        assert_eq!(sregs.cs.l, 1, "64-bit code segment");
        assert_eq!(sregs.ds.selector, 0x18, "__BOOT_DS");
        assert_eq!(sregs.gdt.base, 0x4000, "GDT base");
        assert_eq!(sregs.gdt.limit, 31, "GDT limit");

        let r = vcpu.vcpu.borrow().get_regs().expect("get_regs");
        assert_eq!(r.rip, 0x10_0200, "64-bit entry = load+0x200");
        assert_eq!(r.rsi, 0x1_0000, "RSI = boot_params");
    }

    /// Increment 5: cross-thread force-exit. A vCPU runs a tight infinite loop
    /// (`jmp $`) on its own thread — it never VM-exits on its own. From the main
    /// thread, `force_exit` must break it out of `KVM_RUN` (SIGUSR1 → EINTR →
    /// Canceled). The channel + timeout proves it stopped rather than hanging.
    #[test]
    fn force_exit_breaks_a_spinning_vcpu() {
        use std::sync::mpsc;
        use std::time::Duration;

        let (vm, _ram) = vm_with_ram(&SPIN_STUB);
        let vcpu = long_mode_vcpu(&vm);
        let handle = vcpu.exit_token();

        let (ready_tx, ready_rx) = mpsc::channel::<()>();
        let (tx, rx) = mpsc::channel::<&'static str>();
        let runner = std::thread::spawn(move || {
            // Bind on this thread (sets the SIGUSR1 mask + tid) BEFORE signalling
            // ready, so force_exit can never race the binding.
            vcpu.bind_thread().expect("bind_thread");
            let _ = ready_tx.send(());
            let _ = tx.send(spin_until_stopped(&vcpu));
            // vcpu drops here, removing its registry entry.
        });

        // Once bound, force_exit is race-free: a SIGUSR1 sent before the vCPU
        // enters KVM_RUN stays pending (blocked) and fires on guest entry.
        ready_rx
            .recv_timeout(Duration::from_secs(2))
            .expect("vcpu ready");
        KvmVcpuHandle::force_exit(&[handle]);

        let got = rx.recv_timeout(Duration::from_secs(5));
        assert_eq!(
            got.ok(),
            Some("canceled"),
            "force_exit did not break the spinning vCPU out of KVM_RUN"
        );
        runner.join().expect("join runner");
    }

    /// Regression: a `force_exit` issued in the window BEFORE the vCPU thread has
    /// bound itself — `tid == 0`, so NO SIGUSR1 can be delivered — must still
    /// stop the vCPU. `immediate_exit` is published at vCPU *creation* (not in
    /// `bind_thread`), so it gates the very first `KVM_RUN` entry → EINTR →
    /// Canceled, with no signal involved. This is exactly the start()/teardown
    /// race that used to need a 20–40 ms re-kick poll (the vCPU would otherwise
    /// spin forever on the never-exiting guest below).
    #[test]
    fn force_exit_before_bind_still_stops_vcpu() {
        use std::sync::mpsc;
        use std::time::Duration;

        let (vm, _ram) = vm_with_ram(&SPIN_STUB);
        let vcpu = long_mode_vcpu(&vm);
        let handle = vcpu.exit_token();

        // Force-exit NOW — before any thread runs `bind_thread`, so the registry
        // tid is still 0 and `force_exit` sends no signal. Only the immediate_exit
        // gate (published at creation) can stop the vCPU here.
        KvmVcpuHandle::force_exit(&[handle]);

        let (tx, rx) = mpsc::channel::<&'static str>();
        let runner = std::thread::spawn(move || {
            // step() binds (stores tid) then enters KVM_RUN. The spinning guest
            // would run forever unless immediate_exit breaks the first entry.
            let r = match vcpu.step() {
                Ok(VcpuExit::Canceled) => "canceled",
                Ok(_) => "other",
                Err(_) => "err",
            };
            let _ = tx.send(r);
        });

        let got = rx.recv_timeout(Duration::from_secs(5));
        assert_eq!(
            got.ok(),
            Some("canceled"),
            "immediate_exit did not gate the first KVM_RUN entry for a pre-bind force_exit"
        );
        runner.join().expect("join runner");
    }

    /// A single `force_exit` over a slice of handles must stop EVERY vCPU — the
    /// multi-vCPU teardown path (`stop`/`Drop` force-exit all handles at once).
    /// Both spin forever on their own thread until the one kick cancels them.
    #[test]
    fn force_exit_stops_all_vcpus() {
        use std::sync::mpsc;
        use std::time::Duration;

        let (vm, _ram) = vm_with_ram(&SPIN_STUB);

        let (ready_tx, ready_rx) = mpsc::channel::<()>();
        let (done_tx, done_rx) = mpsc::channel::<&'static str>();
        let mut handles = Vec::new();
        let mut runners = Vec::new();
        for _ in 0..2 {
            let vcpu = long_mode_vcpu(&vm);
            handles.push(vcpu.exit_token());
            let ready = ready_tx.clone();
            let done = done_tx.clone();
            runners.push(std::thread::spawn(move || {
                // Bind before signalling ready so the kick can never race binding.
                vcpu.bind_thread().expect("bind_thread");
                let _ = ready.send(());
                let _ = done.send(spin_until_stopped(&vcpu));
            }));
        }

        // Both bound → one force_exit over both handles stops both.
        ready_rx
            .recv_timeout(Duration::from_secs(2))
            .expect("vcpu0");
        ready_rx
            .recv_timeout(Duration::from_secs(2))
            .expect("vcpu1");
        KvmVcpuHandle::force_exit(&handles);

        for _ in 0..2 {
            assert_eq!(
                done_rx.recv_timeout(Duration::from_secs(5)).ok(),
                Some("canceled"),
                "a vCPU was not stopped by the shared force_exit"
            );
        }
        for r in runners {
            r.join().expect("join runner");
        }
    }

    /// Snapshot increment 7a: capture the full per-vCPU state, clobber the
    /// registers, restore, and confirm the captured values come back. Exercises
    /// every get/set on real /dev/kvm (regs/sregs/fpu/xcrs/lapic/msrs/events/
    /// debugregs/mp_state) — the main risk is any one erroring on the kernel.
    #[test]
    fn snapshot_round_trips_vcpu_state() {
        // No guest code is needed: the vCPU is never run in this test.
        let (vm, _ram) = vm_with_ram(&[]);
        let vcpu = long_mode_vcpu(&vm);

        // Set known register values to snapshot.
        {
            let v = vcpu.vcpu.borrow();
            let mut r = v.get_regs().expect("get_regs");
            r.rax = 0x1234_5678_9abc_def0;
            r.rbx = 0x00ca_feba_be00_1357;
            r.rip = 0x0;
            v.set_regs(&r).expect("set_regs");
        }

        let snap = vcpu.capture_snapshot().expect("capture_snapshot");

        // Clobber everything we'll check.
        {
            let v = vcpu.vcpu.borrow();
            let mut r = v.get_regs().expect("get_regs");
            r.rax = 0;
            r.rbx = 0;
            r.rip = 0x1000;
            v.set_regs(&r).expect("set_regs");
        }

        vcpu.restore_snapshot(&snap).expect("restore_snapshot");

        let r = vcpu.vcpu.borrow().get_regs().expect("get_regs");
        assert_eq!(r.rax, 0x1234_5678_9abc_def0, "rax restored");
        assert_eq!(r.rbx, 0x00ca_feba_be00_1357, "rbx restored");
        assert_eq!(r.rip, 0x0, "rip restored");
    }

    /// Snapshot increment 7b: capture a *running* guest's evolving vCPU state +
    /// RAM, restore into a brand-new VM, and confirm it RESUMES (not restarts).
    ///
    /// The guest is a tight `inc rax; jmp` loop (no IO), so it never exits on its
    /// own — we pause it at a clean instruction boundary with `force_exit` (the
    /// production quiesce path; snapshotting at an IO exit would lose KVM's
    /// pending-IO-completion state and re-execute the instruction). We snapshot
    /// the paused vCPU, tear the source VM down, restore into a fresh VM with a
    /// copy of the RAM, and check: (a) rax equals the snapshot value (running
    /// state transferred), then (b) after running again it has grown (resumed,
    /// not reset to 0).
    #[test]
    fn snapshot_resumes_running_guest_in_fresh_vm() {
        use std::time::Duration;

        // inc rax ; jmp -5 (back to offset 0) — a tight no-exit loop.
        let stub = [0x48u8, 0xff, 0xc0, 0xeb, 0xfb];

        // Run a vCPU's loop on its own thread until force-exited, then capture +
        // return its snapshot and the rax it reached.
        fn run_until_kicked(vcpu: KvmVcpu) -> (KvmSnapshotState, u64) {
            vcpu.bind_thread().expect("bind_thread");
            loop {
                match vcpu.step() {
                    Ok(VcpuExit::Canceled) => break,
                    Ok(_) => continue,
                    Err(e) => panic!("step: {e}"),
                }
            }
            let rax = vcpu.vcpu.borrow().get_regs().expect("get_regs").rax;
            (vcpu.capture_snapshot().expect("capture"), rax)
        }

        // --- Source: let the loop spin, then pause + snapshot. ---
        let (vm1, ram1) = vm_with_ram(&stub);
        let vcpu1 = vm1.create_vcpu().expect("create_vcpu");
        vcpu1.enter_long_mode(0x0, PML4_GPA).expect("enter_long_mode");
        let h1 = vcpu1.exit_token();
        let t1 = std::thread::spawn(move || run_until_kicked(vcpu1));
        std::thread::sleep(Duration::from_millis(50));
        KvmVcpuHandle::force_exit(&[h1]);
        let (snap, rax_at_snap) = t1.join().expect("join t1");
        assert!(rax_at_snap > 0, "guest should have incremented rax");

        let mut saved_ram = vec![0u8; MEM];
        // SAFETY: both buffers are `MEM` bytes long and the source vCPU has
        // stopped, so nothing writes guest RAM during the copy.
        unsafe { std::ptr::copy_nonoverlapping(ram1.as_ptr(), saved_ram.as_mut_ptr(), MEM) };
        // Release the source RAM before building the destination VM.
        drop(ram1);

        // --- Destination: fresh VM, load RAM, restore, verify resume. ---
        let (vm2, ram2) = vm_with_ram(&stub);
        // SAFETY: both buffers are `MEM` bytes long; no vCPU exists on vm2 yet.
        unsafe { std::ptr::copy_nonoverlapping(saved_ram.as_ptr(), ram2.as_ptr(), MEM) };
        let vcpu2 = vm2.create_vcpu().expect("create_vcpu");
        vcpu2.restore_snapshot(&snap).expect("restore_snapshot");

        // (a) Running state transferred: rax matches the snapshot exactly.
        let rax_restored = vcpu2.vcpu.borrow().get_regs().expect("get_regs").rax;
        assert_eq!(rax_restored, rax_at_snap, "rax not transferred to fresh VM");

        // (b) Resumes (keeps counting from there, not from 0).
        let h2 = vcpu2.exit_token();
        let t2 = std::thread::spawn(move || run_until_kicked(vcpu2));
        std::thread::sleep(Duration::from_millis(50));
        KvmVcpuHandle::force_exit(&[h2]);
        let (_snap2, rax_after) = t2.join().expect("join t2");
        assert!(
            rax_after > rax_at_snap,
            "restored guest did not resume: rax {rax_after} <= snapshot {rax_at_snap}"
        );
    }

    /// Snapshot increment 7c: the in-kernel device state (PIT + PIC master/slave
    /// + IOAPIC + clock) captures and restores cleanly on real /dev/kvm —
    /// exercising get/set_pit2, get/set_irqchip (×3 chips), get/set_clock. These
    /// are needed so a restored Linux keeps timekeeping + interrupt routing.
    #[test]
    fn device_state_captures_and_restores() {
        let vm = KvmVm::create().expect("create VM");
        vm.create_pit().expect("create_pit");
        let dev = vm.capture_devices().expect("capture_devices");
        vm.restore_devices(&dev).expect("restore_devices");
        // Re-capture after restore: all the ioctls still succeed (round-trip).
        let _dev2 = vm.capture_devices().expect("recapture_devices");
    }
}