supermachine 0.7.70

// Multi-vCPU coordinator. Each vCPU runs in its own pthread, owns
// its own hv_vcpu_t (HVF requires creation + register access from
// the SAME thread). vCPU 0 is the boot CPU and runs on the main
// thread; secondaries are parked at thread start, woken by PSCI
// CPU_ON from vCPU 0.
//
// PSCI is via HVC (FDT advertises method="hvc" already). Subset
// implemented:
//   PSCI_VERSION         (0x84000000)  -> 0x10000  (PSCI 1.0)
//   PSCI_CPU_OFF         (0x84000002)  -> spin (no-op for now)
//   PSCI_CPU_ON          (0xC4000003)  -> wake target thread
//   PSCI_AFFINITY_INFO   (0xC4000004)  -> 0=on / 1=off / 2=pending
//   PSCI_FEATURES        (0x8400000A)  -> 0 for known, NOT_SUPPORTED else
//   PSCI_SYSTEM_OFF      (0x84000008)  -> shutdown
//   PSCI_SYSTEM_RESET    (0x84000009)  -> shutdown

#![cfg(all(target_os = "macos", target_arch = "aarch64"))]

use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering};
use std::sync::{Arc, Condvar, Mutex};

use crate::hypervisor::{ActiveVcpu, ActiveVcpuHandle, HypervisorVcpu, VcpuHandle};
use crate::vmm::snapshot::PerVcpuState;

// PSCI function identifiers, one per call listed in the header comment.

/// Function ID of `PSCI_VERSION`.
pub const PSCI_VERSION: u32 = 0x8400_0000;
/// Function ID of `PSCI_CPU_OFF`.
pub const PSCI_CPU_OFF: u32 = 0x8400_0002;
/// Function ID of `PSCI_CPU_ON`.
pub const PSCI_CPU_ON: u32 = 0xC400_0003;
/// Function ID of `PSCI_AFFINITY_INFO`.
pub const PSCI_AFFINITY_INFO: u32 = 0xC400_0004;
/// Function ID of `PSCI_FEATURES`.
pub const PSCI_FEATURES: u32 = 0x8400_000A;
/// Function ID of `PSCI_SYSTEM_OFF`.
pub const PSCI_SYSTEM_OFF: u32 = 0x8400_0008;
/// Function ID of `PSCI_SYSTEM_RESET`.
pub const PSCI_SYSTEM_RESET: u32 = 0x8400_0009;

// PSCI return codes handed back to the guest.

/// The call completed successfully.
pub const PSCI_SUCCESS: i64 = 0;
/// The requested function is not implemented.
pub const PSCI_NOT_SUPPORTED: i64 = -1;
/// An argument was invalid (e.g. a target index with no slot).
pub const PSCI_INVALID_PARAMS: i64 = -2;
/// `CPU_ON` was issued for a vCPU that is already on.
pub const PSCI_ALREADY_ON: i64 = -4;

/// Start request stored in a vCPU's slot.
///
/// A slot begins as [`VcpuStart::Parked`]; `cpu_on` replaces it with
/// [`VcpuStart::Run`], which `wait_for_run` picks up on the vCPU's own
/// thread.
///
/// `Debug`/`PartialEq`/`Eq` are derived so the state can be logged and
/// compared directly in diagnostics and tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum VcpuStart {
    /// Secondary thread waits.
    Parked,
    /// CPU_ON requested: enter at `entry` with X0=`ctx_id`.
    Run { entry: u64, ctx_id: u64 },
}

/// Per-vCPU start/park slot. `VcpuCoordinator::new` creates one per vCPU
/// index, all initially `Parked` with `on == false`.
pub struct VcpuSlot {
    /// Pending start request. Written by `cpu_on`, read by `wait_for_run`
    /// (both under this mutex).
    pub state: Mutex<VcpuStart>,
    /// Paired with `state`: `cpu_on` notifies it after storing a `Run`
    /// request; `wait_for_run` blocks on it.
    pub cv: Condvar,
    /// True once vCPU thread has begun running guest code (any
    /// CPU_ON for an already-on vCPU returns ALREADY_ON). Set by
    /// `wait_for_run` just before it returns the entry point; also used
    /// to count "running" secondaries for the snapshot/restore rendezvous.
    pub on: AtomicBool,
}

/// Shared coordination state for the vCPU threads of one VM: PSCI
/// start slots, the snapshot-pause rendezvous, and the cycle-restore
/// rendezvous. Constructed by [`VcpuCoordinator::new`], which hands it
/// out inside an `Arc`.
pub struct VcpuCoordinator {
    /// vCPU count passed to `new`; `slots`, `captured` and
    /// `restore_states` are all sized from it.
    pub n_vcpus: u32,
    /// One start/park slot per vCPU index. The rendezvous code skips
    /// index 0 when counting running secondaries.
    pub slots: Vec<VcpuSlot>,
    /// Teardown flag. The waits in this file poll it to bail out early.
    /// Nothing in this file's non-test code sets it — presumably the
    /// runner does on shutdown (confirm against callers).
    pub shutdown: AtomicBool,
    /// Multi-vCPU snapshot pause/resume rendezvous — the shared, loom-proven
    /// [`PauseBarrier`](crate::vcpu_dispatch::PauseBarrier) (Phase 3 7b.3c). On
    /// vcpu0's thread the snapshot trigger calls `request_snapshot_pause`, which
    /// `request_pause`s + `hv_vcpus_exit`s the secondaries; each secondary's
    /// `maybe_pause_for_snapshot` captures its `PerVcpuState` and
    /// `park_cancelable`s into the barrier (best-effort: a 2 s deadline, and the
    /// shutdown flag cancels the park). The trigger drains the deposited states
    /// into `captured` — now just a single-threaded OUTPUT buffer that
    /// `take_secondary_states` reads, defaulting any secondary that didn't park
    /// in time. The barrier owns all the cross-thread concurrency (the part that
    /// was previously hand-rolled here and intermittent at N>1 vCPUs).
    pub snapshot_pause: crate::vcpu_dispatch::PauseBarrier<PerVcpuState>,
    /// Snapshot output buffer, indexed by vCPU index. Cleared and refilled
    /// by `request_snapshot_pause`; drained (entries 1..) by
    /// `take_secondary_states`.
    pub captured: Mutex<Vec<Option<PerVcpuState>>>,
    /// Resume generation for the cycle-restore path. Secondaries in
    /// `maybe_apply_restore` sample it, then wait until it changes;
    /// `release_after_restore` bumps it (wrapping) and notifies `resume_cv`.
    pub resume_lock: Mutex<u64>, // monotonic generation; secondaries wait while gen unchanged
    /// Condvar paired with `resume_lock`.
    pub resume_cv: Condvar,
    /// Secondary `hv_vcpu_t` handles, registered by each
    /// secondary thread after it creates its vCPU (HVF requires
    /// the same thread that created the vCPU to perform reg
    /// access; the handle itself is just a u64 we can pass to
    /// `hv_vcpus_exit` to force a run exit from any thread).
    pub secondary_handles: Mutex<Vec<ActiveVcpuHandle>>,
    /// Multi-vCPU cycle-restore rendezvous. Counterpart to the
    /// snapshot pause primitives above. When the runner takes a
    /// pool restore request and needs to roll secondaries back to
    /// the snapshot's per-vCPU state, it populates `restore_states`
    /// with each secondary's target state, sets `restore_request`,
    /// and `hv_vcpus_exit`s them. Each secondary thread, on its own
    /// HVF-owning thread (HVF requires register writes to come from
    /// the thread that created the vCPU), pulls its target from
    /// `restore_states[idx]`, calls `restore_snapshot` on its vCPU,
    /// increments `restored_count`, and waits on `resume_lock`'s
    /// generation. Same shape as `request_snapshot_pause`,
    /// just inverted: secondaries WRITE state instead of READING it.
    pub restore_request: AtomicBool,
    /// Per-vCPU target states for the current cycle-restore, indexed by
    /// vCPU index. Cleared by `pause_secondaries_for_restore`, filled by
    /// `publish_and_wait_secondary_restore`.
    pub restore_states: Mutex<Vec<Option<PerVcpuState>>>,
    /// Number of secondaries that have finished phase 1 of the current
    /// cycle-restore. Reset to 0 by `pause_secondaries_for_restore`.
    pub restored_count: AtomicU32,
    /// CNTVOFF_EL2 value to apply to every secondary vCPU on its
    /// restore path. Set by the boot vCPU's restore_snapshot just
    /// after it computes the new offset; read by each secondary's
    /// `run_secondary_inner` after it's loaded its captured
    /// per-vCPU state. Same value on all vCPUs so the guest's
    /// CNTVCT_EL0 reads are consistent across cores — without
    /// this, only vcpu0 had its offset re-applied on restore,
    /// secondaries saw the host's raw CNTPCT, and Linux thread
    /// migration between vCPUs caused CLOCK_MONOTONIC to skew
    /// (libuv asserts in Node ≥24, hangs in earlier versions).
    /// 0 means "no value set yet" (the secondary just keeps its
    /// HVF default for that boot, fine on cold boot, but should
    /// only happen pre-snapshot-restore).
    pub vtimer_offset: AtomicU64,
    /// Cold-restore handshake: signals secondaries that the boot
    /// thread has completed `hv_gic_set_state` (the GIC blob applies
    /// to every PE — distributor + per-PE redistributor frame
    /// including the WAKER `ProcessorSleep` bit and CTLR enable
    /// bits HVF does NOT expose via per-register accessors). Before
    /// this flag is true, a secondary's HVF vCPU exists but its
    /// redistributor frame is in HVF's default state (asleep, IRQs
    /// not routable). After it's true, the secondary can safely
    /// overlay its per-PE register state from the snapshot's
    /// `redist_regs`/`ich_regs` knowing the GIC's global view of
    /// this PE already has the blob's machinery in place.
    ///
    /// Without this handshake we saw intermittent (~13 % at 4
    /// vCPUs, never at 1 vCPU) post-restore deadlocks where amd64
    /// processes spawned via rosettad would hang in
    /// `__skb_wait_for_more_packets` on the rosettad↔wrapper
    /// AF_UNIX DGRAM socket. The rosettad daemon (or the wrapper)
    /// happened to land on a secondary whose redistributor was
    /// still in "ProcessorSleep" state — IPIs from the sending
    /// process (running on a working CPU) hit a redistributor that
    /// wouldn't accept SGIs, the wake-up was dropped, and the
    /// recv() never returned. The boot CPU stayed responsive (its
    /// PE was created before `hv_gic_set_state` so it picked up
    /// the blob), the agent could still accept `vm.exec` calls,
    /// but everything scheduled on the broken secondary stayed
    /// frozen.
    ///
    /// This file only initialises the flag to `false`; it is set and
    /// read elsewhere.
    pub gic_restore_done: AtomicBool,
}

impl VcpuCoordinator {
    pub fn new(n_vcpus: u32) -> Arc<Self> {
        let slots = (0..n_vcpus)
            .map(|_| VcpuSlot {
                state: Mutex::new(VcpuStart::Parked),
                cv: Condvar::new(),
                on: AtomicBool::new(false),
            })
            .collect();
        Arc::new(Self {
            n_vcpus,
            slots,
            shutdown: AtomicBool::new(false),
            snapshot_pause: crate::vcpu_dispatch::PauseBarrier::new(),
            captured: Mutex::new((0..n_vcpus).map(|_| None).collect()),
            resume_lock: Mutex::new(0),
            resume_cv: Condvar::new(),
            secondary_handles: Mutex::new(Vec::new()),
            restore_request: AtomicBool::new(false),
            restore_states: Mutex::new((0..n_vcpus).map(|_| None).collect()),
            restored_count: AtomicU32::new(0),
            vtimer_offset: AtomicU64::new(0),
            gic_restore_done: AtomicBool::new(false),
        })
    }

    /// Record a secondary vCPU's handle.
    ///
    /// Each secondary thread calls this after creating its `hv_vcpu_t`,
    /// so the snapshot trigger thread can later force-exit all
    /// secondaries in one batch.
    pub fn register_secondary(&self, handle: ActiveVcpuHandle) {
        let mut registered = self.secondary_handles.lock().unwrap();
        registered.push(handle);
    }

    /// Return a copy of the secondary handles registered so far, in
    /// registration order.
    pub fn secondary_handles_snapshot(&self) -> Vec<ActiveVcpuHandle> {
        let registered = self.secondary_handles.lock().unwrap();
        registered.clone()
    }

    /// Secondary-side half of the snapshot pause; called by secondary
    /// dispatch loops between `hv_vcpu_run` iterations.
    ///
    /// If no pause is pending this returns immediately. Otherwise the
    /// secondary captures its own register state on its owning thread,
    /// deposits it in the `snapshot_pause` barrier under `idx`, and parks
    /// there until the trigger resumes it or `shutdown` cancels the park.
    ///
    /// # Errors
    /// Propagates a failure from `capture_snapshot`; in that case nothing
    /// is deposited and the secondary does not park.
    pub fn maybe_pause_for_snapshot(
        &self,
        idx: u32,
        vcpu: &ActiveVcpu,
    ) -> crate::hypervisor::ActiveResult<()> {
        if self.snapshot_pause.is_paused() {
            // The backend requires the capture to run on the thread that
            // owns this vCPU, i.e. this one.
            let state = vcpu.capture_snapshot()?;
            if crate::trace::enabled("timings") {
                let pc = state.pc().unwrap_or(0);
                eprintln!("  [vcpu-{idx}] snapshot pause: PC=0x{pc:x}");
            }
            let barrier_slot = idx as usize;
            self.snapshot_pause
                .park_cancelable(barrier_slot, state, &self.shutdown);
        }
        Ok(())
    }

    /// Trigger-side half of the snapshot pause; called on vcpu0's thread.
    ///
    /// Clears `captured`, requests the pause, force-exits the given
    /// secondaries, then waits (best-effort: 2 s deadline, cut short by
    /// `shutdown`) for every *running* secondary to park. Each deposited
    /// state is stored into `captured` at its vCPU index.
    ///
    /// Secondaries whose slot is not `on` — still parked in
    /// `wait_for_run`, e.g. when the snapshot fires before SMP bring-up —
    /// never reach the rendezvous and are not waited for. Their entry
    /// stays `None`, as does the entry of any secondary that missed the
    /// deadline; `take_secondary_states` later substitutes a default.
    pub fn request_snapshot_pause(&self, secondary_handles: &[ActiveVcpuHandle]) {
        // Drop whatever a previous snapshot left in the output buffer.
        self.captured
            .lock()
            .unwrap()
            .iter_mut()
            .for_each(|entry| *entry = None);

        self.snapshot_pause.request_pause();
        ActiveVcpuHandle::force_exit(secondary_handles);

        // Only secondaries that are running guest code will park.
        let mut target = 0usize;
        for slot in self.slots.iter().skip(1) {
            if slot.on.load(Ordering::Acquire) {
                target += 1;
            }
        }

        let deadline = std::time::Instant::now() + std::time::Duration::from_secs(2);
        let pairs = self
            .snapshot_pause
            .wait_parked_until(target, deadline, &self.shutdown);

        let deposited = pairs.len();
        if deposited < target {
            eprintln!(
                "  [coord] snapshot-pause timeout: {}/{} secondaries deposited",
                deposited, target,
            );
        }

        // Single-threaded bookkeeping from here: the barrier has already
        // handled the cross-thread hand-off.
        let mut captured = self.captured.lock().unwrap();
        for (idx, st) in pairs {
            if let Some(entry) = captured.get_mut(idx) {
                *entry = Some(st);
            }
        }
    }

    /// Resume the `snapshot_pause` barrier so parked secondaries go back
    /// to running guest code. Counterpart of `request_snapshot_pause`.
    pub fn release_after_snapshot(&self) {
        let barrier = &self.snapshot_pause;
        barrier.resume();
    }

    /// Multi-vCPU cycle-restore: secondary-side rendezvous.
    /// Counterpart to `maybe_pause_for_snapshot`. Returns immediately
    /// when no restore is requested. Otherwise two-phase:
    ///
    ///   1. The runner sets `restore_request` + `hv_vcpus_exit`s
    ///      secondaries BEFORE doing RAM remap / GIC blob /
    ///      vcpu0 restore. Each secondary lands here and spins
    ///      until its `restore_states[idx]` becomes `Some` —
    ///      that's the runner's signal "GIC + vcpu0 are done,
    ///      apply your state now". (HVF requires register writes
    ///      to come from the OWNING thread, so secondaries apply
    ///      their own state.) The spin gives up after 5 s, or
    ///      returns at once if `shutdown` is set.
    ///
    ///   2. After applying, the secondary increments
    ///      `restored_count` and waits on `resume_lock`'s
    ///      generation. The runner spins until
    ///      `restored_count == n_running_secondaries`, then
    ///      `release_after_restore` bumps the gen and secondaries
    ///      re-enter `hv_vcpu_run` with the snapshot's coherent
    ///      state.
    ///
    /// Without the two-phase split the runner would race against
    /// secondaries' state application — they'd write ICH regs
    /// referencing distributor INTIDs the GIC blob restore
    /// hadn't published yet, and the next IRQ delivery on the
    /// restored guest would land in undefined territory.
    ///
    /// # Errors
    /// Propagates failures from `restore_snapshot` / `set_vtimer_offset`.
    /// On error `restored_count` is NOT incremented, so the runner's
    /// phase-2 wait falls back to its own deadline.
    ///
    /// # Panics
    /// Panics if `idx` is not a valid index into `restore_states`.
    pub fn maybe_apply_restore(
        &self,
        idx: u32,
        vcpu: &ActiveVcpu,
    ) -> crate::hypervisor::ActiveResult<()> {
        // How often the phase-2 wait re-checks `shutdown`. The flag is a
        // bare atomic, so setting it does not wake `resume_cv`; without a
        // timed wait a teardown mid-restore could leave this thread
        // blocked forever.
        const RESUME_POLL: std::time::Duration = std::time::Duration::from_millis(50);

        if !self.restore_request.load(Ordering::Acquire) {
            return Ok(());
        }
        let saved_gen = *self.resume_lock.lock().unwrap();
        // Re-check now that the generation is sampled. `release_after_restore`
        // clears the request BEFORE bumping the generation, so if the request
        // is still set here, `saved_gen` is the pre-bump value and the release
        // will wake us. If it was cleared in between, the runner has already
        // given up on this cycle: applying the (stale) published state and
        // parking would strand this vCPU until the next restore.
        if !self.restore_request.load(Ordering::Acquire) {
            return Ok(());
        }
        // Phase 1: spin-wait until our target state is published.
        // The runner does GIC + vcpu0 restore first, then pops
        // states into `restore_states`. Bound the wait on shutdown
        // so a teardown mid-restore doesn't deadlock the secondary.
        let deadline = std::time::Instant::now() + std::time::Duration::from_secs(5);
        loop {
            // Clone out under the lock so the (potentially slow) register
            // writes below don't serialize the other secondaries.
            let snapshot = {
                let g = self.restore_states.lock().unwrap();
                g[idx as usize].clone()
            };
            if let Some(st) = snapshot {
                vcpu.restore_snapshot(&st)?;
                // Re-align this secondary's CNTVOFF_EL2 with the
                // boot vCPU's just-applied offset. The boot vCPU
                // runs `restore_snapshot_timed_with_options`
                // BEFORE publishing into `restore_states`, so the
                // value in `self.vtimer_offset` is already fresh
                // when we land here. See worker.rs's analogous
                // call in run_secondary_inner's cold-restore path
                // for the full rationale (Node libuv asserts on
                // CLOCK_MONOTONIC skew when secondaries don't
                // share vcpu0's CNTVOFF after restore).
                let off = self.vtimer_offset.load(Ordering::Acquire);
                if off != 0 {
                    vcpu.set_vtimer_offset(off)?;
                }
                break;
            }
            if self.shutdown.load(Ordering::Acquire) {
                return Ok(());
            }
            if std::time::Instant::now() > deadline {
                eprintln!(
                    "  [vcpu-{idx}] cycle-restore phase-1 timeout (state not published); \
                     proceeding without restore (will likely panic on resume)"
                );
                break;
            }
            std::hint::spin_loop();
        }
        // Phase 2: deposit completion + wait for resume.
        self.restored_count.fetch_add(1, Ordering::AcqRel);
        let mut g = self.resume_lock.lock().unwrap();
        while *g == saved_gen && !self.shutdown.load(Ordering::Acquire) {
            // Timed wait: a release still wakes us promptly via
            // `notify_all`; the timeout only exists to observe `shutdown`.
            let (guard, _timed_out) = self.resume_cv.wait_timeout(g, RESUME_POLL).unwrap();
            g = guard;
        }
        Ok(())
    }

    /// Multi-vCPU cycle-restore, driver side, phase 1.
    ///
    /// Clears `restore_states` and `restored_count`, raises
    /// `restore_request`, and force-exits the given secondaries so they
    /// land in `maybe_apply_restore`. Does NOT wait for them to arrive:
    /// they spin there until states are published, which leaves the
    /// runner free to do the GIC + vcpu0 restore in the meantime.
    pub fn pause_secondaries_for_restore(&self, secondary_handles: &[ActiveVcpuHandle]) {
        let started = std::time::Instant::now();

        // A previous restore may have left published states behind; a
        // secondary must not mistake those for this cycle's.
        self.restore_states
            .lock()
            .unwrap()
            .iter_mut()
            .for_each(|entry| *entry = None);

        self.restored_count.store(0, Ordering::SeqCst);
        self.restore_request.store(true, Ordering::Release);
        ActiveVcpuHandle::force_exit(secondary_handles);

        if !crate::trace::enabled("timings") {
            return;
        }
        eprintln!(
            "  [coord-restore] pause: {} us (n_secondaries={})",
            started.elapsed().as_micros(),
            secondary_handles.len(),
        );
    }

    /// Multi-vCPU cycle-restore, driver side, phase 2.
    ///
    /// Publishes the secondaries' target states, then spins until every
    /// running secondary (slot `on`) has bumped `restored_count`. The spin
    /// ends early when `shutdown` is set (returning immediately, without
    /// the timing trace) or after a 2 s deadline (logged, then returns).
    ///
    /// `secondary_states[i]` is the state for vCPU index `i + 1`; entries
    /// beyond the coordinator's vCPU count are ignored.
    ///
    /// The caller MUST follow up with `release_after_restore` to bump the
    /// resume generation.
    pub fn publish_and_wait_secondary_restore(&self, secondary_states: &[Option<PerVcpuState>]) {
        let started = std::time::Instant::now();

        // One lock acquisition for the whole batch. Skipping entry 0 lines
        // `secondary_states[i]` up with vCPU `i + 1`; `zip` stops at the
        // shorter side, which is the bounds check.
        {
            let mut published = self.restore_states.lock().unwrap();
            for (dst, src) in published.iter_mut().skip(1).zip(secondary_states) {
                *dst = src.clone();
            }
        }
        let publish_us = started.elapsed().as_micros();

        // Parked-pre-SMP secondaries never reach `maybe_apply_restore`,
        // so only slots that are `on` count towards the target.
        let running = self
            .slots
            .iter()
            .skip(1)
            .filter(|slot| slot.on.load(Ordering::Acquire));
        let target: u32 = running.count() as u32;

        let deadline = std::time::Instant::now() + std::time::Duration::from_secs(2);
        loop {
            if self.restored_count.load(Ordering::Acquire) >= target {
                break;
            }
            if self.shutdown.load(Ordering::Acquire) {
                return;
            }
            if std::time::Instant::now() > deadline {
                eprintln!(
                    "  [coord] cycle-restore phase-2 timeout: {}/{} secondaries applied",
                    self.restored_count.load(Ordering::Acquire),
                    target,
                );
                break;
            }
            std::hint::spin_loop();
        }

        if crate::trace::enabled("timings") {
            eprintln!(
                "  [coord-restore] publish+wait: total {} us (publish {} us; spin {} us, target={target})",
                started.elapsed().as_micros(),
                publish_us,
                started.elapsed().as_micros().saturating_sub(publish_us),
            );
        }
    }

    /// End a cycle-restore: clear `restore_request`, advance the resume
    /// generation (wrapping), and wake every secondary waiting on
    /// `resume_cv` so it re-enters `hv_vcpu_run` with its freshly applied
    /// state.
    pub fn release_after_restore(&self) {
        // Clear the request first so a secondary arriving from now on
        // skips the rendezvous instead of joining a finished cycle.
        self.restore_request.store(false, Ordering::Release);

        let mut generation = self.resume_lock.lock().unwrap();
        let next = generation.wrapping_add(1);
        *generation = next;
        self.resume_cv.notify_all();
    }

    /// Drain the captured secondary states (vCPU indices `1..`).
    ///
    /// Meant to be called by the snapshot trigger after
    /// `request_snapshot_pause` has returned. The result has one entry
    /// per secondary, in index order, so the caller can prepend vcpu0's
    /// own state. A secondary with no captured state (`None` in
    /// `captured`) yields `PerVcpuState::default()` rather than failing
    /// the save. Every drained entry is left as `None`.
    pub fn take_secondary_states(&self) -> Vec<PerVcpuState> {
        let mut captured = self.captured.lock().unwrap();
        let drained: Vec<PerVcpuState> = captured
            .iter_mut()
            .skip(1)
            .map(|entry| entry.take().unwrap_or_default())
            .collect();
        drained
    }

    /// PSCI CPU_ON: ask `target`'s thread to start at `entry` with
    /// X0 = `ctx_id`. Returns the PSCI return code:
    ///
    /// * `PSCI_INVALID_PARAMS` — `target` has no slot;
    /// * `PSCI_ALREADY_ON` — the target is already running guest code;
    /// * ON_PENDING (`-5`) — an earlier CPU_ON for the target was accepted
    ///   but its thread has not picked it up yet;
    /// * `PSCI_SUCCESS` — the request was stored and the thread notified.
    ///
    /// All checks happen under the slot's state lock. `wait_for_run` sets
    /// `on` while holding that same lock, so a request can no longer slip
    /// in between the check and the store and silently overwrite the
    /// entry point of a CPU_ON that was already accepted.
    pub fn cpu_on(&self, target: u32, entry: u64, ctx_id: u64) -> i64 {
        // PSCI ON_PENDING: CPU_ON was already issued for this core and it
        // is not on yet. Kept local because it is only produced here.
        const PSCI_ON_PENDING: i64 = -5;

        let Some(slot) = self.slots.get(target as usize) else {
            return PSCI_INVALID_PARAMS;
        };
        let mut start = slot.state.lock().unwrap();
        if slot.on.load(Ordering::SeqCst) {
            return PSCI_ALREADY_ON;
        }
        // Nothing in this file moves a slot back to `Parked`, so a `Run`
        // request with `on == false` means the wake-up is still in flight.
        if matches!(*start, VcpuStart::Run { .. }) {
            return PSCI_ON_PENDING;
        }
        *start = VcpuStart::Run { entry, ctx_id };
        slot.cv.notify_one();
        PSCI_SUCCESS
    }

    pub fn affinity_info(&self, target: u32) -> i64 {
        match self.slots.get(target as usize) {
            Some(slot) if slot.on.load(Ordering::SeqCst) => 0, // ON
            Some(_) => 1,                                      // OFF
            None => PSCI_INVALID_PARAMS,
        }
    }

    /// Wait (blocking) until our slot is told to Run. Used by
    /// secondary vCPU threads on startup.
    ///
    /// Returns `Some((entry, ctx_id))` from the CPU_ON request, after
    /// marking the slot `on`, or `None` if `shutdown` is observed first.
    ///
    /// The wait is timed: `shutdown` is a bare atomic, so setting it does
    /// not wake `slot.cv`. Polling guarantees a parked secondary notices
    /// teardown even if nobody notifies the condvar (or the notification
    /// races with the flag check).
    ///
    /// # Panics
    /// Panics if `idx` has no slot.
    pub fn wait_for_run(&self, idx: u32) -> Option<(u64, u64)> {
        // Upper bound on how long a parked thread takes to notice shutdown.
        const SHUTDOWN_POLL: std::time::Duration = std::time::Duration::from_millis(100);

        let slot = &self.slots[idx as usize];
        let mut s = slot.state.lock().unwrap();
        loop {
            if self.shutdown.load(Ordering::SeqCst) {
                return None;
            }
            if let VcpuStart::Run { entry, ctx_id } = *s {
                // Set while still holding the state lock so `cpu_on`
                // cannot observe a half-started vCPU.
                slot.on.store(true, Ordering::SeqCst);
                return Some((entry, ctx_id));
            }
            // CPU_ON still wakes us immediately via `notify_one`; the
            // timeout only bounds the latency of the shutdown check.
            let (guard, _timed_out) = slot.cv.wait_timeout(s, SHUTDOWN_POLL).unwrap();
            s = guard;
        }
    }
}

#[cfg(test)]
mod tests {
    //! The multi-vCPU restore rendezvous (`publish_and_wait_secondary_
    //! restore`) drives real secondary vCPU threads in production, but its
    //! coordination is just atomics + a deadline, so we exercise it here
    //! without HVF by driving `restored_count` / slot `on` / `shutdown`
    //! directly. The headline is the forward-progress guarantee: a stalled
    //! secondary must NOT hang the host — the deadline fires and we return.
    use super::*;
    use std::time::{Duration, Instant};

    /// Build a `Some(PerVcpuState)` that is default except for
    /// `vtimer_offset`, which the tests use as a recognisable tag.
    fn st(vtimer_offset: u64) -> Option<PerVcpuState> {
        let tagged = PerVcpuState {
            vtimer_offset,
            ..Default::default()
        };
        Some(tagged)
    }

    /// Flag the given slot indices as `on`, i.e. "running guest code",
    /// so the rendezvous code counts them towards its wait target.
    fn mark_running(coord: &VcpuCoordinator, indices: &[usize]) {
        indices
            .iter()
            .for_each(|&slot_idx| coord.slots[slot_idx].on.store(true, Ordering::Release));
    }

    #[test]
    fn publish_and_wait_returns_at_once_with_no_running_secondaries() {
        // Four vCPUs exist but no slot is `on`, so the wait target is 0
        // and the call must come back without spinning.
        let coord = VcpuCoordinator::new(4);
        let states = [st(0xAB), st(0xAB), st(0xAB)];

        let started = Instant::now();
        coord.publish_and_wait_secondary_restore(&states);
        assert!(started.elapsed() < Duration::from_millis(500));

        // The states still landed in the secondary entries (index 1..).
        let published = coord.restore_states.lock().unwrap();
        for idx in [1usize, 3] {
            assert_eq!(published[idx].as_ref().unwrap().vtimer_offset, 0xAB);
        }
    }

    #[test]
    fn publish_and_wait_completes_when_all_secondaries_apply() {
        let coord = VcpuCoordinator::new(3);
        mark_running(&coord, &[1, 2]); // wait target becomes 2

        // Stand-in for the two secondaries: report both as applied after
        // a short delay.
        let applier = {
            let coord = Arc::clone(&coord);
            std::thread::spawn(move || {
                std::thread::sleep(Duration::from_millis(50));
                coord.restored_count.store(2, Ordering::Release);
            })
        };

        let started = Instant::now();
        coord.publish_and_wait_secondary_restore(&[st(1), st(2)]);
        let el = started.elapsed();
        applier.join().unwrap();

        assert!(
            el < Duration::from_millis(1500),
            "returned before the 2s deadline: {el:?}"
        );
    }

    #[test]
    fn publish_and_wait_times_out_when_a_secondary_stalls() {
        // Forward-progress guarantee: with the applied count stuck below
        // the target, the driver must fall back to its 2 s deadline and
        // return instead of hanging the host. This test therefore takes
        // about 2 s by design.
        let coord = VcpuCoordinator::new(3);
        mark_running(&coord, &[1, 2]); // wait target becomes 2
        coord.restored_count.store(1, Ordering::Release); // one of two applied

        let started = Instant::now();
        coord.publish_and_wait_secondary_restore(&[st(1), st(2)]);
        let el = started.elapsed();

        let lower = Duration::from_secs(2);
        let upper = Duration::from_secs(4);
        assert!(el >= lower, "must wait out the deadline: {el:?}");
        assert!(el < upper, "must return after it, not hang: {el:?}");
    }

    #[test]
    fn publish_and_wait_shutdown_short_circuits() {
        let coord = VcpuCoordinator::new(3);
        mark_running(&coord, &[1, 2]); // nobody ever applies

        // Raise the shutdown flag shortly after the wait has started.
        let stopper = {
            let coord = Arc::clone(&coord);
            std::thread::spawn(move || {
                std::thread::sleep(Duration::from_millis(50));
                coord.shutdown.store(true, Ordering::Release);
            })
        };

        let started = Instant::now();
        coord.publish_and_wait_secondary_restore(&[st(1), st(2)]);
        let el = started.elapsed();
        stopper.join().unwrap();

        assert!(
            el < Duration::from_millis(1500),
            "shutdown must cut the wait short: {el:?}"
        );
    }

    #[test]
    fn publish_state_is_bounds_checked_against_slot_count() {
        // Passing more states than there are secondaries must neither
        // panic nor write out of bounds: only in-range entries are stored.
        let coord = VcpuCoordinator::new(2); // `restore_states` has indices 0 and 1
        let oversized = [st(0x55), st(0x66), st(0x77), st(0x88), st(0x99)];
        coord.publish_and_wait_secondary_restore(&oversized);

        let published = coord.restore_states.lock().unwrap();
        assert_eq!(published.len(), 2);
        // Input 0 maps to index 1. Inputs 1.. would map to index >= 2,
        // which is out of range, so they are silently skipped.
        assert_eq!(published[1].as_ref().unwrap().vtimer_offset, 0x55);
    }

    #[test]
    fn release_after_restore_clears_request_and_bumps_resume_generation() {
        let coord = VcpuCoordinator::new(2);
        coord.restore_request.store(true, Ordering::Release);
        {
            let mut generation = coord.resume_lock.lock().unwrap();
            *generation = 41;
        }

        coord.release_after_restore();

        let request_still_set = coord.restore_request.load(Ordering::Acquire);
        assert!(!request_still_set);
        let generation = *coord.resume_lock.lock().unwrap();
        assert_eq!(generation, 42, "resume generation advances");
    }

    // The snapshot-PAUSE rendezvous now runs on the shared, loom-proven
    // `PauseBarrier` (its own concurrency tests + loom Models C/D cover the
    // handshake). These tests just verify the COORDINATOR wires it correctly:
    // request → secondaries park → states drained into `captured` by index;
    // shutdown short-circuits; release clears the pause. An empty handles slice
    // skips the HVF `hv_vcpus_exit`, so it drives without a real VM.

    #[test]
    fn request_snapshot_pause_collects_parked_secondary_states() {
        let coord = VcpuCoordinator::new(3);
        mark_running(&coord, &[1, 2]); // target = 2
        // Stale entry from a "previous snapshot"; the request must reset it.
        coord.captured.lock().unwrap()[1] = st(0xDEAD);
        // Two secondaries park (deposit their state) once the pause is
        // requested. Each tags its state with its own index.
        let parkers: Vec<_> = [1usize, 2]
            .into_iter()
            .map(|idx| {
                let c = coord.clone();
                std::thread::spawn(move || {
                    while !c.snapshot_pause.is_paused() {
                        std::thread::yield_now();
                    }
                    c.snapshot_pause
                        .park_cancelable(idx, st(idx as u64).unwrap(), &c.shutdown);
                })
            })
            .collect();
        let t = Instant::now();
        coord.request_snapshot_pause(&[]); // request pause + wait for both to park
        let el = t.elapsed();
        assert!(
            el < Duration::from_millis(1500),
            "returned before the deadline: {el:?}"
        );
        assert!(
            coord.snapshot_pause.is_paused(),
            "still paused after request"
        );
        // Both secondaries' states were drained into the output buffer by
        // index. Compare the tags rather than just `is_some()`: the stale
        // 0xDEAD entry is `Some` too, so only the value proves that it was
        // reset and replaced by the freshly deposited state.
        {
            let captured = coord.captured.lock().unwrap();
            assert_eq!(
                captured[1].as_ref().map(|s| s.vtimer_offset),
                Some(1),
                "secondary 1 captured (stale entry replaced)"
            );
            assert_eq!(
                captured[2].as_ref().map(|s| s.vtimer_offset),
                Some(2),
                "secondary 2 captured"
            );
        }
        coord.release_after_snapshot(); // release the parked secondaries
        for h in parkers {
            h.join().unwrap();
        }
    }

    #[test]
    fn request_snapshot_pause_shutdown_short_circuits() {
        let coord = VcpuCoordinator::new(3);
        mark_running(&coord, &[1, 2]); // nobody ever parks

        // Raise the shutdown flag shortly after the wait has started.
        let stopper = {
            let coord = Arc::clone(&coord);
            std::thread::spawn(move || {
                std::thread::sleep(Duration::from_millis(50));
                coord.shutdown.store(true, Ordering::Release);
            })
        };

        let started = Instant::now();
        coord.request_snapshot_pause(&[]);
        let el = started.elapsed();
        stopper.join().unwrap();

        assert!(
            el < Duration::from_millis(1500),
            "shutdown must cut the wait short: {el:?}"
        );
    }

    #[test]
    fn release_after_snapshot_clears_the_pause() {
        // The snapshot release goes through the barrier (which has its own
        // generation), not through the restore path's `resume_lock`, so
        // the only thing to check here is that the pause flag drops.
        let coord = VcpuCoordinator::new(2);
        let barrier = &coord.snapshot_pause;

        barrier.request_pause();
        assert!(barrier.is_paused());

        coord.release_after_snapshot();
        let still_paused = barrier.is_paused();
        assert!(!still_paused, "pause cleared after release");
    }
}