// supermachine 0.7.70

//! Portable run-loop convergence vocabulary (Phase 3 7b).
//!
//! Both backends run a per-vCPU loop that, on each guest exit, decodes it,
//! services the device/bus or arch trap, completes it, and decides whether to
//! keep running. The DECODE + COMPLETE step is irreducibly per-backend (HVF
//! decodes an ARM `Exception`/ESR and completes MMIO via `set_core` + advance-PC;
//! KVM gets first-class `Io`/`Mmio` exits and completes reads in place in the
//! `kvm_run` page — see `docs/design/vmm-backend-unification-2026-06-07.md` §7b).
//! But the OUTCOME of one such step — "keep going / clean shutdown / terminal
//! stop / force-exited by the lifecycle" — is identical across backends.
//!
//! This module owns that shared outcome type ([`VcpuOutcome`]) so both run
//! loops speak one vocabulary. It is the first converged piece of 7b; the
//! per-backend `run_one` dispatch trait and the shared lifecycle loop that
//! consumes [`VcpuOutcome`] land in subsequent increments (7b.1→7b.3), once
//! both backends' concrete shapes inform the trait (avoids generalizing from a
//! single implementation).
//!
//! It also hosts [`PauseBarrier`], the live-snapshot pause/resume rendezvous
//! shared by both backends (7b.3).

/// What one "run the vCPU until it exits, then dispatch that exit" step decided.
/// Returned by each backend's per-exit handler; consumed by the per-vCPU loop.
///
/// The type is a fieldless enum deriving `Copy` and `Eq`, so it is returned by
/// value and compared directly.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum VcpuOutcome {
    /// The exit was serviced (device MMIO/PIO, arch trap, timer, spurious wake);
    /// the vCPU should re-enter the guest.
    Continue,
    /// A cross-thread force-exit stopped this vCPU because the lifecycle asked it
    /// to (a peer vCPU stopped first, a pause/cancel was requested). Not an error
    /// and not a guest-initiated stop.
    Canceled,
    /// A terminal stop that is NOT a clean power-off: an unhandled exit, a fatal
    /// guest abort, or `panic=-1`/`reboot=t`. The VM is done; surface it as such.
    Stop,
    /// A clean guest-requested power-off (aarch64 PSCI `SYSTEM_OFF`; x86 a triple
    /// fault / `KVM_EXIT_SHUTDOWN` from `reboot`). Distinguished from
    /// [`Stop`](Self::Stop) so the orchestration can report an orderly shutdown
    /// vs an abnormal one.
    SystemOff,
}

/// The live-snapshot pause/resume rendezvous, shared by both backends (Phase 3
/// 7b.3). A snapshot of a *running* multi-vCPU guest must freeze every vCPU at a
/// clean instruction boundary, capture each one's state, then resume them — all
/// without the guest observing a gap. Each backend previously carried its own
/// copy of this logic, with the same generation-barrier shape; this is the
/// single audited version.
///
/// `S` is the per-vCPU captured state that each vCPU hands to
/// [`park`](Self::park) / [`park_cancelable`](Self::park_cancelable) and that
/// the snapshotter receives back from the `wait_*` methods.
///
/// Protocol (snapshotter is one thread; `ncpus` vCPU threads):
///   1. snapshotter [`request_pause`](Self::request_pause), then force-exits
///      every vCPU (backend-specific: `hv_vcpus_exit` / SIGUSR1).
///   2. each vCPU, on its forced exit, sees [`is_paused`](Self::is_paused),
///      captures its own state, and calls [`park`](Self::park) — depositing the
///      state and blocking until resumed.
///   3. snapshotter [`wait_all_parked`](Self::wait_all_parked) returns once all
///      `ncpus` have parked (guest fully frozen), handing back the states sorted
///      by vCPU index; it captures device + RAM state, then [`resume`](Self::resume).
///   4. `resume` clears the pause, bumps the generation, and wakes the parked
///      vCPUs, which re-enter the guest.
///
/// `pause` is a lock-free `AtomicBool` so the vCPU hot path checks it on every
/// spurious wake without taking the mutex; the deposit/park/resume handshake is
/// under the mutex + condvar so there is no lost wakeup. The generation counter
/// lets a vCPU that wakes after the snapshotter has already cycled to the next
/// generation exit its wait correctly (it compares the gen it parked at).
pub struct PauseBarrier<S> {
    /// "A pause is requested" flag: set by `request_pause`, cleared by `resume`
    /// (while holding `inner`), read lock-free by `is_paused`.
    pause: std::sync::atomic::AtomicBool,
    /// Parked count, generation and deposited states; every access goes
    /// through this mutex.
    inner: std::sync::Mutex<PauseState<S>>,
    /// Single condvar paired with `inner`: parking vCPUs notify it to wake the
    /// snapshotter, and `resume` notifies it to wake the parked vCPUs.
    cv: std::sync::Condvar,
}

/// Mutex-protected bookkeeping for [`PauseBarrier`]: everything the
/// park / wait / resume handshake reads or writes while holding the lock.
struct PauseState<S> {
    /// How many vCPUs have parked in the current generation. Reset to 0 by
    /// `resume`.
    parked: usize,
    /// Monotonic generation; bumped by `resume` so a late-waking vCPU that
    /// parked in gen N sees gen != N and stops waiting.
    gen: u64,
    /// `(vcpu_index, captured_state)` deposited by each parked vCPU. Moved out
    /// by the snapshotter's `wait_*` call; any leftovers are cleared by `resume`.
    states: Vec<(usize, S)>,
}

impl<S> Default for PauseBarrier<S> {
    fn default() -> Self {
        Self::new()
    }
}

impl<S> PauseBarrier<S> {
    pub fn new() -> Self {
        PauseBarrier {
            pause: std::sync::atomic::AtomicBool::new(false),
            inner: std::sync::Mutex::new(PauseState {
                parked: 0,
                gen: 0,
                states: Vec::new(),
            }),
            cv: std::sync::Condvar::new(),
        }
    }

    /// Snapshotter: signal that a pause is requested. The caller then force-exits
    /// every vCPU so each lands in [`park`](Self::park). Lock-free.
    #[inline]
    pub fn request_pause(&self) {
        self.pause.store(true, std::sync::atomic::Ordering::SeqCst);
    }

    /// vCPU hot path: is a pause currently requested? Lock-free — checked on each
    /// forced exit before taking the slow [`park`](Self::park) path. `#[inline]`
    /// so this compiles to the same bare atomic load the per-backend run loops
    /// did before the convergence — zero added cost on the vCPU path.
    #[inline]
    pub fn is_paused(&self) -> bool {
        self.pause.load(std::sync::atomic::Ordering::SeqCst)
    }

    /// vCPU: deposit this vCPU's captured `state` and block until the snapshotter
    /// resumes (clears the pause and bumps the generation). Returns when the vCPU
    /// should re-enter the guest.
    pub fn park(&self, vcpu_index: usize, state: S) {
        let mut g = self.inner.lock().unwrap_or_else(|e| e.into_inner());
        g.states.push((vcpu_index, state));
        g.parked += 1;
        let my_gen = g.gen;
        // Wake the snapshotter waiting in `wait_all_parked`.
        self.cv.notify_all();
        while self.pause.load(std::sync::atomic::Ordering::SeqCst) && g.gen == my_gen {
            g = self.cv.wait(g).unwrap_or_else(|e| e.into_inner());
        }
    }

    /// Snapshotter: block until all `ncpus` vCPUs have parked (guest fully
    /// frozen), then take their captured states sorted by vCPU index. Must be
    /// followed by [`resume`](Self::resume) to release the vCPUs.
    pub fn wait_all_parked(&self, ncpus: usize) -> Vec<S> {
        let mut g = self.inner.lock().unwrap_or_else(|e| e.into_inner());
        while g.parked < ncpus {
            g = self.cv.wait(g).unwrap_or_else(|e| e.into_inner());
        }
        let mut s = std::mem::take(&mut g.states);
        s.sort_by_key(|(i, _)| *i);
        s.into_iter().map(|(_, st)| st).collect()
    }

    /// Snapshotter: release the parked vCPUs — clear the pause, bump the
    /// generation, and wake everyone. After this the vCPUs re-enter the guest.
    ///
    /// Also clears any leftover deposited state. For the symmetric path
    /// ([`wait_all_parked`](Self::wait_all_parked)) the states were already
    /// taken, so this is a no-op; for the best-effort timeout path
    /// ([`wait_parked_until`](Self::wait_parked_until)) a straggler can deposit
    /// between the snapshotter's take and this resume, and clearing it here keeps
    /// it out of the NEXT snapshot's capture.
    pub fn resume(&self) {
        let mut g = self.inner.lock().unwrap_or_else(|e| e.into_inner());
        self.pause.store(false, std::sync::atomic::Ordering::SeqCst);
        g.parked = 0;
        g.states.clear();
        g.gen = g.gen.wrapping_add(1);
        self.cv.notify_all();
    }

    /// vCPU variant of [`park`](Self::park) for backends whose snapshotter waits
    /// with a TIMEOUT (HVF: a 2 s best-effort rendezvous). Two differences from
    /// `park`:
    ///   * the wait also ends if `cancel` is set (e.g. VM shutdown), so a vCPU
    ///     is never stuck parked while the VM is tearing down;
    ///   * if the pause is ALREADY over when this vCPU acquires the lock (a
    ///     "late" arrival after the snapshotter timed out + resumed), it deposits
    ///     NOTHING and returns — a straggler can't contaminate the next
    ///     snapshot's captured state, and can't increment a fresh generation's
    ///     parked count.
    pub fn park_cancelable(
        &self,
        vcpu_index: usize,
        state: S,
        cancel: &std::sync::atomic::AtomicBool,
    ) {
        let mut g = self.inner.lock().unwrap_or_else(|e| e.into_inner());
        if !self.pause.load(std::sync::atomic::Ordering::SeqCst) {
            // Late arrival: the pause window is closed; nothing to join.
            return;
        }
        g.states.push((vcpu_index, state));
        g.parked += 1;
        let my_gen = g.gen;
        self.cv.notify_all();
        // Poll with a bounded wait so a `cancel` set WITHOUT a cv notification
        // (the shutdown path need not touch this barrier) is still observed
        // within the interval; `resume`'s notify still wakes us immediately on
        // the normal path.
        while self.pause.load(std::sync::atomic::Ordering::SeqCst)
            && g.gen == my_gen
            && !cancel.load(std::sync::atomic::Ordering::SeqCst)
        {
            let (ng, _to) = self
                .cv
                .wait_timeout(g, std::time::Duration::from_millis(50))
                .unwrap_or_else(|e| e.into_inner());
            g = ng;
        }
    }

    /// Snapshotter variant of [`wait_all_parked`](Self::wait_all_parked) for
    /// backends that snapshot BEST-EFFORT under a deadline (HVF). Waits until
    /// `target` vCPUs have parked, OR `deadline` passes, OR `cancel` is set, then
    /// takes the deposited `(vcpu_index, state)` pairs sorted by index. A short
    /// count means some vCPUs didn't reach the rendezvous in time; the caller
    /// places each by index and defaults the missing ones (the best-effort
    /// contract HVF's coordinator has — so the INDEX must be preserved, unlike
    /// the symmetric KVM path where every vCPU always parks). Must be followed by
    /// [`resume`](Self::resume).
    pub fn wait_parked_until(
        &self,
        target: usize,
        deadline: std::time::Instant,
        cancel: &std::sync::atomic::AtomicBool,
    ) -> Vec<(usize, S)> {
        let mut g = self.inner.lock().unwrap_or_else(|e| e.into_inner());
        while g.parked < target && !cancel.load(std::sync::atomic::Ordering::SeqCst) {
            let now = std::time::Instant::now();
            if now >= deadline {
                break;
            }
            // Cap each wait so `cancel` (shutdown) is observed within the poll
            // interval even when no vCPU parks to trigger a notification; the
            // last `park` / `resume` still wakes us immediately on the normal
            // path. Without the cap a `wait_timeout(deadline - now)` would block
            // the whole deadline before re-checking `cancel`.
            let wait = (deadline - now).min(std::time::Duration::from_millis(50));
            let (ng, _to) = self
                .cv
                .wait_timeout(g, wait)
                .unwrap_or_else(|e| e.into_inner());
            g = ng;
        }
        let mut s = std::mem::take(&mut g.states);
        s.sort_by_key(|(i, _)| *i);
        s
    }
}

#[cfg(test)]
mod tests {
    // Unit tests for `VcpuOutcome` and for both `PauseBarrier` rendezvous
    // flavours: the symmetric `park` / `wait_all_parked` path and the
    // best-effort `park_cancelable` / `wait_parked_until` path.
    use super::*;

    #[test]
    fn outcome_is_copy_and_eq() {
        // The loop returns these by value on the hot path; cheap Copy + Eq.
        let a = VcpuOutcome::Continue;
        let b = a;
        assert_eq!(a, b);
        assert_ne!(VcpuOutcome::Stop, VcpuOutcome::SystemOff);
        assert_ne!(VcpuOutcome::Canceled, VcpuOutcome::Continue);
    }

    /// Drive the full pause/capture/resume rendezvous across many generations
    /// with several vCPU threads: every generation must freeze all vCPUs, hand
    /// back exactly their states (one per index), and resume them — and the
    /// whole thing must terminate (no lost wakeup / no missed generation).
    #[test]
    fn pause_barrier_round_trips_many_generations() {
        use std::sync::atomic::{AtomicBool, Ordering};
        use std::sync::Arc;

        const NCPUS: usize = 4;
        const GENS: u64 = 50;

        let barrier: Arc<PauseBarrier<usize>> = Arc::new(PauseBarrier::new());
        // Set once the snapshotter has finished all generations, so vCPU threads
        // that are spinning-to-park can exit instead of waiting forever.
        let done = Arc::new(AtomicBool::new(false));

        let mut handles = Vec::new();
        for idx in 0..NCPUS {
            let b = Arc::clone(&barrier);
            let done = Arc::clone(&done);
            handles.push(std::thread::spawn(move || {
                // Mirror the production vCPU loop: spin until a pause is
                // requested, then park (depositing this vCPU's "state" = its
                // index); repeat until the snapshotter is done.
                loop {
                    while !b.is_paused() {
                        if done.load(Ordering::SeqCst) {
                            return;
                        }
                        std::thread::yield_now();
                    }
                    b.park(idx, idx);
                }
            }));
        }

        // Snapshotter side: one request / wait / resume cycle per generation.
        for gen in 0..GENS {
            barrier.request_pause();
            let states = barrier.wait_all_parked(NCPUS);
            // Each thread deposits its own index as its state, so the sorted
            // result must be exactly 0..NCPUS.
            assert_eq!(
                states,
                (0..NCPUS).collect::<Vec<_>>(),
                "generation {gen}: every vCPU parked exactly once, sorted by index"
            );
            barrier.resume();
        }
        done.store(true, Ordering::SeqCst);
        // A vCPU could be mid-`park` for a generation the snapshotter already
        // finished; one more resume lets it fall through to see `done`.
        barrier.resume();
        for h in handles {
            h.join().expect("vcpu thread");
        }
    }

    /// The HVF (best-effort) variants: `wait_parked_until` must return a partial
    /// capture under its deadline without hanging; a straggler that parks after
    /// the cycle ended must NOT contaminate the next cycle; and a parked vCPU
    /// must be released by `cancel` (no resume).
    #[test]
    fn park_cancelable_timeout_late_park_and_cancel() {
        use std::sync::atomic::{AtomicBool, Ordering};
        use std::sync::Arc;
        use std::time::{Duration, Instant};

        let barrier: Arc<PauseBarrier<usize>> = Arc::new(PauseBarrier::new());
        let cancel = Arc::new(AtomicBool::new(false));

        // 1) TIMEOUT: ask for 2 but only 1 parks → partial set, no hang.
        barrier.request_pause();
        let one_parked = {
            let b = Arc::clone(&barrier);
            let c = Arc::clone(&cancel);
            std::thread::spawn(move || b.park_cancelable(0, 0, &c))
        };
        let states =
            barrier.wait_parked_until(2, Instant::now() + Duration::from_millis(300), &cancel);
        // `<= 1` rather than `== 1`: the spawned thread is not guaranteed to
        // have reached `park_cancelable` before the 300 ms deadline.
        assert!(
            states.len() <= 1,
            "partial under timeout, got {}",
            states.len()
        );
        // If the thread has not parked yet, it will find the pause already
        // cleared and return as a late arrival, so the join cannot hang.
        barrier.resume(); // releases the parked vCPU
        one_parked.join().unwrap();

        // 2) LATE PARK: after the cycle ended (pause cleared), a straggler must
        //    deposit nothing and the next cycle must not see its value.
        barrier.park_cancelable(1, 999, &cancel); // returns immediately, no deposit
        barrier.request_pause();
        // Nothing parks in this cycle, so this wait runs until its 60 ms deadline.
        let s2 = barrier.wait_parked_until(1, Instant::now() + Duration::from_millis(60), &cancel);
        assert!(
            s2.is_empty(),
            "straggler 999 leaked into next cycle: {s2:?}"
        );
        barrier.resume();

        // 3) CANCEL: a parked vCPU is released by the cancel flag alone.
        let cancel2 = Arc::new(AtomicBool::new(false));
        barrier.request_pause();
        let parked = {
            let b = Arc::clone(&barrier);
            let c = Arc::clone(&cancel2);
            std::thread::spawn(move || b.park_cancelable(0, 0, &c))
        };
        // Ensure it has parked, then cancel WITHOUT resuming.
        // (Should the thread miss the 500 ms window it still terminates: it
        // parks afterwards and sees `cancel2` on its first loop check.)
        let _ = barrier.wait_parked_until(1, Instant::now() + Duration::from_millis(500), &cancel2);
        cancel2.store(true, Ordering::SeqCst);
        parked.join().expect("cancel must release the parked vCPU");
        barrier.resume(); // leave the barrier clean
    }
}