supermachine 0.4.27

Run any OCI/Docker image as a hardware-isolated microVM on macOS via Hypervisor.framework (HVF); Linux KVM and Windows WHP backends are in progress. Single library API, zero flags for the common case, sub-100 ms cold restore from a snapshot.
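
A minimal sketch of the intended call shape (hypothetical names throughout; this is not the published API, just the zero-flags flow described above):

    // Hypothetical usage, illustrative only.
    let vm = supermachine::Machine::launch("docker.io/library/alpine:latest")?;
    let snap = vm.snapshot()?;      // capture RAM + per-vCPU state
    let resumed = snap.restore()?;  // cold restore, targeting <100 ms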
// Multi-vCPU coordinator. Each vCPU runs in its own pthread, owns
// its own hv_vcpu_t (HVF requires creation + register access from
// the SAME thread). vCPU 0 is the boot CPU and runs on the main
// thread; secondaries are parked at thread start, woken by PSCI
// CPU_ON from vCPU 0.
//
// PSCI calls arrive via HVC (the FDT already advertises
// method="hvc"). Subset implemented:
//   PSCI_VERSION         (0x84000000)  -> 0x10000  (PSCI 1.0)
//   PSCI_CPU_OFF         (0x84000002)  -> spin (no-op for now)
//   PSCI_CPU_ON          (0xC4000003)  -> wake target thread
//   PSCI_AFFINITY_INFO   (0xC4000004)  -> 0=on / 1=off (ON_PENDING not reported)
//   PSCI_FEATURES        (0x8400000A)  -> 0 for known IDs, NOT_SUPPORTED else
//   PSCI_SYSTEM_OFF      (0x84000008)  -> shutdown
//   PSCI_SYSTEM_RESET    (0x84000009)  -> shutdown

#![cfg(all(target_os = "macos", target_arch = "aarch64"))]

use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering};
use std::sync::{Arc, Condvar, Mutex};

use crate::vmm::snapshot::PerVcpuState;

pub const PSCI_VERSION: u32 = 0x84000000;
pub const PSCI_CPU_OFF: u32 = 0x84000002;
pub const PSCI_CPU_ON: u32 = 0xC4000003;
pub const PSCI_AFFINITY_INFO: u32 = 0xC4000004;
pub const PSCI_FEATURES: u32 = 0x8400000A;
pub const PSCI_SYSTEM_OFF: u32 = 0x84000008;
pub const PSCI_SYSTEM_RESET: u32 = 0x84000009;

pub const PSCI_SUCCESS: i64 = 0;
pub const PSCI_NOT_SUPPORTED: i64 = -1;
pub const PSCI_INVALID_PARAMS: i64 = -2;
pub const PSCI_ALREADY_ON: i64 = -4;
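
// Illustrative only (not on the dispatch path): how an HVC exit's
// X0 function ID maps onto the subset above. `x1`/`x2`/`x3` stand in
// for values read from the trapping vCPU's registers; the real
// decode lives in the vCPU run loop.
#[allow(dead_code)]
fn psci_dispatch_sketch(coord: &VcpuCoordinator, func: u32, x1: u64, x2: u64, x3: u64) -> i64 {
    match func {
        PSCI_VERSION => 0x10000, // PSCI 1.0
        PSCI_CPU_ON => coord.cpu_on(x1 as u32, x2, x3), // target index (from MPIDR), entry, ctx
        PSCI_AFFINITY_INFO => coord.affinity_info(x1 as u32),
        // FEATURES: X1 carries the queried function ID.
        PSCI_FEATURES => match x1 as u32 {
            PSCI_VERSION | PSCI_CPU_OFF | PSCI_CPU_ON | PSCI_AFFINITY_INFO
            | PSCI_SYSTEM_OFF | PSCI_SYSTEM_RESET => PSCI_SUCCESS,
            _ => PSCI_NOT_SUPPORTED,
        },
        // CPU_OFF spins; SYSTEM_OFF/RESET trigger shutdown in the
        // run loop, outside the coordinator.
        PSCI_CPU_OFF | PSCI_SYSTEM_OFF | PSCI_SYSTEM_RESET => PSCI_SUCCESS,
        _ => PSCI_NOT_SUPPORTED,
    }
}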

#[derive(Clone)]
pub enum VcpuStart {
    /// Secondary thread waits.
    Parked,
    /// CPU_ON requested: enter at `entry` with X0=`ctx_id`.
    Run { entry: u64, ctx_id: u64 },
}

pub struct VcpuSlot {
    pub state: Mutex<VcpuStart>,
    pub cv: Condvar,
    /// True once vCPU thread has begun running guest code (any
    /// CPU_ON for an already-on vCPU returns ALREADY_ON).
    pub on: AtomicBool,
}

pub struct VcpuCoordinator {
    pub n_vcpus: u32,
    pub slots: Vec<VcpuSlot>,
    pub shutdown: AtomicBool,
    /// Multi-vCPU snapshot pause/resume rendezvous. When the
    /// snapshot trigger fires (on vcpu0's thread), we set
    /// `snapshot_request`, force-exit each secondary's
    /// `hv_vcpu_run` via `hv_vcpus_exit`, wait for each
    /// secondary thread to land in the pause point and deposit
    /// its `PerVcpuState` into `captured`. Then vcpu0 captures
    /// its own state, assembles the full Snapshot, writes it,
    /// and unblocks secondaries by toggling `resume`. Secondaries
    /// pop their captured slot back to None on resume so the
    /// next snapshot starts fresh.
    pub snapshot_request: AtomicBool,
    pub captured: Mutex<Vec<Option<PerVcpuState>>>,
    pub captured_count: AtomicU32,
    pub resume_lock: Mutex<u64>, // monotonic generation; secondaries wait while gen unchanged
    pub resume_cv: Condvar,
    /// Secondary `hv_vcpu_t` handles, registered by each
    /// secondary thread after it creates its vCPU (HVF requires
    /// the same thread that created the vCPU to perform reg
    /// access; the handle itself is just a u64 we can pass to
    /// `hv_vcpus_exit` to force a run exit from any thread).
    pub secondary_handles: Mutex<Vec<applevisor_sys::hv_vcpu_t>>,
    /// Multi-vCPU cycle-restore rendezvous. Counterpart to the
    /// snapshot pause primitives above. When the runner takes a
    /// pool restore request and needs to roll secondaries back to
    /// the snapshot's per-vCPU state, it populates `restore_states`
    /// with each secondary's target state, sets `restore_request`,
    /// and `hv_vcpus_exit`s them. Each secondary thread, on its own
    /// HVF-owning thread (HVF requires register writes to come from
    /// the thread that created the vCPU), pulls its target from
    /// `restore_states[idx]`, calls `restore_vcpu_state`, increments
    /// `restored_count`, and waits on `resume_lock`'s generation
    /// (shared with the snapshot path — only one of the two can be
    /// active at a time). Same shape as `request_snapshot_pause`,
    /// just inverted: secondaries WRITE state instead of READING it.
    pub restore_request: AtomicBool,
    pub restore_states: Mutex<Vec<Option<PerVcpuState>>>,
    pub restored_count: AtomicU32,
    /// CNTVOFF_EL2 value to apply to every secondary vCPU on its
    /// restore path. Set by the boot vCPU's restore_snapshot just
    /// after it computes the new offset; read by each secondary's
    /// `run_secondary_inner` after it's loaded its captured
    /// per-vCPU state. Same value on all vCPUs so the guest's
    /// CNTVCT_EL0 reads are consistent across cores — without
    /// this, only vcpu0 had its offset re-applied on restore,
    /// secondaries saw the host's raw CNTPCT, and Linux thread
    /// migration between vCPUs caused CLOCK_MONOTONIC to skew
    /// (libuv asserts in Node ≥24, hangs in earlier versions).
    /// 0 means "no value set yet" (the secondary just keeps its
    /// HVF default for that boot, fine on cold boot, but should
    /// only happen pre-snapshot-restore).
    pub vtimer_offset: AtomicU64,
}

impl VcpuCoordinator {
    pub fn new(n_vcpus: u32) -> Arc<Self> {
        let slots = (0..n_vcpus)
            .map(|_| VcpuSlot {
                state: Mutex::new(VcpuStart::Parked),
                cv: Condvar::new(),
                on: AtomicBool::new(false),
            })
            .collect();
        Arc::new(Self {
            n_vcpus,
            slots,
            shutdown: AtomicBool::new(false),
            snapshot_request: AtomicBool::new(false),
            captured: Mutex::new((0..n_vcpus).map(|_| None).collect()),
            captured_count: AtomicU32::new(0),
            resume_lock: Mutex::new(0),
            resume_cv: Condvar::new(),
            secondary_handles: Mutex::new(Vec::new()),
            restore_request: AtomicBool::new(false),
            restore_states: Mutex::new((0..n_vcpus).map(|_| None).collect()),
            restored_count: AtomicU32::new(0),
            vtimer_offset: AtomicU64::new(0),
        })
    }

    /// Called by a secondary vCPU thread once it has created its
    /// `hv_vcpu_t`. Stored so the snapshot trigger thread can
    /// `hv_vcpus_exit` all secondaries in one batch.
    pub fn register_secondary(&self, handle: applevisor_sys::hv_vcpu_t) {
        self.secondary_handles.lock().unwrap().push(handle);
    }

    pub fn secondary_handles_snapshot(&self) -> Vec<applevisor_sys::hv_vcpu_t> {
        self.secondary_handles.lock().unwrap().clone()
    }

    /// Called by secondary dispatch loops between `hv_vcpu_run`
    /// iterations. If a snapshot is in progress, the secondary
    /// captures its own register state, deposits it into
    /// `captured[idx]`, bumps `captured_count`, and waits for
    /// `resume_lock`'s generation to advance. Returns when the
    /// secondary is free to continue running guest code.
    pub fn maybe_pause_for_snapshot(
        &self,
        idx: u32,
        vcpu: &crate::hvf::Vcpu,
    ) -> crate::hvf::Result<()> {
        if !self.snapshot_request.load(Ordering::Acquire) {
            return Ok(());
        }
        let saved_gen = *self.resume_lock.lock().unwrap();
        // Capture state on the OWNING thread (HVF requires it).
        let st = crate::vmm::snapshot::capture_vcpu_state(vcpu)?;
        if std::env::var_os("SUPERMACHINE_TIMINGS").is_some() {
            let pc = st
                .gp_regs
                .iter()
                .find(|(id, _)| *id == applevisor_sys::hv_reg_t::PC as u32)
                .map(|(_, v)| *v)
                .unwrap_or(0);
            eprintln!("  [vcpu-{idx}] snapshot pause: PC=0x{pc:x}");
        }
        self.captured.lock().unwrap()[idx as usize] = Some(st);
        self.captured_count.fetch_add(1, Ordering::AcqRel);
        // Block until snapshot thread bumps the generation.
        let mut g = self.resume_lock.lock().unwrap();
        while *g == saved_gen && !self.shutdown.load(Ordering::Acquire) {
            g = self.resume_cv.wait(g).unwrap();
        }
        Ok(())
    }
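
    // Call-site shape, as an illustrative comment (the real secondary
    // dispatch loop lives elsewhere; `run` / `handle_exit` are
    // stand-in names):
    //
    //     loop {
    //         vcpu.run()?;                                 // hv_vcpu_run
    //         handle_exit(&vcpu)?;                         // MMIO / HVC / ...
    //         coord.maybe_pause_for_snapshot(idx, &vcpu)?;
    //         coord.maybe_apply_restore(idx, &vcpu)?;
    //     }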

    /// Called by the snapshot trigger thread (vcpu0). Returns
    /// after every *running* secondary has deposited its state.
    /// Secondaries still parked in `wait_for_run` (e.g. when the
    /// snapshot trigger fires pre-SMP-bringup, as in volume mode,
    /// where the bake takes its snapshot at the heartbeat marker
    /// before the kernel has issued CPU_ON) are skipped — their
    /// PerVcpuState defaults to a no-op restore that hits the
    /// PSCI park path again on restore.
    pub fn request_snapshot_pause(
        &self,
        secondary_handles: &[applevisor_sys::hv_vcpu_t],
    ) {
        // Reset captured state from any prior snapshot.
        {
            let mut g = self.captured.lock().unwrap();
            for s in g.iter_mut() {
                *s = None;
            }
        }
        self.captured_count.store(0, Ordering::SeqCst);
        self.snapshot_request.store(true, Ordering::Release);
        if !secondary_handles.is_empty() {
            unsafe {
                let _ = applevisor_sys::hv_vcpus_exit(
                    secondary_handles.as_ptr(),
                    secondary_handles.len() as u32,
                );
            }
        }
        // Count only secondaries that are currently running guest
        // code; parked ones won't reach the rendezvous.
        let target: u32 = self
            .slots
            .iter()
            .skip(1)
            .filter(|s| s.on.load(Ordering::Acquire))
            .count() as u32;
        let deadline = std::time::Instant::now() + std::time::Duration::from_secs(2);
        while self.captured_count.load(Ordering::Acquire) < target {
            if self.shutdown.load(Ordering::Acquire) {
                return;
            }
            if std::time::Instant::now() > deadline {
                eprintln!(
                    "  [coord] snapshot-pause timeout: {}/{} secondaries deposited",
                    self.captured_count.load(Ordering::Acquire),
                    target,
                );
                break;
            }
            std::hint::spin_loop();
        }
    }

    /// Release secondaries so they resume guest execution after
    /// snapshot capture. Pair with `request_snapshot_pause`.
    pub fn release_after_snapshot(&self) {
        self.snapshot_request.store(false, Ordering::Release);
        let mut g = self.resume_lock.lock().unwrap();
        *g = g.wrapping_add(1);
        self.resume_cv.notify_all();
    }
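
    /// Trigger-side ordering, as an illustrative sketch only (the
    /// real trigger also remaps RAM and writes the snapshot to disk
    /// between the take and the release). `vcpu0_state` stands in
    /// for the state vcpu0 captures of itself once the secondaries
    /// have parked.
    #[allow(dead_code)]
    fn snapshot_trigger_sketch(&self, vcpu0_state: PerVcpuState) -> Vec<PerVcpuState> {
        let handles = self.secondary_handles_snapshot();
        self.request_snapshot_pause(&handles); // park running secondaries
        let mut all = vec![vcpu0_state]; // boot vCPU's state first
        all.extend(self.take_secondary_states()); // idx 1..n_vcpus
        self.release_after_snapshot(); // bump generation, wake secondaries
        all
    }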

    /// Multi-vCPU cycle-restore: secondary-side rendezvous.
    /// Counterpart to `maybe_pause_for_snapshot`. Two-phase:
    ///
    ///   1. The runner sets `restore_request` + `hv_vcpus_exit`s
    ///      secondaries BEFORE doing RAM remap / GIC blob /
    ///      vcpu0 restore. Each secondary lands here and spins
    ///      until its `restore_states[idx]` becomes `Some` —
    ///      that's the runner's signal "GIC + vcpu0 are done,
    ///      apply your state now". (HVF requires register writes
    ///      to come from the OWNING thread, so secondaries apply
    ///      their own state.)
    ///
    ///   2. After applying, the secondary increments
    ///      `restored_count` and waits on `resume_lock`'s
    ///      generation. The runner spins until
    ///      `restored_count == n_running_secondaries`, then
    ///      `release_after_restore` bumps the gen and secondaries
    ///      re-enter `hv_vcpu_run` with the snapshot's coherent
    ///      state.
    ///
    /// Without the two-phase split the runner would race against
    /// secondaries' state application — they'd write ICH regs
    /// referencing distributor INTIDs the GIC blob restore
    /// hadn't published yet, and the next IRQ delivery on the
    /// restored guest would land in undefined territory.
    pub fn maybe_apply_restore(
        &self,
        idx: u32,
        vcpu: &crate::hvf::Vcpu,
    ) -> crate::hvf::Result<()> {
        if !self.restore_request.load(Ordering::Acquire) {
            return Ok(());
        }
        let saved_gen = *self.resume_lock.lock().unwrap();
        // Phase 1: spin-wait until our target state is published.
        // The runner does GIC + vcpu0 restore first, then pops
        // states into `restore_states`. Bound the wait on shutdown
        // so a teardown mid-restore doesn't deadlock the secondary.
        let deadline = std::time::Instant::now() + std::time::Duration::from_secs(5);
        loop {
            let published = {
                let g = self.restore_states.lock().unwrap();
                g[idx as usize].clone()
            };
            if let Some(st) = published {
                crate::vmm::snapshot::restore_vcpu_state(vcpu, &st)?;
                // Re-align this secondary's CNTVOFF_EL2 with the
                // boot vCPU's just-applied offset. The boot vCPU
                // runs `restore_snapshot_timed_with_options`
                // BEFORE publishing into `restore_states`, so the
                // value in `self.vtimer_offset` is already fresh
                // when we land here. See worker.rs's analogous
                // call in run_secondary_inner's cold-restore path
                // for the full rationale (Node libuv asserts on
                // CLOCK_MONOTONIC skew when secondaries don't
                // share vcpu0's CNTVOFF after restore).
                let off = self.vtimer_offset.load(Ordering::Acquire);
                if off != 0 {
                    vcpu.set_vtimer_offset(off)?;
                }
                break;
            }
            if self.shutdown.load(Ordering::Acquire) {
                return Ok(());
            }
            if std::time::Instant::now() > deadline {
                eprintln!(
                    "  [vcpu-{idx}] cycle-restore phase-1 timeout (state not published); \
                     proceeding without restore (will likely panic on resume)"
                );
                break;
            }
            std::hint::spin_loop();
        }
        // Phase 2: deposit completion + wait for resume.
        self.restored_count.fetch_add(1, Ordering::AcqRel);
        let mut g = self.resume_lock.lock().unwrap();
        while *g == saved_gen && !self.shutdown.load(Ordering::Acquire) {
            g = self.resume_cv.wait(g).unwrap();
        }
        Ok(())
    }

    /// Multi-vCPU cycle-restore: phase-1 driver call. Forces
    /// secondaries out of `hv_vcpu_run` and parks them in
    /// `maybe_apply_restore`, where they spin-wait for the
    /// runner to publish per-vcpu target states. Returns
    /// immediately after issuing `hv_vcpus_exit` — does NOT wait
    /// for secondaries to land at the rendezvous (they'll spin
    /// when they get there; runner can do GIC + vcpu0 restore in
    /// parallel).
    pub fn pause_secondaries_for_restore(
        &self,
        secondary_handles: &[applevisor_sys::hv_vcpu_t],
    ) {
        let _t0 = std::time::Instant::now();
        // Reset state slots — a prior restore may have left them
        // populated with stale snapshots.
        {
            let mut g = self.restore_states.lock().unwrap();
            for s in g.iter_mut() {
                *s = None;
            }
        }
        self.restored_count.store(0, Ordering::SeqCst);
        self.restore_request.store(true, Ordering::Release);
        if !secondary_handles.is_empty() {
            unsafe {
                let _ = applevisor_sys::hv_vcpus_exit(
                    secondary_handles.as_ptr(),
                    secondary_handles.len() as u32,
                );
            }
        }
        if std::env::var_os("SUPERMACHINE_TIMINGS").is_some() {
            eprintln!(
                "  [coord-restore] pause: {} us (n_secondaries={})",
                _t0.elapsed().as_micros(),
                secondary_handles.len(),
            );
        }
    }

    /// Multi-vCPU cycle-restore: phase-2 driver call. Publishes
    /// each secondary's target `PerVcpuState` into
    /// `restore_states`, then spins until every running secondary
    /// has applied its state and parked at the resume rendezvous.
    /// Caller MUST call `release_after_restore` after this returns
    /// to bump the resume generation.
    ///
    /// `secondary_states[i]` corresponds to vcpu_index `i + 1`.
    pub fn publish_and_wait_secondary_restore(
        &self,
        secondary_states: &[Option<PerVcpuState>],
    ) {
        let _t0 = std::time::Instant::now();
        // Publish target states atomically in one lock acquisition.
        {
            let mut g = self.restore_states.lock().unwrap();
            for (i, st) in secondary_states.iter().enumerate() {
                let idx = i + 1; // secondaries start at vcpu 1
                if idx < g.len() {
                    g[idx] = st.clone();
                }
            }
        }
        let _t_publish = _t0.elapsed().as_micros();
        // Wait for completion. Only secondaries with slot.on=true
        // are running and will reach `maybe_apply_restore`; parked-
        // pre-SMP-bringup ones are skipped (same logic as
        // `request_snapshot_pause`).
        let target: u32 = self
            .slots
            .iter()
            .skip(1)
            .filter(|s| s.on.load(Ordering::Acquire))
            .count() as u32;
        let deadline = std::time::Instant::now() + std::time::Duration::from_secs(2);
        while self.restored_count.load(Ordering::Acquire) < target {
            if self.shutdown.load(Ordering::Acquire) {
                return;
            }
            if std::time::Instant::now() > deadline {
                eprintln!(
                    "  [coord] cycle-restore phase-2 timeout: {}/{} secondaries applied",
                    self.restored_count.load(Ordering::Acquire),
                    target,
                );
                break;
            }
            std::hint::spin_loop();
        }
        if std::env::var_os("SUPERMACHINE_TIMINGS").is_some() {
            eprintln!(
                "  [coord-restore] publish+wait: total {} us (publish {} us; spin {} us, target={target})",
                _t0.elapsed().as_micros(),
                _t_publish,
                _t0.elapsed().as_micros().saturating_sub(_t_publish),
            );
        }
    }

    /// Release secondaries after a cycle-restore — they re-enter
    /// `hv_vcpu_run` with their newly-applied snapshot state.
    pub fn release_after_restore(&self) {
        self.restore_request.store(false, Ordering::Release);
        let mut g = self.resume_lock.lock().unwrap();
        *g = g.wrapping_add(1);
        self.resume_cv.notify_all();
    }
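
    /// Driver-side ordering for a cycle-restore, as an illustrative
    /// sketch only. RAM remap, GIC blob restore, vcpu0 restore, and
    /// the `vtimer_offset` store happen on the runner's thread where
    /// the comment marks them.
    #[allow(dead_code)]
    fn cycle_restore_sketch(&self, secondary_states: &[Option<PerVcpuState>]) {
        let handles = self.secondary_handles_snapshot();
        self.pause_secondaries_for_restore(&handles); // phase 1: returns at once
        // ... RAM remap + GIC blob + vcpu0 restore happen here, then:
        // self.vtimer_offset.store(new_offset, Ordering::Release);
        self.publish_and_wait_secondary_restore(secondary_states); // phase 2
        self.release_after_restore(); // bump generation; secondaries re-enter hv_vcpu_run
    }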

    /// Pop secondary captured states (idx 1..n_vcpus). Called by
    /// the snapshot trigger AFTER `request_snapshot_pause`
    /// returned and AFTER capturing vcpu0's own state. Returns a
    /// vector of length `n_vcpus - 1` (so caller can prepend
    /// vcpu0's state).
    pub fn take_secondary_states(&self) -> Vec<PerVcpuState> {
        let mut g = self.captured.lock().unwrap();
        let mut out = Vec::with_capacity(g.len().saturating_sub(1));
        for st in g.iter_mut().skip(1) {
            // Replace with default (empty) so a missing entry
            // doesn't crash the saver. In practice every entry
            // should be Some here.
            out.push(st.take().unwrap_or_default());
        }
        out
    }

    /// PSCI CPU_ON: signal target vCPU's thread to start. Returns the
    /// PSCI return code.
    pub fn cpu_on(&self, target: u32, entry: u64, ctx_id: u64) -> i64 {
        let Some(slot) = self.slots.get(target as usize) else {
            return PSCI_INVALID_PARAMS;
        };
        if slot.on.load(Ordering::SeqCst) {
            return PSCI_ALREADY_ON;
        }
        let mut s = slot.state.lock().unwrap();
        *s = VcpuStart::Run { entry, ctx_id };
        slot.cv.notify_one();
        PSCI_SUCCESS
    }

    /// PSCI AFFINITY_INFO. Note: ON_PENDING (2) is never reported;
    /// between `cpu_on` and the target thread flipping `slot.on`,
    /// a just-started target still reads as OFF.
    pub fn affinity_info(&self, target: u32) -> i64 {
        match self.slots.get(target as usize) {
            Some(slot) if slot.on.load(Ordering::SeqCst) => 0, // ON
            Some(_) => 1,                                      // OFF
            None => PSCI_INVALID_PARAMS,
        }
    }

    /// Wait (blocking) until our slot is told to Run. Used by
    /// secondary vCPU threads on startup.
    pub fn wait_for_run(&self, idx: u32) -> Option<(u64, u64)> {
        let slot = &self.slots[idx as usize];
        let mut s = slot.state.lock().unwrap();
        loop {
            if self.shutdown.load(Ordering::SeqCst) {
                return None;
            }
            if let VcpuStart::Run { entry, ctx_id } = *s {
                slot.on.store(true, Ordering::SeqCst);
                return Some((entry, ctx_id));
            }
            s = slot.cv.wait(s).unwrap();
        }
    }
}
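
// A minimal sanity sketch of the HVF-independent synchronization
// contract (park -> CPU_ON -> wake -> ALREADY_ON). Illustrative;
// nothing here touches hv_vcpu_t.
#[cfg(test)]
mod coordinator_sketch_tests {
    use super::*;

    #[test]
    fn cpu_on_wakes_parked_secondary_exactly_once() {
        let coord = VcpuCoordinator::new(2);
        assert_eq!(coord.affinity_info(1), 1); // parked => OFF
        assert_eq!(coord.affinity_info(9), PSCI_INVALID_PARAMS);

        let c = coord.clone();
        let waiter = std::thread::spawn(move || c.wait_for_run(1));

        assert_eq!(coord.cpu_on(1, 0x4008_0000, 7), PSCI_SUCCESS);
        assert_eq!(waiter.join().unwrap(), Some((0x4008_0000, 7)));
        assert_eq!(coord.affinity_info(1), 0); // ON
        assert_eq!(coord.cpu_on(1, 0, 0), PSCI_ALREADY_ON);
    }
}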