supermachine 0.3.3

Run any OCI/Docker image as a hardware-isolated microVM on macOS via the Hypervisor framework (HVF); Linux KVM and Windows WHP support is in progress. Single library API, zero flags for the common case, sub-100 ms cold restore from snapshot.
// Multi-vCPU coordinator. Each vCPU runs in its own pthread, owns
// its own hv_vcpu_t (HVF requires creation + register access from
// the SAME thread). vCPU 0 is the boot CPU and runs on the main
// thread; secondaries are parked at thread start, woken by PSCI
// CPU_ON from vCPU 0.
//
// PSCI calls arrive via HVC (the FDT already advertises
// method="hvc"). Implemented subset:
//   PSCI_VERSION         (0x84000000)  -> 0x10000  (PSCI 1.0)
//   PSCI_CPU_OFF         (0x84000002)  -> spin (no-op for now)
//   PSCI_CPU_ON          (0xC4000003)  -> wake target thread
//   PSCI_AFFINITY_INFO   (0xC4000004)  -> 0=on / 1=off / 2=pending
//   PSCI_FEATURES        (0x8400000A)  -> 0 for known, NOT_SUPPORTED else
//   PSCI_SYSTEM_OFF      (0x84000008)  -> shutdown
//   PSCI_SYSTEM_RESET    (0x84000009)  -> shutdown

#![cfg(all(target_os = "macos", target_arch = "aarch64"))]

use std::sync::atomic::{AtomicBool, AtomicU32, Ordering};
use std::sync::{Arc, Condvar, Mutex};

use crate::vmm::snapshot::PerVcpuState;

/// PSCI function IDs (per the Arm PSCI spec's SMC calling convention:
/// `0x84xx_xxxx` = SMC32 calls, `0xC4xx_xxxx` = SMC64 calls).
pub const PSCI_VERSION: u32 = 0x84000000;
/// Power down the calling CPU.
pub const PSCI_CPU_OFF: u32 = 0x84000002;
/// Power up a target CPU at a given entry point (SMC64).
pub const PSCI_CPU_ON: u32 = 0xC4000003;
/// Query a target CPU's power state (SMC64).
pub const PSCI_AFFINITY_INFO: u32 = 0xC4000004;
/// Probe whether a given PSCI function is implemented.
pub const PSCI_FEATURES: u32 = 0x8400000A;
/// Whole-system power off.
pub const PSCI_SYSTEM_OFF: u32 = 0x84000008;
/// Whole-system reset.
pub const PSCI_SYSTEM_RESET: u32 = 0x84000009;

/// PSCI return codes (signed, returned to the guest in X0).
pub const PSCI_SUCCESS: i64 = 0;
pub const PSCI_NOT_SUPPORTED: i64 = -1;
pub const PSCI_INVALID_PARAMS: i64 = -2;
pub const PSCI_ALREADY_ON: i64 = -4;

/// Start command for a secondary vCPU thread, stored in its
/// `VcpuSlot` and consumed by `wait_for_run`.
#[derive(Clone)]
pub enum VcpuStart {
    /// Secondary thread waits.
    Parked,
    /// CPU_ON requested: enter at `entry` with X0=`ctx_id`.
    Run { entry: u64, ctx_id: u64 },
}

/// Per-vCPU parking spot: `cpu_on` deposits a `Run` command into
/// `state` and signals `cv`; the target thread blocks in
/// `wait_for_run` until it sees the command.
pub struct VcpuSlot {
    /// Current start command; guarded together with `cv`.
    pub state: Mutex<VcpuStart>,
    /// Signalled by `cpu_on` when `state` changes to `Run`.
    pub cv: Condvar,
    /// True once vCPU thread has begun running guest code (any
    /// CPU_ON for an already-on vCPU returns ALREADY_ON).
    pub on: AtomicBool,
}

/// Shared state coordinating the boot vCPU (index 0, main thread)
/// and the secondary vCPU threads: PSCI CPU_ON wakeups, shutdown,
/// and the multi-vCPU snapshot pause/resume rendezvous.
pub struct VcpuCoordinator {
    /// Total vCPU count, including vCPU 0.
    pub n_vcpus: u32,
    /// One slot per vCPU, indexed by vCPU id.
    pub slots: Vec<VcpuSlot>,
    /// Set to request global teardown; checked in the wait loops.
    pub shutdown: AtomicBool,
    /// Multi-vCPU snapshot pause/resume rendezvous. When the
    /// snapshot trigger fires (on vcpu0's thread), we set
    /// `snapshot_request`, force-exit each secondary's
    /// `hv_vcpu_run` via `hv_vcpus_exit`, wait for each
    /// secondary thread to land in the pause point and deposit
    /// its `PerVcpuState` into `captured`. Then vcpu0 captures
    /// its own state, assembles the full Snapshot, writes it,
    /// and unblocks secondaries by bumping the generation in
    /// `resume_lock`. Captured slots are drained (set back to
    /// `None`) by `take_secondary_states`, and
    /// `request_snapshot_pause` clears all slots again at the
    /// start of the next snapshot, so each snapshot starts fresh.
    pub snapshot_request: AtomicBool,
    /// Deposited register state, one slot per vCPU index.
    pub captured: Mutex<Vec<Option<PerVcpuState>>>,
    /// Number of secondaries that have deposited this round.
    pub captured_count: AtomicU32,
    pub resume_lock: Mutex<u64>, // monotonic generation; secondaries wait while gen unchanged
    pub resume_cv: Condvar,
    /// Secondary `hv_vcpu_t` handles, registered by each
    /// secondary thread after it creates its vCPU (HVF requires
    /// the same thread that created the vCPU to perform reg
    /// access; the handle itself is just a u64 we can pass to
    /// `hv_vcpus_exit` to force a run exit from any thread).
    pub secondary_handles: Mutex<Vec<applevisor_sys::hv_vcpu_t>>,
}

impl VcpuCoordinator {
    /// Build a coordinator for `n_vcpus` vCPUs. Every slot starts
    /// `Parked` with `on == false`; `captured` is pre-sized so
    /// each vCPU index has a deposit slot.
    pub fn new(n_vcpus: u32) -> Arc<Self> {
        let slots = (0..n_vcpus)
            .map(|_| VcpuSlot {
                state: Mutex::new(VcpuStart::Parked),
                cv: Condvar::new(),
                on: AtomicBool::new(false),
            })
            .collect();
        Arc::new(Self {
            n_vcpus,
            slots,
            shutdown: AtomicBool::new(false),
            snapshot_request: AtomicBool::new(false),
            captured: Mutex::new((0..n_vcpus).map(|_| None).collect()),
            captured_count: AtomicU32::new(0),
            resume_lock: Mutex::new(0),
            resume_cv: Condvar::new(),
            secondary_handles: Mutex::new(Vec::new()),
        })
    }

    /// Called by a secondary vCPU thread once it has created its
    /// `hv_vcpu_t`. Stored so the snapshot trigger thread can
    /// `hv_vcpus_exit` all secondaries in one batch.
    pub fn register_secondary(&self, handle: applevisor_sys::hv_vcpu_t) {
        self.secondary_handles.lock().unwrap().push(handle);
    }

    /// Copy of the currently-registered secondary handles, for
    /// passing to `request_snapshot_pause` without holding the lock.
    pub fn secondary_handles_snapshot(&self) -> Vec<applevisor_sys::hv_vcpu_t> {
        self.secondary_handles.lock().unwrap().clone()
    }

    /// Called by secondary dispatch loops between `hv_vcpu_run`
    /// iterations. If a snapshot is in progress, the secondary
    /// captures its own register state, deposits it into
    /// `captured[idx]`, bumps `captured_count`, and waits for
    /// `resume_lock`'s generation to advance. Returns when the
    /// secondary is free to continue running guest code.
    pub fn maybe_pause_for_snapshot(
        &self,
        idx: u32,
        vcpu: &crate::hvf::Vcpu,
    ) -> crate::hvf::Result<()> {
        if !self.snapshot_request.load(Ordering::Acquire) {
            return Ok(());
        }
        // Read the generation BEFORE depositing: if the trigger
        // releases early (e.g. pause timeout), the gen has already
        // advanced past `saved_gen` and the wait below falls
        // straight through — no lost-wakeup deadlock.
        let saved_gen = *self.resume_lock.lock().unwrap();
        // Capture state on the OWNING thread (HVF requires it).
        let st = crate::vmm::snapshot::capture_vcpu_state(vcpu)?;
        if std::env::var_os("SUPERMACHINE_TIMINGS").is_some() {
            let pc = st
                .gp_regs
                .iter()
                .find(|(id, _)| *id == applevisor_sys::hv_reg_t::PC as u32)
                .map(|(_, v)| *v)
                .unwrap_or(0);
            eprintln!("  [vcpu-{idx}] snapshot pause: PC=0x{pc:x}");
        }
        self.captured.lock().unwrap()[idx as usize] = Some(st);
        self.captured_count.fetch_add(1, Ordering::AcqRel);
        // Block until snapshot thread bumps the generation.
        let mut g = self.resume_lock.lock().unwrap();
        while *g == saved_gen && !self.shutdown.load(Ordering::Acquire) {
            g = self.resume_cv.wait(g).unwrap();
        }
        Ok(())
    }

    /// Called by the snapshot trigger thread (vcpu0). Returns
    /// after every *running* secondary has deposited its state.
    /// Secondaries still parked in `wait_for_run` (e.g. the
    /// snapshot trigger fires pre-SMP-bringup, as in volume mode
    /// where the bake takes its snapshot at the heartbeat marker
    /// before the kernel has issued CPU_ON for them) are skipped —
    /// their PerVcpuState defaults to a no-op restore that hits
    /// the PSCI park path again on restore.
    pub fn request_snapshot_pause(
        &self,
        secondary_handles: &[applevisor_sys::hv_vcpu_t],
    ) {
        // Reset captured state from any prior snapshot.
        {
            let mut g = self.captured.lock().unwrap();
            for s in g.iter_mut() {
                *s = None;
            }
        }
        self.captured_count.store(0, Ordering::SeqCst);
        self.snapshot_request.store(true, Ordering::Release);
        if !secondary_handles.is_empty() {
            // SAFETY: handles were registered by live secondary
            // threads; `hv_vcpus_exit` is documented to be callable
            // from any thread and only forces a run exit.
            unsafe {
                let _ = applevisor_sys::hv_vcpus_exit(
                    secondary_handles.as_ptr(),
                    secondary_handles.len() as u32,
                );
            }
        }
        // Count only secondaries that are currently running guest
        // code; parked ones won't reach the rendezvous.
        let target: u32 = self
            .slots
            .iter()
            .skip(1)
            .filter(|s| s.on.load(Ordering::Acquire))
            .count() as u32;
        let deadline = std::time::Instant::now() + std::time::Duration::from_secs(2);
        // Spin-wait is deliberate: the rendezvous is sub-millisecond
        // in the common case and vcpu0 has nothing else to do here.
        while self.captured_count.load(Ordering::Acquire) < target {
            if self.shutdown.load(Ordering::Acquire) {
                return;
            }
            if std::time::Instant::now() > deadline {
                eprintln!(
                    "  [coord] snapshot-pause timeout: {}/{} secondaries deposited",
                    self.captured_count.load(Ordering::Acquire),
                    target,
                );
                break;
            }
            std::hint::spin_loop();
        }
    }

    /// Release secondaries so they resume guest execution after
    /// snapshot capture. Pair with `request_snapshot_pause`.
    pub fn release_after_snapshot(&self) {
        self.snapshot_request.store(false, Ordering::Release);
        let mut g = self.resume_lock.lock().unwrap();
        *g = g.wrapping_add(1);
        self.resume_cv.notify_all();
    }

    /// Pop secondary captured states (idx 1..n_vcpus). Called by
    /// the snapshot trigger AFTER `request_snapshot_pause`
    /// returned and AFTER capturing vcpu0's own state. Returns a
    /// vector of length `n_vcpus - 1` (so caller can prepend
    /// vcpu0's state).
    pub fn take_secondary_states(&self) -> Vec<PerVcpuState> {
        let mut g = self.captured.lock().unwrap();
        let mut out = Vec::with_capacity(g.len().saturating_sub(1));
        for st in g.iter_mut().skip(1) {
            // Replace with default (empty) so a missing entry
            // doesn't crash the saver. In practice every entry
            // should be Some here.
            out.push(st.take().unwrap_or_default());
        }
        out
    }

    /// PSCI CPU_ON: signal target vCPU's thread to start. Returns the
    /// PSCI return code.
    pub fn cpu_on(&self, target: u32, entry: u64, ctx_id: u64) -> i64 {
        let Some(slot) = self.slots.get(target as usize) else {
            return PSCI_INVALID_PARAMS;
        };
        // Check `on` while holding the state lock: `wait_for_run`
        // flips `on` under this same lock, so the check and the
        // Run deposit are atomic with respect to the target
        // thread's startup (closes the load-then-lock race the
        // unlocked check had).
        let mut s = slot.state.lock().unwrap();
        if slot.on.load(Ordering::SeqCst) {
            return PSCI_ALREADY_ON;
        }
        // A second CPU_ON while a Run is still pending overwrites
        // the pending entry point and returns SUCCESS (PSCI would
        // allow ON_PENDING here; the guest retries harmlessly).
        *s = VcpuStart::Run { entry, ctx_id };
        slot.cv.notify_one();
        PSCI_SUCCESS
    }

    /// PSCI AFFINITY_INFO: 0 = ON, 1 = OFF, 2 = ON_PENDING
    /// (CPU_ON queued but the target thread has not yet picked it
    /// up in `wait_for_run`), matching the module-header contract.
    /// NOTE(review): `slots[0].on` is never set by this impl
    /// (vcpu0 runs on the main thread and never calls
    /// `wait_for_run`), so affinity_info(0) reports OFF unless a
    /// caller sets it externally — confirm against the boot path.
    pub fn affinity_info(&self, target: u32) -> i64 {
        let Some(slot) = self.slots.get(target as usize) else {
            return PSCI_INVALID_PARAMS;
        };
        if slot.on.load(Ordering::SeqCst) {
            return 0; // ON
        }
        if matches!(*slot.state.lock().unwrap(), VcpuStart::Run { .. }) {
            return 2; // ON_PENDING: woken but not yet running
        }
        1 // OFF
    }

    /// Wait (blocking) until our slot is told to Run. Used by
    /// secondary vCPU threads on startup. Returns `None` on
    /// shutdown, `Some((entry, ctx_id))` on CPU_ON.
    /// NOTE(review): the shutdown path must `notify` each slot's
    /// cv after storing `shutdown`, or parked threads won't
    /// observe it — the setter is outside this file; confirm.
    pub fn wait_for_run(&self, idx: u32) -> Option<(u64, u64)> {
        let slot = &self.slots[idx as usize];
        let mut s = slot.state.lock().unwrap();
        loop {
            if self.shutdown.load(Ordering::SeqCst) {
                return None;
            }
            if let VcpuStart::Run { entry, ctx_id } = *s {
                // Flip `on` under the state lock so `cpu_on`'s
                // locked ALREADY_ON check is race-free.
                slot.on.store(true, Ordering::SeqCst);
                return Some((entry, ctx_id));
            }
            s = slot.cv.wait(s).unwrap();
        }
    }
}