use std::fmt;
use std::os::unix::net::UnixStream;
use std::time::SystemTime;
use crate::vmm::pool::PoolWorker;
use crate::vmm::resources::{ResourceError, VmResources};
use crate::vmm::tls::TlsConfig;
/// Optional knobs for a single VM run; consumed by `run`.
#[derive(Default)]
pub struct RunOptions {
    /// When set, a TLS endpoint is started (`crate::vmm::tls::start`) over the VM's vsock.
    pub tls: Option<TlsConfig>,
    /// Pool-control socket; presence of this (or `pool_worker`) enables pool mode.
    pub pool_sock: Option<UnixStream>,
    /// In-process pool worker handle, passed through to `PoolControl::start`.
    pub pool_worker: Option<PoolWorker>,
    /// Skip restoring the GIC blob during warm restores (experimental; can also
    /// be enabled via the `SUPERMACHINE_SKIP_WARM_GIC_RESTORE` env var).
    pub experimental_skip_warm_gic_restore: bool,
}
/// Summary statistics returned by a completed VM run.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub struct RunReport {
    /// Number of pool-driven warm snapshot restores performed during the run.
    pub warm_restores: u64,
}
/// Parsed snapshot metadata cached across repeated warm restores.
///
/// An entry is reused only when `path`, `file_len`, and `modified` all still
/// match the file on disk (see the cache-hit check in `run_kernel`); any
/// mismatch causes the snapshot to be re-opened and re-parsed.
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
struct WarmSnapshotCache {
    // Snapshot file path this entry was built from.
    path: String,
    // File size observed at load time; part of the cache-validity key.
    file_len: u64,
    // Modification time at load time (None if the FS doesn't report one).
    modified: Option<SystemTime>,
    // Open handle reused for CoW RAM remapping on every warm restore.
    file: std::fs::File,
    // Parsed snapshot (device/vCPU state, sans RAM image).
    snap: crate::vmm::snapshot::Snapshot,
    // Byte offset of the RAM image within the snapshot file.
    ram_offset: u64,
    // Guest memory size in bytes.
    memory_bytes: usize,
}
/// Errors that can abort a VM run; rendered for users via `Display`.
#[derive(Debug)]
pub enum RunError {
    /// VMM construction failed.
    Build(crate::vmm::builder::BuildError),
    /// An HVF (Hypervisor.framework) operation failed.
    Hvf(crate::hvf::Error),
    /// Copy-on-write mmap of the snapshot RAM image failed.
    MmapCow {
        path: String,
        source: std::io::Error,
    },
    /// Pool control-channel failure.
    Pool(crate::vmm::pool::PoolError),
    /// Invalid or unusable VM resources.
    Resource(ResourceError),
    /// Reading/parsing a snapshot file failed.
    SnapshotLoad {
        path: String,
        source: crate::vmm::snapshot::FileError,
    },
    /// Spawning a secondary vCPU thread failed.
    ThreadSpawn {
        name: String,
        source: std::io::Error,
    },
    /// Starting the TLS endpoint failed.
    Tls(crate::vmm::tls::StartError),
    /// Proof-of-life test exited for a reason other than the expected HVC trap.
    UnexpectedProofOfLifeExit {
        reason: crate::hvf::ExitReason,
        // ESR_EL2 exception-class field extracted from the exit syndrome.
        ec: u64,
    },
    /// Starting a vsock mux endpoint failed.
    VsockMux(crate::vmm::vsock_mux::StartError),
    /// vCPU worker/dispatch failure.
    Worker(crate::vmm::worker::WorkerError),
}
impl fmt::Display for RunError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
RunError::Build(e) => write!(f, "{e}"),
RunError::Hvf(e) => write!(f, "HVF operation failed: {e:?}"),
RunError::MmapCow { path, source } => {
write!(f, "mmap CoW snapshot RAM {path}: {source}")
}
RunError::Pool(e) => write!(f, "{e}"),
RunError::Resource(e) => write!(f, "{e}"),
RunError::SnapshotLoad { path, source } => {
write!(f, "load snapshot {path}: {source:?}")
}
RunError::ThreadSpawn { name, source } => {
write!(f, "spawn thread {name}: {source}")
}
RunError::Tls(e) => write!(f, "{e}"),
RunError::UnexpectedProofOfLifeExit { reason, ec } => {
write!(
f,
"unexpected proof-of-life exit: {reason:?} ESR_EL2 EC={ec:#x}"
)
}
RunError::VsockMux(e) => write!(f, "{e}"),
RunError::Worker(e) => write!(f, "{e}"),
}
}
}
impl std::error::Error for RunError {}
impl From<ResourceError> for RunError {
fn from(value: ResourceError) -> Self {
Self::Resource(value)
}
}
impl From<crate::vmm::builder::BuildError> for RunError {
fn from(value: crate::vmm::builder::BuildError) -> Self {
Self::Build(value)
}
}
impl From<crate::hvf::Error> for RunError {
fn from(value: crate::hvf::Error) -> Self {
Self::Hvf(value)
}
}
impl From<crate::vmm::pool::PoolError> for RunError {
fn from(value: crate::vmm::pool::PoolError) -> Self {
Self::Pool(value)
}
}
impl From<crate::vmm::worker::WorkerError> for RunError {
fn from(value: crate::vmm::worker::WorkerError) -> Self {
Self::Worker(value)
}
}
impl From<crate::vmm::vsock_mux::StartError> for RunError {
fn from(value: crate::vmm::vsock_mux::StartError) -> Self {
Self::VsockMux(value)
}
}
impl From<crate::vmm::tls::StartError> for RunError {
fn from(value: crate::vmm::tls::StartError) -> Self {
Self::Tls(value)
}
}
/// Validate `resources` and run the VM to completion (macOS/aarch64 only).
///
/// Thin entry point: unpacks `RunOptions` and delegates to `run_kernel`.
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
pub fn run(resources: &VmResources, options: RunOptions) -> Result<RunReport, RunError> {
    let RunOptions {
        tls,
        pool_sock,
        pool_worker,
        experimental_skip_warm_gic_restore,
    } = options;
    resources.validate_for_run()?;
    run_kernel(
        resources,
        tls,
        pool_sock,
        pool_worker,
        experimental_skip_warm_gic_restore,
    )
}
/// HVF smoke test: boot a tiny guest program and verify it traps back to the
/// host with an exception whose ESR_EL2 exception class is 0x16 (the class the
/// pass message attributes to the guest's `HVC #0`).
///
/// # Errors
/// Returns `RunError::UnexpectedProofOfLifeExit` on any other exit, or
/// propagates HVF setup failures.
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
pub fn run_proof_of_life() -> Result<(), RunError> {
    use crate::arch::aarch64::layout;
    use crate::vmm::vstate::{MicroVm, TEST_PROGRAM};

    eprintln!("supermachine: HVF init test");
    let vm = MicroVm::new(64 * 1024 * 1024)?;
    eprintln!(
        " VM created, RAM mapped at GPA 0x{:x}, {} MiB",
        vm.ram_gpa,
        vm.ram_size / (1024 * 1024)
    );

    // Copy the test program to the kernel load offset and point PC at it.
    let entry = vm.ram_gpa + layout::KERNEL_LOAD_OFFSET;
    // SAFETY: relies on `write_ram`'s contract that `entry` lies within the
    // freshly mapped guest RAM (contract defined in vmm::vstate — not visible
    // in this file).
    unsafe {
        vm.write_ram(entry, &TEST_PROGRAM);
    }
    vm.set_boot_cpsr()?;
    vm.set_pc(entry)?;
    eprintln!(" PC set to 0x{entry:x}, CPSR=EL1h (DAIF masked)");
    eprintln!(" running vCPU…");

    let (reason, esr, _gpa, _va) = vm.run_once()?;
    // Extract the ESR_EL2 exception-class field (bits [31:26]).
    let ec = (esr >> 26) & 0x3f;
    eprintln!(" exit: {reason:?} ESR_EL2=0x{esr:x} EC={ec:#x}");

    if reason != crate::hvf::ExitReason::Exception || ec != 0x16 {
        eprintln!(" UNEXPECTED exit; HVF probably misconfigured");
        return Err(RunError::UnexpectedProofOfLifeExit { reason, ec });
    }
    eprintln!(" PASS: HVF round-trip working — guest executed HVC #0");
    Ok(())
}
/// Core VM run loop (macOS/aarch64 only).
///
/// Phases:
/// 1. Unpack configuration from `resources` plus `SUPERMACHINE_*` env toggles.
/// 2. Optionally load a snapshot — either a full read into memory, or
///    metadata-only plus a copy-on-write mmap of the RAM image.
/// 3. Build the VMM, apply restored state, and start the configured
///    TLS/vsock/HTTP/exec endpoints.
/// 4. Spawn secondary vCPU threads and enter the primary dispatch loop,
///    which in pool mode services pause, snapshot-capture, and warm-restore
///    requests until quit is requested.
///
/// Returns a `RunReport` counting warm restores performed.
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
fn run_kernel(
    resources: &VmResources,
    tls_cfg: Option<TlsConfig>,
    pool_sock: Option<UnixStream>,
    pool_worker: Option<PoolWorker>,
    option_skip_warm_gic_restore: bool,
) -> Result<RunReport, RunError> {
    use crate::vmm::builder;
    use crate::vmm::pool::{PoolControl, WarmRestoreTimings};
    use crate::vmm::snapshot;
    use crate::vmm::worker::{self, DispatchSnapshot};
    // --- Phase 1: unpack resources and environment toggles. ---
    let kernel_path = resources.kernel_path.as_deref();
    let initrd_path = resources.initrd_path.as_deref();
    let cmdline = resources.cmdline.as_str();
    let mem_size = resources.memory_bytes();
    let blk_paths = &resources.block_devices;
    let n_vcpus = resources.vcpus;
    let snapshot_after_ms = resources.snapshot.after_ms;
    let snapshot_at = resources.snapshot.at_heartbeat;
    let snapshot_on_listener = resources.snapshot.on_listener;
    let quiesce_ms = resources.snapshot.quiesce_ms;
    let snapshot_out = resources.snapshot.out_path.as_deref();
    let restore_from = resources.restore_from.as_deref();
    let cow_restore = resources.cow_restore;
    let vsock_mux_path = resources.endpoints.vsock_mux.as_deref();
    let http_port_addr = resources.endpoints.http_port.as_deref();
    let vsock_mux_handoff_path = resources.endpoints.vsock_mux_handoff.as_deref();
    let vsock_exec_path = resources.endpoints.vsock_exec.as_deref();
    let vsock_exec_guest_port = resources
        .endpoints
        .vsock_exec_guest_port
        .unwrap_or(crate::vmm::resources::DEFAULT_EXEC_GUEST_PORT);
    // Timing instrumentation and warm-restore behavior are env-driven.
    let timings = std::env::var_os("SUPERMACHINE_TIMINGS").is_some();
    let skip_warm_gic_restore =
        option_skip_warm_gic_restore || std::env::var_os("SUPERMACHINE_SKIP_WARM_GIC_RESTORE").is_some();
    let fixed_warm_ram_remap = std::env::var_os("SUPERMACHINE_REMAP_FIXED").is_some();
    // Baseline for all "total=...us" timing lines below.
    let run_t0 = std::time::Instant::now();
    // --- Phase 2: optional snapshot load. ---
    // Populated only on the CoW path: (pointer, length) of the mmapped RAM image,
    // later handed to `build_vmm`.
    let mut cow_ram: Option<(*mut u8, usize)> = None;
    let restore = match restore_from {
        Some(p) => Some(if cow_restore {
            // CoW restore: parse metadata only, then mmap the RAM image
            // copy-on-write instead of reading it all into memory.
            eprintln!("supermachine: restoring from {p} (CoW mmap)");
            let t0 = std::time::Instant::now();
            let (snap, ram_offset, memory_bytes) =
                snapshot::load_meta(p).map_err(|source| RunError::SnapshotLoad {
                    path: p.to_string(),
                    source,
                })?;
            if timings {
                eprintln!(
                    "[timing] restore.load_meta={}us total={}us",
                    t0.elapsed().as_micros(),
                    run_t0.elapsed().as_micros()
                );
            }
            let t0 = std::time::Instant::now();
            let (ptr, len) =
                snapshot::mmap_ram_cow_at(p, ram_offset, memory_bytes).map_err(|source| {
                    RunError::MmapCow {
                        path: p.to_string(),
                        source,
                    }
                })?;
            if timings {
                eprintln!(
                    "[timing] restore.mmap_cow={}us total={}us",
                    t0.elapsed().as_micros(),
                    run_t0.elapsed().as_micros()
                );
            }
            cow_ram = Some((ptr, len));
            snap
        } else {
            // Full restore: read the whole snapshot (including RAM) into memory.
            eprintln!("supermachine: restoring from {p}");
            let t0 = std::time::Instant::now();
            snapshot::load_from_file(p)
                .map_err(|source| RunError::SnapshotLoad {
                    path: p.to_string(),
                    source,
                })
                .inspect(|_| {
                    if timings {
                        eprintln!(
                            "[timing] restore.load_full={}us total={}us",
                            t0.elapsed().as_micros(),
                            run_t0.elapsed().as_micros()
                        );
                    }
                })?
        }),
        None => None,
    };
    // Boot banner: cold boot prints the kernel config; restore prints memory size.
    if restore.is_none() {
        eprintln!("supermachine: kernel boot");
        eprintln!(" kernel : {}", kernel_path.unwrap_or(""));
        if let Some(p) = initrd_path {
            eprintln!(" initramfs : {p}");
        }
        eprintln!(" cmdline : {cmdline}");
        eprintln!(" memory : {} MiB", mem_size / (1024 * 1024));
        for p in blk_paths {
            eprintln!(" blk : {p}");
        }
    } else if let Some(s) = restore.as_ref() {
        eprintln!(
            " memory : {} MiB (from snapshot)",
            s.memory.len() / (1024 * 1024)
        );
    }
    // --- Phase 3: build the VMM and apply restored state. ---
    let restore_memory_len = restore.as_ref().map(|s| s.memory.len());
    let t0 = std::time::Instant::now();
    let mut vmm = builder::build_vmm(resources, cow_ram, restore_memory_len)?;
    if timings {
        eprintln!(
            "[timing] restore.build_vmm={}us total={}us",
            t0.elapsed().as_micros(),
            run_t0.elapsed().as_micros()
        );
    }
    // Duration of the initial state restore; reported to the pool later.
    let mut first_restore_us: u128 = 0;
    if let Some(snap) = restore.as_ref() {
        let t0 = std::time::Instant::now();
        vmm.restore_snapshot(snap)?;
        first_restore_us = t0.elapsed().as_micros();
        if timings {
            eprintln!(
                "[timing] restore.state={}us total={}us",
                first_restore_us,
                run_t0.elapsed().as_micros()
            );
        }
        eprintln!(
            " restored in {first_restore_us} us (mmio={} listeners={})",
            snap.virtio.mmio.len(),
            snap.virtio.vsock_listeners.len()
        );
    }
    // Start the configured host-side endpoints, all sharing the vsock backend.
    if let Some(c) = tls_cfg {
        crate::vmm::tls::start(c, vmm.vsock.clone())?;
    }
    if let Some(p) = vsock_mux_path {
        crate::vmm::vsock_mux::start(p, vmm.vsock.clone(), None)?;
    }
    if let Some(addr) = http_port_addr {
        crate::vmm::vsock_mux::start_tcp(addr, vmm.vsock.clone(), None)?;
    }
    if let Some(p) = vsock_mux_handoff_path {
        crate::vmm::vsock_mux::start_handoff(p, vmm.vsock.clone(), None)?;
    }
    if let Some(p) = vsock_exec_path {
        crate::vmm::vsock_mux::start_exec(p, vmm.vsock.clone(), vsock_exec_guest_port)?;
    }
    if timings && restore.is_some() {
        eprintln!(
            "[timing] restore.endpoints_ready={}us",
            run_t0.elapsed().as_micros()
        );
    }
    // --- Phase 4: secondary vCPUs, pool control, and the dispatch loop. ---
    // Restored state for each secondary vCPU (vCPU 0 is handled inline above);
    // entry i corresponds to vCPU i+1 and is None when the snapshot lacks it.
    let secondary_states: Vec<Option<crate::vmm::snapshot::PerVcpuState>> = restore
        .as_ref()
        .map(|s| {
            (1..n_vcpus as usize)
                .map(|i| s.per_vcpu.get(i).cloned())
                .collect()
        })
        .unwrap_or_else(|| (1..n_vcpus).map(|_| None).collect());
    for idx in 1..n_vcpus {
        let coord_c = vmm.coord.clone();
        let bus_c = vmm.bus.clone();
        let name = format!("vcpu-{idx}");
        let st = secondary_states
            .get((idx - 1) as usize)
            .cloned()
            .unwrap_or(None);
        std::thread::Builder::new()
            .name(name.clone())
            .spawn(move || worker::run_secondary(idx, coord_c, bus_c, st))
            .map_err(|source| RunError::ThreadSpawn { name, source })?;
    }
    eprintln!(" vCPU launched ({n_vcpus} total), dispatch loop running\n");
    if timings && restore.is_some() {
        eprintln!(
            "[timing] restore.vcpu_launched={}us",
            run_t0.elapsed().as_micros()
        );
    }
    // Pool mode is active when either a pool socket or an in-process worker is set.
    let pool_mode = pool_sock.is_some() || pool_worker.is_some();
    // Callback the pool uses to ask whether the vsock transport is idle.
    let transport_idle = pool_mode.then(|| {
        let vsock = vmm.vsock.clone();
        std::sync::Arc::new(move || vsock.is_transport_idle())
            as std::sync::Arc<dyn Fn() -> bool + Send + Sync>
    });
    let mut pool = PoolControl::start(
        pool_sock.as_ref(),
        pool_worker,
        restore.is_some().then_some(first_restore_us),
        restore
            .is_some()
            .then(|| vmm.vsock.muxer().first_host_port())
            .flatten(),
        vmm.vm.vcpu.handle(),
        transport_idle,
    )?;
    let mut report = RunReport::default();
    // Parsed-snapshot cache reused across warm restores of the same file.
    let mut warm_snapshot_cache: Option<WarmSnapshotCache> = None;
    loop {
        // Run the primary vCPU until dispatch stops (pause flag, cancel, or exit).
        let dispatch_exit = worker::dispatch_vcpu(
            0,
            &vmm.vm.vcpu,
            &vmm.bus,
            &vmm.coord,
            &vmm.all_mmio,
            &vmm.vsock,
            &vmm.vm,
            DispatchSnapshot {
                after_ms: snapshot_after_ms,
                at_heartbeat: snapshot_at,
                on_listener: snapshot_on_listener,
                quiesce_ms,
                out_path: snapshot_out,
                stop_requested: Some(pool.pause_flag()),
            },
        )?;
        // Only Stopped/Canceled exits keep the pool loop alive.
        if dispatch_exit != worker::DispatchExit::Stopped
            && dispatch_exit != worker::DispatchExit::Canceled
        {
            break;
        }
        if pool.should_quit() {
            break;
        }
        if !pool.pause_requested() {
            // In pool mode a cancel without a pause request just resumes dispatch.
            if pool_mode && dispatch_exit == worker::DispatchExit::Canceled {
                continue;
            }
            break;
        }
        pool.clear_pause();
        // Paused for a snapshot-capture request: capture, save, report, resume.
        if let Some(snap_req) = pool.take_snapshot_request() {
            let cap_t0 = std::time::Instant::now();
            let virtio = snapshot::VirtioSnapshot {
                mmio: vmm.all_mmio.iter().map(|m| m.capture_state()).collect(),
                vsock_listeners: vmm.vsock.muxer().capture_tsi_listeners(),
            };
            let snap_result = match snapshot::capture_snapshot(&vmm.vm, virtio) {
                Ok(s) => s,
                Err(e) => {
                    // Capture failures are reported to the pool, not fatal to the run.
                    pool.post_snapshot_result(Err(format!("capture: {e:?}")));
                    continue;
                }
            };
            let capture_us = cap_t0.elapsed().as_micros();
            let save_t0 = std::time::Instant::now();
            let save_stats = match snapshot::save_to_file_with_stats(
                &snap_req.out_path,
                &snap_result,
            ) {
                Ok(s) => s,
                Err(e) => {
                    pool.post_snapshot_result(Err(format!(
                        "save to {}: {e:?}",
                        snap_req.out_path
                    )));
                    continue;
                }
            };
            let save_us = save_t0.elapsed().as_micros();
            pool.post_snapshot_result(Ok(crate::vmm::pool::SnapshotResult {
                bytes_written: save_stats.ram_bytes
                    + save_stats.ram_data_bytes,
                capture_us,
                save_us,
            }));
            continue;
        }
        // Otherwise the pause must carry a warm-restore request; if not, exit.
        let Some(req) = pool.take_restore_request() else {
            break;
        };
        if let Some(p) = req.egress_policy {
            crate::vmm::egress_policy::set(&p);
        }
        let t0 = std::time::Instant::now();
        let phase_t0 = std::time::Instant::now();
        vmm.reset_vsock_transport();
        let reset_vsock_us = phase_t0.elapsed().as_micros();
        let phase_t0 = std::time::Instant::now();
        let file_meta = std::fs::metadata(&req.path).map_err(|source| RunError::MmapCow {
            path: req.path.clone(),
            source,
        })?;
        let modified = file_meta.modified().ok();
        // Cache hit requires identical path, file size, and modification time.
        let cache_hit = warm_snapshot_cache.as_ref().is_some_and(|cached| {
            cached.path == req.path
                && cached.file_len == file_meta.len()
                && cached.modified == modified
        });
        if !cache_hit {
            let file = std::fs::File::open(&req.path).map_err(|source| RunError::MmapCow {
                path: req.path.clone(),
                source,
            })?;
            let (snap, ram_offset, memory_bytes) =
                snapshot::load_meta(&req.path).map_err(|source| RunError::SnapshotLoad {
                    path: req.path.clone(),
                    source,
                })?;
            warm_snapshot_cache = Some(WarmSnapshotCache {
                path: req.path.clone(),
                file_len: file_meta.len(),
                modified,
                file,
                snap,
                ram_offset,
                memory_bytes,
            });
        }
        let load_meta_us = phase_t0.elapsed().as_micros();
        let cached = warm_snapshot_cache
            .as_ref()
            .expect("warm snapshot cache populated");
        let phase_t0 = std::time::Instant::now();
        // SAFETY: remaps guest RAM as a CoW mapping of the snapshot file;
        // soundness relies on the `remap_cow_from_file*` contracts (defined
        // elsewhere in the crate — not visible in this file).
        unsafe {
            if fixed_warm_ram_remap {
                vmm.vm.remap_cow_from_file_fixed(
                    &cached.file,
                    cached.ram_offset,
                    cached.memory_bytes,
                )?;
            } else {
                vmm.vm
                    .remap_cow_from_file(&cached.file, cached.ram_offset, cached.memory_bytes)?;
            }
        }
        let remap_cow_us = phase_t0.elapsed().as_micros();
        let phase_t0 = std::time::Instant::now();
        let restore_timings = vmm.restore_snapshot_timed_with_options(
            &cached.snap,
            snapshot::SnapshotRestoreOptions {
                skip_gic_blob: skip_warm_gic_restore,
            },
        )?;
        let restore_snapshot_us = phase_t0.elapsed().as_micros();
        let us = t0.elapsed().as_micros();
        // NOTE: shadows the `timings` bool for the rest of this iteration only;
        // the env-derived flag is back in scope on the next loop pass.
        let timings = WarmRestoreTimings {
            reset_vsock_us,
            remap_cow_us,
            load_meta_us,
            restore_snapshot_us,
            ram_copy_us: restore_timings.ram_copy_us,
            gic_restore_us: restore_timings.gic_restore_us,
            vcpu_restore_us: restore_timings.vcpu_restore_us,
            vtimer_offset_us: restore_timings.vtimer_offset_us,
            mmio_restore_us: restore_timings.mmio_restore_us,
            listener_restore_us: restore_timings.listener_restore_us,
        };
        eprintln!(
            " warm restore from {} in {us} us (reset={} remap={} load_meta={} restore={} ram={} gic={} vcpu={} vtimer={} mmio={} listener={})",
            req.path,
            timings.reset_vsock_us,
            timings.remap_cow_us,
            timings.load_meta_us,
            timings.restore_snapshot_us,
            timings.ram_copy_us,
            timings.gic_restore_us,
            timings.vcpu_restore_us,
            timings.vtimer_offset_us,
            timings.mmio_restore_us,
            timings.listener_restore_us
        );
        report.warm_restores += 1;
        pool.complete_restore(us, vmm.vsock.muxer().first_host_port(), timings);
    }
    Ok(report)
}