use std::io::{BufWriter, Read, Seek, SeekFrom, Write};
use std::os::unix::io::AsRawFd;
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::sync::{Arc, Mutex};
use kvm_bindings::{kvm_clock_data, kvm_irqchip, kvm_pit_state2};
use kvm_ioctls::VcpuExit as KvmExit;
use vmm_sys_util::eventfd::EventFd;
use super::{KvmDeviceState, KvmError, KvmSnapshotState, KvmVcpu, KvmVcpuHandle, KvmVm};
use crate::arch::x86_64::mptable;
use crate::devices::com1::{Com1, Com1State, COM1_BASE, COM1_IRQ};
use crate::devices::mmio_bus::MmioBus;
use crate::devices::virtio::blk::VirtioBlk;
use crate::devices::virtio::fs::{VirtioFs, VirtioFsConfig};
use crate::devices::virtio::mmio::{MmioSnapshot, MmioVirtio};
use crate::devices::virtio::queue::GuestMem;
use crate::devices::virtio::vsock::device::Vsock;
use crate::devices::virtio::vsock::muxer::TsiListenerSnapshot;
use crate::devices::virtio::vsock::muxer_thread::MuxerStream;
use crate::devices::virtio::VirtioDevice;
use crate::hypervisor::{HypervisorVcpu, HypervisorVm, VcpuHandle};
use crate::snapshot_frame::{DeviceBacking, DeviceKind, DeviceRecord};
// Guest-physical MMIO windows and interrupt lines for the virtio-mmio devices
// wired up in this file. The same values are advertised to the guest through
// `virtio_mmio.device=<len>@<base>:<irq>` kernel command-line parameters.

/// MMIO base of the primary disk ("vda") transport.
const VIRTIO_BASE: u64 = 0xd000_0000;
/// Length of the primary disk MMIO window.
const VIRTIO_LEN: u64 = 0x1000;
/// Interrupt line of the primary disk.
const VIRTIO_IRQ: u32 = 5;
/// Address the primary disk's ioeventfd is registered at (window base + 0x050).
const VIRTIO_QUEUE_NOTIFY: u64 = VIRTIO_BASE + 0x050;
/// MMIO base of the vsock transport.
const VSOCK_BASE: u64 = 0xd000_1000;
/// Length of the vsock MMIO window.
const VSOCK_LEN: u64 = 0x1000;
/// Interrupt line of the vsock device.
const VSOCK_IRQ: u32 = 6;
/// Context ID handed to the vsock device for the guest.
const GUEST_CID: u64 = 3;
/// MMIO base of the first data volume; volume `i` sits at `VOLUME_BASE + i * 0x1000`.
const VOLUME_BASE: u64 = 0xd000_2000;
/// Interrupt line of the first data volume; volume `i` uses `VOLUME_IRQ_BASE + i`.
const VOLUME_IRQ_BASE: u32 = 7;
/// MMIO base of the first virtio-fs device; mount `i` sits at `FS_BASE + i * 0x1000`.
const FS_BASE: u64 = 0xd010_0000;
/// Length of each virtio-fs MMIO window.
const FS_LEN: u64 = 0x1000;
/// Interrupt line of the first virtio-fs device; mount `i` uses `FS_IRQ_BASE + i`.
const FS_IRQ_BASE: u32 = 10;
/// MMIO base of the balloon transport.
const BALLOON_BASE: u64 = 0xd020_0000;
/// Length of the balloon MMIO window.
const BALLOON_LEN: u64 = 0x1000;
/// Exclusive upper bound on usable GSIs enforced by `virtio_irq_budget_ok`.
const IOAPIC_GSI_CEILING: u32 = 24;
/// Returns the interrupt line used by the balloon device: the slot right after
/// the `num_fs` virtio-fs lines that start at `FS_IRQ_BASE`.
fn balloon_irq(num_fs: usize) -> u32 {
    let fs_slots = num_fs as u32;
    FS_IRQ_BASE + fs_slots
}
/// Checks that the requested device mix fits into the fixed interrupt layout.
///
/// * `num_volumes` - number of data volumes; each takes one line starting at
///   `VOLUME_IRQ_BASE` and must stay below `FS_IRQ_BASE`.
/// * `num_fs` - number of virtio-fs mounts; each takes one line starting at
///   `FS_IRQ_BASE`.
/// * `enable_balloon` - whether a balloon device takes the line right after
///   the virtio-fs range.
///
/// Returns `Ok(())` when every line stays below `IOAPIC_GSI_CEILING`, otherwise
/// a human-readable description of the limit that was exceeded.
fn virtio_irq_budget_ok(
    num_volumes: usize,
    num_fs: usize,
    enable_balloon: bool,
) -> Result<(), String> {
    let max_volumes = (FS_IRQ_BASE - VOLUME_IRQ_BASE) as usize;
    if num_volumes > max_volumes {
        return Err(format!(
            "too many data volumes: {num_volumes} > max {max_volumes} \
             (volume IRQs would collide with the virtio-fs IRQ range)"
        ));
    }
    // Clamp before narrowing and use saturating arithmetic: a plain
    // `num_fs as u32` truncates, which could either overflow the addition or
    // wrap to a small value that slips past the ceiling check below.
    let fs_count = num_fs.min(u32::MAX as usize) as u32;
    let highest_used = if enable_balloon {
        FS_IRQ_BASE.saturating_add(fs_count)
    } else if fs_count > 0 {
        FS_IRQ_BASE.saturating_add(fs_count - 1)
    } else {
        VSOCK_IRQ
    };
    if highest_used >= IOAPIC_GSI_CEILING {
        return Err(format!(
            "virtio IRQ budget exhausted: highest GSI {highest_used} >= IOAPIC ceiling \
             {IOAPIC_GSI_CEILING} ({num_fs} fs mounts{})",
            if enable_balloon { " + balloon" } else { "" }
        ));
    }
    Ok(())
}
/// Guest-physical address of the first virtio-fs DAX window; mount `i` gets
/// `FS_DAX_BASE + i * FS_DAX_WINDOW_LEN`.
const FS_DAX_BASE: u64 = 0x10_0000_0000;
/// Size of each virtio-fs DAX window (1 GiB).
const FS_DAX_WINDOW_LEN: u64 = 1 << 30;
/// `HvfMapper` implementation that maps and unmaps DAX ranges through a KVM VM.
struct KvmDaxMapper {
/// VM whose guest-physical address space the mappings are installed into.
vm: Arc<KvmVm>,
}
/// Wraps `vm` in a [`KvmDaxMapper`] and returns it as a shared `HvfMapper`
/// trait object.
pub(crate) fn kvm_dax_mapper(vm: Arc<KvmVm>) -> Arc<dyn crate::fuse::HvfMapper> {
    let mapper = KvmDaxMapper { vm };
    Arc::new(mapper)
}
impl crate::fuse::HvfMapper for KvmDaxMapper {
    /// Maps `len` bytes of host memory at `host_va` into the guest at `gpa`.
    ///
    /// The mapping is installed with `RWX` protection when `prot` contains
    /// `DAX_PROT_WRITE` and with `READ` otherwise. Any failure reported by the
    /// VM is collapsed into `EIO`.
    fn map(
        &self,
        host_va: *mut u8,
        gpa: u64,
        len: u64,
        prot: u32,
    ) -> Result<(), crate::fuse::Errno> {
        let wants_write = (prot & crate::fuse::DAX_PROT_WRITE) != 0;
        let kvm_prot = match wants_write {
            true => crate::hypervisor::prot::RWX,
            false => crate::hypervisor::prot::READ,
        };
        // SAFETY: NOTE(review) relies on the caller passing a host range that
        // stays valid for as long as it is mapped - confirm against callers.
        let mapped = unsafe { self.vm.map_ram(host_va, gpa, len as usize, kvm_prot) };
        mapped.map_err(|_| crate::fuse::backend::EIO)
    }

    /// Removes the `len`-byte guest mapping that starts at `gpa`; failures are
    /// reported as `EIO`.
    fn unmap(&self, gpa: u64, len: u64) -> Result<(), crate::fuse::Errno> {
        // SAFETY: NOTE(review) assumes the range was previously installed via
        // `map` - confirm against callers.
        let unmapped = unsafe { self.vm.unmap_ram(gpa, len as usize) };
        unmapped.map_err(|_| crate::fuse::backend::EIO)
    }
}
/// Describes one host directory to expose to the guest over virtio-fs.
#[derive(Clone, Debug)]
pub struct VirtioFsAttach {
    /// Host directory used as the root of the shared filesystem.
    pub host_path: String,
    /// virtio-fs tag; also emitted in the `sm.virtiofs=<tag>:<mount>` kernel parameter.
    pub tag: String,
    /// Mount point emitted in the `sm.virtiofs=<tag>:<mount>` kernel parameter.
    pub mount: String,
}
/// Describes one extra block volume to attach to the guest.
#[derive(Clone, Debug)]
pub struct VolumeAttach {
    /// Host path of the backing image, opened read-write.
    pub path: String,
    /// Size passed to the block device when the image is opened.
    pub size: u64,
    /// Mount point emitted in the `sm.volume=/dev/<dev>:<mount>` kernel parameter.
    pub mount: String,
}
/// Live state kept for one virtio-fs mount of a running VM.
struct FsMount {
/// virtio-mmio transport of the device; its state is captured in snapshots.
mmio: Arc<MmioVirtio>,
/// Filesystem backend serving the mount; its state is captured in snapshots.
backend: Arc<dyn crate::fuse::FsBackend>,
/// DAX session attached to the device's FUSE server.
dax: Arc<crate::fuse::DaxSession>,
/// Host directory backing the mount.
host_path: String,
/// virtio-fs tag of the mount.
tag: String,
/// Mount point requested for the guest.
mount: String,
/// Guest-physical base address of this mount's DAX window.
dax_gpa: u64,
/// Length of this mount's DAX window in bytes.
dax_window_len: u64,
}
/// Snapshot record for one virtio-fs mount; holds everything `finish_restore`
/// needs to rebuild the device.
struct VirtioFsSnap {
/// Host directory backing the mount.
host_path: String,
/// virtio-fs tag of the mount.
tag: String,
/// Mount point requested for the guest.
mount: String,
/// Guest-physical base address of the DAX window.
dax_gpa: u64,
/// Length of the DAX window in bytes.
dax_window_len: u64,
/// Captured virtio-mmio transport state.
mmio: MmioSnapshot,
/// Opaque backend state; restore skips it when empty.
backend_state: Vec<u8>,
/// Opaque DAX session state; restore skips it when empty.
dax_state: Vec<u8>,
}
/// The eight consecutive I/O ports starting at `COM1_BASE`.
const COM1_PORTS: std::ops::Range<u16> = COM1_BASE..COM1_BASE + 8;
/// Parameters for building a fresh VM with `LinuxVm::new`.
pub struct LinuxVmConfig<'a> {
/// Guest RAM size in bytes; also the length of the host mapping backing it.
pub mem_size: usize,
/// Number of vCPUs to create; `LinuxVm::new` asserts this is at least 1.
pub num_cpus: u8,
/// Kernel image handed to the boot loader.
pub kernel: &'a [u8],
/// Optional initrd image handed to the boot loader.
pub initrd: Option<&'a [u8]>,
/// Optional host path of the primary disk, exposed to the guest as "vda".
pub disk_path: Option<&'a str>,
/// Size passed to the block device when the primary disk is opened.
pub disk_size: u64,
/// Base kernel command line; device parameters are appended to a copy of it.
pub cmdline: &'a str,
/// Whether to create the vsock device.
pub enable_vsock: bool,
/// Extra block volumes, attached in order as "vdb", "vdc", ...
pub volumes: &'a [VolumeAttach],
/// virtio-fs mounts to expose.
pub virtiofs: &'a [VirtioFsAttach],
/// Optional TSI token; passed to the vsock device and appended to the command line.
pub tsi_token: Option<[u8; 32]>,
/// Whether to create the balloon device.
pub enable_balloon: bool,
}
/// Reason a vCPU thread stopped running, as reported by `LinuxVm::run`.
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum ExitReason {
// NOTE(review): the conditions producing `Halt`, `Shutdown` and `Canceled`
// are decided in the vCPU run loop, which is outside this section - confirm
// there before documenting them further.
Halt,
Shutdown,
Canceled,
/// Any other outcome, with a description; used in this file for
/// "no vcpus" and for a panicked vCPU thread.
Unknown(String),
}
/// A Linux guest assembled on KVM: the VM handle, its vCPUs, guest RAM and
/// every device attached to it, plus the state needed to snapshot it.
pub struct LinuxVm {
/// Shared handle to the KVM VM.
vm: Arc<KvmVm>,
/// vCPUs not yet handed to runner threads; `spawn_vcpus_paused` takes them.
vcpus: Vec<KvmVcpu>,
/// MMIO bus the virtio-mmio transports are registered on.
bus: Arc<MmioBus>,
/// Serial port, shared with the vCPU threads and `SerialInput`.
com1: Arc<Mutex<Com1>>,
/// Host address of the mapping that backs guest RAM.
host: *mut u8,
/// Length of the guest RAM mapping in bytes.
mem_size: usize,
/// Keeps the primary disk device alive.
_blk: Option<Arc<VirtioBlk>>,
/// Transport of the primary disk; captured in snapshots.
blk_mmio: Option<Arc<MmioVirtio>>,
/// Path and size of the primary disk, recorded in snapshots.
disk: Option<(String, u64)>,
/// vsock device, if enabled.
vsock: Option<Arc<Vsock>>,
/// Transport of the vsock device; captured in snapshots.
vsock_mmio: Option<Arc<MmioVirtio>>,
/// Balloon device; only set by `new` when enabled, always `None` after restore.
balloon: Option<Arc<crate::devices::virtio::balloon::VirtioBalloon>>,
/// Transport of the balloon device.
balloon_mmio: Option<Arc<MmioVirtio>>,
/// Worker thread that services primary-disk queue notifications.
dev_thread: Option<std::thread::JoinHandle<()>>,
/// Stop flag shared by the disk worker and the volume workers.
dev_stop: Arc<AtomicBool>,
/// Eventfd written to wake the disk worker so it observes `dev_stop`.
dev_wake: Option<EventFd>,
/// Keeps the volume devices alive.
_volume_blks: Vec<Arc<VirtioBlk>>,
/// Worker threads that service volume queue notifications.
volume_threads: Vec<std::thread::JoinHandle<()>>,
/// Eventfds written to wake the volume workers.
volume_wakes: Vec<EventFd>,
/// Transports of the volumes, in attach order; captured in snapshots.
volume_mmios: Vec<Arc<MmioVirtio>>,
/// Attach parameters of the volumes, in the same order as `volume_mmios`.
volume_meta: Vec<VolumeAttach>,
/// TSI token given to the vsock device; stored in snapshots.
tsi_token: Option<[u8; 32]>,
/// Live virtio-fs mounts.
fs_mounts: Vec<FsMount>,
/// Acceptors started by `start_exec_bridge` and `start_tsi_mux`.
bridges: Mutex<Vec<crate::vmm::vsock_mux::Acceptor>>,
/// Per-vCPU states recorded at restore time (empty for a fresh VM); each
/// vCPU thread receives its entry.
vcpu_baselines: Vec<KvmSnapshotState>,
/// Counter shared with the vCPU threads and `RunningVm`.
reset_seq: Arc<AtomicU64>,
// The `reset_*` fields hold the device state a VM was restored from; they are
// `None`/empty for a VM built by `new`.
// NOTE(review): presumably consumed by an in-place reset path outside this
// section - confirm.
reset_intc: Option<KvmDeviceState>,
reset_com1: Option<Com1State>,
reset_blk_mmio: Option<MmioSnapshot>,
reset_vsock_mmio: Option<MmioSnapshot>,
reset_volume_mmios: Vec<MmioSnapshot>,
reset_fs_mmios: Vec<MmioSnapshot>,
}
/// Hands out process-wide sequence numbers, starting at 0, used to make
/// socket file names unique.
fn next_sock_id() -> u64 {
    static NEXT_ID: AtomicU64 = AtomicU64::new(0);
    // Only uniqueness matters here, so relaxed ordering is sufficient.
    let issued = NEXT_ID.fetch_add(1, Ordering::Relaxed);
    issued
}
// SAFETY: NOTE(review) `LinuxVm` is not auto-`Send` because of the raw `host`
// pointer. This impl assumes the guest RAM mapping is owned by the VM and is
// not tied to the creating thread - confirm for the remaining fields.
unsafe impl Send for LinuxVm {}
/// Applies `MADV_HUGEPAGE` to `len` bytes at `ptr`. The result of `madvise`
/// is ignored, so this is purely best-effort.
fn advise_hugepage(ptr: *mut u8, len: usize) {
    let region = ptr as *mut libc::c_void;
    // SAFETY: `madvise` only attaches a hint to the range; callers pass a
    // mapping they have just created.
    unsafe {
        libc::madvise(region, len, libc::MADV_HUGEPAGE);
    }
}
/// Applies `MADV_MERGEABLE` to `len` bytes at `ptr`, unless the
/// `SUPERMACHINE_NO_KSM` environment variable is set. The result of `madvise`
/// is ignored, so this is purely best-effort.
fn advise_mergeable(ptr: *mut u8, len: usize) {
    let ksm_disabled = std::env::var_os("SUPERMACHINE_NO_KSM").is_some();
    if !ksm_disabled {
        let region = ptr as *mut libc::c_void;
        // SAFETY: `madvise` only attaches a hint to the range; callers pass a
        // mapping they have just created.
        unsafe {
            libc::madvise(region, len, libc::MADV_MERGEABLE);
        }
    }
}
/// Opens a block image read-write and attaches it to the VM as a virtio-mmio
/// block device.
///
/// * `vm`, `bus` - VM and MMIO bus the device is wired into.
/// * `host`, `mem_size` - guest RAM mapping the device accesses.
/// * `name`, `path`, `size` - device name, backing image path and size.
/// * `base`, `irq` - MMIO base address and interrupt line of the device.
/// * `dev_stop` - flag the notification worker checks after every wake-up.
///
/// Interrupts are delivered through an irqfd. Unless the `KVM_NO_IOEVENTFD`
/// environment variable is set, queue notifications (writes to `base + 0x050`)
/// are routed to an eventfd that a dedicated worker thread waits on.
///
/// Returns the device, its transport, and - when a worker was started - the
/// worker's join handle and the eventfd that wakes it.
///
/// # Errors
/// Fails if the image cannot be opened or any eventfd / KVM registration fails.
// The many arguments mirror the per-device wiring; bundling them would only
// move the list into a one-off struct.
#[allow(clippy::too_many_arguments)]
fn register_volume_blk(
    vm: &Arc<KvmVm>,
    bus: &Arc<MmioBus>,
    host: *mut u8,
    mem_size: usize,
    name: &str,
    path: &str,
    size: u64,
    base: u64,
    irq: u32,
    dev_stop: &Arc<AtomicBool>,
) -> Result<
    (
        Arc<VirtioBlk>,
        Arc<MmioVirtio>,
        Option<std::thread::JoinHandle<()>>,
        Option<EventFd>,
    ),
    KvmError,
> {
    let blk = Arc::new(VirtioBlk::open_rw(name, path, size)?);
    let gmem = GuestMem::new(host, 0, mem_size);

    // Interrupt delivery: the device writes a clone of the eventfd registered
    // as an irqfd for `irq`.
    let irq_efd = EventFd::new(0)?;
    vm.register_irqfd(&irq_efd, irq)?;
    let irq_efd_dev = irq_efd.try_clone()?;
    let irq_raise: Arc<dyn Fn() + Send + Sync> = Arc::new(move || {
        let _ = irq_efd_dev.write(1);
    });
    let mmio = Arc::new(MmioVirtio::new(blk.clone(), gmem, irq_raise));
    blk.set_irq_raise(mmio.make_used_buffer_irq());
    bus.register(base, mmio.clone());

    let mut thread = None;
    let mut wake = None;
    if std::env::var_os("KVM_NO_IOEVENTFD").is_none() {
        let notify_efd = EventFd::new(0)?;
        vm.register_mmio_ioevent(&notify_efd, base + 0x050)?;
        let notify_rd = notify_efd.try_clone()?;
        let blk_thread = blk.clone();
        let stop = dev_stop.clone();
        thread = Some(std::thread::spawn(move || loop {
            if notify_rd.read().is_err() {
                break;
            }
            // The stop flag is checked after waking so that a write to the
            // wake eventfd terminates the loop.
            if stop.load(Ordering::SeqCst) {
                break;
            }
            blk_thread.notify(0);
        }));
        // The original descriptor is no longer needed here, so it doubles as
        // the wake handle instead of being duplicated again.
        wake = Some(notify_efd);
    }
    Ok((blk, mmio, thread, wake))
}
impl LinuxVm {
/// Builds a fresh VM from `cfg`: allocates guest RAM, loads the kernel,
/// creates the vCPUs and wires up every configured device.
///
/// Guest RAM is an anonymous private `MAP_NORESERVE` mapping registered at
/// guest-physical address 0. For each enabled device a
/// `virtio_mmio.device=...` parameter is appended to a copy of `cfg.cmdline`
/// before the kernel is loaded.
///
/// # Panics
/// Panics if `cfg.num_cpus` is 0.
///
/// # Errors
/// Fails if the KVM VM, the RAM mapping, the IRQ budget check, kernel loading
/// or any device setup fails. The RAM mapping is released on every error path
/// after it has been created.
pub fn new(cfg: &LinuxVmConfig) -> Result<Self, KvmError> {
    assert!(cfg.num_cpus >= 1, "num_cpus must be >= 1");
    let vm = Arc::new(KvmVm::create()?);
    vm.create_pit()?;
    // SAFETY: anonymous mapping with no address hint; the result is checked
    // against `MAP_FAILED` before use.
    let host = unsafe {
        libc::mmap(
            std::ptr::null_mut(),
            cfg.mem_size,
            libc::PROT_READ | libc::PROT_WRITE,
            libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_NORESERVE,
            -1,
            0,
        )
    };
    if host == libc::MAP_FAILED {
        return Err(KvmError::from(std::io::Error::last_os_error()));
    }
    let host = host as *mut u8;
    advise_hugepage(host, cfg.mem_size);
    advise_mergeable(host, cfg.mem_size);
    // SAFETY: `host` is the live `cfg.mem_size`-byte mapping created above; it
    // is unmapped again if registration with the VM fails.
    unsafe {
        if let Err(e) = vm.map_ram(host, 0, cfg.mem_size, crate::hypervisor::prot::RWX) {
            libc::munmap(host as *mut libc::c_void, cfg.mem_size);
            return Err(e);
        }
    }
    // Everything that can fail after the RAM mapping exists runs inside this
    // closure so that a single error path below can release the mapping.
    let assemble = || -> Result<Assembled, KvmError> {
        virtio_irq_budget_ok(cfg.volumes.len(), cfg.virtiofs.len(), cfg.enable_balloon)
            .map_err(|e| KvmError::from(std::io::Error::other(e)))?;

        // --- Kernel command line: advertise every virtio-mmio device. ---
        let mut cmdline = cfg.cmdline.to_string();
        if cfg.disk_path.is_some() {
            cmdline.push_str(&format!(
                " virtio_mmio.device=0x{VIRTIO_LEN:x}@0x{VIRTIO_BASE:x}:{VIRTIO_IRQ}"
            ));
        }
        if cfg.enable_vsock {
            cmdline.push_str(&format!(
                " virtio_mmio.device=0x{VSOCK_LEN:x}@0x{VSOCK_BASE:x}:{VSOCK_IRQ}"
            ));
        }
        for (i, vol) in cfg.volumes.iter().enumerate() {
            let base = VOLUME_BASE + (i as u64) * 0x1000;
            let irq = VOLUME_IRQ_BASE + i as u32;
            // Volumes are named vdb, vdc, ... after the primary disk vda.
            let dev = format!("vd{}", (b'b' + i as u8) as char);
            cmdline.push_str(&format!(" virtio_mmio.device=0x1000@0x{base:x}:{irq}"));
            cmdline.push_str(&format!(" sm.volume=/dev/{dev}:{}", vol.mount));
        }
        for (i, fs) in cfg.virtiofs.iter().enumerate() {
            let base = FS_BASE + (i as u64) * 0x1000;
            let irq = FS_IRQ_BASE + i as u32;
            cmdline.push_str(&format!(
                " virtio_mmio.device=0x{FS_LEN:x}@0x{base:x}:{irq}"
            ));
            cmdline.push_str(&format!(" sm.virtiofs={}:{}", fs.tag, fs.mount));
        }
        if cfg.enable_balloon {
            let irq = balloon_irq(cfg.virtiofs.len());
            cmdline.push_str(&format!(
                " virtio_mmio.device=0x{BALLOON_LEN:x}@0x{BALLOON_BASE:x}:{irq}"
            ));
        }
        if let Some(token) = cfg.tsi_token.as_ref() {
            crate::cli::append_tsi_token_cmdline(&mut cmdline, &crate::cli::hex_lower(token));
        }

        // --- Boot image and vCPUs. ---
        // SAFETY: `host` is the live `cfg.mem_size`-byte mapping created above.
        let mem = unsafe { std::slice::from_raw_parts_mut(host, cfg.mem_size) };
        let boot_cfg = crate::hypervisor::LinuxBootConfig {
            kernel: cfg.kernel,
            initrd: cfg.initrd,
            cmdline: &cmdline,
            ram_gpa: 0,
            ram_size: cfg.mem_size,
            fdt: None,
        };
        let mut vcpus = Vec::with_capacity(cfg.num_cpus as usize);
        let bsp = vm.create_vcpu()?;
        vm.boot_linux(&bsp, mem, &boot_cfg)?;
        vcpus.push(bsp);
        if cfg.num_cpus > 1 {
            mptable::write_mptable(mem, cfg.num_cpus)
                .map_err(|e| KvmError::from(std::io::Error::other(e.to_string())))?;
        }
        // Every vCPU after the first is parked until it receives a SIPI.
        for _ in 1..cfg.num_cpus {
            let vcpu = vm.create_vcpu()?;
            vcpu.park_for_sipi()?;
            vcpus.push(vcpu);
        }

        let bus = Arc::new(MmioBus::new());

        // --- Primary disk (vda): irqfd for interrupts, optional ioeventfd worker. ---
        let mut blk_keep = None;
        let mut blk_mmio_keep = None;
        let mut disk_keep = None;
        let mut dev_thread = None;
        let mut dev_wake = None;
        let dev_stop = Arc::new(AtomicBool::new(false));
        if let Some(disk) = cfg.disk_path {
            let blk = Arc::new(VirtioBlk::open_rw("vda", disk, cfg.disk_size)?);
            let gmem = GuestMem::new(host, 0, cfg.mem_size);
            let irq_efd = EventFd::new(0)?;
            vm.register_irqfd(&irq_efd, VIRTIO_IRQ)?;
            let irq_efd_dev = irq_efd.try_clone()?;
            let irq_raise: Arc<dyn Fn() + Send + Sync> = Arc::new(move || {
                let _ = irq_efd_dev.write(1);
            });
            let mmio = Arc::new(MmioVirtio::new(blk.clone(), gmem, irq_raise));
            blk.set_irq_raise(mmio.make_used_buffer_irq());
            bus.register(VIRTIO_BASE, mmio.clone());
            blk_mmio_keep = Some(mmio);
            disk_keep = Some((disk.to_string(), cfg.disk_size));
            let use_ioeventfd = std::env::var_os("KVM_NO_IOEVENTFD").is_none();
            if use_ioeventfd {
                let notify_efd = EventFd::new(0)?;
                vm.register_mmio_ioevent(&notify_efd, VIRTIO_QUEUE_NOTIFY)?;
                let notify_rd = notify_efd.try_clone()?;
                let blk_thread = blk.clone();
                let stop = dev_stop.clone();
                dev_thread = Some(std::thread::spawn(move || loop {
                    if notify_rd.read().is_err() {
                        break;
                    }
                    // Checked after waking so a write to the wake eventfd
                    // terminates the loop.
                    if stop.load(Ordering::SeqCst) {
                        break;
                    }
                    blk_thread.notify(0);
                }));
                // The original descriptor doubles as the wake handle.
                dev_wake = Some(notify_efd);
            }
            blk_keep = Some(blk);
        }

        // --- vsock: interrupts are pulsed by raising then lowering the line. ---
        let mut vsock_keep = None;
        let mut vsock_mmio_keep = None;
        if cfg.enable_vsock {
            let vsock = Arc::new(
                Vsock::with_tsi_token(GUEST_CID, cfg.tsi_token)
                    .map_err(|e| KvmError::from(std::io::Error::other(format!("{e:?}"))))?,
            );
            let gmem = GuestMem::new(host, 0, cfg.mem_size);
            let vm_irq = vm.clone();
            let irq_raise: Arc<dyn Fn() + Send + Sync> = Arc::new(move || {
                let _ = vm_irq.set_irq(VSOCK_IRQ, true);
                let _ = vm_irq.set_irq(VSOCK_IRQ, false);
            });
            let mmio = Arc::new(MmioVirtio::new(vsock.clone(), gmem, irq_raise));
            vsock.set_irq_raise(mmio.make_used_buffer_irq());
            let vsock_for_kick = vsock.clone();
            vsock
                .muxer()
                .set_kick(Arc::new(move || vsock_for_kick.kick()));
            bus.register(VSOCK_BASE, mmio.clone());
            vsock_mmio_keep = Some(mmio);
            vsock_keep = Some(vsock);
        }

        // --- virtio-fs mounts, each with its own DAX window. ---
        let mut fs_mounts: Vec<FsMount> = Vec::with_capacity(cfg.virtiofs.len());
        for (i, fsm) in cfg.virtiofs.iter().enumerate() {
            let base = FS_BASE + (i as u64) * 0x1000;
            let irq = FS_IRQ_BASE + i as u32;
            let dax_gpa = FS_DAX_BASE + (i as u64) * FS_DAX_WINDOW_LEN;
            let backend: Arc<dyn crate::fuse::FsBackend> =
                Arc::new(crate::fuse::PosixFs::new(&fsm.host_path).map_err(|e| {
                    KvmError::from(std::io::Error::other(format!(
                        "virtio-fs root {}: {e}",
                        fsm.host_path
                    )))
                })?);
            let fs_dev = Arc::new(VirtioFs::with_backend(
                VirtioFsConfig {
                    tag: fsm.tag.clone(),
                    num_request_queues: 1,
                    dax_window_gpa: dax_gpa,
                    dax_window_len: FS_DAX_WINDOW_LEN,
                },
                backend.clone(),
            ));
            let mapper: Arc<dyn crate::fuse::HvfMapper> =
                Arc::new(KvmDaxMapper { vm: vm.clone() });
            let session = Arc::new(crate::fuse::DaxSession::new(
                dax_gpa,
                FS_DAX_WINDOW_LEN,
                backend.clone(),
                mapper,
            ));
            // A poisoned lock is recovered rather than propagated.
            fs_dev
                .fuse_server()
                .lock()
                .unwrap_or_else(|e| e.into_inner())
                .set_dax(session.clone());
            let gmem = GuestMem::new(host, 0, cfg.mem_size);
            let vm_irq = vm.clone();
            let irq_raise: Arc<dyn Fn() + Send + Sync> = Arc::new(move || {
                let _ = vm_irq.set_irq(irq, true);
                let _ = vm_irq.set_irq(irq, false);
            });
            let mmio = Arc::new(MmioVirtio::new(fs_dev.clone(), gmem, irq_raise));
            fs_dev.set_irq_raise(mmio.make_used_buffer_irq());
            bus.register(base, mmio.clone());
            fs_mounts.push(FsMount {
                mmio,
                backend,
                dax: session,
                host_path: fsm.host_path.clone(),
                tag: fsm.tag.clone(),
                mount: fsm.mount.clone(),
                dax_gpa,
                dax_window_len: FS_DAX_WINDOW_LEN,
            });
        }

        // --- Data volumes (vdb, vdc, ...). ---
        let mut volume_blks = Vec::with_capacity(cfg.volumes.len());
        let mut volume_mmios = Vec::with_capacity(cfg.volumes.len());
        let mut volume_meta = Vec::with_capacity(cfg.volumes.len());
        let mut volume_threads = Vec::new();
        let mut volume_wakes = Vec::new();
        for (i, vol) in cfg.volumes.iter().enumerate() {
            let base = VOLUME_BASE + (i as u64) * 0x1000;
            let irq = VOLUME_IRQ_BASE + i as u32;
            let name = format!("vd{}", (b'b' + i as u8) as char);
            let (blk, mmio, thread, wake) = register_volume_blk(
                &vm,
                &bus,
                host,
                cfg.mem_size,
                &name,
                &vol.path,
                vol.size,
                base,
                irq,
                &dev_stop,
            )?;
            volume_blks.push(blk);
            volume_mmios.push(mmio);
            volume_meta.push(vol.clone());
            if let Some(t) = thread {
                volume_threads.push(t);
            }
            if let Some(w) = wake {
                volume_wakes.push(w);
            }
        }

        // --- Balloon: takes the interrupt line right after the fs range. ---
        let mut balloon_keep = None;
        let mut balloon_mmio_keep = None;
        if cfg.enable_balloon {
            let balloon = Arc::new(crate::devices::virtio::balloon::VirtioBalloon::new());
            let balloon_dev = Arc::new(crate::devices::virtio::balloon::VirtioBalloonWithRam {
                inner: balloon.clone(),
                ram_host: host,
                ram_size: cfg.mem_size,
                ram_gpa: 0,
            });
            let irq = balloon_irq(cfg.virtiofs.len());
            let vm_irq = vm.clone();
            let irq_raise: Arc<dyn Fn() + Send + Sync> = Arc::new(move || {
                let _ = vm_irq.set_irq(irq, true);
                let _ = vm_irq.set_irq(irq, false);
            });
            let gmem = GuestMem::new(host, 0, cfg.mem_size);
            let mmio = Arc::new(MmioVirtio::new(balloon_dev, gmem, irq_raise));
            balloon.set_irq_raise(mmio.make_used_buffer_irq());
            balloon.set_config_irq_raise(mmio.make_config_change_irq());
            bus.register(BALLOON_BASE, mmio.clone());
            balloon_mmio_keep = Some(mmio);
            balloon_keep = Some(balloon);
        }

        Ok(Assembled {
            vcpus,
            bus,
            balloon: balloon_keep,
            balloon_mmio: balloon_mmio_keep,
            blk: blk_keep,
            blk_mmio: blk_mmio_keep,
            disk: disk_keep,
            vsock: vsock_keep,
            vsock_mmio: vsock_mmio_keep,
            dev_thread,
            dev_stop,
            dev_wake,
            volume_blks,
            volume_mmios,
            volume_meta,
            volume_threads,
            volume_wakes,
            fs_mounts,
        })
    };
    let a = match assemble() {
        Ok(parts) => parts,
        Err(e) => {
            // SAFETY: `host` is the mapping created above and is not used
            // again on this path.
            unsafe { libc::munmap(host as *mut libc::c_void, cfg.mem_size) };
            return Err(e);
        }
    };
    // A fresh VM has no restore baseline, so all `reset_*` state is empty.
    Ok(LinuxVm {
        vm,
        vcpus: a.vcpus,
        bus: a.bus,
        com1: Arc::new(Mutex::new(Com1::new())),
        host,
        mem_size: cfg.mem_size,
        _blk: a.blk,
        blk_mmio: a.blk_mmio,
        disk: a.disk,
        vsock: a.vsock,
        vsock_mmio: a.vsock_mmio,
        balloon: a.balloon,
        balloon_mmio: a.balloon_mmio,
        dev_thread: a.dev_thread,
        dev_stop: a.dev_stop,
        dev_wake: a.dev_wake,
        _volume_blks: a.volume_blks,
        volume_threads: a.volume_threads,
        volume_wakes: a.volume_wakes,
        volume_mmios: a.volume_mmios,
        volume_meta: a.volume_meta,
        tsi_token: cfg.tsi_token,
        fs_mounts: a.fs_mounts,
        bridges: Mutex::new(Vec::new()),
        vcpu_baselines: Vec::new(),
        reset_seq: Arc::new(AtomicU64::new(0)),
        reset_intc: None,
        reset_com1: None,
        reset_blk_mmio: None,
        reset_vsock_mmio: None,
        reset_volume_mmios: Vec::new(),
        reset_fs_mmios: Vec::new(),
    })
}
/// Asks the balloon device to inflate by `pages`.
///
/// Returns `true` when a balloon device exists and the request was forwarded,
/// `false` when the VM has no balloon.
pub fn request_balloon_inflate(&self, pages: u32) -> bool {
    let Some(balloon) = self.balloon.as_ref() else {
        return false;
    };
    balloon.request_inflate(pages);
    true
}
#[allow(clippy::type_complexity)]
fn spawn_vcpus(
&mut self,
stop: Arc<AtomicBool>,
snapshot_req: Arc<AtomicBool>,
exits: Arc<AtomicU64>,
count_exits: bool,
) -> (
Vec<std::thread::JoinHandle<(ExitReason, Option<KvmSnapshotState>)>>,
Vec<KvmVcpuHandle>,
) {
self.spawn_vcpus_paused(stop, snapshot_req, exits, count_exits, None)
}
/// Moves every vCPU out of `self` and starts a thread running `run_vcpu` for it.
///
/// Each thread receives clones of the shared VM state (`vm`, `bus`, `com1`),
/// the control flags (`stop`, `snapshot_req`), the exit counter, the full list
/// of exit tokens, the optional pause coordinator, its vCPU index, its restore
/// baseline if one exists, and the reset sequence counter.
///
/// Returns the join handles in vCPU order together with the exit tokens.
/// After this call `self.vcpus` is empty.
// The tuple return type is spelled out once here instead of behind an alias.
#[allow(clippy::type_complexity)]
fn spawn_vcpus_paused(
    &mut self,
    stop: Arc<AtomicBool>,
    snapshot_req: Arc<AtomicBool>,
    exits: Arc<AtomicU64>,
    count_exits: bool,
    pause: Option<Arc<PauseCoord>>,
) -> (
    Vec<std::thread::JoinHandle<(ExitReason, Option<KvmSnapshotState>)>>,
    Vec<KvmVcpuHandle>,
) {
    // Tokens must be collected before the vCPUs are moved into their threads.
    let handles: Vec<KvmVcpuHandle> = self
        .vcpus
        .iter()
        .map(|vcpu| vcpu.exit_token())
        .collect();
    let owned_vcpus = std::mem::take(&mut self.vcpus);
    let threads: Vec<_> = owned_vcpus
        .into_iter()
        .enumerate()
        .map(|(idx, vcpu)| {
            let vm = self.vm.clone();
            let bus = self.bus.clone();
            let com1 = self.com1.clone();
            let stop = stop.clone();
            let snapshot_req = snapshot_req.clone();
            let exits = exits.clone();
            let peers = handles.clone();
            let pause = pause.clone();
            let baseline = self.vcpu_baselines.get(idx).cloned();
            let reset_seq = self.reset_seq.clone();
            std::thread::spawn(move || {
                run_vcpu(
                    vcpu,
                    vm,
                    bus,
                    com1,
                    stop,
                    snapshot_req,
                    exits,
                    count_exits,
                    peers,
                    pause,
                    idx,
                    baseline,
                    reset_seq,
                )
            })
        })
        .collect();
    (threads, handles)
}
/// Runs all vCPUs to completion and reports why vCPU 0 stopped.
///
/// When `KVM_DEBUG_VCPU` is set, each vCPU's exit reason is printed to stderr.
/// When `KVM_COUNT_EXITS` is set, the exit counter is printed afterwards, split
/// into its low and high 32 bits. A panicked vCPU thread is reported as
/// `ExitReason::Unknown`.
pub fn run(&mut self) -> Result<ExitReason, KvmError> {
    let count_exits = std::env::var_os("KVM_COUNT_EXITS").is_some();
    let exit_counter = Arc::new(AtomicU64::new(0));
    let stop_flag = Arc::new(AtomicBool::new(false));
    let snapshot_flag = Arc::new(AtomicBool::new(false));
    let (threads, _handles) =
        self.spawn_vcpus(stop_flag, snapshot_flag, exit_counter.clone(), count_exits);

    let mut bsp_reason = ExitReason::Unknown("no vcpus".into());
    for (cpu, thread) in threads.into_iter().enumerate() {
        let reason = match thread.join() {
            Ok((reason, _snap)) => reason,
            Err(_) => ExitReason::Unknown("vcpu thread panicked".into()),
        };
        if std::env::var_os("KVM_DEBUG_VCPU").is_some() {
            eprintln!("[vcpu {cpu}] exit: {reason:?}");
        }
        // Only the first vCPU's reason is reported to the caller.
        if cpu == 0 {
            bsp_reason = reason;
        }
    }

    if count_exits {
        let packed = exit_counter.load(Ordering::SeqCst);
        let notify_exits = packed & 0xffff_ffff;
        let other_exits = packed >> 32;
        eprintln!(
            "[kvm] virtio-notify vCPU exits: {} | other device exits: {}",
            notify_exits, other_exits
        );
    }
    Ok(bsp_reason)
}
/// Runs the VM for `after`, then stops every vCPU and captures a snapshot.
///
/// A timer thread sleeps for `after`, sets the snapshot request flag and
/// forces all vCPUs to exit; the calling thread meanwhile waits for the vCPU
/// threads and assembles the snapshot via `capture_quiesced`.
pub fn snapshot_after(&mut self, after: std::time::Duration) -> Result<VmSnapshot, KvmError> {
    let stop_flag = Arc::new(AtomicBool::new(false));
    let snapshot_req = Arc::new(AtomicBool::new(false));
    let exit_counter = Arc::new(AtomicU64::new(0));
    let (threads, handles) =
        self.spawn_vcpus(stop_flag, snapshot_req.clone(), exit_counter, false);
    let ncpus = handles.len();

    let timer = {
        let targets = handles.clone();
        let request = snapshot_req.clone();
        std::thread::spawn(move || {
            std::thread::sleep(after);
            request.store(true, Ordering::SeqCst);
            KvmVcpuHandle::force_exit(&targets);
        })
    };

    let captured = self.capture_quiesced(threads, ncpus);
    let _ = timer.join();
    captured
}
/// Waits for all vCPU threads, stops the block-device workers and captures a
/// snapshot.
///
/// `threads` are the vCPU join handles and `ncpus` the number of vCPUs that
/// must each deliver a state.
///
/// # Errors
/// Fails if any vCPU thread finished without a snapshot state (including a
/// panicked thread), or if capturing device state fails.
fn capture_quiesced(
    &mut self,
    threads: Vec<std::thread::JoinHandle<(ExitReason, Option<KvmSnapshotState>)>>,
    ncpus: usize,
) -> Result<VmSnapshot, KvmError> {
    // A thread that panicked contributes no state, like one that exited early.
    let vcpu_states: Vec<KvmSnapshotState> = threads
        .into_iter()
        .filter_map(|thread| match thread.join() {
            Ok((_reason, state)) => state,
            Err(_) => None,
        })
        .collect();
    if vcpu_states.len() != ncpus {
        return Err(KvmError::from(std::io::Error::other(
            "a vCPU stopped before the snapshot trigger",
        )));
    }

    // Stop the primary-disk worker: set the flag, wake it, wait for it.
    if let Some(worker) = self.dev_thread.take() {
        self.dev_stop.store(true, Ordering::SeqCst);
        if let Some(wake) = self.dev_wake.as_ref() {
            let _ = wake.write(1);
        }
        let _ = worker.join();
    }

    // Same for the volume workers, which share the stop flag.
    if !self.volume_threads.is_empty() {
        self.dev_stop.store(true, Ordering::SeqCst);
        self.volume_wakes.iter().for_each(|wake| {
            let _ = wake.write(1);
        });
        for worker in self.volume_threads.drain(..) {
            let _ = worker.join();
        }
    }

    self.capture_with_states(vcpu_states)
}
/// Assembles a `VmSnapshot` from the given vCPU states plus the current state
/// of every device and a full copy of guest RAM.
///
/// A virtio-fs backend whose `snapshot_state` fails is recorded with empty
/// state rather than failing the snapshot.
///
/// # Errors
/// Fails if the VM's device state cannot be captured.
fn capture_with_states(
    &self,
    vcpu_states: Vec<KvmSnapshotState>,
) -> Result<VmSnapshot, KvmError> {
    let ncpus = vcpu_states.len();
    let devices = self.vm.capture_devices()?;
    let com1 = lock_recover(&self.com1).snapshot();

    // The disk is recorded only when both its transport and metadata exist.
    let disk = self
        .blk_mmio
        .as_ref()
        .zip(self.disk.as_ref())
        .map(|(mmio, (path, size))| DiskSnap {
            path: path.clone(),
            size: *size,
            mmio: mmio.capture_state(),
        });

    let vsock = match &self.vsock_mmio {
        Some(mmio) => Some(mmio.capture_state()),
        None => None,
    };
    let vsock_listeners = match &self.vsock {
        Some(dev) => dev.muxer().capture_tsi_listeners(),
        None => Default::default(),
    };

    let volumes = self
        .volume_mmios
        .iter()
        .zip(&self.volume_meta)
        .map(|(mmio, meta)| VolumeSnap {
            path: meta.path.clone(),
            size: meta.size,
            mount: meta.mount.clone(),
            mmio: mmio.capture_state(),
        })
        .collect();

    let mut virtiofs: Vec<VirtioFsSnap> = Vec::with_capacity(self.fs_mounts.len());
    for mount in &self.fs_mounts {
        virtiofs.push(VirtioFsSnap {
            host_path: mount.host_path.clone(),
            tag: mount.tag.clone(),
            mount: mount.mount.clone(),
            dax_gpa: mount.dax_gpa,
            dax_window_len: mount.dax_window_len,
            mmio: mount.mmio.capture_state(),
            backend_state: mount.backend.snapshot_state().unwrap_or_default(),
            dax_state: mount.dax.snapshot_state(),
        });
    }

    let mut ram = vec![0u8; self.mem_size];
    // SAFETY: `self.host` backs `self.mem_size` bytes of guest RAM and `ram`
    // was just allocated with the same length; the two cannot overlap.
    unsafe {
        std::ptr::copy_nonoverlapping(self.host, ram.as_mut_ptr(), self.mem_size);
    }

    Ok(VmSnapshot {
        num_cpus: ncpus as u8,
        mem_size: self.mem_size,
        vcpus: vcpu_states,
        devices,
        com1,
        disk,
        vsock,
        vsock_listeners,
        volumes,
        tsi_token: self.tsi_token,
        virtiofs,
        ram,
    })
}
/// Consumes the VM, starts its vCPU threads with a pause coordinator attached
/// and returns a `RunningVm` that owns the VM, the threads and the shared
/// control handles.
pub fn start_running(mut self) -> RunningVm {
    let stop_flag = Arc::new(AtomicBool::new(false));
    let snapshot_flag = Arc::new(AtomicBool::new(false));
    let exit_counter = Arc::new(AtomicU64::new(0));
    let pause_coord = Arc::new(PauseCoord::default());
    // Cloned up front because `self` is moved into the result below.
    let reset_seq = self.reset_seq.clone();

    let (threads, handles) = self.spawn_vcpus_paused(
        stop_flag.clone(),
        snapshot_flag.clone(),
        exit_counter,
        false,
        Some(pause_coord.clone()),
    );

    RunningVm {
        vm: self,
        threads,
        stop: stop_flag,
        snapshot_req: snapshot_flag,
        handles,
        pause: pause_coord,
        reset_seq,
    }
}
pub fn restore(snap: &VmSnapshot) -> Result<LinuxVm, KvmError> {
let vm = Arc::new(KvmVm::create()?);
vm.create_pit()?;
let host = unsafe {
libc::mmap(
std::ptr::null_mut(),
snap.mem_size,
libc::PROT_READ | libc::PROT_WRITE,
libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_NORESERVE,
-1,
0,
)
};
if host == libc::MAP_FAILED {
return Err(KvmError::from(std::io::Error::last_os_error()));
}
let host = host as *mut u8;
advise_hugepage(host, snap.mem_size);
advise_mergeable(host, snap.mem_size);
unsafe { std::ptr::copy_nonoverlapping(snap.ram.as_ptr(), host, snap.mem_size) };
unsafe {
if let Err(e) = vm.map_ram(host, 0, snap.mem_size, crate::hypervisor::prot::RWX) {
libc::munmap(host as *mut libc::c_void, snap.mem_size);
return Err(e);
}
}
Self::finish_restore(
vm,
host,
snap.mem_size,
&snap.vcpus,
&snap.devices,
&snap.com1,
&snap.disk,
&snap.vsock,
&snap.vsock_listeners,
&snap.volumes,
snap.tsi_token,
&snap.virtiofs,
)
}
/// Rebuilds a VM from a snapshot file.
///
/// A file starting with the magic `SMSNAP7D` is handed to
/// `restore_diff_from_file`. Any other file is parsed with `read_meta` and its
/// RAM section is mapped via `cow_map_ram`, so guest RAM is not copied up front.
///
/// # Errors
/// Fails on I/O or parse errors, if the VM or RAM mapping cannot be created,
/// or if `finish_restore` fails. The RAM mapping is released on every error
/// path after it has been created.
pub fn restore_from_file(path: &std::path::Path) -> Result<LinuxVm, KvmError> {
    let mut f = std::fs::File::open(path)?;
    let mut magic = [0u8; 8];
    f.read_exact(&mut magic)?;
    if &magic == b"SMSNAP7D" {
        return Self::restore_diff_from_file(path);
    }
    // Not a differential snapshot: rewind so `read_meta` sees the whole header.
    f.seek(SeekFrom::Start(0))?;
    let meta = read_meta(&mut f)?;
    let vm = Arc::new(KvmVm::create()?);
    vm.create_pit()?;
    let host = crate::snapshot_frame::cow_map_ram(
        &f,
        meta.ram_offset,
        meta.mem_size,
        libc::MAP_NORESERVE,
    )?;
    advise_mergeable(host, meta.mem_size);
    // SAFETY: `host` is the live mapping returned above; it is unmapped again
    // if registration with the VM fails.
    unsafe {
        if let Err(e) = vm.map_ram(host, 0, meta.mem_size, crate::hypervisor::prot::RWX) {
            libc::munmap(host as *mut libc::c_void, meta.mem_size);
            return Err(e);
        }
    }
    let restored = Self::finish_restore(
        vm,
        host,
        meta.mem_size,
        &meta.vcpus,
        &meta.devices,
        &meta.com1,
        &meta.disk,
        &meta.vsock,
        &meta.vsock_listeners,
        &meta.volumes,
        meta.tsi_token,
        &meta.virtiofs,
    );
    if restored.is_err() {
        // No `LinuxVm` took ownership of the mapping, so release it here, as
        // `new` does when assembly fails.
        // SAFETY: `host` is the mapping created above and is not used again.
        unsafe { libc::munmap(host as *mut libc::c_void, meta.mem_size) };
    }
    restored
}
/// Rebuilds a VM from a differential snapshot file.
///
/// Layout read here: the 8-byte magic `SMSNAP7D`, a `u32` length followed by
/// the UTF-8 path of the base snapshot, the metadata body, a `u32` count of
/// changed pages, then for each page a `u32` page index followed by 4096 bytes
/// of page data. The base snapshot's RAM is mapped via `cow_map_ram` and the
/// changed pages are written on top of it.
///
/// # Errors
/// Fails on I/O or parse errors, a wrong magic, a non-UTF-8 base path, a
/// `mem_size` mismatch between diff and base, an out-of-range page index, or
/// if VM creation or `finish_restore` fails. The RAM mapping is released on
/// every error path after it has been created.
fn restore_diff_from_file(path: &std::path::Path) -> Result<LinuxVm, KvmError> {
    const PG: usize = 4096;
    let mut f = std::fs::File::open(path)?;
    let mut magic = [0u8; 8];
    f.read_exact(&mut magic)?;
    if &magic != b"SMSNAP7D" {
        return Err(KvmError::from(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            "not a differential snapshot",
        )));
    }
    let bp_len = read_u32(&mut f)? as usize;
    let mut bp = vec![0u8; bp_len];
    f.read_exact(&mut bp)?;
    let base_path = String::from_utf8(bp).map_err(|_| {
        KvmError::from(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            "diff base path not utf8",
        ))
    })?;
    let meta = read_meta_body(&mut f)?;
    let mut bf = std::fs::File::open(&base_path)?;
    let base_meta = read_meta(&mut bf)?;
    if base_meta.mem_size != meta.mem_size {
        return Err(KvmError::from(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            "diff/base mem_size mismatch",
        )));
    }
    let mem_size = base_meta.mem_size;
    let host = crate::snapshot_frame::cow_map_ram(
        &bf,
        base_meta.ram_offset,
        mem_size,
        libc::MAP_NORESERVE,
    )?;
    advise_mergeable(host, mem_size);

    // Every fallible step after the mapping exists runs inside this closure so
    // that one error path below can release the mapping.
    let mut build = || -> Result<LinuxVm, KvmError> {
        let num_changed = read_u32(&mut f)?;
        for _ in 0..num_changed {
            let idx = read_u32(&mut f)? as usize;
            // Checked arithmetic so a hostile index cannot wrap into range.
            let off = match idx.checked_mul(PG) {
                Some(off) if off.checked_add(PG).is_some_and(|end| end <= mem_size) => off,
                _ => {
                    return Err(KvmError::from(std::io::Error::new(
                        std::io::ErrorKind::InvalidData,
                        "diff page index out of range",
                    )))
                }
            };
            let mut page = [0u8; PG];
            f.read_exact(&mut page)?;
            // SAFETY: `off + PG <= mem_size` was checked above, so the
            // destination lies inside the mapping; `page` is a local buffer.
            unsafe { std::ptr::copy_nonoverlapping(page.as_ptr(), host.add(off), PG) };
        }
        let vm = Arc::new(KvmVm::create()?);
        vm.create_pit()?;
        // SAFETY: `host` is the live `mem_size`-byte mapping created above.
        unsafe { vm.map_ram(host, 0, mem_size, crate::hypervisor::prot::RWX) }?;
        Self::finish_restore(
            vm,
            host,
            mem_size,
            &meta.vcpus,
            &meta.devices,
            &meta.com1,
            &meta.disk,
            &meta.vsock,
            &meta.vsock_listeners,
            &meta.volumes,
            meta.tsi_token,
            &meta.virtiofs,
        )
    };
    let restored = build();
    if restored.is_err() {
        // No `LinuxVm` took ownership of the mapping, so release it here.
        // SAFETY: `host` is the mapping created above and is not used again.
        unsafe { libc::munmap(host as *mut libc::c_void, mem_size) };
    }
    restored
}
fn finish_restore(
vm: Arc<KvmVm>,
host: *mut u8,
mem_size: usize,
vcpu_states: &[KvmSnapshotState],
devices: &KvmDeviceState,
com1_state: &Com1State,
disk: &Option<DiskSnap>,
vsock: &Option<MmioSnapshot>,
vsock_listeners: &[TsiListenerSnapshot],
volumes: &[VolumeSnap],
tsi_token: Option<[u8; 32]>,
virtiofs: &[VirtioFsSnap],
) -> Result<LinuxVm, KvmError> {
vm.restore_devices(devices)?;
let mut vcpus = Vec::with_capacity(vcpu_states.len());
for st in vcpu_states {
let vcpu = vm.create_vcpu()?;
vcpu.restore_snapshot(st)?;
vcpus.push(vcpu);
}
let mut com1 = Com1::new();
com1.restore(com1_state);
let bus = Arc::new(MmioBus::new());
let dev_stop = Arc::new(AtomicBool::new(false));
let mut blk_keep = None;
let mut blk_mmio_keep = None;
let mut disk_keep = None;
let mut dev_thread = None;
let mut dev_wake = None;
if let Some(d) = disk {
let blk = Arc::new(VirtioBlk::open_rw("vda", &d.path, d.size)?);
let gmem = GuestMem::new(host, 0, mem_size);
let irq_efd = EventFd::new(0)?;
vm.register_irqfd(&irq_efd, VIRTIO_IRQ)?;
let irq_efd_dev = irq_efd.try_clone()?;
let irq_raise: Arc<dyn Fn() + Send + Sync> = Arc::new(move || {
let _ = irq_efd_dev.write(1);
});
let mmio = Arc::new(MmioVirtio::new(blk.clone(), gmem, irq_raise));
blk.set_irq_raise(mmio.make_used_buffer_irq());
mmio.restore_state(&d.mmio);
bus.register(VIRTIO_BASE, mmio.clone());
let notify_efd = EventFd::new(0)?;
vm.register_mmio_ioevent(¬ify_efd, VIRTIO_QUEUE_NOTIFY)?;
let notify_rd = notify_efd.try_clone()?;
let blk_thread = blk.clone();
let stop = dev_stop.clone();
dev_thread = Some(std::thread::spawn(move || loop {
if notify_rd.read().is_err() {
break;
}
if stop.load(Ordering::SeqCst) {
break;
}
blk_thread.notify(0);
}));
dev_wake = Some(notify_efd.try_clone()?);
blk.notify(0);
disk_keep = Some((d.path.clone(), d.size));
blk_mmio_keep = Some(mmio);
blk_keep = Some(blk);
}
let mut vsock_keep = None;
let mut vsock_mmio_keep = None;
if let Some(vmmio) = vsock {
let vsock_dev = Arc::new(
Vsock::with_tsi_token(GUEST_CID, tsi_token)
.map_err(|e| KvmError::from(std::io::Error::other(format!("{e:?}"))))?,
);
let gmem = GuestMem::new(host, 0, mem_size);
let vm_irq = vm.clone();
let irq_raise: Arc<dyn Fn() + Send + Sync> = Arc::new(move || {
let _ = vm_irq.set_irq(VSOCK_IRQ, true);
let _ = vm_irq.set_irq(VSOCK_IRQ, false);
});
let mmio = Arc::new(MmioVirtio::new(vsock_dev.clone(), gmem, irq_raise));
vsock_dev.set_irq_raise(mmio.make_used_buffer_irq());
let vsock_for_kick = vsock_dev.clone();
vsock_dev
.muxer()
.set_kick(Arc::new(move || vsock_for_kick.kick()));
mmio.restore_state(vmmio);
bus.register(VSOCK_BASE, mmio.clone());
vsock_dev.muxer().restore_tsi_listeners(vsock_listeners);
vsock_mmio_keep = Some(mmio);
vsock_keep = Some(vsock_dev);
}
let mut volume_blks = Vec::with_capacity(volumes.len());
let mut volume_mmios = Vec::with_capacity(volumes.len());
let mut volume_meta = Vec::with_capacity(volumes.len());
let mut volume_threads = Vec::new();
let mut volume_wakes = Vec::new();
for (i, v) in volumes.iter().enumerate() {
let base = VOLUME_BASE + (i as u64) * 0x1000;
let irq = VOLUME_IRQ_BASE + i as u32;
let name = format!("vd{}", (b'b' + i as u8) as char);
let (blk, mmio, thread, wake) = register_volume_blk(
&vm, &bus, host, mem_size, &name, &v.path, v.size, base, irq, &dev_stop,
)?;
mmio.restore_state(&v.mmio);
blk.notify(0);
volume_blks.push(blk);
volume_mmios.push(mmio);
volume_meta.push(VolumeAttach {
path: v.path.clone(),
size: v.size,
mount: v.mount.clone(),
});
if let Some(t) = thread {
volume_threads.push(t);
}
if let Some(w) = wake {
volume_wakes.push(w);
}
}
let mut fs_mounts: Vec<FsMount> = Vec::with_capacity(virtiofs.len());
for (i, f) in virtiofs.iter().enumerate() {
let base = FS_BASE + (i as u64) * 0x1000;
let irq = FS_IRQ_BASE + i as u32;
let backend: Arc<dyn crate::fuse::FsBackend> =
Arc::new(crate::fuse::PosixFs::new(&f.host_path).map_err(|e| {
KvmError::from(std::io::Error::other(format!(
"virtio-fs root {}: {e}",
f.host_path
)))
})?);
if !f.backend_state.is_empty() {
backend.restore_state(&f.backend_state).map_err(|e| {
KvmError::from(std::io::Error::other(format!(
"virtio-fs backend restore {}: {e}",
f.host_path
)))
})?;
}
let fs_dev = Arc::new(VirtioFs::with_backend(
VirtioFsConfig {
tag: f.tag.clone(),
num_request_queues: 1,
dax_window_gpa: f.dax_gpa,
dax_window_len: f.dax_window_len,
},
backend.clone(),
));
let mapper: Arc<dyn crate::fuse::HvfMapper> = Arc::new(KvmDaxMapper { vm: vm.clone() });
let session = Arc::new(crate::fuse::DaxSession::new(
f.dax_gpa,
f.dax_window_len,
backend.clone(),
mapper,
));
if !f.dax_state.is_empty() {
session.restore_state(&f.dax_state).map_err(|e| {
KvmError::from(std::io::Error::other(format!(
"virtio-fs dax restore {}: {e}",
f.host_path
)))
})?;
}
fs_dev
.fuse_server()
.lock()
.unwrap_or_else(|e| e.into_inner())
.set_dax(session.clone());
let gmem = GuestMem::new(host, 0, mem_size);
let vm_irq = vm.clone();
let irq_raise: Arc<dyn Fn() + Send + Sync> = Arc::new(move || {
let _ = vm_irq.set_irq(irq, true);
let _ = vm_irq.set_irq(irq, false);
});
let mmio = Arc::new(MmioVirtio::new(fs_dev.clone(), gmem, irq_raise));
fs_dev.set_irq_raise(mmio.make_used_buffer_irq());
mmio.restore_state(&f.mmio);
bus.register(base, mmio.clone());
session.rebind_all().map_err(|e| {
KvmError::from(std::io::Error::other(format!(
"virtio-fs dax rebind {}: errno {e}",
f.host_path
)))
})?;
fs_mounts.push(FsMount {
mmio,
backend,
dax: session,
host_path: f.host_path.clone(),
tag: f.tag.clone(),
mount: f.mount.clone(),
dax_gpa: f.dax_gpa,
dax_window_len: f.dax_window_len,
});
}
Ok(LinuxVm {
vm,
vcpus,
bus,
com1: Arc::new(Mutex::new(com1)),
host,
mem_size,
_blk: blk_keep,
blk_mmio: blk_mmio_keep,
disk: disk_keep,
vsock: vsock_keep,
vsock_mmio: vsock_mmio_keep,
balloon: None,
balloon_mmio: None,
dev_thread,
dev_stop,
dev_wake,
_volume_blks: volume_blks,
volume_threads,
volume_wakes,
volume_mmios,
volume_meta,
tsi_token,
fs_mounts,
bridges: Mutex::new(Vec::new()),
vcpu_baselines: vcpu_states.to_vec(),
reset_seq: Arc::new(AtomicU64::new(0)),
reset_intc: Some(devices.clone()),
reset_com1: Some(*com1_state),
reset_blk_mmio: disk.as_ref().map(|d| d.mmio.clone()),
reset_vsock_mmio: vsock.clone(),
reset_volume_mmios: volumes.iter().map(|v| v.mmio.clone()).collect(),
reset_fs_mmios: virtiofs.iter().map(|f| f.mmio.clone()).collect(),
})
}
/// Returns a `VsockHandle` sharing this VM's vsock device, or `None` when the
/// VM has no vsock device.
pub fn vsock_handle(&self) -> Option<VsockHandle> {
    let shared = self.vsock.as_ref()?;
    Some(VsockHandle {
        vsock: shared.clone(),
    })
}
/// Starts a Unix-socket listener whose connections are forwarded to
/// `guest_port` over vsock, and returns the socket path.
///
/// The socket lives in the temp directory and is named after the process id
/// and a unique sequence number. The accept loop runs on its own thread and is
/// registered in `self.bridges`.
///
/// # Errors
/// Returns `NotConnected` when the VM has no vsock device, or the bind error.
pub fn start_exec_bridge(&self, guest_port: u32) -> std::io::Result<std::path::PathBuf> {
    let Some(vsock) = self.vsock.clone() else {
        return Err(std::io::Error::new(
            std::io::ErrorKind::NotConnected,
            "vsock not enabled",
        ));
    };
    let file_name = format!(
        "sm-kvm-exec-{}-{}.sock",
        std::process::id(),
        next_sock_id()
    );
    let path = std::env::temp_dir().join(file_name);
    // A stale socket file would make `bind` fail; a missing one is fine.
    let _ = std::fs::remove_file(&path);
    let listener = std::os::unix::net::UnixListener::bind(&path)?;

    let stop = Arc::new(AtomicBool::new(false));
    let stop_in_thread = stop.clone();
    let join = std::thread::spawn(move || {
        for incoming in listener.incoming() {
            if stop_in_thread.load(Ordering::SeqCst) {
                break;
            }
            let Ok(conn) = incoming else {
                break;
            };
            let _ = vsock
                .muxer()
                .open_native_to_guest(MuxerStream::Unix(conn), guest_port);
        }
    });

    let acceptor = crate::vmm::vsock_mux::Acceptor::from_parts(
        stop,
        path.to_string_lossy().into_owned(),
        join,
    );
    let mut bridges = self
        .bridges
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    bridges.push(acceptor);
    drop(bridges);
    Ok(path)
}
/// Starts the vsock mux acceptor on a fresh `sm-kvm-mux-<pid>-<id>.sock` path
/// in the system temp directory and records it in `self.bridges`.
///
/// # Errors
/// `NotConnected` when the VM has no vsock device; any failure from
/// `vsock_mux::start` is wrapped as `vsock_mux::start: <error>`.
pub fn start_tsi_mux(&self) -> std::io::Result<std::path::PathBuf> {
    let Some(vsock) = self.vsock.clone() else {
        return Err(std::io::Error::new(
            std::io::ErrorKind::NotConnected,
            "vsock not enabled",
        ));
    };
    let dir = std::env::temp_dir();
    let path = dir.join(format!(
        "sm-kvm-mux-{}-{}.sock",
        std::process::id(),
        next_sock_id()
    ));
    let socket_path = path.to_string_lossy().into_owned();
    let acceptor = match crate::vmm::vsock_mux::start(&socket_path, vsock, None) {
        Ok(acceptor) => acceptor,
        Err(e) => return Err(std::io::Error::other(format!("vsock_mux::start: {e}"))),
    };
    self.bridges
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner())
        .push(acceptor);
    Ok(path)
}
pub fn serial_input(&self) -> SerialInput {
SerialInput {
com1: self.com1.clone(),
vm: self.vm.clone(),
}
}
}
/// Bundle of vCPUs, the MMIO bus, and the per-device handles, worker threads
/// and wake eventfds belonging to one VM.
///
/// NOTE(review): the field set parallels the device-related fields of
/// `LinuxVm`; the code that fills and consumes this struct is outside this
/// excerpt — confirm its exact role there.
struct Assembled {
vcpus: Vec<KvmVcpu>,
bus: Arc<MmioBus>,
blk: Option<Arc<VirtioBlk>>,
blk_mmio: Option<Arc<MmioVirtio>>,
// presumably (path, size) of the disk image, matching `DiskSnap` — TODO confirm
disk: Option<(String, u64)>,
vsock: Option<Arc<Vsock>>,
vsock_mmio: Option<Arc<MmioVirtio>>,
balloon: Option<Arc<crate::devices::virtio::balloon::VirtioBalloon>>,
balloon_mmio: Option<Arc<MmioVirtio>>,
dev_thread: Option<std::thread::JoinHandle<()>>,
dev_stop: Arc<AtomicBool>,
dev_wake: Option<EventFd>,
volume_blks: Vec<Arc<VirtioBlk>>,
volume_mmios: Vec<Arc<MmioVirtio>>,
volume_meta: Vec<VolumeAttach>,
volume_threads: Vec<std::thread::JoinHandle<()>>,
volume_wakes: Vec<EventFd>,
fs_mounts: Vec<FsMount>,
}
/// In-memory snapshot of a VM: typed metadata plus the full guest RAM image.
///
/// Written to disk by `save` / `save_diff` and read back by `load`.
pub struct VmSnapshot {
/// Number of vCPUs.
num_cpus: u8,
/// Guest RAM size in bytes; `load` reads exactly this many bytes into `ram`.
mem_size: usize,
/// Per-vCPU state, serialised one blob per entry.
vcpus: Vec<KvmSnapshotState>,
/// PIT, irqchip and clock state, serialised together as one blob.
devices: KvmDeviceState,
/// COM1 register state (`ier`, `lcr`, `mcr`, `scr`, `dll`, `dlm`).
com1: Com1State,
/// Block device recorded as `DeviceKind::Blk`, if any.
disk: Option<DiskSnap>,
/// MMIO transport state of the vsock device, if any.
vsock: Option<MmioSnapshot>,
/// Vsock listener records, stored verbatim in the container metadata.
vsock_listeners: Vec<TsiListenerSnapshot>,
/// Data volumes recorded as `DeviceKind::Volume`.
volumes: Vec<VolumeSnap>,
/// Optional 32-byte token, stored verbatim in the container metadata.
tsi_token: Option<[u8; 32]>,
/// virtio-fs mounts recorded as `DeviceKind::VirtioFs`.
virtiofs: Vec<VirtioFsSnap>,
/// Raw guest RAM bytes.
ram: Vec<u8>,
}
/// Snapshot record for the `DeviceKind::Blk` device: its `Disk` backing
/// (`path`, `size`) plus the MMIO transport state.
struct DiskSnap {
path: String,
// presumably the image size in bytes — TODO confirm against the producer
size: u64,
mmio: MmioSnapshot,
}
/// Snapshot record for one `DeviceKind::Volume` device: its `Volume` backing
/// (`path`, `size`, `mount`) plus the MMIO transport state.
struct VolumeSnap {
path: String,
// presumably the image size in bytes — TODO confirm against the producer
size: u64,
mount: String,
mmio: MmioSnapshot,
}
impl VmSnapshot {
    /// Number of vCPUs captured in this snapshot.
    pub fn num_cpus(&self) -> u8 {
        self.num_cpus
    }

    /// Size of the captured guest RAM, in bytes.
    pub fn mem_size(&self) -> usize {
        self.mem_size
    }

    /// Serialises everything except guest RAM into `w` as a metadata container.
    fn write_meta<W: Write>(&self, w: &mut W) -> std::io::Result<()> {
        self.to_container()?.write_container(w)
    }

    /// Builds the on-disk metadata container from the typed snapshot fields.
    ///
    /// Device records are emitted in a fixed order: block device, vsock,
    /// volumes, then virtio-fs mounts.
    fn to_container(&self) -> std::io::Result<crate::snapshot_frame::ContainerMeta> {
        // PIT, every irqchip, then the clock, each as a length-prefixed blob.
        let mut intc_blob = Vec::new();
        write_blob(&mut intc_blob, pod_bytes(&self.devices.pit))?;
        for chip in &self.devices.irqchips {
            write_blob(&mut intc_blob, pod_bytes(chip))?;
        }
        write_blob(&mut intc_blob, pod_bytes(&self.devices.clock))?;

        let mut vcpu_blobs = Vec::with_capacity(self.vcpus.len());
        for state in &self.vcpus {
            let mut blob = Vec::new();
            crate::kvm::KvmVcpu::write_snapshot_state(state, &mut blob)?;
            vcpu_blobs.push(blob);
        }

        let mut devices: Vec<DeviceRecord> = Vec::new();
        if let Some(d) = &self.disk {
            devices.push(DeviceRecord {
                kind: DeviceKind::Blk,
                mmio: d.mmio.clone(),
                backing: DeviceBacking::Disk {
                    path: d.path.clone(),
                    size: d.size,
                },
            });
        }
        if let Some(m) = &self.vsock {
            devices.push(DeviceRecord {
                kind: DeviceKind::Vsock,
                mmio: m.clone(),
                backing: DeviceBacking::None,
            });
        }
        for v in &self.volumes {
            devices.push(DeviceRecord {
                kind: DeviceKind::Volume,
                mmio: v.mmio.clone(),
                backing: DeviceBacking::Volume {
                    path: v.path.clone(),
                    size: v.size,
                    mount: v.mount.clone(),
                },
            });
        }
        for f in &self.virtiofs {
            devices.push(DeviceRecord {
                kind: DeviceKind::VirtioFs,
                mmio: f.mmio.clone(),
                backing: DeviceBacking::VirtioFs {
                    tag: f.tag.clone(),
                    mount: f.mount.clone(),
                    host_path: f.host_path.clone(),
                    dax_gpa: f.dax_gpa,
                    dax_window_len: f.dax_window_len,
                    backend_state: f.backend_state.clone(),
                    dax_state: f.dax_state.clone(),
                },
            });
        }

        let c = &self.com1;
        Ok(crate::snapshot_frame::ContainerMeta {
            num_cpus: self.num_cpus,
            mem_size: self.mem_size as u64,
            com1: [c.ier, c.lcr, c.mcr, c.scr, c.dll, c.dlm],
            clock_host_ticks: 0,
            clock_ref: 0,
            intc_blob,
            vcpu_blobs,
            devices,
            tsi_token: self.tsi_token,
            vsock_listeners: self.vsock_listeners.clone(),
        })
    }

    /// Writes a full snapshot to `path`.
    ///
    /// Layout: the 8-byte magic `SMSNAP07`, the RAM offset as a little-endian
    /// `u64`, the metadata container, zero padding up to the page-aligned RAM
    /// offset, then the RAM image. Pages that are entirely zero are not
    /// written; the file is extended with `set_len` so they read back as
    /// zeros.
    pub fn save(&self, path: &std::path::Path) -> std::io::Result<()> {
        use std::os::unix::fs::FileExt;
        const PG: usize = 4096;
        // Magic (8 bytes) + RAM offset (8 bytes).
        const HDR: usize = 8 + 8;

        let mut meta = Vec::new();
        self.write_meta(&mut meta)?;
        let ram_offset = (HDR + meta.len()).next_multiple_of(PG);

        let mut hdr = Vec::with_capacity(ram_offset);
        hdr.extend_from_slice(b"SMSNAP07");
        hdr.extend_from_slice(&(ram_offset as u64).to_le_bytes());
        hdr.extend_from_slice(&meta);
        hdr.resize(ram_offset, 0);

        let f = std::fs::File::create(path)?;
        f.write_all_at(&hdr, 0)?;

        let ram = &self.ram[..];
        let n = ram.len();
        let is_zero = |page: &[u8]| page.iter().all(|&b| b == 0);
        let mut i = 0;
        while i < n {
            let end = (i + PG).min(n);
            if is_zero(&ram[i..end]) {
                i = end;
                continue;
            }
            // Extend the run over every following non-zero page so that each
            // contiguous run is written with a single call.
            let start = i;
            i = end;
            while i < n {
                let e = (i + PG).min(n);
                if is_zero(&ram[i..e]) {
                    break;
                }
                i = e;
            }
            f.write_all_at(&ram[start..i], (ram_offset + start) as u64)?;
        }
        // Trailing zero pages are never written; fix the length explicitly.
        f.set_len((ram_offset + n) as u64)?;
        Ok(())
    }

    /// Writes a differential snapshot to `path`, relative to the full snapshot
    /// at `base_path`.
    ///
    /// Layout: the magic `SMSNAP7D`, the base path (`u32` length + bytes), a
    /// zero `u64`, the metadata container, the number of changed pages as a
    /// `u32`, then one `(page index: u32, 4096 bytes)` record per changed
    /// page, followed by any trailing partial page of RAM.
    ///
    /// # Errors
    /// `InvalidData` when the base has a different `mem_size` or is too short
    /// to contain its RAM image; otherwise any I/O or `mmap` error.
    pub fn save_diff(
        &self,
        path: &std::path::Path,
        base_path: &std::path::Path,
    ) -> std::io::Result<()> {
        const PG: usize = 4096;

        /// Read-only private mapping of the base RAM image, unmapped on drop
        /// so that every exit path (including `?` and panics) releases it.
        struct BaseMap {
            ptr: *mut libc::c_void,
            len: usize,
        }
        impl BaseMap {
            fn bytes(&self) -> &[u8] {
                // SAFETY: `ptr..ptr + len` is a live `PROT_READ` mapping owned
                // by `self`, and the returned slice cannot outlive `self`.
                unsafe { std::slice::from_raw_parts(self.ptr as *const u8, self.len) }
            }
        }
        impl Drop for BaseMap {
            fn drop(&mut self) {
                // SAFETY: `ptr`/`len` describe exactly the region returned by
                // the successful `mmap` below; it is unmapped only here.
                unsafe {
                    libc::munmap(self.ptr, self.len);
                }
            }
        }

        let mut bf = std::fs::File::open(base_path)?;
        let base_meta = read_meta(&mut bf)?;
        if base_meta.mem_size != self.mem_size {
            return Err(std::io::Error::new(
                std::io::ErrorKind::InvalidData,
                format!(
                    "diff base mem_size {} != current {}",
                    base_meta.mem_size, self.mem_size
                ),
            ));
        }
        // Touching mapped pages past the end of the file raises SIGBUS, so
        // reject a truncated base before mapping it.
        let base_len = bf.metadata()?.len();
        let ram_end = base_meta.ram_offset.checked_add(self.mem_size as u64);
        if ram_end.map_or(true, |end| end > base_len) {
            return Err(std::io::Error::new(
                std::io::ErrorKind::InvalidData,
                format!(
                    "diff base is truncated: {base_len} bytes, RAM image needs {} bytes at offset {}",
                    self.mem_size, base_meta.ram_offset
                ),
            ));
        }

        // SAFETY: plain FFI call; the result is checked against MAP_FAILED
        // before it is used.
        let base_ptr = unsafe {
            libc::mmap(
                std::ptr::null_mut(),
                self.mem_size,
                libc::PROT_READ,
                libc::MAP_PRIVATE,
                bf.as_raw_fd(),
                base_meta.ram_offset as libc::off_t,
            )
        };
        if base_ptr == libc::MAP_FAILED {
            return Err(std::io::Error::last_os_error());
        }
        let base_map = BaseMap {
            ptr: base_ptr,
            len: self.mem_size,
        };
        let base = base_map.bytes();

        // Indices of whole pages whose contents differ from the base.
        let npages = self.mem_size / PG;
        let tail = npages * PG;
        let changed: Vec<u32> = self.ram[..tail]
            .chunks_exact(PG)
            .zip(base.chunks_exact(PG))
            .enumerate()
            .filter(|(_, (cur, old))| cur != old)
            .map(|(i, _)| i as u32)
            .collect();

        let mut meta = Vec::new();
        self.write_meta(&mut meta)?;
        let bp = base_path.to_string_lossy();

        let mut w = BufWriter::new(std::fs::File::create(path)?);
        w.write_all(b"SMSNAP7D")?;
        w.write_all(&(bp.len() as u32).to_le_bytes())?;
        w.write_all(bp.as_bytes())?;
        w.write_all(&0u64.to_le_bytes())?;
        w.write_all(&meta)?;
        w.write_all(&(changed.len() as u32).to_le_bytes())?;
        for &i in &changed {
            w.write_all(&i.to_le_bytes())?;
            let o = i as usize * PG;
            w.write_all(&self.ram[o..o + PG])?;
        }
        // A trailing partial page is always stored in full.
        if tail < self.mem_size {
            w.write_all(&self.ram[tail..])?;
        }
        w.flush()
    }

    /// Loads a full (`SMSNAP07`) snapshot from `path`, reading `mem_size`
    /// bytes of guest RAM from the offset recorded in the header.
    pub fn load(path: &std::path::Path) -> std::io::Result<VmSnapshot> {
        let mut f = std::fs::File::open(path)?;
        let meta = read_meta(&mut f)?;
        let mut ram = vec![0u8; meta.mem_size];
        f.seek(SeekFrom::Start(meta.ram_offset))?;
        f.read_exact(&mut ram)?;
        Ok(VmSnapshot {
            num_cpus: meta.num_cpus,
            mem_size: meta.mem_size,
            vcpus: meta.vcpus,
            devices: meta.devices,
            com1: meta.com1,
            disk: meta.disk,
            vsock: meta.vsock,
            vsock_listeners: meta.vsock_listeners,
            volumes: meta.volumes,
            tsi_token: meta.tsi_token,
            virtiofs: meta.virtiofs,
            ram,
        })
    }
}
/// Replaces the full snapshot at `full_path` with a differential snapshot
/// relative to `base_path`, returning the size in bytes of the new file.
///
/// The diff is written to a sibling `*.snap.diff.tmp` file and renamed over
/// `full_path` only once it is complete; the temp file is removed on failure.
///
/// # Errors
/// `InvalidData` when the base is itself a differential snapshot; otherwise
/// any error from loading, diffing, or renaming.
pub fn rewrite_full_as_diff(
    full_path: &std::path::Path,
    base_path: &std::path::Path,
) -> std::io::Result<u64> {
    // Peek at the base's magic; the file handle is closed again right away.
    let mut base_magic = [0u8; 8];
    std::fs::File::open(base_path)?.read_exact(&mut base_magic)?;
    if base_magic == *b"SMSNAP7D" {
        return Err(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            "diff base is itself a differential snapshot",
        ));
    }

    let snap = VmSnapshot::load(full_path)?;
    let tmp = full_path.with_extension("snap.diff.tmp");
    // Best-effort cleanup of a leftover temp file from an earlier attempt.
    let _ = std::fs::remove_file(&tmp);

    let outcome = snap
        .save_diff(&tmp, base_path)
        .and_then(|()| std::fs::metadata(&tmp))
        .and_then(|md| std::fs::rename(&tmp, full_path).map(|()| md.len()));
    if outcome.is_err() {
        let _ = std::fs::remove_file(&tmp);
    }
    outcome
}
/// Decoded snapshot header: the RAM offset plus the metadata container
/// converted into the typed pieces that make up a `VmSnapshot`.
struct SnapshotMeta {
/// File offset at which the guest RAM image starts.
ram_offset: u64,
num_cpus: u8,
/// Guest RAM size in bytes.
mem_size: usize,
com1: Com1State,
devices: KvmDeviceState,
vcpus: Vec<KvmSnapshotState>,
disk: Option<DiskSnap>,
vsock: Option<MmioSnapshot>,
vsock_listeners: Vec<TsiListenerSnapshot>,
volumes: Vec<VolumeSnap>,
tsi_token: Option<[u8; 32]>,
virtiofs: Vec<VirtioFsSnap>,
}
/// Reads the 8-byte magic from `r` and, when it is the full-snapshot magic
/// `SMSNAP07`, decodes the rest of the header with `read_meta_body`.
///
/// Any other magic yields an `InvalidData` error.
fn read_meta<R: Read>(r: &mut R) -> std::io::Result<SnapshotMeta> {
    let mut tag = [0u8; 8];
    r.read_exact(&mut tag)?;
    match &tag {
        b"SMSNAP07" => read_meta_body(r),
        _ => Err(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            "bad snapshot magic/version",
        )),
    }
}
/// Decodes everything after the magic: the little-endian RAM offset followed
/// by the metadata container, converted into a `SnapshotMeta`.
///
/// # Errors
/// Propagates read/decode failures, and returns `InvalidData` when a device
/// record's kind does not match its backing.
fn read_meta_body<R: Read>(r: &mut R) -> std::io::Result<SnapshotMeta> {
    let ram_offset = read_u64(r)?;
    let meta = crate::snapshot_frame::ContainerMeta::read_container(r)?;

    let [ier, lcr, mcr, scr, dll, dlm] = meta.com1;
    let com1 = Com1State {
        ier,
        lcr,
        mcr,
        scr,
        dll,
        dlm,
    };

    // The interrupt-controller blob holds, in order: PIT, three irqchips, clock.
    let mut intc = std::io::Cursor::new(&meta.intc_blob);
    let pit = read_blob_pod::<kvm_pit_state2>(&mut intc)?;
    let chip0 = read_blob_pod::<kvm_irqchip>(&mut intc)?;
    let chip1 = read_blob_pod::<kvm_irqchip>(&mut intc)?;
    let chip2 = read_blob_pod::<kvm_irqchip>(&mut intc)?;
    let clock = read_blob_pod::<kvm_clock_data>(&mut intc)?;
    let devices = KvmDeviceState {
        pit,
        irqchips: [chip0, chip1, chip2],
        clock,
    };

    let mut vcpus = Vec::with_capacity(meta.vcpu_blobs.len());
    for blob in &meta.vcpu_blobs {
        let mut cursor = std::io::Cursor::new(blob);
        let state = crate::kvm::KvmVcpu::read_snapshot_state(&mut cursor)?;
        vcpus.push(state);
    }

    let mut disk = None;
    let mut vsock = None;
    let mut volumes = Vec::new();
    let mut virtiofs = Vec::new();
    for rec in meta.devices {
        let mmio = rec.mmio;
        match (rec.kind, rec.backing) {
            (DeviceKind::Blk, DeviceBacking::Disk { path, size }) => {
                disk = Some(DiskSnap { path, size, mmio });
            }
            // The vsock record carries no backing data worth checking.
            (DeviceKind::Vsock, _) => vsock = Some(mmio),
            (DeviceKind::Volume, DeviceBacking::Volume { path, size, mount }) => {
                volumes.push(VolumeSnap {
                    path,
                    size,
                    mount,
                    mmio,
                });
            }
            (
                DeviceKind::VirtioFs,
                DeviceBacking::VirtioFs {
                    tag,
                    mount,
                    host_path,
                    dax_gpa,
                    dax_window_len,
                    backend_state,
                    dax_state,
                },
            ) => {
                virtiofs.push(VirtioFsSnap {
                    host_path,
                    tag,
                    mount,
                    dax_gpa,
                    dax_window_len,
                    mmio,
                    backend_state,
                    dax_state,
                });
            }
            _ => {
                return Err(std::io::Error::new(
                    std::io::ErrorKind::InvalidData,
                    "device kind/backing mismatch",
                ));
            }
        }
    }

    Ok(SnapshotMeta {
        ram_offset,
        num_cpus: meta.num_cpus,
        mem_size: meta.mem_size as usize,
        com1,
        devices,
        vcpus,
        disk,
        vsock,
        vsock_listeners: meta.vsock_listeners,
        volumes,
        tsi_token: meta.tsi_token,
        virtiofs,
    })
}
/// Views `v` as its raw in-memory bytes (`size_of::<T>()` of them, in native
/// byte order).
///
/// NOTE(review): callers are expected to pass plain-old-data values whose
/// bytes are all initialised (no padding) — confirm for every `T` used.
fn pod_bytes<T>(v: &T) -> &[u8] {
    let start = (v as *const T).cast::<u8>();
    let len = std::mem::size_of_val(v);
    // SAFETY: `v` is a valid reference, so `start` is non-null, suitably
    // aligned for `u8`, and `len` bytes starting there stay readable for as
    // long as the borrow of `v` — which the returned slice inherits — lives.
    unsafe { std::slice::from_raw_parts(start, len) }
}
/// Writes `bytes` to `w` as a length-prefixed blob: a little-endian `u32`
/// byte count followed by the bytes themselves.
///
/// # Errors
/// `InvalidInput` when `bytes` is longer than `u32::MAX` (a truncated prefix
/// would corrupt the stream), otherwise any error from `w`.
fn write_blob<W: Write>(w: &mut W, bytes: &[u8]) -> std::io::Result<()> {
    if bytes.len() > u32::MAX as usize {
        return Err(std::io::Error::new(
            std::io::ErrorKind::InvalidInput,
            format!(
                "blob of {} bytes does not fit the u32 length prefix",
                bytes.len()
            ),
        ));
    }
    // Lossless: the length was checked against `u32::MAX` just above.
    let len = bytes.len() as u32;
    w.write_all(&len.to_le_bytes())?;
    w.write_all(bytes)
}
/// Reads a length-prefixed blob (little-endian `u32` byte count, then that
/// many bytes) from `r`.
///
/// The payload is read through a `take` adapter, so the buffer only grows
/// with bytes actually present in the input; a corrupt length prefix cannot
/// force a multi-gigabyte allocation up front.
///
/// # Errors
/// `UnexpectedEof` when the input ends before the announced length, otherwise
/// any error from `r`.
fn read_blob_vec<R: Read>(r: &mut R) -> std::io::Result<Vec<u8>> {
    let len = u64::from(read_u32(r)?);
    let mut buf = Vec::new();
    let got = r.by_ref().take(len).read_to_end(&mut buf)? as u64;
    if got != len {
        return Err(std::io::Error::new(
            std::io::ErrorKind::UnexpectedEof,
            format!("blob truncated: expected {len} bytes, got {got}"),
        ));
    }
    Ok(buf)
}
/// Reads a length-prefixed blob from `r` and decodes it as UTF-8.
///
/// `what` names the field in the `InvalidData` error raised for invalid UTF-8.
fn read_blob_string<R: Read>(r: &mut R, what: &str) -> std::io::Result<String> {
    match String::from_utf8(read_blob_vec(r)?) {
        Ok(text) => Ok(text),
        Err(_) => Err(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            format!("{what} not utf8"),
        )),
    }
}
/// Reads a little-endian `u32` from `r`.
fn read_u32<R: Read>(r: &mut R) -> std::io::Result<u32> {
    let mut raw = [0u8; std::mem::size_of::<u32>()];
    r.read_exact(&mut raw).map(|()| u32::from_le_bytes(raw))
}
/// Reads a little-endian `u64` from `r`.
fn read_u64<R: Read>(r: &mut R) -> std::io::Result<u64> {
    let mut raw = [0u8; std::mem::size_of::<u64>()];
    r.read_exact(&mut raw).map(|()| u64::from_le_bytes(raw))
}
/// Reads a length-prefixed blob from `r` and reinterprets its bytes as a `T`.
///
/// The announced length must equal `size_of::<T>()`; anything else is an
/// `InvalidData` error and no payload bytes are consumed.
///
/// NOTE(review): `T: Copy` alone does not guarantee that every bit pattern is
/// a valid `T`; callers are expected to use plain-old-data types — confirm.
fn read_blob_pod<T: Copy>(r: &mut impl Read) -> std::io::Result<T> {
    let expected = std::mem::size_of::<T>();
    let len = read_u32(r)? as usize;
    if len == expected {
        let mut raw = vec![0u8; expected];
        r.read_exact(&mut raw)?;
        // SAFETY: `raw` holds exactly `size_of::<T>()` initialised bytes, and
        // `read_unaligned` places no alignment requirement on the pointer.
        return Ok(unsafe { raw.as_ptr().cast::<T>().read_unaligned() });
    }
    Err(std::io::Error::new(
        std::io::ErrorKind::InvalidData,
        format!("blob size {len} != {}", expected),
    ))
}
/// Cloneable handle onto a VM's vsock device, used to open host-initiated
/// connections into the guest (see `connect`).
#[derive(Clone)]
pub struct VsockHandle {
vsock: Arc<Vsock>,
}
impl VsockHandle {
    /// Opens a connection to `guest_port` in the guest.
    ///
    /// A Unix socket pair is created; one end is handed to the vsock muxer
    /// and the other is returned to the caller.
    pub fn connect(&self, guest_port: u32) -> std::io::Result<std::os::unix::net::UnixStream> {
        use std::os::unix::net::UnixStream;

        let (local, remote) = UnixStream::pair()?;
        self.vsock
            .muxer()
            .open_native_to_guest(MuxerStream::Unix(remote), guest_port)?;
        Ok(local)
    }
}
/// Cloneable handle for injecting received bytes into the emulated COM1 UART.
#[derive(Clone)]
pub struct SerialInput {
/// The UART shared with the vCPU threads.
com1: Arc<Mutex<Com1>>,
/// The VM on which the COM1 interrupt line is driven.
vm: Arc<KvmVm>,
}
impl SerialInput {
    /// Queues `byte` on the UART's receive side and updates the COM1
    /// interrupt line to match the UART's new state.
    pub fn push(&self, byte: u8) {
        // Release the UART lock before touching the interrupt line.
        let level = {
            let mut uart = lock_recover(&self.com1);
            uart.push_rx(byte);
            uart.irq_line()
        };
        let _ = self.vm.set_irq(COM1_IRQ, level);
    }
}
/// A VM whose vCPU threads have been started.
///
/// Dropping it while threads are still attached stops and joins them.
pub struct RunningVm {
vm: LinuxVm,
/// One join handle per vCPU thread; each yields its exit reason and, when a
/// snapshot was requested, the captured vCPU state.
threads: Vec<std::thread::JoinHandle<(ExitReason, Option<KvmSnapshotState>)>>,
/// Set to ask every vCPU thread to exit.
stop: Arc<AtomicBool>,
/// Set before forcing an exit when the vCPU state should be captured.
snapshot_req: Arc<AtomicBool>,
/// Handles used to force the vCPUs out of guest execution.
handles: Vec<KvmVcpuHandle>,
/// Pause/park coordination shared with the vCPU threads.
pause: Arc<PauseCoord>,
/// Bumped by `reset_to_snapshot`; vCPU threads compare it against the last
/// value they applied to decide whether to restore their baseline state.
reset_seq: Arc<AtomicU64>,
}
/// Pause barrier through which parked vCPUs hand over their captured state.
type PauseCoord = crate::vcpu_dispatch::PauseBarrier<KvmSnapshotState>;
impl RunningVm {
    /// Forwards a balloon inflate request for `pages` pages to the VM.
    pub fn request_balloon_inflate(&self, pages: u32) -> bool {
        self.vm.request_balloon_inflate(pages)
    }

    /// Joins every vCPU thread and returns the exit reason of the first one.
    ///
    /// A panicked thread counts as `Unknown("vcpu thread panicked")`; with no
    /// threads at all the result is `Unknown("no vcpus")`.
    fn join_all(&mut self) -> ExitReason {
        let mut first_exit: Option<ExitReason> = None;
        for worker in self.threads.drain(..) {
            let reason = match worker.join() {
                Ok((reason, _state)) => reason,
                Err(_) => ExitReason::Unknown("vcpu thread panicked".into()),
            };
            if first_exit.is_none() {
                first_exit = Some(reason);
            }
        }
        first_exit.unwrap_or_else(|| ExitReason::Unknown("no vcpus".into()))
    }

    /// Blocks until the vCPU threads finish on their own.
    pub fn wait(mut self) -> ExitReason {
        self.join_all()
    }

    /// Requests a stop, kicks every vCPU out of the guest, and joins them.
    pub fn stop(mut self) -> ExitReason {
        self.stop.store(true, Ordering::SeqCst);
        KvmVcpuHandle::force_exit(&self.handles);
        self.join_all()
    }

    /// Stops the VM and captures a snapshot from the exiting vCPU threads.
    pub fn snapshot(mut self) -> Result<VmSnapshot, KvmError> {
        let vcpu_count = self.handles.len();
        self.snapshot_req.store(true, Ordering::SeqCst);
        KvmVcpuHandle::force_exit(&self.handles);
        // `RunningVm` implements `Drop`, so the handles are taken, not moved.
        let workers = std::mem::take(&mut self.threads);
        self.vm.capture_quiesced(workers, vcpu_count)
    }

    /// Captures a snapshot while keeping the VM alive: the vCPUs are paused,
    /// their parked states collected, and execution resumed afterwards
    /// regardless of whether the capture succeeded.
    pub fn snapshot_live(&self) -> Result<VmSnapshot, KvmError> {
        let vcpu_count = self.handles.len();
        self.pause.request_pause();
        KvmVcpuHandle::force_exit(&self.handles);
        let parked = self.pause.wait_all_parked(vcpu_count);
        let outcome = self.vm.capture_with_states(parked);
        self.pause.resume();
        outcome
    }

    /// Rolls the running VM back to the snapshot baseline it was restored from.
    ///
    /// With all vCPUs parked: guest RAM is dropped with `MADV_DONTNEED`, the
    /// interrupt-controller state, COM1 and every virtio MMIO transport are
    /// restored from the recorded baselines, and `reset_seq` is bumped before
    /// the vCPUs are resumed.
    ///
    /// # Errors
    /// Fails when the VM has no baseline, or when restoring the device state
    /// fails.
    pub fn reset_to_snapshot(&self) -> Result<(), KvmError> {
        let vm = &self.vm;
        let (Some(intc), Some(com1_baseline)) = (&vm.reset_intc, &vm.reset_com1) else {
            return Err(KvmError(
                "reset_to_snapshot: VM has no snapshot baseline (cold-booted?)".into(),
            ));
        };

        let vcpu_count = self.handles.len();
        self.pause.request_pause();
        KvmVcpuHandle::force_exit(&self.handles);
        // The states captured while parking are not needed for a reset.
        let _discarded = self.pause.wait_all_parked(vcpu_count);

        // NOTE(review): the `madvise` return value is ignored here.
        // SAFETY: `host`/`mem_size` describe the guest RAM mapping owned by
        // `LinuxVm`, and every vCPU is parked while it is discarded.
        unsafe {
            libc::madvise(
                vm.host as *mut libc::c_void,
                vm.mem_size,
                libc::MADV_DONTNEED,
            );
        }

        // NOTE(review): an error here returns with the vCPUs still parked.
        vm.vm.restore_devices(intc)?;
        {
            let mut uart = vm.com1.lock().unwrap_or_else(|poisoned| poisoned.into_inner());
            *uart = Com1::new();
            uart.restore(com1_baseline);
        }
        if let Some((mmio, baseline)) = vm.blk_mmio.as_ref().zip(vm.reset_blk_mmio.as_ref()) {
            mmio.restore_state(baseline);
        }
        if let Some(vsock) = vm.vsock.as_ref() {
            vsock.muxer().reset();
            vsock.reset_pending_rx();
        }
        if let Some((mmio, baseline)) = vm.vsock_mmio.as_ref().zip(vm.reset_vsock_mmio.as_ref()) {
            mmio.restore_state(baseline);
        }
        for (mmio, baseline) in vm.volume_mmios.iter().zip(vm.reset_volume_mmios.iter()) {
            mmio.restore_state(baseline);
        }
        for (mount, baseline) in vm.fs_mounts.iter().zip(vm.reset_fs_mmios.iter()) {
            mount.mmio.restore_state(baseline);
        }

        // Publish the new sequence number before the vCPUs resume.
        self.reset_seq.fetch_add(1, Ordering::SeqCst);
        self.pause.resume();
        Ok(())
    }

    /// Starts the TLS front-end on top of the VM's vsock device and returns
    /// the address it listens on.
    pub fn expose_tls(
        &self,
        cfg: crate::vmm::tls::TlsConfig,
    ) -> Result<std::net::SocketAddr, crate::vmm::tls::StartError> {
        let Some(vsock) = self.vm.vsock.clone() else {
            return Err(crate::vmm::tls::StartError::Config(
                "vsock not enabled on this VM".into(),
            ));
        };
        crate::vmm::tls::start(cfg, vsock)
    }
}
impl Drop for RunningVm {
    /// Stops and joins any vCPU threads that are still attached.
    fn drop(&mut self) {
        // Nothing to do when the threads were already joined or taken.
        if self.threads.is_empty() {
            return;
        }
        self.stop.store(true, Ordering::SeqCst);
        KvmVcpuHandle::force_exit(&self.handles);
        self.threads.drain(..).for_each(|worker| {
            let _ = worker.join();
        });
    }
}
/// Locks `m`, recovering the guard even when the mutex is poisoned.
fn lock_recover<T>(m: &Mutex<T>) -> std::sync::MutexGuard<'_, T> {
    match m.lock() {
        Ok(guard) => guard,
        Err(poisoned) => poisoned.into_inner(),
    }
}
/// Handles one vCPU exit.
///
/// COM1 port I/O is emulated (transmitted bytes go to stdout) and the COM1
/// interrupt line is updated afterwards; other port I/O is ignored. MMIO
/// accesses are forwarded to `bus`. `Hlt`, `Shutdown` and unrecognised exits
/// break out of the run loop; `Intr` breaks only when a stop or exit was
/// requested.
#[inline]
fn dispatch_kvm_exit(
    exit: KvmExit<'_>,
    vm: &KvmVm,
    bus: &MmioBus,
    com1: &Mutex<Com1>,
    stop: &AtomicBool,
    vcpu: &KvmVcpu,
) -> std::ops::ControlFlow<ExitReason> {
    use std::ops::ControlFlow::{Break, Continue};

    match exit {
        KvmExit::IoOut(port, data) if COM1_PORTS.contains(&port) => {
            let mut uart = lock_recover(com1);
            let mut console = std::io::stdout().lock();
            for &byte in data {
                if let Some(tx) = uart.write(port, byte) {
                    let _ = console.write_all(&[tx]);
                }
            }
            let _ = console.flush();
            let level = uart.irq_line();
            // Release stdout, then the UART, before raising the interrupt.
            drop(console);
            drop(uart);
            let _ = vm.set_irq(COM1_IRQ, level);
            Continue(())
        }
        KvmExit::IoIn(port, data) if COM1_PORTS.contains(&port) => {
            let mut uart = lock_recover(com1);
            let value = uart.read(port);
            data.fill(value);
            let level = uart.irq_line();
            drop(uart);
            let _ = vm.set_irq(COM1_IRQ, level);
            Continue(())
        }
        // Port I/O outside the COM1 range is ignored.
        KvmExit::IoOut(..) | KvmExit::IoIn(..) => Continue(()),
        KvmExit::MmioWrite(addr, data) => {
            let width = data.len();
            // Widen the access to a little-endian u64 for the bus.
            let mut le = [0u8; 8];
            le[..width].copy_from_slice(data);
            bus.write(addr, u64::from_le_bytes(le), width as u8);
            Continue(())
        }
        KvmExit::MmioRead(addr, data) => {
            let width = data.len();
            // Reads nothing claims leave `data` untouched.
            if let Some(value) = bus.read(addr, width as u8) {
                data.copy_from_slice(&value.to_le_bytes()[..width]);
            }
            Continue(())
        }
        KvmExit::Hlt => Break(ExitReason::Halt),
        KvmExit::Shutdown => Break(ExitReason::Shutdown),
        KvmExit::Intr => {
            if stop.load(Ordering::SeqCst) || vcpu.should_exit() {
                Break(ExitReason::Canceled)
            } else {
                Continue(())
            }
        }
        other => Break(ExitReason::Unknown(format!("{other:?}"))),
    }
}
/// Body of one vCPU thread: runs the vCPU until it exits and returns the exit
/// reason together with the vCPU state when a snapshot was requested.
///
/// - `stop` / `handles`: a vCPU that ends for any reason other than
///   `Canceled` sets `stop` and forces the other vCPUs out as well.
/// - `snapshot_req`: when set and the exit reason is `Canceled`, the vCPU
///   state is captured and returned.
/// - `exits` / `count_exits`: optional exit accounting packed into one `u64`
///   (see the comment at the counting site).
/// - `pause` / `vcpu_idx`: pause coordination; a paused vCPU captures its
///   state and parks under `vcpu_idx`.
/// - `baseline` / `reset_seq`: after a pause, if `reset_seq` changed since the
///   last value this thread applied, the vCPU is restored to `baseline`.
fn run_vcpu(
vcpu: KvmVcpu,
vm: Arc<KvmVm>,
bus: Arc<MmioBus>,
com1: Arc<Mutex<Com1>>,
stop: Arc<AtomicBool>,
snapshot_req: Arc<AtomicBool>,
exits: Arc<AtomicU64>,
count_exits: bool,
handles: Vec<KvmVcpuHandle>,
pause: Option<Arc<PauseCoord>>,
vcpu_idx: usize,
baseline: Option<KvmSnapshotState>,
reset_seq: Arc<AtomicU64>,
) -> (ExitReason, Option<KvmSnapshotState>) {
if let Err(e) = vcpu.bind_thread() {
return (ExitReason::Unknown(format!("bind_thread: {e}")), None);
}
// Remember which reset generation this thread has already applied.
let mut last_reset_applied = reset_seq.load(Ordering::SeqCst);
let reason = {
let mut fd = vcpu.vcpu.borrow_mut();
loop {
let exit = match fd.run() {
Ok(e) => e,
// EINTR: the run was interrupted, e.g. by a forced exit.
Err(e) if e.errno() == libc::EINTR => {
if stop.load(Ordering::SeqCst) {
break ExitReason::Canceled;
}
if let Some(pc) = &pause {
if pc.is_paused() {
// Hand the current state to the pause coordinator.
// NOTE(review): a failed capture skips `park` entirely — confirm the
// coordinator tolerates a vCPU that never parks in this round.
if let Ok(s) = vcpu.capture_snapshot_locked(&fd) {
pc.park(vcpu_idx, s);
}
// Consume any SIGUSR1 still pending for this thread: `sigtimedwait`
// with a zero timeout is repeated until it reports none left.
unsafe {
let mut set: libc::sigset_t = std::mem::zeroed();
libc::sigemptyset(&mut set);
libc::sigaddset(&mut set, libc::SIGUSR1);
let ts = libc::timespec {
tv_sec: 0,
tv_nsec: 0,
};
while libc::sigtimedwait(&set, std::ptr::null_mut(), &ts) >= 0 {}
}
// A reset happened since the last one applied: roll this vCPU back to
// the baseline (restore errors are ignored).
let rs = reset_seq.load(Ordering::SeqCst);
if rs != last_reset_applied {
if let Some(b) = &baseline {
let _ = vcpu.restore_snapshot_locked(&fd, b);
}
last_reset_applied = rs;
}
// Clear the immediate-exit request and the exit flag before re-checking.
fd.set_kvm_immediate_exit(0);
vcpu.clear_exit();
if stop.load(Ordering::SeqCst) || vcpu.should_exit() {
break ExitReason::Canceled;
}
// Still (or again) paused: re-arm immediate exit before the next run.
if pc.is_paused() {
fd.set_kvm_immediate_exit(1);
}
continue;
}
}
if vcpu.should_exit() {
break ExitReason::Canceled;
}
continue;
}
// EAGAIN: back off for a millisecond and retry unless asked to stop.
Err(e) if e.errno() == libc::EAGAIN => {
if stop.load(Ordering::SeqCst) || vcpu.should_exit() {
break ExitReason::Canceled;
}
std::thread::sleep(std::time::Duration::from_millis(1));
continue;
}
Err(e) => break ExitReason::Unknown(format!("{e}")),
};
// Exit accounting: writes to the virtio queue-notify register add 1 (low
// half of the counter); all other port-I/O and MMIO exits add 1 << 32
// (high half).
if count_exits {
match exit {
KvmExit::MmioWrite(addr, _) if addr == VIRTIO_QUEUE_NOTIFY => {
exits.fetch_add(1, Ordering::Relaxed);
}
KvmExit::IoOut(..)
| KvmExit::IoIn(..)
| KvmExit::MmioWrite(..)
| KvmExit::MmioRead(..) => {
exits.fetch_add(1 << 32, Ordering::Relaxed);
}
_ => {}
}
}
if let std::ops::ControlFlow::Break(reason) =
dispatch_kvm_exit(exit, &vm, &bus, &com1, &stop, &vcpu)
{
break reason;
}
}
};
// Capture the final vCPU state only for a cancellation that came with a
// snapshot request; a failed capture yields `None`.
let snap = if matches!(reason, ExitReason::Canceled) && snapshot_req.load(Ordering::SeqCst) {
match vcpu.capture_snapshot() {
Ok(s) => Some(s),
Err(_) => None,
}
} else {
None
};
// This vCPU ended on its own (halt, shutdown, error): stop the others too.
if !matches!(reason, ExitReason::Canceled) {
stop.store(true, Ordering::SeqCst);
KvmVcpuHandle::force_exit(&handles);
}
(reason, snap)
}
impl Drop for LinuxVm {
    /// Tears the VM down in a fixed order: socket bridges, the device worker
    /// thread, the volume worker threads, the vsock device, and finally the
    /// guest RAM mapping.
    fn drop(&mut self) {
        let bridges = {
            let mut registered = self
                .bridges
                .lock()
                .unwrap_or_else(|poisoned| poisoned.into_inner());
            std::mem::take(&mut *registered)
        };
        bridges.into_iter().for_each(|bridge| {
            bridge.shutdown();
        });

        if let Some(worker) = self.dev_thread.take() {
            self.dev_stop.store(true, Ordering::SeqCst);
            // Wake the worker so that it notices the stop flag.
            if let Some(wake) = self.dev_wake.as_ref() {
                let _ = wake.write(1);
            }
            let _ = worker.join();
        }

        if !self.volume_threads.is_empty() {
            self.dev_stop.store(true, Ordering::SeqCst);
            self.volume_wakes.iter().for_each(|wake| {
                let _ = wake.write(1);
            });
            self.volume_threads.drain(..).for_each(|worker| {
                let _ = worker.join();
            });
        }

        if let Some(vsock) = self.vsock.as_ref() {
            vsock.shutdown();
        }

        // SAFETY: `host`/`mem_size` describe the guest RAM mapping owned by
        // this VM; the worker threads above have been joined before it is
        // released.
        unsafe {
            libc::munmap(self.host as *mut libc::c_void, self.mem_size);
        }
    }
}
#[cfg(test)]
mod snapshot_listener_tests {
    use crate::devices::virtio::vsock::muxer::TsiListenerSnapshot;

    /// Encodes `l` with `write_to` and decodes the bytes again with `read_from`.
    fn roundtrip(l: &TsiListenerSnapshot) -> TsiListenerSnapshot {
        let mut encoded = Vec::new();
        l.write_to(&mut encoded).expect("write");
        TsiListenerSnapshot::read_from(&mut std::io::Cursor::new(encoded)).expect("read")
    }

    /// Every field survives a round trip, with and without an inet port.
    #[test]
    fn tsi_listener_record_roundtrips() {
        let original = TsiListenerSnapshot {
            cid: 3,
            peer_port: 4242906079,
            vm_port: 4242906079,
            family: 2,
            socktype: 1,
            inet_port: Some(80),
        };
        let decoded = roundtrip(&original);
        assert_eq!(decoded.cid, original.cid);
        assert_eq!(decoded.peer_port, original.peer_port);
        assert_eq!(decoded.vm_port, original.vm_port);
        assert_eq!(decoded.family, original.family);
        assert_eq!(decoded.socktype, original.socktype);
        assert_eq!(decoded.inet_port, Some(80));

        let without_port = TsiListenerSnapshot {
            cid: 3,
            peer_port: 7,
            vm_port: 99,
            family: 10,
            socktype: 1,
            inet_port: None,
        };
        let decoded = roundtrip(&without_port);
        assert_eq!(decoded.inet_port, None);
    }
}
#[cfg(test)]
mod irq_budget_tests {
    use super::{
        balloon_irq, virtio_irq_budget_ok, FS_IRQ_BASE, IOAPIC_GSI_CEILING, VOLUME_IRQ_BASE,
    };

    /// Common device combinations stay within the IRQ budget.
    #[test]
    fn typical_device_sets_fit() {
        let cases = [
            (0, 0, false),
            (0, 1, false),
            (3, 2, false),
            (0, 0, true),
            (2, 3, true),
        ];
        for (volumes, fs, balloon) in cases {
            assert!(
                virtio_irq_budget_ok(volumes, fs, balloon).is_ok(),
                "volumes={volumes} fs={fs} balloon={balloon}"
            );
        }
    }

    /// One volume more than fits below the virtio-fs IRQ range is rejected.
    #[test]
    fn too_many_volumes_rejected() {
        let limit = (FS_IRQ_BASE - VOLUME_IRQ_BASE) as usize;
        assert!(virtio_irq_budget_ok(limit, 0, false).is_ok());
        let err = virtio_irq_budget_ok(limit + 1, 0, false).unwrap_err();
        assert!(err.contains("too many data volumes"), "got: {err}");
    }

    /// The balloon may use the top GSI; one more virtio-fs device overflows.
    #[test]
    fn ioapic_overflow_rejected() {
        let max_fs_with_balloon = (IOAPIC_GSI_CEILING - 1 - FS_IRQ_BASE) as usize;
        assert!(virtio_irq_budget_ok(0, max_fs_with_balloon, true).is_ok());
        assert_eq!(
            balloon_irq(max_fs_with_balloon),
            IOAPIC_GSI_CEILING - 1,
            "boundary balloon IRQ is the top usable GSI"
        );
        let err = virtio_irq_budget_ok(0, max_fs_with_balloon + 1, true).unwrap_err();
        assert!(err.contains("IRQ budget exhausted"), "got: {err}");
    }
}