use anyhow::{Context, Result};
use kvm_bindings::{
KVM_CAP_HALT_POLL, KVM_CAP_SPLIT_IRQCHIP, KVM_CAP_X2APIC_API, KVM_CAP_X86_DISABLE_EXITS,
KVM_CLOCK_TSC_STABLE, KVM_PIT_SPEAKER_DUMMY, KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK,
KVM_X2APIC_API_USE_32BIT_IDS, KVM_X86_DISABLE_EXITS_HLT, KVM_X86_DISABLE_EXITS_PAUSE,
kvm_enable_cap, kvm_pit_config,
};
use kvm_ioctls::{Cap, Kvm, VcpuFd, VmFd};
use std::mem::ManuallyDrop;
use vm_memory::{GuestAddress, GuestMemoryMmap};
use super::topology::{generate_cpuid, max_apic_id};
use crate::vmm::numa_mem::{NumaMemoryLayout, ReservationGuard};
use crate::vmm::topology::Topology;
// ---- Guest physical memory layout --------------------------------------

/// Load address for the kernel image (1 MiB).
pub(crate) const KERNEL_LOAD_ADDR: u64 = 0x100000;
/// Guest-physical address where the boot_params ("zero page") struct lives.
pub(crate) const BOOT_PARAMS_ADDR: u64 = 0x7000;
/// Guest-physical address where the kernel command line is written.
pub(crate) const CMDLINE_ADDR: u64 = 0x20000;
/// Maximum kernel command-line length, in bytes.
pub(crate) const CMDLINE_MAX: usize = 4096;
/// Start of the Extended BIOS Data Area; conventional low RAM ends here.
pub(crate) const EBDA_START: u64 = 0x9FC00;
/// First address above the legacy sub-1-MiB region.
pub(crate) const HIMEM_START: u64 = 0x10_0000;
/// MMIO hole below 4 GiB: [3 GiB, 4 GiB). Virtio device windows (below)
/// are carved out of the start of this gap.
pub(crate) const MMIO_GAP_START: u64 = 0xC000_0000;
pub(crate) const MMIO_GAP_END: u64 = 0x1_0000_0000;

// ---- virtio-mmio device slots (one 4 KiB window each) and IRQ lines ----
pub(crate) const VIRTIO_CONSOLE_MMIO_BASE: u64 = MMIO_GAP_START;
pub(crate) const VIRTIO_BLK_MMIO_BASE: u64 = MMIO_GAP_START + 0x1000;
pub(crate) const VIRTIO_NET_MMIO_BASE: u64 = MMIO_GAP_START + 0x2000;
pub(crate) const VIRTIO_CONSOLE_IRQ: u32 = 5;
pub(crate) const VIRTIO_BLK_IRQ: u32 = 6;
pub(crate) const VIRTIO_NET_IRQ: u32 = 7;

/// E820 memory-map entry type for usable RAM.
pub(crate) const E820_RAM: u32 = 1;
/// Offset of the 64-bit entry point within the loaded kernel image.
/// NOTE(review): presumably startup_64 at +0x200 per the x86 boot layout —
/// confirm against the loader code that consumes this.
pub(crate) const STARTUP64_OFFSET: u64 = 0x200;

// Guest address for the 3-page real-mode TSS region KVM requires on Intel
// (matches Firecracker's choice; pinned by a unit test in this file), with
// the EPT identity-map page placed immediately after the TSS pages.
const KVM_TSS_ADDRESS: u64 = 0xfffb_d000;
const KVM_IDENTITY_MAP_ADDRESS: u64 = KVM_TSS_ADDRESS + 3 * 4096;
/// Pin count passed to KVM_CAP_SPLIT_IRQCHIP for the userspace IOAPIC.
const NUM_IOAPIC_PINS: u64 = 24;
/// Largest APIC ID usable in xAPIC mode; topologies whose max APIC ID
/// exceeds this switch to the split IRQ chip + x2APIC API (see `new_inner`).
const MAX_XAPIC_ID: u32 = 254;
/// Halt-poll window (200 µs) handed to KVM_CAP_HALT_POLL when the guest
/// does not oversubscribe the host CPUs.
const HALT_POLL_NS: u64 = 200_000;
/// KVM capabilities this VMM refuses to run without. They are checked up
/// front in `new_inner` so a missing kernel feature produces a clear error
/// instead of a later ioctl failure.
const REQUIRED_CAPS: &[Cap] = &[
    Cap::Irqchip,
    Cap::Ioeventfd,
    Cap::Irqfd,
    Cap::UserMemory,
    Cap::SetTssAddr,
    Cap::Pit2,
    Cap::PitState2,
    Cap::AdjustClock,
    Cap::Debugregs,
    Cap::MpState,
    Cap::VcpuEvents,
    Cap::Xcrs,
    Cap::Xsave,
    Cap::ExtCpuid,
];
/// A KVM-backed virtual machine: the /dev/kvm handle, the VM fd, its vCPUs,
/// guest memory, and associated bookkeeping.
///
/// Several fields are wrapped in `ManuallyDrop` so the `Drop` impl below can
/// tear them down in an explicit, controlled order instead of relying on
/// field declaration order.
#[allow(dead_code)]
pub struct KtstrKvm {
    /// Handle to /dev/kvm; dropped last during teardown.
    pub kvm: ManuallyDrop<Kvm>,
    /// The VM file descriptor; dropped after the vCPUs.
    pub vm_fd: ManuallyDrop<VmFd>,
    /// One vCPU fd per logical guest CPU.
    pub vcpus: Vec<VcpuFd>,
    /// Guest memory mapping (a 4 KiB placeholder until
    /// `allocate_and_register_memory` runs, on the deferred path).
    pub guest_mem: ManuallyDrop<GuestMemoryMmap>,
    /// Guest CPU topology this VM was built for.
    pub topology: Topology,
    /// `Some` once real NUMA-aware guest memory has been allocated.
    pub(crate) numa_layout: Option<NumaMemoryLayout>,
    /// Whether the host KVM supports `Cap::ImmediateExit`.
    pub has_immediate_exit: bool,
    /// True when the VM uses KVM_CAP_SPLIT_IRQCHIP (large APIC IDs).
    pub(crate) split_irqchip: bool,
    // Construction flags retained so deferred memory allocation can reuse
    // the same settings.
    use_hugepages: bool,
    performance_mode: bool,
    // Keeps the host memory reservation alive for the VM's lifetime;
    // released explicitly in Drop after guest memory is unmapped.
    _reservation: Option<ReservationGuard>,
    /// Guards for copy-on-write overlay files backing the initramfs.
    pub(crate) cow_overlay_guards: Vec<crate::vmm::initramfs::CowOverlayGuard>,
}
impl Drop for KtstrKvm {
    /// Tear the VM down in an explicit order: vCPU fds first, then the VM
    /// fd, then the guest memory mapping, then the host memory reservation,
    /// then the CoW overlay guards, and finally the /dev/kvm handle.
    fn drop(&mut self) {
        unsafe {
            // SAFETY: each ManuallyDrop field is dropped exactly once, here,
            // and never accessed again — this runs as the final use of
            // `self`, and the compiler will not drop these fields a second
            // time (that is the point of ManuallyDrop).
            let vcpus = std::mem::take(&mut self.vcpus);
            drop(vcpus);
            ManuallyDrop::drop(&mut self.vm_fd);
            ManuallyDrop::drop(&mut self.guest_mem);
            let reservation = self._reservation.take();
            drop(reservation);
            let cow_guards = std::mem::take(&mut self.cow_overlay_guards);
            drop(cow_guards);
            ManuallyDrop::drop(&mut self.kvm);
        }
    }
}
impl KtstrKvm {
    /// Build a VM with `memory_mb` MiB of guest memory backed by normal
    /// (non-huge) pages.
    pub fn new(topo: Topology, memory_mb: u32, performance_mode: bool) -> Result<Self> {
        Self::new_inner(topo, Some(memory_mb), false, performance_mode)
    }

    /// Build a VM with `memory_mb` MiB of guest memory backed by huge pages.
    pub fn new_with_hugepages(
        topo: Topology,
        memory_mb: u32,
        performance_mode: bool,
    ) -> Result<Self> {
        Self::new_inner(topo, Some(memory_mb), true, performance_mode)
    }

    /// Build a VM without allocating real guest memory yet; a single 4 KiB
    /// placeholder mapping is installed. Call
    /// `allocate_and_register_memory` before running the guest.
    pub fn new_deferred(
        topo: Topology,
        use_hugepages: bool,
        performance_mode: bool,
    ) -> Result<Self> {
        Self::new_inner(topo, None, use_hugepages, performance_mode)
    }

    /// Replace the current (placeholder) guest memory with a freshly
    /// allocated NUMA-aware layout of `memory_mb` MiB, registered with KVM.
    pub fn allocate_and_register_memory(&mut self, memory_mb: u32) -> Result<()> {
        // NOTE(review): third argument is 0 here and in new_inner —
        // presumably a base offset; confirm against NumaMemoryLayout::compute.
        let layout = NumaMemoryLayout::compute(&self.topology, memory_mb, 0)?;
        let alloc =
            layout.allocate_and_register(&self.vm_fd, self.use_hugepages, self.performance_mode)?;
        // SAFETY: the current guest_mem is dropped exactly once, here, and
        // immediately replaced with the new mapping on the next line.
        unsafe { ManuallyDrop::drop(&mut self.guest_mem) };
        self.guest_mem = ManuallyDrop::new(alloc.guest_mem);
        self._reservation = Some(alloc.reservation);
        self.numa_layout = Some(layout);
        Ok(())
    }

    /// Shared construction path: opens /dev/kvm, verifies required
    /// capabilities, creates the VM, configures the IRQ chip (full
    /// in-kernel or split, depending on the topology's max APIC ID),
    /// applies exit/halt-poll tuning, sets up guest memory (or a
    /// placeholder when `memory_mb` is `None`), and creates one vCPU per
    /// logical CPU with topology-derived CPUID.
    fn new_inner(
        topo: Topology,
        memory_mb: Option<u32>,
        use_hugepages: bool,
        performance_mode: bool,
    ) -> Result<Self> {
        let kvm = Kvm::new().context("open /dev/kvm")?;
        // Fail fast if the host kernel lacks any capability we rely on.
        for &cap in REQUIRED_CAPS {
            anyhow::ensure!(
                kvm.check_extension(cap),
                "KVM missing required capability: {:?}",
                cap
            );
        }
        // Optional capability; recorded for the vCPU run loop to consult.
        let has_immediate_exit = kvm.check_extension(Cap::ImmediateExit);
        let vm_fd = crate::vmm::create_vm_with_retry(&kvm)?;
        // Set the TSS and identity-map addresses before any vCPU exists;
        // transient ioctl failures are mapped to contention errors.
        vm_fd
            .set_tss_address(KVM_TSS_ADDRESS as usize)
            .map_err(|e| crate::vmm::map_transient_to_contention(e, "set TSS"))?;
        vm_fd
            .set_identity_map_address(KVM_IDENTITY_MAP_ADDRESS)
            .map_err(|e| crate::vmm::map_transient_to_contention(e, "set identity map address"))?;
        // APIC IDs above MAX_XAPIC_ID (254) cannot be addressed in xAPIC
        // mode, so large topologies need the split IRQ chip + x2APIC API.
        let max_apic_id = max_apic_id(&topo);
        let split_irqchip = max_apic_id > MAX_XAPIC_ID;
        if split_irqchip {
            let mut cap = kvm_enable_cap {
                cap: KVM_CAP_SPLIT_IRQCHIP,
                ..Default::default()
            };
            cap.args[0] = NUM_IOAPIC_PINS;
            vm_fd
                .enable_cap(&cap)
                .map_err(|e| crate::vmm::map_transient_to_contention(e, "enable split IRQ chip"))?;
            // 32-bit APIC IDs, and suppress the broadcast quirk.
            let mut cap = kvm_enable_cap {
                cap: KVM_CAP_X2APIC_API,
                ..Default::default()
            };
            cap.args[0] =
                (KVM_X2APIC_API_USE_32BIT_IDS | KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) as u64;
            vm_fd.enable_cap(&cap).context("enable x2APIC API")?;
        } else {
            // Small topologies: full in-kernel IRQ chip plus a PIT with a
            // dummy speaker port.
            vm_fd
                .create_irq_chip()
                .map_err(|e| crate::vmm::map_transient_to_contention(e, "create IRQ chip"))?;
            let pit_config = kvm_pit_config {
                flags: KVM_PIT_SPEAKER_DUMMY,
                ..Default::default()
            };
            vm_fd
                .create_pit2(pit_config)
                .map_err(|e| crate::vmm::map_transient_to_contention(e, "create PIT"))?;
        }
        if performance_mode {
            // Best effort: keep PAUSE and HLT inside the guest instead of
            // exiting to the host. Each failure only warns; the two bits are
            // enabled separately so one being rejected doesn't lose the other.
            let mut cap = kvm_enable_cap {
                cap: KVM_CAP_X86_DISABLE_EXITS,
                ..Default::default()
            };
            cap.args[0] = KVM_X86_DISABLE_EXITS_PAUSE as u64;
            if let Err(e) = vm_fd.enable_cap(&cap) {
                eprintln!(
                    "performance_mode: WARNING: \
KVM_CAP_X86_DISABLE_EXITS (PAUSE) not supported: {e}"
                );
            }
            cap.args[0] = KVM_X86_DISABLE_EXITS_HLT as u64;
            if let Err(e) = vm_fd.enable_cap(&cap) {
                eprintln!(
                    "performance_mode: WARNING: \
KVM_CAP_X86_DISABLE_EXITS (HLT) rejected: {e}"
                );
            }
        }
        if !performance_mode {
            // Enable halt polling (HALT_POLL_NS) only when the guest does
            // not oversubscribe the host's online CPUs; otherwise force it
            // off by setting 0 ns.
            let host_cpus = unsafe { libc::sysconf(libc::_SC_NPROCESSORS_ONLN) };
            let poll_ns: u64 = if host_cpus > 0 && topo.total_cpus() <= host_cpus as u32 {
                HALT_POLL_NS
            } else {
                0
            };
            let mut cap = kvm_enable_cap {
                cap: KVM_CAP_HALT_POLL,
                ..Default::default()
            };
            cap.args[0] = poll_ns;
            if let Err(e) = vm_fd.enable_cap(&cap) {
                eprintln!(
                    "kvm: WARNING: KVM_CAP_HALT_POLL not supported ({e}), using kernel default"
                );
            }
        }
        // Guest memory: a real NUMA-aware allocation registered with the VM,
        // or a single 4 KiB placeholder page on the deferred path.
        let (guest_mem, numa_layout, reservation) = match memory_mb {
            Some(mb) => {
                let layout = NumaMemoryLayout::compute(&topo, mb, 0)?;
                let alloc =
                    layout.allocate_and_register(&vm_fd, use_hugepages, performance_mode)?;
                (alloc.guest_mem, Some(layout), Some(alloc.reservation))
            }
            None => {
                let placeholder = GuestMemoryMmap::<()>::from_ranges(&[(GuestAddress(0), 4096)])
                    .context("allocate placeholder guest memory")?;
                (placeholder, None, None)
            }
        };
        // One CPUID template from the host, specialized per vCPU below.
        let base_cpuid = kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .context("get_supported_cpuid")?;
        let total = topo.total_cpus();
        let mut vcpus = Vec::with_capacity(total as usize);
        for cpu_id in 0..total {
            let vcpu = vm_fd.create_vcpu(cpu_id as u64).map_err(|e| {
                crate::vmm::map_transient_to_contention(e, format!("create vCPU {cpu_id}"))
            })?;
            // Per-vCPU CPUID derived from the host leaves plus the requested
            // guest topology.
            let cpuid_entries =
                generate_cpuid(base_cpuid.as_slice(), &topo, cpu_id, performance_mode);
            let cpuid = kvm_bindings::CpuId::from_entries(&cpuid_entries).context("build CpuId")?;
            vcpu.set_cpuid2(&cpuid)
                .with_context(|| format!("set CPUID for vCPU {cpu_id}"))?;
            vcpus.push(vcpu);
        }
        if performance_mode {
            // Round-trip KVM_GET_CLOCK / KVM_SET_CLOCK so the second read
            // reports whether KVM considers the TSC stable. Every failure
            // path only warns — TSC instability degrades measurement
            // quality but does not prevent the VM from running.
            match vm_fd.get_clock() {
                Ok(clock) => {
                    let mut set_data = clock;
                    set_data.flags = 0;
                    if let Err(e) = vm_fd.set_clock(&set_data) {
                        eprintln!(
                            "performance_mode: WARNING: KVM_SET_CLOCK failed ({e}), \
cannot check TSC stability"
                        );
                    } else {
                        match vm_fd.get_clock() {
                            Ok(clock2) => {
                                if clock2.flags & KVM_CLOCK_TSC_STABLE == 0 {
                                    eprintln!(
                                        "performance_mode: WARNING: TSC not stable \
(KVM_CLOCK_TSC_STABLE not set), \
timing measurements may have higher variance \
(nested virt?)."
                                    );
                                }
                            }
                            Err(e) => {
                                eprintln!(
                                    "performance_mode: WARNING: KVM_GET_CLOCK failed ({e}), \
cannot check TSC stability"
                                );
                            }
                        }
                    }
                }
                Err(e) => {
                    eprintln!(
                        "performance_mode: WARNING: KVM_GET_CLOCK failed ({e}), \
cannot check TSC stability"
                    );
                }
            }
        }
        Ok(KtstrKvm {
            kvm: ManuallyDrop::new(kvm),
            vm_fd: ManuallyDrop::new(vm_fd),
            vcpus,
            guest_mem: ManuallyDrop::new(guest_mem),
            topology: topo,
            numa_layout,
            has_immediate_exit,
            split_irqchip,
            use_hugepages,
            performance_mode,
            _reservation: reservation,
            cow_overlay_guards: Vec::new(),
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use vm_memory::GuestMemory;

    #[test]
    fn create_vm_basic() {
        // 1 LLC x 2 cores x 1 thread => 2 vCPUs.
        let topo = Topology { llcs: 1, cores_per_llc: 2, threads_per_core: 1, numa_nodes: 1, nodes: None, distances: None };
        let result = KtstrKvm::new(topo, 128, false);
        assert!(result.is_ok(), "VM creation failed: {:?}", result.err());
        assert_eq!(result.unwrap().vcpus.len(), 2);
    }

    #[test]
    fn create_vm_multi_llc() {
        // 2 x 2 x 2 => 8 vCPUs.
        let topo = Topology { llcs: 2, cores_per_llc: 2, threads_per_core: 2, numa_nodes: 1, nodes: None, distances: None };
        let result = KtstrKvm::new(topo, 256, false);
        assert!(result.is_ok(), "multi-LLC VM creation failed: {:?}", result.err());
        assert_eq!(result.unwrap().vcpus.len(), 8);
    }

    #[test]
    fn create_vm_single_cpu() {
        let topo = Topology { llcs: 1, cores_per_llc: 1, threads_per_core: 1, numa_nodes: 1, nodes: None, distances: None };
        let result = KtstrKvm::new(topo, 64, false);
        assert!(result.is_ok());
        assert_eq!(result.unwrap().vcpus.len(), 1);
    }

    #[test]
    fn create_vm_large_topology() {
        // 4 x 4 x 2 => 32 vCPUs.
        let topo = Topology { llcs: 4, cores_per_llc: 4, threads_per_core: 2, numa_nodes: 1, nodes: None, distances: None };
        let result = KtstrKvm::new(topo, 512, false);
        assert!(result.is_ok(), "large topology failed: {:?}", result.err());
        assert_eq!(result.unwrap().vcpus.len(), 32);
    }

    #[test]
    fn create_vm_odd_topology() {
        // Non-power-of-two shape: 3 x 3 x 1 => 9 vCPUs.
        let topo = Topology { llcs: 3, cores_per_llc: 3, threads_per_core: 1, numa_nodes: 1, nodes: None, distances: None };
        let result = KtstrKvm::new(topo, 128, false);
        assert!(result.is_ok(), "odd topology failed: {:?}", result.err());
        assert_eq!(result.unwrap().vcpus.len(), 9);
    }

    #[test]
    fn memory_size_correct() {
        use vm_memory::GuestMemoryRegion;
        let topo = Topology { llcs: 1, cores_per_llc: 1, threads_per_core: 1, numa_nodes: 1, nodes: None, distances: None };
        let machine = KtstrKvm::new(topo, 256, false).unwrap();
        // Sum of all registered regions must equal the requested 256 MiB.
        let bytes: u64 = machine.guest_mem.iter().map(|region| region.len()).sum();
        assert_eq!(bytes, 256 << 20);
    }

    #[test]
    fn tss_address_matches_firecracker() {
        assert_eq!(KVM_TSS_ADDRESS, 0xfffb_d000);
    }

    #[test]
    fn identity_map_follows_tss() {
        // Identity-map page sits right after the three TSS pages.
        assert_eq!(KVM_IDENTITY_MAP_ADDRESS, KVM_TSS_ADDRESS + 3 * 4096);
        assert_eq!(KVM_IDENTITY_MAP_ADDRESS, 0xfffc_0000);
    }

    #[test]
    fn required_caps_non_empty() {
        assert!(!REQUIRED_CAPS.is_empty());
        assert!(REQUIRED_CAPS.len() >= 14);
    }

    #[test]
    fn small_topology_uses_full_irqchip() {
        // 2 x 4 x 2 = 16 vCPUs — well inside the xAPIC ID range.
        let topo = Topology { llcs: 2, cores_per_llc: 4, threads_per_core: 2, numa_nodes: 1, nodes: None, distances: None };
        assert!(max_apic_id(&topo) <= MAX_XAPIC_ID);
        let machine = KtstrKvm::new(topo, 256, false).unwrap();
        assert!(!machine.split_irqchip, "small topology should use full IRQ chip");
    }

    #[test]
    fn large_topology_uses_split_irqchip() {
        // 14 x 9 x 2 = 252 vCPUs; APIC IDs exceed the xAPIC maximum.
        let topo = Topology { llcs: 14, cores_per_llc: 9, threads_per_core: 2, numa_nodes: 1, nodes: None, distances: None };
        assert!(
            max_apic_id(&topo) > MAX_XAPIC_ID,
            "max APIC ID {} should exceed {}",
            max_apic_id(&topo),
            MAX_XAPIC_ID,
        );
        // Creating 252 vCPUs may fail on constrained hosts; skip instead of
        // failing in that case.
        let machine = match KtstrKvm::new(topo, 4096, false) {
            Ok(v) => v,
            Err(e) => {
                skip!("large_topology VM creation: {e:#}");
            }
        };
        assert!(machine.split_irqchip, "large topology should use split IRQ chip");
        assert_eq!(machine.vcpus.len(), 252);
    }

    #[test]
    fn split_irqchip_boundary() {
        // 8 x 8 x 2 = 128 vCPUs: comfortably under the limit.
        let small = Topology { llcs: 8, cores_per_llc: 8, threads_per_core: 2, numa_nodes: 1, nodes: None, distances: None };
        assert!(
            max_apic_id(&small) <= MAX_XAPIC_ID,
            "8l/8c/2t max APIC ID {} should be <= 254",
            max_apic_id(&small),
        );
        let machine = KtstrKvm::new(small, 2048, false).unwrap();
        assert!(!machine.split_irqchip);
        // 15 x 8 x 2 = 240 vCPUs: still under the limit.
        let still_small = Topology { llcs: 15, cores_per_llc: 8, threads_per_core: 2, numa_nodes: 1, nodes: None, distances: None };
        assert!(
            max_apic_id(&still_small) <= MAX_XAPIC_ID,
            "15l/8c/2t max APIC ID {} should be <= 254",
            max_apic_id(&still_small),
        );
        let machine = KtstrKvm::new(still_small, 4096, false).unwrap();
        assert!(!machine.split_irqchip);
    }

    #[test]
    fn immediate_exit_cap_detected() {
        let topo = Topology { llcs: 1, cores_per_llc: 1, threads_per_core: 1, numa_nodes: 1, nodes: None, distances: None };
        let machine = KtstrKvm::new(topo, 64, false).unwrap();
        assert!(machine.has_immediate_exit);
    }

    #[test]
    fn performance_mode_succeeds() {
        let topo = Topology { llcs: 1, cores_per_llc: 2, threads_per_core: 1, numa_nodes: 1, nodes: None, distances: None };
        let result = KtstrKvm::new(topo, 128, true);
        assert!(
            result.is_ok(),
            "performance_mode VM creation failed: {:?}",
            result.err()
        );
    }

    #[test]
    fn performance_mode_does_not_affect_vcpu_count() {
        let topo = Topology { llcs: 2, cores_per_llc: 2, threads_per_core: 2, numa_nodes: 1, nodes: None, distances: None };
        let normal = KtstrKvm::new(topo, 256, false).unwrap();
        let perf = KtstrKvm::new(topo, 256, true).unwrap();
        assert_eq!(normal.vcpus.len(), perf.vcpus.len());
    }

    #[test]
    fn halt_poll_ns_constant() {
        assert_eq!(HALT_POLL_NS, 200_000);
    }

    #[test]
    fn non_perf_mode_succeeds_with_halt_poll() {
        let topo = Topology { llcs: 1, cores_per_llc: 2, threads_per_core: 1, numa_nodes: 1, nodes: None, distances: None };
        let result = KtstrKvm::new(topo, 128, false);
        assert!(
            result.is_ok(),
            "non-perf VM with halt poll failed: {:?}",
            result.err()
        );
    }

    #[test]
    fn disable_exits_hlt_bit_value() {
        assert_eq!(KVM_X86_DISABLE_EXITS_HLT, 2);
    }

    #[test]
    fn disable_exits_pause_and_hlt_no_overlap() {
        assert_ne!(
            KVM_X86_DISABLE_EXITS_PAUSE, KVM_X86_DISABLE_EXITS_HLT,
            "PAUSE and HLT bits must be distinct"
        );
        assert_eq!(
            KVM_X86_DISABLE_EXITS_PAUSE & KVM_X86_DISABLE_EXITS_HLT,
            0,
            "PAUSE and HLT bits must not overlap"
        );
    }

    #[test]
    fn tsc_stability_check_roundtrip() {
        // Re-run the get/set/get clock dance performed during construction
        // in performance mode.
        let topo = Topology { llcs: 1, cores_per_llc: 2, threads_per_core: 1, numa_nodes: 1, nodes: None, distances: None };
        let machine = KtstrKvm::new(topo, 64, true).unwrap();
        let mut snapshot = machine.vm_fd.get_clock().unwrap();
        snapshot.flags = 0;
        machine.vm_fd.set_clock(&snapshot).unwrap();
        let reread = machine.vm_fd.get_clock().unwrap();
        let _ = reread.flags & KVM_CLOCK_TSC_STABLE;
    }

    #[test]
    fn performance_mode_with_hlt_disable_succeeds() {
        let topo = Topology { llcs: 1, cores_per_llc: 2, threads_per_core: 1, numa_nodes: 1, nodes: None, distances: None };
        let result = KtstrKvm::new(topo, 128, true);
        assert!(
            result.is_ok(),
            "performance_mode with HLT disable failed: {:?}",
            result.err()
        );
    }
}