use anyhow::{Context, Result};
use kvm_bindings::{
KVM_CAP_HALT_POLL, KVM_CAP_SPLIT_IRQCHIP, KVM_CAP_X2APIC_API, KVM_CAP_X86_DISABLE_EXITS,
KVM_CLOCK_TSC_STABLE, KVM_PIT_SPEAKER_DUMMY, KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK,
KVM_X2APIC_API_USE_32BIT_IDS, KVM_X86_DISABLE_EXITS_HLT, KVM_X86_DISABLE_EXITS_PAUSE,
kvm_enable_cap, kvm_pit_config, kvm_userspace_memory_region,
};
use kvm_ioctls::{Cap, Kvm, VcpuFd, VmFd};
use vm_memory::{GuestAddress, GuestMemory, GuestMemoryMmap};
use super::topology::{generate_cpuid, max_apic_id};
use crate::vmm::topology::Topology;
// Guest-physical memory layout and KVM tuning constants.
// (Load/boot addresses are consumed by the boot code elsewhere in the crate;
// only their values are fixed here.)

/// Kernel image load address: 0x10_0000 = 1 MiB.
pub(crate) const KERNEL_LOAD_ADDR: u64 = 0x100000;
/// Address of the boot-parameters structure (per the name, the Linux
/// "zero page" — confirm against the loader code that fills it).
pub(crate) const BOOT_PARAMS_ADDR: u64 = 0x7000;
/// Address where the kernel command line is written.
pub(crate) const CMDLINE_ADDR: u64 = 0x20000;
/// Maximum command-line length in bytes.
pub(crate) const CMDLINE_MAX: usize = 4096;
/// Start of the Extended BIOS Data Area (conventional PC location, just
/// below 640 KiB).
pub(crate) const EBDA_START: u64 = 0x9FC00;
/// Start of "high" memory: 1 MiB.
pub(crate) const HIMEM_START: u64 = 0x10_0000;
/// MMIO gap: [3 GiB, 4 GiB) is carved out of guest RAM for device MMIO.
pub(crate) const MMIO_GAP_START: u64 = 0xC000_0000;
pub(crate) const MMIO_GAP_END: u64 = 0x1_0000_0000;
/// The virtio console device sits at the bottom of the MMIO gap.
pub(crate) const VIRTIO_CONSOLE_MMIO_BASE: u64 = MMIO_GAP_START;
/// Legacy IRQ line assigned to the virtio console.
pub(crate) const VIRTIO_CONSOLE_IRQ: u32 = 5;
/// E820 memory-map entry type for usable RAM (type 1).
pub(crate) const E820_RAM: u32 = 1;
/// Offset of the 64-bit startup entry point relative to the kernel load
/// address (per the name — verify against the guest startup stub).
pub(crate) const STARTUP64_OFFSET: u64 = 0x200;
/// TSS address passed to KVM_SET_TSS_ADDR (same value Firecracker uses;
/// pinned by the `tss_address_matches_firecracker` test below).
const KVM_TSS_ADDRESS: u64 = 0xfffb_d000;
/// Identity-map page placed directly after the three 4 KiB TSS pages.
const KVM_IDENTITY_MAP_ADDRESS: u64 = KVM_TSS_ADDRESS + 3 * 4096;
/// Pin count for the (split) IOAPIC, passed to KVM_CAP_SPLIT_IRQCHIP.
const NUM_IOAPIC_PINS: u64 = 24;
/// Largest usable xAPIC ID (IDs are 8-bit; 255 is the broadcast ID).
/// Topologies whose max APIC ID exceeds this need x2APIC + split IRQ chip.
const MAX_XAPIC_ID: u32 = 254;
/// Per-vCPU halt-poll budget (200 µs) requested via KVM_CAP_HALT_POLL when
/// the guest is not overcommitted relative to the host.
const HALT_POLL_NS: u64 = 200_000;
/// KVM capabilities this VMM hard-requires. Each is probed once at VM
/// creation (`new_inner`) and a missing capability is a fatal error.
const REQUIRED_CAPS: &[Cap] = &[
    Cap::Irqchip,
    Cap::Ioeventfd,
    Cap::Irqfd,
    Cap::UserMemory,
    Cap::SetTssAddr,
    Cap::Pit2,
    Cap::PitState2,
    Cap::AdjustClock,
    Cap::Debugregs,
    Cap::MpState,
    Cap::VcpuEvents,
    Cap::Xcrs,
    Cap::Xsave,
    Cap::ExtCpuid,
];
/// A KVM virtual machine: the system and VM file descriptors, the vCPUs,
/// the guest memory mapping, and the topology/tuning options it was
/// created with.
// NOTE(review): dead_code is allowed without explanation — presumably some
// fields are held only for ownership/lifetime; confirm and document which.
#[allow(dead_code)] pub struct KtstrKvm {
    // Handle to /dev/kvm (system-level ioctls, e.g. capability probes).
    pub kvm: Kvm,
    // The VM file descriptor (VM-level ioctls).
    pub vm_fd: VmFd,
    // One vCPU fd per logical CPU in `topology`, indexed by APIC/vCPU id.
    pub vcpus: Vec<VcpuFd>,
    // Guest physical memory; a single-page placeholder for deferred VMs
    // until `allocate_and_register_memory` installs the real backing.
    pub guest_mem: GuestMemoryMmap,
    // The virtual CPU topology this VM presents to the guest.
    pub topology: Topology,
    // Whether the host KVM supports Cap::ImmediateExit.
    pub has_immediate_exit: bool,
    // True when the split IRQ chip path was taken (max APIC ID > 254).
    pub(crate) split_irqchip: bool,
    // Construction options remembered for deferred memory allocation.
    use_hugepages: bool,
    performance_mode: bool,
}
impl KtstrKvm {
pub fn new(topo: Topology, memory_mb: u32, performance_mode: bool) -> Result<Self> {
Self::new_inner(topo, Some(memory_mb), false, performance_mode)
}
pub fn new_with_hugepages(
topo: Topology,
memory_mb: u32,
performance_mode: bool,
) -> Result<Self> {
Self::new_inner(topo, Some(memory_mb), true, performance_mode)
}
pub fn new_deferred(
topo: Topology,
use_hugepages: bool,
performance_mode: bool,
) -> Result<Self> {
Self::new_inner(topo, None, use_hugepages, performance_mode)
}
pub fn allocate_and_register_memory(&mut self, memory_mb: u32) -> Result<()> {
let mem_size = (memory_mb as u64) << 20;
let use_hugepages = self.use_hugepages
|| (self.performance_mode
&& crate::vmm::host_topology::hugepages_free()
>= crate::vmm::host_topology::hugepages_needed(memory_mb));
let guest_mem = if use_hugepages {
crate::vmm::allocate_hugepage_memory(mem_size as usize, GuestAddress(0))?
} else {
GuestMemoryMmap::<()>::from_ranges(&[(GuestAddress(0), mem_size as usize)])
.context("allocate guest memory")?
};
let host_addr = guest_mem
.get_host_address(GuestAddress(0))
.context("get host address for guest memory")? as u64;
let mem_region = kvm_userspace_memory_region {
slot: 0,
guest_phys_addr: 0,
memory_size: mem_size,
userspace_addr: host_addr,
flags: 0,
};
unsafe {
self.vm_fd
.set_user_memory_region(mem_region)
.context("set user memory region")?;
}
self.guest_mem = guest_mem;
Ok(())
}
fn new_inner(
topo: Topology,
memory_mb: Option<u32>,
use_hugepages: bool,
performance_mode: bool,
) -> Result<Self> {
let kvm = Kvm::new().context("open /dev/kvm")?;
for &cap in REQUIRED_CAPS {
anyhow::ensure!(
kvm.check_extension(cap),
"KVM missing required capability: {:?}",
cap
);
}
let has_immediate_exit = kvm.check_extension(Cap::ImmediateExit);
let vm_fd = {
let mut attempts = 0;
loop {
match kvm.create_vm() {
Ok(fd) => break fd,
Err(e) if e.errno() == libc::EINTR && attempts < 5 => {
attempts += 1;
std::thread::sleep(std::time::Duration::from_micros(1 << attempts));
}
Err(e) => return Err(e).context("create VM"),
}
}
};
vm_fd
.set_tss_address(KVM_TSS_ADDRESS as usize)
.context("set TSS")?;
vm_fd
.set_identity_map_address(KVM_IDENTITY_MAP_ADDRESS)
.context("set identity map address")?;
let max_apic_id = max_apic_id(&topo);
let split_irqchip = max_apic_id > MAX_XAPIC_ID;
if split_irqchip {
let mut cap = kvm_enable_cap {
cap: KVM_CAP_SPLIT_IRQCHIP,
..Default::default()
};
cap.args[0] = NUM_IOAPIC_PINS;
vm_fd.enable_cap(&cap).context("enable split IRQ chip")?;
let mut cap = kvm_enable_cap {
cap: KVM_CAP_X2APIC_API,
..Default::default()
};
cap.args[0] =
(KVM_X2APIC_API_USE_32BIT_IDS | KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) as u64;
vm_fd.enable_cap(&cap).context("enable x2APIC API")?;
} else {
vm_fd.create_irq_chip().context("create IRQ chip")?;
let pit_config = kvm_pit_config {
flags: KVM_PIT_SPEAKER_DUMMY,
..Default::default()
};
vm_fd.create_pit2(pit_config).context("create PIT")?;
}
if performance_mode {
let mut cap = kvm_enable_cap {
cap: KVM_CAP_X86_DISABLE_EXITS,
..Default::default()
};
cap.args[0] = KVM_X86_DISABLE_EXITS_PAUSE as u64;
if let Err(e) = vm_fd.enable_cap(&cap) {
eprintln!(
"performance_mode: WARNING: \
KVM_CAP_X86_DISABLE_EXITS (PAUSE) not supported: {e}"
);
}
cap.args[0] = KVM_X86_DISABLE_EXITS_HLT as u64;
if let Err(e) = vm_fd.enable_cap(&cap) {
eprintln!(
"performance_mode: WARNING: \
KVM_CAP_X86_DISABLE_EXITS (HLT) rejected: {e}"
);
}
}
if !performance_mode {
let host_cpus = unsafe { libc::sysconf(libc::_SC_NPROCESSORS_ONLN) };
let poll_ns: u64 = if host_cpus > 0 && topo.total_cpus() <= host_cpus as u32 {
HALT_POLL_NS
} else {
0
};
let mut cap = kvm_enable_cap {
cap: KVM_CAP_HALT_POLL,
..Default::default()
};
cap.args[0] = poll_ns;
if let Err(e) = vm_fd.enable_cap(&cap) {
eprintln!(
"kvm: WARNING: KVM_CAP_HALT_POLL not supported ({e}), using kernel default"
);
}
}
let guest_mem = match memory_mb {
Some(mb) => {
let mem_size = (mb as u64) << 20;
let mem = if use_hugepages {
crate::vmm::allocate_hugepage_memory(mem_size as usize, GuestAddress(0))?
} else {
GuestMemoryMmap::<()>::from_ranges(&[(GuestAddress(0), mem_size as usize)])
.context("allocate guest memory")?
};
let host_addr =
mem.get_host_address(GuestAddress(0))
.context("get host address for guest memory")? as u64;
let mem_region = kvm_userspace_memory_region {
slot: 0,
guest_phys_addr: 0,
memory_size: mem_size,
userspace_addr: host_addr,
flags: 0,
};
unsafe {
vm_fd
.set_user_memory_region(mem_region)
.context("set user memory region")?;
}
mem
}
None => {
GuestMemoryMmap::<()>::from_ranges(&[(GuestAddress(0), 4096)])
.context("allocate placeholder guest memory")?
}
};
let base_cpuid = kvm
.get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
.context("get_supported_cpuid")?;
let total = topo.total_cpus();
let mut vcpus = Vec::with_capacity(total as usize);
for cpu_id in 0..total {
let vcpu = vm_fd
.create_vcpu(cpu_id as u64)
.with_context(|| format!("create vCPU {cpu_id}"))?;
let cpuid_entries =
generate_cpuid(base_cpuid.as_slice(), &topo, cpu_id, performance_mode);
let cpuid = kvm_bindings::CpuId::from_entries(&cpuid_entries).context("build CpuId")?;
vcpu.set_cpuid2(&cpuid)
.with_context(|| format!("set CPUID for vCPU {cpu_id}"))?;
vcpus.push(vcpu);
}
if performance_mode {
match vm_fd.get_clock() {
Ok(clock) => {
let mut set_data = clock;
set_data.flags = 0;
if let Err(e) = vm_fd.set_clock(&set_data) {
eprintln!(
"performance_mode: WARNING: KVM_SET_CLOCK failed ({e}), \
cannot check TSC stability"
);
} else {
match vm_fd.get_clock() {
Ok(clock2) => {
if clock2.flags & KVM_CLOCK_TSC_STABLE == 0 {
eprintln!(
"performance_mode: WARNING: TSC not stable \
(KVM_CLOCK_TSC_STABLE not set), \
timing measurements may have higher variance \
(nested virt?)."
);
}
}
Err(e) => {
eprintln!(
"performance_mode: WARNING: KVM_GET_CLOCK failed ({e}), \
cannot check TSC stability"
);
}
}
}
}
Err(e) => {
eprintln!(
"performance_mode: WARNING: KVM_GET_CLOCK failed ({e}), \
cannot check TSC stability"
);
}
}
}
Ok(KtstrKvm {
kvm,
vm_fd,
vcpus,
guest_mem,
topology: topo,
has_immediate_exit,
split_irqchip,
use_hugepages,
performance_mode,
})
}
}
#[cfg(test)]
mod tests {
    // NOTE(review): these tests create real VMs, so they require a host with
    // /dev/kvm accessible to the test user.
    use super::*;
    // 1 LLC x 2 cores x 1 thread -> expect 2 vCPUs.
    #[test]
    fn create_vm_basic() {
        let topo = Topology {
            llcs: 1,
            cores_per_llc: 2,
            threads_per_core: 1,
            numa_nodes: 1,
        };
        let vm = KtstrKvm::new(topo, 128, false);
        assert!(vm.is_ok(), "VM creation failed: {:?}", vm.err());
        let vm = vm.unwrap();
        assert_eq!(vm.vcpus.len(), 2);
    }
    // 2 LLCs x 2 cores x 2 threads -> expect 8 vCPUs.
    #[test]
    fn create_vm_multi_llc() {
        let topo = Topology {
            llcs: 2,
            cores_per_llc: 2,
            threads_per_core: 2,
            numa_nodes: 1,
        };
        let vm = KtstrKvm::new(topo, 256, false);
        assert!(vm.is_ok(), "multi-LLC VM creation failed: {:?}", vm.err());
        let vm = vm.unwrap();
        assert_eq!(vm.vcpus.len(), 8);
    }
    // Degenerate 1x1x1 topology still works.
    #[test]
    fn create_vm_single_cpu() {
        let topo = Topology {
            llcs: 1,
            cores_per_llc: 1,
            threads_per_core: 1,
            numa_nodes: 1,
        };
        let vm = KtstrKvm::new(topo, 64, false);
        assert!(vm.is_ok());
        assert_eq!(vm.unwrap().vcpus.len(), 1);
    }
    // 4 x 4 x 2 -> 32 vCPUs.
    #[test]
    fn create_vm_large_topology() {
        let topo = Topology {
            llcs: 4,
            cores_per_llc: 4,
            threads_per_core: 2,
            numa_nodes: 1,
        };
        let vm = KtstrKvm::new(topo, 512, false);
        assert!(vm.is_ok(), "large topology failed: {:?}", vm.err());
        assert_eq!(vm.unwrap().vcpus.len(), 32);
    }
    // Non-power-of-two counts (3 x 3 x 1 -> 9 vCPUs) are handled too.
    #[test]
    fn create_vm_odd_topology() {
        let topo = Topology {
            llcs: 3,
            cores_per_llc: 3,
            threads_per_core: 1,
            numa_nodes: 1,
        };
        let vm = KtstrKvm::new(topo, 128, false);
        assert!(vm.is_ok(), "odd topology failed: {:?}", vm.err());
        assert_eq!(vm.unwrap().vcpus.len(), 9);
    }
    // The guest memory regions must sum to exactly the requested size.
    #[test]
    fn memory_size_correct() {
        use vm_memory::GuestMemoryRegion;
        let topo = Topology {
            llcs: 1,
            cores_per_llc: 1,
            threads_per_core: 1,
            numa_nodes: 1,
        };
        let vm = KtstrKvm::new(topo, 256, false).unwrap();
        let total: u64 = vm.guest_mem.iter().map(|r| r.len()).sum();
        assert_eq!(total, 256 << 20);
    }
    // Pin the TSS address to the value Firecracker uses.
    #[test]
    fn tss_address_matches_firecracker() {
        assert_eq!(KVM_TSS_ADDRESS, 0xfffb_d000);
    }
    // The identity map page must sit directly after the three TSS pages.
    #[test]
    fn identity_map_follows_tss() {
        assert_eq!(KVM_IDENTITY_MAP_ADDRESS, KVM_TSS_ADDRESS + 3 * 4096);
        assert_eq!(KVM_IDENTITY_MAP_ADDRESS, 0xfffc_0000);
    }
    // Guard against accidentally emptying the required-capability list.
    #[test]
    fn required_caps_non_empty() {
        assert!(!REQUIRED_CAPS.is_empty());
        assert!(REQUIRED_CAPS.len() >= 14);
    }
    // 2x4x2 = 16 CPUs: max APIC ID stays within xAPIC range, so the
    // in-kernel (full) IRQ chip must be selected.
    #[test]
    fn small_topology_uses_full_irqchip() {
        let topo = Topology {
            llcs: 2,
            cores_per_llc: 4,
            threads_per_core: 2,
            numa_nodes: 1,
        };
        assert!(max_apic_id(&topo) <= MAX_XAPIC_ID);
        let vm = KtstrKvm::new(topo, 256, false).unwrap();
        assert!(!vm.split_irqchip, "small topology should use full IRQ chip");
    }
    // 14x9x2 = 252 CPUs with APIC ID padding pushing the max ID past 254,
    // forcing the split IRQ chip. Skips (returns) instead of failing when
    // the host cannot create a 252-vCPU / 4 GiB VM.
    #[test]
    fn large_topology_uses_split_irqchip() {
        let topo = Topology {
            llcs: 14,
            cores_per_llc: 9,
            threads_per_core: 2,
            numa_nodes: 1,
        };
        assert!(
            max_apic_id(&topo) > MAX_XAPIC_ID,
            "max APIC ID {} should exceed {}",
            max_apic_id(&topo),
            MAX_XAPIC_ID,
        );
        let vm = match KtstrKvm::new(topo, 4096, false) {
            Ok(v) => v,
            Err(e) => {
                eprintln!("skipping large_topology VM creation: {e:#}");
                return;
            }
        };
        assert!(vm.split_irqchip, "large topology should use split IRQ chip");
        assert_eq!(vm.vcpus.len(), 252);
    }
    // Topologies sitting just under the 254 APIC-ID boundary must still
    // use the full IRQ chip.
    #[test]
    fn split_irqchip_boundary() {
        let small = Topology {
            llcs: 8,
            cores_per_llc: 8,
            threads_per_core: 2,
            numa_nodes: 1,
        };
        assert!(
            max_apic_id(&small) <= MAX_XAPIC_ID,
            "8l/8c/2t max APIC ID {} should be <= 254",
            max_apic_id(&small),
        );
        let vm = KtstrKvm::new(small, 2048, false).unwrap();
        assert!(!vm.split_irqchip);
        let still_small = Topology {
            llcs: 15,
            cores_per_llc: 8,
            threads_per_core: 2,
            numa_nodes: 1,
        };
        assert!(
            max_apic_id(&still_small) <= MAX_XAPIC_ID,
            "15l/8c/2t max APIC ID {} should be <= 254",
            max_apic_id(&still_small),
        );
        let vm = KtstrKvm::new(still_small, 4096, false).unwrap();
        assert!(!vm.split_irqchip);
    }
    // NOTE(review): assumes every supported host kernel has
    // KVM_CAP_IMMEDIATE_EXIT; fails on (very old) kernels without it.
    #[test]
    fn immediate_exit_cap_detected() {
        let topo = Topology {
            llcs: 1,
            cores_per_llc: 1,
            threads_per_core: 1,
            numa_nodes: 1,
        };
        let vm = KtstrKvm::new(topo, 64, false).unwrap();
        assert!(vm.has_immediate_exit);
    }
    // performance_mode tuning (disable-exits etc.) is best-effort and must
    // never make VM creation fail.
    #[test]
    fn performance_mode_succeeds() {
        let topo = Topology {
            llcs: 1,
            cores_per_llc: 2,
            threads_per_core: 1,
            numa_nodes: 1,
        };
        let vm = KtstrKvm::new(topo, 128, true);
        assert!(
            vm.is_ok(),
            "performance_mode VM creation failed: {:?}",
            vm.err()
        );
    }
    // Same topology with and without performance_mode yields the same
    // number of vCPUs.
    #[test]
    fn performance_mode_does_not_affect_vcpu_count() {
        let topo = Topology {
            llcs: 2,
            cores_per_llc: 2,
            threads_per_core: 2,
            numa_nodes: 1,
        };
        let vm_normal = KtstrKvm::new(topo, 256, false).unwrap();
        let vm_perf = KtstrKvm::new(topo, 256, true).unwrap();
        assert_eq!(vm_normal.vcpus.len(), vm_perf.vcpus.len());
    }
    // Pin the halt-poll budget (200 us) so changes are deliberate.
    #[test]
    fn halt_poll_ns_constant() {
        assert_eq!(HALT_POLL_NS, 200_000);
    }
    // The halt-poll configuration path (non-perf mode) must be non-fatal.
    #[test]
    fn non_perf_mode_succeeds_with_halt_poll() {
        let topo = Topology {
            llcs: 1,
            cores_per_llc: 2,
            threads_per_core: 1,
            numa_nodes: 1,
        };
        let vm = KtstrKvm::new(topo, 128, false);
        assert!(
            vm.is_ok(),
            "non-perf VM with halt poll failed: {:?}",
            vm.err()
        );
    }
    // Pin the kernel ABI bit value for HLT exit disabling.
    #[test]
    fn disable_exits_hlt_bit_value() {
        assert_eq!(KVM_X86_DISABLE_EXITS_HLT, 2);
    }
    // PAUSE and HLT are set via separate enable_cap calls, so their bits
    // must be distinct and non-overlapping.
    #[test]
    fn disable_exits_pause_and_hlt_no_overlap() {
        assert_ne!(
            KVM_X86_DISABLE_EXITS_PAUSE, KVM_X86_DISABLE_EXITS_HLT,
            "PAUSE and HLT bits must be distinct"
        );
        assert_eq!(
            KVM_X86_DISABLE_EXITS_PAUSE & KVM_X86_DISABLE_EXITS_HLT,
            0,
            "PAUSE and HLT bits must not overlap"
        );
    }
    // Exercises the same get/set/get clock sequence the constructor uses
    // for its TSC stability check; only verifies the ioctls succeed.
    #[test]
    fn tsc_stability_check_roundtrip() {
        let topo = Topology {
            llcs: 1,
            cores_per_llc: 2,
            threads_per_core: 1,
            numa_nodes: 1,
        };
        let vm = KtstrKvm::new(topo, 64, true).unwrap();
        let clock = vm.vm_fd.get_clock().unwrap();
        let mut set_data = clock;
        set_data.flags = 0;
        vm.vm_fd.set_clock(&set_data).unwrap();
        let clock2 = vm.vm_fd.get_clock().unwrap();
        let _ = clock2.flags & KVM_CLOCK_TSC_STABLE;
    }
    // HLT-exit disabling (attempted in performance mode) must not make
    // creation fail even if the host rejects it.
    #[test]
    fn performance_mode_with_hlt_disable_succeeds() {
        let topo = Topology {
            llcs: 1,
            cores_per_llc: 2,
            threads_per_core: 1,
            numa_nodes: 1,
        };
        let vm = KtstrKvm::new(topo, 128, true);
        assert!(
            vm.is_ok(),
            "performance_mode with HLT disable failed: {:?}",
            vm.err()
        );
    }
}