use anyhow::{Context, Result};
use kvm_bindings::{
KVM_ARM_VCPU_PMU_V3_CTRL, KVM_ARM_VCPU_PMU_V3_INIT, KVM_ARM_VCPU_PMU_V3_IRQ,
KVM_DEV_ARM_VGIC_CTRL_INIT, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_DEV_ARM_VGIC_GRP_CTRL,
KVM_DEV_ARM_VGIC_GRP_NR_IRQS, KVM_IRQ_ROUTING_IRQCHIP, KVM_VGIC_V3_ADDR_TYPE_DIST,
KVM_VGIC_V3_ADDR_TYPE_REDIST, KvmIrqRouting, kvm_create_device, kvm_device_attr,
kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_V3, kvm_irq_routing_entry,
kvm_irq_routing_entry__bindgen_ty_1, kvm_irq_routing_irqchip,
};
use kvm_ioctls::{Cap, DeviceFd, Kvm, VcpuFd, VmFd};
use std::mem::ManuallyDrop;
use vm_memory::{GuestAddress, GuestMemoryMmap};
use crate::vmm::numa_mem::{NumaMemoryLayout, ReservationGuard};
use crate::vmm::topology::Topology;
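// Guest physical memory map: all device MMIO windows sit below DRAM_START,
// so guest DRAM forms a single contiguous region starting at 0x4000_0000.
//   0x0800_0000  GICv3 distributor (64 KiB)
//   0x0801_0000  GICv3 redistributors (128 KiB per vCPU)
//   0x0900_0000  serial, serial2, then the virtio-mmio devices, back to back
//   0x4000_0000  DRAM; the kernel image is loaded at its base
// SPIs 33..=37 are assigned to the serial ports and virtio devices below.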
pub(crate) const DRAM_START: u64 = 0x4000_0000;
pub(crate) const GIC_DIST_BASE: u64 = 0x0800_0000;
pub(crate) const GIC_DIST_SIZE: u64 = 0x1_0000;
pub(crate) const GIC_REDIST_BASE: u64 = GIC_DIST_BASE + GIC_DIST_SIZE;
pub(crate) const GIC_REDIST_SIZE_PER_CPU: u64 = 0x2_0000;
pub(crate) const SERIAL_MMIO_BASE: u64 = 0x0900_0000;
pub(crate) const SERIAL_MMIO_SIZE: u64 = 0x1000;
pub(crate) const SERIAL2_MMIO_BASE: u64 = SERIAL_MMIO_BASE + SERIAL_MMIO_SIZE;
pub(crate) const VIRTIO_CONSOLE_MMIO_BASE: u64 = SERIAL2_MMIO_BASE + SERIAL_MMIO_SIZE;
pub(crate) const VIRTIO_CONSOLE_IRQ: u32 = 35;
pub(crate) const VIRTIO_BLK_MMIO_BASE: u64 =
VIRTIO_CONSOLE_MMIO_BASE + crate::vmm::virtio_console::VIRTIO_MMIO_SIZE;
pub(crate) const VIRTIO_BLK_IRQ: u32 = 36;
pub(crate) const VIRTIO_NET_MMIO_BASE: u64 =
VIRTIO_BLK_MMIO_BASE + crate::vmm::virtio_blk::VIRTIO_MMIO_SIZE;
pub(crate) const VIRTIO_NET_IRQ: u32 = 37;
pub(crate) const KERNEL_LOAD_ADDR: u64 = DRAM_START;
pub(crate) const FDT_MAX_SIZE: u64 = 0x20_0000;
pub(crate) const CMDLINE_MAX: usize = 4096;
pub(crate) const SERIAL_IRQ: u32 = 33;
pub(crate) const SERIAL2_IRQ: u32 = 34;
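// The PMU overflow interrupt is delivered as a PPI. PPI INTIDs occupy the
// range 16..32, so PPI 7 corresponds to INTID 23.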
pub(crate) const PMU_PPI: u32 = 7;
pub(crate) const PMU_INTID: u32 = PMU_PPI + 16;
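// Total IRQ lines exposed by the vGIC; must cover every SPI assigned above.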
const GIC_NR_IRQS: u32 = 128;
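/// A KVM virtual machine with an in-kernel ARM GICv3, ready to run vCPUs.
///
/// File descriptors and guest memory are wrapped in `ManuallyDrop` so that
/// `Drop` can release them in a controlled order (see the impl below).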
#[allow(dead_code)]
pub struct KtstrKvm {
pub kvm: ManuallyDrop<Kvm>,
pub vm_fd: ManuallyDrop<VmFd>,
pub vcpus: Vec<VcpuFd>,
pub guest_mem: ManuallyDrop<GuestMemoryMmap>,
pub topology: Topology,
pub(crate) numa_layout: Option<NumaMemoryLayout>,
pub has_immediate_exit: bool,
pub has_pmu: bool,
gic_fd: ManuallyDrop<DeviceFd>,
use_hugepages: bool,
performance_mode: bool,
_reservation: Option<ReservationGuard>,
pub(crate) cow_overlay_guards: Vec<crate::vmm::initramfs::CowOverlayGuard>,
}
impl Drop for KtstrKvm {
fn drop(&mut self) {
unsafe {
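            // Teardown order matters: vCPU fds first, then the GIC and VM
            // fds, then guest memory (unmapped before its reservation is
            // released), and finally the /dev/kvm handle.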
let vcpus = std::mem::take(&mut self.vcpus);
drop(vcpus);
ManuallyDrop::drop(&mut self.gic_fd);
ManuallyDrop::drop(&mut self.vm_fd);
ManuallyDrop::drop(&mut self.guest_mem);
let reservation = self._reservation.take();
drop(reservation);
let cow_guards = std::mem::take(&mut self.cow_overlay_guards);
drop(cow_guards);
ManuallyDrop::drop(&mut self.kvm);
}
}
}
impl KtstrKvm {
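    /// Create a fully initialized VM with `memory_mb` MiB of guest memory.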
pub fn new(topo: Topology, memory_mb: u32, performance_mode: bool) -> Result<Self> {
Self::new_inner(topo, Some(memory_mb), false, performance_mode)
}
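    /// Like [`Self::new`], but backs guest memory with huge pages.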
pub fn new_with_hugepages(
topo: Topology,
memory_mb: u32,
performance_mode: bool,
) -> Result<Self> {
Self::new_inner(topo, Some(memory_mb), true, performance_mode)
}
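    /// Create the VM with a one-page placeholder instead of real guest
    /// memory; the caller must invoke [`Self::allocate_and_register_memory`]
    /// before booting the guest.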
pub fn new_deferred(
topo: Topology,
use_hugepages: bool,
performance_mode: bool,
) -> Result<Self> {
Self::new_inner(topo, None, use_hugepages, performance_mode)
}
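    /// Replace the placeholder from [`Self::new_deferred`] with real guest
    /// memory, laid out and registered according to the NUMA topology.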
pub fn allocate_and_register_memory(&mut self, memory_mb: u32) -> Result<()> {
let layout = NumaMemoryLayout::compute(&self.topology, memory_mb, DRAM_START)?;
let alloc =
layout.allocate_and_register(&self.vm_fd, self.use_hugepages, self.performance_mode)?;
unsafe { ManuallyDrop::drop(&mut self.guest_mem) };
self.guest_mem = ManuallyDrop::new(alloc.guest_mem);
self._reservation = Some(alloc.reservation);
self.numa_layout = Some(layout);
Ok(())
}
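    /// Shared constructor: opens /dev/kvm, creates the VM, guest memory, and
    /// vCPUs, then sets up the GICv3, GSI routing, and (when supported) the PMU.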
fn new_inner(
topo: Topology,
memory_mb: Option<u32>,
use_hugepages: bool,
performance_mode: bool,
) -> Result<Self> {
let kvm = Kvm::new().context("open /dev/kvm")?;
let has_immediate_exit = kvm.check_extension(Cap::ImmediateExit);
let vm_fd = crate::vmm::create_vm_with_retry(&kvm)?;
let (guest_mem, numa_layout, reservation) = match memory_mb {
Some(mb) => {
let layout = NumaMemoryLayout::compute(&topo, mb, DRAM_START)?;
let alloc =
layout.allocate_and_register(&vm_fd, use_hugepages, performance_mode)?;
(alloc.guest_mem, Some(layout), Some(alloc.reservation))
}
None => {
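                // Deferred allocation: map a single page so the VM object is
                // valid; allocate_and_register_memory() installs the real layout.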
let placeholder =
GuestMemoryMmap::<()>::from_ranges(&[(GuestAddress(DRAM_START), 4096)])
.context("allocate placeholder guest memory")?;
(placeholder, None, None)
}
};
let total = topo.total_cpus();
let mut vcpus = Vec::with_capacity(total as usize);
let mut kvi = kvm_bindings::kvm_vcpu_init::default();
vm_fd
.get_preferred_target(&mut kvi)
.context("get preferred target")?;
kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
if vm_fd.check_extension(kvm_ioctls::Cap::ArmPtrAuthAddress) {
kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PTRAUTH_ADDRESS;
}
if vm_fd.check_extension(kvm_ioctls::Cap::ArmPtrAuthGeneric) {
kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PTRAUTH_GENERIC;
}
let pmu_supported = vm_fd.check_extension(Cap::ArmPmuV3);
if pmu_supported {
kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3;
}
for cpu_id in 0..total {
let vcpu = vm_fd.create_vcpu(cpu_id as u64).map_err(|e| {
crate::vmm::map_transient_to_contention(e, format!("create vCPU {cpu_id}"))
})?;
let mut vcpu_kvi = kvi;
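            // Secondary vCPUs start powered off; the guest brings them up via PSCI.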
if cpu_id != 0 {
vcpu_kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
}
vcpu.vcpu_init(&vcpu_kvi).map_err(|e| {
crate::vmm::map_transient_to_contention(e, format!("init vCPU {cpu_id}"))
})?;
vcpus.push(vcpu);
}
super::topology::override_clidr(&vcpus)
.context("override CLIDR_EL1 to match host cache topology")?;
let gic_fd = Self::create_gic(&vm_fd, total)?;
Self::setup_gsi_routing(&vm_fd)?;
if pmu_supported {
Self::init_pmuv3(&vcpus)?;
}
Ok(KtstrKvm {
kvm: ManuallyDrop::new(kvm),
vm_fd: ManuallyDrop::new(vm_fd),
vcpus,
guest_mem: ManuallyDrop::new(guest_mem),
topology: topo,
numa_layout,
has_immediate_exit,
has_pmu: pmu_supported,
gic_fd: ManuallyDrop::new(gic_fd),
use_hugepages,
performance_mode,
_reservation: reservation,
cow_overlay_guards: Vec::new(),
})
}
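    /// Create and initialize an in-kernel GICv3: set the IRQ count, place the
    /// distributor and redistributor regions, then issue the init control.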
fn create_gic(vm_fd: &VmFd, num_cpus: u32) -> Result<DeviceFd> {
let mut gic_device = kvm_create_device {
type_: kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_V3,
fd: 0,
flags: 0,
};
let gic_fd = vm_fd
.create_device(&mut gic_device)
.map_err(|e| crate::vmm::map_transient_to_contention(e, "create GICv3 device"))?;
let nr_irqs: u32 = GIC_NR_IRQS;
let nr_irqs_attr = kvm_device_attr {
group: KVM_DEV_ARM_VGIC_GRP_NR_IRQS,
attr: 0,
addr: &nr_irqs as *const u32 as u64,
flags: 0,
};
gic_fd
.set_device_attr(&nr_irqs_attr)
.context("set GIC nr_irqs")?;
let dist_addr: u64 = GIC_DIST_BASE;
let dist_attr = kvm_device_attr {
group: KVM_DEV_ARM_VGIC_GRP_ADDR,
attr: KVM_VGIC_V3_ADDR_TYPE_DIST as u64,
addr: &dist_addr as *const u64 as u64,
flags: 0,
};
gic_fd
.set_device_attr(&dist_attr)
.context("set GIC distributor address")?;
let redist_addr: u64 = GIC_REDIST_BASE;
let redist_size = num_cpus as u64 * GIC_REDIST_SIZE_PER_CPU;
anyhow::ensure!(
GIC_REDIST_BASE + redist_size <= DRAM_START,
"GIC redistributor region (ends at {:#x}) overlaps DRAM at {:#x} for {} CPUs",
GIC_REDIST_BASE + redist_size,
DRAM_START,
num_cpus,
);
let redist_attr = kvm_device_attr {
group: KVM_DEV_ARM_VGIC_GRP_ADDR,
attr: KVM_VGIC_V3_ADDR_TYPE_REDIST as u64,
addr: &redist_addr as *const u64 as u64,
flags: 0,
};
gic_fd
.set_device_attr(&redist_attr)
.context("set GIC redistributor address")?;
let init_attr = kvm_device_attr {
group: KVM_DEV_ARM_VGIC_GRP_CTRL,
attr: KVM_DEV_ARM_VGIC_CTRL_INIT as u64,
addr: 0,
flags: 0,
};
gic_fd
.set_device_attr(&init_attr)
.map_err(|e| crate::vmm::map_transient_to_contention(e, "init GIC device"))?;
Ok(gic_fd)
}
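    /// Route each device GSI to the in-kernel irqchip. SPI INTIDs begin at 32,
    /// so the routing pin is the INTID minus 32.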
fn setup_gsi_routing(vm_fd: &VmFd) -> Result<()> {
let irqs = [
SERIAL_IRQ,
SERIAL2_IRQ,
VIRTIO_CONSOLE_IRQ,
VIRTIO_BLK_IRQ,
VIRTIO_NET_IRQ,
];
let mut routing = KvmIrqRouting::new(irqs.len()).context("create KvmIrqRouting")?;
for (i, &irq) in irqs.iter().enumerate() {
routing.as_mut_slice()[i] = kvm_irq_routing_entry {
gsi: irq,
type_: KVM_IRQ_ROUTING_IRQCHIP,
flags: 0,
pad: 0,
u: kvm_irq_routing_entry__bindgen_ty_1 {
irqchip: kvm_irq_routing_irqchip {
                        irqchip: 0,
                        pin: irq - 32,
                    },
},
};
}
vm_fd.set_gsi_routing(&routing).map_err(|e| {
            crate::vmm::map_transient_to_contention(e, "set GSI routing for device IRQs")
})?;
Ok(())
}
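    /// Initialize the PMUv3 on every vCPU: set the overflow interrupt first,
    /// then issue the INIT attribute. KVM requires the IRQ to be set before
    /// init, and the vGIC must already be initialized.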
fn init_pmuv3(vcpus: &[VcpuFd]) -> Result<()> {
for (cpu_id, vcpu) in vcpus.iter().enumerate() {
let irq: u32 = PMU_INTID;
let irq_attr = kvm_device_attr {
group: KVM_ARM_VCPU_PMU_V3_CTRL,
attr: KVM_ARM_VCPU_PMU_V3_IRQ as u64,
addr: &irq as *const u32 as u64,
flags: 0,
};
vcpu.set_device_attr(&irq_attr)
.with_context(|| format!("set PMU IRQ on vcpu {cpu_id}"))?;
let init_attr = kvm_device_attr {
group: KVM_ARM_VCPU_PMU_V3_CTRL,
attr: KVM_ARM_VCPU_PMU_V3_INIT as u64,
addr: 0,
flags: 0,
};
vcpu.set_device_attr(&init_attr)
.with_context(|| format!("init PMU on vcpu {cpu_id}"))?;
}
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
use vm_memory::GuestMemory;
#[test]
fn create_vm_basic() {
let topo = Topology {
llcs: 1,
cores_per_llc: 2,
threads_per_core: 1,
numa_nodes: 1,
nodes: None,
distances: None,
};
let vm = KtstrKvm::new(topo, 128, false);
assert!(vm.is_ok(), "VM creation failed: {:?}", vm.err());
let vm = vm.unwrap();
assert_eq!(vm.vcpus.len(), 2);
}
#[test]
fn create_vm_multi_llc() {
let topo = Topology {
llcs: 2,
cores_per_llc: 2,
threads_per_core: 2,
numa_nodes: 1,
nodes: None,
distances: None,
};
let vm = KtstrKvm::new(topo, 256, false);
assert!(vm.is_ok(), "multi-LLC VM creation failed: {:?}", vm.err());
let vm = vm.unwrap();
assert_eq!(vm.vcpus.len(), 8);
}
#[test]
fn create_vm_single_cpu() {
let topo = Topology {
llcs: 1,
cores_per_llc: 1,
threads_per_core: 1,
numa_nodes: 1,
nodes: None,
distances: None,
};
let vm = KtstrKvm::new(topo, 64, false);
assert!(vm.is_ok());
assert_eq!(vm.unwrap().vcpus.len(), 1);
}
#[test]
fn memory_size_correct() {
use vm_memory::GuestMemoryRegion;
let topo = Topology {
llcs: 1,
cores_per_llc: 1,
threads_per_core: 1,
numa_nodes: 1,
nodes: None,
distances: None,
};
let vm = KtstrKvm::new(topo, 256, false).unwrap();
let total: u64 = vm.guest_mem.iter().map(|r| r.len()).sum();
assert_eq!(total, 256 << 20);
}
#[test]
fn memory_starts_at_dram() {
use vm_memory::GuestMemoryRegion;
let topo = Topology {
llcs: 1,
cores_per_llc: 1,
threads_per_core: 1,
numa_nodes: 1,
nodes: None,
distances: None,
};
let vm = KtstrKvm::new(topo, 64, false).unwrap();
let region = vm.guest_mem.iter().next().unwrap();
assert_eq!(
region.start_addr(),
GuestAddress(DRAM_START),
"guest memory must start at DRAM_START"
);
}
#[test]
fn immediate_exit_cap_detected() {
let topo = Topology {
llcs: 1,
cores_per_llc: 1,
threads_per_core: 1,
numa_nodes: 1,
nodes: None,
distances: None,
};
let vm = KtstrKvm::new(topo, 64, false).unwrap();
assert!(vm.has_immediate_exit);
}
#[test]
fn gic_redist_does_not_overlap_dram() {
let max_cpus = (DRAM_START - GIC_REDIST_BASE) / GIC_REDIST_SIZE_PER_CPU;
assert!(
max_cpus >= 128,
"layout should support at least 128 vCPUs, got {max_cpus}"
);
}
#[test]
fn devices_below_dram() {
const { assert!(GIC_DIST_BASE < DRAM_START) };
const { assert!(GIC_REDIST_BASE < DRAM_START) };
const { assert!(SERIAL_MMIO_BASE < DRAM_START) };
const { assert!(SERIAL2_MMIO_BASE < DRAM_START) };
const { assert!(VIRTIO_CONSOLE_MMIO_BASE < DRAM_START) };
const {
assert!(
VIRTIO_CONSOLE_MMIO_BASE + crate::vmm::virtio_console::VIRTIO_MMIO_SIZE
<= DRAM_START
)
};
const { assert!(VIRTIO_BLK_MMIO_BASE < DRAM_START) };
        const {
            assert!(VIRTIO_BLK_MMIO_BASE + crate::vmm::virtio_blk::VIRTIO_MMIO_SIZE <= DRAM_START)
        };
const { assert!(VIRTIO_NET_MMIO_BASE < DRAM_START) };
        const {
            assert!(VIRTIO_NET_MMIO_BASE + crate::vmm::virtio_net::VIRTIO_MMIO_SIZE <= DRAM_START)
        };
}
#[test]
fn pmu_ppi_in_ppi_namespace() {
const { assert!(PMU_PPI < 16) };
}
#[test]
fn pmu_intid_in_ppi_intid_range() {
const { assert!(PMU_INTID >= 16) };
const { assert!(PMU_INTID < 32) };
}
#[test]
fn pmu_intid_offset_from_ppi() {
const { assert!(PMU_INTID == PMU_PPI + 16) };
}
}