use anyhow::{Context, Result};
use std::path::PathBuf;
use std::sync::Arc;
use std::thread::JoinHandle;
use std::time::Instant;
use vm_memory::{Bytes, GuestAddress, GuestMemory, GuestMemoryMmap};
use super::KtstrVm;
use super::initramfs_cache::{BaseKey, BaseRef, get_or_build_base, get_or_compress_base_shm};
use super::memory_budget::{MemoryBudget, initramfs_min_memory_mib, read_kernel_init_size};
use super::pi_mutex::PiMutex;
use super::{disk_config, disk_template, host_topology, initramfs, virtio_blk, virtio_net};
#[cfg(target_arch = "aarch64")]
use super::aarch64;
#[cfg(target_arch = "aarch64")]
use super::aarch64::boot;
#[cfg(target_arch = "aarch64")]
use super::aarch64::kvm;
#[cfg(target_arch = "x86_64")]
use super::virtio_console;
#[cfg(target_arch = "x86_64")]
use super::x86_64::{acpi, boot, kvm, mptable};
#[cfg(target_arch = "x86_64")]
const INITRD_ADDR: u64 = 0x800_0000;
/// Chooses the guest-physical address at which the compressed initrd is loaded
/// on aarch64.
///
/// The address is `pvtime_base(memory_mib, total_cpus) - initrd_max_size`,
/// rounded down to a host-page boundary.
///
/// # Errors
///
/// Returns an error when `initrd_max_size` is larger than the PVTIME base
/// address, or when the aligned address falls below `kvm::DRAM_START`.
#[cfg(target_arch = "aarch64")]
fn aarch64_initrd_addr(memory_mib: u32, total_cpus: u32, initrd_max_size: u64) -> Result<u64> {
    let ceiling = aarch64::fdt::pvtime_base(memory_mib, total_cpus);
    let page_mask = !(host_page_size() - 1);

    // Guard: the initrd must fit below the ceiling at all.
    let Some(unaligned_top) = ceiling.checked_sub(initrd_max_size) else {
        anyhow::bail!(
            "compressed initrd ({initrd_max_size} bytes) exceeds the \
             RAM span below the PVTIME carve (pvtime_base={ceiling:#x}): \
             reduce initramfs size or increase VM memory"
        );
    };

    // Round down so the load address is host-page aligned.
    let load_addr = unaligned_top & page_mask;
    if load_addr < kvm::DRAM_START {
        anyhow::bail!(
            "initrd load address {load_addr:#x} underflows DRAM_START {:#x} \
             (compressed initrd {initrd_max_size} bytes, pvtime_base {ceiling:#x}): \
             reduce initramfs size or increase VM memory",
            kvm::DRAM_START,
        );
    }
    Ok(load_addr)
}
/// Returns the host page size in bytes.
///
/// The value is read once through `sysconf(_SC_PAGESIZE)` and cached for all
/// later calls. If `sysconf` reports a non-positive value, `0x1000` is used.
#[allow(dead_code)]
pub(crate) fn host_page_size() -> u64 {
    static PAGE_SIZE: std::sync::OnceLock<u64> = std::sync::OnceLock::new();
    let query = || {
        // SAFETY: `sysconf` takes a plain integer selector and returns an
        // integer; no pointers are passed.
        let reported = unsafe { libc::sysconf(libc::_SC_PAGESIZE) };
        match reported {
            n if n > 0 => n as u64,
            _ => 0x1000,
        }
    };
    *PAGE_SIZE.get_or_init(query)
}
/// Builds the kernel-cmdline tokens that describe how the guest should
/// auto-mount disk 0.
///
/// Returns an empty string for a `Raw` filesystem or when `no_auto_mount` is
/// set. Otherwise the result starts with a space and carries
/// `KTSTR_DISK0_FS`, `KTSTR_DISK0_MOUNT`, and — for read-only disks —
/// `KTSTR_DISK0_RO=1`.
#[allow(dead_code)]
pub(crate) fn disk_auto_mount_cmdline_tokens(disk: &disk_config::DiskConfig) -> String {
    let auto_mount_disabled =
        disk.filesystem == disk_config::Filesystem::Raw || disk.no_auto_mount;
    if auto_mount_disabled {
        String::new()
    } else {
        let fs_tag = disk.filesystem.cache_tag();
        let mount_path = disk.auto_mount_path();
        let mut tokens = format!(" KTSTR_DISK0_FS={} KTSTR_DISK0_MOUNT={}", fs_tag, mount_path,);
        if disk.read_only {
            tokens.push_str(" KTSTR_DISK0_RO=1");
        }
        tokens
    }
}
/// Returns the kernel command line shared by every architecture.
///
/// The result is the common parameter list, then a space, then `arch_extra`
/// verbatim, then ` KTSTR_GUEST=1`. Callers append further tokens.
fn base_guest_cmdline(arch_extra: &str) -> String {
    const COMMON: &str = "console=ttyS0 nomodules mitigations=off random.trust_cpu=on \
        swiotlb=noforce panic=-1 lockdown=none \
        sysctl.kernel.unprivileged_bpf_disabled=0 \
        sysctl.kernel.sched_schedstats=1 delayacct \
        sysctl.kernel.task_delayacct=1 sysctl.vm.overcommit_memory=1";
    const GUEST_MARKER: &str = " KTSTR_GUEST=1";

    let mut cmdline =
        String::with_capacity(COMMON.len() + 1 + arch_extra.len() + GUEST_MARKER.len());
    cmdline.push_str(COMMON);
    cmdline.push(' ');
    cmdline.push_str(arch_extra);
    cmdline.push_str(GUEST_MARKER);
    cmdline
}
/// Assembles the list of extra `(archive name, host path)` entries and the
/// cache key for the base initramfs.
///
/// Extras are emitted in this order: the scheduler (as `"scheduler"`), the
/// probe (as `"bin/ktstr-jemalloc-probe"`), then one entry per staged
/// scheduler named by the co-indexed `staged_extras_names` element.
///
/// The key is built with `BaseKey::new_shell` when busybox bytes are present,
/// when `merged_includes` is non-empty, or when `has_jemalloc_extras` is set;
/// otherwise with `BaseKey::new`.
///
/// `staged_schedulers` and `staged_extras_names` must have equal length; this
/// is only checked in debug builds.
///
/// # Errors
///
/// Propagates any error from the `BaseKey` constructor.
#[allow(clippy::too_many_arguments)]
pub(crate) fn assemble_extras_and_key<'a>(
    payload: &'a std::path::Path,
    scheduler: Option<&'a std::path::Path>,
    probe: Option<&'a std::path::Path>,
    worker: Option<&'a std::path::Path>,
    staged_schedulers: &'a [crate::vmm::builder::StagedScheduler],
    staged_extras_names: &'a [String],
    merged_includes: &'a [(String, PathBuf)],
    busybox_bytes: Option<&[u8]>,
    has_jemalloc_extras: bool,
) -> Result<(Vec<(&'a str, &'a std::path::Path)>, BaseKey)> {
    debug_assert_eq!(
        staged_schedulers.len(),
        staged_extras_names.len(),
        "staged_schedulers and staged_extras_names must be co-indexed; \
         caller mis-built the extras-names slice"
    );

    let mut extras: Vec<(&str, &std::path::Path)> = Vec::new();
    extras.extend(scheduler.map(|path| ("scheduler", path)));
    extras.extend(probe.map(|path| ("bin/ktstr-jemalloc-probe", path)));
    extras.extend(
        staged_schedulers
            .iter()
            .enumerate()
            .map(|(idx, staged)| (staged_extras_names[idx].as_str(), staged.binary.as_path())),
    );

    // The key identifies staged schedulers by their own name, not by the
    // archive path used in `extras`.
    let staged_for_key: Vec<(&str, &std::path::Path)> = staged_schedulers
        .iter()
        .map(|staged| (staged.name.as_str(), staged.binary.as_path()))
        .collect();

    let shell_mode = busybox_bytes.is_some() || !merged_includes.is_empty() || has_jemalloc_extras;
    let key = if !shell_mode {
        BaseKey::new(payload, scheduler, probe, worker, &staged_for_key)?
    } else {
        BaseKey::new_shell(
            payload,
            scheduler,
            probe,
            worker,
            &staged_for_key,
            merged_includes,
            busybox_bytes,
        )?
    };
    Ok((extras, key))
}
impl KtstrVm {
/// Builds the virtio-blk device for the first configured disk and registers
/// its interrupt eventfd with the VM.
///
/// Returns `Ok(None)` when `self.disks` is empty. Only `self.disks[0]` is
/// read by this function.
///
/// # Errors
///
/// Fails when the disk throttle does not validate, when the backing file
/// cannot be opened, created, resized or cloned, or when irqfd registration
/// fails.
pub(super) fn init_virtio_blk(
&self,
vm: &kvm::KtstrKvm,
) -> Result<Option<Arc<PiMutex<virtio_blk::VirtioBlk>>>> {
if self.disks.is_empty() {
return Ok(None);
}
let disk = &self.disks[0];
let capacity = disk.capacity_bytes();
// Reject an invalid throttle before any backing file is touched.
disk.throttle
.validate()
.map_err(|e| anyhow::anyhow!(e).context("invalid disk throttle"))?;
// Pick the backing file. A template staging image, when present, takes
// precedence over the per-filesystem default below.
let backing = if let Some(staging) = self.template_staging_image.as_ref() {
let f = std::fs::OpenOptions::new()
.read(true)
.write(true)
.open(staging)
.with_context(|| {
format!(
"open template staging image {} for virtio-blk",
staging.display(),
)
})?;
// Resize the staging image to exactly the disk capacity.
f.set_len(capacity)
.context("set template staging image length to capacity")?;
f
} else {
match disk.filesystem {
disk_config::Filesystem::Raw => {
// Raw disks are backed by an anonymous temp file sized to capacity.
let f = tempfile::tempfile()
.context("create virtio-blk sparse temp backing file")?;
f.set_len(capacity)
.context("set virtio-blk backing file length")?;
f
}
disk_config::Filesystem::Btrfs => {
let template =
disk_template::ensure_template(disk_config::Filesystem::Btrfs, capacity)
.context("ensure btrfs disk template")?;
let cache_root = disk_template::cache_root()
.context("resolve disk-template cache root for per-test clone")?;
std::fs::create_dir_all(&cache_root)
.with_context(|| format!("create cache root {cache_root:?}"))?;
// Per-test file name built from the pid, a nanosecond timestamp
// (0 if the clock is before the epoch) and a random u64.
let dest = cache_root.join(format!(
".per-test-{pid}-{ns:x}-{rnd:x}.img",
pid = std::process::id(),
ns = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.map(|d| d.as_nanos())
.unwrap_or(0),
rnd = rand::random::<u64>(),
));
let f = disk_template::clone_to_per_test(&template, &dest)
.context("FICLONE template into per-test backing")?;
// Unlink the path right away and keep using the open handle.
// A failed unlink is only logged; the device is still created.
if let Err(e) = std::fs::remove_file(&dest) {
tracing::warn!(
path = %dest.display(),
error = %e,
"failed to unlink per-test btrfs backing after \
FICLONE; the open File still backs the device, \
but the leftover path will accumulate in the \
cache directory until manual cleanup or the \
next disk-template cache GC pass."
);
}
f
}
}
};
let mut blk =
virtio_blk::VirtioBlk::with_options(backing, capacity, disk.throttle, disk.read_only);
// Worker placement is taken from the optional pinning and no-perf plans.
let placement = virtio_blk::WorkerPlacement {
service_cpu: self.pinning_plan.as_ref().and_then(|p| p.service_cpu),
no_perf_cpus: self.no_perf_plan.as_ref().map(|p| p.cpus.clone()),
};
blk.set_worker_placement(placement);
// The device receives its own clone of the guest memory handle.
blk.set_mem((*vm.guest_mem).clone());
let blk_arc = Arc::new(PiMutex::new(blk));
// Register the device's interrupt eventfd on the virtio-blk IRQ line.
vm.vm_fd
.register_irqfd(blk_arc.lock().irq_evt(), kvm::VIRTIO_BLK_IRQ)
.context("register virtio-blk irqfd")?;
Ok(Some(blk_arc))
}
/// Builds the virtio-net device and registers its interrupt eventfd.
///
/// Returns `Ok(None)` when no network configuration is set on `self`.
///
/// # Errors
///
/// Fails when irqfd registration on `kvm::VIRTIO_NET_IRQ` fails.
pub(super) fn init_virtio_net(
    &self,
    vm: &kvm::KtstrKvm,
) -> Result<Option<Arc<PiMutex<virtio_net::VirtioNet>>>> {
    match self.network {
        None => Ok(None),
        Some(cfg) => {
            let mut net = virtio_net::VirtioNet::new(cfg);
            // The device receives its own clone of the guest memory handle.
            net.set_mem((*vm.guest_mem).clone());
            let shared = Arc::new(PiMutex::new(net));
            vm.vm_fd
                .register_irqfd(shared.lock().irq_evt(), kvm::VIRTIO_NET_IRQ)
                .context("register virtio-net irqfd")?;
            Ok(Some(shared))
        }
    }
}
/// Creates the KVM VM and, when the memory size is already known, loads the
/// kernel into guest memory.
///
/// With `self.memory_mib` set, the VM is created with that much memory
/// (using hugepages in performance mode when enough are free), NUMA binding
/// is applied if requested, and the kernel load result is returned as
/// `Some`. With `self.memory_mib` unset, the VM is created in deferred mode
/// and `None` is returned in place of the kernel load result.
///
/// # Errors
///
/// Fails when VM creation or kernel loading fails.
///
/// # Panics
///
/// Panics if `numa_layout` is unset on the non-deferred path while NUMA
/// binding is requested.
pub(super) fn create_vm_and_load_kernel(
    &self,
) -> Result<(kvm::KtstrKvm, Option<boot::KernelLoadResult>)> {
    let create_timer = Instant::now();

    // Hugepages are only considered in performance mode with a known size.
    let use_hugepages = match self.memory_mib {
        Some(mib) if self.performance_mode => {
            host_topology::hugepages_free() >= host_topology::hugepages_needed(mib)
        }
        _ => false,
    };

    let vm = match (self.memory_mib, use_hugepages) {
        (Some(mib), true) => {
            kvm::KtstrKvm::new_with_hugepages(self.topology, mib, self.performance_mode)
                .context("create VM with hugepages")?
        }
        (Some(mib), false) => kvm::KtstrKvm::new(self.topology, mib, self.performance_mode)
            .context("create VM")?,
        (None, _) => {
            kvm::KtstrKvm::new_deferred(self.topology, use_hugepages, self.performance_mode)
                .context("create VM (deferred memory)")?
        }
    };
    tracing::debug!(elapsed_us = create_timer.elapsed().as_micros(), "kvm_create");

    // Deferred mode: memory is not allocated yet, so the kernel cannot be loaded.
    if self.memory_mib.is_none() {
        return Ok((vm, None));
    }

    let wants_mbind = self.performance_mode && !self.mbind_node_map.is_empty();
    if wants_mbind {
        let layout = vm.numa_layout.as_ref().expect(
            "numa_layout is Some on the non-deferred allocation path: \
             allocate_and_register_memory ran during `vm_new` because \
             memory_mib was provided up front, and that call sets \
             numa_layout to Some(...) in src/vmm/{x86_64,aarch64}/kvm.rs",
        );
        layout.mbind_regions(&vm.guest_mem, &self.mbind_node_map);
    }

    let load_timer = Instant::now();
    let loaded = boot::load_kernel(&vm.guest_mem, &self.kernel).context("load kernel")?;
    tracing::debug!(elapsed_us = load_timer.elapsed().as_micros(), "load_kernel");
    Ok((vm, Some(loaded)))
}
pub(super) fn spawn_initramfs_resolve(&self) -> Option<JoinHandle<Result<(BaseRef, BaseKey)>>> {
let bin = self.init_binary.as_ref()?;
let payload = bin.clone();
let scheduler = self.scheduler_binary.clone();
let probe = self.jemalloc_probe_binary.clone();
let worker = self.jemalloc_alloc_worker_binary.clone();
let include_files = self.include_files.clone();
let staged_schedulers = self.staged_schedulers.clone();
let busybox_bytes = self.busybox_bytes.clone();
#[cfg(feature = "wprof")]
let wprof_host_path: Option<PathBuf> = self.wprof.as_ref().map(|w| w.host_path.clone());
std::thread::Builder::new()
.name("initramfs-resolve".into())
.spawn(move || -> Result<(BaseRef, BaseKey)> {
let staged_extras_names: Vec<String> = staged_schedulers
.iter()
.map(|s| {
format!(
"{}/scheduler",
crate::test_support::staged::staged_scheduler_archive_dir(&s.name),
)
})
.collect();
let has_jemalloc_extras = probe.as_deref().is_some() || worker.as_deref().is_some();
let mut merged_includes: Vec<(String, PathBuf)> = include_files.clone();
if let Some(w) = worker.as_deref() {
merged_includes.push((
"bin/ktstr-jemalloc-alloc-worker".to_string(),
w.to_path_buf(),
));
}
#[cfg(feature = "wprof")]
if let Some(wprof_path) = wprof_host_path.as_deref() {
merged_includes.push(("bin/wprof".to_string(), wprof_path.to_path_buf()));
}
let (extras, key) = assemble_extras_and_key(
&payload,
scheduler.as_deref(),
probe.as_deref(),
worker.as_deref(),
&staged_schedulers,
&staged_extras_names,
&merged_includes,
busybox_bytes.as_deref(),
has_jemalloc_extras,
)?;
let include_refs: Vec<(&str, &std::path::Path)> = merged_includes
.iter()
.map(|(a, p)| (a.as_str(), p.as_path()))
.collect();
let base =
get_or_build_base(&payload, &extras, &include_refs, busybox_bytes, &key)?;
Ok((base, key))
})
.ok()
}
fn compress_and_load_initrd(
&self,
vm: &mut kvm::KtstrKvm,
base_bytes: &[u8],
suffix: &[u8],
key: &BaseKey,
load_addr: u64,
) -> Result<u32> {
let uncompressed_size = base_bytes.len() + suffix.len();
let t0 = Instant::now();
let lz4_base = self.get_or_compress_base(base_bytes, key)?;
let lz4_suffix = initramfs::lz4_legacy_compress(suffix);
let total_compressed = lz4_base.len() + lz4_suffix.len();
tracing::debug!(
elapsed_us = t0.elapsed().as_micros(),
uncompressed = uncompressed_size,
lz4_base = lz4_base.len(),
lz4_suffix = lz4_suffix.len(),
ratio = format!("{:.1}x", uncompressed_size as f64 / total_compressed as f64),
"lz4_initramfs",
);
tracing::debug!(
base_magic = format!(
"{:02x}{:02x}{:02x}{:02x}",
lz4_base[0], lz4_base[1], lz4_base[2], lz4_base[3]
),
suffix_magic = format!(
"{:02x}{:02x}{:02x}{:02x}",
lz4_suffix[0], lz4_suffix[1], lz4_suffix[2], lz4_suffix[3]
),
base_len = lz4_base.len(),
suffix_len = lz4_suffix.len(),
total = total_compressed,
load_addr = format!("{:#x}", load_addr),
suffix_addr = format!("{:#x}", load_addr + lz4_base.len() as u64),
"initrd_load_debug",
);
let t0 = Instant::now();
let cow_guard = self.try_cow_overlay(&vm.guest_mem, key, lz4_base.len(), load_addr);
let cow_active = cow_guard.is_some();
if let Some(guard) = cow_guard {
vm.cow_overlay_guards.push(guard);
}
if cow_active {
vm.guest_mem
.write_slice(&lz4_suffix, GuestAddress(load_addr + lz4_base.len() as u64))
.context("write lz4 suffix after COW base")?;
tracing::debug!(
elapsed_us = t0.elapsed().as_micros(),
cow = true,
"initrd_write"
);
} else {
initramfs::load_initramfs_parts(&vm.guest_mem, &[&lz4_base, &lz4_suffix], load_addr)?;
tracing::debug!(
elapsed_us = t0.elapsed().as_micros(),
cow = false,
"initrd_write"
);
}
let mut check_buf = [0u8; 8];
vm.guest_mem
.read_slice(&mut check_buf, GuestAddress(load_addr))
.context("read-back initrd check")?;
tracing::debug!(
first_8 = format!(
"{:02x}{:02x}{:02x}{:02x}{:02x}{:02x}{:02x}{:02x}",
check_buf[0],
check_buf[1],
check_buf[2],
check_buf[3],
check_buf[4],
check_buf[5],
check_buf[6],
check_buf[7]
),
expected_magic = "02214c18",
"initrd_verify",
);
Ok(total_compressed as u32)
}
/// Waits for the initramfs-resolve thread, checks that the configured VM
/// memory is large enough, and loads the compressed initrd at `load_addr`
/// (x86_64, memory size known up front).
///
/// Returns `(Some(load_addr), Some(compressed_size))`.
///
/// # Errors
///
/// Fails when the resolver thread panicked or returned an error, when suffix
/// construction fails, when `self.memory_mib` is below the computed minimum,
/// or when loading the initrd fails.
///
/// # Panics
///
/// Panics when `self.memory_mib` is `None` (deferred mode).
#[cfg(target_arch = "x86_64")]
fn join_and_load_initramfs(
    &self,
    vm: &mut kvm::KtstrKvm,
    handle: JoinHandle<Result<(BaseRef, BaseKey)>>,
    load_addr: u64,
) -> Result<(Option<u64>, Option<u32>)> {
    let join_timer = Instant::now();
    let (base, key) = match handle.join() {
        Ok(resolved) => resolved?,
        Err(_) => anyhow::bail!("initramfs-resolve thread panicked"),
    };
    tracing::debug!(elapsed_us = join_timer.elapsed().as_micros(), "initramfs_join");

    let base_bytes: &[u8] = base.as_ref();
    let suffix_timer = Instant::now();
    let suffix = initramfs::build_suffix(base_bytes.len(), &self.suffix_params())?;
    let uncompressed_size = base_bytes.len() + suffix.len();
    tracing::debug!(
        elapsed_us = suffix_timer.elapsed().as_micros(),
        base_bytes = base_bytes.len(),
        suffix_bytes = suffix.len(),
        "build_suffix",
    );

    let memory_mib = self.memory_mib.expect(
        "join_and_load_initramfs called in deferred mode; \
         use join_compute_memory_and_load instead",
    );

    // Compress once here purely to size the memory budget.
    let compressed_base = self.get_or_compress_base(base_bytes, &key)?;
    let compressed_suffix = initramfs::lz4_legacy_compress(&suffix);
    let compressed_size = compressed_base.len() + compressed_suffix.len();
    // An unreadable init size is treated as 0.
    let kernel_init_size = read_kernel_init_size(&self.kernel).unwrap_or(0) as u64;
    let min_mib = initramfs_min_memory_mib(&MemoryBudget {
        uncompressed_initramfs_bytes: uncompressed_size as u64,
        compressed_initrd_bytes: compressed_size as u64,
        kernel_init_size,
    });
    anyhow::ensure!(
        memory_mib >= min_mib,
        "VM memory {}MiB insufficient for initramfs \
         (uncompressed={}MiB, compressed={}MiB, \
         init_size={}MiB): need {}MiB",
        memory_mib,
        uncompressed_size >> 20,
        compressed_size >> 20,
        kernel_init_size >> 20,
        min_mib,
    );

    self.compress_and_load_initrd(vm, base_bytes, &suffix, &key, load_addr)
        .map(|size| (Some(load_addr), Some(size)))
}
/// Waits for the initramfs-resolve thread, derives the VM memory size from
/// the initramfs, allocates that memory, and loads the compressed initrd at
/// `load_addr` (x86_64, deferred memory).
///
/// The memory size is the larger of `initramfs_min_memory_mib` for the
/// measured sizes and `self.memory_min_mib`.
///
/// Returns `(Some(load_addr), Some(compressed_size), memory_mib)`.
///
/// # Errors
///
/// Fails when the resolver thread panicked or returned an error, when suffix
/// construction fails, when memory allocation fails, or when loading the
/// initrd fails.
#[cfg(target_arch = "x86_64")]
fn join_compute_memory_and_load(
    &self,
    vm: &mut kvm::KtstrKvm,
    handle: JoinHandle<Result<(BaseRef, BaseKey)>>,
    load_addr: u64,
) -> Result<(Option<u64>, Option<u32>, u32)> {
    let join_timer = Instant::now();
    let (base, key) = match handle.join() {
        Ok(resolved) => resolved?,
        Err(_) => anyhow::bail!("initramfs-resolve thread panicked"),
    };
    tracing::debug!(elapsed_us = join_timer.elapsed().as_micros(), "initramfs_join");

    let base_bytes: &[u8] = base.as_ref();
    let suffix_timer = Instant::now();
    let suffix = initramfs::build_suffix(base_bytes.len(), &self.suffix_params())?;
    let uncompressed_size = base_bytes.len() + suffix.len();
    tracing::debug!(
        elapsed_us = suffix_timer.elapsed().as_micros(),
        base_bytes = base_bytes.len(),
        suffix_bytes = suffix.len(),
        "build_suffix",
    );

    // Compress first: the memory size depends on the compressed footprint.
    let compress_timer = Instant::now();
    let compressed_base = self.get_or_compress_base(base_bytes, &key)?;
    let compressed_suffix = initramfs::lz4_legacy_compress(&suffix);
    let compressed_size = compressed_base.len() + compressed_suffix.len();
    tracing::debug!(
        elapsed_us = compress_timer.elapsed().as_micros(),
        uncompressed = uncompressed_size,
        compressed = compressed_size,
        ratio = format!("{:.1}x", uncompressed_size as f64 / compressed_size as f64),
        "deferred_lz4_compress",
    );

    // An unreadable init size is treated as 0.
    let kernel_init_size = read_kernel_init_size(&self.kernel).unwrap_or(0) as u64;
    let required_mib = initramfs_min_memory_mib(&MemoryBudget {
        uncompressed_initramfs_bytes: uncompressed_size as u64,
        compressed_initrd_bytes: compressed_size as u64,
        kernel_init_size,
    });
    let memory_mib = std::cmp::max(required_mib, self.memory_min_mib);
    tracing::debug!(
        uncompressed_mib = uncompressed_size >> 20,
        compressed_mib = compressed_size >> 20,
        init_size_mib = kernel_init_size >> 20,
        memory_min_mib = self.memory_min_mib,
        memory_mib,
        "deferred_memory_computed",
    );

    vm.allocate_and_register_memory(memory_mib)
        .with_context(|| format!("allocate deferred memory ({memory_mib}MiB)"))?;
    let loaded_size = self.compress_and_load_initrd(vm, base_bytes, &suffix, &key, load_addr)?;
    Ok((Some(load_addr), Some(loaded_size), memory_mib))
}
/// Returns the VM memory size in MiB.
///
/// Uses `self.memory_mib` when it is set; otherwise sums the lengths of all
/// regions in `guest_mem` and converts bytes to MiB by shifting right by 20.
pub(super) fn effective_memory_mib(&self, guest_mem: &GuestMemoryMmap) -> u32 {
    use vm_memory::GuestMemoryRegion;
    if let Some(configured) = self.memory_mib {
        return configured;
    }
    let total_bytes: u64 = guest_mem.iter().fold(0, |acc, region| acc + region.len());
    (total_bytes >> 20) as u32
}
/// Returns the LZ4-compressed form of `base_bytes`, delegating to
/// `get_or_compress_base_shm` with the key's first field as the identifier.
///
/// This wrapper itself never returns `Err`.
fn get_or_compress_base(&self, base_bytes: &[u8], key: &BaseKey) -> Result<Vec<u8>> {
    let cache_id = key.0;
    let compressed = get_or_compress_base_shm(cache_id, base_bytes);
    Ok(compressed)
}
/// Tries to set up a copy-on-write overlay of the cached compressed base at
/// `load_addr` in guest memory.
///
/// Opens the segment for `key.0` with `initramfs::shm_open_lz4`, validates it
/// and the target range, then hands the host address, length and fd to
/// `initramfs::cow_overlay` and returns its result.
///
/// Returns `None` (after closing the fd) when any check fails: no segment,
/// length differs from `expected_len`, the first four bytes cannot be read or
/// are not the LZ4 legacy magic, the length is zero or the range overflows,
/// `load_addr` is not host-page aligned, the page-rounded range is not
/// available as a single guest-memory slice, or the host address lookup
/// fails. Callers treat `None` as "fall back to a plain copy".
fn try_cow_overlay(
&self,
guest_mem: &GuestMemoryMmap,
key: &BaseKey,
expected_len: usize,
load_addr: u64,
) -> Option<initramfs::CowOverlayGuard> {
// No segment for this key: nothing to close, just bail out.
let (fd, len) = initramfs::shm_open_lz4(key.0)?;
// The segment must be exactly as long as the compressed base in use.
if len != expected_len {
initramfs::shm_close_fd(fd);
return None;
}
use std::os::fd::AsRawFd;
let mut magic = [0u8; 4];
// SAFETY: `magic` is a live 4-byte stack buffer and exactly 4 bytes are
// requested; `fd` came from `shm_open_lz4` and has not been closed yet.
let n = unsafe {
libc::pread(
fd.as_raw_fd(),
magic.as_mut_ptr() as *mut libc::c_void,
4,
0,
)
};
// Short read or error: the segment cannot be trusted.
if n != 4 {
initramfs::shm_close_fd(fd);
return None;
}
if magic != initramfs::LZ4_LEGACY_MAGIC {
tracing::warn!(
magic = format!(
"{:02x}{:02x}{:02x}{:02x}",
magic[0], magic[1], magic[2], magic[3]
),
"stale compressed shm segment in COW path, skipping"
);
initramfs::shm_close_fd(fd);
return None;
}
if len == 0 || load_addr.checked_add(len as u64).is_none() {
tracing::debug!(
load_addr = format!("{:#x}", load_addr),
len,
"cow_overlay: invalid range (zero-length or overflow), falling back"
);
initramfs::shm_close_fd(fd);
return None;
}
// aarch64 queries the host page size; x86_64 uses a fixed 0x1000.
#[cfg(target_arch = "aarch64")]
let host_page = host_page_size();
#[cfg(target_arch = "x86_64")]
let host_page: u64 = 0x1000;
if load_addr & (host_page - 1) != 0 {
tracing::debug!(
load_addr = format!("{:#x}", load_addr),
host_page,
"cow_overlay: load_addr not host-page-aligned, falling back"
);
initramfs::shm_close_fd(fd);
return None;
}
// Round the length up to a whole number of host pages for the range check.
let rounded_len = (len as u64)
.checked_add(host_page - 1)
.map(|v| v & !(host_page - 1));
let Some(rounded_len) = rounded_len else {
tracing::debug!(
load_addr = format!("{:#x}", load_addr),
len,
"cow_overlay: rounded length overflows u64, falling back"
);
initramfs::shm_close_fd(fd);
return None;
};
let rounded_usize = match usize::try_from(rounded_len) {
Ok(v) => v,
Err(_) => {
tracing::debug!(
load_addr = format!("{:#x}", load_addr),
rounded_len,
"cow_overlay: rounded length exceeds usize, falling back"
);
initramfs::shm_close_fd(fd);
return None;
}
};
// The slice is only requested as a bounds check; its value is discarded.
if guest_mem
.get_slice(GuestAddress(load_addr), rounded_usize)
.is_err()
{
tracing::debug!(
load_addr = format!("{:#x}", load_addr),
len,
rounded_len,
"cow_overlay: range exceeds guest memory region, falling back"
);
initramfs::shm_close_fd(fd);
return None;
}
let Ok(host_addr) = guest_mem.get_host_address(GuestAddress(load_addr)) else {
initramfs::shm_close_fd(fd);
return None;
};
// SAFETY (NOTE(review)): the checks above establish that `load_addr` is
// host-page aligned and that the page-rounded range was obtainable as a
// guest-memory slice. The full contract of `cow_overlay` is not visible
// here — confirm it against its definition. Note that the unrounded `len`
// is passed, and that `fd` is moved into the call rather than closed here.
unsafe { initramfs::cow_overlay(host_addr, len, fd) }
}
/// Finishes guest memory setup on x86_64: loads the initrd (allocating
/// deferred memory and loading the kernel first when needed), assembles the
/// kernel command line, and writes boot params, the MP table and ACPI tables.
///
/// `kernel_result` is `Some` when the kernel was already loaded (memory size
/// known up front) and `None` on the deferred path. On the deferred path
/// without an initramfs handle, 256 MiB is allocated.
///
/// Returns the kernel load result.
///
/// # Errors
///
/// Fails when initrd loading, memory allocation, kernel loading, or any of
/// the guest-memory writes fails.
///
/// # Panics
///
/// Panics if `vm.numa_layout` is unset when it is needed.
#[cfg(target_arch = "x86_64")]
pub(super) fn setup_memory(
    &self,
    vm: &mut kvm::KtstrKvm,
    kernel_result: Option<boot::KernelLoadResult>,
    initramfs_handle: Option<JoinHandle<Result<(BaseRef, BaseKey)>>>,
) -> Result<boot::KernelLoadResult> {
    // Formats one `virtio_mmio.device=<size>@<base>:<irq>` kernel parameter.
    fn mmio_device_token(
        size: impl std::fmt::LowerHex,
        base: impl std::fmt::LowerHex,
        irq: impl std::fmt::Display,
    ) -> String {
        format!(" virtio_mmio.device={:#x}@{:#x}:{}", size, base, irq,)
    }

    let (kernel_result, initrd_addr, initrd_size) = match kernel_result {
        // Memory and kernel are already in place; only the initrd remains.
        Some(loaded) => {
            let (addr, size) = if let Some(handle) = initramfs_handle {
                self.join_and_load_initramfs(vm, handle, INITRD_ADDR)?
            } else {
                (None, None)
            };
            (loaded, addr, size)
        }
        // Deferred path: allocate memory first, then load the kernel.
        None => {
            let (addr, size) = if let Some(handle) = initramfs_handle {
                let (addr, size, _memory_mib) =
                    self.join_compute_memory_and_load(vm, handle, INITRD_ADDR)?;
                (addr, size)
            } else {
                vm.allocate_and_register_memory(256u32)
                    .context("allocate deferred memory (no initramfs)")?;
                (None, None)
            };
            if self.performance_mode && !self.mbind_node_map.is_empty() {
                let layout = vm.numa_layout.as_ref().expect(
                    "numa_layout is Some after the deferred allocate_and_register_memory \
                     call above: that call sets numa_layout to Some(...) in \
                     src/vmm/{x86_64,aarch64}/kvm.rs before this branch can reach here",
                );
                layout.mbind_regions(&vm.guest_mem, &self.mbind_node_map);
            }
            let load_timer = Instant::now();
            let loaded =
                boot::load_kernel(&vm.guest_mem, &self.kernel).context("load kernel")?;
            tracing::debug!(elapsed_us = load_timer.elapsed().as_micros(), "load_kernel");
            (loaded, addr, size)
        }
    };

    let memory_mib = self.effective_memory_mib(&vm.guest_mem);
    let mut cmdline = base_guest_cmdline(
        "no_timer_check clocksource=kvm-clock i8042.noaux i8042.nomux \
         i8042.nopnp i8042.dumbkbd pci=off reboot=k",
    );

    // Verbose when the ktstr verbose variable is "1", or RUST_BACKTRACE is
    // "1" or "full".
    let verbose = matches!(std::env::var(crate::KTSTR_VERBOSE_ENV).as_deref(), Ok("1"))
        || matches!(
            std::env::var("RUST_BACKTRACE").as_deref(),
            Ok("1" | "full")
        );
    cmdline.push_str(if verbose {
        " earlyprintk=serial loglevel=7"
    } else {
        " loglevel=0"
    });

    if self.init_binary.is_some() {
        cmdline.push_str(" rdinit=/init initramfs_options=size=90%");
    }

    // The console device is always announced; block and net only when configured.
    cmdline.push_str(&mmio_device_token(
        virtio_console::VIRTIO_MMIO_SIZE,
        kvm::VIRTIO_CONSOLE_MMIO_BASE,
        kvm::VIRTIO_CONSOLE_IRQ,
    ));
    if let Some(disk) = self.disks.first() {
        cmdline.push_str(&mmio_device_token(
            virtio_blk::VIRTIO_MMIO_SIZE,
            kvm::VIRTIO_BLK_MMIO_BASE,
            kvm::VIRTIO_BLK_IRQ,
        ));
        cmdline.push_str(&disk_auto_mount_cmdline_tokens(disk));
    }
    if self.network.is_some() {
        cmdline.push_str(&mmio_device_token(
            virtio_net::VIRTIO_MMIO_SIZE,
            kvm::VIRTIO_NET_MMIO_BASE,
            kvm::VIRTIO_NET_IRQ,
        ));
    }

    cmdline.push_str(if self.topology.has_memory_only_nodes() {
        " numa_balancing=enable"
    } else {
        " numa_balancing=0"
    });

    #[cfg(feature = "wprof")]
    if let Some(wprof) = self.wprof.as_ref() {
        cmdline.push_str(" KTSTR_WPROF_ARGS=");
        cmdline.push_str(&wprof.args_cmdline());
    }

    // Caller-supplied extras go last.
    if !self.cmdline_extra.is_empty() {
        cmdline.push(' ');
        cmdline.push_str(&self.cmdline_extra);
    }

    let boot_timer = Instant::now();
    boot::write_cmdline(&vm.guest_mem, &cmdline)?;
    boot::write_boot_params(
        &vm.guest_mem,
        &cmdline,
        memory_mib,
        initrd_addr,
        initrd_size,
        kernel_result.setup_header.as_ref(),
    )?;
    tracing::debug!(
        elapsed_us = boot_timer.elapsed().as_micros(),
        "cmdline_boot_params"
    );

    let tables_timer = Instant::now();
    mptable::setup_mptable(&vm.guest_mem, &self.topology)?;
    let numa_layout = vm.numa_layout.as_ref().expect(
        "numa_layout is Some by the time setup_acpi runs: \
         memory allocation (whether deferred or not) ran earlier \
         in this function and set numa_layout via \
         allocate_and_register_memory in src/vmm/x86_64/kvm.rs",
    );
    let _acpi_layout = acpi::setup_acpi(&vm.guest_mem, &self.topology, numa_layout)?;
    tracing::debug!(elapsed_us = tables_timer.elapsed().as_micros(), "mptable_acpi");
    Ok(kernel_result)
}
/// Initializes vCPU state on x86_64.
///
/// vCPU 0 gets segment registers, general registers (entry at
/// `kernel_entry`), FPU, MSRs and LAPIC, and is marked runnable. Every other
/// vCPU gets FPU and LAPIC setup and is marked uninitialized.
///
/// # Errors
///
/// Fails when any register, LAPIC or mp_state call fails.
///
/// # Panics
///
/// Panics if `vm.vcpus` is empty.
#[cfg(target_arch = "x86_64")]
pub(super) fn setup_vcpus(&self, vm: &kvm::KtstrKvm, kernel_entry: u64) -> Result<()> {
    let bsp_timer = Instant::now();
    let bsp = &vm.vcpus[0];
    boot::setup_sregs(&vm.guest_mem, bsp, vm.split_irqchip)?;
    boot::setup_regs(bsp, kernel_entry)?;
    boot::setup_fpu(bsp)?;
    boot::setup_msrs(bsp, None)?;
    boot::setup_lapic(bsp, true)?;
    let runnable = kvm_bindings::kvm_mp_state {
        mp_state: kvm_bindings::KVM_MP_STATE_RUNNABLE,
    };
    bsp.set_mp_state(runnable).context("set BSP mp_state")?;
    tracing::debug!(elapsed_us = bsp_timer.elapsed().as_micros(), "bsp_setup");

    let ap_timer = Instant::now();
    for ap in vm.vcpus.iter().skip(1) {
        boot::setup_fpu(ap)?;
        boot::setup_lapic(ap, false)?;
        let uninitialized = kvm_bindings::kvm_mp_state {
            mp_state: kvm_bindings::KVM_MP_STATE_UNINITIALIZED,
        };
        ap.set_mp_state(uninitialized).context("set AP mp_state")?;
    }
    tracing::debug!(
        elapsed_us = ap_timer.elapsed().as_micros(),
        ap_count = vm.vcpus.len().saturating_sub(1),
        "ap_setup"
    );
    Ok(())
}
}
#[cfg(target_arch = "aarch64")]
impl KtstrVm {
/// Finishes guest memory setup on aarch64: loads the initrd (allocating
/// deferred memory and loading the kernel first when needed), then delegates
/// command-line and FDT construction to `finish_aarch64_setup`.
///
/// `kernel_result` is `Some` when the kernel was already loaded (memory size
/// known up front) and `None` on the deferred path. On the deferred path
/// without an initramfs handle, 256 MiB is allocated.
///
/// On the deferred path, NUMA binding is applied after allocation and before
/// the kernel is loaded when performance mode is on and a node map is
/// configured. This matches the x86_64 deferred path and the non-deferred
/// path in `create_vm_and_load_kernel`.
///
/// # Errors
///
/// Fails when initrd loading, memory allocation, kernel loading or the final
/// setup step fails, or when an internal invariant (`memory_mib` set on the
/// non-deferred path, `numa_layout` set after allocation) does not hold.
pub(super) fn setup_memory_aarch64(
    &self,
    vm: &mut kvm::KtstrKvm,
    kernel_result: Option<boot::KernelLoadResult>,
    initramfs_handle: Option<JoinHandle<Result<(BaseRef, BaseKey)>>>,
) -> Result<boot::KernelLoadResult> {
    let (kernel_result, initrd_addr, initrd_size) = if let Some(kr) = kernel_result {
        let (initrd_addr, initrd_size) = match initramfs_handle {
            Some(handle) => {
                let memory_mib = self.memory_mib.context(
                    "internal: non-deferred aarch64 path requires memory_mib to be set",
                )?;
                self.join_and_load_initramfs_aarch64(vm, handle, memory_mib)?
            }
            None => (None, None),
        };
        (kr, initrd_addr, initrd_size)
    } else {
        let (initrd_addr, initrd_size) = match initramfs_handle {
            Some(handle) => self.join_compute_memory_and_load_aarch64(vm, handle)?,
            None => {
                let memory_mib = 256u32;
                vm.allocate_and_register_memory(memory_mib)
                    .context("allocate deferred memory (no initramfs, aarch64)")?;
                (None, None)
            }
        };
        // Previously skipped on this path only: bind the freshly allocated
        // guest memory to the requested NUMA nodes before the kernel is
        // written into it.
        if self.performance_mode && !self.mbind_node_map.is_empty() {
            let layout = vm.numa_layout.as_ref().context(
                "internal: numa_layout must be set after deferred memory allocation (aarch64)",
            )?;
            layout.mbind_regions(&vm.guest_mem, &self.mbind_node_map);
        }
        let t0 = Instant::now();
        let kr =
            boot::load_kernel(&vm.guest_mem, &self.kernel).context("load kernel (aarch64)")?;
        tracing::debug!(elapsed_us = t0.elapsed().as_micros(), "load_kernel");
        (kr, initrd_addr, initrd_size)
    };
    self.finish_aarch64_setup(vm, kernel_result, initrd_addr, initrd_size)
}
/// Waits for the initramfs-resolve thread, checks that `memory_mib` is large
/// enough, picks the load address with `aarch64_initrd_addr`, and loads the
/// compressed initrd there (aarch64, memory size known up front).
///
/// Returns `(Some(load_addr), Some(compressed_size))`.
///
/// # Errors
///
/// Fails when the resolver thread panicked or returned an error, when suffix
/// construction fails, when `memory_mib` is below the computed minimum, when
/// no valid load address exists, or when loading the initrd fails.
fn join_and_load_initramfs_aarch64(
    &self,
    vm: &mut kvm::KtstrKvm,
    handle: JoinHandle<Result<(BaseRef, BaseKey)>>,
    memory_mib: u32,
) -> Result<(Option<u64>, Option<u32>)> {
    let join_timer = Instant::now();
    let (base, key) = match handle.join() {
        Ok(resolved) => resolved?,
        Err(_) => anyhow::bail!("initramfs-resolve thread panicked"),
    };
    tracing::debug!(elapsed_us = join_timer.elapsed().as_micros(), "initramfs_join");

    let base_bytes: &[u8] = base.as_ref();
    let suffix_timer = Instant::now();
    let suffix = initramfs::build_suffix(base_bytes.len(), &self.suffix_params())?;
    let uncompressed_size = base_bytes.len() + suffix.len();
    tracing::debug!(
        elapsed_us = suffix_timer.elapsed().as_micros(),
        base_bytes = base_bytes.len(),
        suffix_bytes = suffix.len(),
        "build_suffix",
    );

    // Compress once here to size the memory budget and the load address.
    let compressed_base = self.get_or_compress_base(base_bytes, &key)?;
    let compressed_suffix = initramfs::lz4_legacy_compress(&suffix);
    let compressed_size = compressed_base.len() + compressed_suffix.len();
    // An unreadable init size is treated as 0.
    let kernel_init_size = read_kernel_init_size(&self.kernel).unwrap_or(0) as u64;
    let min_mib = initramfs_min_memory_mib(&MemoryBudget {
        uncompressed_initramfs_bytes: uncompressed_size as u64,
        compressed_initrd_bytes: compressed_size as u64,
        kernel_init_size,
    });
    anyhow::ensure!(
        memory_mib >= min_mib,
        "VM memory {}MiB insufficient for initramfs \
         (uncompressed={}MiB, compressed={}MiB, \
         init_size={}MiB): need {}MiB",
        memory_mib,
        uncompressed_size >> 20,
        compressed_size >> 20,
        kernel_init_size >> 20,
        min_mib,
    );

    let total_cpus = self.topology.total_cpus();
    let load_addr = aarch64_initrd_addr(memory_mib, total_cpus, compressed_size as u64)?;
    self.compress_and_load_initrd(vm, base_bytes, &suffix, &key, load_addr)
        .map(|size| (Some(load_addr), Some(size)))
}
/// Waits for the initramfs-resolve thread, derives the VM memory size from
/// the initramfs, allocates that memory, picks the load address with
/// `aarch64_initrd_addr`, and loads the compressed initrd there (aarch64,
/// deferred memory).
///
/// The memory size is the larger of `initramfs_min_memory_mib` for the
/// measured sizes and `self.memory_min_mib`.
///
/// Returns `(Some(load_addr), Some(compressed_size))`.
///
/// # Errors
///
/// Fails when the resolver thread panicked or returned an error, when suffix
/// construction fails, when memory allocation fails, when no valid load
/// address exists, or when loading the initrd fails.
fn join_compute_memory_and_load_aarch64(
    &self,
    vm: &mut kvm::KtstrKvm,
    handle: JoinHandle<Result<(BaseRef, BaseKey)>>,
) -> Result<(Option<u64>, Option<u32>)> {
    let join_timer = Instant::now();
    let (base, key) = match handle.join() {
        Ok(resolved) => resolved?,
        Err(_) => anyhow::bail!("initramfs-resolve thread panicked"),
    };
    tracing::debug!(elapsed_us = join_timer.elapsed().as_micros(), "initramfs_join");

    let base_bytes: &[u8] = base.as_ref();
    let suffix_timer = Instant::now();
    let suffix = initramfs::build_suffix(base_bytes.len(), &self.suffix_params())?;
    let uncompressed_size = base_bytes.len() + suffix.len();
    tracing::debug!(
        elapsed_us = suffix_timer.elapsed().as_micros(),
        base_bytes = base_bytes.len(),
        suffix_bytes = suffix.len(),
        "build_suffix",
    );

    // Compress first: the memory size depends on the compressed footprint.
    let compress_timer = Instant::now();
    let compressed_base = self.get_or_compress_base(base_bytes, &key)?;
    let compressed_suffix = initramfs::lz4_legacy_compress(&suffix);
    let compressed_size = compressed_base.len() + compressed_suffix.len();
    tracing::debug!(
        elapsed_us = compress_timer.elapsed().as_micros(),
        uncompressed = uncompressed_size,
        compressed = compressed_size,
        ratio = format!("{:.1}x", uncompressed_size as f64 / compressed_size as f64),
        "deferred_lz4_compress",
    );

    // An unreadable init size is treated as 0.
    let kernel_init_size = read_kernel_init_size(&self.kernel).unwrap_or(0) as u64;
    let required_mib = initramfs_min_memory_mib(&MemoryBudget {
        uncompressed_initramfs_bytes: uncompressed_size as u64,
        compressed_initrd_bytes: compressed_size as u64,
        kernel_init_size,
    });
    let memory_mib = std::cmp::max(required_mib, self.memory_min_mib);
    tracing::debug!(
        uncompressed_mib = uncompressed_size >> 20,
        compressed_mib = compressed_size >> 20,
        init_size_mib = kernel_init_size >> 20,
        memory_min_mib = self.memory_min_mib,
        memory_mib,
        "deferred_memory_computed",
    );

    vm.allocate_and_register_memory(memory_mib)
        .with_context(|| format!("allocate deferred memory ({memory_mib}MiB, aarch64)"))?;

    let total_cpus = self.topology.total_cpus();
    let load_addr = aarch64_initrd_addr(memory_mib, total_cpus, compressed_size as u64)?;
    self.compress_and_load_initrd(vm, base_bytes, &suffix, &key, load_addr)
        .map(|size| (Some(load_addr), Some(size)))
}
/// Assembles the aarch64 kernel command line, wires the PV stolen-time
/// region, and writes the flattened device tree into guest memory.
///
/// The command line is validated before any address is computed. The PVTIME
/// base must lie at or above `DRAM_START` and strictly below the FDT address.
///
/// Returns `kernel_result` unchanged.
///
/// # Errors
///
/// Fails when the command line does not validate, when guest RAM is too
/// small for the PVTIME region, or when PVTIME setup, MPIDR reads, FDT
/// creation or the FDT write fails.
///
/// # Panics
///
/// Panics if `vm.numa_layout` is unset.
#[cfg(target_arch = "aarch64")]
fn finish_aarch64_setup(
    &self,
    vm: &kvm::KtstrKvm,
    kernel_result: boot::KernelLoadResult,
    initrd_addr: Option<u64>,
    initrd_size: Option<u32>,
) -> Result<boot::KernelLoadResult> {
    let memory_mib = self.effective_memory_mib(&vm.guest_mem);

    let mut cmdline = base_guest_cmdline("kfence.sample_interval=0");
    let earlycon = format!(
        " earlycon=uart,mmio,{:#x}",
        aarch64::kvm::SERIAL_MMIO_BASE
    );
    cmdline.push_str(&earlycon);

    // Verbose when the ktstr verbose variable is "1", or RUST_BACKTRACE is
    // "1" or "full".
    let verbose = matches!(std::env::var(crate::KTSTR_VERBOSE_ENV).as_deref(), Ok("1"))
        || matches!(
            std::env::var("RUST_BACKTRACE").as_deref(),
            Ok("1" | "full")
        );
    cmdline.push_str(if verbose { " loglevel=7" } else { " loglevel=0" });

    if self.init_binary.is_some() {
        cmdline.push_str(" rdinit=/init initramfs_options=size=90%");
    }
    if let Some(disk) = self.disks.first() {
        cmdline.push_str(&disk_auto_mount_cmdline_tokens(disk));
    }
    cmdline.push_str(if self.topology.has_memory_only_nodes() {
        " numa_balancing=enable"
    } else {
        " numa_balancing=0"
    });
    #[cfg(feature = "wprof")]
    if let Some(wprof) = self.wprof.as_ref() {
        cmdline.push_str(" KTSTR_WPROF_ARGS=");
        cmdline.push_str(&wprof.args_cmdline());
    }
    // Caller-supplied extras go last.
    if !self.cmdline_extra.is_empty() {
        cmdline.push(' ');
        cmdline.push_str(&self.cmdline_extra);
    }

    let fdt_timer = Instant::now();
    boot::validate_cmdline(&cmdline)?;

    let fdt_addr = aarch64::fdt::fdt_address(memory_mib);
    let pvtime_base = aarch64::fdt::pvtime_base(memory_mib, self.topology.total_cpus());
    anyhow::ensure!(
        (aarch64::kvm::DRAM_START..fdt_addr).contains(&pvtime_base),
        "guest RAM too small to carve the PVTIME region \
         (pvtime_base={pvtime_base:#x}, fdt_addr={fdt_addr:#x})"
    );
    vm.setup_pvtime(pvtime_base)
        .context("wire KVM PV stolen-time")?;

    let mpidrs =
        aarch64::topology::read_mpidrs(&vm.vcpus).context("read vCPU MPIDRs for FDT")?;
    let hw_cache_level = aarch64::topology::host_cache_levels();
    let guest_l1_unified = aarch64::topology::host_l1_is_unified();
    let numa_layout = vm.numa_layout.as_ref().expect(
        "numa_layout is Some by the time FDT creation runs: \
         memory allocation (whether deferred or not) ran earlier \
         in this function and set numa_layout via \
         allocate_and_register_memory in src/vmm/aarch64/kvm.rs",
    );
    let has_disk = !self.disks.is_empty();
    let has_network = self.network.is_some();
    let dtb = aarch64::fdt::create_fdt(
        &self.topology,
        &mpidrs,
        memory_mib,
        &cmdline,
        initrd_addr,
        initrd_size,
        hw_cache_level,
        guest_l1_unified,
        numa_layout,
        has_disk,
        has_network,
        vm.has_pmu,
    )
    .context("create FDT")?;

    vm.guest_mem
        .write_slice(&dtb, GuestAddress(fdt_addr))
        .context("write FDT to guest memory")?;
    tracing::debug!(
        elapsed_us = fdt_timer.elapsed().as_micros(),
        fdt_addr,
        fdt_len = dtb.len(),
        "cmdline_fdt",
    );
    Ok(kernel_result)
}
/// Initializes the boot vCPU's registers on aarch64.
///
/// vCPU 0 is set up with `kernel_entry` and the FDT address derived from the
/// effective memory size. No other vCPU is touched here.
///
/// # Errors
///
/// Fails when `boot::setup_regs` fails.
///
/// # Panics
///
/// Panics if `vm.vcpus` is empty.
#[cfg(target_arch = "aarch64")]
pub(super) fn setup_vcpus_aarch64(&self, vm: &kvm::KtstrKvm, kernel_entry: u64) -> Result<()> {
    let bsp_timer = Instant::now();
    let fdt_addr = aarch64::fdt::fdt_address(self.effective_memory_mib(&vm.guest_mem));
    let bsp = &vm.vcpus[0];
    boot::setup_regs(bsp, kernel_entry, fdt_addr)?;
    tracing::debug!(elapsed_us = bsp_timer.elapsed().as_micros(), "bsp_setup");
    Ok(())
}
}
#[cfg(test)]
mod tests;