use super::btf_offsets::{
CPU_MAX_IDLE_TYPES, KernelOffsets, SchedDomainOffsets, SchedDomainStatsOffsets,
SchedstatOffsets, ScxEventOffsets,
};
use super::{
CpuSnapshot, Kva, MonitorSample, RqSchedstat, SchedDomainSnapshot, SchedDomainStats,
ScxEventCounters,
};
use std::os::unix::io::AsRawFd;
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::time::{Duration, Instant};
use vmm_sys_util::epoll::{ControlOperation, Epoll, EpollEvent, EventSet};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::timerfd::TimerFd;
/// One contiguous span of guest "physical" memory backed by a host mapping.
///
/// `offset` is the span's start within the flat address space managed by
/// `GuestMem`; `host_ptr` is the host virtual address of its first byte.
#[derive(Debug, Clone, Copy)]
pub(crate) struct MemRegion {
    /// Host virtual address of the first byte of this region.
    pub(crate) host_ptr: *mut u8,
    /// Start of the region inside GuestMem's flat address space, in bytes.
    pub(crate) offset: u64,
    /// Length of the region, in bytes.
    pub(crate) size: u64,
}
/// Inputs needed to translate a kernel virtual address to a guest-physical
/// offset: the page-table root plus per-architecture translation state.
#[derive(Debug, Clone, Copy, Default)]
pub(crate) struct WalkContext {
    /// Guest-physical address of the top-level page table (field name follows
    /// the x86 CR3 convention; also used as the table root on aarch64).
    pub cr3_pa: u64,
    /// Kernel direct-map base used for KVA -> PA arithmetic
    /// (see `symbols::kva_to_pa` call sites).
    pub page_offset: u64,
    /// Whether 5-level paging is active (x86_64 only).
    pub l5: bool,
    /// Raw TCR_EL1 value driving the aarch64 walk (0 when unused).
    pub tcr_el1: u64,
}
/// Pre-computed constants for an aarch64 stage-1 page-table walk, derived
/// once from TCR_EL1 (see `Aarch64WalkParams::from_tcr_el1`).
#[derive(Debug, Clone, Copy)]
#[allow(dead_code)] // only constructed/used on aarch64 builds
pub(crate) struct Aarch64WalkParams {
    /// Raw TCR_EL1 the parameters were derived from (also the TLB cache key).
    pub tcr_el1: u64,
    /// VA bits consumed per table level (9/11/13 for 4K/16K/64K granules).
    pub stride: u64,
    /// Table level at which the walk starts (0..=3).
    pub start_level: u64,
    /// Index mask applied at the starting level.
    pub first_indexmask: u64,
    /// Index mask applied at every level below the starting one.
    pub indexmask_grainsize: u64,
    /// Mask extracting the next-table/output address from a descriptor.
    pub descaddrmask: u64,
}
#[cfg(target_arch = "aarch64")]
impl Aarch64WalkParams {
pub(crate) fn from_tcr_el1(tcr_el1: u64) -> Option<Self> {
if tcr_el1 == 0 {
return None;
}
if tcr_el1 & (1u64 << 59) != 0 {
return None;
}
let t1sz = (tcr_el1 >> 16) & 0x3F;
if t1sz == 0 {
return None;
}
let tg1 = (tcr_el1 >> 30) & 0x3;
let stride: u64 = match tg1 {
0b11 => 13, 0b01 => 11, 0b10 => 9, _ => return None, };
let va_width = 64u64.saturating_sub(t1sz);
if va_width < 4 {
return None;
}
let levels_below = (va_width - 4) / stride;
if levels_below == 0 || levels_below > 4 {
return None;
}
let start_level: u64 = 4 - levels_below;
let indexmask_grainsize: u64 = (!0u64) >> (64 - (stride + 3));
let first_indexmask: u64 = (!0u64) >> (64 - (va_width - stride * (4 - start_level)));
let descaddrmask: u64 = ((!0u64) >> (64 - 50)) & !indexmask_grainsize;
Some(Self {
tcr_el1,
stride,
start_level,
first_indexmask,
indexmask_grainsize,
descaddrmask,
})
}
}
/// Read/write view of guest memory as one flat guest-physical address space,
/// assembled from one or more host-mapped regions.
pub struct GuestMem {
    /// Total size of the flat address space (highest region end), in bytes.
    size: u64,
    /// Backing regions, expected sorted by `offset` (binary-searched in
    /// `resolve_ptr`).
    regions: Vec<MemRegion>,
    /// One-entry software TLB caching the last successful page translation.
    page_tlb: std::sync::Mutex<Option<TlbEntry>>,
}
/// Cached result of one page-table walk: the lookup key (root, paging mode,
/// KVA page) plus the resolved physical page.
#[derive(Debug, Clone, Copy)]
struct TlbEntry {
    /// Page-table root used for the cached walk.
    cr3_pa: u64,
    /// x86 5-level paging flag at walk time.
    l5: bool,
    /// aarch64 TCR_EL1 at walk time (0 on x86 paths).
    tcr_el1: u64,
    /// Page-aligned kernel virtual address (the key).
    kva_page: u64,
    /// Page-aligned physical address (the value).
    pa_page: u64,
}
// SAFETY: GuestMem only stores raw pointers into host mappings; all access
// goes through bounds-checked volatile helpers and the TLB is Mutex-guarded.
// NOTE(review): the lifetime of the backing mappings is not visible here —
// callers must keep the guest memory mapped for as long as GuestMem is shared
// across threads.
unsafe impl Send for GuestMem {}
unsafe impl Sync for GuestMem {}
impl GuestMem {
    /// Builds a single-region view covering `size` bytes at `base`.
    ///
    /// # Safety
    /// `base` must be valid for reads and writes of `size` bytes and must
    /// outlive the returned `GuestMem` (all accesses are volatile raw-pointer
    /// operations through it).
    pub unsafe fn new(base: *mut u8, size: u64) -> Self {
        Self {
            size,
            regions: vec![MemRegion {
                host_ptr: base,
                offset: 0,
                size,
            }],
            page_tlb: std::sync::Mutex::new(None),
        }
    }

    /// Test-only constructor from explicit regions.
    ///
    /// Total size is the highest region end; gaps between regions behave as
    /// unmapped holes (reads yield zeros, writes are dropped).
    ///
    /// # Safety
    /// Each region's `host_ptr` must be valid for `size` bytes for the
    /// lifetime of the result; regions should be sorted by `offset`
    /// (see `resolve_ptr`).
    #[cfg(test)]
    pub(crate) unsafe fn from_regions_for_test(regions: Vec<MemRegion>) -> Self {
        assert!(!regions.is_empty(), "at least one region required");
        let size = regions
            .iter()
            .map(|r| r.offset + r.size)
            .max()
            .expect("non-empty");
        Self {
            size,
            regions,
            page_tlb: std::sync::Mutex::new(None),
        }
    }

    /// Builds the view from the VM's NUMA memory layout, resolving each
    /// region's host pointer through the vm-memory mapping.
    pub(crate) fn from_layout(
        layout: &crate::vmm::numa_mem::NumaMemoryLayout,
        guest_mem: &vm_memory::GuestMemoryMmap,
    ) -> Self {
        use vm_memory::GuestMemory;
        let dram_base = layout.dram_base();
        let total_size = layout.total_bytes();
        let mut regions = Vec::with_capacity(layout.regions().len());
        for nr in layout.regions() {
            let host_ptr = guest_mem
                .get_host_address(vm_memory::GuestAddress(nr.gpa_start))
                .unwrap();
            regions.push(MemRegion {
                host_ptr,
                // Internal offsets are relative to the DRAM base, so the
                // first region starts at 0.
                offset: nr.gpa_start - dram_base,
                size: nr.size,
            });
        }
        Self {
            size: total_size,
            regions,
            page_tlb: std::sync::Mutex::new(None),
        }
    }

    /// Host pointer for `field_size` bytes at `pa`, or `None` when the span
    /// is out of bounds or would straddle a region boundary.
    pub fn host_ptr_for_pa(&self, pa: u64, field_size: u64) -> Option<*mut u8> {
        let end = pa.checked_add(field_size)?;
        if end > self.size {
            return None;
        }
        let (ptr, region_avail) = self.resolve_ptr(pa)?;
        // The whole span must fit inside one contiguous host mapping.
        if field_size > region_avail {
            return None;
        }
        Some(ptr)
    }

    /// Contiguous bytes available in the region containing `pa`
    /// (0 when `pa` is unmapped).
    pub(crate) fn region_avail(&self, pa: u64) -> u64 {
        match self.resolve_ptr(pa) {
            Some((_, avail)) => avail,
            None => 0,
        }
    }

    /// Maps a flat offset to `(host pointer, bytes left in that region)`.
    ///
    /// Binary search over `regions`; this requires them sorted by `offset`.
    /// NOTE(review): sortedness is never asserted — confirm that every
    /// construction site pushes regions in ascending `offset` order.
    fn resolve_ptr(&self, offset: u64) -> Option<(*mut u8, u64)> {
        // Index of the last region starting at or before `offset`
        // (None when `offset` precedes all regions).
        let idx = self
            .regions
            .partition_point(|r| r.offset <= offset)
            .checked_sub(1)?;
        let r = &self.regions[idx];
        let local = offset - r.offset;
        if local < r.size {
            // SAFETY: `local < r.size`, so the pointer stays inside the
            // region's mapping (valid per the constructor contracts).
            let ptr = unsafe { r.host_ptr.add(local as usize) };
            Some((ptr, r.size - local))
        } else {
            // `offset` lands in a hole between this region and the next.
            None
        }
    }

    /// Volatile read of `N` bytes (guest vCPUs may write concurrently).
    /// Uses one aligned 2/4/8-byte load when alignment allows, so the value
    /// is read in a single access at that width; otherwise falls back to a
    /// per-byte volatile copy.
    ///
    /// # Safety
    /// `ptr` must be valid for reading `N` bytes.
    #[inline]
    unsafe fn read_volatile_bytes<const N: usize>(ptr: *const u8) -> [u8; N] {
        match N {
            1 => {
                let v = unsafe { std::ptr::read_volatile(ptr) };
                let mut bytes = [0u8; N];
                bytes[0] = v;
                return bytes;
            }
            2 => {
                if ptr.align_offset(std::mem::align_of::<u16>()) == 0 {
                    let v: u16 = unsafe { std::ptr::read_volatile(ptr as *const u16) };
                    let src = v.to_ne_bytes();
                    let mut bytes = [0u8; N];
                    bytes[..2].copy_from_slice(&src);
                    return bytes;
                }
            }
            4 => {
                if ptr.align_offset(std::mem::align_of::<u32>()) == 0 {
                    let v: u32 = unsafe { std::ptr::read_volatile(ptr as *const u32) };
                    let src = v.to_ne_bytes();
                    let mut bytes = [0u8; N];
                    bytes[..4].copy_from_slice(&src);
                    return bytes;
                }
            }
            8 => {
                if ptr.align_offset(std::mem::align_of::<u64>()) == 0 {
                    let v: u64 = unsafe { std::ptr::read_volatile(ptr as *const u64) };
                    let src = v.to_ne_bytes();
                    let mut bytes = [0u8; N];
                    bytes[..8].copy_from_slice(&src);
                    return bytes;
                }
            }
            _ => {}
        }
        // Misaligned pointer or unusual width: copy one byte at a time.
        let mut bytes = [0u8; N];
        for (i, slot) in bytes.iter_mut().enumerate() {
            *slot = unsafe { std::ptr::read_volatile(ptr.add(i)) };
        }
        bytes
    }

    /// Volatile write of `N` bytes; mirror image of `read_volatile_bytes`.
    ///
    /// # Safety
    /// `ptr` must be valid for writing `N` bytes.
    #[inline]
    unsafe fn write_volatile_bytes<const N: usize>(ptr: *mut u8, bytes: [u8; N]) {
        match N {
            1 => {
                unsafe { std::ptr::write_volatile(ptr, bytes[0]) };
                return;
            }
            2 => {
                if ptr.align_offset(std::mem::align_of::<u16>()) == 0 {
                    // (The name `le` is historical; bytes are native-endian.)
                    let mut le = [0u8; 2];
                    le.copy_from_slice(&bytes[..2]);
                    let v = u16::from_ne_bytes(le);
                    unsafe { std::ptr::write_volatile(ptr as *mut u16, v) };
                    return;
                }
            }
            4 => {
                if ptr.align_offset(std::mem::align_of::<u32>()) == 0 {
                    let mut le = [0u8; 4];
                    le.copy_from_slice(&bytes[..4]);
                    let v = u32::from_ne_bytes(le);
                    unsafe { std::ptr::write_volatile(ptr as *mut u32, v) };
                    return;
                }
            }
            8 => {
                if ptr.align_offset(std::mem::align_of::<u64>()) == 0 {
                    let mut le = [0u8; 8];
                    le.copy_from_slice(&bytes[..8]);
                    let v = u64::from_ne_bytes(le);
                    unsafe { std::ptr::write_volatile(ptr as *mut u64, v) };
                    return;
                }
            }
            _ => {}
        }
        // Misaligned pointer or unusual width: write one byte at a time.
        for (i, &byte) in bytes.iter().enumerate() {
            unsafe { std::ptr::write_volatile(ptr.add(i), byte) };
        }
    }

    /// Bounds-checked read of an `N`-byte scalar at `pa + offset`.
    ///
    /// Address overflow, out-of-range spans, unmapped holes, and spans that
    /// cross a region boundary all yield zeroed bytes instead of failing —
    /// callers treat 0 as "unavailable".
    fn read_scalar<const N: usize>(&self, pa: u64, offset: usize) -> [u8; N] {
        let Some(addr) = pa.checked_add(offset as u64) else {
            return [0; N];
        };
        let Some(end) = addr.checked_add(N as u64) else {
            return [0; N];
        };
        if end > self.size {
            return [0; N];
        }
        match self.resolve_ptr(addr) {
            Some((ptr, region_avail)) => {
                if (N as u64) > region_avail {
                    return [0; N];
                }
                // SAFETY: the span is fully inside one resolved region.
                unsafe { Self::read_volatile_bytes::<N>(ptr as *const u8) }
            }
            None => [0; N],
        }
    }

    /// Bounds-checked write of an `N`-byte scalar at `pa + offset`.
    /// Invalid targets (overflow, out of range, unmapped, boundary-crossing)
    /// silently drop the write.
    fn write_scalar<const N: usize>(&self, pa: u64, offset: usize, bytes: [u8; N]) {
        let Some(addr) = pa.checked_add(offset as u64) else {
            return;
        };
        let Some(end) = addr.checked_add(N as u64) else {
            return;
        };
        if end > self.size {
            return;
        }
        if let Some((ptr, region_avail)) = self.resolve_ptr(addr) {
            if (N as u64) > region_avail {
                return;
            }
            // SAFETY: the span is fully inside one resolved region.
            unsafe { Self::write_volatile_bytes::<N>(ptr, bytes) };
        }
    }

    /// Reads a byte at `pa + offset` (0 when unavailable).
    pub fn read_u8(&self, pa: u64, offset: usize) -> u8 {
        u8::from_ne_bytes(self.read_scalar::<1>(pa, offset))
    }
    /// Reads a native-endian u32 at `pa + offset` (0 when unavailable).
    pub fn read_u32(&self, pa: u64, offset: usize) -> u32 {
        u32::from_ne_bytes(self.read_scalar::<4>(pa, offset))
    }
    /// Reads a native-endian u64 at `pa + offset` (0 when unavailable).
    pub fn read_u64(&self, pa: u64, offset: usize) -> u64 {
        u64::from_ne_bytes(self.read_scalar::<8>(pa, offset))
    }
    /// Reads a native-endian i64 at `pa + offset` (0 when unavailable).
    pub fn read_i64(&self, pa: u64, offset: usize) -> i64 {
        self.read_u64(pa, offset) as i64
    }
    /// Writes a byte at `pa + offset` (dropped when the target is invalid).
    #[allow(dead_code)]
    pub fn write_u8(&self, pa: u64, offset: usize, val: u8) {
        self.write_scalar::<1>(pa, offset, val.to_ne_bytes());
    }
    /// Writes a native-endian u64 at `pa + offset` (dropped when invalid).
    pub fn write_u64(&self, pa: u64, offset: usize, val: u64) {
        self.write_scalar::<8>(pa, offset, val.to_ne_bytes());
    }

    /// Copies `data` into guest memory at `pa`, returning bytes written.
    ///
    /// The copy is truncated at the end of guest memory and at the end of
    /// the containing region. NOTE(review): a span continuing into the next
    /// region is NOT resumed there even if the address space is contiguous —
    /// callers needing multi-region writes must loop on the return value.
    pub fn write_bytes(&self, pa: u64, data: &[u8]) -> usize {
        let len = data.len() as u64;
        if pa >= self.size {
            return 0;
        }
        let avail = (self.size - pa).min(len) as usize;
        match self.resolve_ptr(pa) {
            Some((ptr, region_avail)) => {
                let copy_len = avail.min(region_avail as usize);
                // SAFETY: `copy_len` bytes fit inside the resolved region
                // and inside `data`.
                unsafe {
                    std::ptr::copy_nonoverlapping(data.as_ptr(), ptr, copy_len);
                }
                copy_len
            }
            None => 0,
        }
    }

    /// `write_bytes` at `pa + offset`, returning 0 on address overflow.
    pub fn write_bytes_at(&self, pa: u64, offset: usize, data: &[u8]) -> usize {
        let Some(addr) = pa.checked_add(offset as u64) else {
            return 0;
        };
        self.write_bytes(addr, data)
    }

    /// Copies guest memory at `pa` into `buf`, returning bytes read.
    /// Same truncation semantics as `write_bytes` (stops at the region end).
    pub fn read_bytes(&self, pa: u64, buf: &mut [u8]) -> usize {
        let len = buf.len() as u64;
        if pa >= self.size {
            return 0;
        }
        let avail = (self.size - pa).min(len) as usize;
        match self.resolve_ptr(pa) {
            Some((ptr, region_avail)) => {
                let copy_len = avail.min(region_avail as usize);
                // SAFETY: `copy_len` bytes fit inside the resolved region
                // and inside `buf`.
                unsafe {
                    std::ptr::copy_nonoverlapping(ptr, buf.as_mut_ptr(), copy_len);
                }
                copy_len
            }
            None => 0,
        }
    }

    /// Writes a native-endian u32 at `pa + offset` (dropped when invalid).
    #[allow(dead_code)]
    pub fn write_u32(&self, pa: u64, offset: usize, val: u32) {
        self.write_scalar::<4>(pa, offset, val.to_ne_bytes());
    }

    /// Translates a kernel virtual address to a flat guest-PA offset by
    /// walking the guest page tables, with a one-entry TLB keyed on
    /// (root, paging mode, KVA page).
    ///
    /// `l5` selects 5-level paging on x86_64; `tcr_el1` drives the aarch64
    /// walk. The parameter not used on the compiled architecture is ignored.
    pub(crate) fn translate_kva(
        &self,
        cr3_pa: u64,
        kva: Kva,
        l5: bool,
        tcr_el1: u64,
    ) -> Option<u64> {
        const PAGE: u64 = 4096;
        let kva_bits = kva.0;
        let kva_page = kva_bits & !(PAGE - 1);
        {
            // Fast path: last translation for the same page and mode.
            let guard = self.page_tlb.lock().unwrap_or_else(|e| e.into_inner());
            if let Some(entry) = guard.as_ref()
                && entry.cr3_pa == cr3_pa
                && entry.l5 == l5
                && entry.tcr_el1 == tcr_el1
                && entry.kva_page == kva_page
            {
                return Some(entry.pa_page | (kva_bits & (PAGE - 1)));
            }
        }
        let walk_result;
        #[cfg(target_arch = "x86_64")]
        {
            let _ = tcr_el1;
            walk_result = if l5 {
                self.walk_5level(cr3_pa, kva)
            } else {
                self.walk_4level(cr3_pa, kva)
            };
        }
        #[cfg(target_arch = "aarch64")]
        {
            let _ = l5;
            walk_result = self.walk_aarch64(cr3_pa, kva, tcr_el1);
        }
        if let Some(pa) = walk_result {
            // Cache the successful walk for the next same-page lookup.
            let mut guard = self.page_tlb.lock().unwrap_or_else(|e| e.into_inner());
            *guard = Some(TlbEntry {
                cr3_pa,
                l5,
                tcr_el1,
                kva_page,
                pa_page: pa & !(PAGE - 1),
            });
        }
        walk_result
    }

    /// Like `translate_kva`, but with pre-derived aarch64 walk parameters
    /// (avoids re-deriving them from TCR_EL1 on every call).
    #[allow(dead_code)]
    pub(crate) fn translate_kva_with_aarch64_params(
        &self,
        cr3_pa: u64,
        kva: Kva,
        l5: bool,
        params: &Aarch64WalkParams,
    ) -> Option<u64> {
        const PAGE: u64 = 4096;
        let kva_bits = kva.0;
        let kva_page = kva_bits & !(PAGE - 1);
        {
            // Fast path: the TLB key uses params.tcr_el1 as the mode stamp.
            let guard = self.page_tlb.lock().unwrap_or_else(|e| e.into_inner());
            if let Some(entry) = guard.as_ref()
                && entry.cr3_pa == cr3_pa
                && entry.l5 == l5
                && entry.tcr_el1 == params.tcr_el1
                && entry.kva_page == kva_page
            {
                return Some(entry.pa_page | (kva_bits & (PAGE - 1)));
            }
        }
        let walk_result;
        #[cfg(target_arch = "x86_64")]
        {
            let _ = params;
            walk_result = if l5 {
                self.walk_5level(cr3_pa, kva)
            } else {
                self.walk_4level(cr3_pa, kva)
            };
        }
        #[cfg(target_arch = "aarch64")]
        {
            let _ = l5;
            walk_result = self.walk_aarch64_with_params(cr3_pa, kva, params);
        }
        if let Some(pa) = walk_result {
            let mut guard = self.page_tlb.lock().unwrap_or_else(|e| e.into_inner());
            *guard = Some(TlbEntry {
                cr3_pa,
                l5,
                tcr_el1: params.tcr_el1,
                kva_page,
                pa_page: pa & !(PAGE - 1),
            });
        }
        walk_result
    }

    /// Standard x86-64 4-level page walk (PGD -> PUD -> PMD -> PTE),
    /// honoring 1 GiB (PUD.PS) and 2 MiB (PMD.PS) large pages.
    /// Returns `None` when any level is not present.
    #[cfg(target_arch = "x86_64")]
    fn walk_4level(&self, cr3_pa: u64, kva: Kva) -> Option<u64> {
        const PRESENT: u64 = 1;
        const PS: u64 = 1 << 7;
        // Physical-address bits of a page-table entry (bits 51:12).
        const ADDR_MASK: u64 = 0x000F_FFFF_FFFF_F000;
        let kva_bits = kva.0;
        let pgd_idx = (kva_bits >> 39) & 0x1FF;
        let pud_idx = (kva_bits >> 30) & 0x1FF;
        let pmd_idx = (kva_bits >> 21) & 0x1FF;
        let pte_idx = (kva_bits >> 12) & 0x1FF;
        let page_off = kva_bits & 0xFFF;
        let pgd_pa = (cr3_pa & ADDR_MASK) + pgd_idx * 8;
        let pgde = self.read_u64(pgd_pa, 0);
        if pgde & PRESENT == 0 {
            return None;
        }
        let pud_pa = (pgde & ADDR_MASK) + pud_idx * 8;
        let pude = self.read_u64(pud_pa, 0);
        if pude & PRESENT == 0 {
            return None;
        }
        if pude & PS != 0 {
            // 1 GiB page: bits 51:30 are the base, low 30 bits the offset.
            let base = pude & 0x000F_FFFF_C000_0000;
            return Some(base | (kva_bits & 0x3FFF_FFFF));
        }
        let pmd_pa = (pude & ADDR_MASK) + pmd_idx * 8;
        let pmde = self.read_u64(pmd_pa, 0);
        if pmde & PRESENT == 0 {
            return None;
        }
        if pmde & PS != 0 {
            // 2 MiB page: bits 51:21 are the base, low 21 bits the offset.
            let base = pmde & 0x000F_FFFF_FFE0_0000;
            return Some(base | (kva_bits & 0x1F_FFFF));
        }
        let pte_pa = (pmde & ADDR_MASK) + pte_idx * 8;
        let ptee = self.read_u64(pte_pa, 0);
        if ptee & PRESENT == 0 {
            return None;
        }
        Some((ptee & ADDR_MASK) | page_off)
    }

    /// aarch64 walk entry point: derive parameters from TCR_EL1, then walk.
    #[cfg(target_arch = "aarch64")]
    fn walk_aarch64(&self, ttbr_pa: u64, kva: Kva, tcr_el1: u64) -> Option<u64> {
        let params = Aarch64WalkParams::from_tcr_el1(tcr_el1)?;
        self.walk_aarch64_with_params(ttbr_pa, kva, &params)
    }

    /// aarch64 VMSAv8-64 stage-1 walk with pre-derived parameters.
    ///
    /// Descriptors hold guest-physical addresses; `to_offset` rebases them
    /// into this flat DRAM-offset space by subtracting `DRAM_START`.
    /// NOTE(review): `ttbr_pa` itself is used directly (only masked to
    /// 48 bits) — it is assumed the caller already rebased the root.
    #[cfg(target_arch = "aarch64")]
    fn walk_aarch64_with_params(
        &self,
        ttbr_pa: u64,
        kva: Kva,
        params: &Aarch64WalkParams,
    ) -> Option<u64> {
        use crate::vmm::aarch64::kvm::DRAM_START;
        let stride = params.stride;
        let mut level: u64 = params.start_level;
        let indexmask_grainsize: u64 = params.indexmask_grainsize;
        let mut indexmask: u64 = params.first_indexmask;
        let descaddrmask: u64 = params.descaddrmask;
        // Root table address, limited to 48 bits.
        let mut descaddr: u64 = ttbr_pa & ((!0u64) >> (64 - 48));
        let to_offset = |gpa: u64| -> Option<u64> { gpa.checked_sub(DRAM_START) };
        let kva_bits = kva.0;
        loop {
            // Index the current-level table with the VA bits for this level.
            let table_offset: u64 = (kva_bits >> (stride * (4 - level))) & indexmask;
            descaddr |= table_offset;
            descaddr &= !7u64; // entries are 8-byte aligned
            let descriptor = self.read_u64(descaddr, 0);
            // Bit 0: valid.
            if descriptor & 1 == 0 {
                return None;
            }
            // At level 3 only page descriptors (bit 1 set) are valid.
            if level == 3 && (descriptor & 2) == 0 {
                return None;
            }
            let next = descriptor & descaddrmask;
            // Bit 1 clear (above level 3) marks a block descriptor.
            let is_block = (descriptor & 2) == 0;
            if is_block && level < 3 {
                // Block mappings are only legal at certain levels per granule.
                let block_legal = match stride {
                    9 => level == 1 || level == 2,
                    11 => level == 2,
                    13 => level == 2,
                    _ => false,
                };
                if !block_legal {
                    return None;
                }
            }
            if (descriptor & 2) != 0 && level < 3 {
                // Table descriptor: descend one level.
                descaddr = to_offset(next)?;
                level += 1;
                indexmask = indexmask_grainsize;
                continue;
            }
            // Block or page descriptor: translation complete.
            descaddr = next;
            break;
        }
        // Size of the mapping at the terminating level.
        let page_size: u64 = 1u64 << ((stride * (4 - level)) + 3);
        let pa_gpa = (descaddr & !(page_size - 1)) | (kva_bits & (page_size - 1));
        to_offset(pa_gpa)
    }

    /// x86-64 5-level walk: resolve the PML5 level, then reuse the 4-level
    /// walker on the resulting P4D table.
    /// NOTE(review): PS at the PML5 level is reserved on real hardware; the
    /// PS branch below likely never fires — confirm intent.
    #[cfg(target_arch = "x86_64")]
    fn walk_5level(&self, cr3_pa: u64, kva: Kva) -> Option<u64> {
        const PRESENT: u64 = 1;
        const PS: u64 = 1 << 7;
        const ADDR_MASK: u64 = 0x000F_FFFF_FFFF_F000;
        // PML5 index: VA bits 56:48.
        let pml5_idx = (kva.0 >> 48) & 0x1FF;
        let pml5_pa = (cr3_pa & ADDR_MASK) + pml5_idx * 8;
        let pml5e = self.read_u64(pml5_pa, 0);
        if pml5e & PRESENT == 0 {
            return None;
        }
        if pml5e & PS != 0 {
            let base = pml5e & 0x000F_0000_0000_0000;
            return Some(base | (kva.0 & 0x0000_FFFF_FFFF_FFFF));
        }
        let p4d_pa = pml5e & ADDR_MASK;
        self.walk_4level(p4d_pa, kva)
    }

    /// Total size of the flat guest-PA space, in bytes.
    pub fn size(&self) -> u64 {
        self.size
    }
}
/// Captures a per-CPU runqueue snapshot from guest memory.
///
/// Reads the core `struct rq` counters at `rq_pa` using the BTF-derived
/// `offsets`. The optional sections (event counters, schedstat, vCPU timing,
/// perf, sched domains) start out `None`; callers fill them in separately.
pub(crate) fn read_rq_stats(mem: &GuestMem, rq_pa: u64, offsets: &KernelOffsets) -> CpuSnapshot {
    // sched_ext state is embedded in struct rq at `rq_scx`.
    let scx_base = offsets.rq_scx;
    let nr_running = mem.read_u32(rq_pa, offsets.rq_nr_running);
    let scx_nr_running = mem.read_u32(rq_pa, scx_base + offsets.scx_rq_nr_running);
    let local_dsq_depth =
        mem.read_u32(rq_pa, scx_base + offsets.scx_rq_local_dsq + offsets.dsq_nr);
    let rq_clock = mem.read_u64(rq_pa, offsets.rq_clock);
    let scx_flags = mem.read_u32(rq_pa, scx_base + offsets.scx_rq_flags);
    CpuSnapshot {
        nr_running,
        scx_nr_running,
        local_dsq_depth,
        rq_clock,
        scx_flags,
        event_counters: None,
        schedstat: None,
        vcpu_cpu_time_ns: None,
        vcpu_perf: None,
        sched_domains: None,
    }
}
/// Reads the per-CPU sched_ext event counters located at `pcpu_pa`.
///
/// Counters whose offsets are unknown for this kernel (`None` in `ev`)
/// are reported as 0.
pub(crate) fn read_event_stats(
    mem: &GuestMem,
    pcpu_pa: u64,
    ev: &ScxEventOffsets,
) -> ScxEventCounters {
    let base = pcpu_pa + ev.event_stats_off as u64;
    // Counters present on every supported kernel.
    let required = |off: usize| mem.read_i64(base, off);
    // Counters that only exist on some kernels; 0 when absent.
    let optional = |off: Option<usize>| match off {
        Some(o) => mem.read_i64(base, o),
        None => 0,
    };
    ScxEventCounters {
        select_cpu_fallback: required(ev.ev_select_cpu_fallback),
        dispatch_local_dsq_offline: required(ev.ev_dispatch_local_dsq_offline),
        dispatch_keep_last: required(ev.ev_dispatch_keep_last),
        enq_skip_exiting: required(ev.ev_enq_skip_exiting),
        enq_skip_migration_disabled: required(ev.ev_enq_skip_migration_disabled),
        reenq_immed: optional(ev.ev_reenq_immed),
        reenq_local_repeat: optional(ev.ev_reenq_local_repeat),
        refill_slice_dfl: optional(ev.ev_refill_slice_dfl),
        bypass_duration: optional(ev.ev_bypass_duration),
        bypass_dispatch: optional(ev.ev_bypass_dispatch),
        bypass_activate: optional(ev.ev_bypass_activate),
        insert_not_owned: optional(ev.ev_insert_not_owned),
        sub_bypass_dispatch: optional(ev.ev_sub_bypass_dispatch),
    }
}
/// Reads the schedstat counters attached to one runqueue.
///
/// `sched_info` is embedded inside `struct rq`, so its two fields are read
/// relative to `rq_pa + rq_sched_info`; the remaining counters live directly
/// on the runqueue.
pub(crate) fn read_rq_schedstat(mem: &GuestMem, rq_pa: u64, ss: &SchedstatOffsets) -> RqSchedstat {
    let si_pa = rq_pa + ss.rq_sched_info as u64;
    let run_delay = mem.read_u64(si_pa, ss.sched_info_run_delay);
    let pcount = mem.read_u64(si_pa, ss.sched_info_pcount);
    RqSchedstat {
        run_delay,
        pcount,
        yld_count: mem.read_u32(rq_pa, ss.rq_yld_count),
        sched_count: mem.read_u32(rq_pa, ss.rq_sched_count),
        sched_goidle: mem.read_u32(rq_pa, ss.rq_sched_goidle),
        ttwu_count: mem.read_u32(rq_pa, ss.rq_ttwu_count),
        ttwu_local: mem.read_u32(rq_pa, ss.rq_ttwu_local),
    }
}
/// Reads `CPU_MAX_IDLE_TYPES` consecutive u32 counters starting at
/// `base_offset` within the structure at `pa`.
fn read_u32_array(mem: &GuestMem, pa: u64, base_offset: usize) -> [u32; CPU_MAX_IDLE_TYPES] {
    let mut out = [0u32; CPU_MAX_IDLE_TYPES];
    for (i, slot) in out.iter_mut().enumerate() {
        *slot = mem.read_u32(pa, base_offset + i * 4);
    }
    out
}
/// Reads the full schedstat block of one sched_domain at `sd_pa`.
///
/// The `lb_*` counters are per-idle-type arrays; the rest are scalars.
fn read_sd_stats(mem: &GuestMem, sd_pa: u64, so: &SchedDomainStatsOffsets) -> SchedDomainStats {
    let arr = |off: usize| read_u32_array(mem, sd_pa, off);
    let one = |off: usize| mem.read_u32(sd_pa, off);
    SchedDomainStats {
        lb_count: arr(so.sd_lb_count),
        lb_failed: arr(so.sd_lb_failed),
        lb_balanced: arr(so.sd_lb_balanced),
        lb_imbalance_load: arr(so.sd_lb_imbalance_load),
        lb_imbalance_util: arr(so.sd_lb_imbalance_util),
        lb_imbalance_task: arr(so.sd_lb_imbalance_task),
        lb_imbalance_misfit: arr(so.sd_lb_imbalance_misfit),
        lb_gained: arr(so.sd_lb_gained),
        lb_hot_gained: arr(so.sd_lb_hot_gained),
        lb_nobusyg: arr(so.sd_lb_nobusyg),
        lb_nobusyq: arr(so.sd_lb_nobusyq),
        alb_count: one(so.sd_alb_count),
        alb_failed: one(so.sd_alb_failed),
        alb_pushed: one(so.sd_alb_pushed),
        sbe_count: one(so.sd_sbe_count),
        sbe_balanced: one(so.sd_sbe_balanced),
        sbe_pushed: one(so.sd_sbe_pushed),
        sbf_count: one(so.sd_sbf_count),
        sbf_balanced: one(so.sd_sbf_balanced),
        sbf_pushed: one(so.sd_sbf_pushed),
        ttwu_wake_remote: one(so.sd_ttwu_wake_remote),
        ttwu_move_affine: one(so.sd_ttwu_move_affine),
        ttwu_move_balance: one(so.sd_ttwu_move_balance),
    }
}
/// Reads a sched_domain's NUL-terminated name (up to 16 bytes).
///
/// The name pointer is first translated as a kernel-text address; when that
/// lands outside guest RAM, the direct-map translation is tried instead.
/// Returns an empty string when the pointer is NULL or untranslatable.
fn read_sd_name(
    mem: &GuestMem,
    sd_pa: u64,
    name_offset: usize,
    page_offset: u64,
    start_kernel_map: u64,
    phys_base: u64,
) -> String {
    let name_kva = mem.read_u64(sd_pa, name_offset);
    if name_kva == 0 {
        return String::new();
    }
    // Preferred: translate via the kernel text mapping.
    let text_pa = super::symbols::text_kva_to_pa_with_base(name_kva, start_kernel_map, phys_base);
    let name_pa = if text_pa < mem.size() {
        text_pa
    } else {
        // Fallback: translate via the direct map.
        let direct_pa = super::symbols::kva_to_pa(name_kva, page_offset);
        if direct_pa >= mem.size() {
            return String::new();
        }
        direct_pa
    };
    let mut raw = [0u8; 16];
    let got = mem.read_bytes(name_pa, &mut raw);
    let read = &raw[..got];
    // Stop at the NUL terminator; use everything read if none was found.
    let len = read.iter().position(|&b| b == 0).unwrap_or(read.len());
    String::from_utf8_lossy(&read[..len]).into_owned()
}
/// Walks the sched_domain chain of one runqueue and snapshots every level.
///
/// Follows `rq->sd` and then each domain's parent pointer, bounded by
/// `MAX_DEPTH` hops and a visited set (the guest can rewrite these pointers
/// concurrently, so a cycle is possible and truncates the walk with a
/// warning). Returns `None` when the runqueue has no domain attached.
pub(crate) fn read_sched_domain_tree(
    mem: &GuestMem,
    rq_pa: u64,
    sd_offsets: &SchedDomainOffsets,
    page_offset: u64,
    start_kernel_map: u64,
    phys_base: u64,
) -> Option<Vec<SchedDomainSnapshot>> {
    const MAX_DEPTH: usize = 8;
    let mut current_kva = mem.read_u64(rq_pa, sd_offsets.rq_sd);
    if current_kva == 0 {
        return None;
    }
    let mut domains = Vec::new();
    let mut visited = std::collections::HashSet::new();
    let mut hops = 0usize;
    while hops < MAX_DEPTH && current_kva != 0 {
        hops += 1;
        if !visited.insert(current_kva) {
            tracing::warn!(
                sd_kva = format_args!("{current_kva:#x}"),
                "sched_domain cycle detected; truncating tree"
            );
            break;
        }
        let sd_pa = super::symbols::kva_to_pa(current_kva, page_offset);
        if sd_pa >= mem.size() {
            break;
        }
        let read32 = |off: usize| mem.read_u32(sd_pa, off);
        let read32_opt = |off: Option<usize>| off.map(|o| mem.read_u32(sd_pa, o));
        domains.push(SchedDomainSnapshot {
            level: read32(sd_offsets.sd_level) as i32,
            name: read_sd_name(
                mem,
                sd_pa,
                sd_offsets.sd_name,
                page_offset,
                start_kernel_map,
                phys_base,
            ),
            flags: read32(sd_offsets.sd_flags) as i32,
            span_weight: read32(sd_offsets.sd_span_weight),
            balance_interval: read32(sd_offsets.sd_balance_interval),
            nr_balance_failed: read32(sd_offsets.sd_nr_balance_failed),
            newidle_call: read32_opt(sd_offsets.sd_newidle_call),
            newidle_success: read32_opt(sd_offsets.sd_newidle_success),
            newidle_ratio: read32_opt(sd_offsets.sd_newidle_ratio),
            max_newidle_lb_cost: mem.read_u64(sd_pa, sd_offsets.sd_max_newidle_lb_cost),
            stats: sd_offsets
                .stats_offsets
                .as_ref()
                .map(|so| read_sd_stats(mem, sd_pa, so)),
        });
        current_kva = mem.read_u64(sd_pa, sd_offsets.sd_parent);
    }
    Some(domains)
}
/// Resolves the per-CPU physical addresses of the sched_ext event-stats area.
///
/// Dereferences the scheduler pointer at `scx_root_pa`, reads the per-cpu
/// base pointer at `percpu_ptr_off`, then applies each CPU's per-cpu offset.
/// Returns `None` while either pointer is still NULL (no scheduler attached).
pub(crate) fn resolve_event_pcpu_pas(
    mem: &GuestMem,
    scx_root_pa: u64,
    ev: &ScxEventOffsets,
    per_cpu_offsets: &[u64],
    page_offset: u64,
) -> Option<Vec<u64>> {
    let scx_sched_kva = mem.read_u64(scx_root_pa, 0);
    if scx_sched_kva == 0 {
        return None;
    }
    let scx_sched_pa = super::symbols::kva_to_pa(scx_sched_kva, page_offset);
    let pcpu_kva = mem.read_u64(scx_sched_pa, ev.percpu_ptr_off);
    if pcpu_kva == 0 {
        return None;
    }
    let mut pas = Vec::with_capacity(per_cpu_offsets.len());
    for &cpu_off in per_cpu_offsets {
        pas.push(super::symbols::kva_to_pa(
            pcpu_kva.wrapping_add(cpu_off),
            page_offset,
        ));
    }
    Some(pas)
}
/// Handles to the vCPU host threads, used to sample per-thread CPU time.
pub(crate) struct VcpuTiming {
    /// One pthread handle per vCPU, indexed by vCPU number.
    pub pthreads: Vec<libc::pthread_t>,
}
impl VcpuTiming {
    /// Samples each vCPU thread's cumulative on-CPU time, in nanoseconds.
    ///
    /// Returns one entry per pthread (indexed like `self.pthreads`); `None`
    /// marks a vCPU whose clock could not be read. `reported_err` is a
    /// per-vCPU "already warned" latch so a persistent failure logs only
    /// once; the latch is cleared again on a successful sample.
    fn read_cpu_times(&self, reported_err: &mut [bool]) -> Vec<Option<u64>> {
        self.pthreads
            .iter()
            .enumerate()
            .map(|(vcpu, &pt)| {
                // Resolve the thread's CPU-time clock id.
                let mut clk: libc::clockid_t = 0;
                let ret = unsafe { libc::pthread_getcpuclockid(pt, &mut clk) };
                if ret != 0 {
                    if let Some(slot) = reported_err.get_mut(vcpu)
                        && !*slot
                    {
                        tracing::warn!(
                            vcpu,
                            ret,
                            errno = std::io::Error::last_os_error().raw_os_error(),
                            "pthread_getcpuclockid failed; stall gating unavailable for this vCPU"
                        );
                        *slot = true;
                    }
                    return None;
                }
                let mut ts = libc::timespec {
                    tv_sec: 0,
                    tv_nsec: 0,
                };
                // Read the per-thread CPU-time clock.
                let ret = unsafe { libc::clock_gettime(clk, &mut ts) };
                if ret != 0 {
                    if let Some(slot) = reported_err.get_mut(vcpu)
                        && !*slot
                    {
                        tracing::warn!(
                            vcpu,
                            ret,
                            errno = std::io::Error::last_os_error().raw_os_error(),
                            "clock_gettime on pthread clock failed; stall gating unavailable for this vCPU"
                        );
                        *slot = true;
                    }
                    return None;
                }
                // Defensive: a negative timespec would wrap in the unsigned
                // arithmetic below.
                if ts.tv_sec < 0 || ts.tv_nsec < 0 {
                    if let Some(slot) = reported_err.get_mut(vcpu)
                        && !*slot
                    {
                        tracing::warn!(
                            vcpu,
                            tv_sec = ts.tv_sec,
                            tv_nsec = ts.tv_nsec,
                            "negative clock_gettime result; stall gating unavailable for this vCPU"
                        );
                        *slot = true;
                    }
                    return None;
                }
                // Success: re-arm the warning latch for this vCPU.
                if let Some(slot) = reported_err.get_mut(vcpu) {
                    *slot = false;
                }
                Some(ts.tv_sec as u64 * 1_000_000_000 + ts.tv_nsec as u64)
            })
            .collect()
    }
}
/// Decides whether a vCPU looks host-preempted between two samples of its
/// cumulative CPU time: preempted means it accrued less than `threshold_ns`
/// since the previous sample. A missing sample on either side never counts
/// as preempted.
pub(crate) fn evaluate_preempted(prev: Option<u64>, curr: Option<u64>, threshold_ns: u64) -> bool {
    let (Some(p), Some(c)) = (prev, curr) else {
        return false;
    };
    // saturating_sub guards against a (theoretically) non-monotonic clock.
    c.saturating_sub(p) < threshold_ns
}
/// Heuristic stall check across two consecutive snapshots of one CPU.
///
/// A CPU is considered stalled when its rq clock is valid but did not
/// advance over the interval while tasks were runnable — unless the host
/// simply preempted the vCPU (it accrued almost no CPU time), in which case
/// the frozen clock is expected and not a guest-side stall.
pub(crate) fn is_cpu_stalled(
    prev: &super::CpuSnapshot,
    curr: &super::CpuSnapshot,
    preemption_threshold_ns: u64,
) -> bool {
    // The clock must be nonzero and unchanged to even be a candidate.
    let clock_frozen = curr.rq_clock != 0 && curr.rq_clock == prev.rq_clock;
    if !clock_frozen {
        return false;
    }
    // An idle CPU legitimately stops advancing its rq clock.
    if prev.nr_running == 0 && curr.nr_running == 0 {
        return false;
    }
    // Host preemption also freezes the clock; don't report that as a stall.
    !evaluate_preempted(
        prev.vcpu_cpu_time_ns,
        curr.vcpu_cpu_time_ns,
        preemption_threshold_ns,
    )
}
/// Configuration for triggering an in-guest debug dump when monitor
/// thresholds are violated.
pub(crate) struct DumpTrigger {
    /// Threshold values that decide when a dump should be requested.
    pub thresholds: super::MonitorThresholds,
    /// Console used to deliver the dump request to the guest, when present.
    pub virtio_con:
        Option<std::sync::Arc<crate::vmm::PiMutex<crate::vmm::virtio_console::VirtioConsole>>>,
}
/// Inputs for re-deriving per-CPU runqueue addresses on every monitor tick
/// (needed while the guest boots and per-cpu/KASLR state settles).
pub(crate) struct RqRefresh {
    /// PA of the per-cpu offset array read by `symbols::read_per_cpu_offsets`.
    pub pco_pa: u64,
    /// KVA of the per-cpu `runqueues` symbol, fed to `compute_rq_pas`.
    pub runqueues_kva: u64,
    /// Start of the per-cpu area (KVA), fed to `compute_rq_pas`.
    pub per_cpu_start: u64,
    /// KASLR slide, fed to `compute_rq_pas`.
    pub kaslr_offset: u64,
    /// Number of guest CPUs to resolve.
    pub num_cpus: u32,
    /// PA of the dynamic page-offset variable (presumably
    /// `page_offset_base`); `None` when the page offset is already fixed.
    /// The monitor loop validates it via alignment/stability checks.
    pub page_offset_base_pa: Option<u64>,
    /// Optional inputs for also refreshing sched_ext event-counter PAs.
    pub event: Option<EventRefresh>,
}
/// Inputs for re-resolving sched_ext per-CPU event-stat addresses each tick.
pub(crate) struct EventRefresh {
    /// PA of the scheduler root pointer (dereferenced each refresh).
    pub scx_root_pa: u64,
    /// BTF-derived offsets for the event counters.
    pub event_offsets: ScxEventOffsets,
}
/// Where the monitor writes the sched_ext watchdog timeout each tick.
pub(crate) enum WatchdogOverride {
    /// The timeout lives on the attached scheduler object, reached by
    /// dereferencing the pointer at `scx_root_pa` and adding
    /// `watchdog_offset`.
    ScxSched {
        scx_root_pa: u64,
        watchdog_offset: usize,
        jiffies: u64,
    },
    /// The timeout is a static global at a fixed physical address.
    StaticGlobal {
        watchdog_timeout_pa: u64,
        jiffies: u64,
    },
}
/// Context for collecting BPF program statistics out of guest memory.
pub(crate) struct ProgStatsCtx {
    /// Per-CPU base offsets, one per guest CPU.
    pub per_cpu_offsets: Vec<u64>,
    /// Page-table walk inputs for translating kernel virtual addresses.
    pub walk: WalkContext,
    /// KVA of the kernel's BPF prog IDR (per the field name — TODO confirm
    /// against the consumer of this struct).
    pub prog_idr_kva: u64,
    /// BTF-derived `bpf_prog` field offsets.
    pub offsets: super::btf_offsets::BpfProgOffsets,
    /// Kernel text mapping base, for text-KVA translation.
    pub start_kernel_map: u64,
    /// Physical load base of the kernel image.
    pub phys_base: u64,
}
/// Everything `monitor_loop` hands back to its caller when it exits.
pub(crate) struct MonitorLoopResult {
    /// Periodic per-CPU samples collected over the run.
    pub(crate) samples: Vec<MonitorSample>,
    /// Result of draining the guest's bulk shared-memory channel.
    pub(crate) drain: crate::vmm::host_comms::BulkDrainResult,
    /// First watchdog write-back observation (expected vs. read-back), if any.
    pub(crate) watchdog_observation: Option<super::WatchdogObservation>,
    /// Page offset latched once per-CPU data became valid (0 if never).
    pub(crate) page_offset: u64,
    /// The preemption threshold actually used (config value or default).
    pub(crate) preemption_threshold_ns: u64,
}
/// Optional knobs and contexts for `monitor_loop`; references borrow from
/// the caller for the duration of the loop.
pub(crate) struct MonitorConfig<'a> {
    /// Pre-resolved per-CPU event-counter PAs (refreshed when `rq_refresh`).
    pub event_pcpu_pas: Option<&'a [u64]>,
    /// Debug-dump trigger configuration.
    pub dump_trigger: Option<&'a DumpTrigger>,
    /// Watchdog timeout override to apply each tick.
    pub watchdog_override: Option<&'a WatchdogOverride>,
    /// vCPU thread handles for CPU-time sampling (stall gating).
    pub vcpu_timing: Option<&'a VcpuTiming>,
    /// Per-vCPU perf counter capture.
    pub perf_capture: Option<&'a super::perf_counters::PerfCountersCapture>,
    /// Preemption threshold in ns; 0 means "use the default".
    pub preemption_threshold_ns: u64,
    /// Context for BPF program stats collection.
    pub prog_stats_ctx: Option<&'a ProgStatsCtx>,
    /// Initial kernel direct-map base (may be refined via `rq_refresh`).
    pub page_offset: u64,
    /// Kernel text mapping base.
    pub start_kernel_map: u64,
    /// Physical load base of the kernel image.
    pub phys_base: u64,
    /// Inputs for per-tick runqueue address refresh during boot.
    pub rq_refresh: Option<&'a RqRefresh>,
    /// Eventfd signaled when the guest system is ready (boot gate).
    pub sys_rdy: Option<&'a EventFd>,
    /// Deferred watchdog-reset arrangement.
    pub watchdog_reset: Option<WatchdogReset<'a>>,
}
/// Arranges a deferred watchdog reset once a sched_ext scheduler attaches
/// (the monitor loop polls `scx_root_pa` and publishes a target time).
pub(crate) struct WatchdogReset<'a> {
    /// PA of the scheduler root pointer polled for attachment.
    pub scx_root_pa: u64,
    /// Workload duration added to the elapsed time to form the target.
    pub workload_duration: Duration,
    /// Destination for the encoded target time (ns since run start, min 1).
    pub reset_ns: &'a AtomicU64,
}
#[allow(clippy::too_many_arguments)]
pub(crate) fn monitor_loop(
mem: &GuestMem,
rq_pas: &[u64],
offsets: &KernelOffsets,
interval: Duration,
kill: &AtomicBool,
kill_evt: &EventFd,
run_start: Instant,
cfg: &MonitorConfig<'_>,
) -> MonitorLoopResult {
let dump_trigger = cfg.dump_trigger;
let watchdog_override = cfg.watchdog_override;
let vcpu_timing = cfg.vcpu_timing;
let perf_capture = cfg.perf_capture;
let preemption_threshold_ns = cfg.preemption_threshold_ns;
let prog_stats_ctx = cfg.prog_stats_ctx;
let mut page_offset = cfg.page_offset;
let start_kernel_map = cfg.start_kernel_map;
let phys_base = cfg.phys_base;
let rq_refresh = cfg.rq_refresh;
let preemption_threshold_ns = if preemption_threshold_ns > 0 {
preemption_threshold_ns
} else {
super::vcpu_preemption_threshold_ns(None)
};
let mut rq_pas_buf: Vec<u64> = rq_pas.to_vec();
let mut event_pcpu_pas_buf: Option<Vec<u64>> = cfg.event_pcpu_pas.map(|s| s.to_vec());
let mut per_cpu_offsets_buf: Vec<u64> = prog_stats_ctx
.map(|c| c.per_cpu_offsets.clone())
.unwrap_or_default();
let mut diag_iter: u32 = 0;
let mut prev_pob: u64 = 0;
let mut data_valid: bool = rq_refresh.is_none();
let mut latched_page_offset: u64 = 0;
let num_cpus: usize = rq_refresh
.map(|r| r.num_cpus as usize)
.unwrap_or(rq_pas.len());
let mut samples: Vec<MonitorSample> = Vec::new();
let mut imbalance_tracker = super::SustainedViolationTracker::default();
let mut dsq_tracker = super::SustainedViolationTracker::default();
let mut stall_trackers: Vec<super::SustainedViolationTracker> =
vec![super::SustainedViolationTracker::default(); num_cpus];
let mut dump_requested = false;
let mut cpus: Vec<CpuSnapshot> = Vec::with_capacity(num_cpus);
let mut perf_read_err_reported = false;
let mut vcpu_timing_err_reported: Vec<bool> = vcpu_timing
.map(|vt| vec![false; vt.pthreads.len()])
.unwrap_or_default();
let shm_entries: Vec<crate::vmm::wire::ShmEntry> = Vec::new();
let mut watchdog_observation: Option<super::WatchdogObservation> = None;
let mut watchdog_reset_signaled = false;
let tick_tfd = match TimerFd::new() {
Ok(t) => t,
Err(e) => {
tracing::warn!(err = %e, "monitor: timerfd_create failed");
return MonitorLoopResult {
samples,
drain: crate::vmm::host_comms::BulkDrainResult {
entries: shm_entries,
},
watchdog_observation,
page_offset: latched_page_offset,
preemption_threshold_ns,
};
}
};
let mut tick_tfd = tick_tfd;
if let Err(e) = tick_tfd.reset(interval, Some(interval)) {
tracing::warn!(err = %e, "monitor: timerfd_settime failed");
return MonitorLoopResult {
samples,
drain: crate::vmm::host_comms::BulkDrainResult {
entries: shm_entries,
},
watchdog_observation,
page_offset: latched_page_offset,
preemption_threshold_ns,
};
}
let epoll = match Epoll::new() {
Ok(e) => e,
Err(e) => {
tracing::warn!(err = %e, "monitor: epoll_create1 failed");
return MonitorLoopResult {
samples,
drain: crate::vmm::host_comms::BulkDrainResult {
entries: shm_entries,
},
watchdog_observation,
page_offset: latched_page_offset,
preemption_threshold_ns,
};
}
};
let tick_fd = tick_tfd.as_raw_fd();
let kill_fd = kill_evt.as_raw_fd();
if let Err(e) = epoll.ctl(
ControlOperation::Add,
tick_fd,
EpollEvent::new(EventSet::IN, tick_fd as u64),
) {
tracing::warn!(err = %e, "monitor: epoll_ctl add timerfd failed");
return MonitorLoopResult {
samples,
drain: crate::vmm::host_comms::BulkDrainResult {
entries: shm_entries,
},
watchdog_observation,
page_offset: latched_page_offset,
preemption_threshold_ns,
};
}
if let Err(e) = epoll.ctl(
ControlOperation::Add,
kill_fd,
EpollEvent::new(EventSet::IN, kill_fd as u64),
) {
tracing::warn!(err = %e, "monitor: epoll_ctl add kill_evt failed");
return MonitorLoopResult {
samples,
drain: crate::vmm::host_comms::BulkDrainResult {
entries: shm_entries,
},
watchdog_observation,
page_offset: latched_page_offset,
preemption_threshold_ns,
};
}
let mut epoll_buf = [EpollEvent::default(); 2];
if let Some(sys_rdy) = cfg.sys_rdy {
let boot_epoll = match Epoll::new() {
Ok(e) => e,
Err(e) => {
tracing::warn!(err = %e, "monitor: boot epoll_create1 failed");
return MonitorLoopResult {
samples,
drain: crate::vmm::host_comms::BulkDrainResult {
entries: shm_entries,
},
watchdog_observation,
page_offset: latched_page_offset,
preemption_threshold_ns,
};
}
};
let boot_fd = sys_rdy.as_raw_fd();
if let Err(e) = boot_epoll.ctl(
ControlOperation::Add,
boot_fd,
EpollEvent::new(EventSet::IN, boot_fd as u64),
) {
tracing::warn!(err = %e, "monitor: epoll_ctl add sys_rdy failed");
}
if let Err(e) = boot_epoll.ctl(
ControlOperation::Add,
kill_fd,
EpollEvent::new(EventSet::IN, kill_fd as u64),
) {
tracing::warn!(err = %e, "monitor: epoll_ctl add kill_evt (boot wait) failed");
}
let mut boot_buf = [EpollEvent::default(); 2];
let timeout_ms: i32 = 5_000;
let mut killed = false;
match boot_epoll.wait(timeout_ms, &mut boot_buf) {
Ok(n) => {
for ev in &boot_buf[..n] {
if ev.fd() == kill_fd {
killed = true;
}
}
}
Err(e) => {
tracing::warn!(err = %e, "monitor: boot epoll_wait failed");
}
}
if killed || kill.load(Ordering::Acquire) {
return MonitorLoopResult {
samples,
drain: crate::vmm::host_comms::BulkDrainResult {
entries: shm_entries,
},
watchdog_observation,
page_offset: latched_page_offset,
preemption_threshold_ns,
};
}
}
loop {
if kill.load(Ordering::Acquire) {
break;
}
let mut page_offset_resolved = false;
if let Some(refresh) = rq_refresh {
if let Some(pob_pa) = refresh.page_offset_base_pa {
let val = mem.read_u64(pob_pa, 0);
let pob_aligned = (val & 0xFFF) == 0;
let pob_stable = prev_pob == val;
prev_pob = val;
if val & (1u64 << 63) != 0 && pob_aligned && pob_stable {
page_offset = val;
page_offset_resolved = true;
}
} else {
page_offset_resolved = true;
}
}
if let Some(reset) = cfg.watchdog_reset.as_ref() {
let scx_sched_kva = mem.read_u64(reset.scx_root_pa, 0);
if scx_sched_kva != 0 && !watchdog_reset_signaled {
let elapsed = run_start.elapsed();
let target_ns = elapsed
.as_nanos()
.saturating_add(reset.workload_duration.as_nanos());
let encoded = u64::try_from(target_ns).unwrap_or(u64::MAX).max(1);
reset.reset_ns.store(encoded, Ordering::Release);
watchdog_reset_signaled = true;
}
}
if let Some(wd) = watchdog_override {
let (write_pa, write_offset, wd_jiffies) = match wd {
WatchdogOverride::ScxSched {
scx_root_pa,
watchdog_offset,
jiffies,
} => {
let sch_kva = mem.read_u64(*scx_root_pa, 0);
if sch_kva == 0 {
(None, 0, *jiffies)
} else {
let sch_pa = super::symbols::kva_to_pa(sch_kva, page_offset);
(Some(sch_pa), *watchdog_offset, *jiffies)
}
}
WatchdogOverride::StaticGlobal {
watchdog_timeout_pa,
jiffies,
} => (Some(*watchdog_timeout_pa), 0, *jiffies),
};
if let Some(pa) = write_pa {
mem.write_u64(pa, write_offset, wd_jiffies);
if watchdog_observation.is_none() {
let observed = mem.read_u64(pa, write_offset);
watchdog_observation = Some(super::WatchdogObservation {
expected_jiffies: wd_jiffies,
observed_jiffies: observed,
});
}
}
}
if let Some(refresh) = rq_refresh {
let fresh = super::symbols::read_per_cpu_offsets(mem, refresh.pco_pa, refresh.num_cpus);
if !data_valid
&& page_offset_resolved
&& !fresh.is_empty()
&& fresh.iter().all(|&v| v != 0)
{
data_valid = true;
latched_page_offset = page_offset;
eprintln!(
"DATA_VALID latched at iter={} page_offset={:#x} pco0={:#x}",
diag_iter,
page_offset,
fresh.first().copied().unwrap_or(0),
);
}
rq_pas_buf = super::symbols::compute_rq_pas(
refresh.runqueues_kva,
&fresh,
page_offset,
refresh.per_cpu_start,
refresh.kaslr_offset,
);
event_pcpu_pas_buf = refresh.event.as_ref().and_then(|ev| {
resolve_event_pcpu_pas(mem, ev.scx_root_pa, &ev.event_offsets, &fresh, page_offset)
});
per_cpu_offsets_buf = fresh;
diag_iter = diag_iter.saturating_add(1);
}
if data_valid {
cpus.clear();
cpus.extend(rq_pas_buf.iter().map(|&pa| read_rq_stats(mem, pa, offsets)));
if let (Some(pcpu_pas), Some(ev)) =
(event_pcpu_pas_buf.as_deref(), &offsets.event_offsets)
{
for (i, cpu) in cpus.iter_mut().enumerate() {
if let Some(&pcpu_pa) = pcpu_pas.get(i) {
cpu.event_counters = Some(read_event_stats(mem, pcpu_pa, ev));
}
}
}
if let Some(ss) = &offsets.schedstat_offsets {
for (i, cpu) in cpus.iter_mut().enumerate() {
if let Some(&rq_pa) = rq_pas_buf.get(i) {
cpu.schedstat = Some(read_rq_schedstat(mem, rq_pa, ss));
}
}
}
if let Some(sd) = &offsets.sched_domain_offsets {
for (i, cpu) in cpus.iter_mut().enumerate() {
if let Some(&rq_pa) = rq_pas_buf.get(i) {
cpu.sched_domains = read_sched_domain_tree(
mem,
rq_pa,
sd,
page_offset,
start_kernel_map,
phys_base,
);
}
}
}
if let Some(vt) = vcpu_timing {
let times = vt.read_cpu_times(&mut vcpu_timing_err_reported);
for (i, cpu) in cpus.iter_mut().enumerate() {
if let Some(&t) = times.get(i) {
cpu.vcpu_cpu_time_ns = t;
}
}
}
}
if let Some(pc) = perf_capture {
match pc.read_all() {
Ok(samples) => {
for (i, cpu) in cpus.iter_mut().enumerate() {
if let Some(s) = samples.get(i) {
cpu.vcpu_perf = Some(*s);
}
}
}
Err(e) => {
if !perf_read_err_reported {
tracing::warn!(
err = %e,
"perf counter read failed; vcpu_perf will be None until next successful sample"
);
perf_read_err_reported = true;
}
}
}
}
if let Some(trigger) = dump_trigger
&& !dump_requested
&& !cpus.is_empty()
{
let t = &trigger.thresholds;
let sample_idx = samples.len();
let tmp_sample = MonitorSample {
elapsed_ms: 0,
cpus: cpus.clone(),
prog_stats: None,
};
let ratio = tmp_sample.imbalance_ratio();
imbalance_tracker.record(ratio > t.max_imbalance_ratio, ratio, sample_idx);
let worst_dsq = cpus.iter().map(|c| c.local_dsq_depth).max().unwrap_or(0);
dsq_tracker.record(
worst_dsq > t.max_local_dsq_depth,
worst_dsq as f64,
sample_idx,
);
if t.fail_on_stall
&& let Some(prev) = samples.last()
{
let n = prev.cpus.len().min(cpus.len()).min(stall_trackers.len());
for i in 0..n {
let is_stall = is_cpu_stalled(&prev.cpus[i], &cpus[i], preemption_threshold_ns);
stall_trackers[i].record(is_stall, cpus[i].rq_clock as f64, sample_idx);
}
}
let sustained = imbalance_tracker.sustained(t.sustained_samples)
|| dsq_tracker.sustained(t.sustained_samples)
|| stall_trackers
.iter()
.any(|s| s.sustained(t.sustained_samples));
if sustained {
if let Some(ref vc) = trigger.virtio_con {
crate::vmm::host_comms::request_dump(vc);
} else {
tracing::warn!(
"dump_trigger: no virtio_console attached; \
SIGNAL_VC_DUMP not delivered"
);
}
dump_requested = true;
}
}
let prog_stats = if data_valid {
prog_stats_ctx.map(|ctx| {
super::bpf_prog::walk_struct_ops_runtime_stats(
mem,
ctx.walk,
ctx.prog_idr_kva,
&ctx.offsets,
&per_cpu_offsets_buf,
ctx.start_kernel_map,
ctx.phys_base,
)
})
} else {
None
};
samples.push(MonitorSample {
elapsed_ms: run_start.elapsed().as_millis() as u64,
cpus: cpus.clone(),
prog_stats,
});
match epoll.wait(-1, &mut epoll_buf) {
Ok(n) => {
for ev in &epoll_buf[..n] {
if ev.fd() == tick_fd {
let _ = tick_tfd.wait();
}
}
}
Err(e) => {
if e.raw_os_error() != Some(libc::EINTR) {
tracing::warn!(err = %e, "monitor: epoll_wait failed");
break;
}
}
}
}
if let Some(refresh) = rq_refresh {
let fresh = super::symbols::read_per_cpu_offsets(mem, refresh.pco_pa, refresh.num_cpus);
let valid_samples = samples
.iter()
.filter(|s| s.cpus.iter().any(|c| c.rq_clock > 0))
.count();
let pob_pa_live = refresh.page_offset_base_pa.unwrap_or(0);
let pob_val = if pob_pa_live != 0 {
mem.read_u64(pob_pa_live, 0)
} else {
0
};
eprintln!(
"FINAL DIAG iters={iters} samples={samples} valid={valid} data_valid={dv} \
page_offset={po:#x} phys_base={pb:#x} start_kernel_map={skm:#x} \
pco_pa={pco_pa:#x} pob_pa={pob_pa:#x} pob_val={pob_val:#x} \
pco0={pco0:#x} pco1={pco1:#x} runqueues_kva={rq:#x} \
mem_size={ms:#x} rq_pa0={rq0:#x} kaslr_off={ko:#x}",
iters = diag_iter,
samples = samples.len(),
valid = valid_samples,
dv = data_valid,
po = page_offset,
pb = phys_base,
skm = cfg.start_kernel_map,
pco_pa = refresh.pco_pa,
pob_pa = pob_pa_live,
pob_val = pob_val,
pco0 = fresh.first().copied().unwrap_or(0),
pco1 = fresh.get(1).copied().unwrap_or(0),
rq = refresh.runqueues_kva,
ms = mem.size(),
ko = refresh.kaslr_offset,
rq0 = rq_pas_buf.first().copied().unwrap_or(0),
);
if let Some(&rq0_pa) = rq_pas_buf.first() {
let raw_clock = mem.read_u64(rq0_pa, offsets.rq_clock);
let raw_nr_running = mem.read_u32(rq0_pa, offsets.rq_nr_running);
let raw_first_8 = mem.read_u64(rq0_pa, 0);
let pco0_verify = mem.read_u64(refresh.pco_pa, 0);
eprintln!(
"FINAL DIAG rq0: clock_off={} nr_off={} raw_clock={raw_clock} raw_nr={raw_nr_running} \
raw_first_8={raw_first_8:#x} pco0_verify={pco0_verify:#x} rq0_pa={rq0_pa:#x} pco_pa={:#x}",
offsets.rq_clock, offsets.rq_nr_running, refresh.pco_pa,
);
}
}
let shm_result = crate::vmm::host_comms::BulkDrainResult {
entries: shm_entries,
};
MonitorLoopResult {
samples,
drain: shm_result,
watchdog_observation,
page_offset: latched_page_offset,
preemption_threshold_ns,
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::os::unix::thread::JoinHandleExt;
// 10 ms in nanoseconds: a vCPU whose guest clock advanced by less than this
// between two consecutive samples is treated as having been preempted.
const THRESHOLD_NS: u64 = 10_000_000;
#[test]
fn evaluate_preempted_both_none_is_not_preempted() {
    // No successful clock read on either side: not enough evidence to flag.
    assert!(!evaluate_preempted(None, None, THRESHOLD_NS));
}
#[test]
fn evaluate_preempted_first_read_failed_is_not_preempted() {
    // Missing previous reading: cannot compute a delta, so no preemption.
    assert!(!evaluate_preempted(None, Some(1_000_000_000), THRESHOLD_NS));
}
#[test]
fn evaluate_preempted_current_read_failed_is_not_preempted() {
    // Missing current reading: cannot compute a delta, so no preemption.
    assert!(!evaluate_preempted(Some(1_000_000_000), None, THRESHOLD_NS));
}
#[test]
fn evaluate_preempted_delta_below_threshold_is_preempted() {
    // Clock advanced only 1 ms (< 10 ms threshold): counts as preempted.
    assert!(evaluate_preempted(
        Some(1_000_000_000),
        Some(1_001_000_000),
        THRESHOLD_NS,
    ));
}
#[test]
fn evaluate_preempted_delta_at_threshold_is_not_preempted() {
    // Exactly 10 ms of progress: the comparison is strict, so not preempted.
    assert!(!evaluate_preempted(
        Some(1_000_000_000),
        Some(1_010_000_000),
        THRESHOLD_NS,
    ));
}
#[test]
fn evaluate_preempted_delta_above_threshold_is_not_preempted() {
    // Plenty of clock progress: the vCPU clearly ran.
    assert!(!evaluate_preempted(
        Some(1_000_000_000),
        Some(2_000_000_000),
        THRESHOLD_NS,
    ));
}
#[test]
fn evaluate_preempted_non_monotonic_treated_as_no_progress() {
    // A clock that went backwards counts as "no progress" -> preempted.
    assert!(evaluate_preempted(
        Some(1_000_000_000),
        Some(999_000_000),
        THRESHOLD_NS,
    ));
}
#[test]
fn evaluate_preempted_zero_threshold_never_preempted() {
    // A zero threshold disables preemption detection entirely.
    assert!(!evaluate_preempted(Some(100), Some(100), 0));
    assert!(!evaluate_preempted(Some(100), Some(200), 0));
}
/// Builds the non-blocking eventfd that `monitor_loop` uses as its kill wakeup.
fn test_kill_evt() -> vmm_sys_util::eventfd::EventFd {
    vmm_sys_util::eventfd::EventFd::new(vmm_sys_util::eventfd::EFD_NONBLOCK)
        .expect("create kill EventFd")
}
/// Baseline `MonitorConfig` with every optional feature disabled; individual
/// tests override fields via struct-update syntax (`..test_config()`).
fn test_config() -> MonitorConfig<'static> {
    MonitorConfig {
        event_pcpu_pas: None,
        dump_trigger: None,
        watchdog_override: None,
        vcpu_timing: None,
        perf_capture: None,
        preemption_threshold_ns: 0,
        prog_stats_ctx: None,
        page_offset: 0,
        start_kernel_map: super::super::symbols::START_KERNEL_MAP,
        phys_base: 0,
        rq_refresh: None,
        sys_rdy: None,
        watchdog_reset: None,
    }
}
/// Synthetic kernel struct-field offsets used to lay out the fake rq buffers;
/// all optional offset groups are absent so only the core fields are read.
fn test_offsets() -> KernelOffsets {
    KernelOffsets {
        rq_nr_running: 8,
        rq_clock: 16,
        rq_scx: 100,
        scx_rq_nr_running: 4,
        scx_rq_local_dsq: 20,
        scx_rq_flags: 8,
        dsq_nr: 0,
        event_offsets: None,
        schedstat_offsets: None,
        sched_domain_offsets: None,
        watchdog_offsets: None,
    }
}
/// Fresh `VirtioConsole` behind `Arc<PiMutex>`, the shape the dump trigger expects.
fn test_virtio_console()
-> std::sync::Arc<crate::vmm::PiMutex<crate::vmm::virtio_console::VirtioConsole>> {
    std::sync::Arc::new(crate::vmm::PiMutex::new(
        crate::vmm::virtio_console::VirtioConsole::new(),
    ))
}
/// Builds a byte image of a fake `struct rq` laid out per `offsets`:
/// the scx sub-struct is embedded at `rq_scx`, the local DSQ sits inside it
/// at `scx_rq_local_dsq`, and `dsq_nr` is read relative to that DSQ.
fn make_rq_buffer(
    offsets: &KernelOffsets,
    nr_running: u32,
    scx_nr: u32,
    dsq_nr: u32,
    clock: u64,
    flags: u32,
) -> Vec<u8> {
    // Buffer must extend past the deepest field (the DSQ nr counter) plus slack.
    let total = offsets.rq_scx + offsets.scx_rq_local_dsq + offsets.dsq_nr + 8;
    let mut image = vec![0u8; total];
    // Helper: write a native-endian u32 at an absolute byte offset.
    let put_u32 = |dst: &mut [u8], at: usize, v: u32| {
        dst[at..at + 4].copy_from_slice(&v.to_ne_bytes());
    };
    put_u32(&mut image, offsets.rq_nr_running, nr_running);
    image[offsets.rq_clock..offsets.rq_clock + 8].copy_from_slice(&clock.to_ne_bytes());
    let scx = offsets.rq_scx;
    put_u32(&mut image, scx + offsets.scx_rq_nr_running, scx_nr);
    put_u32(&mut image, scx + offsets.scx_rq_flags, flags);
    put_u32(&mut image, scx + offsets.scx_rq_local_dsq + offsets.dsq_nr, dsq_nr);
    image
}
#[test]
fn read_rq_stats_known_values() {
    let offsets = test_offsets();
    let buf = make_rq_buffer(&offsets, 5, 3, 7, 999_000, 0x1);
    // SAFETY: `buf` is alive and unmoved for as long as `mem` is used.
    let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
    let snap = read_rq_stats(&mem, 0, &offsets);
    assert_eq!(snap.nr_running, 5);
    assert_eq!(snap.scx_nr_running, 3);
    assert_eq!(snap.local_dsq_depth, 7);
    assert_eq!(snap.rq_clock, 999_000);
    assert_eq!(snap.scx_flags, 0x1);
}
#[test]
fn read_rq_stats_all_zeros() {
    // An all-zero rq image must decode to all-zero stats (no spurious values).
    let offsets = test_offsets();
    let buf = make_rq_buffer(&offsets, 0, 0, 0, 0, 0);
    let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
    let snap = read_rq_stats(&mem, 0, &offsets);
    assert_eq!(snap.nr_running, 0);
    assert_eq!(snap.scx_nr_running, 0);
    assert_eq!(snap.local_dsq_depth, 0);
    assert_eq!(snap.rq_clock, 0);
    assert_eq!(snap.scx_flags, 0);
}
#[test]
fn read_rq_stats_max_values() {
    // Saturated fields must round-trip without truncation or sign issues.
    let offsets = test_offsets();
    let buf = make_rq_buffer(&offsets, u32::MAX, u32::MAX, u32::MAX, u64::MAX, u32::MAX);
    let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
    let snap = read_rq_stats(&mem, 0, &offsets);
    assert_eq!(snap.nr_running, u32::MAX);
    assert_eq!(snap.scx_nr_running, u32::MAX);
    assert_eq!(snap.local_dsq_depth, u32::MAX);
    assert_eq!(snap.rq_clock, u64::MAX);
    assert_eq!(snap.scx_flags, u32::MAX);
}
#[test]
fn read_u32_out_of_bounds() {
    // Reads that would run past the end of guest memory return 0, not garbage.
    let buf = [0xFFu8; 8];
    let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
    assert_eq!(mem.read_u32(6, 0), 0);
    // A read whose last byte is exactly at the boundary is still in bounds.
    assert_eq!(mem.read_u32(4, 0), u32::from_ne_bytes([0xFF; 4]));
    assert_eq!(mem.read_u32(5, 0), 0);
}
#[test]
fn read_u64_out_of_bounds() {
    let buf = [0xFFu8; 16];
    let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
    assert_eq!(mem.read_u64(10, 0), 0);
    assert_eq!(mem.read_u64(8, 0), u64::from_ne_bytes([0xFF; 8]));
    assert_eq!(mem.read_u64(9, 0), 0);
}
#[test]
fn monitor_loop_kill_immediately() {
    // With the kill flag pre-set the loop must exit before taking any sample.
    let offsets = test_offsets();
    let buf = make_rq_buffer(&offsets, 1, 1, 1, 100, 0);
    let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
    let kill = AtomicBool::new(true);
    let kill_evt = test_kill_evt();
    let MonitorLoopResult { samples, .. } = monitor_loop(
        &mem,
        &[0],
        &offsets,
        Duration::from_millis(10),
        &kill,
        &kill_evt,
        Instant::now(),
        &test_config(),
    );
    assert!(samples.is_empty());
}
#[test]
fn monitor_loop_one_iteration() {
    let offsets = test_offsets();
    let buf = make_rq_buffer(&offsets, 2, 1, 3, 500, 0);
    let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
    let kill = std::sync::Arc::new(AtomicBool::new(false));
    let kill_evt = test_kill_evt();
    // Flip the kill flag from another thread after the loop has had time to
    // take at least one sample (10 ms period, 50 ms until kill).
    let handle = {
        let kill = std::sync::Arc::clone(&kill);
        std::thread::spawn(move || {
            std::thread::sleep(Duration::from_millis(50));
            kill.store(true, Ordering::Release);
        })
    };
    let MonitorLoopResult { samples, .. } = monitor_loop(
        &mem,
        &[0],
        &offsets,
        Duration::from_millis(10),
        &kill,
        &kill_evt,
        Instant::now(),
        &test_config(),
    );
    handle.join().unwrap();
    assert!(!samples.is_empty());
    assert_eq!(samples[0].cpus.len(), 1);
    assert_eq!(samples[0].cpus[0].nr_running, 2);
    assert_eq!(samples[0].cpus[0].scx_nr_running, 1);
    assert_eq!(samples[0].cpus[0].local_dsq_depth, 3);
    assert_eq!(samples[0].cpus[0].rq_clock, 500);
}
#[test]
fn two_cpu_independent_reads() {
    // Two rq images packed back-to-back; each must decode independently.
    let offsets = test_offsets();
    let buf0 = make_rq_buffer(&offsets, 10, 5, 2, 1000, 0x1);
    let buf1 = make_rq_buffer(&offsets, 20, 15, 8, 2000, 0x2);
    let pa1 = buf0.len() as u64;
    let mut combined = buf0;
    combined.extend_from_slice(&buf1);
    let mem = unsafe { GuestMem::new(combined.as_ptr() as *mut u8, combined.len() as u64) };
    let snap0 = read_rq_stats(&mem, 0, &offsets);
    let snap1 = read_rq_stats(&mem, pa1, &offsets);
    assert_eq!(snap0.nr_running, 10);
    assert_eq!(snap0.scx_nr_running, 5);
    assert_eq!(snap0.local_dsq_depth, 2);
    assert_eq!(snap0.rq_clock, 1000);
    assert_eq!(snap0.scx_flags, 0x1);
    assert_eq!(snap1.nr_running, 20);
    assert_eq!(snap1.scx_nr_running, 15);
    assert_eq!(snap1.local_dsq_depth, 8);
    assert_eq!(snap1.rq_clock, 2000);
    assert_eq!(snap1.scx_flags, 0x2);
}
#[test]
fn read_u32_nonzero_pa_and_offset() {
    // pa and offset are summed: 12 + 8 addresses bytes 20..24.
    let mut buf = [0u8; 32];
    buf[20..24].copy_from_slice(&0xDEADBEEFu32.to_ne_bytes());
    let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
    assert_eq!(mem.read_u32(12, 8), 0xDEADBEEF);
}
#[test]
fn read_u64_nonzero_pa_and_offset() {
    // 10 + 6 addresses bytes 16..24.
    let mut buf = [0u8; 32];
    buf[16..24].copy_from_slice(&0x0123456789ABCDEFu64.to_ne_bytes());
    let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
    assert_eq!(mem.read_u64(10, 6), 0x0123456789ABCDEF);
}
#[test]
fn monitor_loop_multi_cpu() {
    let offsets = test_offsets();
    let buf0 = make_rq_buffer(&offsets, 3, 2, 1, 100, 0);
    let buf1 = make_rq_buffer(&offsets, 7, 5, 4, 200, 0);
    let pa1 = buf0.len() as u64;
    let mut combined = buf0;
    combined.extend_from_slice(&buf1);
    let mem = unsafe { GuestMem::new(combined.as_ptr() as *mut u8, combined.len() as u64) };
    let kill = std::sync::Arc::new(AtomicBool::new(false));
    let kill_evt = test_kill_evt();
    let handle = {
        let kill = std::sync::Arc::clone(&kill);
        std::thread::spawn(move || {
            std::thread::sleep(Duration::from_millis(50));
            kill.store(true, Ordering::Release);
        })
    };
    let MonitorLoopResult { samples, .. } = monitor_loop(
        &mem,
        &[0, pa1],
        &offsets,
        Duration::from_millis(10),
        &kill,
        &kill_evt,
        Instant::now(),
        &test_config(),
    );
    handle.join().unwrap();
    assert!(!samples.is_empty());
    // Every sample must carry a snapshot for both CPUs.
    for s in &samples {
        assert_eq!(s.cpus.len(), 2);
    }
    assert_eq!(samples[0].cpus[0].nr_running, 3);
    assert_eq!(samples[0].cpus[0].scx_nr_running, 2);
    assert_eq!(samples[0].cpus[1].nr_running, 7);
    assert_eq!(samples[0].cpus[1].scx_nr_running, 5);
}
#[test]
fn monitor_loop_elapsed_ms_progresses() {
    // elapsed_ms must be monotonically non-decreasing across samples.
    let offsets = test_offsets();
    let buf = make_rq_buffer(&offsets, 1, 1, 1, 100, 0);
    let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
    let kill = std::sync::Arc::new(AtomicBool::new(false));
    let kill_evt = test_kill_evt();
    let handle = {
        let kill = std::sync::Arc::clone(&kill);
        std::thread::spawn(move || {
            std::thread::sleep(Duration::from_millis(200));
            kill.store(true, Ordering::Release);
        })
    };
    let MonitorLoopResult { samples, .. } = monitor_loop(
        &mem,
        &[0],
        &offsets,
        Duration::from_millis(10),
        &kill,
        &kill_evt,
        Instant::now(),
        &test_config(),
    );
    handle.join().unwrap();
    assert!(
        samples.len() >= 2,
        "need at least 2 samples, got {}",
        samples.len()
    );
    for w in samples.windows(2) {
        assert!(
            w[1].elapsed_ms >= w[0].elapsed_ms,
            "elapsed_ms went backwards: {} -> {}",
            w[0].elapsed_ms,
            w[1].elapsed_ms
        );
    }
    assert!(samples.last().unwrap().elapsed_ms > 0);
}
/// Event-counter offsets with the five mandatory counters packed at
/// 0, 8, 16, 24, 32 and every optional (kernel-version-dependent)
/// counter marked absent.
fn test_event_offsets() -> ScxEventOffsets {
    ScxEventOffsets {
        percpu_ptr_off: 0,
        event_stats_off: 0,
        ev_select_cpu_fallback: 0,
        ev_dispatch_local_dsq_offline: 8,
        ev_dispatch_keep_last: 16,
        ev_enq_skip_exiting: 24,
        ev_enq_skip_migration_disabled: 32,
        ev_reenq_immed: None,
        ev_reenq_local_repeat: None,
        ev_refill_slice_dfl: None,
        ev_bypass_duration: None,
        ev_bypass_dispatch: None,
        ev_bypass_activate: None,
        ev_insert_not_owned: None,
        ev_sub_bypass_dispatch: None,
    }
}
/// Builds a byte image of the per-CPU scx event-stats block: the five
/// mandatory `i64` counters are written at `event_stats_off` plus their
/// per-field offsets from `ev`.
fn make_event_stats_buffer(
    ev: &ScxEventOffsets,
    fallback: i64,
    offline: i64,
    keep_last: i64,
    skip_exit: i64,
    skip_mig: i64,
) -> Vec<u8> {
    // Size reaches just past the highest fixed counter (+8 for its i64).
    let mut image = vec![0u8; ev.event_stats_off + ev.ev_enq_skip_migration_disabled + 8];
    let base = ev.event_stats_off;
    // Table-driven: (field offset within the stats block, value to store).
    for (field_off, value) in [
        (ev.ev_select_cpu_fallback, fallback),
        (ev.ev_dispatch_local_dsq_offline, offline),
        (ev.ev_dispatch_keep_last, keep_last),
        (ev.ev_enq_skip_exiting, skip_exit),
        (ev.ev_enq_skip_migration_disabled, skip_mig),
    ] {
        image[base + field_off..base + field_off + 8].copy_from_slice(&value.to_ne_bytes());
    }
    image
}
#[test]
fn read_event_stats_known_values() {
    let ev = test_event_offsets();
    let buf = make_event_stats_buffer(&ev, 42, 7, 100, 3, 5);
    // SAFETY: `buf` is alive and unmoved for as long as `mem` is used.
    let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
    let stats = read_event_stats(&mem, 0, &ev);
    assert_eq!(stats.select_cpu_fallback, 42);
    assert_eq!(stats.dispatch_local_dsq_offline, 7);
    assert_eq!(stats.dispatch_keep_last, 100);
    assert_eq!(stats.enq_skip_exiting, 3);
    assert_eq!(stats.enq_skip_migration_disabled, 5);
}
#[test]
fn read_event_stats_zeros() {
    let ev = test_event_offsets();
    let buf = make_event_stats_buffer(&ev, 0, 0, 0, 0, 0);
    let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
    let stats = read_event_stats(&mem, 0, &ev);
    assert_eq!(stats.select_cpu_fallback, 0);
    assert_eq!(stats.dispatch_local_dsq_offline, 0);
}
#[test]
fn read_event_stats_optional_fields() {
    // An optional counter with a known offset is read; absent ones stay 0.
    let mut ev = test_event_offsets();
    ev.ev_bypass_activate = Some(40);
    let mut buf = [0u8; 48];
    let val: i64 = 999;
    buf[40..48].copy_from_slice(&val.to_ne_bytes());
    let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
    let stats = read_event_stats(&mem, 0, &ev);
    assert_eq!(stats.bypass_activate, 999);
    assert_eq!(stats.reenq_immed, 0);
    assert_eq!(stats.bypass_duration, 0);
    assert_eq!(stats.sub_bypass_dispatch, 0);
}
#[test]
fn read_i64_roundtrip() {
    // Negative values must survive the byte-level round trip.
    let val: i64 = -12345;
    let buf = val.to_ne_bytes();
    let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
    assert_eq!(mem.read_i64(0, 0), -12345);
}
#[test]
fn write_u8_and_read_u8() {
    let mut buf = [0u8; 16];
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    mem.write_u8(0, 5, 0xAB);
    assert_eq!(mem.read_u8(0, 5), 0xAB);
    // The write must land in the backing buffer itself.
    assert_eq!(buf[5], 0xAB);
}
#[test]
fn write_u8_out_of_bounds() {
    // Writes past the end of guest memory are silently dropped.
    let mut buf = [0u8; 4];
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    mem.write_u8(4, 0, 0xFF);
    assert_eq!(buf, [0u8; 4]);
}
#[test]
fn write_u64_and_read_u64() {
    let mut buf = [0u8; 32];
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    mem.write_u64(0, 8, 0xDEAD_BEEF_CAFE_1234);
    assert_eq!(mem.read_u64(0, 8), 0xDEAD_BEEF_CAFE_1234);
    assert_eq!(
        u64::from_ne_bytes(buf[8..16].try_into().unwrap()),
        0xDEAD_BEEF_CAFE_1234
    );
}
#[test]
fn write_u64_out_of_bounds() {
    // An 8-byte write at pa=1 would straddle the end of an 8-byte region:
    // the whole write is dropped, never partially applied.
    let mut buf = [0u8; 8];
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    mem.write_u64(1, 0, 0xFF);
    assert_eq!(buf, [0u8; 8]);
}
#[test]
fn write_u64_at_boundary() {
    // A write that ends exactly at the region boundary is in bounds.
    let mut buf = [0u8; 16];
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    mem.write_u64(8, 0, 0x0123_4567_89AB_CDEF);
    assert_eq!(mem.read_u64(8, 0), 0x0123_4567_89AB_CDEF);
}
#[test]
fn read_u8_out_of_bounds() {
    let buf = [0xFFu8; 4];
    let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
    assert_eq!(mem.read_u8(4, 0), 0);
    assert_eq!(mem.read_u8(3, 0), 0xFF);
}
#[test]
fn read_rq_stats_has_no_event_counters() {
    // Event counters are attached by monitor_loop, not by read_rq_stats itself.
    let offsets = test_offsets();
    let buf = make_rq_buffer(&offsets, 1, 1, 1, 100, 0);
    let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
    let snap = read_rq_stats(&mem, 0, &offsets);
    assert!(snap.event_counters.is_none());
}
#[test]
fn monitor_loop_with_event_counters() {
    // rq image followed by an event-stats image; the config supplies the
    // per-CPU event PA directly so no scx_root walk is needed.
    let ev = test_event_offsets();
    let mut offsets = test_offsets();
    offsets.event_offsets = Some(ev.clone());
    let rq_buf = make_rq_buffer(&offsets, 1, 1, 1, 100, 0);
    let ev_buf = make_event_stats_buffer(&ev, 10, 20, 30, 40, 50);
    let rq_pa = 0u64;
    let ev_pa = rq_buf.len() as u64;
    let mut combined = rq_buf;
    combined.extend_from_slice(&ev_buf);
    // SAFETY: `combined` is alive and unmoved for as long as `mem` is used.
    let mem = unsafe { GuestMem::new(combined.as_ptr() as *mut u8, combined.len() as u64) };
    let kill = std::sync::Arc::new(AtomicBool::new(false));
    let kill_evt = test_kill_evt();
    let handle = {
        let kill = std::sync::Arc::clone(&kill);
        std::thread::spawn(move || {
            std::thread::sleep(Duration::from_millis(30));
            kill.store(true, Ordering::Release);
        })
    };
    let ev_pas = vec![ev_pa];
    let cfg = MonitorConfig {
        event_pcpu_pas: Some(&ev_pas),
        ..test_config()
    };
    let MonitorLoopResult { samples, .. } = monitor_loop(
        &mem,
        &[rq_pa],
        &offsets,
        Duration::from_millis(10),
        &kill,
        &kill_evt,
        Instant::now(),
        &cfg,
    );
    handle.join().unwrap();
    assert!(!samples.is_empty());
    let counters = samples[0].cpus[0].event_counters.as_ref().unwrap();
    assert_eq!(counters.select_cpu_fallback, 10);
    assert_eq!(counters.dispatch_local_dsq_offline, 20);
    assert_eq!(counters.dispatch_keep_last, 30);
    assert_eq!(counters.enq_skip_exiting, 40);
    assert_eq!(counters.enq_skip_migration_disabled, 50);
}
#[test]
fn monitor_loop_no_event_counters_when_none() {
    // Without event_offsets configured, snapshots carry no event counters.
    let offsets = test_offsets();
    let buf = make_rq_buffer(&offsets, 1, 1, 1, 100, 0);
    let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
    let kill = std::sync::Arc::new(AtomicBool::new(false));
    let kill_evt = test_kill_evt();
    let handle = {
        let kill = std::sync::Arc::clone(&kill);
        std::thread::spawn(move || {
            std::thread::sleep(Duration::from_millis(30));
            kill.store(true, Ordering::Release);
        })
    };
    let MonitorLoopResult { samples, .. } = monitor_loop(
        &mem,
        &[0],
        &offsets,
        Duration::from_millis(10),
        &kill,
        &kill_evt,
        Instant::now(),
        &test_config(),
    );
    handle.join().unwrap();
    assert!(!samples.is_empty());
    assert!(samples[0].cpus[0].event_counters.is_none());
}
#[test]
fn monitor_loop_rq_refresh_drives_pas() {
    // Layout: a fake __per_cpu_offset[2] table at PA 0 whose entries point
    // straight at the two rq images that follow it. With runqueues_kva,
    // page_offset, per_cpu_start and kaslr_offset all zero, the computed
    // rq PAs equal the stored per-CPU offsets.
    let offsets = test_offsets();
    let pco_size: u64 = 16;
    let rq0_buf = make_rq_buffer(&offsets, 11, 1, 1, 7777, 0);
    let rq1_buf = make_rq_buffer(&offsets, 22, 2, 2, 8888, 0);
    let rq0_pa = pco_size;
    let rq1_pa = pco_size + rq0_buf.len() as u64;
    let mut combined = vec![0u8; pco_size as usize];
    combined[0..8].copy_from_slice(&rq0_pa.to_ne_bytes());
    combined[8..16].copy_from_slice(&rq1_pa.to_ne_bytes());
    combined.extend_from_slice(&rq0_buf);
    combined.extend_from_slice(&rq1_buf);
    let mem = unsafe { GuestMem::new(combined.as_ptr() as *mut u8, combined.len() as u64) };
    let kill = std::sync::Arc::new(AtomicBool::new(false));
    let kill_evt = test_kill_evt();
    let refresh = RqRefresh {
        pco_pa: 0,
        runqueues_kva: 0,
        num_cpus: 2,
        page_offset_base_pa: None,
        event: None,
        per_cpu_start: 0,
        kaslr_offset: 0,
    };
    let handle = {
        let kill = std::sync::Arc::clone(&kill);
        std::thread::spawn(move || {
            std::thread::sleep(Duration::from_millis(30));
            kill.store(true, Ordering::Release);
        })
    };
    let cfg = MonitorConfig {
        rq_refresh: Some(&refresh),
        ..test_config()
    };
    // Note: the static rq_pas slice is empty; all PAs come from the refresh.
    let MonitorLoopResult { samples, .. } = monitor_loop(
        &mem,
        &[],
        &offsets,
        Duration::from_millis(10),
        &kill,
        &kill_evt,
        Instant::now(),
        &cfg,
    );
    handle.join().unwrap();
    assert!(!samples.is_empty());
    assert_eq!(samples[0].cpus.len(), 2);
    assert_eq!(samples[0].cpus[0].rq_clock, 7777);
    assert_eq!(samples[0].cpus[0].nr_running, 11);
    assert_eq!(samples[0].cpus[1].rq_clock, 8888);
    assert_eq!(samples[0].cpus[1].nr_running, 22);
}
#[test]
fn monitor_loop_rq_refresh_zero_offsets_no_panic() {
    // All-zero guest memory means __per_cpu_offset[] reads as zeros; the
    // data_valid gate must refuse to sample any CPUs rather than panic.
    let offsets = test_offsets();
    #[allow(clippy::useless_vec)]
    let mem_buf = vec![0u8; 64];
    let mem = unsafe { GuestMem::new(mem_buf.as_ptr() as *mut u8, mem_buf.len() as u64) };
    let kill = std::sync::Arc::new(AtomicBool::new(false));
    let kill_evt = test_kill_evt();
    let refresh = RqRefresh {
        pco_pa: 0,
        runqueues_kva: 0,
        num_cpus: 2,
        page_offset_base_pa: None,
        event: None,
        per_cpu_start: 0,
        kaslr_offset: 0,
    };
    let handle = {
        let kill = std::sync::Arc::clone(&kill);
        std::thread::spawn(move || {
            std::thread::sleep(Duration::from_millis(30));
            kill.store(true, Ordering::Release);
        })
    };
    let cfg = MonitorConfig {
        rq_refresh: Some(&refresh),
        ..test_config()
    };
    let MonitorLoopResult { samples, .. } = monitor_loop(
        &mem,
        &[],
        &offsets,
        Duration::from_millis(10),
        &kill,
        &kill_evt,
        Instant::now(),
        &cfg,
    );
    handle.join().unwrap();
    assert!(!samples.is_empty());
    for s in &samples {
        assert!(
            s.cpus.is_empty(),
            "data_valid gate should keep cpus empty when __per_cpu_offset[] is all-zero"
        );
    }
}
#[test]
fn resolve_event_pcpu_pas_null_scx_root() {
    // A zero scx_root pointer in guest memory must abort event-PA resolution.
    let ev = test_event_offsets();
    let buf = [0u8; 64];
    let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
    let result = resolve_event_pcpu_pas(&mem, 0, &ev, &[0, 0x4000], 0);
    assert!(result.is_none());
}
#[test]
fn monitor_loop_with_watchdog_override() {
    // Memory layout: rq image, then an 8-byte scx_root pointer slot, then
    // the fake scx_sched struct. The stored kva is page_offset + sch_pa so
    // that kva_to_pa maps it back to sch_pa inside the loop.
    let offsets = test_offsets();
    let rq_buf = make_rq_buffer(&offsets, 1, 1, 1, 100, 0);
    let scx_root_pa = rq_buf.len() as u64;
    let sch_pa = scx_root_pa + 8;
    let watchdog_offset: usize = 16;
    let page_offset = super::super::symbols::DEFAULT_PAGE_OFFSET;
    let scx_sched_kva = page_offset.wrapping_add(sch_pa);
    let mut combined = rq_buf;
    combined.extend_from_slice(&scx_sched_kva.to_ne_bytes());
    combined.extend_from_slice(&[0u8; 64]);
    // SAFETY: `combined` is alive and unmoved for as long as `mem` is used.
    let mem = unsafe { GuestMem::new(combined.as_mut_ptr(), combined.len() as u64) };
    let kill = std::sync::Arc::new(AtomicBool::new(false));
    let kill_evt = test_kill_evt();
    let wd = WatchdogOverride::ScxSched {
        scx_root_pa,
        watchdog_offset,
        jiffies: 99999,
    };
    let handle = {
        let kill = std::sync::Arc::clone(&kill);
        std::thread::spawn(move || {
            std::thread::sleep(Duration::from_millis(30));
            kill.store(true, Ordering::Release);
        })
    };
    let cfg = MonitorConfig {
        watchdog_override: Some(&wd),
        page_offset,
        ..test_config()
    };
    let MonitorLoopResult {
        samples,
        watchdog_observation,
        ..
    } = monitor_loop(
        &mem,
        &[0],
        &offsets,
        Duration::from_millis(10),
        &kill,
        &kill_evt,
        Instant::now(),
        &cfg,
    );
    handle.join().unwrap();
    assert!(!samples.is_empty());
    // The override must have written jiffies into the scx_sched struct,
    // and the read-back observation must match.
    let write_pa = sch_pa as usize + watchdog_offset;
    let written = u64::from_ne_bytes(combined[write_pa..write_pa + 8].try_into().unwrap());
    assert_eq!(written, 99999);
    let obs = watchdog_observation.expect("watchdog_observation should be Some after write");
    assert_eq!(obs.expected_jiffies, 99999);
    assert_eq!(obs.observed_jiffies, 99999);
}
#[test]
fn monitor_loop_watchdog_override_skipped_when_scx_root_null() {
    // The scx_root pointer slot is left zeroed: the loop must neither write
    // the watchdog value nor record an observation.
    let offsets = test_offsets();
    let rq_buf = make_rq_buffer(&offsets, 1, 1, 1, 100, 0);
    let scx_root_pa = rq_buf.len() as u64;
    let mut combined = rq_buf;
    combined.extend_from_slice(&[0u8; 8]);
    combined.extend_from_slice(&[0u8; 128]);
    let mem = unsafe { GuestMem::new(combined.as_mut_ptr(), combined.len() as u64) };
    let kill = std::sync::Arc::new(AtomicBool::new(false));
    let kill_evt = test_kill_evt();
    let wd = WatchdogOverride::ScxSched {
        scx_root_pa,
        watchdog_offset: 16,
        jiffies: 0xDEADBEEF,
    };
    let handle = {
        let kill = std::sync::Arc::clone(&kill);
        std::thread::spawn(move || {
            std::thread::sleep(Duration::from_millis(30));
            kill.store(true, Ordering::Release);
        })
    };
    let cfg = MonitorConfig {
        watchdog_override: Some(&wd),
        ..test_config()
    };
    let MonitorLoopResult {
        watchdog_observation,
        ..
    } = monitor_loop(
        &mem,
        &[0],
        &offsets,
        Duration::from_millis(10),
        &kill,
        &kill_evt,
        Instant::now(),
        &cfg,
    );
    handle.join().unwrap();
    assert!(
        combined[scx_root_pa as usize..].iter().all(|&b| b == 0),
        "no write should occur when scx_root is null"
    );
    assert!(
        watchdog_observation.is_none(),
        "watchdog_observation should be None when scx_root is null"
    );
}
#[test]
fn monitor_loop_watchdog_static_global_writes_directly() {
    // StaticGlobal variant: no pointer chase, jiffies goes straight to the PA.
    let offsets = test_offsets();
    let rq_buf = make_rq_buffer(&offsets, 1, 1, 1, 100, 0);
    let watchdog_pa = rq_buf.len() as u64;
    let mut combined = rq_buf;
    combined.extend_from_slice(&[0u8; 8]);
    let mem = unsafe { GuestMem::new(combined.as_mut_ptr(), combined.len() as u64) };
    let kill = std::sync::Arc::new(AtomicBool::new(false));
    let kill_evt = test_kill_evt();
    let wd = WatchdogOverride::StaticGlobal {
        watchdog_timeout_pa: watchdog_pa,
        jiffies: 77777,
    };
    let handle = {
        let kill = std::sync::Arc::clone(&kill);
        std::thread::spawn(move || {
            std::thread::sleep(Duration::from_millis(30));
            kill.store(true, Ordering::Release);
        })
    };
    let cfg = MonitorConfig {
        watchdog_override: Some(&wd),
        ..test_config()
    };
    let MonitorLoopResult {
        samples,
        watchdog_observation,
        ..
    } = monitor_loop(
        &mem,
        &[0],
        &offsets,
        Duration::from_millis(10),
        &kill,
        &kill_evt,
        Instant::now(),
        &cfg,
    );
    handle.join().unwrap();
    assert!(!samples.is_empty());
    let written = u64::from_ne_bytes(
        combined[watchdog_pa as usize..watchdog_pa as usize + 8]
            .try_into()
            .unwrap(),
    );
    assert_eq!(written, 77777);
    let obs = watchdog_observation.expect("watchdog_observation should be Some");
    assert_eq!(obs.expected_jiffies, 77777);
    assert_eq!(obs.observed_jiffies, 77777);
}
#[test]
fn monitor_loop_dump_trigger_fires_on_imbalance() {
    // CPU1 runs 20 tasks vs CPU0's 1: the imbalance ratio exceeds 2.0 on
    // every sample, so after sustained_samples=2 the dump signal must be
    // queued on the virtio console.
    let offsets = test_offsets();
    let buf0 = make_rq_buffer(&offsets, 1, 1, 1, 100, 0);
    let buf1 = make_rq_buffer(&offsets, 20, 20, 1, 200, 0);
    let pa1 = buf0.len() as u64;
    let mut combined = buf0;
    combined.extend_from_slice(&buf1);
    combined.extend(vec![0u8; 64]);
    // SAFETY: `combined` is alive and unmoved for as long as `mem` is used.
    let mem = unsafe { GuestMem::new(combined.as_mut_ptr(), combined.len() as u64) };
    let kill = std::sync::Arc::new(AtomicBool::new(false));
    let kill_evt = test_kill_evt();
    let virtio_con = test_virtio_console();
    let trigger = DumpTrigger {
        thresholds: super::super::MonitorThresholds {
            max_imbalance_ratio: 2.0,
            sustained_samples: 2,
            fail_on_stall: false,
            ..Default::default()
        },
        virtio_con: Some(virtio_con.clone()),
    };
    let handle = {
        let kill = std::sync::Arc::clone(&kill);
        std::thread::spawn(move || {
            std::thread::sleep(Duration::from_millis(200));
            kill.store(true, Ordering::Release);
        })
    };
    let cfg = MonitorConfig {
        dump_trigger: Some(&trigger),
        ..test_config()
    };
    let MonitorLoopResult { samples, .. } = monitor_loop(
        &mem,
        &[0, pa1],
        &offsets,
        Duration::from_millis(10),
        &kill,
        &kill_evt,
        Instant::now(),
        &cfg,
    );
    handle.join().unwrap();
    assert!(!samples.is_empty());
    let pending = virtio_con.lock().pending_rx_bytes();
    assert!(
        pending.contains(&crate::vmm::virtio_console::SIGNAL_VC_DUMP),
        "imbalance threshold violation should queue SIGNAL_VC_DUMP; \
pending RX bytes: {pending:?}"
    );
}
#[test]
fn monitor_loop_dump_trigger_stall_with_sustained_window() {
    // A busy CPU (nr_running > 0) whose rq_clock never advances between
    // samples is a stall; with fail_on_stall and sustained_samples=2 the
    // dump signal must fire. Imbalance/DSQ thresholds are set too high to
    // fire, isolating the stall path.
    let offsets = test_offsets();
    let buf = make_rq_buffer(&offsets, 2, 1, 1, 5000, 0);
    let mut combined = buf;
    combined.extend(vec![0u8; 64]);
    let mem = unsafe { GuestMem::new(combined.as_mut_ptr(), combined.len() as u64) };
    let kill = std::sync::Arc::new(AtomicBool::new(false));
    let kill_evt = test_kill_evt();
    let virtio_con = test_virtio_console();
    let trigger = DumpTrigger {
        thresholds: super::super::MonitorThresholds {
            max_imbalance_ratio: 100.0,
            max_local_dsq_depth: 10000,
            fail_on_stall: true,
            sustained_samples: 2,
            ..Default::default()
        },
        virtio_con: Some(virtio_con.clone()),
    };
    let handle = {
        let kill = std::sync::Arc::clone(&kill);
        std::thread::spawn(move || {
            std::thread::sleep(Duration::from_millis(200));
            kill.store(true, Ordering::Release);
        })
    };
    let cfg = MonitorConfig {
        dump_trigger: Some(&trigger),
        ..test_config()
    };
    let MonitorLoopResult { samples, .. } = monitor_loop(
        &mem,
        &[0],
        &offsets,
        Duration::from_millis(10),
        &kill,
        &kill_evt,
        Instant::now(),
        &cfg,
    );
    handle.join().unwrap();
    assert!(
        samples.len() >= 3,
        "need >= 3 samples for 2 stall pairs, got {}",
        samples.len()
    );
    let pending = virtio_con.lock().pending_rx_bytes();
    assert!(
        pending.contains(&crate::vmm::virtio_console::SIGNAL_VC_DUMP),
        "stall should trigger SIGNAL_VC_DUMP after sustained_samples=2; \
pending RX bytes: {pending:?}"
    );
}
#[test]
fn monitor_loop_dump_trigger_idle_cpu_no_stall() {
    // Same frozen rq_clock, but nr_running == 0: an idle CPU must never be
    // classified as stalled, so no dump signal is queued.
    let offsets = test_offsets();
    let buf = make_rq_buffer(&offsets, 0, 0, 0, 5000, 0);
    let mut combined = buf;
    combined.extend(vec![0u8; 64]);
    let mem = unsafe { GuestMem::new(combined.as_mut_ptr(), combined.len() as u64) };
    let kill = std::sync::Arc::new(AtomicBool::new(false));
    let kill_evt = test_kill_evt();
    let virtio_con = test_virtio_console();
    let trigger = DumpTrigger {
        thresholds: super::super::MonitorThresholds {
            max_imbalance_ratio: 100.0,
            max_local_dsq_depth: 10000,
            fail_on_stall: true,
            sustained_samples: 1,
            ..Default::default()
        },
        virtio_con: Some(virtio_con.clone()),
    };
    let handle = {
        let kill = std::sync::Arc::clone(&kill);
        std::thread::spawn(move || {
            std::thread::sleep(Duration::from_millis(200));
            kill.store(true, Ordering::Release);
        })
    };
    let cfg = MonitorConfig {
        dump_trigger: Some(&trigger),
        ..test_config()
    };
    let MonitorLoopResult { samples, .. } = monitor_loop(
        &mem,
        &[0],
        &offsets,
        Duration::from_millis(10),
        &kill,
        &kill_evt,
        Instant::now(),
        &cfg,
    );
    handle.join().unwrap();
    assert!(
        samples.len() >= 2,
        "need >= 2 samples, got {}",
        samples.len()
    );
    let pending = virtio_con.lock().pending_rx_bytes();
    assert!(
        !pending.contains(&crate::vmm::virtio_console::SIGNAL_VC_DUMP),
        "idle CPU should not queue SIGNAL_VC_DUMP; pending RX bytes: {pending:?}"
    );
}
#[test]
fn monitor_loop_vcpu_timing_preempted_no_stall() {
    let offsets = test_offsets();
    // Stall-shaped rq data (runnable tasks, frozen clock). The vCPU-timing
    // signal below should veto the stall verdict.
    let buf = make_rq_buffer(&offsets, 2, 1, 1, 5000, 0);
    let mut combined = buf;
    combined.extend(vec![0u8; 64]);
    // `combined` must outlive `mem`: GuestMem holds a raw pointer into it.
    let mem = unsafe { GuestMem::new(combined.as_mut_ptr(), combined.len() as u64) };
    let kill = std::sync::Arc::new(AtomicBool::new(false));
    let kill_evt = test_kill_evt();
    // A thread that sleeps accrues almost no CPU time, so its pthread looks
    // "preempted" (off-CPU) to the monitor's per-thread timing check.
    let sleeper_kill = std::sync::Arc::new(AtomicBool::new(false));
    let sleeper_kill_clone = sleeper_kill.clone();
    let sleeper = std::thread::Builder::new()
        .name("vcpu-sleeper".into())
        .spawn(move || {
            while !sleeper_kill_clone.load(Ordering::Acquire) {
                std::thread::sleep(Duration::from_millis(100));
            }
        })
        .unwrap();
    let pt = sleeper.as_pthread_t() as libc::pthread_t;
    let vcpu_timing = VcpuTiming { pthreads: vec![pt] };
    let virtio_con = test_virtio_console();
    let trigger = DumpTrigger {
        thresholds: super::super::MonitorThresholds {
            max_imbalance_ratio: 100.0,
            max_local_dsq_depth: 10000,
            fail_on_stall: true,
            sustained_samples: 1,
            ..Default::default()
        },
        virtio_con: Some(virtio_con.clone()),
    };
    // Stop the monitor after ~150 ms (a handful of 30 ms sampling periods).
    let handle = {
        let kill = std::sync::Arc::clone(&kill);
        std::thread::spawn(move || {
            std::thread::sleep(Duration::from_millis(150));
            kill.store(true, Ordering::Release);
        })
    };
    let cfg = MonitorConfig {
        dump_trigger: Some(&trigger),
        vcpu_timing: Some(&vcpu_timing),
        // 10 ms of CPU time per sample period; the sleeper stays far below it.
        preemption_threshold_ns: 10_000_000,
        ..test_config()
    };
    let MonitorLoopResult { samples, .. } = monitor_loop(
        &mem,
        &[0],
        &offsets,
        Duration::from_millis(30),
        &kill,
        &kill_evt,
        Instant::now(),
        &cfg,
    );
    handle.join().unwrap();
    // Tear down the sleeper only after the monitor has finished sampling it.
    sleeper_kill.store(true, Ordering::Release);
    let _ = sleeper.join();
    assert!(
        samples.len() >= 2,
        "need >= 2 samples, got {}",
        samples.len()
    );
    let pending = virtio_con.lock().pending_rx_bytes();
    assert!(
        !pending.contains(&crate::vmm::virtio_console::SIGNAL_VC_DUMP),
        "preempted vCPU should not queue SIGNAL_VC_DUMP; pending RX bytes: {pending:?}"
    );
}
#[test]
fn monitor_loop_vcpu_timing_running_stall_fires() {
    let offsets = test_offsets();
    // Stall-shaped rq data (runnable tasks, frozen clock) — this time the
    // vCPU really is on-CPU, so the stall must be reported.
    let buf = make_rq_buffer(&offsets, 2, 1, 1, 5000, 0);
    let mut combined = buf;
    combined.extend(vec![0u8; 64]);
    // `combined` must outlive `mem`: GuestMem holds a raw pointer into it.
    let mem = unsafe { GuestMem::new(combined.as_mut_ptr(), combined.len() as u64) };
    let kill = std::sync::Arc::new(AtomicBool::new(false));
    let kill_evt = test_kill_evt();
    // A busy-spinning thread burns CPU time, so its pthread looks "running"
    // to the monitor's per-thread timing check.
    let spinner_kill = std::sync::Arc::new(AtomicBool::new(false));
    let spinner_kill_clone = spinner_kill.clone();
    let spinner = std::thread::Builder::new()
        .name("vcpu-spinner".into())
        .spawn(move || {
            while !spinner_kill_clone.load(Ordering::Relaxed) {
                std::hint::spin_loop();
            }
        })
        .unwrap();
    let pt = spinner.as_pthread_t() as libc::pthread_t;
    let vcpu_timing = VcpuTiming { pthreads: vec![pt] };
    let virtio_con = test_virtio_console();
    let trigger = DumpTrigger {
        thresholds: super::super::MonitorThresholds {
            max_imbalance_ratio: 100.0,
            max_local_dsq_depth: 10000,
            fail_on_stall: true,
            sustained_samples: 2,
            ..Default::default()
        },
        virtio_con: Some(virtio_con.clone()),
    };
    // Stop the monitor after ~200 ms of 30 ms sampling.
    let handle = {
        let kill = std::sync::Arc::clone(&kill);
        std::thread::spawn(move || {
            std::thread::sleep(Duration::from_millis(200));
            kill.store(true, Ordering::Release);
        })
    };
    let cfg = MonitorConfig {
        dump_trigger: Some(&trigger),
        vcpu_timing: Some(&vcpu_timing),
        preemption_threshold_ns: 10_000_000,
        ..test_config()
    };
    let MonitorLoopResult { samples, .. } = monitor_loop(
        &mem,
        &[0],
        &offsets,
        Duration::from_millis(30),
        &kill,
        &kill_evt,
        Instant::now(),
        &cfg,
    );
    handle.join().unwrap();
    // Stop the spinner promptly so the test doesn't hold a core afterwards.
    spinner_kill.store(true, Ordering::Release);
    let _ = spinner.join();
    // sustained_samples=2 requires two stall pairs, i.e. at least 3 samples.
    assert!(
        samples.len() >= 3,
        "need >= 3 samples for 2 stall pairs, got {}",
        samples.len()
    );
    let pending = virtio_con.lock().pending_rx_bytes();
    assert!(
        pending.contains(&crate::vmm::virtio_console::SIGNAL_VC_DUMP),
        "real stall (vCPU running, clock stuck, nr_running>0) should queue \
        SIGNAL_VC_DUMP; pending RX bytes: {pending:?}"
    );
}
#[test]
fn reactive_and_evaluate_stall_consistency() {
    // Both stall detectors — the reactive in-loop trigger and the offline
    // MonitorThresholds::evaluate over the collected report — must agree
    // on the same stalled input.
    let offsets = test_offsets();
    // Two rqs, both with runnable tasks and clocks frozen at distinct values.
    let buf0 = make_rq_buffer(&offsets, 2, 1, 1, 5000, 0);
    let buf1 = make_rq_buffer(&offsets, 1, 1, 1, 9000, 0);
    // Second rq sits immediately after the first in guest memory.
    let pa1 = buf0.len() as u64;
    let mut combined = buf0;
    combined.extend_from_slice(&buf1);
    combined.extend(vec![0u8; 64]);
    // `combined` must outlive `mem`: GuestMem holds a raw pointer into it.
    let mem = unsafe { GuestMem::new(combined.as_mut_ptr(), combined.len() as u64) };
    let kill = std::sync::Arc::new(AtomicBool::new(false));
    let kill_evt = test_kill_evt();
    // The same thresholds value feeds both the reactive trigger and the
    // offline evaluate below, keeping the comparison honest.
    let thresholds = super::super::MonitorThresholds {
        max_imbalance_ratio: 100.0,
        max_local_dsq_depth: 10000,
        fail_on_stall: true,
        sustained_samples: 2,
        ..Default::default()
    };
    let virtio_con = test_virtio_console();
    let trigger = DumpTrigger {
        thresholds,
        virtio_con: Some(virtio_con.clone()),
    };
    // Stop the monitor after ~200 ms of 10 ms sampling.
    let handle = {
        let kill = std::sync::Arc::clone(&kill);
        std::thread::spawn(move || {
            std::thread::sleep(Duration::from_millis(200));
            kill.store(true, Ordering::Release);
        })
    };
    let cfg = MonitorConfig {
        dump_trigger: Some(&trigger),
        ..test_config()
    };
    let MonitorLoopResult { samples, .. } = monitor_loop(
        &mem,
        &[0, pa1],
        &offsets,
        Duration::from_millis(10),
        &kill,
        &kill_evt,
        Instant::now(),
        &cfg,
    );
    handle.join().unwrap();
    assert!(
        samples.len() >= 3,
        "need >= 3 samples, got {}",
        samples.len()
    );
    // Reactive verdict: did the in-loop trigger queue a dump signal?
    let pending = virtio_con.lock().pending_rx_bytes();
    let reactive_stall = pending.contains(&crate::vmm::virtio_console::SIGNAL_VC_DUMP);
    // Offline verdict: run evaluate over the same samples.
    let summary = super::super::MonitorSummary::from_samples(&samples);
    let report = super::super::MonitorReport {
        samples,
        summary,
        ..Default::default()
    };
    let verdict = thresholds.evaluate(&report);
    assert!(reactive_stall, "reactive path should detect stall");
    assert!(
        !verdict.passed,
        "evaluate should detect stall: {:?}",
        verdict.details
    );
    assert!(
        verdict.details.iter().any(|d| d.contains("rq_clock stall")),
        "evaluate details should mention stall: {:?}",
        verdict.details
    );
}
#[test]
fn reactive_and_evaluate_idle_consistency() {
    // Mirror of reactive_and_evaluate_stall_consistency for the negative
    // case: an idle CPU (nothing runnable, frozen clock) must pass all
    // three checks — reactive trigger, from_samples summary, and evaluate.
    let offsets = test_offsets();
    let buf = make_rq_buffer(&offsets, 0, 0, 0, 5000, 0);
    let mut combined = buf;
    combined.extend(vec![0u8; 64]);
    // `combined` must outlive `mem`: GuestMem holds a raw pointer into it.
    let mem = unsafe { GuestMem::new(combined.as_mut_ptr(), combined.len() as u64) };
    let kill = std::sync::Arc::new(AtomicBool::new(false));
    let kill_evt = test_kill_evt();
    // sustained_samples=1 makes the stall check maximally eager.
    let thresholds = super::super::MonitorThresholds {
        max_imbalance_ratio: 100.0,
        max_local_dsq_depth: 10000,
        fail_on_stall: true,
        sustained_samples: 1,
        ..Default::default()
    };
    let virtio_con = test_virtio_console();
    let trigger = DumpTrigger {
        thresholds,
        virtio_con: Some(virtio_con.clone()),
    };
    // Stop the monitor after ~200 ms of 10 ms sampling.
    let handle = {
        let kill = std::sync::Arc::clone(&kill);
        std::thread::spawn(move || {
            std::thread::sleep(Duration::from_millis(200));
            kill.store(true, Ordering::Release);
        })
    };
    let cfg = MonitorConfig {
        dump_trigger: Some(&trigger),
        ..test_config()
    };
    let MonitorLoopResult { samples, .. } = monitor_loop(
        &mem,
        &[0],
        &offsets,
        Duration::from_millis(10),
        &kill,
        &kill_evt,
        Instant::now(),
        &cfg,
    );
    handle.join().unwrap();
    assert!(
        samples.len() >= 2,
        "need >= 2 samples, got {}",
        samples.len()
    );
    // 1) Reactive path must stay quiet.
    let pending = virtio_con.lock().pending_rx_bytes();
    assert!(
        !pending.contains(&crate::vmm::virtio_console::SIGNAL_VC_DUMP),
        "reactive: idle CPU should not queue SIGNAL_VC_DUMP; pending RX: {pending:?}"
    );
    // 2) Summary derived from the same samples must not flag a stall.
    let summary = super::super::MonitorSummary::from_samples(&samples);
    assert!(
        !summary.stall_detected,
        "from_samples: idle CPU should not flag stall"
    );
    // 3) Offline evaluate must pass.
    let report = super::super::MonitorReport {
        samples,
        summary,
        ..Default::default()
    };
    let verdict = thresholds.evaluate(&report);
    assert!(
        verdict.passed,
        "evaluate: idle CPU should pass: {:?}",
        verdict.details
    );
}
/// Fixed, synthetic schedstat field offsets for these tests: the embedded
/// sched_info struct starts at byte 200 of the rq buffer and the per-rq
/// u32 counters start at byte 300, spaced 4 bytes apart.
fn test_schedstat_offsets() -> super::super::btf_offsets::SchedstatOffsets {
    super::super::btf_offsets::SchedstatOffsets {
        rq_sched_info: 200,
        // Offsets within sched_info itself (added to rq_sched_info).
        sched_info_run_delay: 8,
        sched_info_pcount: 0,
        rq_yld_count: 300,
        rq_sched_count: 304,
        rq_sched_goidle: 308,
        rq_ttwu_count: 312,
        rq_ttwu_local: 316,
    }
}
/// Build a synthetic rq byte buffer with the given schedstat counters
/// written (native-endian) at the offsets described by `ss`.
///
/// The buffer is sized from the maximum field extent plus 8 bytes of slack,
/// rather than assuming `rq_ttwu_local` carries the highest offset, so the
/// helper stays panic-free for any offset layout a test supplies. For the
/// layout from `test_schedstat_offsets` the resulting size is unchanged.
#[allow(clippy::too_many_arguments)]
fn make_schedstat_buffer(
    ss: &super::super::btf_offsets::SchedstatOffsets,
    run_delay: u64,
    pcount: u64,
    yld_count: u32,
    sched_count: u32,
    sched_goidle: u32,
    ttwu_count: u32,
    ttwu_local: u32,
) -> Vec<u8> {
    let si_base = ss.rq_sched_info;
    // Largest byte touched by any write below, so out-of-order offset
    // layouts cannot cause a slice-index panic.
    let size = *[
        si_base + ss.sched_info_pcount + 8,
        si_base + ss.sched_info_run_delay + 8,
        ss.rq_yld_count + 4,
        ss.rq_sched_count + 4,
        ss.rq_sched_goidle + 4,
        ss.rq_ttwu_count + 4,
        ss.rq_ttwu_local + 4,
    ]
    .iter()
    .max()
    .unwrap()
        + 8;
    let mut buf = vec![0u8; size];
    buf[si_base + ss.sched_info_pcount..si_base + ss.sched_info_pcount + 8]
        .copy_from_slice(&pcount.to_ne_bytes());
    buf[si_base + ss.sched_info_run_delay..si_base + ss.sched_info_run_delay + 8]
        .copy_from_slice(&run_delay.to_ne_bytes());
    buf[ss.rq_yld_count..ss.rq_yld_count + 4].copy_from_slice(&yld_count.to_ne_bytes());
    buf[ss.rq_sched_count..ss.rq_sched_count + 4].copy_from_slice(&sched_count.to_ne_bytes());
    buf[ss.rq_sched_goidle..ss.rq_sched_goidle + 4]
        .copy_from_slice(&sched_goidle.to_ne_bytes());
    buf[ss.rq_ttwu_count..ss.rq_ttwu_count + 4].copy_from_slice(&ttwu_count.to_ne_bytes());
    buf[ss.rq_ttwu_local..ss.rq_ttwu_local + 4].copy_from_slice(&ttwu_local.to_ne_bytes());
    buf
}
/// Round-trip: values written by make_schedstat_buffer must come back
/// unchanged through read_rq_schedstat.
#[test]
fn read_rq_schedstat_known_values() {
    let ss = test_schedstat_offsets();
    // `mut` so we can hand GuestMem a genuinely mutable pointer below
    // (casting a shared-ref-derived *const to *mut is unsound to write
    // through and inconsistent with the sibling tests).
    let mut buf = make_schedstat_buffer(&ss, 50000, 10, 3, 100, 20, 80, 40);
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    let stats = read_rq_schedstat(&mem, 0, &ss);
    assert_eq!(stats.run_delay, 50000);
    assert_eq!(stats.pcount, 10);
    assert_eq!(stats.yld_count, 3);
    assert_eq!(stats.sched_count, 100);
    assert_eq!(stats.sched_goidle, 20);
    assert_eq!(stats.ttwu_count, 80);
    assert_eq!(stats.ttwu_local, 40);
}
/// An all-zero buffer must decode to all-zero schedstat counters.
#[test]
fn read_rq_schedstat_zeros() {
    let ss = test_schedstat_offsets();
    // `mut` + as_mut_ptr() instead of casting a *const to *mut.
    let mut buf = make_schedstat_buffer(&ss, 0, 0, 0, 0, 0, 0, 0);
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    let stats = read_rq_schedstat(&mem, 0, &ss);
    assert_eq!(stats.run_delay, 0);
    assert_eq!(stats.pcount, 0);
    assert_eq!(stats.yld_count, 0);
    assert_eq!(stats.sched_count, 0);
    assert_eq!(stats.sched_goidle, 0);
    assert_eq!(stats.ttwu_count, 0);
    assert_eq!(stats.ttwu_local, 0);
}
/// When schedstat offsets are present, monitor_loop samples must carry a
/// schedstat snapshot whose fields match what was planted in the rq buffer.
#[test]
fn monitor_loop_with_schedstat_overlay() {
    let ss = test_schedstat_offsets();
    let mut offsets = test_offsets();
    offsets.schedstat_offsets = Some(ss.clone());
    let rq_size = ss.rq_ttwu_local + 4 + 8;
    let mut buf = vec![0u8; rq_size];
    // Plant rq scalars plus three recognizable schedstat values.
    buf[offsets.rq_nr_running..offsets.rq_nr_running + 4].copy_from_slice(&2u32.to_ne_bytes());
    buf[offsets.rq_clock..offsets.rq_clock + 8].copy_from_slice(&500u64.to_ne_bytes());
    let si_base = ss.rq_sched_info;
    buf[si_base + ss.sched_info_run_delay..si_base + ss.sched_info_run_delay + 8]
        .copy_from_slice(&12345u64.to_ne_bytes());
    buf[si_base + ss.sched_info_pcount..si_base + ss.sched_info_pcount + 8]
        .copy_from_slice(&7u64.to_ne_bytes());
    buf[ss.rq_sched_count..ss.rq_sched_count + 4].copy_from_slice(&42u32.to_ne_bytes());
    // as_mut_ptr() instead of casting a shared-ref-derived *const to *mut.
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    let kill = std::sync::Arc::new(AtomicBool::new(false));
    let kill_evt = test_kill_evt();
    // Stop the monitor after ~30 ms of 10 ms sampling.
    let handle = {
        let kill = std::sync::Arc::clone(&kill);
        std::thread::spawn(move || {
            std::thread::sleep(Duration::from_millis(30));
            kill.store(true, Ordering::Release);
        })
    };
    let MonitorLoopResult { samples, .. } = monitor_loop(
        &mem,
        &[0],
        &offsets,
        Duration::from_millis(10),
        &kill,
        &kill_evt,
        Instant::now(),
        &test_config(),
    );
    handle.join().unwrap();
    assert!(!samples.is_empty());
    let ss_snap = samples[0].cpus[0].schedstat.as_ref().unwrap();
    assert_eq!(ss_snap.run_delay, 12345);
    assert_eq!(ss_snap.pcount, 7);
    assert_eq!(ss_snap.sched_count, 42);
}
/// Without schedstat offsets, monitor samples must not carry a schedstat
/// snapshot at all.
#[test]
fn monitor_loop_no_schedstat_when_none() {
    let offsets = test_offsets();
    assert!(offsets.schedstat_offsets.is_none());
    // `mut` + as_mut_ptr() instead of casting a *const to *mut.
    let mut buf = make_rq_buffer(&offsets, 1, 1, 1, 100, 0);
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    let kill = std::sync::Arc::new(AtomicBool::new(false));
    let kill_evt = test_kill_evt();
    // Stop the monitor after ~30 ms of 10 ms sampling.
    let handle = {
        let kill = std::sync::Arc::clone(&kill);
        std::thread::spawn(move || {
            std::thread::sleep(Duration::from_millis(30));
            kill.store(true, Ordering::Release);
        })
    };
    let MonitorLoopResult { samples, .. } = monitor_loop(
        &mem,
        &[0],
        &offsets,
        Duration::from_millis(10),
        &kill,
        &kill_evt,
        Instant::now(),
        &test_config(),
    );
    handle.join().unwrap();
    assert!(!samples.is_empty());
    assert!(samples[0].cpus[0].schedstat.is_none());
}
/// Fixed, synthetic sched_domain field offsets for these tests: the rq's
/// `sd` pointer lives at byte 400 of the rq buffer; all other offsets are
/// relative to the start of a sched_domain struct.
fn test_sched_domain_offsets() -> SchedDomainOffsets {
    SchedDomainOffsets {
        rq_sd: 400,
        sd_parent: 0,
        sd_level: 8,
        sd_flags: 12,
        sd_name: 16,
        sd_span_weight: 24,
        sd_balance_interval: 28,
        sd_nr_balance_failed: 32,
        // newidle fields are optional in the real kernel layout; present here.
        sd_newidle_call: Some(36),
        sd_newidle_success: Some(40),
        sd_newidle_ratio: Some(44),
        sd_max_newidle_lb_cost: 48,
        stats_offsets: Some(test_sd_stats_offsets()),
    }
}
/// Synthetic sched_domain stats offsets. The lb_* entries are spaced 12
/// bytes apart here (they are arrays of CPU_MAX_IDLE_TYPES u32s in the
/// kernel — presumably 3 entries; TODO confirm against CPU_MAX_IDLE_TYPES);
/// the remaining scalar counters are 4 bytes apart.
fn test_sd_stats_offsets() -> SchedDomainStatsOffsets {
    SchedDomainStatsOffsets {
        sd_lb_count: 56,
        sd_lb_failed: 68,
        sd_lb_balanced: 80,
        sd_lb_imbalance_load: 92,
        sd_lb_imbalance_util: 104,
        sd_lb_imbalance_task: 116,
        sd_lb_imbalance_misfit: 128,
        sd_lb_gained: 140,
        sd_lb_hot_gained: 152,
        sd_lb_nobusyg: 164,
        sd_lb_nobusyq: 176,
        sd_alb_count: 188,
        sd_alb_failed: 192,
        sd_alb_pushed: 196,
        sd_sbe_count: 200,
        sd_sbe_balanced: 204,
        sd_sbe_pushed: 208,
        sd_sbf_count: 212,
        sd_sbf_balanced: 216,
        sd_sbf_pushed: 220,
        sd_ttwu_wake_remote: 224,
        sd_ttwu_move_affine: 228,
        sd_ttwu_move_balance: 232,
    }
}
/// Build a synthetic sched_domain byte image with the given fields written
/// (native-endian) at the offsets described by `sd`.
///
/// Sized from the maximum field extent plus 8 bytes of slack rather than
/// assuming `sd_ttwu_move_balance` is the highest offset, so the helper
/// cannot panic for an alternative offset layout. For the layout from
/// `test_sched_domain_offsets` the resulting size is unchanged.
///
/// Panics if `sd.stats_offsets` is `None` — all callers supply it.
#[allow(clippy::too_many_arguments)]
fn make_sd_buffer(
    sd: &SchedDomainOffsets,
    parent_kva: u64,
    level: i32,
    flags: i32,
    name_kva: u64,
    span_weight: u32,
    balance_interval: u32,
    newidle_call: u32,
    lb_count_0: u32,
    alb_pushed: u32,
    ttwu_wake_remote: u32,
) -> Vec<u8> {
    let so = sd.stats_offsets.as_ref().unwrap();
    // Largest byte touched by any write below (sd_ttwu_move_balance kept in
    // the list to preserve the original sizing for the default layout).
    let size = *[
        sd.sd_parent + 8,
        sd.sd_level + 4,
        sd.sd_flags + 4,
        sd.sd_name + 8,
        sd.sd_span_weight + 4,
        sd.sd_balance_interval + 4,
        sd.sd_newidle_call.map_or(0, |off| off + 4),
        so.sd_lb_count + 4,
        so.sd_alb_pushed + 4,
        so.sd_ttwu_wake_remote + 4,
        so.sd_ttwu_move_balance + 4,
    ]
    .iter()
    .max()
    .unwrap()
        + 8;
    let mut buf = vec![0u8; size];
    buf[sd.sd_parent..sd.sd_parent + 8].copy_from_slice(&parent_kva.to_ne_bytes());
    buf[sd.sd_level..sd.sd_level + 4].copy_from_slice(&level.to_ne_bytes());
    buf[sd.sd_flags..sd.sd_flags + 4].copy_from_slice(&flags.to_ne_bytes());
    buf[sd.sd_name..sd.sd_name + 8].copy_from_slice(&name_kva.to_ne_bytes());
    buf[sd.sd_span_weight..sd.sd_span_weight + 4].copy_from_slice(&span_weight.to_ne_bytes());
    buf[sd.sd_balance_interval..sd.sd_balance_interval + 4]
        .copy_from_slice(&balance_interval.to_ne_bytes());
    if let Some(off) = sd.sd_newidle_call {
        buf[off..off + 4].copy_from_slice(&newidle_call.to_ne_bytes());
    }
    // Only the first lb_count slot is exercised by the tests.
    buf[so.sd_lb_count..so.sd_lb_count + 4].copy_from_slice(&lb_count_0.to_ne_bytes());
    buf[so.sd_alb_pushed..so.sd_alb_pushed + 4].copy_from_slice(&alb_pushed.to_ne_bytes());
    buf[so.sd_ttwu_wake_remote..so.sd_ttwu_wake_remote + 4]
        .copy_from_slice(&ttwu_wake_remote.to_ne_bytes());
    buf
}
/// A NULL rq->sd pointer (zeroed buffer) must yield no domain tree.
#[test]
fn read_sched_domain_tree_null_sd() {
    let sd_off = test_sched_domain_offsets();
    // `mut` + as_mut_ptr() instead of casting a *const to *mut.
    let mut buf = vec![0u8; 512];
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    let result = read_sched_domain_tree(&mem, 0, &sd_off, 0, 0, 0);
    assert!(result.is_none());
}
/// One domain with a NULL parent: all scalar fields, the NUL-terminated
/// name, and the stats block must decode exactly.
#[test]
fn read_sched_domain_tree_single_domain() {
    let sd_off = test_sched_domain_offsets();
    let sd_pa: u64 = 1024;
    let name_pa: u64 = 2048;
    // parent_kva = 0 terminates the tree after this single domain.
    let sd_buf = make_sd_buffer(&sd_off, 0, 0, 0x42, name_pa, 4, 64, 15, 10, 3, 7);
    let name_bytes = b"SMT\0";
    let total_size = (name_pa as usize) + 16;
    let mut buf = vec![0u8; total_size];
    buf[sd_off.rq_sd..sd_off.rq_sd + 8].copy_from_slice(&sd_pa.to_ne_bytes());
    buf[sd_pa as usize..sd_pa as usize + sd_buf.len()].copy_from_slice(&sd_buf);
    buf[name_pa as usize..name_pa as usize + name_bytes.len()].copy_from_slice(name_bytes);
    // as_mut_ptr() instead of casting a shared-ref-derived *const to *mut.
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    let domains = read_sched_domain_tree(&mem, 0, &sd_off, 0, 0, 0).unwrap();
    assert_eq!(domains.len(), 1);
    assert_eq!(domains[0].level, 0);
    assert_eq!(domains[0].name, "SMT");
    assert_eq!(domains[0].flags, 0x42);
    assert_eq!(domains[0].span_weight, 4);
    assert_eq!(domains[0].balance_interval, 64);
    assert_eq!(domains[0].newidle_call, Some(15));
    let stats = domains[0].stats.as_ref().unwrap();
    assert_eq!(stats.lb_count[0], 10);
    assert_eq!(stats.alb_pushed, 3);
    assert_eq!(stats.ttwu_wake_remote, 7);
}
/// Two chained domains (SMT -> MC via sd_parent) must both be decoded,
/// in child-to-parent order.
#[test]
fn read_sched_domain_tree_two_levels() {
    let sd_off = test_sched_domain_offsets();
    let sd0_pa: u64 = 1024;
    let sd1_pa: u64 = 2048;
    let name0_pa: u64 = 3072;
    let name1_pa: u64 = 3088;
    let sd0_buf = make_sd_buffer(&sd_off, sd1_pa, 0, 0x10, name0_pa, 2, 32, 8, 5, 1, 2);
    let sd1_buf = make_sd_buffer(&sd_off, 0, 1, 0x20, name1_pa, 8, 128, 22, 20, 4, 10);
    // Derive the size from the highest-placed object (same value as the
    // previous hard-coded 3104) so moving name1_pa cannot silently truncate.
    let total_size = name1_pa as usize + 16;
    let mut buf = vec![0u8; total_size];
    buf[sd_off.rq_sd..sd_off.rq_sd + 8].copy_from_slice(&sd0_pa.to_ne_bytes());
    buf[sd0_pa as usize..sd0_pa as usize + sd0_buf.len()].copy_from_slice(&sd0_buf);
    buf[sd1_pa as usize..sd1_pa as usize + sd1_buf.len()].copy_from_slice(&sd1_buf);
    buf[name0_pa as usize..name0_pa as usize + 4].copy_from_slice(b"SMT\0");
    buf[name1_pa as usize..name1_pa as usize + 3].copy_from_slice(b"MC\0");
    // as_mut_ptr() instead of casting a shared-ref-derived *const to *mut.
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    let domains = read_sched_domain_tree(&mem, 0, &sd_off, 0, 0, 0).unwrap();
    assert_eq!(domains.len(), 2);
    assert_eq!(domains[0].level, 0);
    assert_eq!(domains[0].name, "SMT");
    assert_eq!(domains[0].span_weight, 2);
    assert_eq!(domains[0].balance_interval, 32);
    assert_eq!(domains[0].newidle_call, Some(8));
    let s0 = domains[0].stats.as_ref().unwrap();
    assert_eq!(s0.lb_count[0], 5);
    assert_eq!(domains[1].level, 1);
    assert_eq!(domains[1].name, "MC");
    assert_eq!(domains[1].span_weight, 8);
    assert_eq!(domains[1].balance_interval, 128);
    assert_eq!(domains[1].newidle_call, Some(22));
    let s1 = domains[1].stats.as_ref().unwrap();
    assert_eq!(s1.lb_count[0], 20);
    assert_eq!(s1.alb_pushed, 4);
    assert_eq!(s1.ttwu_wake_remote, 10);
}
/// A domain whose parent pointer refers to itself must not loop: the
/// walker has to emit exactly one snapshot and stop.
#[test]
fn read_sched_domain_tree_self_reference_breaks_cycle() {
    let sd_off = test_sched_domain_offsets();
    let sd_pa: u64 = 1024;
    // parent_kva == sd_pa: the pathological self-cycle.
    let sd_buf = make_sd_buffer(&sd_off, sd_pa, 0, 0, 0, 1, 0, 0, 0, 0, 0);
    let total_size = sd_pa as usize + sd_buf.len();
    let mut buf = vec![0u8; total_size];
    buf[sd_off.rq_sd..sd_off.rq_sd + 8].copy_from_slice(&sd_pa.to_ne_bytes());
    buf[sd_pa as usize..sd_pa as usize + sd_buf.len()].copy_from_slice(&sd_buf);
    // as_mut_ptr() instead of casting a shared-ref-derived *const to *mut.
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    let domains = read_sched_domain_tree(&mem, 0, &sd_off, 0, 0, 0).unwrap();
    assert_eq!(
        domains.len(),
        1,
        "self-referential sd should produce exactly one snapshot"
    );
}
/// An acyclic parent chain longer than the walker's depth bound must be
/// truncated at 8 levels, with levels reported in order.
#[test]
fn read_sched_domain_tree_max_depth_bound_on_long_chain() {
    let sd_off = test_sched_domain_offsets();
    // Spacing between consecutive domain structs in the fake memory image.
    const SD_SIZE: u64 = 248;
    const CHAIN_LEN: usize = 10;
    let first_pa: u64 = 1024;
    let pa = |i: usize| first_pa + (i as u64) * SD_SIZE;
    let total_size = pa(CHAIN_LEN) as usize;
    let mut buf = vec![0u8; total_size];
    buf[sd_off.rq_sd..sd_off.rq_sd + 8].copy_from_slice(&pa(0).to_ne_bytes());
    for i in 0..CHAIN_LEN {
        // Last element terminates the chain with a NULL parent.
        let parent_kva = if i + 1 == CHAIN_LEN { 0 } else { pa(i + 1) };
        let sd_buf = make_sd_buffer(&sd_off, parent_kva, i as i32, 0, 0, 1, 0, 0, 0, 0, 0);
        let start = pa(i) as usize;
        buf[start..start + sd_buf.len()].copy_from_slice(&sd_buf);
    }
    // as_mut_ptr() instead of casting a shared-ref-derived *const to *mut.
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    let domains = read_sched_domain_tree(&mem, 0, &sd_off, 0, 0, 0).unwrap();
    assert_eq!(
        domains.len(),
        8,
        "acyclic chain of {CHAIN_LEN} levels must truncate at MAX_DEPTH=8"
    );
    for (i, snap) in domains.iter().enumerate() {
        assert_eq!(snap.level, i as i32, "level mismatch at index {i}");
    }
}
/// An sd pointer that resolves outside guest memory must produce an empty
/// (but Some) domain list rather than an error or a panic.
#[test]
fn read_sched_domain_tree_out_of_bounds_pa() {
    let sd_off = test_sched_domain_offsets();
    let bad_kva: u64 = 0xFFFF_FFFF_FFFF_0000;
    let mut buf = vec![0u8; 512];
    buf[sd_off.rq_sd..sd_off.rq_sd + 8].copy_from_slice(&bad_kva.to_ne_bytes());
    // as_mut_ptr() instead of casting a shared-ref-derived *const to *mut.
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    let domains = read_sched_domain_tree(&mem, 0, &sd_off, 0, 0, 0);
    assert!(domains.is_some());
    assert!(domains.unwrap().is_empty());
}
/// When the optional newidle_* offsets are absent, the snapshot must carry
/// None for those fields while all other fields still decode.
#[test]
fn read_sched_domain_tree_newidle_none() {
    let mut sd_off = test_sched_domain_offsets();
    sd_off.sd_newidle_call = None;
    sd_off.sd_newidle_success = None;
    sd_off.sd_newidle_ratio = None;
    let sd_pa: u64 = 1024;
    let name_pa: u64 = 2048;
    let sd_buf = make_sd_buffer(&sd_off, 0, 0, 0x42, name_pa, 4, 64, 0, 10, 3, 7);
    let name_bytes = b"SMT\0";
    let total_size = (name_pa as usize) + 16;
    let mut buf = vec![0u8; total_size];
    buf[sd_off.rq_sd..sd_off.rq_sd + 8].copy_from_slice(&sd_pa.to_ne_bytes());
    buf[sd_pa as usize..sd_pa as usize + sd_buf.len()].copy_from_slice(&sd_buf);
    buf[name_pa as usize..name_pa as usize + name_bytes.len()].copy_from_slice(name_bytes);
    // as_mut_ptr() instead of casting a shared-ref-derived *const to *mut.
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    let domains = read_sched_domain_tree(&mem, 0, &sd_off, 0, 0, 0).unwrap();
    assert_eq!(domains.len(), 1);
    assert_eq!(domains[0].level, 0);
    assert_eq!(domains[0].name, "SMT");
    assert_eq!(domains[0].flags, 0x42);
    assert_eq!(domains[0].span_weight, 4);
    assert_eq!(domains[0].balance_interval, 64);
    assert_eq!(domains[0].newidle_call, None);
    assert_eq!(domains[0].newidle_success, None);
    assert_eq!(domains[0].newidle_ratio, None);
    let stats = domains[0].stats.as_ref().unwrap();
    assert_eq!(stats.lb_count[0], 10);
    assert_eq!(stats.alb_pushed, 3);
    assert_eq!(stats.ttwu_wake_remote, 7);
}
/// Three consecutive native-endian u32 values must decode in order.
#[test]
fn read_u32_array_known_values() {
    let mut buf = [0u8; 16];
    buf[0..4].copy_from_slice(&10u32.to_ne_bytes());
    buf[4..8].copy_from_slice(&20u32.to_ne_bytes());
    buf[8..12].copy_from_slice(&30u32.to_ne_bytes());
    // as_mut_ptr() instead of casting a shared-ref-derived *const to *mut.
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    let arr = read_u32_array(&mem, 0, 0);
    assert_eq!(arr, [10, 20, 30]);
}
/// Coupling test: offsets extracted from a real vmlinux (BTF) must agree
/// with read_rq_stats — distinctive sentinel values planted at those
/// offsets have to come back through the reader unchanged. Skipped when no
/// test vmlinux is available.
#[test]
fn btf_offsets_couple_with_rq_reader() {
    let path = match crate::monitor::find_test_vmlinux() {
        Some(p) => p,
        None => skip!("no test vmlinux available"),
    };
    let offsets = crate::test_support::require_kernel_offsets(&path);
    // Size the buffer to cover the farthest field any read touches.
    let max_scalar_off = offsets.rq_clock + 8;
    let max_scx_off = offsets.rq_scx + offsets.scx_rq_local_dsq + offsets.dsq_nr + 4;
    let max_flags_off = offsets.rq_scx + offsets.scx_rq_flags + 4;
    let size = max_scalar_off.max(max_scx_off).max(max_flags_off) + 64;
    let mut buf = vec![0u8; size];
    // Distinctive sentinels so a transposed offset cannot pass by accident.
    let nr_running: u32 = 0xDEAD_BEEF;
    let scx_nr: u32 = 0x1234_5678;
    let dsq_depth: u32 = 0x0BAD_F00D;
    let clock: u64 = 0xCAFE_BABE_1357_9BDF;
    let flags: u32 = 0xA5A5_A5A5;
    buf[offsets.rq_nr_running..offsets.rq_nr_running + 4]
        .copy_from_slice(&nr_running.to_ne_bytes());
    buf[offsets.rq_clock..offsets.rq_clock + 8].copy_from_slice(&clock.to_ne_bytes());
    let scx_nr_off = offsets.rq_scx + offsets.scx_rq_nr_running;
    buf[scx_nr_off..scx_nr_off + 4].copy_from_slice(&scx_nr.to_ne_bytes());
    let scx_flags_off = offsets.rq_scx + offsets.scx_rq_flags;
    buf[scx_flags_off..scx_flags_off + 4].copy_from_slice(&flags.to_ne_bytes());
    let dsq_off = offsets.rq_scx + offsets.scx_rq_local_dsq + offsets.dsq_nr;
    buf[dsq_off..dsq_off + 4].copy_from_slice(&dsq_depth.to_ne_bytes());
    // as_mut_ptr() instead of casting a shared-ref-derived *const to *mut.
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    let snap = read_rq_stats(&mem, 0, &offsets);
    assert_eq!(snap.nr_running, nr_running);
    assert_eq!(snap.scx_nr_running, scx_nr);
    assert_eq!(snap.local_dsq_depth, dsq_depth);
    assert_eq!(snap.rq_clock, clock);
    assert_eq!(snap.scx_flags, flags);
}
#[test]
fn write_u32_at_boundary_writes_full_word() {
    let mut buf = [0u8; 16];
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    // Offset 12 is the last position where a full 4-byte word still fits in
    // a 16-byte region, so the write must succeed in full.
    mem.write_u32(12, 0, 0xDEAD_BEEF);
    assert_eq!(mem.read_u32(12, 0), 0xDEAD_BEEF);
    // Also check via the backing array that the bytes really landed.
    assert_eq!(
        u32::from_ne_bytes(buf[12..16].try_into().unwrap()),
        0xDEAD_BEEF
    );
}
#[test]
fn write_u32_one_past_boundary_is_noop() {
    let mut buf = [0u8; 16];
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    // Offset 13 would need bytes 13..17 — one past the end. The write must
    // be dropped entirely, not partially applied.
    mem.write_u32(13, 0, 0xFFFF_FFFF);
    assert_eq!(buf, [0u8; 16]);
}
#[test]
fn write_u8_at_boundary_writes_last_byte() {
    let mut buf = [0u8; 4];
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    // Offset 3 is the last valid byte of a 4-byte region.
    mem.write_u8(3, 0, 0xAB);
    assert_eq!(buf[3], 0xAB);
}
#[test]
fn write_scalar_offset_arg_is_added_to_pa() {
    let mut buf = [0u8; 32];
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    // pa=8 plus offset=16 must address byte 24, confirming the second
    // argument is added to the physical address.
    mem.write_u64(8, 16, 0x0123_4567_89AB_CDEF);
    assert_eq!(mem.read_u64(24, 0), 0x0123_4567_89AB_CDEF);
    assert_eq!(
        u64::from_ne_bytes(buf[24..32].try_into().unwrap()),
        0x0123_4567_89AB_CDEF
    );
}
#[test]
fn write_scalar_offset_only_out_of_bounds_is_noop() {
    let mut buf = [0xCCu8; 8];
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    // pa=4 is in range, but pa+offset=8 pushes the u64 write past the end;
    // bounds must be checked on the *combined* address, not pa alone.
    mem.write_u64(4, 4, 0xFFFF_FFFF_FFFF_FFFF);
    assert_eq!(buf, [0xCCu8; 8]);
}
#[test]
fn guest_mem_new_size_smaller_than_write_offset_is_noop() {
    // The backing array is larger than the declared region: bounds must be
    // enforced against the declared size, never against host capacity.
    let mut backing = [0u8; 32];
    let declared_size: u64 = 8;
    let mem = unsafe { GuestMem::new(backing.as_mut_ptr(), declared_size) };
    // Offset 16 is inside the backing array but outside the declared 8-byte
    // region — must be a no-op.
    mem.write_u64(16, 0, 0xDEAD_BEEF_CAFE_1234);
    assert_eq!(backing, [0u8; 32]);
    // Offset 0 is in range and must land normally.
    mem.write_u64(0, 0, 0xDEAD_BEEF_CAFE_1234);
    assert_eq!(
        u64::from_ne_bytes(backing[0..8].try_into().unwrap()),
        0xDEAD_BEEF_CAFE_1234
    );
    // Nothing beyond the declared region may have been touched.
    assert_eq!(backing[8..32], [0u8; 24]);
}
#[test]
fn resolve_ptr_multi_region_routes_to_correct_region() {
    // Two discontiguous regions: [0, 64) backed by buf0 and [1024, 1088)
    // backed by buf1, with an unmapped gap between them.
    let mut buf0 = [0xAAu8; 64];
    let mut buf1 = [0xBBu8; 64];
    let regions = vec![
        MemRegion {
            host_ptr: buf0.as_mut_ptr(),
            offset: 0,
            size: 64,
        },
        MemRegion {
            host_ptr: buf1.as_mut_ptr(),
            offset: 1024, size: 64,
        },
    ];
    let mem = unsafe { GuestMem::from_regions_for_test(regions) };
    // First and last byte of each region must hit the right backing.
    assert_eq!(mem.read_u8(0, 0), 0xAA);
    assert_eq!(mem.read_u8(63, 0), 0xAA);
    assert_eq!(mem.read_u8(1024, 0), 0xBB);
    assert_eq!(mem.read_u8(1087, 0), 0xBB);
    // Addresses in the gap (and the first byte past region 0) read 0.
    assert_eq!(mem.read_u8(64, 0), 0);
    assert_eq!(mem.read_u8(512, 0), 0);
    assert_eq!(mem.read_u8(1023, 0), 0);
    // A write to the second region must not disturb the first.
    mem.write_u32(1024, 0, 0x1234_5678);
    assert_eq!(buf0, [0xAAu8; 64]);
    assert_eq!(
        u32::from_ne_bytes(buf1[0..4].try_into().unwrap()),
        0x1234_5678
    );
    // Snapshots taken before a gap write verify it is a full no-op.
    let buf0_snapshot = buf0;
    let buf1_snapshot = buf1;
    mem.write_u32(900, 0, 0xFFFF_FFFF);
    assert_eq!(buf0, buf0_snapshot);
    assert_eq!(buf1, buf1_snapshot);
}
/// Reads addressed into the second of two discontiguous regions must pull
/// bytes from that region's backing buffer, in order.
#[test]
fn resolve_ptr_multi_region_read_ring_volatile_routes_correctly() {
    let mut lo = [0u8; 64];
    let mut hi = [0u8; 64];
    // Seed the high region's first eight bytes with 1..=8.
    for (slot, val) in hi.iter_mut().zip(1u8..=8) {
        *slot = val;
    }
    let mem = unsafe {
        GuestMem::from_regions_for_test(vec![
            MemRegion {
                host_ptr: lo.as_mut_ptr(),
                offset: 0,
                size: 64,
            },
            MemRegion {
                host_ptr: hi.as_mut_ptr(),
                offset: 1024,
                size: 64,
            },
        ])
    };
    // Guest addresses 1024..1032 must map to hi[0..8].
    for i in 0..8u64 {
        assert_eq!(mem.read_u8(1024 + i, 0), (i + 1) as u8);
    }
}
/// The first offset past a region's last valid byte (offset == size) must
/// be treated as out of region and read back as 0.
#[test]
fn resolve_ptr_offset_at_exact_region_end_is_out_of_region() {
    const LEN: u64 = 16;
    let mut backing = [0xCCu8; LEN as usize];
    let gm = unsafe { GuestMem::new(backing.as_mut_ptr(), LEN) };
    assert_eq!(gm.read_u8(LEN, 0), 0);
}
#[test]
fn write_u8_and_read_u8_bounds_at_declared_size() {
    // Exhaustive edge check of the single-byte accessors at the declared
    // region size: last valid offset round-trips, everything past is inert.
    const SIZE: u64 = 8;
    let mut buf = [0u8; SIZE as usize];
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), SIZE) };
    mem.write_u8(SIZE - 1, 0, 0xAB);
    assert_eq!(
        mem.read_u8(SIZE - 1, 0),
        0xAB,
        "write at last valid offset must round-trip via read_u8"
    );
    assert_eq!(
        buf[(SIZE - 1) as usize],
        0xAB,
        "write at last valid offset must land in the backing byte"
    );
    // Snapshot taken before the out-of-bounds attempt so the comparison
    // below proves the write was a complete no-op.
    let snapshot = buf;
    mem.write_u8(SIZE, 0, 0xFF);
    assert_eq!(
        buf, snapshot,
        "write past the end must be a silent no-op — no byte of \
        the backing buffer may change"
    );
    assert_eq!(
        mem.read_u8(SIZE, 0),
        0,
        "read past the end must return 0, not stale memory"
    );
    assert_eq!(
        mem.read_u8(SIZE + 1, 0),
        0,
        "read several bytes past the end must also return 0"
    );
}
/// Four-level x86-64 page walk through PGD->PUD->PMD->PTE to a 4 KiB leaf:
/// the result must be the leaf page base OR'd with the KVA's low 12 bits.
/// Entry value 0x63 = PRESENT|RW|ACCESSED|DIRTY (no PS bit).
#[test]
#[cfg(target_arch = "x86_64")]
fn walk_4level_simple_4kib_page() {
    let kva: u64 = 0xFFFF_8880_0000_5678;
    let pgd_idx = (kva >> 39) & 0x1FF;
    let pud_idx = (kva >> 30) & 0x1FF;
    let pmd_idx = (kva >> 21) & 0x1FF;
    let pte_idx = (kva >> 12) & 0x1FF;
    let page_off = kva & 0xFFF;
    // One 4 KiB table per level, laid out back to back.
    let pgd_pa: u64 = 0x10000;
    let pud_pa: u64 = pgd_pa + 0x1000;
    let pmd_pa: u64 = pud_pa + 0x1000;
    let pte_pa: u64 = pmd_pa + 0x1000;
    let data_pa: u64 = pte_pa + 0x1000;
    let size = (data_pa + 0x1000) as usize;
    let mut buf = vec![0u8; size];
    let write_entry = |buf: &mut Vec<u8>, base: u64, idx: u64, val: u64| {
        let off = (base + idx * 8) as usize;
        buf[off..off + 8].copy_from_slice(&val.to_ne_bytes());
    };
    write_entry(&mut buf, pgd_pa, pgd_idx, pud_pa | 0x63);
    write_entry(&mut buf, pud_pa, pud_idx, pmd_pa | 0x63);
    write_entry(&mut buf, pmd_pa, pmd_idx, pte_pa | 0x63);
    write_entry(&mut buf, pte_pa, pte_idx, data_pa | 0x63);
    // as_mut_ptr() instead of casting a shared-ref-derived *const to *mut.
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    let pa = mem.walk_4level(pgd_pa, Kva(kva));
    assert_eq!(
        pa,
        Some(data_pa | page_off),
        "4-level walk must compose leaf PTE base with KVA in-page offset"
    );
}
/// A PMD entry with the PS bit set (0xE3 = PRESENT|RW|PS|ACCESSED|DIRTY)
/// is a 2 MiB leaf: the walk must stop there and splice the huge-page base
/// with the KVA's low 21 bits.
#[test]
#[cfg(target_arch = "x86_64")]
fn walk_4level_pmd_ps_bit_2mib_huge_page() {
    let kva: u64 = 0xFFFF_8880_0020_3456;
    let pgd_idx = (kva >> 39) & 0x1FF;
    let pud_idx = (kva >> 30) & 0x1FF;
    let pmd_idx = (kva >> 21) & 0x1FF;
    let in_page_off_2m = kva & 0x1F_FFFF;
    let pgd_pa: u64 = 0x10000;
    let pud_pa: u64 = pgd_pa + 0x1000;
    let pmd_pa: u64 = pud_pa + 0x1000;
    let huge_base: u64 = 0x20_0000;
    let size = (pmd_pa + 0x1000) as usize;
    let mut buf = vec![0u8; size];
    let write_entry = |buf: &mut Vec<u8>, base: u64, idx: u64, val: u64| {
        let off = (base + idx * 8) as usize;
        buf[off..off + 8].copy_from_slice(&val.to_ne_bytes());
    };
    write_entry(&mut buf, pgd_pa, pgd_idx, pud_pa | 0x63);
    write_entry(&mut buf, pud_pa, pud_idx, pmd_pa | 0x63);
    write_entry(&mut buf, pmd_pa, pmd_idx, huge_base | 0xE3);
    // as_mut_ptr() instead of casting a shared-ref-derived *const to *mut.
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    let pa = mem.walk_4level(pgd_pa, Kva(kva));
    assert_eq!(
        pa,
        Some(huge_base | in_page_off_2m),
        "PMD PS-bit leaf must splice 2 MiB base with KVA bits [20:0]"
    );
}
/// A PUD entry without the PRESENT bit (value 0x3000, low bit clear) must
/// abort the walk with None instead of descending further.
#[test]
#[cfg(target_arch = "x86_64")]
fn walk_4level_unmapped_pud_returns_none() {
    let kva: u64 = 0xFFFF_8880_0000_5000;
    let pgd_idx = (kva >> 39) & 0x1FF;
    let pud_idx = (kva >> 30) & 0x1FF;
    let pgd_pa: u64 = 0x10000;
    let pud_pa: u64 = pgd_pa + 0x1000;
    let size = (pud_pa + 0x1000) as usize;
    let mut buf = vec![0u8; size];
    let write_entry = |buf: &mut Vec<u8>, base: u64, idx: u64, val: u64| {
        let off = (base + idx * 8) as usize;
        buf[off..off + 8].copy_from_slice(&val.to_ne_bytes());
    };
    write_entry(&mut buf, pgd_pa, pgd_idx, pud_pa | 0x63);
    // Non-present PUD entry: address bits set, PRESENT bit clear.
    write_entry(&mut buf, pud_pa, pud_idx, 0x3000);
    // as_mut_ptr() instead of casting a shared-ref-derived *const to *mut.
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    assert_eq!(
        mem.walk_4level(pgd_pa, Kva(kva)),
        None,
        "missing PRESENT bit at any level must return None"
    );
}
/// Five-level (LA57) walk: an extra PML5 level indexed by KVA bits [56:48]
/// in front of the usual four, descending to a 4 KiB leaf.
#[test]
#[cfg(target_arch = "x86_64")]
fn walk_5level_descent_to_4kib_page() {
    let kva: u64 = 0xFF42_8881_0000_5678;
    let pml5_idx = (kva >> 48) & 0x1FF;
    let pgd_idx = (kva >> 39) & 0x1FF;
    let pud_idx = (kva >> 30) & 0x1FF;
    let pmd_idx = (kva >> 21) & 0x1FF;
    let pte_idx = (kva >> 12) & 0x1FF;
    let page_off = kva & 0xFFF;
    // One 4 KiB table per level, laid out back to back.
    let pml5_pa: u64 = 0x10000;
    let p4d_pa: u64 = pml5_pa + 0x1000;
    let pud_pa: u64 = p4d_pa + 0x1000;
    let pmd_pa: u64 = pud_pa + 0x1000;
    let pte_pa: u64 = pmd_pa + 0x1000;
    let data_pa: u64 = pte_pa + 0x1000;
    let size = (data_pa + 0x1000) as usize;
    let mut buf = vec![0u8; size];
    let write_entry = |buf: &mut Vec<u8>, base: u64, idx: u64, val: u64| {
        let off = (base + idx * 8) as usize;
        buf[off..off + 8].copy_from_slice(&val.to_ne_bytes());
    };
    write_entry(&mut buf, pml5_pa, pml5_idx, p4d_pa | 0x63);
    write_entry(&mut buf, p4d_pa, pgd_idx, pud_pa | 0x63);
    write_entry(&mut buf, pud_pa, pud_idx, pmd_pa | 0x63);
    write_entry(&mut buf, pmd_pa, pmd_idx, pte_pa | 0x63);
    write_entry(&mut buf, pte_pa, pte_idx, data_pa | 0x63);
    // as_mut_ptr() instead of casting a shared-ref-derived *const to *mut.
    let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
    let pa = mem.walk_5level(pml5_pa, Kva(kva));
    assert_eq!(
        pa,
        Some(data_pa | page_off),
        "5-level walk must descend PML5->P4D->PUD->PMD->PTE and \
        splice the leaf PTE base with the KVA in-page offset"
    );
}
#[test]
#[cfg(target_arch = "x86_64")]
fn walk_4level_pud_ps_bit_1gib_huge_page() {
    // A PUD entry with the PS bit (bit 7) set terminates the walk early as a
    // 1 GiB huge-page leaf; bits [29:0] of the KVA become the in-page offset.
    let kva: u64 = 0xFFFF_8880_4123_4567;
    let pgd_pa: u64 = 0x10000;
    let pud_pa = pgd_pa + 0x1000;
    let huge_base: u64 = 0x4000_0000;
    let mut backing = vec![0u8; (pud_pa + 0x1000) as usize];
    // Store one 8-byte page-table entry at `table[slot]` in guest memory.
    let put = |mem: &mut Vec<u8>, table: u64, slot: u64, entry: u64| {
        let at = (table + slot * 8) as usize;
        mem[at..at + 8].copy_from_slice(&entry.to_ne_bytes());
    };
    put(&mut backing, pgd_pa, (kva >> 39) & 0x1FF, pud_pa | 0x63);
    // 0xE3 = 0x63 with PS (bit 7) set: huge-page leaf at the PUD level.
    put(&mut backing, pud_pa, (kva >> 30) & 0x1FF, huge_base | 0xE3);
    let mem = unsafe { GuestMem::new(backing.as_ptr() as *mut u8, backing.len() as u64) };
    assert_eq!(
        mem.walk_4level(pgd_pa, Kva(kva)),
        Some(huge_base | (kva & 0x3FFF_FFFF)),
        "PUD PS-bit leaf must splice 1 GiB base with KVA bits [29:0]"
    );
}
#[test]
#[cfg(target_arch = "x86_64")]
fn walk_5level_pml5_ps_bit_256tib_huge_page() {
    // A PS-bit leaf at the PML5 root: the walk must stop at level 5 and treat
    // bits [47:0] of the KVA as the in-page offset into a 256 TiB region.
    // NOTE(review): real x86 hardware defines no PS bit at PML5 — this pins
    // the walker's own convention; confirm against walk_5level's intent.
    let kva: u64 = 0xFF42_0001_2345_6789;
    let pml5_pa: u64 = 0x10000;
    let huge_base: u64 = 0x000A_0000_0000_0000;
    let mut backing = vec![0u8; (pml5_pa + 0x1000) as usize];
    // Store one 8-byte page-table entry at `table[slot]` in guest memory.
    let put = |mem: &mut Vec<u8>, table: u64, slot: u64, entry: u64| {
        let at = (table + slot * 8) as usize;
        mem[at..at + 8].copy_from_slice(&entry.to_ne_bytes());
    };
    // 0xE3 = Present|RW|Accessed with PS (bit 7) set at the root entry.
    put(&mut backing, pml5_pa, (kva >> 48) & 0x1FF, huge_base | 0xE3);
    let mem = unsafe { GuestMem::new(backing.as_ptr() as *mut u8, backing.len() as u64) };
    assert_eq!(
        mem.walk_5level(pml5_pa, Kva(kva)),
        Some(huge_base | (kva & 0x0000_FFFF_FFFF_FFFF)),
        "PML5 PS-bit leaf must splice 256 TiB base with KVA bits [47:0]"
    );
}
#[test]
#[cfg(target_arch = "x86_64")]
fn walk_5level_unmapped_pml5_returns_none() {
    // If the very first (PML5) entry lacks the PRESENT bit, the 5-level walk
    // must fail immediately instead of descending further.
    let kva: u64 = 0xFF42_8881_0000_5000;
    let pml5_pa: u64 = 0x10000;
    let mut backing = vec![0u8; (pml5_pa + 0x1000) as usize];
    // Store one 8-byte page-table entry at `table[slot]` in guest memory.
    let put = |mem: &mut Vec<u8>, table: u64, slot: u64, entry: u64| {
        let at = (table + slot * 8) as usize;
        mem[at..at + 8].copy_from_slice(&entry.to_ne_bytes());
    };
    // 0x3000 has bit 0 (PRESENT) clear: the root entry is unmapped.
    put(&mut backing, pml5_pa, (kva >> 48) & 0x1FF, 0x3000);
    let mem = unsafe { GuestMem::new(backing.as_ptr() as *mut u8, backing.len() as u64) };
    assert_eq!(
        mem.walk_5level(pml5_pa, Kva(kva)),
        None,
        "missing PRESENT bit at the PML5 root must return None \
         without recursing into walk_4level"
    );
}
#[test]
fn write_bytes_at_offset_overflow_returns_zero() {
    // Writes whose offset + len wraps around u64 must be rejected wholesale:
    // zero bytes written and the backing memory left byte-for-byte untouched.
    let mut backing = [0u8; 64];
    let before = backing; // arrays are Copy: an independent snapshot
    let mem = unsafe { GuestMem::new(backing.as_mut_ptr(), backing.len() as u64) };
    // offset alone is u64::MAX, so offset + len overflows immediately.
    assert_eq!(
        mem.write_bytes_at(u64::MAX, 1, &[0xAA; 8]),
        0,
        "wraparound add must return 0 bytes written"
    );
    assert_eq!(
        backing, before,
        "overflow must not mutate any byte of the backing buffer"
    );
    // offset + len lands exactly on 0 after wrapping — still rejected.
    assert_eq!(
        mem.write_bytes_at(u64::MAX - 7, 8, &[0xBB; 4]),
        0,
        "wraparound to 0 must return 0 bytes written"
    );
    assert_eq!(backing, before, "no aliasing into low DRAM is permitted");
}
}