use super::btf_offsets::{
CPU_MAX_IDLE_TYPES, KernelOffsets, SchedDomainOffsets, SchedDomainStatsOffsets,
SchedstatOffsets, ScxEventOffsets,
};
use super::{
CpuSnapshot, Kva, MonitorSample, RqSchedstat, SchedDomainSnapshot, SchedDomainStats,
ScxEventCounters,
};
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::{Duration, Instant};
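/// One contiguous host mapping backing a slice of guest DRAM.
/// `offset` is relative to the DRAM base, not an absolute guest-physical address.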
#[derive(Debug, Clone, Copy)]
pub(crate) struct MemRegion {
pub(crate) host_ptr: *mut u8,
pub(crate) offset: u64,
pub(crate) size: u64,
}
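/// Bounds-checked volatile access to guest memory through one or more
/// host-mapped regions sorted by `offset`. Reads outside the mapped range
/// return zeros; writes outside it are silently dropped.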
pub struct GuestMem {
size: u64,
regions: Vec<MemRegion>,
}
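// SAFETY: all accesses go through bounds-checked raw-pointer reads/writes into
// a mapping that callers of `GuestMem::new` promise outlives this struct, so
// the type carries no thread affinity of its own.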
unsafe impl Send for GuestMem {}
unsafe impl Sync for GuestMem {}
impl GuestMem {
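/// Wraps a single contiguous mapping starting at `base`.
///
/// # Safety
///
/// `base` must be valid for reads and writes of `size` bytes for the
/// lifetime of the returned `GuestMem`.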
pub unsafe fn new(base: *mut u8, size: u64) -> Self {
Self {
size,
regions: vec![MemRegion {
host_ptr: base,
offset: 0,
size,
}],
}
}
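/// Test-only constructor. `regions` must be sorted by `offset` and
/// non-overlapping, since `resolve_ptr` binary-searches on `offset`.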
#[cfg(test)]
pub(crate) unsafe fn from_regions_for_test(regions: Vec<MemRegion>) -> Self {
assert!(!regions.is_empty(), "at least one region required");
let size = regions
.iter()
.map(|r| r.offset + r.size)
.max()
.expect("non-empty");
Self { size, regions }
}
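/// Builds a `GuestMem` from the NUMA memory layout, resolving each region's
/// guest-physical start to a host pointer and rebasing offsets to the DRAM base.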
pub(crate) fn from_layout(
layout: &crate::vmm::numa_mem::NumaMemoryLayout,
guest_mem: &vm_memory::GuestMemoryMmap,
) -> Self {
use vm_memory::GuestMemory;
let dram_base = layout.dram_base();
let total_size = layout.total_bytes();
let mut regions = Vec::with_capacity(layout.regions().len());
for nr in layout.regions() {
let host_ptr = guest_mem
.get_host_address(vm_memory::GuestAddress(nr.gpa_start))
.unwrap();
regions.push(MemRegion {
host_ptr,
offset: nr.gpa_start - dram_base,
size: nr.size,
});
}
Self {
size: total_size,
regions,
}
}
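/// Maps a DRAM-relative offset to `(host pointer, bytes left in that region)`,
/// or `None` if the offset falls in a hole or past the end of memory.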
fn resolve_ptr(&self, offset: u64) -> Option<(*mut u8, u64)> {
let idx = self
.regions
.partition_point(|r| r.offset <= offset)
.checked_sub(1)?;
let r = &self.regions[idx];
let local = offset - r.offset;
if local < r.size {
let ptr = unsafe { r.host_ptr.add(local as usize) };
Some((ptr, r.size - local))
} else {
None
}
}
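/// Byte-wise volatile read; the caller must guarantee `ptr` is valid for `N` bytes.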
#[inline]
unsafe fn read_volatile_bytes<const N: usize>(ptr: *const u8) -> [u8; N] {
let mut bytes = [0u8; N];
for (i, slot) in bytes.iter_mut().enumerate() {
*slot = unsafe { std::ptr::read_volatile(ptr.add(i)) };
}
bytes
}
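/// Byte-wise volatile write; the caller must guarantee `ptr` is valid for `N` bytes.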
#[inline]
unsafe fn write_volatile_bytes<const N: usize>(ptr: *mut u8, bytes: [u8; N]) {
for (i, &byte) in bytes.iter().enumerate() {
unsafe { std::ptr::write_volatile(ptr.add(i), byte) };
}
}
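/// Bounds-checked volatile read of `N` bytes at `pa + offset`. Returns zeros
/// on address overflow, an out-of-range access, or a read that would straddle
/// a region boundary.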
fn read_scalar<const N: usize>(&self, pa: u64, offset: usize) -> [u8; N] {
let Some(addr) = pa.checked_add(offset as u64) else {
return [0; N];
};
let Some(end) = addr.checked_add(N as u64) else {
return [0; N];
};
if end > self.size {
return [0; N];
}
match self.resolve_ptr(addr) {
Some((ptr, region_avail)) => {
if (N as u64) > region_avail {
return [0; N];
}
unsafe { Self::read_volatile_bytes::<N>(ptr as *const u8) }
}
None => [0; N],
}
}
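/// Bounds-checked volatile write of `N` bytes at `pa + offset`; dropped on
/// address overflow, an out-of-range access, or a region-straddling write.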
fn write_scalar<const N: usize>(&self, pa: u64, offset: usize, bytes: [u8; N]) {
let Some(addr) = pa.checked_add(offset as u64) else {
return;
};
let Some(end) = addr.checked_add(N as u64) else {
return;
};
if end > self.size {
return;
}
if let Some((ptr, region_avail)) = self.resolve_ptr(addr) {
if (N as u64) > region_avail {
return;
}
unsafe { Self::write_volatile_bytes::<N>(ptr, bytes) };
}
}
pub fn read_u8(&self, pa: u64, offset: usize) -> u8 {
u8::from_ne_bytes(self.read_scalar::<1>(pa, offset))
}
pub fn read_u32(&self, pa: u64, offset: usize) -> u32 {
u32::from_ne_bytes(self.read_scalar::<4>(pa, offset))
}
pub fn read_u64(&self, pa: u64, offset: usize) -> u64 {
u64::from_ne_bytes(self.read_scalar::<8>(pa, offset))
}
pub fn read_i64(&self, pa: u64, offset: usize) -> i64 {
self.read_u64(pa, offset) as i64
}
pub fn write_u8(&self, pa: u64, offset: usize, val: u8) {
self.write_scalar::<1>(pa, offset, val.to_ne_bytes());
}
pub fn write_u64(&self, pa: u64, offset: usize, val: u64) {
self.write_scalar::<8>(pa, offset, val.to_ne_bytes());
}
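/// Copies up to `buf.len()` bytes starting at `pa` and returns the count
/// actually copied; the copy never crosses a region boundary, so it may be short.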
pub fn read_bytes(&self, pa: u64, buf: &mut [u8]) -> usize {
let len = buf.len() as u64;
if pa >= self.size {
return 0;
}
let avail = (self.size - pa).min(len) as usize;
match self.resolve_ptr(pa) {
Some((ptr, region_avail)) => {
let copy_len = avail.min(region_avail as usize);
unsafe {
std::ptr::copy_nonoverlapping(ptr, buf.as_mut_ptr(), copy_len);
}
copy_len
}
None => 0,
}
}
#[cfg(test)]
pub fn write_u32(&self, pa: u64, offset: usize, val: u32) {
self.write_scalar::<4>(pa, offset, val.to_ne_bytes());
}
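/// Translates a kernel virtual address to a guest-physical address by walking
/// the guest page tables: 4- or 5-level paging on x86_64 (selected by `l5`),
/// a 3-level 64 KiB-granule walk on aarch64.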
pub(crate) fn translate_kva(&self, cr3_pa: u64, kva: Kva, l5: bool) -> Option<u64> {
#[cfg(target_arch = "x86_64")]
{
if l5 {
self.walk_5level(cr3_pa, kva)
} else {
self.walk_4level(cr3_pa, kva)
}
}
#[cfg(target_arch = "aarch64")]
{
let _ = l5;
self.walk_3level_aarch64_64k(cr3_pa, kva)
}
}
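/// Standard x86_64 4-level walk, honoring 1 GiB (PUD) and 2 MiB (PMD) huge
/// pages via the PS bit.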
#[cfg(target_arch = "x86_64")]
fn walk_4level(&self, cr3_pa: u64, kva: Kva) -> Option<u64> {
const PRESENT: u64 = 1;
const PS: u64 = 1 << 7;
const ADDR_MASK: u64 = 0x000F_FFFF_FFFF_F000;
let kva_bits = kva.0;
let pgd_idx = (kva_bits >> 39) & 0x1FF;
let pud_idx = (kva_bits >> 30) & 0x1FF;
let pmd_idx = (kva_bits >> 21) & 0x1FF;
let pte_idx = (kva_bits >> 12) & 0x1FF;
let page_off = kva_bits & 0xFFF;
let pgd_pa = (cr3_pa & ADDR_MASK) + pgd_idx * 8;
let pgde = self.read_u64(pgd_pa, 0);
if pgde & PRESENT == 0 {
return None;
}
let pud_pa = (pgde & ADDR_MASK) + pud_idx * 8;
let pude = self.read_u64(pud_pa, 0);
if pude & PRESENT == 0 {
return None;
}
if pude & PS != 0 {
let base = pude & 0x000F_FFFF_C000_0000;
return Some(base | (kva_bits & 0x3FFF_FFFF));
}
let pmd_pa = (pude & ADDR_MASK) + pmd_idx * 8;
let pmde = self.read_u64(pmd_pa, 0);
if pmde & PRESENT == 0 {
return None;
}
if pmde & PS != 0 {
let base = pmde & 0x000F_FFFF_FFE0_0000;
return Some(base | (kva_bits & 0x1F_FFFF));
}
let pte_pa = (pmde & ADDR_MASK) + pte_idx * 8;
let ptee = self.read_u64(pte_pa, 0);
if ptee & PRESENT == 0 {
return None;
}
Some((ptee & ADDR_MASK) | page_off)
}
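/// aarch64 walk for a 48-bit VA space with a 64 KiB granule: 6 level-1 index
/// bits, 13 bits each at levels 2 and 3, and a 16-bit page offset. Block
/// descriptors at levels 1 and 2 map 4 TiB and 512 MiB respectively.
/// Descriptor outputs are guest-physical, so they are rebased to DRAM-relative
/// offsets by subtracting `DRAM_START`.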
#[cfg(target_arch = "aarch64")]
fn walk_3level_aarch64_64k(&self, ttbr_pa: u64, kva: Kva) -> Option<u64> {
use crate::vmm::kvm::DRAM_START;
const VALID: u64 = 1;
const TABLE: u64 = 0b11;
const BLOCK: u64 = 0b01;
const DESC_MASK: u64 = 0b11;
const ADDR_MASK: u64 = 0x0000_FFFF_FFFF_0000;
let to_offset = |gpa: u64| -> Option<u64> { gpa.checked_sub(DRAM_START) };
let kva_bits = kva.0;
let pgd_idx = (kva_bits >> 42) & 0x3F;
let pmd_idx = (kva_bits >> 29) & 0x1FFF;
let pte_idx = (kva_bits >> 16) & 0x1FFF;
let page_off = kva_bits & 0xFFFF;
let pgd_off = (ttbr_pa & ADDR_MASK) + pgd_idx * 8;
let pgde = self.read_u64(pgd_off, 0);
if pgde & VALID == 0 {
return None;
}
if pgde & DESC_MASK == BLOCK {
let base = pgde & 0x0000_FC00_0000_0000;
return Some(to_offset(base)? | (kva_bits & 0x3FF_FFFF_FFFF));
}
let pmd_off = to_offset(pgde & ADDR_MASK)? + pmd_idx * 8;
let pmde = self.read_u64(pmd_off, 0);
if pmde & VALID == 0 {
return None;
}
if pmde & DESC_MASK == BLOCK {
let base = pmde & 0x0000_FFFF_E000_0000;
return Some(to_offset(base)? | (kva_bits & 0x1FFF_FFFF));
}
let pte_off = to_offset(pmde & ADDR_MASK)? + pte_idx * 8;
let ptee = self.read_u64(pte_off, 0);
if ptee & VALID == 0 {
return None;
}
if ptee & DESC_MASK != TABLE {
return None;
}
Some(to_offset(ptee & ADDR_MASK)? | page_off)
}
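/// x86_64 5-level walk: resolve the PML5 entry, then continue with the
/// 4-level walk from the referenced P4D table.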
#[cfg(target_arch = "x86_64")]
fn walk_5level(&self, cr3_pa: u64, kva: Kva) -> Option<u64> {
const PRESENT: u64 = 1;
const ADDR_MASK: u64 = 0x000F_FFFF_FFFF_F000;
let pml5_idx = (kva.0 >> 48) & 0x1FF;
let pml5_pa = (cr3_pa & ADDR_MASK) + pml5_idx * 8;
let pml5e = self.read_u64(pml5_pa, 0);
if pml5e & PRESENT == 0 {
return None;
}
let p4d_pa = pml5e & ADDR_MASK;
self.walk_4level(p4d_pa, kva)
}
pub fn size(&self) -> u64 {
self.size
}
}
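/// Snapshots one runqueue using BTF-derived offsets: total and sched_ext run
/// counts, local DSQ depth, rq clock, and scx flags. The optional overlays
/// (event counters, schedstat, vCPU time, sched domains) are filled in later
/// by the caller.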
pub(crate) fn read_rq_stats(mem: &GuestMem, rq_pa: u64, offsets: &KernelOffsets) -> CpuSnapshot {
CpuSnapshot {
nr_running: mem.read_u32(rq_pa, offsets.rq_nr_running),
scx_nr_running: mem.read_u32(rq_pa, offsets.rq_scx + offsets.scx_rq_nr_running),
local_dsq_depth: mem.read_u32(
rq_pa,
offsets.rq_scx + offsets.scx_rq_local_dsq + offsets.dsq_nr,
),
rq_clock: mem.read_u64(rq_pa, offsets.rq_clock),
scx_flags: mem.read_u32(rq_pa, offsets.rq_scx + offsets.scx_rq_flags),
event_counters: None,
schedstat: None,
vcpu_cpu_time_ns: None,
sched_domains: None,
}
}
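/// Reads the per-CPU scx event counters at `pcpu_pa`. Counters whose offsets
/// are absent from this kernel's BTF report zero.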
pub(crate) fn read_event_stats(
mem: &GuestMem,
pcpu_pa: u64,
ev: &ScxEventOffsets,
) -> ScxEventCounters {
let base = pcpu_pa + ev.event_stats_off as u64;
let read_opt = |off: Option<usize>| off.map(|o| mem.read_i64(base, o)).unwrap_or(0);
ScxEventCounters {
select_cpu_fallback: mem.read_i64(base, ev.ev_select_cpu_fallback),
dispatch_local_dsq_offline: mem.read_i64(base, ev.ev_dispatch_local_dsq_offline),
dispatch_keep_last: mem.read_i64(base, ev.ev_dispatch_keep_last),
enq_skip_exiting: mem.read_i64(base, ev.ev_enq_skip_exiting),
enq_skip_migration_disabled: mem.read_i64(base, ev.ev_enq_skip_migration_disabled),
reenq_immed: read_opt(ev.ev_reenq_immed),
reenq_local_repeat: read_opt(ev.ev_reenq_local_repeat),
refill_slice_dfl: read_opt(ev.ev_refill_slice_dfl),
bypass_duration: read_opt(ev.ev_bypass_duration),
bypass_dispatch: read_opt(ev.ev_bypass_dispatch),
bypass_activate: read_opt(ev.ev_bypass_activate),
insert_not_owned: read_opt(ev.ev_insert_not_owned),
sub_bypass_dispatch: read_opt(ev.ev_sub_bypass_dispatch),
}
}
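/// Reads the runqueue's schedstat counters: `sched_info` run delay and pcount
/// plus the rq-level yield, schedule, go-idle, and wakeup counts.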
pub(crate) fn read_rq_schedstat(mem: &GuestMem, rq_pa: u64, ss: &SchedstatOffsets) -> RqSchedstat {
let sched_info_pa = rq_pa + ss.rq_sched_info as u64;
RqSchedstat {
run_delay: mem.read_u64(sched_info_pa, ss.sched_info_run_delay),
pcount: mem.read_u64(sched_info_pa, ss.sched_info_pcount),
yld_count: mem.read_u32(rq_pa, ss.rq_yld_count),
sched_count: mem.read_u32(rq_pa, ss.rq_sched_count),
sched_goidle: mem.read_u32(rq_pa, ss.rq_sched_goidle),
ttwu_count: mem.read_u32(rq_pa, ss.rq_ttwu_count),
ttwu_local: mem.read_u32(rq_pa, ss.rq_ttwu_local),
}
}
fn read_u32_array(mem: &GuestMem, pa: u64, base_offset: usize) -> [u32; CPU_MAX_IDLE_TYPES] {
std::array::from_fn(|i| mem.read_u32(pa, base_offset + i * 4))
}
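/// Reads a sched_domain's schedstat block: the per-idle-type load-balance
/// counter arrays plus the scalar alb/sbe/sbf and ttwu counters.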
fn read_sd_stats(mem: &GuestMem, sd_pa: u64, so: &SchedDomainStatsOffsets) -> SchedDomainStats {
SchedDomainStats {
lb_count: read_u32_array(mem, sd_pa, so.sd_lb_count),
lb_failed: read_u32_array(mem, sd_pa, so.sd_lb_failed),
lb_balanced: read_u32_array(mem, sd_pa, so.sd_lb_balanced),
lb_imbalance_load: read_u32_array(mem, sd_pa, so.sd_lb_imbalance_load),
lb_imbalance_util: read_u32_array(mem, sd_pa, so.sd_lb_imbalance_util),
lb_imbalance_task: read_u32_array(mem, sd_pa, so.sd_lb_imbalance_task),
lb_imbalance_misfit: read_u32_array(mem, sd_pa, so.sd_lb_imbalance_misfit),
lb_gained: read_u32_array(mem, sd_pa, so.sd_lb_gained),
lb_hot_gained: read_u32_array(mem, sd_pa, so.sd_lb_hot_gained),
lb_nobusyg: read_u32_array(mem, sd_pa, so.sd_lb_nobusyg),
lb_nobusyq: read_u32_array(mem, sd_pa, so.sd_lb_nobusyq),
alb_count: mem.read_u32(sd_pa, so.sd_alb_count),
alb_failed: mem.read_u32(sd_pa, so.sd_alb_failed),
alb_pushed: mem.read_u32(sd_pa, so.sd_alb_pushed),
sbe_count: mem.read_u32(sd_pa, so.sd_sbe_count),
sbe_balanced: mem.read_u32(sd_pa, so.sd_sbe_balanced),
sbe_pushed: mem.read_u32(sd_pa, so.sd_sbe_pushed),
sbf_count: mem.read_u32(sd_pa, so.sd_sbf_count),
sbf_balanced: mem.read_u32(sd_pa, so.sd_sbf_balanced),
sbf_pushed: mem.read_u32(sd_pa, so.sd_sbf_pushed),
ttwu_wake_remote: mem.read_u32(sd_pa, so.sd_ttwu_wake_remote),
ttwu_move_affine: mem.read_u32(sd_pa, so.sd_ttwu_move_affine),
ttwu_move_balance: mem.read_u32(sd_pa, so.sd_ttwu_move_balance),
}
}
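/// Dereferences the domain's `name` pointer and reads at most 16 bytes of
/// NUL-terminated string, preferring the kernel-text mapping and falling back
/// to the direct map.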
fn read_sd_name(mem: &GuestMem, sd_pa: u64, name_offset: usize, page_offset: u64) -> String {
let name_kva = mem.read_u64(sd_pa, name_offset);
if name_kva == 0 {
return String::new();
}
let text_pa = super::symbols::text_kva_to_pa(name_kva);
let name_pa = if text_pa < mem.size() {
text_pa
} else {
let direct_pa = super::symbols::kva_to_pa(name_kva, page_offset);
if direct_pa >= mem.size() {
return String::new();
}
direct_pa
};
let mut buf = [0u8; 16];
let n = mem.read_bytes(name_pa, &mut buf);
let end = buf[..n].iter().position(|&b| b == 0).unwrap_or(n);
String::from_utf8_lossy(&buf[..end]).into_owned()
}
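/// Walks `rq->sd` up the parent chain (bounded by `MAX_DEPTH`), snapshotting
/// each level. A repeated pointer is treated as a cycle and truncates the walk
/// with a warning.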
pub(crate) fn read_sched_domain_tree(
mem: &GuestMem,
rq_pa: u64,
sd_offsets: &SchedDomainOffsets,
page_offset: u64,
) -> Option<Vec<SchedDomainSnapshot>> {
const MAX_DEPTH: usize = 8;
let sd_kva = mem.read_u64(rq_pa, sd_offsets.rq_sd);
if sd_kva == 0 {
return None;
}
let mut domains = Vec::new();
let mut current_kva = sd_kva;
let mut visited: std::collections::HashSet<u64> = std::collections::HashSet::new();
for _ in 0..MAX_DEPTH {
if current_kva == 0 {
break;
}
if !visited.insert(current_kva) {
tracing::warn!(
sd_kva = format_args!("{current_kva:#x}"),
"sched_domain cycle detected; truncating tree"
);
break;
}
let sd_pa = super::symbols::kva_to_pa(current_kva, page_offset);
if sd_pa >= mem.size() {
break;
}
let level = mem.read_u32(sd_pa, sd_offsets.sd_level) as i32;
let name = read_sd_name(mem, sd_pa, sd_offsets.sd_name, page_offset);
let flags = mem.read_u32(sd_pa, sd_offsets.sd_flags) as i32;
let span_weight = mem.read_u32(sd_pa, sd_offsets.sd_span_weight);
let stats = sd_offsets
.stats_offsets
.as_ref()
.map(|so| read_sd_stats(mem, sd_pa, so));
let snap = SchedDomainSnapshot {
level,
name,
flags,
span_weight,
balance_interval: mem.read_u32(sd_pa, sd_offsets.sd_balance_interval),
nr_balance_failed: mem.read_u32(sd_pa, sd_offsets.sd_nr_balance_failed),
newidle_call: sd_offsets
.sd_newidle_call
.map(|off| mem.read_u32(sd_pa, off)),
newidle_success: sd_offsets
.sd_newidle_success
.map(|off| mem.read_u32(sd_pa, off)),
newidle_ratio: sd_offsets
.sd_newidle_ratio
.map(|off| mem.read_u32(sd_pa, off)),
max_newidle_lb_cost: mem.read_u64(sd_pa, sd_offsets.sd_max_newidle_lb_cost),
stats,
};
domains.push(snap);
current_kva = mem.read_u64(sd_pa, sd_offsets.sd_parent);
}
Some(domains)
}
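/// Resolves the per-CPU physical addresses of the event-stats area:
/// dereference the `scx_root` pointer, read the percpu base pointer out of the
/// sched struct, then apply each CPU's percpu offset.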
pub(crate) fn resolve_event_pcpu_pas(
mem: &GuestMem,
scx_root_pa: u64,
ev: &ScxEventOffsets,
per_cpu_offsets: &[u64],
page_offset: u64,
) -> Option<Vec<u64>> {
let scx_sched_kva = mem.read_u64(scx_root_pa, 0);
if scx_sched_kva == 0 {
return None;
}
let scx_sched_pa = super::symbols::kva_to_pa(scx_sched_kva, page_offset);
let pcpu_kva = mem.read_u64(scx_sched_pa, ev.percpu_ptr_off);
if pcpu_kva == 0 {
return None;
}
let pas: Vec<u64> = per_cpu_offsets
.iter()
.map(|&cpu_off| super::symbols::kva_to_pa(pcpu_kva.wrapping_add(cpu_off), page_offset))
.collect();
Some(pas)
}
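/// pthread handles for the vCPU threads, used to sample per-thread CPU time
/// for preemption detection.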
pub(crate) struct VcpuTiming {
pub pthreads: Vec<libc::pthread_t>,
}
impl VcpuTiming {
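/// Samples each vCPU thread's accumulated CPU time in nanoseconds via its
/// pthread CPU clock. Failures yield `None` and are logged once per vCPU,
/// tracked through `reported_err`.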
fn read_cpu_times(&self, reported_err: &mut [bool]) -> Vec<Option<u64>> {
self.pthreads
.iter()
.enumerate()
.map(|(vcpu, &pt)| {
let mut clk: libc::clockid_t = 0;
let ret = unsafe { libc::pthread_getcpuclockid(pt, &mut clk) };
if ret != 0 {
if let Some(slot) = reported_err.get_mut(vcpu)
&& !*slot
{
tracing::warn!(
vcpu,
ret,
errno = std::io::Error::last_os_error().raw_os_error(),
"pthread_getcpuclockid failed; stall gating unavailable for this vCPU"
);
*slot = true;
}
return None;
}
let mut ts = libc::timespec {
tv_sec: 0,
tv_nsec: 0,
};
let ret = unsafe { libc::clock_gettime(clk, &mut ts) };
if ret != 0 {
if let Some(slot) = reported_err.get_mut(vcpu)
&& !*slot
{
tracing::warn!(
vcpu,
ret,
errno = std::io::Error::last_os_error().raw_os_error(),
"clock_gettime on pthread clock failed; stall gating unavailable for this vCPU"
);
*slot = true;
}
return None;
}
if ts.tv_sec < 0 || ts.tv_nsec < 0 {
if let Some(slot) = reported_err.get_mut(vcpu)
&& !*slot
{
tracing::warn!(
vcpu,
tv_sec = ts.tv_sec,
tv_nsec = ts.tv_nsec,
"negative clock_gettime result; stall gating unavailable for this vCPU"
);
*slot = true;
}
return None;
}
if let Some(slot) = reported_err.get_mut(vcpu) {
*slot = false;
}
Some(ts.tv_sec as u64 * 1_000_000_000 + ts.tv_nsec as u64)
})
.collect()
}
}
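/// A vCPU counts as preempted when it accrued less than `threshold_ns` of CPU
/// time between samples. Missing readings are never preempted; a non-monotonic
/// reading saturates to a zero delta and therefore counts as preempted (no
/// forward progress).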
pub(crate) fn evaluate_preempted(prev: Option<u64>, curr: Option<u64>, threshold_ns: u64) -> bool {
match (prev, curr) {
(Some(p), Some(c)) => c.saturating_sub(p) < threshold_ns,
_ => false,
}
}
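/// A CPU is stalled when its rq clock is nonzero and unchanged across samples,
/// it had runnable tasks in both samples, and the vCPU was not simply
/// preempted on the host.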
pub(crate) fn is_cpu_stalled(
prev: &super::CpuSnapshot,
curr: &super::CpuSnapshot,
preemption_threshold_ns: u64,
) -> bool {
if curr.rq_clock == 0 || curr.rq_clock != prev.rq_clock {
return false;
}
let idle = curr.nr_running == 0 && prev.nr_running == 0;
if idle {
return false;
}
let preempted = evaluate_preempted(
prev.vcpu_cpu_time_ns,
curr.vcpu_cpu_time_ns,
preemption_threshold_ns,
);
!preempted
}
pub(crate) struct DumpTrigger {
pub shm_base_pa: u64,
pub thresholds: super::MonitorThresholds,
}
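/// How to hold off the guest's sched_ext watchdog: either chase `scx_root` to
/// the live sched struct and patch its watchdog field, or write a static
/// global timeout directly.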
pub(crate) enum WatchdogOverride {
ScxSched {
scx_root_pa: u64,
watchdog_offset: usize,
jiffies: u64,
page_offset: u64,
},
StaticGlobal {
watchdog_timeout_pa: u64,
jiffies: u64,
},
}
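/// Everything needed to sample BPF program runtime stats each iteration.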
pub(crate) struct ProgStatsCtx {
pub cached: Vec<super::bpf_prog::CachedProgInfo>,
pub per_cpu_offsets: Vec<u64>,
pub page_offset: u64,
pub offsets: super::btf_offsets::BpfProgOffsets,
}
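/// Output of `monitor_loop`: collected samples, drained shm entries, and the
/// first watchdog write-back observation (if any).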
pub(crate) struct MonitorLoopResult {
pub(crate) samples: Vec<MonitorSample>,
pub(crate) drain: crate::vmm::shm_ring::ShmDrainResult,
pub(crate) watchdog_observation: Option<super::WatchdogObservation>,
}
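/// Optional monitor features; each `None` (or a zero threshold) disables the
/// corresponding behavior.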
pub(crate) struct MonitorConfig<'a> {
pub event_pcpu_pas: Option<&'a [u64]>,
pub dump_trigger: Option<&'a DumpTrigger>,
pub watchdog_override: Option<&'a WatchdogOverride>,
pub vcpu_timing: Option<&'a VcpuTiming>,
pub preemption_threshold_ns: u64,
pub shm_base_pa: Option<u64>,
pub prog_stats_ctx: Option<&'a ProgStatsCtx>,
pub page_offset: u64,
}
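/// Polls the runqueues every `interval` until `kill` is set: re-arms the
/// watchdog override, snapshots per-CPU state with any configured overlays,
/// evaluates the dump trigger, and drains the shm ring. A CRC-valid sched-exit
/// message on the ring ends the loop early.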
pub(crate) fn monitor_loop(
mem: &GuestMem,
rq_pas: &[u64],
offsets: &KernelOffsets,
interval: Duration,
kill: &AtomicBool,
run_start: Instant,
cfg: &MonitorConfig<'_>,
) -> MonitorLoopResult {
let event_pcpu_pas = cfg.event_pcpu_pas;
let dump_trigger = cfg.dump_trigger;
let watchdog_override = cfg.watchdog_override;
let vcpu_timing = cfg.vcpu_timing;
let preemption_threshold_ns = cfg.preemption_threshold_ns;
let shm_base_pa = cfg.shm_base_pa;
let prog_stats_ctx = cfg.prog_stats_ctx;
let page_offset = cfg.page_offset;
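// A zero threshold means "unset"; fall back to the derived default.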
let preemption_threshold_ns = if preemption_threshold_ns > 0 {
preemption_threshold_ns
} else {
super::vcpu_preemption_threshold_ns(None)
};
let mut samples: Vec<MonitorSample> = Vec::new();
let mut imbalance_tracker = super::SustainedViolationTracker::default();
let mut dsq_tracker = super::SustainedViolationTracker::default();
let mut stall_trackers: Vec<super::SustainedViolationTracker> =
vec![super::SustainedViolationTracker::default(); rq_pas.len()];
let mut dump_requested = false;
let mut cpus: Vec<CpuSnapshot> = Vec::with_capacity(rq_pas.len());
let mut vcpu_timing_err_reported: Vec<bool> = vcpu_timing
.map(|vt| vec![false; vt.pthreads.len()])
.unwrap_or_default();
let mut shm_entries: Vec<crate::vmm::shm_ring::ShmEntry> = Vec::new();
let mut shm_drops: u64 = 0;
let mut watchdog_observation: Option<super::WatchdogObservation> = None;
loop {
if kill.load(Ordering::Acquire) {
break;
}
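// Re-arm the guest watchdog override, if configured, and record the first
// read-back as the watchdog observation.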
if let Some(wd) = watchdog_override {
let (write_pa, write_offset, wd_jiffies) = match wd {
WatchdogOverride::ScxSched {
scx_root_pa,
watchdog_offset,
jiffies,
page_offset,
} => {
let sch_kva = mem.read_u64(*scx_root_pa, 0);
if sch_kva == 0 {
(None, 0, *jiffies)
} else {
let sch_pa = super::symbols::kva_to_pa(sch_kva, *page_offset);
(Some(sch_pa), *watchdog_offset, *jiffies)
}
}
WatchdogOverride::StaticGlobal {
watchdog_timeout_pa,
jiffies,
} => (Some(*watchdog_timeout_pa), 0, *jiffies),
};
if let Some(pa) = write_pa {
mem.write_u64(pa, write_offset, wd_jiffies);
if watchdog_observation.is_none() {
let observed = mem.read_u64(pa, write_offset);
watchdog_observation = Some(super::WatchdogObservation {
expected_jiffies: wd_jiffies,
observed_jiffies: observed,
});
}
}
}
cpus.clear();
cpus.extend(rq_pas.iter().map(|&pa| read_rq_stats(mem, pa, offsets)));
if let (Some(pcpu_pas), Some(ev)) = (event_pcpu_pas, &offsets.event_offsets) {
for (i, cpu) in cpus.iter_mut().enumerate() {
if let Some(&pcpu_pa) = pcpu_pas.get(i) {
cpu.event_counters = Some(read_event_stats(mem, pcpu_pa, ev));
}
}
}
if let Some(ss) = &offsets.schedstat_offsets {
for (i, cpu) in cpus.iter_mut().enumerate() {
if let Some(&rq_pa) = rq_pas.get(i) {
cpu.schedstat = Some(read_rq_schedstat(mem, rq_pa, ss));
}
}
}
if let Some(sd) = &offsets.sched_domain_offsets {
for (i, cpu) in cpus.iter_mut().enumerate() {
if let Some(&rq_pa) = rq_pas.get(i) {
cpu.sched_domains = read_sched_domain_tree(mem, rq_pa, sd, page_offset);
}
}
}
if let Some(vt) = vcpu_timing {
let times = vt.read_cpu_times(&mut vcpu_timing_err_reported);
for (i, cpu) in cpus.iter_mut().enumerate() {
if let Some(&t) = times.get(i) {
cpu.vcpu_cpu_time_ns = t;
}
}
}
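// Reactive dump trigger: track sustained imbalance, DSQ-depth, and stall
// violations, and request a sysrq dump over shm once any is sustained.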
if let Some(trigger) = dump_trigger
&& !dump_requested
&& !cpus.is_empty()
{
let t = &trigger.thresholds;
let sample_idx = samples.len();
let tmp_sample = MonitorSample {
elapsed_ms: 0,
cpus: cpus.clone(),
prog_stats: None,
};
let ratio = tmp_sample.imbalance_ratio();
imbalance_tracker.record(ratio > t.max_imbalance_ratio, ratio, sample_idx);
let worst_dsq = cpus.iter().map(|c| c.local_dsq_depth).max().unwrap_or(0);
dsq_tracker.record(
worst_dsq > t.max_local_dsq_depth,
worst_dsq as f64,
sample_idx,
);
if t.fail_on_stall
&& let Some(prev) = samples.last()
{
let n = prev.cpus.len().min(cpus.len()).min(stall_trackers.len());
for i in 0..n {
let is_stall = is_cpu_stalled(&prev.cpus[i], &cpus[i], preemption_threshold_ns);
stall_trackers[i].record(is_stall, cpus[i].rq_clock as f64, sample_idx);
}
}
let sustained = imbalance_tracker.sustained(t.sustained_samples)
|| dsq_tracker.sustained(t.sustained_samples)
|| stall_trackers
.iter()
.any(|s| s.sustained(t.sustained_samples));
if sustained {
mem.write_u8(
trigger.shm_base_pa,
crate::vmm::shm_ring::DUMP_REQ_OFFSET,
crate::vmm::shm_ring::DUMP_REQ_SYSRQ_D,
);
dump_requested = true;
}
}
let prog_stats = prog_stats_ctx.map(|ctx| {
super::bpf_prog::read_prog_runtime_stats(
mem,
&ctx.cached,
&ctx.per_cpu_offsets,
ctx.page_offset,
&ctx.offsets,
)
});
samples.push(MonitorSample {
elapsed_ms: run_start.elapsed().as_millis() as u64,
cpus: cpus.clone(),
prog_stats,
});
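// Drain the guest-to-host shm ring; a CRC-valid sched-exit message ends
// monitoring immediately.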
if let Some(shm_pa) = shm_base_pa {
let drain = crate::vmm::shm_ring::shm_drain_live(mem, shm_pa);
shm_drops = shm_drops.max(drain.drops);
if drain
.entries
.iter()
.any(|e| e.msg_type == crate::vmm::shm_ring::MSG_TYPE_SCHED_EXIT && e.crc_ok)
{
shm_entries.extend(drain.entries);
kill.store(true, Ordering::Release);
break;
}
shm_entries.extend(drain.entries);
}
std::thread::sleep(interval);
}
let shm_result = crate::vmm::shm_ring::ShmDrainResult {
entries: shm_entries,
drops: shm_drops,
};
MonitorLoopResult {
samples,
drain: shm_result,
watchdog_observation,
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::os::unix::thread::JoinHandleExt;
const THRESHOLD_NS: u64 = 10_000_000;
#[test]
fn evaluate_preempted_both_none_is_not_preempted() {
assert!(!evaluate_preempted(None, None, THRESHOLD_NS));
}
#[test]
fn evaluate_preempted_first_read_failed_is_not_preempted() {
assert!(!evaluate_preempted(None, Some(1_000_000_000), THRESHOLD_NS));
}
#[test]
fn evaluate_preempted_current_read_failed_is_not_preempted() {
assert!(!evaluate_preempted(Some(1_000_000_000), None, THRESHOLD_NS));
}
#[test]
fn evaluate_preempted_delta_below_threshold_is_preempted() {
assert!(evaluate_preempted(
Some(1_000_000_000),
Some(1_001_000_000),
THRESHOLD_NS,
));
}
#[test]
fn evaluate_preempted_delta_at_threshold_is_not_preempted() {
assert!(!evaluate_preempted(
Some(1_000_000_000),
Some(1_010_000_000),
THRESHOLD_NS,
));
}
#[test]
fn evaluate_preempted_delta_above_threshold_is_not_preempted() {
assert!(!evaluate_preempted(
Some(1_000_000_000),
Some(2_000_000_000),
THRESHOLD_NS,
));
}
#[test]
fn evaluate_preempted_non_monotonic_treated_as_no_progress() {
assert!(evaluate_preempted(
Some(1_000_000_000),
Some(999_000_000),
THRESHOLD_NS,
));
}
#[test]
fn evaluate_preempted_zero_threshold_never_preempted() {
assert!(!evaluate_preempted(Some(100), Some(100), 0));
assert!(!evaluate_preempted(Some(100), Some(200), 0));
}
fn test_config() -> MonitorConfig<'static> {
MonitorConfig {
event_pcpu_pas: None,
dump_trigger: None,
watchdog_override: None,
vcpu_timing: None,
preemption_threshold_ns: 0,
shm_base_pa: None,
prog_stats_ctx: None,
page_offset: 0,
}
}
fn test_offsets() -> KernelOffsets {
KernelOffsets {
rq_nr_running: 8,
rq_clock: 16,
rq_scx: 100,
scx_rq_nr_running: 4,
scx_rq_local_dsq: 20,
scx_rq_flags: 8,
dsq_nr: 0,
event_offsets: None,
schedstat_offsets: None,
sched_domain_offsets: None,
watchdog_offsets: None,
}
}
fn make_rq_buffer(
offsets: &KernelOffsets,
nr_running: u32,
scx_nr: u32,
dsq_nr: u32,
clock: u64,
flags: u32,
) -> Vec<u8> {
let size = offsets.rq_scx + offsets.scx_rq_local_dsq + offsets.dsq_nr + 8;
let mut buf = vec![0u8; size];
buf[offsets.rq_nr_running..offsets.rq_nr_running + 4]
.copy_from_slice(&nr_running.to_ne_bytes());
buf[offsets.rq_clock..offsets.rq_clock + 8].copy_from_slice(&clock.to_ne_bytes());
let scx_base = offsets.rq_scx;
buf[scx_base + offsets.scx_rq_nr_running..scx_base + offsets.scx_rq_nr_running + 4]
.copy_from_slice(&scx_nr.to_ne_bytes());
buf[scx_base + offsets.scx_rq_flags..scx_base + offsets.scx_rq_flags + 4]
.copy_from_slice(&flags.to_ne_bytes());
let dsq_base = scx_base + offsets.scx_rq_local_dsq;
buf[dsq_base + offsets.dsq_nr..dsq_base + offsets.dsq_nr + 4]
.copy_from_slice(&dsq_nr.to_ne_bytes());
buf
}
#[test]
fn read_rq_stats_known_values() {
let offsets = test_offsets();
let buf = make_rq_buffer(&offsets, 5, 3, 7, 999_000, 0x1);
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
let snap = read_rq_stats(&mem, 0, &offsets);
assert_eq!(snap.nr_running, 5);
assert_eq!(snap.scx_nr_running, 3);
assert_eq!(snap.local_dsq_depth, 7);
assert_eq!(snap.rq_clock, 999_000);
assert_eq!(snap.scx_flags, 0x1);
}
#[test]
fn read_rq_stats_all_zeros() {
let offsets = test_offsets();
let buf = make_rq_buffer(&offsets, 0, 0, 0, 0, 0);
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
let snap = read_rq_stats(&mem, 0, &offsets);
assert_eq!(snap.nr_running, 0);
assert_eq!(snap.scx_nr_running, 0);
assert_eq!(snap.local_dsq_depth, 0);
assert_eq!(snap.rq_clock, 0);
assert_eq!(snap.scx_flags, 0);
}
#[test]
fn read_rq_stats_max_values() {
let offsets = test_offsets();
let buf = make_rq_buffer(&offsets, u32::MAX, u32::MAX, u32::MAX, u64::MAX, u32::MAX);
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
let snap = read_rq_stats(&mem, 0, &offsets);
assert_eq!(snap.nr_running, u32::MAX);
assert_eq!(snap.scx_nr_running, u32::MAX);
assert_eq!(snap.local_dsq_depth, u32::MAX);
assert_eq!(snap.rq_clock, u64::MAX);
assert_eq!(snap.scx_flags, u32::MAX);
}
#[test]
fn read_u32_out_of_bounds() {
let buf = [0xFFu8; 8];
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
assert_eq!(mem.read_u32(6, 0), 0);
assert_eq!(mem.read_u32(4, 0), u32::from_ne_bytes([0xFF; 4]));
assert_eq!(mem.read_u32(5, 0), 0);
}
#[test]
fn read_u64_out_of_bounds() {
let buf = [0xFFu8; 16];
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
assert_eq!(mem.read_u64(10, 0), 0);
assert_eq!(mem.read_u64(8, 0), u64::from_ne_bytes([0xFF; 8]));
assert_eq!(mem.read_u64(9, 0), 0);
}
#[test]
fn monitor_loop_kill_immediately() {
let offsets = test_offsets();
let buf = make_rq_buffer(&offsets, 1, 1, 1, 100, 0);
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
let kill = AtomicBool::new(true);
let MonitorLoopResult { samples, .. } = monitor_loop(
&mem,
&[0],
&offsets,
Duration::from_millis(10),
&kill,
Instant::now(),
&test_config(),
);
assert!(samples.is_empty());
}
#[test]
fn monitor_loop_one_iteration() {
let offsets = test_offsets();
let buf = make_rq_buffer(&offsets, 2, 1, 3, 500, 0);
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
let kill = std::sync::Arc::new(AtomicBool::new(false));
let handle = {
let kill = std::sync::Arc::clone(&kill);
std::thread::spawn(move || {
std::thread::sleep(Duration::from_millis(50));
kill.store(true, Ordering::Release);
})
};
let MonitorLoopResult { samples, .. } = monitor_loop(
&mem,
&[0],
&offsets,
Duration::from_millis(10),
&kill,
Instant::now(),
&test_config(),
);
handle.join().unwrap();
assert!(!samples.is_empty());
assert_eq!(samples[0].cpus.len(), 1);
assert_eq!(samples[0].cpus[0].nr_running, 2);
assert_eq!(samples[0].cpus[0].scx_nr_running, 1);
assert_eq!(samples[0].cpus[0].local_dsq_depth, 3);
assert_eq!(samples[0].cpus[0].rq_clock, 500);
}
#[test]
fn two_cpu_independent_reads() {
let offsets = test_offsets();
let buf0 = make_rq_buffer(&offsets, 10, 5, 2, 1000, 0x1);
let buf1 = make_rq_buffer(&offsets, 20, 15, 8, 2000, 0x2);
let pa1 = buf0.len() as u64;
let mut combined = buf0;
combined.extend_from_slice(&buf1);
let mem = unsafe { GuestMem::new(combined.as_ptr() as *mut u8, combined.len() as u64) };
let snap0 = read_rq_stats(&mem, 0, &offsets);
let snap1 = read_rq_stats(&mem, pa1, &offsets);
assert_eq!(snap0.nr_running, 10);
assert_eq!(snap0.scx_nr_running, 5);
assert_eq!(snap0.local_dsq_depth, 2);
assert_eq!(snap0.rq_clock, 1000);
assert_eq!(snap0.scx_flags, 0x1);
assert_eq!(snap1.nr_running, 20);
assert_eq!(snap1.scx_nr_running, 15);
assert_eq!(snap1.local_dsq_depth, 8);
assert_eq!(snap1.rq_clock, 2000);
assert_eq!(snap1.scx_flags, 0x2);
}
#[test]
fn read_u32_nonzero_pa_and_offset() {
let mut buf = [0u8; 32];
buf[20..24].copy_from_slice(&0xDEADBEEFu32.to_ne_bytes());
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
assert_eq!(mem.read_u32(12, 8), 0xDEADBEEF);
}
#[test]
fn read_u64_nonzero_pa_and_offset() {
let mut buf = [0u8; 32];
buf[16..24].copy_from_slice(&0x0123456789ABCDEFu64.to_ne_bytes());
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
assert_eq!(mem.read_u64(10, 6), 0x0123456789ABCDEF);
}
#[test]
fn monitor_loop_multi_cpu() {
let offsets = test_offsets();
let buf0 = make_rq_buffer(&offsets, 3, 2, 1, 100, 0);
let buf1 = make_rq_buffer(&offsets, 7, 5, 4, 200, 0);
let pa1 = buf0.len() as u64;
let mut combined = buf0;
combined.extend_from_slice(&buf1);
let mem = unsafe { GuestMem::new(combined.as_ptr() as *mut u8, combined.len() as u64) };
let kill = std::sync::Arc::new(AtomicBool::new(false));
let handle = {
let kill = std::sync::Arc::clone(&kill);
std::thread::spawn(move || {
std::thread::sleep(Duration::from_millis(50));
kill.store(true, Ordering::Release);
})
};
let MonitorLoopResult { samples, .. } = monitor_loop(
&mem,
&[0, pa1],
&offsets,
Duration::from_millis(10),
&kill,
Instant::now(),
&test_config(),
);
handle.join().unwrap();
assert!(!samples.is_empty());
for s in &samples {
assert_eq!(s.cpus.len(), 2);
}
assert_eq!(samples[0].cpus[0].nr_running, 3);
assert_eq!(samples[0].cpus[0].scx_nr_running, 2);
assert_eq!(samples[0].cpus[1].nr_running, 7);
assert_eq!(samples[0].cpus[1].scx_nr_running, 5);
}
#[test]
fn monitor_loop_elapsed_ms_progresses() {
let offsets = test_offsets();
let buf = make_rq_buffer(&offsets, 1, 1, 1, 100, 0);
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
let kill = std::sync::Arc::new(AtomicBool::new(false));
let handle = {
let kill = std::sync::Arc::clone(&kill);
std::thread::spawn(move || {
std::thread::sleep(Duration::from_millis(200));
kill.store(true, Ordering::Release);
})
};
let MonitorLoopResult { samples, .. } = monitor_loop(
&mem,
&[0],
&offsets,
Duration::from_millis(10),
&kill,
Instant::now(),
&test_config(),
);
handle.join().unwrap();
assert!(
samples.len() >= 2,
"need at least 2 samples, got {}",
samples.len()
);
for w in samples.windows(2) {
assert!(
w[1].elapsed_ms >= w[0].elapsed_ms,
"elapsed_ms went backwards: {} -> {}",
w[0].elapsed_ms,
w[1].elapsed_ms
);
}
assert!(samples.last().unwrap().elapsed_ms > 0);
}
fn test_event_offsets() -> ScxEventOffsets {
ScxEventOffsets {
percpu_ptr_off: 0,
event_stats_off: 0,
ev_select_cpu_fallback: 0,
ev_dispatch_local_dsq_offline: 8,
ev_dispatch_keep_last: 16,
ev_enq_skip_exiting: 24,
ev_enq_skip_migration_disabled: 32,
ev_reenq_immed: None,
ev_reenq_local_repeat: None,
ev_refill_slice_dfl: None,
ev_bypass_duration: None,
ev_bypass_dispatch: None,
ev_bypass_activate: None,
ev_insert_not_owned: None,
ev_sub_bypass_dispatch: None,
}
}
fn make_event_stats_buffer(
ev: &ScxEventOffsets,
fallback: i64,
offline: i64,
keep_last: i64,
skip_exit: i64,
skip_mig: i64,
) -> Vec<u8> {
let size = ev.event_stats_off + ev.ev_enq_skip_migration_disabled + 8;
let mut buf = vec![0u8; size];
let base = ev.event_stats_off;
buf[base + ev.ev_select_cpu_fallback..base + ev.ev_select_cpu_fallback + 8]
.copy_from_slice(&fallback.to_ne_bytes());
buf[base + ev.ev_dispatch_local_dsq_offline..base + ev.ev_dispatch_local_dsq_offline + 8]
.copy_from_slice(&offline.to_ne_bytes());
buf[base + ev.ev_dispatch_keep_last..base + ev.ev_dispatch_keep_last + 8]
.copy_from_slice(&keep_last.to_ne_bytes());
buf[base + ev.ev_enq_skip_exiting..base + ev.ev_enq_skip_exiting + 8]
.copy_from_slice(&skip_exit.to_ne_bytes());
buf[base + ev.ev_enq_skip_migration_disabled..base + ev.ev_enq_skip_migration_disabled + 8]
.copy_from_slice(&skip_mig.to_ne_bytes());
buf
}
#[test]
fn read_event_stats_known_values() {
let ev = test_event_offsets();
let buf = make_event_stats_buffer(&ev, 42, 7, 100, 3, 5);
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
let stats = read_event_stats(&mem, 0, &ev);
assert_eq!(stats.select_cpu_fallback, 42);
assert_eq!(stats.dispatch_local_dsq_offline, 7);
assert_eq!(stats.dispatch_keep_last, 100);
assert_eq!(stats.enq_skip_exiting, 3);
assert_eq!(stats.enq_skip_migration_disabled, 5);
}
#[test]
fn read_event_stats_zeros() {
let ev = test_event_offsets();
let buf = make_event_stats_buffer(&ev, 0, 0, 0, 0, 0);
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
let stats = read_event_stats(&mem, 0, &ev);
assert_eq!(stats.select_cpu_fallback, 0);
assert_eq!(stats.dispatch_local_dsq_offline, 0);
}
#[test]
fn read_event_stats_optional_fields() {
let mut ev = test_event_offsets();
ev.ev_bypass_activate = Some(40);
let mut buf = [0u8; 48];
let val: i64 = 999;
buf[40..48].copy_from_slice(&val.to_ne_bytes());
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
let stats = read_event_stats(&mem, 0, &ev);
assert_eq!(stats.bypass_activate, 999);
assert_eq!(stats.reenq_immed, 0);
assert_eq!(stats.bypass_duration, 0);
assert_eq!(stats.sub_bypass_dispatch, 0);
}
#[test]
fn read_i64_roundtrip() {
let val: i64 = -12345;
let buf = val.to_ne_bytes();
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
assert_eq!(mem.read_i64(0, 0), -12345);
}
#[test]
fn write_u8_and_read_u8() {
let mut buf = [0u8; 16];
let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
mem.write_u8(0, 5, 0xAB);
assert_eq!(mem.read_u8(0, 5), 0xAB);
assert_eq!(buf[5], 0xAB);
}
#[test]
fn write_u8_out_of_bounds() {
let mut buf = [0u8; 4];
let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
mem.write_u8(4, 0, 0xFF);
assert_eq!(buf, [0u8; 4]);
}
#[test]
fn write_u64_and_read_u64() {
let mut buf = [0u8; 32];
let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
mem.write_u64(0, 8, 0xDEAD_BEEF_CAFE_1234);
assert_eq!(mem.read_u64(0, 8), 0xDEAD_BEEF_CAFE_1234);
assert_eq!(
u64::from_ne_bytes(buf[8..16].try_into().unwrap()),
0xDEAD_BEEF_CAFE_1234
);
}
#[test]
fn write_u64_out_of_bounds() {
let mut buf = [0u8; 8];
let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
mem.write_u64(1, 0, 0xFF);
assert_eq!(buf, [0u8; 8]);
}
#[test]
fn write_u64_at_boundary() {
let mut buf = [0u8; 16];
let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
mem.write_u64(8, 0, 0x0123_4567_89AB_CDEF);
assert_eq!(mem.read_u64(8, 0), 0x0123_4567_89AB_CDEF);
}
#[test]
fn read_u8_out_of_bounds() {
let buf = [0xFFu8; 4];
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
assert_eq!(mem.read_u8(4, 0), 0);
assert_eq!(mem.read_u8(3, 0), 0xFF);
}
#[test]
fn read_rq_stats_has_no_event_counters() {
let offsets = test_offsets();
let buf = make_rq_buffer(&offsets, 1, 1, 1, 100, 0);
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
let snap = read_rq_stats(&mem, 0, &offsets);
assert!(snap.event_counters.is_none());
}
#[test]
fn monitor_loop_with_event_counters() {
let ev = test_event_offsets();
let mut offsets = test_offsets();
offsets.event_offsets = Some(ev.clone());
let rq_buf = make_rq_buffer(&offsets, 1, 1, 1, 100, 0);
let ev_buf = make_event_stats_buffer(&ev, 10, 20, 30, 40, 50);
let rq_pa = 0u64;
let ev_pa = rq_buf.len() as u64;
let mut combined = rq_buf;
combined.extend_from_slice(&ev_buf);
let mem = unsafe { GuestMem::new(combined.as_ptr() as *mut u8, combined.len() as u64) };
let kill = std::sync::Arc::new(AtomicBool::new(false));
let handle = {
let kill = std::sync::Arc::clone(&kill);
std::thread::spawn(move || {
std::thread::sleep(Duration::from_millis(30));
kill.store(true, Ordering::Release);
})
};
let ev_pas = vec![ev_pa];
let cfg = MonitorConfig {
event_pcpu_pas: Some(&ev_pas),
..test_config()
};
let MonitorLoopResult { samples, .. } = monitor_loop(
&mem,
&[rq_pa],
&offsets,
Duration::from_millis(10),
&kill,
Instant::now(),
&cfg,
);
handle.join().unwrap();
assert!(!samples.is_empty());
let counters = samples[0].cpus[0].event_counters.as_ref().unwrap();
assert_eq!(counters.select_cpu_fallback, 10);
assert_eq!(counters.dispatch_local_dsq_offline, 20);
assert_eq!(counters.dispatch_keep_last, 30);
assert_eq!(counters.enq_skip_exiting, 40);
assert_eq!(counters.enq_skip_migration_disabled, 50);
}
#[test]
fn monitor_loop_no_event_counters_when_none() {
let offsets = test_offsets();
let buf = make_rq_buffer(&offsets, 1, 1, 1, 100, 0);
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
let kill = std::sync::Arc::new(AtomicBool::new(false));
let handle = {
let kill = std::sync::Arc::clone(&kill);
std::thread::spawn(move || {
std::thread::sleep(Duration::from_millis(30));
kill.store(true, Ordering::Release);
})
};
let MonitorLoopResult { samples, .. } = monitor_loop(
&mem,
&[0],
&offsets,
Duration::from_millis(10),
&kill,
Instant::now(),
&test_config(),
);
handle.join().unwrap();
assert!(!samples.is_empty());
assert!(samples[0].cpus[0].event_counters.is_none());
}
#[test]
fn resolve_event_pcpu_pas_null_scx_root() {
let ev = test_event_offsets();
let buf = [0u8; 64];
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
let result = resolve_event_pcpu_pas(&mem, 0, &ev, &[0, 0x4000], 0);
assert!(result.is_none());
}
#[test]
fn monitor_loop_with_watchdog_override() {
let offsets = test_offsets();
let rq_buf = make_rq_buffer(&offsets, 1, 1, 1, 100, 0);
let scx_root_pa = rq_buf.len() as u64;
let sch_pa = scx_root_pa + 8;
let watchdog_offset: usize = 16;
let page_offset = super::super::symbols::DEFAULT_PAGE_OFFSET;
let scx_sched_kva = page_offset.wrapping_add(sch_pa);
let mut combined = rq_buf;
combined.extend_from_slice(&scx_sched_kva.to_ne_bytes());
combined.extend_from_slice(&[0u8; 64]);
let mem = unsafe { GuestMem::new(combined.as_mut_ptr(), combined.len() as u64) };
let kill = std::sync::Arc::new(AtomicBool::new(false));
let wd = WatchdogOverride::ScxSched {
scx_root_pa,
watchdog_offset,
jiffies: 99999,
page_offset,
};
let handle = {
let kill = std::sync::Arc::clone(&kill);
std::thread::spawn(move || {
std::thread::sleep(Duration::from_millis(30));
kill.store(true, Ordering::Release);
})
};
let cfg = MonitorConfig {
watchdog_override: Some(&wd),
..test_config()
};
let MonitorLoopResult {
samples,
watchdog_observation,
..
} = monitor_loop(
&mem,
&[0],
&offsets,
Duration::from_millis(10),
&kill,
Instant::now(),
&cfg,
);
handle.join().unwrap();
assert!(!samples.is_empty());
let write_pa = sch_pa as usize + watchdog_offset;
let written = u64::from_ne_bytes(combined[write_pa..write_pa + 8].try_into().unwrap());
assert_eq!(written, 99999);
let obs = watchdog_observation.expect("watchdog_observation should be Some after write");
assert_eq!(obs.expected_jiffies, 99999);
assert_eq!(obs.observed_jiffies, 99999);
}
#[test]
fn monitor_loop_watchdog_override_skipped_when_scx_root_null() {
let offsets = test_offsets();
let rq_buf = make_rq_buffer(&offsets, 1, 1, 1, 100, 0);
let scx_root_pa = rq_buf.len() as u64;
let mut combined = rq_buf;
combined.extend_from_slice(&[0u8; 8]);
combined.extend_from_slice(&[0u8; 128]);
let mem = unsafe { GuestMem::new(combined.as_mut_ptr(), combined.len() as u64) };
let kill = std::sync::Arc::new(AtomicBool::new(false));
let wd = WatchdogOverride::ScxSched {
scx_root_pa,
watchdog_offset: 16,
jiffies: 0xDEADBEEF,
page_offset: super::super::symbols::DEFAULT_PAGE_OFFSET,
};
let handle = {
let kill = std::sync::Arc::clone(&kill);
std::thread::spawn(move || {
std::thread::sleep(Duration::from_millis(30));
kill.store(true, Ordering::Release);
})
};
let cfg = MonitorConfig {
watchdog_override: Some(&wd),
..test_config()
};
let MonitorLoopResult {
watchdog_observation,
..
} = monitor_loop(
&mem,
&[0],
&offsets,
Duration::from_millis(10),
&kill,
Instant::now(),
&cfg,
);
handle.join().unwrap();
assert!(
combined[scx_root_pa as usize..].iter().all(|&b| b == 0),
"no write should occur when scx_root is null"
);
assert!(
watchdog_observation.is_none(),
"watchdog_observation should be None when scx_root is null"
);
}
#[test]
fn monitor_loop_watchdog_static_global_writes_directly() {
let offsets = test_offsets();
let rq_buf = make_rq_buffer(&offsets, 1, 1, 1, 100, 0);
let watchdog_pa = rq_buf.len() as u64;
let mut combined = rq_buf;
combined.extend_from_slice(&[0u8; 8]);
let mem = unsafe { GuestMem::new(combined.as_mut_ptr(), combined.len() as u64) };
let kill = std::sync::Arc::new(AtomicBool::new(false));
let wd = WatchdogOverride::StaticGlobal {
watchdog_timeout_pa: watchdog_pa,
jiffies: 77777,
};
let handle = {
let kill = std::sync::Arc::clone(&kill);
std::thread::spawn(move || {
std::thread::sleep(Duration::from_millis(30));
kill.store(true, Ordering::Release);
})
};
let cfg = MonitorConfig {
watchdog_override: Some(&wd),
..test_config()
};
let MonitorLoopResult {
samples,
watchdog_observation,
..
} = monitor_loop(
&mem,
&[0],
&offsets,
Duration::from_millis(10),
&kill,
Instant::now(),
&cfg,
);
handle.join().unwrap();
assert!(!samples.is_empty());
let written = u64::from_ne_bytes(
combined[watchdog_pa as usize..watchdog_pa as usize + 8]
.try_into()
.unwrap(),
);
assert_eq!(written, 77777);
let obs = watchdog_observation.expect("watchdog_observation should be Some");
assert_eq!(obs.expected_jiffies, 77777);
assert_eq!(obs.observed_jiffies, 77777);
}
#[test]
fn monitor_loop_dump_trigger_fires_on_imbalance() {
let offsets = test_offsets();
let buf0 = make_rq_buffer(&offsets, 1, 1, 1, 100, 0);
let buf1 = make_rq_buffer(&offsets, 20, 20, 1, 200, 0);
let pa1 = buf0.len() as u64;
let mut combined = buf0;
combined.extend_from_slice(&buf1);
let shm_pa = combined.len() as u64;
combined.extend(vec![0u8; 64]);
let mem = unsafe { GuestMem::new(combined.as_mut_ptr(), combined.len() as u64) };
let kill = std::sync::Arc::new(AtomicBool::new(false));
let trigger = DumpTrigger {
shm_base_pa: shm_pa,
thresholds: super::super::MonitorThresholds {
max_imbalance_ratio: 2.0,
sustained_samples: 2,
fail_on_stall: false,
..Default::default()
},
};
let handle = {
let kill = std::sync::Arc::clone(&kill);
std::thread::spawn(move || {
std::thread::sleep(Duration::from_millis(200));
kill.store(true, Ordering::Release);
})
};
let cfg = MonitorConfig {
dump_trigger: Some(&trigger),
..test_config()
};
let MonitorLoopResult { samples, .. } = monitor_loop(
&mem,
&[0, pa1],
&offsets,
Duration::from_millis(10),
&kill,
Instant::now(),
&cfg,
);
handle.join().unwrap();
assert!(!samples.is_empty());
let dump_byte = combined[shm_pa as usize + crate::vmm::shm_ring::DUMP_REQ_OFFSET];
assert_eq!(
dump_byte,
crate::vmm::shm_ring::DUMP_REQ_SYSRQ_D,
"dump request should have been written to SHM"
);
}
#[test]
fn monitor_loop_dump_trigger_stall_with_sustained_window() {
let offsets = test_offsets();
let buf = make_rq_buffer(&offsets, 2, 1, 1, 5000, 0);
let shm_pa = buf.len() as u64;
let mut combined = buf;
combined.extend(vec![0u8; 64]);
let mem = unsafe { GuestMem::new(combined.as_mut_ptr(), combined.len() as u64) };
let kill = std::sync::Arc::new(AtomicBool::new(false));
let trigger = DumpTrigger {
shm_base_pa: shm_pa,
thresholds: super::super::MonitorThresholds {
max_imbalance_ratio: 100.0,
max_local_dsq_depth: 10000,
fail_on_stall: true,
sustained_samples: 2,
..Default::default()
},
};
let handle = {
let kill = std::sync::Arc::clone(&kill);
std::thread::spawn(move || {
std::thread::sleep(Duration::from_millis(200));
kill.store(true, Ordering::Release);
})
};
let cfg = MonitorConfig {
dump_trigger: Some(&trigger),
..test_config()
};
let MonitorLoopResult { samples, .. } = monitor_loop(
&mem,
&[0],
&offsets,
Duration::from_millis(10),
&kill,
Instant::now(),
&cfg,
);
handle.join().unwrap();
assert!(
samples.len() >= 3,
"need >= 3 samples for 2 stall pairs, got {}",
samples.len()
);
let dump_byte = combined[shm_pa as usize + crate::vmm::shm_ring::DUMP_REQ_OFFSET];
assert_eq!(
dump_byte,
crate::vmm::shm_ring::DUMP_REQ_SYSRQ_D,
"stall should trigger dump after sustained_samples=2"
);
}
#[test]
fn monitor_loop_dump_trigger_idle_cpu_no_stall() {
let offsets = test_offsets();
let buf = make_rq_buffer(&offsets, 0, 0, 0, 5000, 0);
let shm_pa = buf.len() as u64;
let mut combined = buf;
combined.extend(vec![0u8; 64]);
let mem = unsafe { GuestMem::new(combined.as_mut_ptr(), combined.len() as u64) };
let kill = std::sync::Arc::new(AtomicBool::new(false));
let trigger = DumpTrigger {
shm_base_pa: shm_pa,
thresholds: super::super::MonitorThresholds {
max_imbalance_ratio: 100.0,
max_local_dsq_depth: 10000,
fail_on_stall: true,
sustained_samples: 1,
..Default::default()
},
};
let handle = {
let kill = std::sync::Arc::clone(&kill);
std::thread::spawn(move || {
std::thread::sleep(Duration::from_millis(200));
kill.store(true, Ordering::Release);
})
};
let cfg = MonitorConfig {
dump_trigger: Some(&trigger),
..test_config()
};
let MonitorLoopResult { samples, .. } = monitor_loop(
&mem,
&[0],
&offsets,
Duration::from_millis(10),
&kill,
Instant::now(),
&cfg,
);
handle.join().unwrap();
assert!(
samples.len() >= 2,
"need >= 2 samples, got {}",
samples.len()
);
let dump_byte = combined[shm_pa as usize + crate::vmm::shm_ring::DUMP_REQ_OFFSET];
assert_eq!(dump_byte, 0, "idle CPU should not trigger stall dump");
}
#[test]
fn monitor_loop_vcpu_timing_preempted_no_stall() {
let offsets = test_offsets();
let buf = make_rq_buffer(&offsets, 2, 1, 1, 5000, 0);
let shm_pa = buf.len() as u64;
let mut combined = buf;
combined.extend(vec![0u8; 64]);
let mem = unsafe { GuestMem::new(combined.as_mut_ptr(), combined.len() as u64) };
let kill = std::sync::Arc::new(AtomicBool::new(false));
let sleeper_kill = std::sync::Arc::new(AtomicBool::new(false));
let sleeper_kill_clone = sleeper_kill.clone();
let sleeper = std::thread::Builder::new()
.name("vcpu-sleeper".into())
.spawn(move || {
while !sleeper_kill_clone.load(Ordering::Acquire) {
std::thread::sleep(Duration::from_millis(100));
}
})
.unwrap();
let pt = sleeper.as_pthread_t() as libc::pthread_t;
let vcpu_timing = VcpuTiming { pthreads: vec![pt] };
let trigger = DumpTrigger {
shm_base_pa: shm_pa,
thresholds: super::super::MonitorThresholds {
max_imbalance_ratio: 100.0,
max_local_dsq_depth: 10000,
fail_on_stall: true,
sustained_samples: 1,
..Default::default()
},
};
let handle = {
let kill = std::sync::Arc::clone(&kill);
std::thread::spawn(move || {
std::thread::sleep(Duration::from_millis(150));
kill.store(true, Ordering::Release);
})
};
let cfg = MonitorConfig {
dump_trigger: Some(&trigger),
vcpu_timing: Some(&vcpu_timing),
preemption_threshold_ns: 10_000_000,
..test_config()
};
let MonitorLoopResult { samples, .. } = monitor_loop(
&mem,
&[0],
&offsets,
Duration::from_millis(30),
&kill,
Instant::now(),
&cfg,
);
handle.join().unwrap();
sleeper_kill.store(true, Ordering::Release);
let _ = sleeper.join();
assert!(
samples.len() >= 2,
"need >= 2 samples, got {}",
samples.len()
);
let dump_byte = combined[shm_pa as usize + crate::vmm::shm_ring::DUMP_REQ_OFFSET];
assert_eq!(dump_byte, 0, "preempted vCPU should not trigger stall dump");
}
#[test]
fn monitor_loop_vcpu_timing_running_stall_fires() {
let offsets = test_offsets();
let buf = make_rq_buffer(&offsets, 2, 1, 1, 5000, 0);
let shm_pa = buf.len() as u64;
let mut combined = buf;
combined.extend(vec![0u8; 64]);
let mem = unsafe { GuestMem::new(combined.as_mut_ptr(), combined.len() as u64) };
let kill = std::sync::Arc::new(AtomicBool::new(false));
let spinner_kill = std::sync::Arc::new(AtomicBool::new(false));
let spinner_kill_clone = spinner_kill.clone();
let spinner = std::thread::Builder::new()
.name("vcpu-spinner".into())
.spawn(move || {
while !spinner_kill_clone.load(Ordering::Relaxed) {
std::hint::spin_loop();
}
})
.unwrap();
let pt = spinner.as_pthread_t() as libc::pthread_t;
let vcpu_timing = VcpuTiming { pthreads: vec![pt] };
let trigger = DumpTrigger {
shm_base_pa: shm_pa,
thresholds: super::super::MonitorThresholds {
max_imbalance_ratio: 100.0,
max_local_dsq_depth: 10000,
fail_on_stall: true,
sustained_samples: 2,
..Default::default()
},
};
let handle = {
let kill = std::sync::Arc::clone(&kill);
std::thread::spawn(move || {
std::thread::sleep(Duration::from_millis(200));
kill.store(true, Ordering::Release);
})
};
let cfg = MonitorConfig {
dump_trigger: Some(&trigger),
vcpu_timing: Some(&vcpu_timing),
preemption_threshold_ns: 10_000_000,
..test_config()
};
let MonitorLoopResult { samples, .. } = monitor_loop(
&mem,
&[0],
&offsets,
Duration::from_millis(30),
&kill,
Instant::now(),
&cfg,
);
handle.join().unwrap();
spinner_kill.store(true, Ordering::Release);
let _ = spinner.join();
assert!(
samples.len() >= 3,
"need >= 3 samples for 2 stall pairs, got {}",
samples.len()
);
let dump_byte = combined[shm_pa as usize + crate::vmm::shm_ring::DUMP_REQ_OFFSET];
assert_eq!(
dump_byte,
crate::vmm::shm_ring::DUMP_REQ_SYSRQ_D,
"real stall (vCPU running, clock stuck, nr_running>0) should trigger dump"
);
}
#[test]
fn reactive_and_evaluate_stall_consistency() {
let offsets = test_offsets();
let buf0 = make_rq_buffer(&offsets, 2, 1, 1, 5000, 0);
let buf1 = make_rq_buffer(&offsets, 1, 1, 1, 9000, 0);
let pa1 = buf0.len() as u64;
let mut combined = buf0;
combined.extend_from_slice(&buf1);
let shm_pa = combined.len() as u64;
combined.extend(vec![0u8; 64]);
let mem = unsafe { GuestMem::new(combined.as_mut_ptr(), combined.len() as u64) };
let kill = std::sync::Arc::new(AtomicBool::new(false));
let thresholds = super::super::MonitorThresholds {
max_imbalance_ratio: 100.0,
max_local_dsq_depth: 10000,
fail_on_stall: true,
sustained_samples: 2,
..Default::default()
};
let trigger = DumpTrigger {
shm_base_pa: shm_pa,
thresholds,
};
let handle = {
let kill = std::sync::Arc::clone(&kill);
std::thread::spawn(move || {
std::thread::sleep(Duration::from_millis(200));
kill.store(true, Ordering::Release);
})
};
let cfg = MonitorConfig {
dump_trigger: Some(&trigger),
..test_config()
};
let MonitorLoopResult { samples, .. } = monitor_loop(
&mem,
&[0, pa1],
&offsets,
Duration::from_millis(10),
&kill,
Instant::now(),
&cfg,
);
handle.join().unwrap();
assert!(
samples.len() >= 3,
"need >= 3 samples, got {}",
samples.len()
);
let dump_byte = combined[shm_pa as usize + crate::vmm::shm_ring::DUMP_REQ_OFFSET];
let reactive_stall = dump_byte == crate::vmm::shm_ring::DUMP_REQ_SYSRQ_D;
let summary = super::super::MonitorSummary::from_samples(&samples);
let report = super::super::MonitorReport {
samples,
summary,
..Default::default()
};
let verdict = thresholds.evaluate(&report);
assert!(reactive_stall, "reactive path should detect stall");
assert!(
!verdict.passed,
"evaluate should detect stall: {:?}",
verdict.details
);
assert!(
verdict.details.iter().any(|d| d.contains("rq_clock stall")),
"evaluate details should mention stall: {:?}",
verdict.details
);
}
#[test]
fn reactive_and_evaluate_idle_consistency() {
let offsets = test_offsets();
let buf = make_rq_buffer(&offsets, 0, 0, 0, 5000, 0);
let shm_pa = buf.len() as u64;
let mut combined = buf;
combined.extend(vec![0u8; 64]);
let mem = unsafe { GuestMem::new(combined.as_mut_ptr(), combined.len() as u64) };
let kill = std::sync::Arc::new(AtomicBool::new(false));
let thresholds = super::super::MonitorThresholds {
max_imbalance_ratio: 100.0,
max_local_dsq_depth: 10000,
fail_on_stall: true,
sustained_samples: 1,
..Default::default()
};
let trigger = DumpTrigger {
shm_base_pa: shm_pa,
thresholds,
};
let handle = {
let kill = std::sync::Arc::clone(&kill);
std::thread::spawn(move || {
std::thread::sleep(Duration::from_millis(200));
kill.store(true, Ordering::Release);
})
};
let cfg = MonitorConfig {
dump_trigger: Some(&trigger),
..test_config()
};
let MonitorLoopResult { samples, .. } = monitor_loop(
&mem,
&[0],
&offsets,
Duration::from_millis(10),
&kill,
Instant::now(),
&cfg,
);
handle.join().unwrap();
assert!(
samples.len() >= 2,
"need >= 2 samples, got {}",
samples.len()
);
let dump_byte = combined[shm_pa as usize + crate::vmm::shm_ring::DUMP_REQ_OFFSET];
assert_eq!(
dump_byte, 0,
"reactive: idle CPU should not trigger stall dump"
);
let summary = super::super::MonitorSummary::from_samples(&samples);
assert!(
!summary.stall_detected,
"from_samples: idle CPU should not flag stall"
);
let report = super::super::MonitorReport {
samples,
summary,
..Default::default()
};
let verdict = thresholds.evaluate(&report);
assert!(
verdict.passed,
"evaluate: idle CPU should pass: {:?}",
verdict.details
);
}
fn test_schedstat_offsets() -> super::super::btf_offsets::SchedstatOffsets {
super::super::btf_offsets::SchedstatOffsets {
rq_sched_info: 200,
sched_info_run_delay: 8,
sched_info_pcount: 0,
rq_yld_count: 300,
rq_sched_count: 304,
rq_sched_goidle: 308,
rq_ttwu_count: 312,
rq_ttwu_local: 316,
}
}
#[allow(clippy::too_many_arguments)]
fn make_schedstat_buffer(
ss: &super::super::btf_offsets::SchedstatOffsets,
run_delay: u64,
pcount: u64,
yld_count: u32,
sched_count: u32,
sched_goidle: u32,
ttwu_count: u32,
ttwu_local: u32,
) -> Vec<u8> {
let size = ss.rq_ttwu_local + 4 + 8;
let mut buf = vec![0u8; size];
let si_base = ss.rq_sched_info;
buf[si_base + ss.sched_info_pcount..si_base + ss.sched_info_pcount + 8]
.copy_from_slice(&pcount.to_ne_bytes());
buf[si_base + ss.sched_info_run_delay..si_base + ss.sched_info_run_delay + 8]
.copy_from_slice(&run_delay.to_ne_bytes());
buf[ss.rq_yld_count..ss.rq_yld_count + 4].copy_from_slice(&yld_count.to_ne_bytes());
buf[ss.rq_sched_count..ss.rq_sched_count + 4].copy_from_slice(&sched_count.to_ne_bytes());
buf[ss.rq_sched_goidle..ss.rq_sched_goidle + 4]
.copy_from_slice(&sched_goidle.to_ne_bytes());
buf[ss.rq_ttwu_count..ss.rq_ttwu_count + 4].copy_from_slice(&ttwu_count.to_ne_bytes());
buf[ss.rq_ttwu_local..ss.rq_ttwu_local + 4].copy_from_slice(&ttwu_local.to_ne_bytes());
buf
}
#[test]
fn read_rq_schedstat_known_values() {
let ss = test_schedstat_offsets();
let buf = make_schedstat_buffer(&ss, 50000, 10, 3, 100, 20, 80, 40);
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
let stats = read_rq_schedstat(&mem, 0, &ss);
assert_eq!(stats.run_delay, 50000);
assert_eq!(stats.pcount, 10);
assert_eq!(stats.yld_count, 3);
assert_eq!(stats.sched_count, 100);
assert_eq!(stats.sched_goidle, 20);
assert_eq!(stats.ttwu_count, 80);
assert_eq!(stats.ttwu_local, 40);
}
#[test]
fn read_rq_schedstat_zeros() {
let ss = test_schedstat_offsets();
let buf = make_schedstat_buffer(&ss, 0, 0, 0, 0, 0, 0, 0);
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
let stats = read_rq_schedstat(&mem, 0, &ss);
assert_eq!(stats.run_delay, 0);
assert_eq!(stats.pcount, 0);
assert_eq!(stats.yld_count, 0);
assert_eq!(stats.sched_count, 0);
assert_eq!(stats.sched_goidle, 0);
assert_eq!(stats.ttwu_count, 0);
assert_eq!(stats.ttwu_local, 0);
}
#[test]
fn monitor_loop_with_schedstat_overlay() {
let ss = test_schedstat_offsets();
let mut offsets = test_offsets();
offsets.schedstat_offsets = Some(ss.clone());
let rq_size = ss.rq_ttwu_local + 4 + 8;
let mut buf = vec![0u8; rq_size];
buf[offsets.rq_nr_running..offsets.rq_nr_running + 4].copy_from_slice(&2u32.to_ne_bytes());
buf[offsets.rq_clock..offsets.rq_clock + 8].copy_from_slice(&500u64.to_ne_bytes());
let si_base = ss.rq_sched_info;
buf[si_base + ss.sched_info_run_delay..si_base + ss.sched_info_run_delay + 8]
.copy_from_slice(&12345u64.to_ne_bytes());
buf[si_base + ss.sched_info_pcount..si_base + ss.sched_info_pcount + 8]
.copy_from_slice(&7u64.to_ne_bytes());
buf[ss.rq_sched_count..ss.rq_sched_count + 4].copy_from_slice(&42u32.to_ne_bytes());
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
let kill = std::sync::Arc::new(AtomicBool::new(false));
let handle = {
let kill = std::sync::Arc::clone(&kill);
std::thread::spawn(move || {
std::thread::sleep(Duration::from_millis(30));
kill.store(true, Ordering::Release);
})
};
let MonitorLoopResult { samples, .. } = monitor_loop(
&mem,
&[0],
&offsets,
Duration::from_millis(10),
&kill,
Instant::now(),
&test_config(),
);
handle.join().unwrap();
assert!(!samples.is_empty());
let ss_snap = samples[0].cpus[0].schedstat.as_ref().unwrap();
assert_eq!(ss_snap.run_delay, 12345);
assert_eq!(ss_snap.pcount, 7);
assert_eq!(ss_snap.sched_count, 42);
}
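/// Without schedstat offsets, samples must not carry a schedstat snapshot.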
#[test]
fn monitor_loop_no_schedstat_when_none() {
let offsets = test_offsets();
assert!(offsets.schedstat_offsets.is_none());
let buf = make_rq_buffer(&offsets, 1, 1, 1, 100, 0);
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
let kill = std::sync::Arc::new(AtomicBool::new(false));
let handle = {
let kill = std::sync::Arc::clone(&kill);
std::thread::spawn(move || {
std::thread::sleep(Duration::from_millis(30));
kill.store(true, Ordering::Release);
})
};
let MonitorLoopResult { samples, .. } = monitor_loop(
&mem,
&[0],
&offsets,
Duration::from_millis(10),
&kill,
Instant::now(),
&test_config(),
);
handle.join().unwrap();
assert!(!samples.is_empty());
assert!(samples[0].cpus[0].schedstat.is_none());
}
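/// Hand-picked sched_domain offsets for tests; arbitrary but non-overlapping,
/// with `rq_sd` at 400 so a small rq image can hold the pointer.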
fn test_sched_domain_offsets() -> SchedDomainOffsets {
SchedDomainOffsets {
rq_sd: 400,
sd_parent: 0,
sd_level: 8,
sd_flags: 12,
sd_name: 16,
sd_span_weight: 24,
sd_balance_interval: 28,
sd_nr_balance_failed: 32,
sd_newidle_call: Some(36),
sd_newidle_success: Some(40),
sd_newidle_ratio: Some(44),
sd_max_newidle_lb_cost: 48,
stats_offsets: Some(test_sd_stats_offsets()),
}
}
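/// Stats offsets for the fake sched_domain. The lb_* fields are spaced
/// 12 bytes apart (room for a three-entry u32 array per idle type); the
/// remaining counters are single u32s, 4 bytes apart.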
fn test_sd_stats_offsets() -> SchedDomainStatsOffsets {
SchedDomainStatsOffsets {
sd_lb_count: 56,
sd_lb_failed: 68,
sd_lb_balanced: 80,
sd_lb_imbalance_load: 92,
sd_lb_imbalance_util: 104,
sd_lb_imbalance_task: 116,
sd_lb_imbalance_misfit: 128,
sd_lb_gained: 140,
sd_lb_hot_gained: 152,
sd_lb_nobusyg: 164,
sd_lb_nobusyq: 176,
sd_alb_count: 188,
sd_alb_failed: 192,
sd_alb_pushed: 196,
sd_sbe_count: 200,
sd_sbe_balanced: 204,
sd_sbe_pushed: 208,
sd_sbf_count: 212,
sd_sbf_balanced: 216,
sd_sbf_pushed: 220,
sd_ttwu_wake_remote: 224,
sd_ttwu_move_affine: 228,
sd_ttwu_move_balance: 232,
}
}
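/// Builds a fake sched_domain image. Only a representative subset of the
/// stats is populated (`lb_count[0]`, `alb_pushed`, `ttwu_wake_remote`);
/// every other stats field reads back as zero.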
#[allow(clippy::too_many_arguments)]
fn make_sd_buffer(
sd: &SchedDomainOffsets,
parent_kva: u64,
level: i32,
flags: i32,
name_kva: u64,
span_weight: u32,
balance_interval: u32,
newidle_call: u32,
lb_count_0: u32,
alb_pushed: u32,
ttwu_wake_remote: u32,
) -> Vec<u8> {
let so = sd.stats_offsets.as_ref().unwrap();
let size = so.sd_ttwu_move_balance + 4 + 8; // last u32 field plus 8 bytes of slack
let mut buf = vec![0u8; size];
buf[sd.sd_parent..sd.sd_parent + 8].copy_from_slice(&parent_kva.to_ne_bytes());
buf[sd.sd_level..sd.sd_level + 4].copy_from_slice(&level.to_ne_bytes());
buf[sd.sd_flags..sd.sd_flags + 4].copy_from_slice(&flags.to_ne_bytes());
buf[sd.sd_name..sd.sd_name + 8].copy_from_slice(&name_kva.to_ne_bytes());
buf[sd.sd_span_weight..sd.sd_span_weight + 4].copy_from_slice(&span_weight.to_ne_bytes());
buf[sd.sd_balance_interval..sd.sd_balance_interval + 4]
.copy_from_slice(&balance_interval.to_ne_bytes());
if let Some(off) = sd.sd_newidle_call {
buf[off..off + 4].copy_from_slice(&newidle_call.to_ne_bytes());
}
buf[so.sd_lb_count..so.sd_lb_count + 4].copy_from_slice(&lb_count_0.to_ne_bytes());
buf[so.sd_alb_pushed..so.sd_alb_pushed + 4].copy_from_slice(&alb_pushed.to_ne_bytes());
buf[so.sd_ttwu_wake_remote..so.sd_ttwu_wake_remote + 4]
.copy_from_slice(&ttwu_wake_remote.to_ne_bytes());
buf
}
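/// A NULL rq->sd pointer must yield no domain list at all.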
#[test]
fn read_sched_domain_tree_null_sd() {
let sd_off = test_sched_domain_offsets();
let buf = vec![0u8; 512];
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
let result = read_sched_domain_tree(&mem, 0, &sd_off, 0);
assert!(result.is_none());
}
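/// Single-domain layout: the rq image sits at offset 0, the sched_domain
/// image at 1024, and its NUL-terminated name string at 2048.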
#[test]
fn read_sched_domain_tree_single_domain() {
let sd_off = test_sched_domain_offsets();
let sd_pa: u64 = 1024;
let name_pa: u64 = 2048;
let sd_buf = make_sd_buffer(&sd_off, 0, 0, 0x42, name_pa, 4, 64, 15, 10, 3, 7);
let name_bytes = b"SMT\0";
let total_size = (name_pa as usize) + 16;
let mut buf = vec![0u8; total_size];
buf[sd_off.rq_sd..sd_off.rq_sd + 8].copy_from_slice(&sd_pa.to_ne_bytes());
buf[sd_pa as usize..sd_pa as usize + sd_buf.len()].copy_from_slice(&sd_buf);
buf[name_pa as usize..name_pa as usize + name_bytes.len()].copy_from_slice(name_bytes);
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
let domains = read_sched_domain_tree(&mem, 0, &sd_off, 0).unwrap();
assert_eq!(domains.len(), 1);
assert_eq!(domains[0].level, 0);
assert_eq!(domains[0].name, "SMT");
assert_eq!(domains[0].flags, 0x42);
assert_eq!(domains[0].span_weight, 4);
assert_eq!(domains[0].balance_interval, 64);
assert_eq!(domains[0].newidle_call, Some(15));
let stats = domains[0].stats.as_ref().unwrap();
assert_eq!(stats.lb_count[0], 10);
assert_eq!(stats.alb_pushed, 3);
assert_eq!(stats.ttwu_wake_remote, 7);
}
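/// Two-level tree: sd0's parent pointer targets sd1, whose own parent is
/// NULL, so traversal must yield SMT then MC, in that order.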
#[test]
fn read_sched_domain_tree_two_levels() {
let sd_off = test_sched_domain_offsets();
let sd0_pa: u64 = 1024;
let sd1_pa: u64 = 2048;
let name0_pa: u64 = 3072;
let name1_pa: u64 = 3088;
let sd0_buf = make_sd_buffer(&sd_off, sd1_pa, 0, 0x10, name0_pa, 2, 32, 8, 5, 1, 2);
let sd1_buf = make_sd_buffer(&sd_off, 0, 1, 0x20, name1_pa, 8, 128, 22, 20, 4, 10);
let total_size = (name1_pa as usize) + 16;
let mut buf = vec![0u8; total_size];
buf[sd_off.rq_sd..sd_off.rq_sd + 8].copy_from_slice(&sd0_pa.to_ne_bytes());
buf[sd0_pa as usize..sd0_pa as usize + sd0_buf.len()].copy_from_slice(&sd0_buf);
buf[sd1_pa as usize..sd1_pa as usize + sd1_buf.len()].copy_from_slice(&sd1_buf);
buf[name0_pa as usize..name0_pa as usize + 4].copy_from_slice(b"SMT\0");
buf[name1_pa as usize..name1_pa as usize + 3].copy_from_slice(b"MC\0");
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
let domains = read_sched_domain_tree(&mem, 0, &sd_off, 0).unwrap();
assert_eq!(domains.len(), 2);
assert_eq!(domains[0].level, 0);
assert_eq!(domains[0].name, "SMT");
assert_eq!(domains[0].span_weight, 2);
assert_eq!(domains[0].balance_interval, 32);
assert_eq!(domains[0].newidle_call, Some(8));
let s0 = domains[0].stats.as_ref().unwrap();
assert_eq!(s0.lb_count[0], 5);
assert_eq!(domains[1].level, 1);
assert_eq!(domains[1].name, "MC");
assert_eq!(domains[1].span_weight, 8);
assert_eq!(domains[1].balance_interval, 128);
assert_eq!(domains[1].newidle_call, Some(22));
let s1 = domains[1].stats.as_ref().unwrap();
assert_eq!(s1.lb_count[0], 20);
assert_eq!(s1.alb_pushed, 4);
assert_eq!(s1.ttwu_wake_remote, 10);
}
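/// A sched_domain whose parent pointer refers back to itself must not send
/// the traversal into a loop.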
#[test]
fn read_sched_domain_tree_self_reference_breaks_cycle() {
let sd_off = test_sched_domain_offsets();
let sd_pa: u64 = 1024;
let sd_buf = make_sd_buffer(&sd_off, sd_pa, 0, 0, 0, 1, 0, 0, 0, 0, 0);
let total_size = sd_pa as usize + sd_buf.len();
let mut buf = vec![0u8; total_size];
buf[sd_off.rq_sd..sd_off.rq_sd + 8].copy_from_slice(&sd_pa.to_ne_bytes());
buf[sd_pa as usize..sd_pa as usize + sd_buf.len()].copy_from_slice(&sd_buf);
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
let domains = read_sched_domain_tree(&mem, 0, &sd_off, 0).unwrap();
assert_eq!(
domains.len(),
1,
"self-referential sd should produce exactly one snapshot"
);
}
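/// An acyclic 10-domain parent chain exceeds the traversal's depth bound,
/// so the snapshot list must be truncated to 8 entries.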
#[test]
fn read_sched_domain_tree_max_depth_bound_on_long_chain() {
let sd_off = test_sched_domain_offsets();
const SD_SIZE: u64 = 248; // stride between chained sd images; >= make_sd_buffer's 244-byte output
const CHAIN_LEN: usize = 10;
let first_pa: u64 = 1024;
let pa = |i: usize| first_pa + (i as u64) * SD_SIZE;
let total_size = pa(CHAIN_LEN) as usize;
let mut buf = vec![0u8; total_size];
buf[sd_off.rq_sd..sd_off.rq_sd + 8].copy_from_slice(&pa(0).to_ne_bytes());
for i in 0..CHAIN_LEN {
let parent_kva = if i + 1 == CHAIN_LEN { 0 } else { pa(i + 1) };
let sd_buf = make_sd_buffer(&sd_off, parent_kva, i as i32, 0, 0, 1, 0, 0, 0, 0, 0);
let start = pa(i) as usize;
buf[start..start + sd_buf.len()].copy_from_slice(&sd_buf);
}
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
let domains = read_sched_domain_tree(&mem, 0, &sd_off, 0).unwrap();
assert_eq!(
domains.len(),
8,
"acyclic chain of {CHAIN_LEN} levels must truncate at MAX_DEPTH=8"
);
for (i, snap) in domains.iter().enumerate() {
assert_eq!(snap.level, i as i32, "level mismatch at index {i}");
}
}
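/// An rq->sd value far beyond guest memory must end the walk cleanly:
/// Some with an empty list, not a panic or None.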
#[test]
fn read_sched_domain_tree_out_of_bounds_pa() {
let sd_off = test_sched_domain_offsets();
let bad_kva: u64 = 0xFFFF_FFFF_FFFF_0000;
let mut buf = vec![0u8; 512];
buf[sd_off.rq_sd..sd_off.rq_sd + 8].copy_from_slice(&bad_kva.to_ne_bytes());
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
let domains = read_sched_domain_tree(&mem, 0, &sd_off, 0);
assert!(domains.is_some());
assert!(domains.unwrap().is_empty());
}
#[test]
fn read_sched_domain_tree_newidle_none() {
let mut sd_off = test_sched_domain_offsets();
sd_off.sd_newidle_call = None;
sd_off.sd_newidle_success = None;
sd_off.sd_newidle_ratio = None;
let sd_pa: u64 = 1024;
let name_pa: u64 = 2048;
let sd_buf = make_sd_buffer(&sd_off, 0, 0, 0x42, name_pa, 4, 64, 0, 10, 3, 7);
let name_bytes = b"SMT\0";
let total_size = (name_pa as usize) + 16;
let mut buf = vec![0u8; total_size];
buf[sd_off.rq_sd..sd_off.rq_sd + 8].copy_from_slice(&sd_pa.to_ne_bytes());
buf[sd_pa as usize..sd_pa as usize + sd_buf.len()].copy_from_slice(&sd_buf);
buf[name_pa as usize..name_pa as usize + name_bytes.len()].copy_from_slice(name_bytes);
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
let domains = read_sched_domain_tree(&mem, 0, &sd_off, 0).unwrap();
assert_eq!(domains.len(), 1);
assert_eq!(domains[0].level, 0);
assert_eq!(domains[0].name, "SMT");
assert_eq!(domains[0].flags, 0x42);
assert_eq!(domains[0].span_weight, 4);
assert_eq!(domains[0].balance_interval, 64);
assert_eq!(domains[0].newidle_call, None);
assert_eq!(domains[0].newidle_success, None);
assert_eq!(domains[0].newidle_ratio, None);
let stats = domains[0].stats.as_ref().unwrap();
assert_eq!(stats.lb_count[0], 10);
assert_eq!(stats.alb_pushed, 3);
assert_eq!(stats.ttwu_wake_remote, 7);
}
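/// `read_u32_array` must decode three consecutive native-endian u32s
/// starting at the given address.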
#[test]
fn read_u32_array_known_values() {
let mut buf = [0u8; 16];
buf[0..4].copy_from_slice(&10u32.to_ne_bytes());
buf[4..8].copy_from_slice(&20u32.to_ne_bytes());
buf[8..12].copy_from_slice(&30u32.to_ne_bytes());
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
let arr = read_u32_array(&mem, 0, 0);
assert_eq!(arr, [10, 20, 30]);
}
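/// End-to-end coupling test: offsets extracted from a real vmlinux drive a
/// synthetic rq image. Each field gets a distinctive bit pattern so that
/// transposed offsets cannot be masked by identical values.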
#[test]
fn btf_offsets_couple_with_rq_reader() {
let path = match crate::monitor::find_test_vmlinux() {
Some(p) => p,
None => skip!("no test vmlinux available"),
};
let offsets = crate::test_support::require_kernel_offsets(&path);
let max_scalar_off = offsets.rq_clock + 8;
let max_scx_off = offsets.rq_scx + offsets.scx_rq_local_dsq + offsets.dsq_nr + 4;
let max_flags_off = offsets.rq_scx + offsets.scx_rq_flags + 4;
let size = max_scalar_off.max(max_scx_off).max(max_flags_off) + 64;
let mut buf = vec![0u8; size];
let nr_running: u32 = 0xDEAD_BEEF;
let scx_nr: u32 = 0x1234_5678;
let dsq_depth: u32 = 0x0BAD_F00D;
let clock: u64 = 0xCAFE_BABE_1357_9BDF;
let flags: u32 = 0xA5A5_A5A5;
buf[offsets.rq_nr_running..offsets.rq_nr_running + 4]
.copy_from_slice(&nr_running.to_ne_bytes());
buf[offsets.rq_clock..offsets.rq_clock + 8].copy_from_slice(&clock.to_ne_bytes());
let scx_nr_off = offsets.rq_scx + offsets.scx_rq_nr_running;
buf[scx_nr_off..scx_nr_off + 4].copy_from_slice(&scx_nr.to_ne_bytes());
let scx_flags_off = offsets.rq_scx + offsets.scx_rq_flags;
buf[scx_flags_off..scx_flags_off + 4].copy_from_slice(&flags.to_ne_bytes());
let dsq_off = offsets.rq_scx + offsets.scx_rq_local_dsq + offsets.dsq_nr;
buf[dsq_off..dsq_off + 4].copy_from_slice(&dsq_depth.to_ne_bytes());
let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
let snap = read_rq_stats(&mem, 0, &offsets);
assert_eq!(snap.nr_running, nr_running);
assert_eq!(snap.scx_nr_running, scx_nr);
assert_eq!(snap.local_dsq_depth, dsq_depth);
assert_eq!(snap.rq_clock, clock);
assert_eq!(snap.scx_flags, flags);
}
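/// Boundary behavior of the write path: a u32 whose last byte lands exactly
/// at the declared size must be written in full.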
#[test]
fn write_u32_at_boundary_writes_full_word() {
let mut buf = [0u8; 16];
let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
mem.write_u32(12, 0, 0xDEAD_BEEF);
assert_eq!(mem.read_u32(12, 0), 0xDEAD_BEEF);
assert_eq!(
u32::from_ne_bytes(buf[12..16].try_into().unwrap()),
0xDEAD_BEEF
);
}
#[test]
fn write_u32_one_past_boundary_is_noop() {
let mut buf = [0u8; 16];
let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
mem.write_u32(13, 0, 0xFFFF_FFFF);
assert_eq!(buf, [0u8; 16]);
}
#[test]
fn write_u8_at_boundary_writes_last_byte() {
let mut buf = [0u8; 4];
let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
mem.write_u8(3, 0, 0xAB);
assert_eq!(buf[3], 0xAB);
}
#[test]
fn write_scalar_offset_arg_is_added_to_pa() {
let mut buf = [0u8; 32];
let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
mem.write_u64(8, 16, 0x0123_4567_89AB_CDEF);
assert_eq!(mem.read_u64(24, 0), 0x0123_4567_89AB_CDEF);
assert_eq!(
u64::from_ne_bytes(buf[24..32].try_into().unwrap()),
0x0123_4567_89AB_CDEF
);
}
#[test]
fn write_scalar_offset_only_out_of_bounds_is_noop() {
let mut buf = [0xCCu8; 8];
let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
mem.write_u64(4, 4, 0xFFFF_FFFF_FFFF_FFFF);
assert_eq!(buf, [0xCCu8; 8]);
}
#[test]
fn guest_mem_new_size_smaller_than_write_offset_is_noop() {
let mut backing = [0u8; 32];
let declared_size: u64 = 8;
let mem = unsafe { GuestMem::new(backing.as_mut_ptr(), declared_size) };
mem.write_u64(16, 0, 0xDEAD_BEEF_CAFE_1234);
assert_eq!(backing, [0u8; 32]);
mem.write_u64(0, 0, 0xDEAD_BEEF_CAFE_1234);
assert_eq!(
u64::from_ne_bytes(backing[0..8].try_into().unwrap()),
0xDEAD_BEEF_CAFE_1234
);
assert_eq!(backing[8..32], [0u8; 24]);
}
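/// Multi-region routing: accesses must resolve into the region whose offset
/// range covers the address; the 64..1024 gap between regions must read as
/// zero and silently swallow writes.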
#[test]
fn resolve_ptr_multi_region_routes_to_correct_region() {
let mut buf0 = [0xAAu8; 64];
let mut buf1 = [0xBBu8; 64];
let regions = vec![
MemRegion {
host_ptr: buf0.as_mut_ptr(),
offset: 0,
size: 64,
},
MemRegion {
host_ptr: buf1.as_mut_ptr(),
offset: 1024,
size: 64,
},
];
let mem = unsafe { GuestMem::from_regions_for_test(regions) };
assert_eq!(mem.read_u8(0, 0), 0xAA);
assert_eq!(mem.read_u8(63, 0), 0xAA);
assert_eq!(mem.read_u8(1024, 0), 0xBB);
assert_eq!(mem.read_u8(1087, 0), 0xBB);
assert_eq!(mem.read_u8(64, 0), 0);
assert_eq!(mem.read_u8(512, 0), 0);
assert_eq!(mem.read_u8(1023, 0), 0);
mem.write_u32(1024, 0, 0x1234_5678);
assert_eq!(buf0, [0xAAu8; 64]);
assert_eq!(
u32::from_ne_bytes(buf1[0..4].try_into().unwrap()),
0x1234_5678
);
let buf0_snapshot = buf0;
let buf1_snapshot = buf1;
mem.write_u32(900, 0, 0xFFFF_FFFF);
assert_eq!(buf0, buf0_snapshot);
assert_eq!(buf1, buf1_snapshot);
}
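/// Per-byte volatile reads must route into the second region's backing
/// buffer and observe the bytes staged there.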
#[test]
fn resolve_ptr_multi_region_read_ring_volatile_routes_correctly() {
let mut buf0 = [0u8; 64];
let mut buf1 = [0u8; 64];
buf1[0..8].copy_from_slice(&[1, 2, 3, 4, 5, 6, 7, 8]);
let regions = vec![
MemRegion {
host_ptr: buf0.as_mut_ptr(),
offset: 0,
size: 64,
},
MemRegion {
host_ptr: buf1.as_mut_ptr(),
offset: 1024,
size: 64,
},
];
let mem = unsafe { GuestMem::from_regions_for_test(regions) };
for (i, expected) in [1, 2, 3, 4, 5, 6, 7, 8].iter().enumerate() {
assert_eq!(mem.read_u8(1024 + i as u64, 0), *expected);
}
}
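/// An offset equal to a region's end lies outside that region and must read
/// back as zero.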
#[test]
fn resolve_ptr_offset_at_exact_region_end_is_out_of_region() {
let mut buf = [0xCCu8; 16];
let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
assert_eq!(mem.read_u8(16, 0), 0);
}
#[test]
fn write_u8_and_read_u8_bounds_at_declared_size() {
const SIZE: u64 = 8;
let mut buf = [0u8; SIZE as usize];
let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), SIZE) };
mem.write_u8(SIZE - 1, 0, 0xAB);
assert_eq!(
mem.read_u8(SIZE - 1, 0),
0xAB,
"write at last valid offset must round-trip via read_u8"
);
assert_eq!(
buf[(SIZE - 1) as usize],
0xAB,
"write at last valid offset must land in the backing byte"
);
let snapshot = buf;
mem.write_u8(SIZE, 0, 0xFF);
assert_eq!(
buf, snapshot,
"write past the end must be a silent no-op — no byte of \
the backing buffer may change"
);
assert_eq!(
mem.read_u8(SIZE, 0),
0,
"read past the end must return 0, not stale memory"
);
assert_eq!(
mem.read_u8(SIZE + 1, 0),
0,
"read several bytes past the end must also return 0"
);
}
}