use std::os::unix::io::AsRawFd;
use std::os::unix::thread::JoinHandleExt;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, AtomicI32, AtomicPtr, AtomicU8, AtomicU64, Ordering};
use std::thread::JoinHandle;
use std::time::{Duration, Instant};
use vmm_sys_util::epoll::{ControlOperation, Epoll, EpollEvent, EventSet};
use vmm_sys_util::eventfd::{EFD_NONBLOCK, EventFd};
use vmm_sys_util::timerfd::TimerFd;
use super::exit_dispatch;
use crate::monitor;
use crate::sync::Latch;
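/// Detached raw pointer to a vCPU's `kvm_run.immediate_exit` byte, letting
/// other threads request an exit from `KVM_RUN` without borrowing the
/// `VcpuFd`.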
#[derive(Clone, Copy)]
pub(crate) struct ImmediateExitHandle {
ptr: *mut u8,
}
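// SAFETY: the pointer targets the mmapped kvm_run page, which stays mapped
// for the life of the VcpuFd; all accesses are single-byte volatile
// reads/writes, and VcpuThread::kick additionally gates writes on the
// thread's `alive` flag so a freed mapping is never touched.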
unsafe impl Send for ImmediateExitHandle {}
unsafe impl Sync for ImmediateExitHandle {}
impl ImmediateExitHandle {
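    /// Captures a raw pointer to this vCPU's `immediate_exit` flag from its
    /// mmapped `kvm_run` structure.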
pub(crate) fn from_vcpu(vcpu: &mut kvm_ioctls::VcpuFd) -> Self {
let kvm_run = vcpu.get_kvm_run();
let ptr: *mut u8 = &mut kvm_run.immediate_exit;
Self { ptr }
}
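    /// Volatile store so the write is neither elided nor reordered; KVM
    /// checks this byte on entry to decide whether to return from `KVM_RUN`
    /// immediately.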
pub(crate) fn set(&self, val: u8) {
unsafe {
std::ptr::write_volatile(self.ptr, val);
}
}
#[cfg(test)]
pub(crate) fn read_byte(&self) -> u8 {
unsafe { std::ptr::read_volatile(self.ptr) }
}
}
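/// Converts a duration to kernel jiffies at tick rate `hz`, truncating to
/// millisecond precision and saturating rather than wrapping on overflow.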
pub(crate) fn duration_to_jiffies(d: Duration, hz: u64) -> u64 {
(d.as_millis() as u64).saturating_mul(hz) / 1000
}
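/// The real-time signal used to kick vCPU threads out of a blocking
/// `KVM_RUN`.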
pub(crate) fn vcpu_signal() -> libc::c_int {
libc::SIGRTMIN()
}
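/// Locates the probe's `ktstr_err_exit_detected` variable within `.bss` by
/// reading the guest kernel's BTF object at `btf_kva`, pulling out its data
/// blob, and parsing it (as split BTF on top of `base` when the object
/// records a base BTF).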
pub(crate) fn load_probe_bss_offset(
kernel: &crate::monitor::guest::GuestKernel<'_>,
btf_kva: u64,
base: &btf_rs::Btf,
offsets: &crate::monitor::btf_offsets::BpfMapOffsets,
) -> Option<u32> {
let mem = kernel.mem();
let walk = kernel.walk_context();
let btf_pa = crate::monitor::idr::translate_any_kva(
mem,
walk.cr3_pa,
walk.page_offset,
btf_kva,
walk.l5,
walk.tcr_el1,
)?;
let data_kva = mem.read_u64(btf_pa, offsets.btf_data);
let data_size = mem.read_u32(btf_pa, offsets.btf_data_size) as usize;
let base_kva = mem.read_u64(btf_pa, offsets.btf_base_btf);
if data_kva == 0 || data_size == 0 {
return None;
}
if data_size > crate::monitor::dump::MAX_BTF_BLOB {
return None;
}
let blob = kernel.read_kva_bytes_chunked(data_kva, data_size)?;
let prog_btf = if base_kva != 0 {
btf_rs::Btf::from_split_bytes(&blob, base).ok()?
} else {
btf_rs::Btf::from_bytes(&blob).ok()?
};
crate::monitor::btf_offsets::resolve_var_offset_in_section(
&prog_btf,
".bss",
"ktstr_err_exit_detected",
)
}
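/// Intentionally (almost) a no-op: delivery alone is what interrupts a
/// blocking `KVM_RUN` with `EINTR`. The acquire fence pairs with the release
/// fence `VcpuThread::kick` issues after setting `immediate_exit`.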
extern "C" fn vcpu_signal_handler(_: libc::c_int, _: *mut libc::siginfo_t, _: *mut libc::c_void) {
std::sync::atomic::fence(Ordering::Acquire);
}
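/// Installs the kick handler for `vcpu_signal()` and unblocks the signal on
/// the calling thread, panicking on failure since lost kicks would leave
/// `KVM_RUN` blocked indefinitely.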
pub(crate) fn register_vcpu_signal_handler() {
unsafe {
let mut sa: libc::sigaction = std::mem::zeroed();
sa.sa_sigaction = vcpu_signal_handler as *const () as usize;
sa.sa_flags = libc::SA_SIGINFO;
libc::sigemptyset(&mut sa.sa_mask);
let rc = libc::sigaction(vcpu_signal(), &sa, std::ptr::null_mut());
assert_eq!(
rc,
0,
"register_vcpu_signal_handler: sigaction(SIGRTMIN, SA_SIGINFO) failed: {} \
— vCPU kicks would silently no-op and KVM_RUN would block forever",
std::io::Error::last_os_error(),
);
let mut set: libc::sigset_t = std::mem::zeroed();
libc::sigemptyset(&mut set);
libc::sigaddset(&mut set, vcpu_signal());
let rc = libc::pthread_sigmask(libc::SIG_UNBLOCK, &set, std::ptr::null_mut());
assert_eq!(
rc,
0,
"register_vcpu_signal_handler: pthread_sigmask(SIG_UNBLOCK, SIGRTMIN) failed: {} \
— signal would stay blocked and pthread_kick deliveries would queue forever",
std::io::Error::from_raw_os_error(rc),
);
}
}
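/// Pins the calling thread to a single host CPU; failures are logged and
/// tolerated.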
pub(crate) fn pin_current_thread(cpu: usize, label: &str) {
let mut cpuset = nix::sched::CpuSet::new();
if let Err(e) = cpuset.set(cpu) {
eprintln!("performance_mode: WARNING: cpuset.set({cpu}) for {label}: {e}");
return;
}
match nix::sched::sched_setaffinity(nix::unistd::Pid::from_raw(0), &cpuset) {
Ok(()) => eprintln!("performance_mode: pinned {label} to host CPU {cpu}"),
Err(e) => eprintln!("performance_mode: WARNING: pin {label} to CPU {cpu}: {e}"),
}
}
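/// Restricts the calling thread's affinity to `cpus`, skipping entries the
/// host rejects and logging the applied set as a compact CPU-list string
/// (e.g. "0-3,8").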
pub(crate) fn set_thread_cpumask(cpus: &[usize], label: &str) {
let mut cpuset = nix::sched::CpuSet::new();
let mut applied: Vec<usize> = Vec::with_capacity(cpus.len());
let mut skipped: Vec<usize> = Vec::new();
for &cpu in cpus {
match cpuset.set(cpu) {
Ok(()) => applied.push(cpu),
Err(e) => {
eprintln!("no_perf_mode: WARNING: cpuset.set({cpu}) for {label}: {e}; skipping");
skipped.push(cpu);
}
}
}
if !skipped.is_empty() {
eprintln!(
"no_perf_mode: {label}: skipped {} of {} requested CPUs ({skipped:?}); proceeding with {applied:?}",
skipped.len(),
cpus.len(),
);
}
if applied.is_empty() {
eprintln!(
"no_perf_mode: WARNING: {label}: no valid CPUs to mask (every requested entry failed)"
);
return;
}
let n = applied.len();
let cpu_list_str = {
let mut parts: Vec<String> = Vec::new();
let mut start = applied[0];
let mut end = applied[0];
for &cpu in &applied[1..] {
if cpu == end + 1 {
end = cpu;
} else {
if start == end {
parts.push(format!("{start}"));
} else {
parts.push(format!("{start}-{end}"));
}
start = cpu;
end = cpu;
}
}
if start == end {
parts.push(format!("{start}"));
} else {
parts.push(format!("{start}-{end}"));
}
parts.join(",")
};
match nix::sched::sched_setaffinity(nix::unistd::Pid::from_raw(0), &cpuset) {
Ok(()) => eprintln!("no_perf_mode: mask {label} to {n} CPUs ({cpu_list_str})"),
Err(e) => {
eprintln!("no_perf_mode: WARNING: mask {label} to {n} CPUs ({cpu_list_str}): {e}")
}
}
}
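/// Moves the calling thread to `SCHED_FIFO` at `priority`. This needs
/// `CAP_SYS_NICE` (or a permissive `RLIMIT_RTPRIO`), so failure is logged
/// rather than fatal.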
pub(crate) fn set_rt_priority(priority: i32, label: &str) {
let param = libc::sched_param {
sched_priority: priority,
};
    let rc = unsafe { libc::sched_setscheduler(0, libc::SCHED_FIFO, &param) };
if rc == 0 {
tracing::info!(
label = label,
priority = priority,
"performance_mode: {label} set to SCHED_FIFO priority {priority}",
);
} else {
let err = std::io::Error::last_os_error();
tracing::warn!(
label = label,
priority = priority,
err = %err,
"performance_mode: WARNING: SCHED_FIFO for {label}: {err} (need CAP_SYS_NICE)",
);
}
}
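/// Waits up to one second overall for every vCPU thread to publish its TID,
/// then opens per-vCPU perf counters. Returns `None` (capture disabled) if
/// any TID never appears or `perf_event_open` fails.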
pub(crate) fn open_vcpu_perf_capture(
vcpu_tid_slots: &[(Arc<AtomicI32>, Arc<Latch>)],
) -> Option<monitor::perf_counters::PerfCountersCapture> {
let overall_deadline = Instant::now() + Duration::from_secs(1);
let mut tids: Vec<libc::pid_t> = Vec::with_capacity(vcpu_tid_slots.len());
for (slot, latch) in vcpu_tid_slots {
let now = Instant::now();
let remaining = overall_deadline.saturating_duration_since(now);
if remaining.is_zero() {
tids.push(slot.load(Ordering::Acquire));
continue;
}
latch.wait_timeout(remaining);
tids.push(slot.load(Ordering::Acquire));
}
if !tids.iter().all(|&t| t > 0) {
let missing: Vec<usize> = tids
.iter()
.enumerate()
            .filter_map(|(i, &t)| (t <= 0).then_some(i))
.collect();
tracing::warn!(
?missing,
"vCPU TID slots never published; per-vCPU perf capture disabled"
);
return None;
}
match monitor::perf_counters::PerfCountersCapture::open(&tids) {
Ok(cap) => Some(cap),
Err(e) => {
tracing::warn!(
err = %e,
"perf_event_open failed; per-vCPU IPC/cache-miss capture disabled"
);
None
}
}
}
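/// A running vCPU thread plus the shared state the coordinator uses to kick
/// it out of `KVM_RUN` and observe its exit.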
pub(crate) struct VcpuThread {
pub(crate) handle: JoinHandle<kvm_ioctls::VcpuFd>,
pub(crate) exited: Arc<AtomicBool>,
pub(crate) immediate_exit: Option<ImmediateExitHandle>,
pub(crate) exit_evt: Arc<EventFd>,
pub(crate) alive: Arc<AtomicBool>,
}
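/// Per-vCPU handles used while freezing application processors: a park flag
/// and a register-snapshot slot for each.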
pub(crate) struct ApFreezeHandles {
pub(crate) parked: Vec<Arc<AtomicBool>>,
pub(crate) regs: Vec<Arc<std::sync::Mutex<Option<exit_dispatch::VcpuRegSnapshot>>>>,
}
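/// Cross-thread watchpoint state: one kernel-owned slot (`request_kva`) plus
/// three user slots, each latching its `hit` flag at most once and waking
/// the coordinator through `hit_evt`.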
pub(crate) struct WatchpointArm {
pub(crate) request_kva: AtomicU64,
pub(crate) kind_host_ptr: AtomicPtr<u32>,
pub(crate) hit: AtomicBool,
pub(crate) hit_evt: EventFd,
pub(crate) user: [WatchpointSlot; 3],
pub(crate) any_armed: AtomicU8,
}
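/// A user-armable watchpoint slot: requested KVA, sticky hit flag, and a
/// free-form tag describing the request.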
pub(crate) struct WatchpointSlot {
pub(crate) request_kva: AtomicU64,
pub(crate) hit: AtomicBool,
pub(crate) tag: std::sync::Mutex<String>,
}
impl WatchpointSlot {
fn new() -> Self {
Self {
request_kva: AtomicU64::new(0),
hit: AtomicBool::new(false),
tag: std::sync::Mutex::new(String::new()),
}
}
}
pub(crate) const SCX_EXIT_ERROR_THRESHOLD: u32 = 1024;
impl WatchpointArm {
pub(crate) fn new() -> std::io::Result<Self> {
Ok(Self {
request_kva: AtomicU64::new(0),
kind_host_ptr: AtomicPtr::new(std::ptr::null_mut()),
hit: AtomicBool::new(false),
hit_evt: EventFd::new(EFD_NONBLOCK)?,
user: [
WatchpointSlot::new(),
WatchpointSlot::new(),
WatchpointSlot::new(),
],
any_armed: AtomicU8::new(0),
})
}
pub(crate) fn mark_armed(&self) {
self.any_armed.store(1, Ordering::Relaxed);
}
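    /// Latches the kernel-slot hit flag exactly once and wakes the
    /// coordinator through `hit_evt`; later calls are no-ops.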
pub(crate) fn latch_hit(&self) {
if self
.hit
.compare_exchange(false, true, Ordering::Release, Ordering::Relaxed)
.is_err()
{
return;
}
if let Err(e) = self.hit_evt.write(1) {
tracing::warn!(
error = %e,
"WatchpointArm::latch_hit: eventfd write failed; \
coordinator will still trip on next epoll timeout"
);
}
}
pub(crate) fn latch_user_hit(&self, idx: usize) {
if idx >= self.user.len() {
return;
}
if self.user[idx]
.hit
.compare_exchange(false, true, Ordering::Release, Ordering::Relaxed)
.is_err()
{
return;
}
if let Err(e) = self.hit_evt.write(1) {
tracing::warn!(
error = %e,
idx,
"WatchpointArm::latch_user_hit: eventfd write failed; \
coordinator will still trip on next epoll timeout"
);
}
}
}
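/// After this many consecutive non-`EINTR` `KVM_SET_GUEST_DEBUG` failures,
/// `self_arm_watchpoint` stops retrying and the failure-dump trigger falls
/// back to the BPF `.bss` poll.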
#[allow(dead_code)]
pub(crate) const WATCHPOINT_MAX_NON_EINTR_FAILURES: u8 = 3;
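/// Arms hardware data watchpoints on this vCPU via `KVM_SET_GUEST_DEBUG`,
/// re-programming only when the requested KVAs differ from what is already
/// armed. Returns `true` on a successful re-arm.
///
/// DR0-DR3 receive the addresses; DR7 is built from bit 10 (reserved,
/// must-be-one) plus LE/GE (bits 8-9), and per slot `i`: local+global
/// enable at bits `2i`/`2i+1`, R/W = 0b01 at bit `16 + 4i` (break on data
/// writes), and LEN = 0b11 at bit `18 + 4i` (4-byte region). The
/// single-step parameters exist for signature parity with the aarch64
/// variant and are unused here.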
#[cfg(target_arch = "x86_64")]
pub(crate) fn self_arm_watchpoint(
vcpu: &mut kvm_ioctls::VcpuFd,
watchpoint: &WatchpointArm,
armed_slots: &mut [u64; 4],
failures: &mut u8,
single_step_pending: bool,
single_step_slot: usize,
armed_single_step: &mut bool,
) -> bool {
let _ = (
single_step_pending,
single_step_slot,
&mut *armed_single_step,
);
if watchpoint.any_armed.load(Ordering::Relaxed) == 0 {
return false;
}
let mut requests = [0u64; 4];
requests[0] = watchpoint.request_kva.load(Ordering::Acquire);
for i in 0..3 {
requests[i + 1] = watchpoint.user[i].request_kva.load(Ordering::Acquire);
}
if requests == *armed_slots {
return false;
}
use kvm_bindings::{KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_USE_HW_BP, kvm_guest_debug};
let mut debug_struct = kvm_guest_debug {
control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
pad: 0,
arch: kvm_bindings::kvm_guest_debug_arch::default(),
};
let mut dr7: u64 = 0x400 | 0x200 | 0x100;
for (i, kva) in requests.iter().enumerate() {
if *kva == 0 {
continue;
}
debug_struct.arch.debugreg[i] = *kva;
        // Li + Gi: enable slot i as both a local and a global breakpoint.
        dr7 |= 0b11 << (2 * i);
        // R/Wi (bits 16 + 4i) = 0b01: break on data writes.
        dr7 |= 0b01 << (16 + 4 * i);
        // LENi (bits 18 + 4i) = 0b11: watch a 4-byte region.
        dr7 |= 0b11 << (18 + 4 * i);
    }
debug_struct.arch.debugreg[7] = dr7;
match vcpu.set_guest_debug(&debug_struct) {
Ok(()) => {
*armed_slots = requests;
*failures = 0;
true
}
Err(e) => {
if e.errno() == libc::EINTR {
tracing::debug!(
err = %e,
requests = ?requests,
"self_arm_watchpoint: EINTR — will retry next iteration"
);
return false;
}
*failures = failures.saturating_add(1);
tracing::warn!(
err = %e,
requests = ?requests,
failures = *failures,
"self_arm_watchpoint: KVM_SET_GUEST_DEBUG failed"
);
if *failures >= WATCHPOINT_MAX_NON_EINTR_FAILURES {
tracing::warn!(
requests = ?requests,
failures = *failures,
"self_arm_watchpoint: hit retry cap, suppressing further \
attempts; falling back to BPF .bss poll for failure-dump \
trigger"
);
*armed_slots = requests;
}
false
}
}
}
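/// aarch64 variant: programs `DBGWVRn`/`DBGWCRn` pairs via
/// `KVM_SET_GUEST_DEBUG`. Each WVR holds the doubleword-aligned KVA; the
/// matching WCR enables the slot (bit 0, cleared for slots masked out while
/// single-stepping), matches EL1 and EL0 (PAC = 0b11), watches stores only
/// (LSC = 0b10), and selects four bytes at the original offset via BAS.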
#[cfg(target_arch = "aarch64")]
pub(crate) fn self_arm_watchpoint(
vcpu: &mut kvm_ioctls::VcpuFd,
watchpoint: &WatchpointArm,
armed_slots: &mut [u64; 4],
failures: &mut u8,
single_step_pending: bool,
single_step_slot: usize,
armed_single_step: &mut bool,
) -> bool {
if watchpoint.any_armed.load(Ordering::Relaxed) == 0 {
return false;
}
let mut requests = [0u64; 4];
requests[0] = watchpoint.request_kva.load(Ordering::Acquire);
for i in 0..3 {
requests[i + 1] = watchpoint.user[i].request_kva.load(Ordering::Acquire);
}
if requests == *armed_slots && *armed_single_step == single_step_pending {
return false;
}
use kvm_bindings::{
KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_GUESTDBG_USE_HW, kvm_guest_debug,
};
let mut control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW;
if single_step_pending {
control |= KVM_GUESTDBG_SINGLESTEP;
}
let mut debug_struct = kvm_guest_debug {
control,
pad: 0,
arch: kvm_bindings::kvm_guest_debug_arch::default(),
};
let step_mask: u8 = (single_step_slot & 0xF) as u8;
for (i, kva) in requests.iter().enumerate() {
if *kva == 0 {
continue;
}
debug_struct.arch.dbg_wvr[i] = *kva & !0x7u64;
let byte_offset = (*kva & 0x7u64) as u32;
let bas: u64 = 0xFu64 << byte_offset;
let e = if single_step_pending && (step_mask & (1u8 << i)) != 0 {
0u64
} else {
1u64
};
let wcr: u64 = e | (0b11u64 << 1) | (0b10u64 << 3) | (bas << 5);
debug_struct.arch.dbg_wcr[i] = wcr;
}
match vcpu.set_guest_debug(&debug_struct) {
Ok(()) => {
*armed_slots = requests;
*armed_single_step = single_step_pending;
*failures = 0;
true
}
Err(e) => {
if e.errno() == libc::EINTR {
tracing::debug!(
err = %e,
requests = ?requests,
"self_arm_watchpoint: EINTR — will retry next iteration"
);
return false;
}
*failures = failures.saturating_add(1);
tracing::warn!(
err = %e,
requests = ?requests,
failures = *failures,
"self_arm_watchpoint: KVM_SET_GUEST_DEBUG failed"
);
if *failures >= WATCHPOINT_MAX_NON_EINTR_FAILURES {
tracing::warn!(
requests = ?requests,
failures = *failures,
"self_arm_watchpoint: hit retry cap, suppressing further \
attempts; falling back to BPF .bss poll for failure-dump \
trigger"
);
*armed_slots = requests;
*armed_single_step = single_step_pending;
}
false
}
}
}
impl VcpuThread {
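    /// Requests an immediate exit from `KVM_RUN`: sets `immediate_exit`
    /// (only while the thread is alive, so a freed kvm_run mapping is never
    /// written), publishes the store with a release fence, then signals the
    /// thread so a blocking `KVM_RUN` returns with `EINTR`.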
pub(crate) fn kick(&self) {
if let Some(ref ie) = self.immediate_exit
&& self.alive.load(Ordering::Acquire)
{
ie.set(1);
std::sync::atomic::fence(Ordering::Release);
}
self.signal();
}
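    /// Delivers `vcpu_signal()` to the thread via `pthread_kill`,
    /// interrupting a blocking `KVM_RUN`.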
pub(crate) fn signal(&self) {
unsafe {
libc::pthread_kill(self.handle.as_pthread_t() as libc::pthread_t, vcpu_signal());
}
}
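    /// Waits up to `timeout` for the thread to flag `exited`, re-kicking it
    /// every 10 ms from a timerfd so a kick that raced with re-entry into
    /// `KVM_RUN` gets retried.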
pub(crate) fn wait_for_exit(&self, timeout: Duration) {
if self.exited.load(Ordering::Acquire) {
return;
}
let epoll = match Epoll::new() {
Ok(e) => e,
Err(e) => {
tracing::warn!(%e, "wait_for_exit: epoll_create1 failed");
return;
}
};
const EXIT_TOKEN: u64 = 0;
const KICK_TOKEN: u64 = 1;
if let Err(e) = epoll.ctl(
ControlOperation::Add,
self.exit_evt.as_raw_fd(),
EpollEvent::new(EventSet::IN, EXIT_TOKEN),
) {
tracing::warn!(%e, "wait_for_exit: add exit_evt to epoll");
return;
}
let mut kick_timer = match TimerFd::new() {
Ok(t) => t,
Err(e) => {
tracing::warn!(%e, "wait_for_exit: timerfd_create failed");
return;
}
};
let kick_interval = Duration::from_millis(10);
if let Err(e) = kick_timer.reset(kick_interval, Some(kick_interval)) {
tracing::warn!(%e, "wait_for_exit: timerfd_settime failed");
return;
}
if let Err(e) = epoll.ctl(
ControlOperation::Add,
kick_timer.as_raw_fd(),
EpollEvent::new(EventSet::IN, KICK_TOKEN),
) {
tracing::warn!(%e, "wait_for_exit: add timerfd to epoll");
return;
}
let start = Instant::now();
let mut events = [EpollEvent::default(); 2];
loop {
if self.exited.load(Ordering::Acquire) {
return;
}
let elapsed = start.elapsed();
if elapsed >= timeout {
return;
}
let remaining_ms = (timeout - elapsed).as_millis().min(i32::MAX as u128) as i32;
match epoll.wait(remaining_ms, &mut events) {
                Ok(0) => return,
                Ok(n) => {
for ev in &events[..n] {
if ev.data() == KICK_TOKEN {
let _ = kick_timer.wait();
self.kick();
}
}
}
Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
Err(e) => {
tracing::warn!(%e, "wait_for_exit: epoll_wait failed");
return;
}
}
}
}
}
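/// A single `u32` patch into the BPF map whose name ends with
/// `map_name_suffix`, written at byte `offset` of the map value.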
#[derive(Clone)]
pub(crate) struct BpfMapWriteParams {
pub(crate) map_name_suffix: String,
pub(crate) offset: usize,
pub(crate) value: u32,
}
#[cfg(test)]
mod tests {
use super::*;
use crate::vmm::kvm;
use crate::vmm::topology::Topology;
#[test]
fn vcpu_signal_is_sigrtmin() {
let sig = vcpu_signal();
assert!(sig >= libc::SIGRTMIN(), "signal should be >= SIGRTMIN");
assert!(sig <= libc::SIGRTMAX(), "signal should be <= SIGRTMAX");
}
#[test]
fn vcpu_exit_flag_transitions() {
let exited = Arc::new(AtomicBool::new(false));
assert!(
!exited.load(Ordering::Acquire),
"initial state must be false"
);
let exited_clone = Arc::clone(&exited);
let handle = std::thread::spawn(move || {
exited_clone.store(true, Ordering::Release);
});
handle.join().unwrap();
assert!(
exited.load(Ordering::Acquire),
"flag must be true after cross-thread store"
);
}
#[test]
fn duration_to_jiffies_basic() {
assert_eq!(duration_to_jiffies(Duration::from_millis(500), 1000), 500);
assert_eq!(duration_to_jiffies(Duration::from_millis(1500), 1000), 1500);
assert_eq!(duration_to_jiffies(Duration::from_secs(4), 250), 1000);
assert_eq!(duration_to_jiffies(Duration::from_millis(0), 1000), 0);
assert_eq!(duration_to_jiffies(Duration::from_secs(1), 0), 0);
}
#[test]
fn immediate_exit_handle_set_clear() {
let topo = Topology {
llcs: 1,
cores_per_llc: 1,
threads_per_core: 1,
numa_nodes: 1,
nodes: None,
distances: None,
};
let mut vm = kvm::KtstrKvm::new(topo, 64, false).unwrap();
let handle = ImmediateExitHandle::from_vcpu(&mut vm.vcpus[0]);
assert_eq!(
vm.vcpus[0].get_kvm_run().immediate_exit,
0,
"immediate_exit should start at 0"
);
handle.set(1);
assert_eq!(
vm.vcpus[0].get_kvm_run().immediate_exit,
1,
"handle.set(1) should be visible via get_kvm_run()"
);
vm.vcpus[0].set_kvm_immediate_exit(0);
assert_eq!(
vm.vcpus[0].get_kvm_run().immediate_exit,
0,
"set_kvm_immediate_exit(0) should clear the flag"
);
}
#[test]
fn immediate_exit_handle_cross_vcpu() {
let topo = Topology {
llcs: 1,
cores_per_llc: 2,
threads_per_core: 1,
numa_nodes: 1,
nodes: None,
distances: None,
};
let mut vm = kvm::KtstrKvm::new(topo, 64, false).unwrap();
let h0 = ImmediateExitHandle::from_vcpu(&mut vm.vcpus[0]);
let h1 = ImmediateExitHandle::from_vcpu(&mut vm.vcpus[1]);
h0.set(1);
assert_eq!(vm.vcpus[0].get_kvm_run().immediate_exit, 1);
assert_eq!(
vm.vcpus[1].get_kvm_run().immediate_exit,
0,
"setting vcpu0 handle should not affect vcpu1"
);
h1.set(1);
assert_eq!(vm.vcpus[1].get_kvm_run().immediate_exit, 1);
h0.set(0);
h1.set(0);
assert_eq!(vm.vcpus[0].get_kvm_run().immediate_exit, 0);
assert_eq!(vm.vcpus[1].get_kvm_run().immediate_exit, 0);
}
#[test]
fn vcpu_thread_kick_sets_immediate_exit() {
let topo = Topology {
llcs: 1,
cores_per_llc: 1,
threads_per_core: 1,
numa_nodes: 1,
nodes: None,
distances: None,
};
let mut vm = kvm::KtstrKvm::new(topo, 64, false).unwrap();
let ie = ImmediateExitHandle::from_vcpu(&mut vm.vcpus[0]);
ie.set(1);
std::sync::atomic::fence(Ordering::Release);
assert_eq!(
vm.vcpus[0].get_kvm_run().immediate_exit,
1,
"kick pattern should set immediate_exit=1"
);
vm.vcpus[0].set_kvm_immediate_exit(0);
assert_eq!(vm.vcpus[0].get_kvm_run().immediate_exit, 0);
}
#[test]
fn vcpu_thread_kick_skips_ie_when_alive_false() {
use std::sync::Barrier;
register_vcpu_signal_handler();
let topo = Topology {
llcs: 1,
cores_per_llc: 1,
threads_per_core: 1,
numa_nodes: 1,
nodes: None,
distances: None,
};
let mut vm = kvm::KtstrKvm::new(topo, 64, false).unwrap();
let ie = ImmediateExitHandle::from_vcpu(&mut vm.vcpus[0]);
let barrier = Arc::new(Barrier::new(2));
let barrier_thread = barrier.clone();
let probe_vcpu = vm.vcpus.remove(0);
let handle = std::thread::Builder::new()
.name("kick-test-stub".into())
.spawn(move || {
barrier_thread.wait();
probe_vcpu
})
.unwrap();
let exited = Arc::new(AtomicBool::new(false));
let exit_evt = Arc::new(EventFd::new(EFD_NONBLOCK).unwrap());
let alive = Arc::new(AtomicBool::new(false));
let vt = VcpuThread {
handle,
exited,
immediate_exit: Some(ie),
exit_evt,
alive,
};
let read_byte = || vt.immediate_exit.as_ref().unwrap().read_byte();
assert_eq!(read_byte(), 0);
vt.kick();
assert_eq!(
read_byte(),
0,
"kick() must skip ie.set when alive == false (UAF gate)",
);
barrier.wait();
let _ = vt.handle.join();
}
#[test]
fn vcpu_thread_kick_writes_ie_when_alive_true() {
use std::sync::Barrier;
register_vcpu_signal_handler();
let topo = Topology {
llcs: 1,
cores_per_llc: 1,
threads_per_core: 1,
numa_nodes: 1,
nodes: None,
distances: None,
};
let mut vm = kvm::KtstrKvm::new(topo, 64, false).unwrap();
let ie = ImmediateExitHandle::from_vcpu(&mut vm.vcpus[0]);
let barrier = Arc::new(Barrier::new(2));
let barrier_thread = barrier.clone();
let probe_vcpu = vm.vcpus.remove(0);
let handle = std::thread::Builder::new()
.name("kick-test-stub-alive".into())
.spawn(move || {
barrier_thread.wait();
probe_vcpu
})
.unwrap();
let exited = Arc::new(AtomicBool::new(false));
let exit_evt = Arc::new(EventFd::new(EFD_NONBLOCK).unwrap());
let alive = Arc::new(AtomicBool::new(true));
let vt = VcpuThread {
handle,
exited,
immediate_exit: Some(ie),
exit_evt,
alive,
};
let read_byte = || vt.immediate_exit.as_ref().unwrap().read_byte();
assert_eq!(read_byte(), 0);
vt.kick();
assert_eq!(
read_byte(),
1,
"kick() must write ie.set(1) when alive == true",
);
barrier.wait();
let _ = vt.handle.join();
}
#[test]
fn set_rt_priority_applies_when_capable() {
let param = libc::sched_param { sched_priority: 1 };
        let rc = unsafe { libc::sched_setscheduler(0, libc::SCHED_FIFO, &param) };
if rc != 0 {
skip!("no CAP_SYS_NICE capability available");
}
let policy = unsafe { libc::sched_getscheduler(0) };
assert_eq!(policy, libc::SCHED_FIFO);
let mut out_param: libc::sched_param = unsafe { std::mem::zeroed() };
unsafe { libc::sched_getparam(0, &mut out_param) };
assert_eq!(out_param.sched_priority, 1);
let restore = libc::sched_param { sched_priority: 0 };
unsafe { libc::sched_setscheduler(0, libc::SCHED_OTHER, &restore) };
}
#[test]
#[tracing_test::traced_test]
fn set_rt_priority_warns_without_cap() {
let probe = libc::sched_param { sched_priority: 1 };
let rc = unsafe { libc::sched_setscheduler(0, libc::SCHED_FIFO, &probe) };
if rc == 0 {
let restore = libc::sched_param { sched_priority: 0 };
unsafe { libc::sched_setscheduler(0, libc::SCHED_OTHER, &restore) };
skip!("CAP_SYS_NICE present — cannot exercise warn path");
}
set_rt_priority(1, "test-thread");
assert!(
logs_contain("need CAP_SYS_NICE"),
"warn event must include the 'need CAP_SYS_NICE' hint \
so operators reading stderr know what permission to \
grant",
);
assert!(
logs_contain("SCHED_FIFO"),
"warn event must name the policy whose attachment failed",
);
assert!(
logs_contain("test-thread"),
"warn event must name the label so operators can attribute \
the warning to a specific vCPU / monitor / watchdog thread",
);
}
}