ktstr 0.5.2

Test harness for Linux process schedulers
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
//! Host-side per-vCPU hardware performance counters via
//! `perf_event_open(2)` with `exclude_host=1`.
//!
//! The PMU is a partitioned resource: when a counter is configured with
//! `exclude_host=1`, the kernel toggles the EVENTSEL_ENABLE bit on/off
//! at every guest entry and exit so the counter only ticks while the
//! vCPU thread is executing inside the guest. Verified at
//! `arch/x86/events/intel/core.c:5080` (`if (event->attr.exclude_host)
//! arr[idx].host &= ~ARCH_PERFMON_EVENTSEL_ENABLE;`) and
//! `arch/x86/events/intel/core.c:5090-5093` (`core_pmu_enable_event`
//! skips enable when `exclude_host` is set). KVM's
//! `intel_guest_get_msrs` is what flips the bit at VMENTER/VMEXIT.
//!
//! Practical use: combined with per-CPU schedstat (run_delay,
//! sched_count, ttwu_count) this distinguishes the three "vCPU is
//! consuming time" failure modes that look identical from the
//! host-stat view alone:
//!   - IPC ≈ 0 → guest is halted / idle (CPU not really busy).
//!   - IPC ≈ 0.5 → guest is spinning (lock contention, `cpu_relax`).
//!   - IPC ≥ 1 → guest is doing productive work.
//!
//! Cost is zero in-guest: hardware PMU counters are partitioned at the
//! VMCS/VMCB level. The host-side cost is one read per (vCPU, counter)
//! per monitor tick — a single `read(2)` syscall returning 24 bytes.
//!
//! Per-fd `read()` returns three u64s when
//! `PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING`
//! is set in `read_format` (see `tools/include/uapi/linux/perf_event.h`
//! `enum perf_event_read_format`). When `time_enabled > time_running`
//! the kernel multiplexed the counter; the consumer scales raw counts
//! by `time_enabled / time_running` to recover the "what would have
//! been counted if pinned" estimate. We store all three values raw —
//! scaling is the consumer's choice.

#![cfg(target_os = "linux")]

use std::io;
use std::os::fd::{AsRawFd, FromRawFd, OwnedFd};

use perf_event_open_sys as pes;
use pes::bindings::{
    PERF_COUNT_HW_BRANCH_MISSES, PERF_COUNT_HW_CACHE_MISSES, PERF_COUNT_HW_CPU_CYCLES,
    PERF_COUNT_HW_INSTRUCTIONS, PERF_FORMAT_TOTAL_TIME_ENABLED, PERF_FORMAT_TOTAL_TIME_RUNNING,
    PERF_TYPE_HARDWARE, perf_event_attr,
};

/// One per-vCPU sample for the four hardware counters. Fields are raw
/// values as returned by `read(fd, ...)`; scaling against
/// `time_enabled` / `time_running` is the consumer's choice (see the
/// module-level doc on multiplexing).
///
/// `time_enabled_ns` / `time_running_ns` are taken from the cycles
/// counter's read (`VcpuPerfCounters::read` discards the pairs
/// returned by the other three fds). All four events target the same
/// thread and are read back-to-back on the same monitor tick, so
/// consumers that scale should apply this single pair uniformly to
/// all four raw counts.
#[derive(Debug, Clone, Copy, Default, serde::Serialize, serde::Deserialize)]
#[non_exhaustive]
pub struct VcpuPerfSample {
    /// `PERF_COUNT_HW_CPU_CYCLES` raw count.
    pub cycles: u64,
    /// `PERF_COUNT_HW_INSTRUCTIONS` raw count.
    pub instructions: u64,
    /// `PERF_COUNT_HW_CACHE_MISSES` raw count.
    pub cache_misses: u64,
    /// `PERF_COUNT_HW_BRANCH_MISSES` raw count.
    pub branch_misses: u64,
    /// Wall-clock ns the event was enabled (attached to the task and
    /// eligible for a PMU slot). When equal to `time_running_ns` the
    /// counter was never multiplexed.
    pub time_enabled_ns: u64,
    /// ns the counter was actually scheduled on a hardware PMU slot.
    /// `< time_enabled_ns` indicates multiplexing — multiply raw
    /// counts by `time_enabled_ns / time_running_ns` to recover the
    /// scaled estimate.
    pub time_running_ns: u64,
}

impl VcpuPerfSample {
    /// Instructions retired per CPU cycle — the standard IPC metric,
    /// `instructions / cycles`. Returns `0.0` when `cycles == 0`
    /// (counter wasn't running or the guest never executed). Scaling
    /// by `time_enabled / time_running` beforehand is the caller's
    /// choice, but for this ratio it is a no-op: numerator and
    /// denominator would be multiplied by the same factor.
    pub fn ipc(&self) -> f64 {
        match self.cycles {
            0 => 0.0,
            cycles => self.instructions as f64 / cycles as f64,
        }
    }
}

/// Four `perf_event_open` file descriptors targeting one vCPU thread
/// (cycles, instructions, cache-misses, branch-misses) — all
/// configured with `exclude_host=1` so they only count while the
/// vCPU is executing inside the guest.
///
/// Each fd is a standalone event leader (`open_one` passes
/// `group_fd = -1`), not a perf event group, so reads are four
/// independent syscalls. `Drop` closes the four fds via `OwnedFd`.
pub struct VcpuPerfCounters {
    // Field order matches the open order in `open()`.
    cycles: OwnedFd,
    instructions: OwnedFd,
    cache_misses: OwnedFd,
    branch_misses: OwnedFd,
}

impl VcpuPerfCounters {
    /// Open all four counters bound to `tid` — the Linux thread ID
    /// (what `gettid()` returns inside the vCPU thread, NOT a
    /// `pthread_t`). `cpu` is passed as `-1` so each counter follows
    /// the thread across host CPUs.
    ///
    /// Returns `Err` when `perf_event_open(2)` fails for any of the
    /// four counters; the first failure short-circuits, and fds
    /// already opened are closed by `OwnedFd`'s `Drop`. Typical
    /// errnos: `EACCES` (perf_event_paranoid too high or CAP_PERFMON
    /// missing), `ENODEV` (counter not available on this hardware),
    /// `ESRCH` (`tid` no longer exists), `EOPNOTSUPP` (running under
    /// a hypervisor that virtualizes the PMU differently —
    /// `exclude_host` may be rejected when KVM is the guest
    /// hypervisor). Caller treats `Err` as "perf data unavailable
    /// for this vCPU" and continues without it.
    pub fn open(tid: libc::pid_t) -> io::Result<Self> {
        // Struct-literal fields evaluate top to bottom, so both the
        // open order and the error short-circuit order are identical
        // to opening the counters one statement at a time.
        Ok(Self {
            cycles: open_one(tid, PERF_COUNT_HW_CPU_CYCLES as u64)?,
            instructions: open_one(tid, PERF_COUNT_HW_INSTRUCTIONS as u64)?,
            cache_misses: open_one(tid, PERF_COUNT_HW_CACHE_MISSES as u64)?,
            branch_misses: open_one(tid, PERF_COUNT_HW_BRANCH_MISSES as u64)?,
        })
    }

    /// Read all four counters into a single [`VcpuPerfSample`].
    /// Returns `Err` only if a `read(2)` syscall fails (the kernel
    /// shouldn't error on a properly opened perf fd; surfaced for
    /// diagnostic visibility). Each read yields 24 bytes —
    /// `[value, time_enabled, time_running]` as native-endian u64s —
    /// because `read_format` was set to
    /// `PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING`;
    /// `read_one` reports a short read as an error. The
    /// enabled/running pair stored in the sample is the one returned
    /// by the cycles fd; the other three pairs are discarded.
    pub fn read(&self) -> io::Result<VcpuPerfSample> {
        let (cycles, time_enabled_ns, time_running_ns) = read_one(&self.cycles)?;
        let mut sample = VcpuPerfSample {
            cycles,
            time_enabled_ns,
            time_running_ns,
            ..Default::default()
        };
        sample.instructions = read_one(&self.instructions)?.0;
        sample.cache_misses = read_one(&self.cache_misses)?.0;
        sample.branch_misses = read_one(&self.branch_misses)?.0;
        Ok(sample)
    }
}

/// Open one `PERF_TYPE_HARDWARE` counter (`config` selects which)
/// bound to thread `tid`, with `exclude_host=1`, returning the
/// owning fd.
fn open_one(tid: libc::pid_t, config: u64) -> io::Result<OwnedFd> {
    let mut attr = perf_event_attr {
        type_: PERF_TYPE_HARDWARE,
        size: std::mem::size_of::<perf_event_attr>() as u32,
        config,
        // Ask the kernel to append time_enabled/time_running to every
        // read so the consumer can detect and compensate multiplexing.
        read_format: (PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING) as u64,
        ..Default::default()
    };
    // disabled=0 — start counting immediately on open. No gating with
    // PERF_EVENT_IOC_ENABLE: the monitor opens these on a
    // steady-state vCPU that is already running.
    attr.set_disabled(0);
    // exclude_host=1 — see module-level doc. The verified
    // arch/x86/events/intel/core.c paths zero EVENTSEL_ENABLE during
    // host execution and KVM toggles it on guest entry/exit.
    attr.set_exclude_host(1);
    // Count guest userspace AND guest kernel: together with
    // exclude_host=1 this means "everything that runs inside the VM,
    // regardless of CPL".
    attr.set_exclude_user(0);
    attr.set_exclude_kernel(0);
    // exclude_hv=0 — no hypervisor below us in the typical
    // baremetal-host case; setting this would be wrong only where the
    // host kernel itself runs as a guest of a higher hypervisor (rare
    // on the dev/CI hosts ktstr targets).
    attr.set_exclude_hv(0);
    // pinned=0 — share PMU slots with other tools (perf record,
    // bpftop). Multiplexing surfaces as time_running < time_enabled
    // and the consumer scales accordingly.
    attr.set_pinned(0);
    // SAFETY: `attr` is a fully initialized perf_event_attr that the
    // syscall only inspects. cpu=-1 means "follow the thread",
    // group_fd=-1 makes this a standalone leader, flags=0. The
    // returned fd is validated below.
    let raw = unsafe { pes::perf_event_open(&mut attr, tid, -1, -1, 0) };
    if raw < 0 {
        return Err(io::Error::last_os_error());
    }
    // SAFETY: the kernel handed us a fresh valid fd that nothing else
    // owns, so transferring ownership into OwnedFd is sound.
    Ok(unsafe { OwnedFd::from_raw_fd(raw) })
}

/// Read one perf fd's `read_format` record into
/// `(value, time_enabled, time_running)`.
fn read_one(fd: &OwnedFd) -> io::Result<(u64, u64, u64)> {
    let mut record = [0u64; 3];
    let want = std::mem::size_of_val(&record);
    // SAFETY: `record` is a valid writable region of `want` (24)
    // bytes and read(2) writes at most `want` bytes into it; the
    // return value is checked below.
    let got = unsafe {
        libc::read(
            fd.as_raw_fd(),
            record.as_mut_ptr().cast::<libc::c_void>(),
            want,
        )
    };
    if got < 0 {
        return Err(io::Error::last_os_error());
    }
    // The kernel returns the whole read_format record or fails, so a
    // short read is an error condition, not something to retry.
    if (got as usize) < want {
        return Err(io::Error::new(
            io::ErrorKind::UnexpectedEof,
            format!("short read on perf fd: got {got} bytes, expected {want}"),
        ));
    }
    Ok((record[0], record[1], record[2]))
}

/// All-vCPU perf counter capture. The monitor thread opens one
/// [`VcpuPerfCounters`] per vCPU TID at startup, then reads all of
/// them on every sample. `Drop` closes every fd through `OwnedFd`.
pub struct PerfCountersCapture {
    /// One counter set per vCPU. Index = vCPU id, i.e. the position
    /// of the corresponding TID in the slice passed to `open`.
    pub per_vcpu: Vec<VcpuPerfCounters>,
}

impl PerfCountersCapture {
    /// Open counters for every TID in `tids`, where `tids[i]` is the
    /// Linux TID of vCPU `i` (BSP or AP). The first open failure
    /// short-circuits; fds opened before it are closed by
    /// `OwnedFd`'s `Drop` as the partial collection drops.
    ///
    /// Caller decides what to do on partial failure. The current
    /// monitor integration falls back to "no perf data" wholesale —
    /// individual per-vCPU error tracking would clutter the
    /// `CpuSnapshot.perf` field's "all-or-nothing per sample"
    /// invariant.
    pub fn open(tids: &[libc::pid_t]) -> io::Result<Self> {
        // Collecting into io::Result<Vec<_>> stops at the first Err,
        // matching a manual push loop's short-circuit behavior.
        let per_vcpu = tids
            .iter()
            .map(|&tid| VcpuPerfCounters::open(tid))
            .collect::<io::Result<Vec<_>>>()?;
        Ok(Self { per_vcpu })
    }

    /// Read every vCPU's counter set into a `Vec<VcpuPerfSample>`
    /// indexed by vCPU. Read errors propagate as `Err`; the caller
    /// drops the whole sample rather than keeping a partial vec —
    /// a half-populated vec would produce misleading deltas in any
    /// timeline analysis.
    pub fn read_all(&self) -> io::Result<Vec<VcpuPerfSample>> {
        let mut samples = Vec::with_capacity(self.per_vcpu.len());
        for counters in &self.per_vcpu {
            samples.push(counters.read()?);
        }
        Ok(samples)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// `perf_event_open(2)` requires either `CAP_PERFMON` or
    /// `kernel.perf_event_paranoid <= 2` for hardware counters bound
    /// to a non-self thread. The CI runner always runs as root so
    /// neither gate fires; in this unit test we exercise the
    /// caller-side error propagation by opening against the current
    /// thread's tid (which always has permission to monitor itself
    /// regardless of paranoid level).
    #[test]
    fn open_self_then_read_returns_consistent_fields() {
        // Read this thread's tid via the gettid syscall — libc
        // versions older than 0.2.156 don't expose `libc::gettid` as
        // a stable function on every target, so use the syscall
        // wrapper directly.
        let tid = unsafe { libc::syscall(libc::SYS_gettid) } as libc::pid_t;
        let counters = match VcpuPerfCounters::open(tid) {
            Ok(c) => c,
            Err(e) => {
                // Skip when the kernel rejected the open — covers
                // CI containers that disable perf_event_open or
                // kernels missing the requested counter (e.g. some
                // VM-on-VM nesting drops PERF_COUNT_HW_BRANCH_MISSES).
                eprintln!("perf_event_open unavailable in this env: {e}; skipping");
                return;
            }
        };
        // Burn ~1M loop iterations of real work so the counters have
        // something to tick on; the volatile read keeps the loop from
        // being optimized away. (With exclude_host=1 on baremetal the
        // counts stay 0 regardless — hence the structural checks
        // below instead of nonzero assertions.)
        let mut acc: u64 = 0;
        for i in 0u64..1_000_000 {
            unsafe { std::ptr::read_volatile(&i) };
            acc = acc.wrapping_add(i);
        }
        std::hint::black_box(acc);

        let sample = counters.read().expect("read perf counters");
        // exclude_host=1 means *zero* host-side counts — when this
        // test runs on baremetal (not inside a VM), every counter
        // should be 0 because no guest entry happened. We verify
        // the structural invariants instead: each counter read
        // succeeded, time_running <= time_enabled, and the values
        // are <= u64::MAX (trivially true; here for documentation).
        assert!(
            sample.time_running_ns <= sample.time_enabled_ns,
            "time_running ({}) > time_enabled ({})",
            sample.time_running_ns,
            sample.time_enabled_ns,
        );
        // Cycles fit in u63 in any sane scenario — guard against an
        // accidental sign-extension bug in the read path.
        assert!(sample.cycles < (1u64 << 63));
    }

    #[test]
    fn ipc_zero_when_cycles_zero() {
        let s = VcpuPerfSample::default();
        assert_eq!(s.ipc(), 0.0);
    }

    #[test]
    fn ipc_computes_instructions_over_cycles() {
        let s = VcpuPerfSample {
            cycles: 200,
            instructions: 100,
            ..Default::default()
        };
        assert!((s.ipc() - 0.5).abs() < 1e-9);
    }

    // -- Verdict API integration coverage -------------------------------
    //
    // The three IPC-band classifications named in the module-level doc
    // (≈0 = halted, ≈0.5 = spinning, ≥1 = productive) become Verdict
    // claims when a scenario test asserts on a captured VcpuPerfSample.
    // Pin the integration shape so a future change to `ipc()`'s
    // numerator-denominator order or the f64 precision routing does
    // not silently produce passing claims that should fail.

    /// Authorial claim: a productive vCPU has IPC ≥ 1.0 — pin the
    /// claim! macro routing through ClaimBuilder<f64>::at_least and
    /// the Verdict accumulator. Failing-branch siblings cover the
    /// idle and spinning cases.
    #[test]
    fn vcpu_perf_sample_ipc_productive_claim_passes() {
        use crate::assert::Verdict;
        let s = VcpuPerfSample {
            cycles: 1_000,
            instructions: 1_500,
            ..Default::default()
        };
        let mut v = Verdict::new();
        let ipc = s.ipc();
        crate::claim!(v, ipc).at_least(1.0);
        crate::claim!(v, ipc).is_finite();
        let r = v.into_result();
        assert!(
            r.passed,
            "productive IPC=1.5 must satisfy at_least(1.0): {:?}",
            r.details,
        );
    }

    /// Idle / halted vCPU: cycles=0 produces ipc()=0.0 by the
    /// guard. A claim demanding ipc ≥ 1.0 must fail with a labeled
    /// detail naming the threshold.
    #[test]
    fn vcpu_perf_sample_idle_ipc_fails_productive_claim() {
        use crate::assert::Verdict;
        let s = VcpuPerfSample::default();
        let ipc = s.ipc();
        let mut v = Verdict::new();
        crate::claim!(v, ipc).at_least(1.0);
        let r = v.into_result();
        assert!(!r.passed, "idle vCPU's ipc=0 must fail at_least(1.0)");
        let msg = &r.details[0].message;
        assert!(msg.contains("at least 1"), "msg must name threshold: {msg}");
        assert!(msg.contains("ipc"), "msg must include the label: {msg}");
    }

    /// Multiplexed counter: time_running < time_enabled means the
    /// PMU was multiplexed. A scenario test that asserts the
    /// counter wasn't multiplexed can claim
    /// `time_enabled == time_running` via `claim!(eq)`. Pin the
    /// failure path — multiplexing surfaces with the observed and
    /// expected values both visible.
    #[test]
    fn vcpu_perf_sample_multiplex_detect_via_eq_claim() {
        use crate::assert::Verdict;
        let s = VcpuPerfSample {
            cycles: 500,
            instructions: 800,
            cache_misses: 10,
            branch_misses: 2,
            time_enabled_ns: 1_000_000,
            time_running_ns: 600_000,
        };
        let mut v = Verdict::new();
        crate::claim!(v, s.time_running_ns).eq(s.time_enabled_ns);
        let r = v.into_result();
        assert!(!r.passed, "multiplexed sample must fail eq claim");
        let msg = &r.details[0].message;
        assert!(
            msg.contains("expected 1000000"),
            "msg must reflect expected value: {msg}",
        );
        assert!(
            msg.contains("was 600000"),
            "msg must reflect observed value: {msg}",
        );
    }
}