1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
//! Public [`VmResult`] returned from [`super::KtstrVm::run`], plus
//! the internal [`VmRunState`] passed from `run_vm` to
//! `collect_results` and the [`KvmStatsTotals`] aggregate of per-vCPU
//! KVM counters.
//!
//! The split keeps the result-shaping types independent of the
//! orchestration code (which still lives in [`super::KtstrVm`]). Test
//! code outside `vmm/` constructs `VmResult` literals and reads
//! `KvmStatsTotals` fields, so both types stay public; `VmRunState`
//! is `pub(crate)`-only because it's an implementation detail of the
//! run-then-collect handoff.
use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::AtomicBool;
use std::thread::JoinHandle;
use std::time::{Duration, Instant};
use super::console;
use super::host_comms::BulkDrainResult;
use super::kvm;
use super::pi_mutex::PiMutex;
use super::vcpu::{VcpuThread, WatchpointArm};
use super::virtio_blk::{VirtioBlkCounters, VirtioBlkCountersSnapshot};
use super::virtio_net::{VirtioNetCounters, VirtioNetCountersSnapshot};
use super::wire;
use crate::monitor;
/// Result of a VM execution.
///
/// `Clone` is derived, but the fields fall into two categories with
/// different clone semantics that callers must understand:
///
/// 1. **Pure-data fields** (the bulk of the struct): primitives,
/// `String`, `Vec`, `Option<_>`, plus `MonitorReport` /
/// `BulkDrainResult` / `ProgVerifierStats` / `StimulusEvent` /
/// `KvmStatsTotals` / `VirtioBlkCountersSnapshot` /
/// `VirtioNetCountersSnapshot`. Every clone produces an
/// independent value — mutations to one do not affect the
/// other. The `virtio_blk_counters` / `virtio_net_counters`
/// fields are materialized `*CountersSnapshot` types (atomic
/// loads done at construction time inside
/// `super::KtstrVm::collect_results`), so clones cannot alias
/// live device state.
///
/// 2. **Arc-shared handles** (`snapshot_bridge`, `stats_client`):
/// these wrap `Arc<Mutex<…>>` / `Arc<AtomicUsize>` and clone via
/// shallow refcount bump. Two `VmResult` clones SHARE the
/// underlying store — calling `snapshot_bridge.drain()` on one
/// clone empties the data visible to the other. See each
/// field's own doc for the precise drain / iteration contract.
/// If you need an independent snapshot view, drain into a local
/// `Vec` before cloning the `VmResult`.
#[derive(Debug, Clone)]
pub struct VmResult {
/// Overall success flag: `true` when the test reported a pass AND
/// the VM exited cleanly without crash, timeout, or watchdog.
pub success: bool,
/// Guest exit code as surfaced through the SHM ring
/// (`MSG_TYPE_EXIT`) or COM2 sentinel.
pub exit_code: i32,
/// Wall-clock duration of the VM run.
pub duration: Duration,
/// True when the host hit its watchdog before the guest exited.
pub timed_out: bool,
/// Captured guest stdout (and any non-dmesg serial console content).
pub output: String,
/// Captured guest stderr (separated from `output` when the guest
/// reported them distinctly).
pub stderr: String,
/// Host-side monitor report: sampled per-CPU state, stall
/// verdicts, and SCX event deltas. `None` when the monitor did
/// not run (host-only tests, early VM failure).
pub monitor: Option<monitor::MonitorReport>,
/// TLV messages drained from the guest after VM exit. Merges
/// mid-flight bytes the freeze coordinator pulled off
/// virtio-console port 1 during the run with the final port-1
/// `port1_tx_buf` flush.
pub guest_messages: Option<BulkDrainResult>,
/// Stimulus events extracted from guest TLV entries.
///
/// `#[allow(dead_code)]`: the doc on [`Self::virtio_blk_counters`]
/// describes its own attribute as mirroring this one — the field
/// is public API read from outside the lib build, so the lib
/// build sees no in-tree reader.
#[allow(dead_code)]
pub stimulus_events: Vec<wire::StimulusEvent>,
/// BPF verifier stats collected from host-side memory reads.
pub verifier_stats: Vec<monitor::bpf_prog::ProgVerifierStats>,
/// KVM per-vCPU cumulative stats (requires Linux >= 5.14).
pub kvm_stats: Option<KvmStatsTotals>,
/// Crash message extracted from COM2 output via
/// `crate::test_support::extract_panic_message`. The guest
/// panic hook in `rust_init.rs` writes `PANIC: <info>\n<bt>\n`
/// to `/dev/ttyS1` synchronously inside `KVM_RUN`, so the host
/// captures the full backtrace in `output` even when the guest
/// is wedged. `None` when no `PANIC:`-prefixed line was seen.
pub crash_message: Option<String>,
/// Wall-clock time from BSP exit to the moment
/// `super::KtstrVm::collect_results` finishes assembling
/// [`VmResult`].
/// Records the host-side cost of every teardown step that runs
/// after the guest has stopped advancing: watchdog join, AP joins,
/// monitor join, BPF-writer join, SHM drain, exit/crash-message
/// extraction, and BPF verifier-stat read. Always `Some(_)` for
/// VMs whose `super::KtstrVm::run_vm` returns normally —
/// including the host-watchdog timeout path, because
/// `run_bsp_loop` exits cleanly with `timed_out = true` and
/// `collect_results` still executes, populating the field.
/// `None` only when `run_vm` does not complete (a BSP panic
/// propagated through `?`, or any pre-BSP setup error that
/// returns an `Err` before `VmRunState` is constructed) and on
/// the `test_fixture` / skip-sidecar paths that never boot a VM.
/// Persisted via
/// [`SidecarResult`](crate::test_support::SidecarResult) so stats
/// tooling can flag cleanup regressions across runs.
pub cleanup_duration: Option<Duration>,
/// Host-side virtio-blk device counters, snapshotted after the
/// guest has exited. `Some(_)` when the builder attached a disk
/// via `super::KtstrVmBuilder::disk`; `None` when no disk was
/// configured and `super::KtstrVm::init_virtio_blk` returned
/// `None`. The device increments its internal `AtomicU64`
/// counters from `drain_bracket_impl` (production cfg: on the
/// dedicated `ktstr-vblk` worker thread; cfg(test): inline on
/// the test thread); by the time `collect_results` constructs
/// the [`VmResult`] every vCPU and the worker have joined and
/// no further mutation can occur. The snapshot is taken at that
/// point — readers see plain `u64` fields holding the final
/// cumulative totals; no atomic load is needed on the consumer
/// side.
///
/// The counter struct exposes nine `AtomicU64` fields, each
/// bumped from `drain_bracket_impl` (in `src/vmm/virtio_blk/device.rs`)
/// via the `VirtioBlkCounters::record_*` helpers. Per-request
/// cumulative counters, per-event cumulative counters, and
/// per-request live gauges are kept distinct per the
/// counter-taxonomy doc on `VirtioBlkCounters`:
///
/// - `reads_completed` — count of `VIRTIO_BLK_T_IN` requests
/// that returned `S_OK` to the guest. Bumped together with
/// `bytes_read` per `VirtioBlkCounters::record_read`.
/// - `writes_completed` — count of `VIRTIO_BLK_T_OUT` requests
/// that returned `S_OK`. Bumped together with `bytes_written`.
/// - `flushes_completed` — count of `VIRTIO_BLK_T_FLUSH`
/// requests that returned `S_OK` (real `fdatasync` for
/// read-write disks, no-op for `read_only`).
/// - `bytes_read` — total bytes returned to the guest for
/// completed reads.
/// - `bytes_written` — total bytes accepted from the guest for
/// completed writes.
/// - `throttled_count` — cumulative token-bucket **stall events**
/// for the device's lifetime. The chain is rolled back and
/// the worker arms a retry timerfd; the guest does not see
/// `S_IOERR` for a stall (the request is deferred until the
/// bucket refills). This counter is separate from `io_errors`
/// so operators can distinguish "throttle bucket drained,
/// request deferred" from "real IO problem". Per-event (NOT
/// per-request): a single chain that stalls twice produces
/// two bumps.
/// - `io_errors` — every path that reports `S_IOERR`:
/// spec violations, backend `pread`/`pwrite` errors,
/// malformed chains, `add_used` failures.
/// Stalls do not report `S_IOERR`; see `throttled_count`.
/// - `currently_throttled_gauge` — **live gauge**: how many
/// requests are RIGHT NOW waiting for throttle tokens.
/// Increments when a chain transitions into stalled,
/// decrements on retry success or reset. Bounded at 0 or 1
/// on this single-queue device. NOT cumulative — answers
/// "what's stuck now," distinct from `throttled_count`
/// which answers "how many stall events happened over
/// time."
/// - `invalid_avail_idx_count` — cumulative count of
/// `Error::InvalidAvailRingIndex` events observed by
/// `drain_bracket_impl` (avail.idx more than `queue.size`
/// ahead of `next_avail` — a virtio-v1.2 §2.7.13.3
/// avail.idx-distance violation by the guest). Per-event
/// counter; the `queue_poisoned` flag short-circuits
/// subsequent kicks so one guest fault produces exactly
/// one bump regardless of how many notifications follow
/// before reset.
///
/// Counters are cumulative for the device's lifetime. A guest
/// driver re-bind (writing `STATUS=0` to `VIRTIO_MMIO_STATUS`
/// triggers `VirtioBlk::reset`) does NOT zero them — the
/// device's internal `AtomicU64` storage persists across reset
/// cycles, and the post-exit snapshot captures the final
/// cumulative totals spanning the entire device lifetime, not
/// just a post-reset fragment.
///
/// Reading example:
///
/// ```ignore
/// let r: VmResult = builder.run()?;
/// let c = r.virtio_blk_counters.expect("disk attached");
/// assert!(c.reads_completed > 0);
/// ```
///
/// `#[allow(dead_code)]` mirrors `stimulus_events` above: the
/// field is part of the public API surface and read by user
/// test code outside `lib.rs`, but the lib build doesn't see
/// any in-tree readers because no lib code path calls
/// `.virtio_blk_counters` on a `VmResult`. The in-tree readers
/// live in unit tests.
#[allow(dead_code)]
pub virtio_blk_counters: Option<VirtioBlkCountersSnapshot>,
/// Host-side virtio-net device counters, snapshotted after the
/// guest has exited. `Some(_)` when the builder attached a
/// network via `super::KtstrVmBuilder::network`; `None` when
/// no network was configured and
/// `super::KtstrVm::init_virtio_net` returned `None`. The
/// device increments its internal `AtomicU64` counters on the
/// vCPU thread inside `process_tx_loopback`; by the time
/// `collect_results` constructs the [`VmResult`] every vCPU has
/// joined and no further mutation can occur. The snapshot is
/// taken at that point — readers see plain `u64` fields holding
/// the final cumulative totals; no atomic load is needed on the
/// consumer side.
///
/// The counter struct exposes eleven `AtomicU64` fields, each
/// bumped from `process_tx_loopback`:
///
/// - `tx_packets` — count of TX chains the device accepted
/// and marked used; advances per parsed chain regardless of
/// downstream RX outcome.
/// - `tx_bytes` — bytes of L2 frame data captured from
/// successfully parsed TX chains (excludes the 12-byte
/// virtio header).
/// - `rx_packets` / `rx_bytes` — count + bytes of RX chains
/// successfully written and marked used. In v0's pure-
/// loopback mode the steady-state expectation is
/// `rx_packets == tx_packets - tx_dropped_no_rx_buffer`;
/// asymmetric counts surface RX-side breakage.
/// - `tx_dropped_no_rx_buffer` — successfully-captured TX
/// frames the device could not deliver because the RX queue
/// was empty (back-pressure event).
/// - `tx_chain_invalid` / `rx_chain_invalid` — chains rejected
/// for malformed shape (short header, wrong direction,
/// attacker-controlled descriptor address overflow).
/// - `rx_write_failed` — RX chain whose shape was valid but
/// whose guest-memory `write_slice` (header or frame) hit
/// an unmapped GPA. Distinct from `rx_chain_invalid` so an
/// operator can tell "guest violated the RX descriptor-
/// direction rule" from "guest posted a buffer at an
/// unmapped GPA"; the two are mutually exclusive per chain.
/// - `tx_add_used_failures` / `rx_add_used_failures` —
/// `add_used` failures, indicating the queue's used-ring
/// address itself is unmapped or otherwise inaccessible.
/// Distinct from the `*_chain_invalid` / `rx_write_failed`
/// counters so an operator can tell "guest sent malformed
/// frame" / "guest's posted buffer GPA was unmapped" from
/// "queue itself is broken".
/// - `invalid_avail_idx_count` — cumulative count of
/// `Error::InvalidAvailRingIndex` events observed by
/// `process_tx_loopback` (avail.idx more than `queue.size`
/// ahead of `next_avail` — virtio-v1.2 §2.7.13.3 violation
/// by the guest). Per-event counter; the per-queue
/// `queue_poisoned` flag short-circuits subsequent kicks
/// so one guest fault produces exactly one bump regardless
/// of how many notifications follow before reset.
///
/// Counters are cumulative for the device's lifetime — a guest
/// driver re-bind (writing `STATUS=0`) does NOT zero them.
///
/// NOTE(review): `#[allow(dead_code)]` is presumably here for the
/// same reason as on [`Self::virtio_blk_counters`] (public field
/// with no reader inside the lib build) — confirm.
#[allow(dead_code)]
pub virtio_net_counters: Option<VirtioNetCountersSnapshot>,
/// Snapshot bridge populated by the freeze coordinator over the
/// run's lifetime. Every `Op::CaptureSnapshot` and `Op::WatchSnapshot`
/// fire stores a `FailureDumpReport` keyed by its tag.
///
/// `#[ktstr_test]` test bodies whose scenario fires snapshot
/// ops in the guest assert on the captured reports through a
/// `post_vm = NAME` attribute. The named callback runs on the
/// HOST after `vm.run()` returns (see
/// [`crate::test_support::KtstrTestEntry::post_vm`]) and
/// receives `&VmResult`; it calls
/// [`crate::scenario::snapshot::SnapshotBridge::drain`] on
/// this field to take ownership of the stored reports and
/// walks them — typically through
/// [`crate::scenario::snapshot::Snapshot::new`] for typed
/// access to map values, per-CPU entries, and scalar
/// variables. Out-of-tree consumers can drain the bridge the
/// same way: `VmResult` is in `ktstr::prelude`.
///
/// Always present after a successful `run_vm` — the field is
/// not an `Option`, so "no snapshots" is represented by an
/// empty bridge (for example when the VM crashed before any
/// snapshot fired) rather than by `None`.
///
/// **Drained by `evaluate_vm_result`**: the framework's
/// `crate::test_support::eval` path drains this bridge to
/// auto-populate [`crate::assert::ScenarioStats::phases`]
/// before returning the AssertResult. A `post_vm` callback or
/// any code path that runs THROUGH `evaluate_vm_result`
/// observes an empty bridge here — the periodic captures the
/// drain consumed are recovered as the per-phase
/// [`crate::assert::PhaseBucket`] entries on
/// `result.stats.phases`, which is the framework-curated
/// equivalent surface. Integration tests under `tests/` that
/// bypass `evaluate_vm_result` (e.g. `tests/stats_bridge_e2e.rs`,
/// `tests/temporal_assertions_e2e.rs`) see the bridge intact
/// because their entry path never reaches the auto-populate
/// site; those consumers continue to call
/// `result.snapshot_bridge.drain*()` directly without
/// observable contract change.
pub snapshot_bridge: crate::scenario::snapshot::SnapshotBridge,
/// Live scheduler-stats client. `Some(_)` when the run wired the
/// virtio-console port-2 stats bridge (the in-tree path always
/// does so, but tests that construct a [`VmResult`] manually via
/// `Self::test_fixture` leave this `None`). Test code that
/// asserts on scheduler-reported metrics calls
/// `super::SchedStatsClient::stats` /
/// `super::SchedStatsClient::stats_meta` on this handle WHILE
/// the guest is alive — calling after VM exit will time out
/// because the relay thread has already exited. Cloneable;
/// multiple test threads may share the same client.
///
/// NOTE(review): `#[allow(dead_code)]` presumably covers the lack
/// of in-lib readers, as on [`Self::virtio_blk_counters`] —
/// confirm.
#[allow(dead_code)]
pub stats_client: Option<super::SchedStatsClient>,
/// Number of periodic snapshot boundaries the freeze
/// coordinator actually fired during this run. Includes both
/// successful captures and rendezvous-timeout placeholders.
/// Tests can assert `result.periodic_fired >= some_lower_bound`
/// to guard periodic-capture coverage; mismatches against
/// [`Self::periodic_target`] flag missing samples (early VM
/// exit, kill-flag stop, abandoned-after-timeouts).
pub periodic_fired: u32,
/// Configured `num_snapshots` count for the entry that drove
/// this run (mirrors the `KtstrTestEntry::num_snapshots` field
/// the entry was registered with). `0` when periodic capture
/// was disabled. Pairs with [`Self::periodic_fired`] so a
/// test can compute coverage without re-reading the entry
/// table.
pub periodic_target: u32,
/// Runtime virt-KASLR offset (kernel-image slide). Captured
/// from the freeze coordinator's `kern_virt_kaslr` Arc snapshot
/// at run-end via `load(Acquire).saturating_sub(1)`.
/// NOTE(review): the `saturating_sub(1)` suggests the published
/// value is stored biased by +1 so that a raw `0` can mean
/// "never published" — confirm against the publisher.
///
/// `0` means either (a) KASLR was off — test ran with
/// `#[ktstr_test(kaslr = false)]` or
/// `Scheduler::kargs(&["nokaslr"])`, OR (b) the derivation
/// chain (MSR_LSTAR readback in `vmm::x86_64::msr_kaslr` +
/// KERN_ADDRS `_text` path in `freeze_coord::dispatch.rs`) never
/// published a non-zero value (early-boot crash, kallsyms masked
/// by kptr_restrict, FRED-enabled kernel). E2E test consumers
/// distinguish (a) from (b) by reading the test entry's `kaslr`
/// attribute alongside this field — see
/// [`Self::kaslr_enabled`] for the binary-question companion.
pub kern_kaslr_offset: u64,
}
impl VmResult {
    /// Reports whether a non-zero virt-KASLR offset was recorded for
    /// this run, i.e. whether [`Self::kern_kaslr_offset`] is non-zero.
    ///
    /// A `false` result is ambiguous on its own. It covers both
    /// (a) a test that opted out of KASLR via
    /// `#[ktstr_test(kaslr = false)]` or
    /// `Scheduler::kargs(&["nokaslr"])`, and (b) a run where the
    /// derivation chain never published a non-zero value (early-boot
    /// crash, kallsyms masked, kernel built without
    /// `CONFIG_RANDOMIZE_BASE`). E2E consumers tell the two apart by
    /// reading the test entry's `kaslr` attribute next to this method.
    ///
    /// Use this for the yes/no "did KASLR happen?" question; read
    /// [`Self::kern_kaslr_offset`] directly for exact-offset
    /// assertions (alignment, entropy range, and so on).
    pub fn kaslr_enabled(&self) -> bool {
        // The offset is unsigned, so "greater than zero" is the same
        // test as "not zero".
        self.kern_kaslr_offset > 0
    }

    /// Convenience for the common `post_vm` pattern: drain the
    /// snapshot bridge with `drain_ordered_with_stats`, pair the
    /// drained captures with a clone of the monitor report through
    /// `SampleSeries::from_drained_typed`, and keep only the periodic
    /// samples via `periodic_only()`.
    ///
    /// The drain is destructive — the bridge hands out each capture
    /// exactly once — so a second call on the same [`VmResult`]
    /// produces an empty series. A callback that needs both the raw
    /// drain and a series view should drain the bridge into a local
    /// `Vec` first and build the series itself with
    /// [`crate::scenario::sample::SampleSeries::from_drained_typed`].
    ///
    /// The receiver is `&self` (not `&mut self`) so the method fits
    /// the `#[ktstr_test(post_vm = ...)]` callback signature
    /// (`fn(&VmResult) -> Result<()>`); the bridge's interior
    /// mutability carries the destructive drain.
    pub fn periodic_series(&self) -> crate::scenario::sample::SampleSeries {
        // Drain before cloning the monitor report: this is the same
        // left-to-right order the arguments would be evaluated in if
        // they were written inline.
        let drained = self.snapshot_bridge.drain_ordered_with_stats();
        let monitor_report = self.monitor.clone();
        crate::scenario::sample::SampleSeries::from_drained_typed(drained, monitor_report)
            .periodic_only()
    }

    /// "Nothing happened" fixture for tests that consume a
    /// [`VmResult`] without booting a VM (primarily the
    /// sidecar-write tests in `src/test_support/sidecar.rs`).
    ///
    /// The value models a VM that launched, exited cleanly with exit
    /// code 0 after one second, and produced no telemetry: every
    /// collection is empty, every optional field is `None`, and every
    /// counter is zero. Override individual fields with struct-update
    /// syntax:
    ///
    /// ```ignore
    /// let result = VmResult { success: false, ..VmResult::test_fixture() };
    /// ```
    ///
    /// Compiled only under `#[cfg(test)]`, so it never appears in
    /// release builds; production values come from `run_vm`. See
    /// `sidecar_vm_result_is_test_fixture_boilerplate` in
    /// `test_support/sidecar.rs` for the deduplication that motivated
    /// it (7 identical literal constructions collapsed to a single
    /// call).
    #[cfg(test)]
    pub fn test_fixture() -> Self {
        // Outcome of the pretend run: passed, exit code 0, no watchdog.
        let (success, exit_code, timed_out) = (true, 0, false);
        let duration = Duration::from_secs(1);

        Self {
            success,
            exit_code,
            duration,
            timed_out,
            // No console output of any kind.
            output: String::new(),
            stderr: String::new(),
            // No host-side or guest-side telemetry.
            monitor: None,
            guest_messages: None,
            stimulus_events: Vec::new(),
            verifier_stats: Vec::new(),
            kvm_stats: None,
            crash_message: None,
            cleanup_duration: None,
            // No devices attached.
            virtio_blk_counters: None,
            virtio_net_counters: None,
            // Empty bridge, no stats client, no periodic captures.
            snapshot_bridge: empty_snapshot_bridge_for_tests(),
            stats_client: None,
            periodic_fired: 0,
            periodic_target: 0,
            kern_kaslr_offset: 0,
        }
    }
}
/// Construct a `SnapshotBridge` with nothing stored in it and a
/// capture callback that answers `None` for every request.
///
/// `VmResult::test_fixture` and the older hand-written `VmResult`
/// literals in unit tests call this so they keep compiling now that
/// the struct has a `snapshot_bridge` field. Production `run_vm`
/// builds its own bridge; its callback is intentionally unused there
/// because the freeze coordinator stores reports directly via
/// `bridge.store(name, report)`.
#[cfg(test)]
pub(crate) fn empty_snapshot_bridge_for_tests() -> crate::scenario::snapshot::SnapshotBridge {
    type Callback = crate::scenario::snapshot::CaptureCallback;

    // The annotation drives the coercion of the closure into the
    // callback's trait-object type.
    let never_captures: Callback = Arc::new(|_| None);
    crate::scenario::snapshot::SnapshotBridge::new(never_captures)
}
/// Per-vCPU KVM stats read after VM exit. Each map holds cumulative
/// counter values from the VM's lifetime, keyed by stat name (the
/// same names [`KvmStatsTotals::sum`] and [`KvmStatsTotals::avg`]
/// look up).
///
/// Derives `Default` (no vCPUs recorded) and the serde
/// `Serialize` / `Deserialize` traits.
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct KvmStatsTotals {
/// Per-vCPU stat maps. Index is vCPU id. A stat that was not
/// published for a vCPU is simply absent from that vCPU's map.
pub per_vcpu: Vec<HashMap<String, u64>>,
}
/// KVM stat names surfaced in sidecar output for scheduler testing.
///
/// Covers VM exit rate, halt-polling behavior, preemption
/// notifications, signal-driven exits, and hypercall counts — the
/// stats scheduler authors typically correlate with scx decisions.
///
/// Per-arch availability: `halt_exits`, `preemption_reported`, and
/// `hypercalls` are published by KVM only on x86. On aarch64 the
/// kernel does not expose these stats via `KVM_GET_STATS_FD`; they
/// are absent from the per-vCPU map and read as `0` from
/// [`KvmStatsTotals::sum`] / [`KvmStatsTotals::avg`]. The remaining
/// names (`exits`, `halt_successful_poll`, `halt_attempted_poll`,
/// `halt_wait_ns`, `signal_exits`) are published on both arches.
///
/// NOTE(review): `#[allow(dead_code)]` presumably means the readers
/// of this table live outside the lib build — confirm.
#[allow(dead_code)]
pub const KVM_INTERESTING_STATS: &[&str] = &[
"exits",
"halt_exits",
"halt_successful_poll",
"halt_attempted_poll",
"halt_wait_ns",
"preemption_reported",
"signal_exits",
"hypercalls",
];
impl KvmStatsTotals {
    /// Total of the stat called `name` over every vCPU.
    ///
    /// A vCPU whose map has no entry for `name` contributes nothing,
    /// so a stat that no vCPU reports totals `0`.
    pub fn sum(&self, name: &str) -> u64 {
        let mut total = 0u64;
        for vcpu_stats in &self.per_vcpu {
            if let Some(value) = vcpu_stats.get(name) {
                total += *value;
            }
        }
        total
    }

    /// Mean of the stat called `name` per vCPU, using truncating
    /// integer division.
    ///
    /// The divisor is the number of vCPUs recorded, including any
    /// that do not report `name`. Returns `0` when no vCPUs are
    /// recorded.
    pub fn avg(&self, name: &str) -> u64 {
        match self.per_vcpu.len() as u64 {
            0 => 0,
            vcpu_count => self.sum(name) / vcpu_count,
        }
    }
}
/// State returned by [`super::KtstrVm::run_vm`] after the BSP exits.
/// Passed to [`super::KtstrVm::collect_results`] to produce
/// [`VmResult`].
pub(crate) struct VmRunState {
pub(crate) exit_code: i32,
pub(crate) timed_out: bool,
pub(crate) ap_threads: Vec<VcpuThread>,
pub(crate) monitor_handle: Option<JoinHandle<monitor::reader::MonitorLoopResult>>,
pub(crate) bpf_write_handle: Option<JoinHandle<()>>,
/// Freeze coordinator handle, always `None` in the
/// production path: [`super::KtstrVm::run_vm`] joins the
/// coordinator BEFORE the BSP `VcpuFd` falls out of scope so the
/// coordinator's captured BSP `ImmediateExitHandle` cannot
/// outlive the kvm_run mmap (UAF prevention). The optional shape
/// is preserved so the field stays trivially constructible in
/// any future test-only or alternative-orchestration path that
/// might not perform the early join.
pub(crate) freeze_coordinator: Option<JoinHandle<()>>,
pub(crate) com1: Arc<PiMutex<console::Serial>>,
pub(crate) com2: Arc<PiMutex<console::Serial>>,
pub(crate) kill: Arc<AtomicBool>,
/// Wake fd paired with `kill`. Setters that flip `kill`
/// (`collect_results`, vCPU shutdown classifier, panic hook)
/// also write to this EventFd so any consumer blocked in
/// `epoll_wait` (notably the freeze coordinator and the
/// monitor sampler) wakes within microseconds of the flip
/// rather than waiting up to one full poll interval. The
/// AtomicBool above remains the source of truth — the EventFd
/// is purely a wake signal. EFD_NONBLOCK so a saturated
/// counter never stalls the writer.
pub(crate) kill_evt: Arc<vmm_sys_util::eventfd::EventFd>,
/// Broadcast freeze flag for the failure-dump coordinator. When the
/// coordinator receives a guest-side error-exit signal it sets this
/// to true, kicks every vCPU, waits for all `parked` flags to flip
/// true, and then reads guest BPF map state. Released to false to
/// resume normal execution. Lives alongside `kill` so the same Arc
/// pattern (broadcast + per-vCPU ACK) covers both shutdown and
/// freeze rendezvous.
pub(crate) freeze: Arc<AtomicBool>,
/// Hardware-watchpoint arming state Arc, forwarded so
/// [`super::KtstrVm::collect_results`] can invalidate the
/// `kind_host_ptr` and `request_kva` slots after every vCPU
/// thread joins but BEFORE `vm` drops.
///
/// Without the invalidation, the slots' published values
/// continue to address (a) a host pointer into `vm.guest_mem`'s
/// mapping that becomes unmapped when `vm` drops and (b) a
/// guest KVA whose translation goes through the same mapping.
/// The freeze coordinator joins before `vm` drops in
/// `run_vm`, and AP threads join inside `collect_results` —
/// but defense-in-depth says we zero the slots once every
/// reader is gone so any future restructuring (a stray Arc
/// clone surviving past teardown, a follow-up that adds a
/// new reader path) cannot trip a use-after-free.
///
/// Declared before `vm` so the implicit drop order on
/// `VmRunState` teardown drops `watchpoint` first: any Arc
/// clone outliving the struct can no longer dereference its
/// `kind_host_ptr` after `vm.guest_mem` has unmapped, even if
/// a future caller forgets the explicit pre-drop
/// invalidation in `collect_results`.
pub(crate) watchpoint: Arc<WatchpointArm>,
pub(crate) vm: kvm::KtstrKvm,
/// Captured immediately after the BSP exits its run loop. Subtracted
/// from `Instant::now()` in [`super::KtstrVm::collect_results`]
/// right before the [`VmResult`] is returned to populate
/// [`VmResult::cleanup_duration`]. Records the wall-clock cost of
/// every host-side teardown step that runs after the guest has
/// stopped advancing, in execution order: the watchdog-thread join
/// in [`super::KtstrVm::run_vm`], then the AP-thread joins, the
/// monitor-thread join, the BPF-map-writer join, the SHM-ring
/// drain, the post-exit exit-code/crash-message extraction, and
/// finally the BPF verifier-stat read inside
/// [`super::KtstrVm::collect_results`].
pub(crate) cleanup_start: Instant,
/// Cloned counter handle from [`super::KtstrVm::init_virtio_blk`]
/// when a disk was attached, captured before the device-arc is
/// dropped so [`super::KtstrVm::collect_results`] can snapshot
/// it into [`VmResult::virtio_blk_counters`]. The device worker
/// bumps these atomics from `drain_bracket_impl` (production cfg:
/// dedicated `ktstr-vblk` thread; cfg(test): inline on the test
/// thread); by the time `collect_results` reads this field every
/// vCPU thread has joined upstream, the worker can receive no
/// further kicks, and the conversion site
/// (`run.virtio_blk_counters.as_deref().map(|c| c.snapshot())`)
/// loads the final cumulative state into a plain-u64 snapshot
/// before storing on the public `VmResult`.
pub(crate) virtio_blk_counters: Option<Arc<VirtioBlkCounters>>,
/// Cloned counter handle from [`super::KtstrVm::init_virtio_net`]
/// when a network was attached, captured before the device-arc
/// is dropped so [`super::KtstrVm::collect_results`] can
/// snapshot it into [`VmResult::virtio_net_counters`]. Same
/// Arc-handoff + snapshot-at-assignment pattern as
/// `virtio_blk_counters` above.
pub(crate) virtio_net_counters: Option<Arc<VirtioNetCounters>>,
/// Snapshot bridge owning every report captured during the run.
/// The freeze coordinator clones this bridge into its closure
/// state; on every guest-side
/// [`crate::vmm::wire::MSG_TYPE_SNAPSHOT_REQUEST`] frame the
/// coordinator's TOKEN_TX handler decoded with kind
/// [`crate::vmm::wire::SNAPSHOT_KIND_CAPTURE`], the dispatch runs
/// `freeze_and_capture(false)` and stores the resulting
/// `FailureDumpReport` here keyed by the snapshot name. After
/// VM exit, [`super::KtstrVm::collect_results`] forwards the
/// bridge onto [`VmResult::snapshot_bridge`] so the test code
/// can drain captured snapshots and walk them via the
/// [`crate::scenario::snapshot::Snapshot`] accessor surface.
pub(crate) snapshot_bridge: crate::scenario::snapshot::SnapshotBridge,
/// Cached aarch64 TCR_EL1 register, populated lazily by the BSP
/// once the guest kernel programs the MMU. Always `None` on
/// x86_64 (the register does not exist). Threads that construct
/// a `GuestKernel` for page-table walks (monitor, BPF map writer,
/// freeze coordinator, post-exit verifier-stats collector) read
/// this atomic to feed the granule-agnostic walker (4 KB / 16 KB
/// / 64 KB). A 0 reading on aarch64 means "kernel hasn't reached
/// MMU bring-up yet"; the walker's T1SZ=0 gate rejects walks in
/// that state and the affected lookup returns `None` cleanly.
pub(crate) tcr_el1: Option<Arc<std::sync::atomic::AtomicU64>>,
/// Cached BSP CR3 (x86_64) / TTBR1_EL1 (aarch64), populated lazily
/// by the BSP loop after initial page-table setup. Used by
/// post-exit `GuestKernel` constructions to walk the live page
/// tables for `phys_base` resolution. `0` means the cache wasn't
/// populated (early boot crash); the walk fails and `phys_base`
/// falls back to `0`, which produces correct translations on
/// non-KASLR boots.
pub(crate) cr3: Arc<std::sync::atomic::AtomicU64>,
/// Cached vmlinux bytes for collect_verifier_stats. Avoids
/// re-reading from disk (14-28s on cold cache).
pub(crate) vmlinux_data: Option<Arc<Vec<u8>>>,
/// Pre-built prog accessor from the accessor-init worker.
/// When present, `collect_verifier_stats` skips the ~4s
/// ELF/BTF parse and uses this directly.
pub(crate) prog_accessor: Option<crate::monitor::bpf_prog::GuestMemProgAccessorOwned>,
/// Guest-reported phys_base (biased +1). Used by
/// `collect_verifier_stats` fallback when the pre-built prog
/// accessor is unavailable.
pub(crate) kern_phys_base: u64,
/// Runtime virt-KASLR offset (kernel-image slide), captured from
/// the freeze coordinator's `kern_virt_kaslr` Arc snapshot at run
/// end via `load(Acquire).saturating_sub(1)`. `0` means either
/// (a) KASLR was off (test ran with `#[ktstr_test(kaslr = false)]`
/// or `Scheduler::kargs(&["nokaslr"])`), or (b) the derivation
/// chain (MSR_LSTAR readback at `vmm::x86_64::msr_kaslr` +
/// KERN_ADDRS `_text` path at `freeze_coord::dispatch.rs`) never
/// published a non-zero value (early-boot crash, kallsyms masked
/// by kptr_restrict, FRED-enabled kernel). E2E test consumers
/// distinguish (a) from (b) by asserting against the test entry's
/// `kaslr` attribute. The companion [`Self::kern_phys_base`]
/// carries the kernel-image physical-randomization slide; together
/// they identify the KASLR-randomized kernel layout.
pub kern_kaslr_offset: u64,
/// Virtio-console device shared with vCPU threads. Carries the
/// port-1 (`/dev/vport0p1`) bulk TLV stream from guest to host;
/// `collect_results` calls `drain_bulk()` after the run to feed
/// `parse_tlv_stream` and produce the `BulkDrainResult` that
/// `VmResult.guest_messages` exposes to test verdicts.
pub(crate) virtio_con: Arc<crate::vmm::PiMutex<crate::vmm::virtio_console::VirtioConsole>>,
/// Bulk TLV entries the freeze coordinator parsed from
/// `port1_tx_buf` mid-run. The coord's TOKEN_TX handler reads
/// the device's accumulated bulk bytes, feeds them through
/// [`crate::vmm::bulk::HostAssembler`], and stashes every parsed
/// frame here so [`super::KtstrVm::collect_results`] can merge
/// them into `VmResult::guest_messages` alongside the post-exit
/// `drain_bulk` and the post-mortem SHM CRASH-ring drain.
/// Without this stash every EXIT / TEST / PAYLOAD_METRICS /
/// RAW_PAYLOAD_OUTPUT / PROFRAW frame consumed by the coord
/// would vanish — only the leftover bytes that arrived on
/// `port1_tx_buf` after the coord exited would reach the
/// verdict, and a typical run would surface no metrics.
pub(crate) bulk_messages: Arc<std::sync::Mutex<Vec<crate::vmm::wire::ShmEntry>>>,
/// Scheduler-stats client constructed at the top of `run_vm`,
/// or `None` when the run has no scheduler attached
/// (`scheduler_binary` is `None` on the builder). Forwarded
/// to [`VmResult::stats_client`] so test code can issue
/// `request_raw` / typed `stats` / `stats_meta` calls through
/// the run's lifetime. The drainer thread tears down when the
/// last clone of the client drops; `None` here means no
/// drainer was spawned at all, so the run pays no
/// stats-bridge cost.
pub(crate) stats_client: Option<super::SchedStatsClient>,
/// Periodic captures actually fired by the freeze coordinator
/// during the run (success + timeout-placeholder count).
/// Forwarded to [`VmResult::periodic_fired`] from the run-loop's
/// `next_periodic_idx` final value.
pub(crate) periodic_fired: u32,
/// Configured periodic-snapshot target (mirrors
/// `KtstrVm::num_snapshots`). Forwarded to
/// [`VmResult::periodic_target`] so test code can compute
/// coverage as `fired / target`.
pub(crate) periodic_target: u32,
}
#[cfg(test)]
mod tests {
    // Unit tests for `VmResult`: field round-tripping through struct
    // literals layered over `VmResult::test_fixture()`, the `Clone`
    // bound, and the shared-store behavior of `snapshot_bridge` clones.
    use super::*;

    /// Fields set in a `VmResult` struct literal read back unchanged,
    /// and fields left to `VmResult::test_fixture()` hold the values
    /// asserted below (successful, exit code 0, not timed out, no
    /// monitor report, no guest messages, no stimulus events, no
    /// virtio-blk counters).
    #[test]
    fn vm_result_fields_carry_values() {
        let r = VmResult {
            duration: Duration::from_secs(5),
            output: "hello world".into(),
            stderr: "boot log".into(),
            cleanup_duration: Some(Duration::from_millis(50)),
            ..VmResult::test_fixture()
        };
        assert!(r.success);
        assert_eq!(r.exit_code, 0);
        assert!(!r.timed_out);
        assert_eq!(r.duration, Duration::from_secs(5));
        assert_eq!(r.output, "hello world");
        assert_eq!(r.stderr, "boot log");
        assert!(r.monitor.is_none());
        assert!(r.guest_messages.is_none());
        assert!(r.stimulus_events.is_empty());
        assert_eq!(r.cleanup_duration, Some(Duration::from_millis(50)));
        assert!(r.virtio_blk_counters.is_none());

        // Second construction covers the opposite polarity of
        // every boolean/numeric field so no field is silently
        // dropped by a future refactor that only exercises the
        // success path.
        let r2 = VmResult {
            success: false,
            exit_code: 1,
            duration: Duration::from_millis(500),
            timed_out: true,
            virtio_blk_counters: Some(VirtioBlkCountersSnapshot::default()),
            periodic_fired: 3,
            periodic_target: 7,
            ..VmResult::test_fixture()
        };
        assert!(!r2.success);
        assert_eq!(r2.exit_code, 1);
        assert!(r2.timed_out);
        assert_eq!(r2.duration, Duration::from_millis(500));
        assert!(r2.cleanup_duration.is_none());
        assert_eq!(r2.periodic_fired, 3);
        assert_eq!(r2.periodic_target, 7);

        // Opposite polarity: counters present. Reads must observe
        // the default-zero values for every field — a future field
        // added to VirtioBlkCountersSnapshot that doesn't initialise
        // to 0 would break the "fresh device reports zero activity"
        // contract that VmResult readers rely on. The snapshot was
        // taken from the device's atomic counters at collect_results
        // time, after every vCPU and worker thread joined; readers
        // see plain `u64` field reads with no atomic ordering needed.
        let counters = r2.virtio_blk_counters.as_ref().unwrap();
        assert_eq!(counters.reads_completed, 0);
        assert_eq!(counters.writes_completed, 0);
        assert_eq!(counters.flushes_completed, 0);
        assert_eq!(counters.bytes_read, 0);
        assert_eq!(counters.bytes_written, 0);
        assert_eq!(counters.throttled_count, 0);
        assert_eq!(counters.io_errors, 0);
        assert_eq!(counters.currently_throttled_gauge, 0);
        assert_eq!(counters.invalid_avail_idx_count, 0);
    }

    /// A result built without a monitor report exposes `monitor` as
    /// `None` while the remaining fields stay readable.
    #[test]
    fn vm_result_without_monitor_has_no_samples() {
        let r = VmResult {
            output: "test output".into(),
            ..VmResult::test_fixture()
        };
        assert!(r.monitor.is_none());
        // Output and exit_code must still be accessible.
        assert_eq!(r.output, "test output");
        assert_eq!(r.exit_code, 0);
    }

    /// A monitor report attached to a failing result is readable back
    /// through `VmResult::monitor` with its summary values intact.
    #[test]
    fn vm_result_with_monitor_carries_summary() {
        let summary = monitor::MonitorSummary {
            prog_stats_deltas: None,
            total_samples: 5,
            max_imbalance_ratio: 3.5,
            max_local_dsq_depth: 10,
            stuck_detected: true,
            event_deltas: None,
            schedstat_deltas: None,
            ..Default::default()
        };
        // `summary` is not read again after this point, so it is moved
        // into the report rather than cloned.
        let report = monitor::MonitorReport {
            samples: vec![],
            summary,
            ..Default::default()
        };
        let r = VmResult {
            success: false,
            exit_code: 1,
            duration: Duration::from_millis(500),
            timed_out: true,
            stderr: "kernel panic".into(),
            monitor: Some(report),
            ..VmResult::test_fixture()
        };
        let mon = r.monitor.as_ref().unwrap();
        assert_eq!(mon.summary.total_samples, 5);
        assert!((mon.summary.max_imbalance_ratio - 3.5).abs() < f64::EPSILON);
        assert_eq!(mon.summary.max_local_dsq_depth, 10);
        assert!(mon.summary.stuck_detected);
        assert!(r.timed_out);
        assert_eq!(r.exit_code, 1);
        assert_eq!(r.stderr, "kernel panic");
    }

    /// Compile-time pin that `VmResult: Clone`. A future field
    /// added with a non-Clone type would break the derive at compile
    /// time and break this test's `let _: VmResult = self_clone(&r)`
    /// call. Cheap insurance that nobody silently strips the Clone
    /// derive or adds a non-Clone field.
    #[test]
    fn vm_result_is_clone() {
        // The `T: Clone` bound is what turns a missing `Clone` impl
        // into a compile error at the call site below.
        fn self_clone<T: Clone>(t: &T) -> T {
            t.clone()
        }
        let r = VmResult::test_fixture();
        let _: VmResult = self_clone(&r);
    }

    /// Pin the documented aliasing semantic on the Arc-shared
    /// `snapshot_bridge` field: clones of `VmResult` share the
    /// underlying snapshot store. A future refactor that turned
    /// `SnapshotBridge` into a deep-copy struct would break this
    /// test — at which point the doc paragraph at the head of
    /// `VmResult` must be updated to drop the Arc-shared-handle
    /// category. Loud failure on contract drift, not a silent
    /// behavior change.
    #[test]
    fn vm_result_clone_snapshot_bridge_aliases_via_arc() {
        let r = VmResult::test_fixture();
        let c = r.clone();
        // Pre-condition: both bridges start empty.
        assert_eq!(r.snapshot_bridge.len(), 0);
        assert_eq!(c.snapshot_bridge.len(), 0);
        // Store a synthetic report through ONE clone's bridge.
        r.snapshot_bridge.store(
            "regression_pin",
            crate::monitor::dump::FailureDumpReport::default(),
        );
        // The OTHER clone observes the store — proves the Arc<Mutex<…>>
        // is shared, not deep-copied. If this assertion ever fires,
        // SnapshotBridge's Clone has changed shape and VmResult's
        // doc paragraph must be revisited.
        assert_eq!(
            r.snapshot_bridge.len(),
            c.snapshot_bridge.len(),
            "snapshot_bridge clones must observe the same store \
             per the VmResult Clone contract (Arc-shared handle)"
        );
        assert_eq!(c.snapshot_bridge.len(), 1);
    }
}