ktstr 0.6.0

Test harness for Linux process schedulers
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
//! [`SnapshotError`] (every fallible accessor's structured error) plus
//! its [`std::fmt::Display`] impl and [`SnapshotResult`] alias. Lives in its own
//! file so the variant catalogue is easy to scan when adding a new
//! accessor — `cargo doc` surfaces the same single-page view as the
//! source.

use super::HEX_KEY_PREFIX;

// ---------------------------------------------------------------------------
// Missing-stats reason
// ---------------------------------------------------------------------------

/// Why a sample's `stats` slot is unavailable — carried on
/// [`SnapshotError::MissingStats`] so operator diagnostics name
/// the specific failure mode rather than the generic "stats
/// absent". Built by [`From<&crate::vmm::sched_stats::SchedStatsError>`]
/// for the relay-failure path, plus dedicated variants for the
/// pre-client gates that the `crate::vmm::SchedStatsError` enum doesn't
/// cover (no scheduler binary configured).
///
/// A second conversion from `&anyhow::Error` downcasts to the typed
/// `SchedStatsError` when one is present and otherwise falls back to
/// [`MissingStatsReason::NoScheduler`] carrying the error's rendered
/// text. The `Display` impl renders each variant as a single-line,
/// operator-facing message.
#[derive(Debug, Clone, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
#[non_exhaustive]
pub enum MissingStatsReason {
    /// No `scheduler_binary` was configured on the run, so the
    /// freeze coordinator never wired a `crate::vmm::SchedStatsClient`.
    /// Every periodic sample bypasses the stats request entirely
    /// and lands here.
    NoSchedulerBinary,
    /// The guest relay never connected to the scheduler's Unix
    /// socket (no scheduler running, or the scheduler refused the
    /// connection).
    ///
    /// `reason` is the free-form explanation. It is either copied
    /// from the typed `SchedStatsError::NoScheduler` or, on the
    /// `&anyhow::Error` fallback path, the rendered text of an error
    /// that could not be downcast to a typed variant.
    NoScheduler { reason: String },
    /// The host-side coordinator marked the run as freezing while
    /// this stats request was in flight (or about to start);
    /// scx_stats responses are undefined while the scheduler's
    /// userspace thread is paused.
    DuringFreeze,
    /// The run-wide cancel flag was set (watchdog fired or the
    /// run is shutting down) while this stats request was in
    /// flight or about to start.
    Cancelled,
    /// The scheduler returned a non-zero `errno` in the typed
    /// `crate::vmm::StatsResponse` envelope. The `args` payload is preserved
    /// so operators can render the scheduler-side message.
    SchedulerError { errno: i32, args: serde_json::Value },
    /// The typed envelope was decoded but the inner `args` map
    /// did not contain the expected `"resp"` key — protocol
    /// mismatch with the scheduler.
    ///
    /// `args` is the envelope payload that lacked the key, kept so
    /// the rendered message can show what was actually received.
    MissingResp { args: serde_json::Value },
    /// The caller passed a stats request larger than the client's
    /// `crate::vmm::sched_stats::MAX_REQUEST_BYTES` cap.
    ///
    /// `size` is the offending request size and `max` the cap, both
    /// rendered in bytes by the `Display` impl.
    RequestTooLarge { size: usize, max: usize },
    /// The scheduler's response grew past
    /// `crate::vmm::sched_stats::MAX_RESPONSE_BYTES` without ever emitting a newline.
    ///
    /// `size` is the response size observed and `max` the cap, both
    /// rendered in bytes by the `Display` impl.
    ResponseTooLarge { size: usize, max: usize },
    /// The shared response mutex was poisoned by a previous
    /// panic; the stats client cannot recover for this sample.
    MutexPoisoned,
}

impl std::fmt::Display for MissingStatsReason {
    /// Render the reason as a single-line, operator-facing message.
    ///
    /// Variants without a payload emit a fixed string; variants with
    /// a payload interpolate it (`serde_json::Value` payloads are
    /// rendered through their own `Display`).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            // Payload-free variants: the message is a constant, so it
            // is written straight through without the format machinery.
            Self::NoSchedulerBinary => f.write_str("no scheduler_binary configured for this run"),
            Self::DuringFreeze => {
                f.write_str("stats request cancelled — freeze coordinator paused the scheduler")
            }
            Self::Cancelled => f.write_str(
                "stats request cancelled — run-wide cancel flag was set (watchdog or shutdown)",
            ),
            Self::MutexPoisoned => f.write_str("stats client response mutex was poisoned"),

            // Payload-carrying variants.
            Self::NoScheduler { reason } => {
                write!(f, "guest relay reports no scheduler: {reason}")
            }
            Self::SchedulerError { errno, args } => {
                write!(f, "scheduler returned errno={errno} (args={args})")
            }
            Self::MissingResp { args } => {
                write!(f, "scheduler envelope missing 'resp' key (args={args})")
            }
            Self::RequestTooLarge { size, max } => {
                write!(f, "stats request {size} bytes exceeds {max}-byte cap")
            }
            Self::ResponseTooLarge { size, max } => {
                write!(f, "stats response {size} bytes exceeds {max}-byte cap")
            }
        }
    }
}

impl From<&anyhow::Error> for MissingStatsReason {
    /// Classify an `anyhow` error as a [`MissingStatsReason`].
    ///
    /// When the error downcasts to a typed
    /// `crate::vmm::sched_stats::SchedStatsError`, the result is that
    /// error's dedicated conversion. Any other error (serde, IO, or
    /// anything else that surfaced through the same
    /// `Result<_, anyhow::Error>` return) becomes
    /// [`MissingStatsReason::NoScheduler`] with `reason` set to the
    /// error's rendered `Display` text.
    fn from(e: &anyhow::Error) -> Self {
        match e.downcast_ref::<crate::vmm::sched_stats::SchedStatsError>() {
            Some(typed) => Self::from(typed),
            None => Self::NoScheduler {
                reason: e.to_string(),
            },
        }
    }
}

impl From<&crate::vmm::sched_stats::SchedStatsError> for MissingStatsReason {
    /// Map each typed `SchedStatsError` variant onto its
    /// [`MissingStatsReason`] counterpart.
    ///
    /// The mapping is one-to-one: scalar payloads are copied, owned
    /// payloads (`reason`, `args`) are cloned, and `Poisoned` is
    /// surfaced as [`MissingStatsReason::MutexPoisoned`].
    fn from(e: &crate::vmm::sched_stats::SchedStatsError) -> Self {
        use crate::vmm::sched_stats::SchedStatsError as Src;
        // Match on the pointee: `Copy` payloads bind by value, while
        // owned payloads are borrowed via `ref` and cloned below.
        match *e {
            Src::NoScheduler { ref reason } => Self::NoScheduler {
                reason: reason.clone(),
            },
            Src::DuringFreeze => Self::DuringFreeze,
            Src::Cancelled => Self::Cancelled,
            Src::SchedulerError { errno, ref args } => Self::SchedulerError {
                errno,
                args: args.clone(),
            },
            Src::MissingResp { ref args } => Self::MissingResp { args: args.clone() },
            Src::RequestTooLarge { size, max } => Self::RequestTooLarge { size, max },
            Src::ResponseTooLarge { size, max } => Self::ResponseTooLarge { size, max },
            Src::Poisoned => Self::MutexPoisoned,
        }
    }
}

// ---------------------------------------------------------------------------
// Excluded map payload
// ---------------------------------------------------------------------------

/// One captured map that the KVA-whitelist filter rejected.
/// Payload for [`SnapshotError::ActiveFilterExcludedMaps::excluded_maps`].
/// The `map_kva` field name matches
/// [`crate::monitor::dump::FailureDumpMap::map_kva`] (the
/// source-of-truth field), and a `map_kva == 0` here flags a
/// capture where the per-map KVA was not recorded (synthetic
/// fixture or capture-path bug — production captures filter zero
/// KVAs out at the walker level).
///
/// [`SnapshotError`]'s `Display` impl renders each entry as
/// `<name>@<map_kva in hex>`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
#[non_exhaustive]
pub struct ExcludedMap {
    /// Name of the captured map that was rejected.
    pub name: String,
    /// KVA recorded for the map at capture time; `0` means no KVA
    /// was recorded for it.
    pub map_kva: u64,
}

// ---------------------------------------------------------------------------
// Error type
// ---------------------------------------------------------------------------

/// Reason a snapshot accessor or terminal read could not resolve.
///
/// Returned by every fallible accessor (`Snapshot::map`,
/// `SnapshotEntry::get`, `SnapshotField::as_u64`, …) so a missing
/// field, type mismatch, or absent map surfaces as a structured
/// error the test author can `?`-propagate. Each variant carries
/// the path / alternatives needed to fix the call site without
/// re-running the test.
///
/// The `Display` impl renders every variant as an operator-facing
/// message that interpolates the variant's fields.
#[derive(Debug, Clone, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
#[non_exhaustive]
pub enum SnapshotError {
    /// No map matched the requested name. `available` enumerates
    /// the captured map names so a typo surfaces in test output.
    MapNotFound {
        /// Map name the caller asked for.
        requested: String,
        /// Names of the maps that were captured.
        available: Vec<String>,
    },
    /// No top-level global variable matched the requested name in
    /// any `*.bss` / `*.data` / `*.rodata` global-section map.
    /// `available` lists the union of every section's top-level
    /// member names.
    VarNotFound {
        /// Global variable name the caller asked for.
        requested: String,
        /// Top-level member names found across the global-section maps.
        available: Vec<String>,
    },
    /// More than one global-section map exposes a top-level member
    /// with the requested name, so [`super::Snapshot::var`] cannot pick a
    /// deterministic answer. `found_in` lists every map (in capture
    /// order) where the name was seen — the caller should disambiguate
    /// via [`super::Snapshot::map`] and walk into the named map directly
    /// (e.g. `snap.map("scx_obj.bss")?.at(0).get("nr_cpus")`).
    AmbiguousVar {
        /// Global variable name the caller asked for.
        requested: String,
        /// Maps (in capture order) that each expose the name.
        found_in: Vec<String>,
    },
    /// A path component did not match any
    /// `crate::monitor::btf_render::RenderedValue::Struct` member at that depth. `requested`
    /// is the user-supplied lookup string; `walked` is the prefix
    /// that resolved successfully; `component` is the failing
    /// segment; `available` lists the struct's actual member names.
    FieldNotFound {
        requested: String,
        walked: String,
        component: String,
        available: Vec<String>,
    },
    /// A path component reached a non-Struct value where a struct
    /// was expected (e.g. descending into a `Uint` leaf).
    /// `requested` is the user-supplied lookup string; `kind` names
    /// the actual variant for diagnostics.
    NotAStruct {
        requested: String,
        /// Path prefix that resolved before the failing component.
        walked: String,
        /// Path segment at which the non-Struct value was reached.
        component: String,
        kind: String,
    },
    /// A typed accessor (`as_u64` etc.) was called on a rendered
    /// shape it cannot decode (e.g. `as_str` on a `Struct`).
    /// `expected` names the scalar type the accessor requires;
    /// `actual` names the rendered variant; `requested` is the
    /// user-supplied lookup string (empty when the accessor was
    /// invoked on a leaf without a path walk).
    TypeMismatch {
        expected: String,
        actual: String,
        requested: String,
    },
    /// A map index was out of range for the underlying entry list.
    IndexOutOfRange {
        /// Name of the map being indexed.
        map: String,
        /// Index the caller asked for.
        index: usize,
        /// Length of the entry list (rendered as "length {len}").
        len: usize,
    },
    /// A per-CPU slot was out of range or unmapped.
    PerCpuSlot {
        /// Name of the per-CPU map being read.
        map: String,
        /// CPU the caller asked for.
        cpu: u32,
        /// Number of per-CPU slots present; only shown in the
        /// out-of-range message.
        len: usize,
        /// `true` selects the "slot is unmapped (None)" message;
        /// `false` selects the "out of range" message.
        unmapped: bool,
    },
    /// A predicate-based lookup (`find`, `max_by`) found no match.
    /// `len` is the number of entries the lookup traversed before
    /// giving up; `available_keys` is a small sample (up to
    /// `NO_MATCH_KEY_SAMPLE` entries) of rendered keys seen during
    /// the traversal so an operator can distinguish "empty map"
    /// (`len == 0`) from "populated map with no predicate hit"
    /// (`len > 0`) and inspect the sample to debug the predicate.
    /// Keys are rendered via `crate::monitor::btf_render::RenderedValue`'s `Display` impl and
    /// each is capped at `NO_MATCH_KEY_CHAR_CAP` chars with an
    /// ellipsis to keep the failure message readable for wide struct
    /// keys.
    ///
    /// Aggregation methods (`max_by`, `cpu_max_u64` / `cpu_min_u64`
    /// / `cpu_max_f64` / `cpu_min_f64`) produce this variant for
    /// empty / all-None inputs; their NoMatch always carries
    /// `len == 0` and empty `available_keys`. Only `find` can
    /// produce `len > 0` here.
    NoMatch {
        /// Name of the map the lookup ran against.
        map: String,
        /// Name of the operation that found no match; rendered
        /// verbatim as "{op} matched …".
        op: String,
        len: usize,
        available_keys: Vec<String>,
    },
    /// A path string contained an empty component (e.g. `"a..b"`).
    /// `requested` is the user-supplied lookup string.
    EmptyPathComponent { requested: String },
    /// `EntryAccessor::get` was called on a per-CPU entry without
    /// narrowing to a CPU first via [`super::SnapshotMap::cpu`].
    PerCpuNotNarrowed { map: String },
    /// Hash entry has no rendered key/value side (BTF type id was
    /// missing at capture time, leaving the hex bytes only).
    NoRendered { map: String, side: String },
    /// The sample's underlying `crate::monitor::dump::FailureDumpReport`
    /// is a placeholder produced by
    /// `crate::monitor::dump::FailureDumpReport::placeholder` —
    /// the freeze-rendezvous path could not collect real data
    /// (typical cause: vCPU rendezvous timed out). Temporal
    /// patterns in [`crate::assert::temporal`] route this variant
    /// through their per-sample skip handling so a placeholder
    /// sample never falsely registers as zero progress against a
    /// monotonicity / rate / steady / ratio band. The `reason`
    /// string mirrors `FailureDumpReport::scx_walker_unavailable`
    /// when present (set by `placeholder()` to the constructor
    /// argument), giving the operator the cause without re-walking
    /// the report.
    PlaceholderSample { tag: String, reason: String },
    /// A [`SampleSeries::stats`](crate::scenario::sample::SampleSeries::stats)
    /// projection ran on a sample whose `stats` field carries an
    /// `Err` — the stats client was not wired (no
    /// `scheduler_binary`) or the per-sample stats request failed.
    /// The carried [`MissingStatsReason`] identifies the *why* so
    /// operator diagnostics distinguish "no scheduler configured"
    /// from "scheduler refused the request" from "watchdog
    /// cancelled the request" without re-walking the source error.
    /// Distinguishes a per-sample stats coverage gap from an
    /// in-stats-JSON path miss (`TypeMismatch` /
    /// `FieldNotFound`) so the temporal-assertion site can
    /// branch on the cause without re-walking the source.
    MissingStats {
        /// Tag of the sample whose stats are absent.
        tag: String,
        /// Typed cause, rendered in parentheses after "stats absent".
        reason: MissingStatsReason,
    },
    /// A [`SampleSeries::host`](crate::scenario::sample::SampleSeries::host)
    /// projection ran on a sample whose `per_cpu_time` slice did
    /// not include `cpu` — placeholder report (freeze rendezvous
    /// timed out), or a kernel that didn't surface per-CPU
    /// `kernel_stat`/`tick_cpu_sched`/`kernel_cpustat` resolution
    /// for the requested CPU. Distinguishes a per-sample host-data
    /// coverage gap from a kernel-walker failure (`Unavailable` on
    /// the broader Snapshot accessor) so the temporal-assertion
    /// site can decide whether to fail strict or skip with a
    /// rendered Note.
    HostFieldUnavailable { tag: String, cpu: u32 },
    /// [`super::Snapshot::var`] / [`super::Snapshot::live_var`] /
    /// [`super::Snapshot::map`] was called on a snapshot whose
    /// underlying `crate::monitor::dump::FailureDumpReport` is a
    /// placeholder (the freeze-rendezvous path could not collect
    /// real data — typical cause: vCPU rendezvous timed out). The
    /// captured `report.maps` is empty by construction so the
    /// var/map lookup has nothing to walk. Distinct from
    /// [`Self::VarNotFound`] (which means "the captured report did
    /// not contain a global by this name") so the assertion site
    /// can distinguish "freeze failed" from "typo in field name".
    /// `tag` carries the capture tag (if any).
    PlaceholderSnapshot { tag: Option<String> },
    /// [`super::Snapshot::active`] / [`super::Snapshot::live_var`]
    /// could not identify a currently-active scheduler from the
    /// snapshot's `*scx_root` + `prog_runtime_stats`. Typical
    /// causes: snapshot taken in the dead window between
    /// [`crate::scenario::ops::Op::DetachScheduler`] +
    /// [`crate::scenario::ops::Op::AttachScheduler`]; snapshot
    /// taken in the post-swap settle window before the new
    /// scheduler's progs have advanced their run counter; snapshot
    /// captured before any scheduler attached. Distinct from
    /// [`Self::AmbiguousVar`] (which means "the snapshot has
    /// multiple scheduler bss copies and the call did not opt
    /// into active-only filtering") so the assertion site can
    /// distinguish "no scheduler is running right now" from
    /// "multiple are running, pick one".
    NoActiveScheduler { reason: String },
    /// [`super::Snapshot::var`] / [`super::Snapshot::map`] (or one
    /// of the `live_*` shortcuts) ran against an active-filtered
    /// view where the KVA whitelist excluded EVERY captured map
    /// that shared the active obj prefix (i.e. the admitted set
    /// for this obj was empty). Distinct from [`Self::VarNotFound`]
    /// — `VarNotFound` means "the active filter admitted maps but
    /// none carry the requested name"; this variant means "the
    /// active filter admitted zero maps for this obj, so the
    /// lookup never got the chance to walk anything."
    ///
    /// The variant never fires when at least one captured
    /// `<active_obj>.*` map passes the KVA whitelist — in that
    /// case the lookup miss is a real typo or absent symbol and
    /// the standard `VarNotFound` / `MapNotFound` carries the
    /// admitted list. This narrow firing scope prevents
    /// false-positives that would otherwise mask genuine typos
    /// in same-binary post-swap captures.
    ///
    /// Typical causes when this DOES fire: stale walker capture
    /// (captured KVAs predate the most recent struct_ops swap),
    /// same-binary post-swap window where the report still
    /// carries the old instance's maps, or a walker bug that
    /// resolved `*scx_root` against a different binary's map set.
    ActiveFilterExcludedMaps {
        /// User-supplied lookup string (the `var` / `map`
        /// argument). For [`super::Snapshot::live_vars_via`] this
        /// carries the joined name list `"[a, b, c]"`.
        requested: String,
        /// Obj name the active filter pinned to
        /// (`*scx_root → struct_ops map → obj prefix` resolution).
        active_obj: String,
        /// Maps captured under the active obj prefix that the KVA
        /// whitelist rejected.
        excluded_maps: Vec<ExcludedMap>,
        /// KVA whitelist the walker populated for the active obj.
        /// A non-empty set whose every entry mismatched the
        /// captured `map_kva` values points at stale capture or
        /// KVA aliasing; an empty set is unreachable through this
        /// variant (no filter means no exclusion).
        whitelist_kvas: Vec<u64>,
    },
    /// A walker-resolved [`crate::scenario::sample::SampleSeries::bpf_live_u64`]
    /// / `bpf_live_i64` / `bpf_live_f64` projection detected that
    /// the snapshot's per-snapshot walker output
    /// ([`crate::monitor::dump::FailureDumpReport::active_map_kvas`])
    /// disagrees with an earlier same-phase snapshot's walker
    /// output for the same lookup. The framework pins the first
    /// non-empty walker output it sees per phase and surfaces this
    /// variant for every later same-phase snapshot whose walker
    /// resolved to a different KVA set — without this gate the
    /// projected series would silently switch between bss copies
    /// mid-phase (typical cause: post-`Op::ReplaceScheduler` swap
    /// window where the walker re-publishes mid-phase) and
    /// downstream reducers like
    /// [`crate::assert::temporal::SeriesField::counter_delta_per_phase`]
    /// would see non-monotonic counter values. The drifted
    /// samples become per-sample `Err` slots; the temporal
    /// patterns' standard error-skip semantics apply.
    WalkerDriftedWithinPhase {
        /// Phase within which the drift was detected.
        phase: crate::assert::Phase,
        /// KVA set pinned by an earlier snapshot in the same phase.
        pinned_kvas: Vec<u64>,
        /// KVA set this sample's lookup resolved against.
        sample_kvas: Vec<u64>,
        /// User-supplied lookup string.
        requested: String,
    },
    /// A user-supplied projection closure (the kind passed to
    /// [`crate::scenario::sample::SampleSeries::bpf`]) signalled
    /// failure for reasons that don't fit the structured variants
    /// above. `reason` is the closure's free-form explanation —
    /// "lookup returned None for sched_id A, B, C" — so the failure
    /// message stays diagnostic without forcing the closure to
    /// synthesize an `available: Vec<String>` it cannot populate.
    ///
    /// Closures should reach for the structured variants
    /// ([`Self::VarNotFound`], [`Self::MapNotFound`], etc.) when
    /// they can; this variant is the escape hatch for higher-level
    /// disambiguation logic (e.g. "I walked vars(name) and none of
    /// the candidates matched my active-instance fingerprint").
    /// Surfaces in temporal-assertion failure messages as
    /// `projection failed: <reason>`.
    ProjectionFailed { reason: String },
}

impl std::fmt::Display for SnapshotError {
    /// Render the error as an operator-facing diagnostic.
    ///
    /// Every variant interpolates its own fields; list-valued fields
    /// are rendered with `{:?}` (KVA lists with `{:#x?}`). Three
    /// variants choose between message shapes:
    ///
    /// * `PerCpuSlot` — "unmapped" vs "out of range", keyed on
    ///   `unmapped`.
    /// * `NoMatch` — empty map, populated map without sample keys,
    ///   or populated map with sample keys (plus a BTF hint when
    ///   every sampled key starts with `HEX_KEY_PREFIX`).
    /// * `ActiveFilterExcludedMaps` — appends a "Likely cause"
    ///   sentence derived from the excluded maps' KVAs.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            SnapshotError::MapNotFound {
                requested,
                available,
            } => {
                write!(
                    f,
                    "snapshot has no map '{requested}' (captured maps: {available:?})"
                )
            }
            SnapshotError::VarNotFound {
                requested,
                available,
            } => {
                write!(
                    f,
                    "snapshot has no global variable '{requested}' in any \
                     *.bss/*.data/*.rodata map (available globals: {available:?})"
                )
            }
            SnapshotError::AmbiguousVar {
                requested,
                found_in,
            } => {
                write!(
                    f,
                    "snapshot global '{requested}' is ambiguous (found in \
                     {found_in:?}); use Snapshot::active().var(name) (or the \
                     shorthand Snapshot::live_var(name)) to pick the active \
                     scheduler's copy automatically, or Snapshot::map(name) \
                     to address a specific scheduler's bss explicitly"
                )
            }
            SnapshotError::FieldNotFound {
                requested,
                walked,
                component,
                available,
            } => {
                write!(
                    f,
                    "path '{requested}': component '{component}' (after walking '{walked}') \
                     not found (members at this depth: {available:?})"
                )
            }
            SnapshotError::NotAStruct {
                requested,
                walked,
                component,
                kind,
            } => {
                write!(
                    f,
                    "path '{requested}': component '{component}' (after walking '{walked}') \
                     expected a Struct, got {kind}"
                )
            }
            SnapshotError::TypeMismatch {
                expected,
                actual,
                requested,
            } => {
                write!(
                    f,
                    "path '{requested}': cannot read as {expected} — actual rendered \
                     variant is {actual}"
                )
            }
            SnapshotError::IndexOutOfRange { map, index, len } => {
                write!(f, "map '{map}': index {index} out of range (length {len})")
            }
            SnapshotError::PerCpuSlot {
                map,
                cpu,
                len,
                unmapped,
            } => {
                if *unmapped {
                    write!(f, "map '{map}': cpu {cpu} per-CPU slot is unmapped (None)")
                } else {
                    write!(
                        f,
                        "map '{map}': cpu {cpu} out of range (have {len} per-CPU slots)"
                    )
                }
            }
            SnapshotError::NoMatch {
                map,
                op,
                len,
                available_keys,
            } => {
                if *len == 0 {
                    write!(f, "map '{map}': {op} matched no entries (map is empty)")
                } else if available_keys.is_empty() {
                    write!(
                        f,
                        "map '{map}': {op} matched none of {len} entries (sample keys unavailable)"
                    )
                } else {
                    write!(
                        f,
                        "map '{map}': {op} matched none of {len} entries (first {sampled}: {available_keys:?})",
                        sampled = available_keys.len(),
                    )?;
                    // The `hex:` prefix is only ever produced by
                    // `render_entry_key`'s fallback path when the
                    // entry's `key` field was `None` at capture time.
                    // Typed `RenderedValue::Display` does not emit
                    // this prefix for any scalar variant; `Struct`
                    // emits `TypeName{...}` inline or `TypeName:`
                    // breadcrumb, where a `hex:` collision would
                    // require a BTF struct literally named `hex` —
                    // no real kernel scheduler does that. The hint
                    // therefore fires only when BTF was uniformly
                    // absent for this map's key type at capture time,
                    // and names the kernel-side fix so the operator
                    // does not have to reverse-engineer the `hex:`
                    // discriminator.
                    if available_keys.iter().all(|k| k.starts_with(HEX_KEY_PREFIX)) {
                        write!(
                            f,
                            " (BTF missing at capture — keys shown as hex bytes; \
                             rebuild guest kernel with CONFIG_DEBUG_INFO_BTF=y for \
                             typed keys)"
                        )?;
                    }
                    Ok(())
                }
            }
            SnapshotError::EmptyPathComponent { requested } => {
                write!(
                    f,
                    "path '{requested}' has an empty component (consecutive '.')"
                )
            }
            SnapshotError::PerCpuNotNarrowed { map } => {
                write!(
                    f,
                    "map '{map}': per-CPU entry without a CPU narrow — call .cpu(N) first"
                )
            }
            SnapshotError::NoRendered { map, side } => {
                write!(
                    f,
                    "map '{map}': {side} has no rendered structure (no BTF type at capture time)"
                )
            }
            SnapshotError::PlaceholderSample { tag, reason } => {
                write!(
                    f,
                    "sample '{tag}' is a placeholder report (capture pipeline did not land): \
                     {reason}"
                )
            }
            SnapshotError::MissingStats { tag, reason } => {
                write!(f, "sample '{tag}': stats absent ({reason})")
            }
            SnapshotError::HostFieldUnavailable { tag, cpu } => {
                write!(
                    f,
                    "sample '{tag}': per_cpu_time has no entry for cpu {cpu} \
                     (placeholder report or kernel-walker resolution failure)"
                )
            }
            SnapshotError::PlaceholderSnapshot { tag } => match tag {
                Some(t) => write!(
                    f,
                    "snapshot '{t}' is a placeholder — the freeze-rendezvous \
                     path could not capture real data; no maps to walk"
                ),
                None => f.write_str(
                    "snapshot is a placeholder — the freeze-rendezvous path \
                     could not capture real data; no maps to walk",
                ),
            },
            SnapshotError::NoActiveScheduler { reason } => {
                write!(
                    f,
                    "snapshot has no currently-active scheduler ({reason}); \
                     use Snapshot::vars(name) to enumerate every observed \
                     copy explicitly, Snapshot::live_var(name) to keep the \
                     typed error path while opting into the active filter, \
                     or Snapshot::map(\"<obj>.<section>\") to address a \
                     specific scheduler's bss directly"
                )
            }
            SnapshotError::ActiveFilterExcludedMaps {
                requested,
                active_obj,
                excluded_maps,
                whitelist_kvas,
            } => {
                // One `name@0xKVA` token per excluded map.
                let excluded_rendered = excluded_maps
                    .iter()
                    .map(|m| format!("{}@{:#x}", m.name, m.map_kva))
                    .collect::<Vec<_>>()
                    .join(", ");
                // A zero KVA means the capture recorded no KVA for
                // that map; a non-zero KVA missing from the whitelist
                // means capture and walker disagree.
                let some_zero = excluded_maps.iter().any(|m| m.map_kva == 0);
                let some_alias = excluded_maps
                    .iter()
                    .any(|m| m.map_kva != 0 && !whitelist_kvas.contains(&m.map_kva));
                let cause = match (some_zero, some_alias) {
                    (false, true) => {
                        "this snapshot pre-dates your most recent \
                         Op::ReplaceScheduler / Op::AttachScheduler — \
                         wait for the next periodic boundary (or re-run \
                         the test) so the walker re-publishes the live \
                         scheduler's KVAs"
                    }
                    (true, false) => {
                        "the captured maps have no recorded KVAs — \
                         the snapshot pre-dates the walker plumbing, \
                         or the capture path failed to record per-map KVAs"
                    }
                    (true, true) => {
                        "some captured maps lack KVAs and some disagree \
                         with the walker's whitelist — both \
                         pre-walker-capture state and a post-swap window \
                         can produce this; re-run the test to regenerate \
                         the snapshot"
                    }
                    (false, false) => "captured KVAs were neither absent nor in disagreement",
                };
                // The cause is its own sentence. Splicing it directly
                // after the map list would glue it onto the last
                // map's hex KVA and make the diagnostic unreadable.
                write!(
                    f,
                    "snapshot lookup '{requested}' returned no hits under the \
                     active filter (obj='{active_obj}'): the walker's KVA \
                     whitelist {whitelist_kvas:#x?} excluded {n} captured map(s) \
                     sharing the obj prefix: {excluded_rendered}. \
                     Likely cause: {cause}. \
                     Reach for Snapshot::vars('{requested}') to enumerate every \
                     copy across all obj prefixes, or Snapshot::map(\"<name>\") \
                     to address one of the excluded maps directly.",
                    n = excluded_maps.len(),
                )
            }
            SnapshotError::WalkerDriftedWithinPhase {
                phase,
                pinned_kvas,
                sample_kvas,
                requested,
            } => {
                write!(
                    f,
                    "walker drift within {phase:?}: lookup '{requested}' resolved against \
                     KVA set {sample_kvas:#x?}, but an earlier same-phase snapshot pinned \
                     {pinned_kvas:#x?}. The walker re-published mid-phase (typical cause: \
                     a post-Op::ReplaceScheduler swap window). The drifted sample is \
                     surfaced as Err so per-phase reducers (counter_delta_per_phase, \
                     ratio_across_phases) see monotonic Ok-sequences from one walker \
                     decision; address by stepping the phase past the swap settle window \
                     or by reading via the explicit picker form."
                )
            }
            SnapshotError::ProjectionFailed { reason } => {
                write!(f, "projection failed: {reason}")
            }
        }
    }
}

// Marker impl: the body is empty, so every provided method of
// `std::error::Error` keeps its default (in particular `source()`
// returns `None`). The human-readable message comes from the
// `Display` impl directly above.
impl std::error::Error for SnapshotError {}

/// Result alias for snapshot accessors.
///
/// `SnapshotResult<T>` is `Ok(T)` on success and
/// `Err(`[`SnapshotError`]`)` on failure. The alias names
/// `std::result::Result` by its full path, so it does not depend on
/// which `Result` happens to be in scope at this point in the file.
pub type SnapshotResult<T> = std::result::Result<T, SnapshotError>;

/// Typed shape of one entry drained from the snapshot bridge's
/// ordered per-tag store.
///
/// Each entry bundles the captured report with the tag it was
/// stored under, the scheduler-side stats (or the reason they are
/// missing), and the optional timing / phase stamps recorded at
/// capture time. See the per-field docs below for details.
///
/// Used by [`crate::scenario::snapshot::SnapshotBridge::drain_ordered_with_stats`]
/// and [`crate::scenario::sample::SampleSeries::from_drained_typed`].
/// `#[non_exhaustive]` so future additive fields stay
/// pattern-match-compatible via rest-pattern destructure
/// (`DrainedSnapshotEntry { tag, report, .. }`).
#[derive(Debug)]
#[non_exhaustive]
pub struct DrainedSnapshotEntry {
    /// Snapshot name the report was stored under.
    pub tag: String,
    /// `crate::monitor::dump::FailureDumpReport` of the captured
    /// guest state.
    pub report: crate::monitor::dump::FailureDumpReport,
    /// Scheduler-side stats JSON (`Ok`), or a typed
    /// [`MissingStatsReason`] (`Err`) when capture happened without
    /// a wired stats client.
    pub stats: std::result::Result<serde_json::Value, MissingStatsReason>,
    /// Optional wall-clock anchor (ms since run-start).
    pub elapsed_ms: Option<u64>,
    /// Scenario phase index stamped at capture time.
    ///
    /// `Some(idx)` for captures stored via the step-aware entry
    /// points ([`crate::scenario::snapshot::SnapshotBridge::capture_with_step`]
    /// or [`crate::scenario::snapshot::SnapshotBridge::store_with_stats_and_step`]);
    /// `None` for fixture-injected captures via the unstamped legacy
    /// paths ([`crate::scenario::snapshot::SnapshotBridge::capture`]
    /// / [`crate::scenario::snapshot::SnapshotBridge::store`]
    /// / [`crate::scenario::snapshot::SnapshotBridge::store_with_stats`]).
    pub step_index: Option<u16>,
}

#[cfg(test)]
mod tests_api_gaps {
    use super::*;

    /// Pin: the `Display` rendering of
    /// `SnapshotError::ProjectionFailed { reason }` is exactly
    /// `projection failed: <reason>`, so the temporal-assertion
    /// failure path shows the closure's diagnostic without any
    /// further wrapping. Closure call-sites build this variant when
    /// the structured variants (`VarNotFound`, `MapNotFound`,
    /// `AmbiguousVar`) would need an `available: Vec<String>` that
    /// the closure has no way to fill in.
    #[test]
    fn projection_failed_display_carries_reason() {
        let err = SnapshotError::ProjectionFailed {
            reason: String::from("live_var_via picker rejected all 2 candidates"),
        };
        assert_eq!(
            err.to_string(),
            "projection failed: live_var_via picker rejected all 2 candidates"
        );
    }

    /// Pin: `ProjectionFailed` is covered by the same
    /// `PartialEq` / `Hash` derive set as the other variants — a
    /// clone compares equal to its source and is found in a
    /// `HashSet` holding the source, so callers can assert "yes, my
    /// projection closure failed" directly instead of falling
    /// through to a `_` arm.
    #[test]
    fn projection_failed_eq_and_hash_round_trip() {
        let original = SnapshotError::ProjectionFailed {
            reason: String::from("x"),
        };
        let duplicate = original.clone();
        assert_eq!(original, duplicate);

        // Move the source value into the set; lookup goes through the clone.
        let seen = std::collections::HashSet::from([original]);
        assert!(seen.contains(&duplicate));
    }
}