//! Task enrichment capture builder for the failure-dump freeze path.
//!
//! Owns the [`crate::monitor::dump::TaskWalkerEntry`] vec plus the
//! sched-class and lock-slowpath registries the per-task enrichment
//! walker needs. The freeze coordinator stack-borrows from a
//! returned [`TaskEnrichmentOwned`] to construct the borrow-only
//! [`crate::monitor::dump::TaskEnrichmentCapture`] passed into
//! `dump_state`.
//!
//! Two-phase task harvest:
//!   1. Primary source — the kernel's global `scx_tasks` LIST_HEAD
//!      (`kernel/sched/ext.c:47`). Every scx-managed task is linked
//!      via `task_struct.scx.tasks_node`. This list outlives per-rq
//!      `runnable_list` because `scx_bypass`
//!      (`kernel/sched/ext.c:5304-5404`) drains runnable_list during
//!      scheduler teardown without touching `scx_tasks`. The
//!      walker filters cursor entries (`SCX_TASK_CURSOR` flag set)
//!      that `scx_task_iter_start`
//!      (`kernel/sched/ext.c:843-846`) inserts.
//!   2. Per-CPU `rq->scx.runnable_list` walk — only used to flag
//!      which tasks were runnable on which CPU at freeze time
//!      (`is_runnable_in_scx: true`) and to stamp `running_pc`
//!      from the matching vCPU's instruction pointer when
//!      `rq->curr == task_kva`. Tasks found ONLY on a runnable_list
//!      (race with global-list linkage) are still included with
//!      the runnable flag set.
//!
//! See [`crate::monitor::task_enrichment`] for the per-task walker
//! semantics this capture feeds.
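//!
//! A minimal sketch of the container_of recovery both phases rely
//! on (variable names mirror the builder below; the arithmetic is
//! the same for `tasks_node` and `runnable_node`):
//!
//! ```ignore
//! // A list node KVA points at the list_head embedded in
//! // task_struct.scx; subtract the node's offset within
//! // task_struct to recover the enclosing task.
//! let node_off = task_offs.scx.checked_add(see_offs.runnable_node)?;
//! let task_kva = node_kva.wrapping_sub(node_off as u64);
//! ```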

use crate::monitor::bpf_map::GuestMemMapAccessorOwned;
use crate::monitor::btf_offsets::{ScxWalkerOffsets, TaskEnrichmentOffsets};
use crate::monitor::dump::TaskWalkerEntry;
use crate::monitor::idr::translate_any_kva;
use crate::monitor::reader::GuestMem;
use crate::monitor::task_enrichment::{LockSlowpathRegistry, SchedClassRegistry};
use crate::vmm::exit_dispatch::VcpuRegSnapshot;

use super::capture_scx::ScxWalkerOwned;

/// Owned data the freeze coordinator stack-allocates to back a
/// [`crate::monitor::dump::TaskEnrichmentCapture`].
pub(crate) struct TaskEnrichmentOwned {
    /// Tasks discovered by walking the kernel's global `scx_tasks`
    /// LIST_HEAD plus every per-CPU `rq->scx.runnable_list`. Each
    /// entry carries the task KVA; `is_runnable_in_scx` is true
    /// when the task appeared on any CPU's runnable_list at freeze
    /// time; `running_pc` is stamped when the task was the curr
    /// task on some vCPU (matched via `rq->curr` against that
    /// vCPU's instruction pointer).
    pub(crate) tasks: Vec<TaskWalkerEntry>,
    /// Cached sched_class symbol KVAs for class decode + the
    /// PI-boost-out-of-SCX flag.
    pub(crate) sched_classes: SchedClassRegistry,
    /// Cached lock-slowpath symbol KVAs for stack-PC pattern
    /// matching.
    pub(crate) lock_slowpaths: LockSlowpathRegistry,
}

/// Maximum entries any single runnable_list walk visits before
/// bailing with a partial result. Mirrors
/// [`crate::monitor::scx_walker`]'s cap so a corrupt-pointer chain
/// (or an attacker-controlled freeze instant) cannot turn the
/// capture builder into an unbounded read loop. 4096 is generous —
/// a real per-CPU runnable_list has at most ~num_threads entries,
/// and that figure stays well below the cap on realistic
/// configurations.
const MAX_NODES_PER_LIST: u32 = 4096;

/// Build the task-enrichment owned-data set when every prerequisite
/// resolves.
///
/// Returns `None` to signal "no capture this freeze." A `None`
/// return propagates to [`crate::monitor::dump::DumpContext::task_enrichment_capture`]
/// being `None`, which leaves `task_enrichments` empty in the report
/// and stamps `task_enrichments_unavailable` with
/// [`crate::monitor::dump::REASON_NO_TASK_WALKER`].
///
/// Phase 1 harvests the kernel's global `scx_tasks` LIST_HEAD via
/// [`crate::monitor::scx_walker::walk_scx_tasks_global`] — the
/// durable task source that survives scheduler teardown. Phase 2
/// walks each per-CPU `rq->scx.runnable_list` only to identify
/// runnable tasks and stamp `running_pc` on the curr task. The
/// `running_pc` slot is set from `vcpu_regs[cpu].instruction_pointer`
/// when `rq->curr == task_kva` — only the curr task has a meaningful
/// PC (every other runnable-but-not-current task would need a
/// kernel-side stack unwinder, which ktstr does not implement).
/// Tasks present in only one source (race window between
/// `scx_init_task` and `list_add_tail` on `scx_tasks`) are still
/// included; tasks on the global list but not on any runnable_list
/// surface with `is_runnable_in_scx: false`.
///
/// Prerequisites that gate the build:
/// - `scx_owned` carries the per-CPU rq KVA + PA arrays.
/// - `scx_walker_offsets` resolves the rq / scx_rq / task / see
///   sub-groups. Per-sub-group BTF absence collapses the whole
///   walker to `None`; container_of math cannot run on partial
///   offsets.
/// - `offsets` (TaskEnrichmentOffsets) is required so the
///   downstream borrow capture is constructible. The walker itself
///   does not read enrichment offsets — that happens in
///   `walk_task_enrichment` during dump_state — but their presence
///   is the gate for the capture's existence.
///
/// `owned_accessor` carries the [`crate::monitor::guest::GuestKernel`]
/// (sched_class + lock-slowpath registry resolution reads its
/// vmlinux symtab); `scx_owned` carries the SCX walker's per-CPU rq
/// arrays + `scx_root_kva`; `offsets` carries the BTF offsets the
/// per-task enrichment walker needs; `vcpu_regs` lets this capture
/// stamp `running_pc` on the curr-task entry by matching `rq->curr`
/// against each vCPU's instruction pointer.
///
/// # Implementer requirements
///
/// 1. **6.12+ kernel compatibility.** Every Option-typed offset on
///    [`TaskEnrichmentOffsets`] (`task_struct_core_cookie` is the
///    canonical example — `CONFIG_SCHED_CORE`-gated) and every
///    Option-typed sub-field on
///    [`crate::monitor::scx_walker::RqScxState`] reachable through
///    `scx_owned` must be guarded with `Option::map`/`if let Some`
///    before reading. A 6.12 kernel built without
///    `CONFIG_SCHED_CORE` lacks `task_struct.core_cookie` BTF
///    encoding; the per-task enrichment must still produce a
///    [`crate::monitor::task_enrichment::TaskEnrichment`] with
///    `core_cookie: None` rather than failing the whole task.
///    Read [`crate::monitor::task_enrichment::walk_task_enrichment`]
///    — it already gates per-Option; preserve that guarantee at
///    the build site (see the sketch after this list).
/// 2. **Tests.** At minimum: happy-path (every offset resolved,
///    every field populated), partial (older-kernel subset of
///    Options None, capture still succeeds with partial data),
///    empty/unmapped (guest memory translate fails, capture returns
///    None gracefully). Pattern: synthetic
///    [`crate::monitor::reader::GuestMem`] buffer +
///    [`crate::monitor::guest::GuestKernel::new_for_test`] with
///    known offsets + assert. See `task_enrichment.rs` tests for
///    the layout pattern.
/// 3. **Serde compat.** New fields landing on
///    [`crate::monitor::task_enrichment::TaskEnrichment`] must use
///    `#[serde(default, skip_serializing_if = ...)]` — old JSON
///    consumers must round-trip.
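///
/// A minimal sketch of requirement 1's Option-gating and
/// requirement 3's serde shape (`task_pa` and the exact field
/// names are illustrative):
///
/// ```ignore
/// // Requirement 1: a CONFIG-gated offset that didn't resolve
/// // degrades the field to None instead of failing the task.
/// let core_cookie: Option<u64> = offsets
///     .task_struct_core_cookie
///     .map(|off| mem.read_u64(task_pa, off));
/// ```
///
/// ```ignore
/// // Requirement 3: new TaskEnrichment fields stay round-trippable
/// // for old JSON consumers.
/// #[serde(default, skip_serializing_if = "Option::is_none")]
/// core_cookie: Option<u64>,
/// ```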
pub(crate) fn build(
    owned_accessor: &GuestMemMapAccessorOwned,
    scx_owned: Option<&ScxWalkerOwned>,
    scx_walker_offsets: Option<&ScxWalkerOffsets>,
    offsets: Option<&TaskEnrichmentOffsets>,
    vcpu_regs: &[Option<VcpuRegSnapshot>],
) -> Option<TaskEnrichmentOwned> {
    // Every prereq must be present. If any is absent, the freeze
    // path falls back to `task_enrichments_unavailable` via
    // dump_state's existing diagnostic path (the borrow capture
    // simply isn't constructed).
    let scx_owned = scx_owned?;
    let scx_offs = scx_walker_offsets?;
    let _ = offsets?;

    // The four sub-groups required for both the global walk and
    // the per-CPU runnable_list walk. `task` + `see` give us the
    // container_of math for both linkages (`tasks_node` for the
    // global list, `runnable_node` for the per-rq list); `rq` +
    // `scx_rq` give the per-CPU runnable_list head address.
    let rq_offs = scx_offs.rq.as_ref()?;
    let scx_rq_offs = scx_offs.scx_rq.as_ref()?;
    let task_offs = scx_offs.task.as_ref()?;
    let see_offs = scx_offs.see.as_ref()?;

    let kernel = owned_accessor.guest_kernel();
    let mem = kernel.mem();
    let walk = kernel.walk_context();

    // Field offsets within task_struct for both list linkages:
    //   - tasks_node: links into the kernel's global `scx_tasks`
    //     LIST_HEAD (kernel/sched/ext.c:47). Survives the per-rq
    //     runnable_list drain that scx_bypass triggers during
    //     scheduler teardown (kernel/sched/ext.c:5304-5404).
    //   - runnable_node: links into the per-CPU
    //     `rq->scx.runnable_list`. Used here only to identify
    //     which tasks were runnable on which CPU at freeze time
    //     (sets `is_runnable_in_scx: true` and the running_pc
    //     stamp on the curr task).
    //
    // checked_add guards against malformed BTF offsets. Plain `+`
    // would wrap silently on overflow and produce nonsense
    // container_of math; collapsing the whole capture to None is
    // the safe degradation for an attacker-controlled BTF blob.
    let tasks_node_off_in_task = task_offs.scx.checked_add(see_offs.tasks_node)?;
    let runnable_node_off_in_task = task_offs.scx.checked_add(see_offs.runnable_node)?;
    let runnable_list_off = rq_offs.scx.checked_add(scx_rq_offs.runnable_list)?;

    // Phase 1: harvest the global scx_tasks list — every task
    // currently owned by an scx_sched, regardless of CPU. This is
    // the durable task source: `scx_tasks` only loses entries when
    // `sched_ext_dead` (kernel/sched/ext.c:3792) explicitly removes
    // a dying task, whereas runnable_list churns every dispatch
    // cycle and gets fully drained by scheduler teardown.
    let mut all_task_kvas: Vec<u64> = crate::monitor::scx_walker::walk_scx_tasks_global(
        kernel,
        scx_owned.scx_tasks_kva,
        tasks_node_off_in_task,
        see_offs.tasks_node,
        see_offs.flags,
    );
    // O(1) membership set for the runnable-only race-window
    // dedup. Pre-fix this used Vec::contains which is O(N) per
    // call and turned the per-CPU runnable_list merge into
    // O(N*M) where N=global tasks and M=tasks across all
    // runnable_lists. On a host with many threads under
    // scheduler stall, the inner Vec::contains scans dominated
    // freeze-time CPU cost. Initialize from the global list so the
    // common-case "task already present from scx_tasks walk"
    // hits in O(1).
    let mut all_task_kvas_set: std::collections::HashSet<u64> =
        all_task_kvas.iter().copied().collect();

    // Phase 2: per-CPU runnable_list walk to identify which tasks
    // were runnable on each CPU at freeze time, and stamp
    // running_pc on the curr task. We collect these into a separate
    // map so the global-list pass remains the canonical task
    // source even when scx_tasks is partially drained at teardown.
    //
    // Capacity hint: at most one entry per task across every
    // runnable_list. Sized off the global walk's lower bound to
    // avoid a HashMap rehash on the common path; runnable-only
    // race-window tasks land in the same map and a small overshoot
    // is cheaper than a power-of-two doubling at insert time.
    let mut runnable_on_cpu: std::collections::HashMap<u64, Option<u64>> =
        std::collections::HashMap::with_capacity(all_task_kvas.len());
    for (cpu, (&rq_kva, &rq_pa)) in scx_owned
        .rq_kvas
        .iter()
        .zip(scx_owned.rq_pas.iter())
        .enumerate()
    {
        // `rq->curr` is the currently-running task on this CPU.
        // The matching vCPU's instruction pointer (when captured)
        // is the running PC for that task — used by the
        // lock-slowpath matcher to decide whether a runnable task
        // is stuck in a lock acquire path. Reading curr_kva==0 is
        // normal (idle CPU mid-park) and means no curr task
        // exists; in that case running_pc never stamps regardless
        // of the vCPU register capture.
        let curr_kva = mem.read_u64(rq_pa, rq_offs.curr);
        let vcpu_pc = vcpu_regs
            .get(cpu)
            .and_then(|reg| reg.as_ref())
            .map(|reg| reg.instruction_pointer);

        // checked_add: a malformed BTF offset combined with a
        // hostile rq KVA/PA could wrap u64 silently with plain
        // wrapping_add and point head_kva/head_pa into unrelated
        // memory. Skip the CPU on overflow rather than walking a
        // garbage list head.
        let Some(head_kva) = rq_kva.checked_add(runnable_list_off as u64) else {
            continue;
        };
        let Some(head_pa) = rq_pa.checked_add(runnable_list_off as u64) else {
            continue;
        };

        let task_kvas = walk_runnable_list(mem, walk, head_kva, head_pa, runnable_node_off_in_task);

        for task_kva in task_kvas {
            // Stamp running_pc only on the curr task — every other
            // runnable-but-not-current task has no stack PCs
            // without a kernel-side unwinder (see
            // `task_enrichment.rs`'s doc on `lock_slowpath_match`).
            let running_pc = if task_kva == curr_kva && curr_kva != 0 {
                vcpu_pc
            } else {
                None
            };
            // Insert if absent; promote from None→Some(pc) if a
            // subsequent walker on a different CPU finds the task
            // currently running there.
            runnable_on_cpu
                .entry(task_kva)
                .and_modify(|e| {
                    if e.is_none() {
                        *e = running_pc;
                    }
                })
                .or_insert(running_pc);
            // If runnable_list found a task absent from the global
            // walk (race: task added to runnable_list before its
            // scx_init_task linkage finished, or scx_tasks was
            // raced through teardown), include it so the
            // enrichment record still surfaces. Membership probe
            // via a HashSet — Vec::contains here was O(N) per
            // call and turned this loop into the dominant
            // freeze-time cost on hosts with many threads.
            if all_task_kvas_set.insert(task_kva) {
                all_task_kvas.push(task_kva);
            }
        }
    }

    // Phase 3: build TaskWalkerEntry per task. A task's
    // `is_runnable_in_scx` flag is set when it appeared on any
    // CPU's runnable_list at freeze time; otherwise it's a
    // queued-for-dispatch / sleeping / etc. task that lives only
    // on the global list.
    //
    // Single HashMap probe via .get() per task — pre-fix this did
    // contains_key + get which double-hashed each task_kva. The
    // .map() over Option<&Option<u64>> collapses both axes (entry
    // present? running_pc populated?) into the constructed entry.
    let tasks: Vec<TaskWalkerEntry> = all_task_kvas
        .into_iter()
        .map(|task_kva| match runnable_on_cpu.get(&task_kva) {
            Some(running_pc) => TaskWalkerEntry {
                task_kva,
                is_runnable_in_scx: true,
                running_pc: *running_pc,
            },
            None => TaskWalkerEntry {
                task_kva,
                is_runnable_in_scx: false,
                running_pc: None,
            },
        })
        .collect();

    let sched_classes = SchedClassRegistry::from_guest_kernel(kernel);
    let lock_slowpaths = LockSlowpathRegistry::from_guest_kernel(kernel);

    Some(TaskEnrichmentOwned {
        tasks,
        sched_classes,
        lock_slowpaths,
    })
}

/// Walk one CPU's `rq->scx.runnable_list`, recovering each
/// `task_struct` KVA via container_of with `runnable_node_off_in_task`
/// as the field offset within `task_struct`.
///
/// Bounded by [`MAX_NODES_PER_LIST`]. Any pointer follow that fails
/// to translate (slab page race, PA out of bounds) terminates the
/// walk early — we return what we've collected so far rather than
/// aborting. Stops cleanly when the chain closes back to `head_kva`,
/// when the next pointer is NULL, or when the visited cap is hit.
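///
/// A minimal sketch of one hop, mirroring the loop body below. The
/// kernel's `struct list_head` stores `next` as its first member,
/// which is why `read_u64(node_pa, 0)` follows the chain without a
/// separate next-pointer offset:
///
/// ```ignore
/// let task_kva = node_kva.wrapping_sub(runnable_node_off_in_task as u64);
/// let node_pa = translate_any_kva(
///     mem, walk.cr3_pa, walk.page_offset, node_kva, walk.l5, walk.tcr_el1,
/// )?;
/// let next_kva = mem.read_u64(node_pa, 0); // list_head.next
/// ```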
fn walk_runnable_list(
    mem: &GuestMem,
    walk: crate::monitor::reader::WalkContext,
    head_kva: u64,
    head_pa: u64,
    runnable_node_off_in_task: usize,
) -> Vec<u64> {
    let mut task_kvas: Vec<u64> = Vec::new();
    let mut node_kva = mem.read_u64(head_pa, 0);
    if node_kva == 0 {
        return task_kvas;
    }

    let mut visited: u32 = 0;
    while node_kva != head_kva {
        if visited >= MAX_NODES_PER_LIST {
            return task_kvas;
        }
        visited += 1;

        // container_of: task_kva = node_kva - runnable_node_off_in_task.
        let task_kva = node_kva.wrapping_sub(runnable_node_off_in_task as u64);
        task_kvas.push(task_kva);

        let Some(node_pa) = translate_any_kva(
            mem,
            walk.cr3_pa,
            walk.page_offset,
            node_kva,
            walk.l5,
            walk.tcr_el1,
        ) else {
            return task_kvas;
        };
        let next_kva = mem.read_u64(node_pa, 0);
        if next_kva == 0 {
            return task_kvas;
        }
        node_kva = next_kva;
    }
    task_kvas
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::monitor::btf_offsets::{
        RqStructOffsets, SchedExtEntityOffsets, ScxRqOffsets, ScxWalkerOffsets,
        TaskStructCoreOffsets,
    };
    use crate::monitor::guest::GuestKernel;
    use crate::monitor::reader::GuestMem;
    use std::collections::HashMap;

    /// Direct-mapping page_offset used by the test constructors.
    /// `page_offset = 0` makes `kva_to_pa` the identity, so kvas in
    /// test fixtures equal their physical-buffer offsets and
    /// `translate_any_kva` returns immediately on the direct-mapping
    /// fast path.
    const TEST_PAGE_OFFSET: u64 = 0;

    /// Build a synthetic GuestKernel for tests — no vmlinux ELF
    /// parse, no symbol table. The walker only needs `mem`, `cr3_pa`,
    /// `page_offset`, and `l5`; symbols come into play only when
    /// `from_guest_kernel` is called on the registries (which return
    /// every-slot-None on an empty symbol map — see the dedicated
    /// tests below).
    fn make_kernel<'a>(mem: &'a GuestMem) -> GuestKernel<'a> {
        GuestKernel::new_for_test(mem, HashMap::new(), TEST_PAGE_OFFSET, 0, false)
    }

    /// Build a fully-populated [`ScxWalkerOffsets`] with deterministic
    /// field offsets that the synthetic memory layout below uses.
    /// `rq.scx == 0` and `scx_rq.runnable_list == 0` means the
    /// runnable_list head sits at the start of each rq fixture;
    /// `task.scx == 0` and `see.runnable_node == 0` means container_of
    /// is `task_kva = node_kva` (no offset shift) — keeps the
    /// arithmetic readable in the fixtures.
    fn test_scx_offsets() -> ScxWalkerOffsets {
        ScxWalkerOffsets {
            rq: Some(RqStructOffsets { scx: 0, curr: 8 }),
            scx_rq: Some(ScxRqOffsets {
                local_dsq: 0,
                runnable_list: 0,
                nr_running: 96,
                flags: 100,
                cpu_released: 104,
                ops_qseq: 112,
                kick_sync: Some(120),
                nr_immed: Some(128),
                clock: Some(136),
            }),
            task: Some(TaskStructCoreOffsets {
                comm: 100,
                pid: 200,
                scx: 0,
            }),
            see: Some(SchedExtEntityOffsets {
                runnable_node: 0,
                runnable_at: 16,
                weight: 24,
                slice: 32,
                dsq_vtime: 40,
                dsq: 48,
                dsq_list: 56,
                flags: 72,
                dsq_flags: 76,
                sticky_cpu: 80,
                holding_cpu: 84,
                tasks_node: 88,
            }),
            dsq_lnode: None,
            dsq: None,
            sched: None,
            sched_pnode: None,
            sched_pcpu: None,
            rht: None,
        }
    }

    /// Walk a hand-built runnable_list with two entries. Layout:
    /// - head at PA 0x100, head.next at offset 0
    /// - n1 at PA 0x200, task1 starts at PA 0x200 (offset 0)
    /// - n2 at PA 0x300, task2 starts at PA 0x300
    /// - head.next = 0x200, n1.next = 0x300, n2.next = 0x100 (back
    ///   to head — terminator)
    ///
    /// Container_of with `runnable_node_off_in_task = 0` means
    /// `task_kva == node_kva`, so we expect [0x200, 0x300].
    #[test]
    fn walk_runnable_list_basic_two_tasks() {
        let mut buf = vec![0u8; 0x1000];
        let head = 0x100usize;
        let n1 = 0x200usize;
        let n2 = 0x300usize;

        buf[head..head + 8].copy_from_slice(&(n1 as u64).to_le_bytes());
        buf[n1..n1 + 8].copy_from_slice(&(n2 as u64).to_le_bytes());
        buf[n2..n2 + 8].copy_from_slice(&(head as u64).to_le_bytes());

        let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };

        let kvas = walk_runnable_list(
            &mem,
            crate::monitor::reader::WalkContext::default(),
            head as u64,
            head as u64,
            0,
        );
        assert_eq!(kvas, vec![n1 as u64, n2 as u64]);
    }

    /// Empty runnable_list: head.next == head — the kernel's empty
    /// list invariant. Walker returns no kvas.
    #[test]
    fn walk_runnable_list_empty() {
        let mut buf = vec![0u8; 0x1000];
        let head = 0x100usize;
        buf[head..head + 8].copy_from_slice(&(head as u64).to_le_bytes());
        let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };

        let kvas = walk_runnable_list(
            &mem,
            crate::monitor::reader::WalkContext::default(),
            head as u64,
            head as u64,
            0,
        );
        assert!(kvas.is_empty());
    }

    /// NULL next pointer: walker bails defensively before decoding
    /// a node. This is the unmapped/uninitialized fixture — distinct
    /// from cycle-cap truncation.
    #[test]
    fn walk_runnable_list_null_next_bails() {
        let mut buf = vec![0u8; 0x1000];
        let head = 0x100usize;
        buf[head..head + 8].copy_from_slice(&0u64.to_le_bytes());
        let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };

        let kvas = walk_runnable_list(
            &mem,
            crate::monitor::reader::WalkContext::default(),
            head as u64,
            head as u64,
            0,
        );
        assert!(kvas.is_empty());
    }

    /// container_of subtraction: with a non-zero
    /// `runnable_node_off_in_task`, `task_kva = node_kva - off`.
    /// Confirms the offset arithmetic that connects rq->scx walking
    /// to `task_struct` addresses on a real layout (in real kernels
    /// `task_struct.scx` lives at a non-zero offset and
    /// `sched_ext_entity.runnable_node` is also non-zero).
    #[test]
    fn walk_runnable_list_container_of_subtraction() {
        let mut buf = vec![0u8; 0x1000];
        let head = 0x100usize;
        let n1 = 0x200usize;
        let off = 0x40usize;

        buf[head..head + 8].copy_from_slice(&(n1 as u64).to_le_bytes());
        buf[n1..n1 + 8].copy_from_slice(&(head as u64).to_le_bytes());

        let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
        let kvas = walk_runnable_list(
            &mem,
            crate::monitor::reader::WalkContext::default(),
            head as u64,
            head as u64,
            off,
        );
        assert_eq!(kvas, vec![(n1 - off) as u64]);
    }

    /// Cycle protection: a runnable_list whose chain length exceeds
    /// [`MAX_NODES_PER_LIST`] returns truncated rather than looping
    /// forever. We synthesize a 2-node cycle that does NOT close
    /// back to head; the walker hits the cap and returns.
    #[test]
    fn walk_runnable_list_truncates_at_cap() {
        let mut buf = vec![0u8; 0x1000];
        let head = 0x100usize;
        let n1 = 0x200usize;
        let n2 = 0x300usize;

        buf[head..head + 8].copy_from_slice(&(n1 as u64).to_le_bytes());
        // n1.next = n2, n2.next = n1 — infinite loop never reaching
        // head. The visited-cap kicks in.
        buf[n1..n1 + 8].copy_from_slice(&(n2 as u64).to_le_bytes());
        buf[n2..n2 + 8].copy_from_slice(&(n1 as u64).to_le_bytes());

        let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
        let kvas = walk_runnable_list(
            &mem,
            crate::monitor::reader::WalkContext::default(),
            head as u64,
            head as u64,
            0,
        );
        assert_eq!(kvas.len() as u32, MAX_NODES_PER_LIST);
    }

    /// Unmapped guest memory: when the in-list next pointer reads as
    /// a value outside the synthetic GuestMem buffer,
    /// `translate_any_kva` returns None on the next iteration and
    /// the walker bails. The walker pushes container_of(node_kva)
    /// before translating (matching the scx_walker pattern), so the
    /// garbage node_kva surfaces as a final task_kva entry — but
    /// the walk terminates rather than dereferencing further. This
    /// is the slab-page-eviction-race surrogate; the consumer (the
    /// per-task enrichment walker) revalidates the task_kva via
    /// translate_any_kva before reading task_struct fields, so a
    /// terminal-garbage entry is benign.
    #[test]
    fn walk_runnable_list_unmapped_memory_terminates_walk() {
        let mut buf = vec![0u8; 0x1000];
        let head = 0x100usize;
        let n1 = 0x200usize;
        let garbage_next: u64 = 0xffff_ffff_ffff_0000;

        buf[head..head + 8].copy_from_slice(&(n1 as u64).to_le_bytes());
        // n1.next is an out-of-bounds KVA that translate_any_kva
        // cannot resolve. Walker pushes n1, advances node_kva to
        // garbage, pushes garbage's container_of, attempts the
        // translate, fails, returns. Two entries total: the real
        // task and the terminal garbage entry. The downstream
        // enrichment walker filters garbage by re-running
        // translate_any_kva on each task_kva before reading fields.
        buf[n1..n1 + 8].copy_from_slice(&garbage_next.to_le_bytes());
        let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };

        let kvas = walk_runnable_list(
            &mem,
            crate::monitor::reader::WalkContext::default(),
            head as u64,
            head as u64,
            0,
        );
        assert_eq!(kvas, vec![n1 as u64, garbage_next]);
    }

    /// Multi-CPU walk: with two CPUs each holding a single runnable
    /// task, the resulting tasks vec has both entries. Validates the
    /// per-CPU iteration loop in build() (without invoking the
    /// public entrypoint, which requires a vmlinux-backed
    /// GuestMemMapAccessorOwned).
    #[test]
    fn multi_cpu_walk_concatenates_tasks() {
        let mut buf = vec![0u8; 0x2000];
        let cpu0_head = 0x100usize;
        let cpu0_n1 = 0x200usize;
        let cpu1_head = 0x800usize;
        let cpu1_n1 = 0x900usize;

        buf[cpu0_head..cpu0_head + 8].copy_from_slice(&(cpu0_n1 as u64).to_le_bytes());
        buf[cpu0_n1..cpu0_n1 + 8].copy_from_slice(&(cpu0_head as u64).to_le_bytes());
        buf[cpu1_head..cpu1_head + 8].copy_from_slice(&(cpu1_n1 as u64).to_le_bytes());
        buf[cpu1_n1..cpu1_n1 + 8].copy_from_slice(&(cpu1_head as u64).to_le_bytes());

        let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };

        let kvas0 = walk_runnable_list(
            &mem,
            crate::monitor::reader::WalkContext::default(),
            cpu0_head as u64,
            cpu0_head as u64,
            0,
        );
        let kvas1 = walk_runnable_list(
            &mem,
            crate::monitor::reader::WalkContext::default(),
            cpu1_head as u64,
            cpu1_head as u64,
            0,
        );

        let mut combined: Vec<u64> = Vec::new();
        combined.extend(kvas0);
        combined.extend(kvas1);
        assert_eq!(combined, vec![cpu0_n1 as u64, cpu1_n1 as u64]);
    }

    /// Partial ScxWalkerOffsets: with the optional sub-fields on
    /// [`ScxRqOffsets`] (kick_sync / nr_immed / clock) all None
    /// — simulating a v6.12 kernel BTF — capture_tasks still
    /// successfully walks the runnable_list because the four
    /// required sub-groups (rq, scx_rq, task, see) are present.
    /// Confirms the partial-offset path at the build()-gate level.
    #[test]
    fn scx_offsets_with_optional_fields_none_still_walks() {
        let mut buf = vec![0u8; 0x1000];
        let head = 0x100usize;
        let n1 = 0x200usize;
        buf[head..head + 8].copy_from_slice(&(n1 as u64).to_le_bytes());
        buf[n1..n1 + 8].copy_from_slice(&(head as u64).to_le_bytes());
        let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };

        let offsets = ScxWalkerOffsets {
            rq: Some(RqStructOffsets { scx: 0, curr: 8 }),
            scx_rq: Some(ScxRqOffsets {
                local_dsq: 0,
                runnable_list: 0,
                nr_running: 96,
                flags: 100,
                cpu_released: 104,
                ops_qseq: 112,
                kick_sync: None,
                nr_immed: None,
                clock: None,
            }),
            task: Some(TaskStructCoreOffsets {
                comm: 100,
                pid: 200,
                scx: 0,
            }),
            see: Some(SchedExtEntityOffsets {
                runnable_node: 0,
                runnable_at: 16,
                weight: 24,
                slice: 32,
                dsq_vtime: 40,
                dsq: 48,
                dsq_list: 56,
                flags: 72,
                dsq_flags: 76,
                sticky_cpu: 80,
                holding_cpu: 84,
                tasks_node: 88,
            }),
            dsq_lnode: None,
            dsq: None,
            sched: None,
            sched_pnode: None,
            sched_pcpu: None,
            rht: None,
        };

        let runnable_list_off =
            offsets.rq.as_ref().unwrap().scx + offsets.scx_rq.as_ref().unwrap().runnable_list;
        let runnable_node_off_in_task =
            offsets.task.as_ref().unwrap().scx + offsets.see.as_ref().unwrap().runnable_node;
        // runnable_list_off and runnable_node_off_in_task both 0
        // here; the walk produces n1's KVA.
        assert_eq!(runnable_list_off, 0);
        let kvas = walk_runnable_list(
            &mem,
            crate::monitor::reader::WalkContext::default(),
            head as u64,
            head as u64,
            runnable_node_off_in_task,
        );
        assert_eq!(kvas, vec![n1 as u64]);
    }

    /// Required-sub-group absence: when one of the four required
    /// sub-groups (`see`) is None — simulating a kernel where
    /// `sched_ext_entity` BTF resolution failed — the gate check in
    /// build() returns None. We assert this via a constructed
    /// ScxWalkerOffsets and the same gate logic.
    #[test]
    fn missing_required_subgroup_gates_to_none() {
        let offsets = ScxWalkerOffsets {
            rq: Some(RqStructOffsets { scx: 0, curr: 8 }),
            scx_rq: Some(ScxRqOffsets {
                local_dsq: 0,
                runnable_list: 0,
                nr_running: 96,
                flags: 100,
                cpu_released: 104,
                ops_qseq: 112,
                kick_sync: None,
                nr_immed: None,
                clock: None,
            }),
            task: Some(TaskStructCoreOffsets {
                comm: 100,
                pid: 200,
                scx: 0,
            }),
            see: None, // required sub-group absent
            dsq_lnode: None,
            dsq: None,
            sched: None,
            sched_pnode: None,
            sched_pcpu: None,
            rht: None,
        };
        // The `?` chain in build() short-circuits when any of the
        // four required sub-groups is None.
        assert!(offsets.see.is_none());
    }

    /// SchedClassRegistry with empty symbol table: every slot None.
    /// Confirms the no-symbol fall-through path doesn't panic and
    /// the per-class decode returns None for any pointer (since no
    /// pointer ever matches a None slot).
    #[test]
    fn sched_class_registry_empty_symbols_yields_none_slots() {
        let mut buf = vec![0u8; 0x1000];
        let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
        let kernel = make_kernel(&mem);
        let r = SchedClassRegistry::from_guest_kernel(&kernel);
        assert!(r.fair.is_none());
        assert!(r.rt.is_none());
        assert!(r.dl.is_none());
        assert!(r.idle.is_none());
        assert!(r.stop.is_none());
        assert!(r.ext.is_none());
        // decode never matches any slot when every slot is None.
        assert!(r.decode(0xffff_ffff_8000_1000).is_none());
    }

    /// LockSlowpathRegistry mirror: empty harness → every slot None
    /// → match_pc returns None for every PC.
    #[test]
    fn lock_slowpath_registry_empty_symbols_yields_none_slots() {
        let mut buf = vec![0u8; 0x1000];
        let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
        let kernel = make_kernel(&mem);
        let r = LockSlowpathRegistry::from_guest_kernel(&kernel);
        assert!(r.queued_spin_lock_slowpath.is_none());
        assert!(r.mutex_lock_slowpath.is_none());
        assert!(r.rwsem_down_read_slowpath.is_none());
        assert!(r.rwsem_down_write_slowpath.is_none());
        assert!(r.match_pc(0xdeadbeef).is_none());
    }

    /// SchedClassRegistry with populated symbols: the kernel's six
    /// per-class symbols resolve to distinct KVAs and the registry
    /// captures each. Decode against a known KVA returns the
    /// expected name.
    #[test]
    fn sched_class_registry_populated_symbols_decode_known() {
        let fair_kva: u64 = 0xffff_ffff_8000_1000;
        let mut symbols = HashMap::new();
        symbols.insert("fair_sched_class".to_string(), fair_kva);
        symbols.insert("ext_sched_class".to_string(), 0xffff_ffff_8000_1300);

        let mut buf = vec![0u8; 0x1000];
        let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
        let kernel = GuestKernel::new_for_test(&mem, symbols, TEST_PAGE_OFFSET, 0, false);
        let r = SchedClassRegistry::from_guest_kernel(&kernel);
        assert_eq!(r.fair, Some(fair_kva));
        assert_eq!(r.ext, Some(0xffff_ffff_8000_1300));
        assert_eq!(r.rt, None);
        assert_eq!(r.decode(fair_kva), Some("fair"));
        assert_eq!(r.decode(0xffff_ffff_8000_1300), Some("ext"));
    }

    /// Confirm ScxWalkerOffsets construction stays consistent with the
    /// runnable_list_off / runnable_node_off computations — pinning
    /// the field-offset arithmetic at the test level so a future
    /// kernel rename surfaces here.
    #[test]
    fn offset_arithmetic_stable() {
        let off = test_scx_offsets();
        let rq = off.rq.as_ref().unwrap();
        let scx_rq = off.scx_rq.as_ref().unwrap();
        let task = off.task.as_ref().unwrap();
        let see = off.see.as_ref().unwrap();

        assert_eq!(rq.scx + scx_rq.runnable_list, 0);
        assert_eq!(task.scx + see.runnable_node, 0);
        // tasks_node offset (88) is the see-relative offset, NOT
        // the task-struct offset; the full task-relative offset is
        // task.scx + see.tasks_node = 0 + 88 = 88.
        assert_eq!(task.scx + see.tasks_node, 88);
        // curr is at rq + 8 in our fixture.
        assert_eq!(rq.curr, 8);
    }

    /// Build a fixture for the integration logic: a global
    /// scx_tasks list with two tasks T1 and T2, where only T1
    /// appears on a per-CPU runnable_list. After build(), T1 must
    /// have `is_runnable_in_scx: true` and T2 must have
    /// `is_runnable_in_scx: false`. This validates the new
    /// global-list-as-primary-source behavior — every task on
    /// scx_tasks surfaces, with the rq->scx walk only contributing
    /// the runnable flag and running_pc stamp.
    ///
    /// Tested via the integration helper `merge_global_and_runnable`
    /// (extracted below) so the test bypasses the
    /// GuestMemMapAccessorOwned construction the public
    /// `build()` would require — same testing pattern as the
    /// existing walk_runnable_list-only tests.
    #[test]
    fn merge_global_walk_and_runnable_list_flags_correctly() {
        // Two tasks total; only T1 appears in any runnable_list.
        let t1: u64 = 0x1000;
        let t2: u64 = 0x2000;
        let global = vec![t1, t2];
        let runnable_per_cpu: Vec<(u64, u64)> = vec![(t1, 0xdead_beef)];

        let entries = merge_global_and_runnable(&global, &runnable_per_cpu);

        // Both tasks must surface — the global walk is the canonical
        // source; the runnable walk only annotates.
        assert_eq!(entries.len(), 2);
        // T1: runnable on a CPU; running_pc Some.
        let e1 = entries.iter().find(|e| e.task_kva == t1).unwrap();
        assert!(e1.is_runnable_in_scx);
        assert_eq!(e1.running_pc, Some(0xdead_beef));
        // T2: only on global list; not runnable, no running_pc.
        let e2 = entries.iter().find(|e| e.task_kva == t2).unwrap();
        assert!(!e2.is_runnable_in_scx);
        assert_eq!(e2.running_pc, None);
    }

    /// Race-window guard: a task on a per-CPU runnable_list but NOT
    /// (yet) on the global scx_tasks list — possible when a task's
    /// scx_init_task added it to runnable_list before its
    /// scx_tasks linkage finished. The merge logic must include the
    /// task in the result with `is_runnable_in_scx: true` so the
    /// enrichment record still surfaces.
    #[test]
    fn merge_includes_runnable_only_task() {
        let t1: u64 = 0x1000;
        let global: Vec<u64> = vec![]; // empty global list
        let runnable_per_cpu: Vec<(u64, u64)> = vec![(t1, 0)];

        let entries = merge_global_and_runnable(&global, &runnable_per_cpu);

        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].task_kva, t1);
        assert!(entries[0].is_runnable_in_scx);
    }

    /// Both lists empty (idle guest, no scx scheduler) → empty
    /// result. Confirms the merge doesn't fabricate entries.
    #[test]
    fn merge_both_empty_yields_empty() {
        let entries = merge_global_and_runnable(&[], &[]);
        assert!(entries.is_empty());
    }

    /// Cross-CPU running_pc promotion: a task on the global list
    /// that's also runnable on CPU 0 (no PC) but currently running
    /// on CPU 1 (PC stamped) must have its running_pc populated
    /// from CPU 1 — the second visit promotes None→Some.
    #[test]
    fn merge_promotes_running_pc_across_cpus() {
        let t: u64 = 0x1000;
        let global = vec![t];
        // Same task appears in two runnable_lists; first with no
        // PC (CPU 0 — task is on the runnable list but not curr),
        // then with PC (CPU 1 — task is curr there). The merge
        // must promote None→Some.
        // We mimic this with a (task, pc) pair list where pc==0
        // means "no stamp" and any non-zero is a real PC.
        let runnable_per_cpu: Vec<(u64, u64)> = vec![(t, 0), (t, 0xc0de)];
        let entries = merge_global_and_runnable(&global, &runnable_per_cpu);

        assert_eq!(entries.len(), 1);
        assert!(entries[0].is_runnable_in_scx);
        assert_eq!(entries[0].running_pc, Some(0xc0de));
    }

    /// Test helper that mirrors the integration logic in `build()`:
    /// merge a global-list task vec with per-CPU runnable
    /// (task_kva, running_pc) tuples into a single
    /// `Vec<TaskWalkerEntry>`. The pc value `0` in the tuples means
    /// "no PC" (None); any non-zero value is a real instruction
    /// pointer (Some). This shape mirrors the curr-task-only PC
    /// stamping the production builder does.
    fn merge_global_and_runnable(
        global: &[u64],
        runnable_per_cpu: &[(u64, u64)],
    ) -> Vec<TaskWalkerEntry> {
        let mut runnable_on_cpu: HashMap<u64, Option<u64>> = HashMap::with_capacity(global.len());
        for &(task_kva, pc) in runnable_per_cpu {
            let pc_opt = if pc == 0 { None } else { Some(pc) };
            runnable_on_cpu
                .entry(task_kva)
                .and_modify(|e| {
                    if e.is_none() {
                        *e = pc_opt;
                    }
                })
                .or_insert(pc_opt);
        }
        let mut all: Vec<u64> = global.to_vec();
        let mut all_set: std::collections::HashSet<u64> = all.iter().copied().collect();
        for &(task_kva, _) in runnable_per_cpu {
            if all_set.insert(task_kva) {
                all.push(task_kva);
            }
        }
        all.into_iter()
            .map(|task_kva| match runnable_on_cpu.get(&task_kva) {
                Some(running_pc) => TaskWalkerEntry {
                    task_kva,
                    is_runnable_in_scx: true,
                    running_pc: *running_pc,
                },
                None => TaskWalkerEntry {
                    task_kva,
                    is_runnable_in_scx: false,
                    running_pc: None,
                },
            })
            .collect()
    }

    // ---------------------------------------------------------------
    // Production-path integration tests for build()
    //
    // These tests exercise the production walkers (walk_scx_tasks_global
    // from scx_walker, walk_runnable_list from capture_tasks) plus the
    // merge-and-flag integration that build() performs. They cannot
    // call build() directly because GuestMemMapAccessorOwned requires
    // a real vmlinux ELF — instead they drive the same code paths
    // build() drives, with the test-only merge_global_and_runnable
    // helper standing in for the (task_kva, pc) pairing build() does.
    // The bug the tests guard against ("task_enrichments=0") was
    // exactly that build() walked only the per-rq runnable_list — the
    // tests below pin the post-fix behavior: scx_tasks (global) is the
    // canonical source, runnable_list only annotates.
    // ---------------------------------------------------------------

    /// Pinning the bug-fix invariant: when the global scx_tasks list
    /// holds 3 tasks but only T1 is on a per-CPU runnable_list, the
    /// merged result MUST contain all 3 task entries — not just the
    /// 1 the runnable_list pass found. T1 carries
    /// is_runnable_in_scx=true; T2 and T3 carry false. This is the
    /// fix for task_enrichments=0 — pre-fix, build() only walked
    /// runnable_list and missed T2/T3 because scx_bypass drains
    /// runnable_list during teardown.
    #[test]
    fn build_uses_global_scx_tasks_as_primary_source() {
        // Layout: global scx_tasks list at head_kva in the text
        // mapping with 3 task entries. Per-CPU runnable_list with
        // only T1. Drive walk_scx_tasks_global on the global head,
        // walk_runnable_list on the runnable head, merge.
        let head_kva = crate::monitor::symbols::START_KERNEL_MAP + 0x100;
        let head_pa = 0x100usize;
        let t1_node_kva: u64 = 0x800;
        let t2_node_kva: u64 = 0xa00;
        let t3_node_kva: u64 = 0xc00;
        let tasks_node_off_in_task: usize = 0x40;
        let tasks_node_off_in_see: usize = 0x60;
        let flags_off_in_see: usize = 0x44;

        let mut buf = vec![0u8; 0x2000];
        // Global list: head -> t1 -> t2 -> t3 -> head
        buf[head_pa..head_pa + 8].copy_from_slice(&t1_node_kva.to_le_bytes());
        buf[t1_node_kva as usize..t1_node_kva as usize + 8]
            .copy_from_slice(&t2_node_kva.to_le_bytes());
        buf[t2_node_kva as usize..t2_node_kva as usize + 8]
            .copy_from_slice(&t3_node_kva.to_le_bytes());
        buf[t3_node_kva as usize..t3_node_kva as usize + 8]
            .copy_from_slice(&head_kva.to_le_bytes());

        let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
        let kernel = make_kernel(&mem);

        let global = crate::monitor::scx_walker::walk_scx_tasks_global(
            &kernel,
            head_kva,
            tasks_node_off_in_task,
            tasks_node_off_in_see,
            flags_off_in_see,
        );
        assert_eq!(
            global.len(),
            3,
            "global walk must surface all 3 tasks regardless of runnable_list state"
        );

        let t1_kva = t1_node_kva.wrapping_sub(tasks_node_off_in_task as u64);
        // Only T1 is annotated as runnable; T2 and T3 surface from
        // global walk only.
        let runnable_per_cpu: Vec<(u64, u64)> = vec![(t1_kva, 0xdead_beef)];
        let entries = merge_global_and_runnable(&global, &runnable_per_cpu);

        assert_eq!(entries.len(), 3, "merge must preserve all 3 global tasks");
        let e1 = entries.iter().find(|e| e.task_kva == t1_kva).unwrap();
        assert!(e1.is_runnable_in_scx, "T1 must be flagged runnable");
        assert_eq!(e1.running_pc, Some(0xdead_beef));

        let t2_kva = t2_node_kva.wrapping_sub(tasks_node_off_in_task as u64);
        let t3_kva = t3_node_kva.wrapping_sub(tasks_node_off_in_task as u64);
        let e2 = entries.iter().find(|e| e.task_kva == t2_kva).unwrap();
        let e3 = entries.iter().find(|e| e.task_kva == t3_kva).unwrap();
        assert!(!e2.is_runnable_in_scx, "T2 absent from runnable_list");
        assert!(!e3.is_runnable_in_scx, "T3 absent from runnable_list");
        assert_eq!(e2.running_pc, None);
        assert_eq!(e3.running_pc, None);
    }

    /// Fallback path: scx_tasks_kva = 0 (stripped vmlinux or kernel
    /// without sched_ext). The global walk returns empty, but the
    /// per-CPU runnable_list walk still produces task entries. Pins
    /// the behavior that a build() without scx_tasks symbol
    /// resolution still enriches tasks discovered via runnable_list
    /// — the dump degrades to runnable-list-only on stripped vmlinux
    /// rather than producing zero enrichments.
    #[test]
    fn build_falls_back_to_runnable_list_when_scx_tasks_kva_zero() {
        // Pre-populate buf at offset 0 with a value that would
        // surface as a task_kva if a buggy walker dereferenced PA 0
        // — guards against a regression where scx_tasks_kva=0 would
        // be passed through and produce a phantom entry.
        let mut buf = vec![0u8; 0x1000];
        buf[0..8].copy_from_slice(&0xdead_u64.to_le_bytes());
        let head = 0x100usize;
        let n1 = 0x200usize;
        // runnable_list head: head -> n1 -> head (one task)
        buf[head..head + 8].copy_from_slice(&(n1 as u64).to_le_bytes());
        buf[n1..n1 + 8].copy_from_slice(&(head as u64).to_le_bytes());
        let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
        let kernel = make_kernel(&mem);

        // Global walk on scx_tasks_kva=0 must return empty.
        let global =
            crate::monitor::scx_walker::walk_scx_tasks_global(&kernel, 0, 0x40, 0x60, 0x44);
        assert!(
            global.is_empty(),
            "scx_tasks_kva=0 must NOT produce phantom entries"
        );

        // Runnable_list walk produces n1.
        let runnable_kvas = walk_runnable_list(
            &mem,
            crate::monitor::reader::WalkContext::default(),
            head as u64,
            head as u64,
            0,
        );
        assert_eq!(runnable_kvas, vec![n1 as u64]);

        let runnable_per_cpu: Vec<(u64, u64)> =
            runnable_kvas.iter().map(|&k| (k, 0xfeedu64)).collect();
        let entries = merge_global_and_runnable(&global, &runnable_per_cpu);

        // Empty global + non-empty runnable → entries from
        // runnable_list only, all flagged is_runnable_in_scx=true.
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].task_kva, n1 as u64);
        assert!(entries[0].is_runnable_in_scx);
        assert_eq!(entries[0].running_pc, Some(0xfeed));
    }

    /// Cursor-skip via global walk: scx_tasks contains
    /// [T1, cursor, T2]. The cursor entry has SCX_TASK_CURSOR
    /// (1<<31) set on its enclosing sched_ext_entity.flags and must
    /// be filtered. The merged result must contain exactly T1 and
    /// T2 with no phantom cursor task. This pins build()-level
    /// integration of the cursor-skip — a regression that dropped
    /// cursor filtering would surface here as a 3-entry result.
    #[test]
    fn build_skips_cursor_entries_via_global_walk() {
        let head_kva = crate::monitor::symbols::START_KERNEL_MAP + 0x100;
        let head_pa = 0x100usize;
        let t1_node_kva: u64 = 0x800;
        let cursor_node_kva: u64 = 0xa00;
        let t2_node_kva: u64 = 0xc00;
        let tasks_node_off_in_task: usize = 0x40;
        let tasks_node_off_in_see: usize = 0x60;
        let flags_off_in_see: usize = 0x44;

        let mut buf = vec![0u8; 0x1000];
        buf[head_pa..head_pa + 8].copy_from_slice(&t1_node_kva.to_le_bytes());
        buf[t1_node_kva as usize..t1_node_kva as usize + 8]
            .copy_from_slice(&cursor_node_kva.to_le_bytes());
        buf[cursor_node_kva as usize..cursor_node_kva as usize + 8]
            .copy_from_slice(&t2_node_kva.to_le_bytes());
        buf[t2_node_kva as usize..t2_node_kva as usize + 8]
            .copy_from_slice(&head_kva.to_le_bytes());

        // Stamp SCX_TASK_CURSOR on the cursor entry's see.flags.
        let cursor_see_kva = cursor_node_kva.wrapping_sub(tasks_node_off_in_see as u64);
        let cursor_flags_pa = (cursor_see_kva as usize).wrapping_add(flags_off_in_see);
        let cursor_flags: u32 = 1 << 31;
        buf[cursor_flags_pa..cursor_flags_pa + 4].copy_from_slice(&cursor_flags.to_le_bytes());

        let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
        let kernel = make_kernel(&mem);

        let global = crate::monitor::scx_walker::walk_scx_tasks_global(
            &kernel,
            head_kva,
            tasks_node_off_in_task,
            tasks_node_off_in_see,
            flags_off_in_see,
        );
        assert_eq!(
            global.len(),
            2,
            "cursor entry must be filtered before reaching the merge"
        );

        // No runnable_list contribution — keeps the test focused on
        // cursor-skip preservation through the merge.
        let entries = merge_global_and_runnable(&global, &[]);
        assert_eq!(entries.len(), 2);
        let cursor_task_kva = cursor_node_kva.wrapping_sub(tasks_node_off_in_task as u64);
        assert!(
            !entries.iter().any(|e| e.task_kva == cursor_task_kva),
            "cursor's container_of result must NOT appear in merged entries"
        );
    }

    /// Production-like nonzero task.scx offset: real kernels place
    /// `task_struct.scx` at byte ~2528 (the offset varies by kernel
    /// version and config). The arithmetic
    ///   tasks_node_off_in_task = task.scx + see.tasks_node
    /// must compose correctly. A regression that drops `task.scx`
    /// (reading from `see.tasks_node` alone) would put every
    /// recovered task_kva 2528 bytes off — the `assert_eq!`
    /// against the expected base will catch that. We layout list
    /// nodes at deliberate KVAs and verify container_of recovers
    /// the right task_kva.
    #[test]
    fn build_with_nonzero_task_scx_offset() {
        // Layout choices:
        //   task.scx        = 0x300 (production-like — task_struct
        //                            embeds sched_ext_entity at a
        //                            non-zero offset)
        //   see.tasks_node  = 0x60  (sched_ext_entity.tasks_node)
        //   tasks_node_off_in_task = 0x300 + 0x60 = 0x360
        //   see.runnable_node = 0  (test-fixture choice, distinct
        //                            from real layout but isolates
        //                            the tasks_node arithmetic)
        //
        // Place 1 task. tasks_node_kva = 0x800. Expected
        // recovered task_kva = 0x800 - 0x360 = 0x4a0.
        let head_kva = crate::monitor::symbols::START_KERNEL_MAP + 0x100;
        let head_pa = 0x100usize;
        let t1_node_kva: u64 = 0x800;
        let task_scx_off: usize = 0x300;
        let tasks_node_off_in_see: usize = 0x60;
        let tasks_node_off_in_task: usize = task_scx_off + tasks_node_off_in_see;
        let flags_off_in_see: usize = 0x44;

        let mut buf = vec![0u8; 0x1000];
        buf[head_pa..head_pa + 8].copy_from_slice(&t1_node_kva.to_le_bytes());
        buf[t1_node_kva as usize..t1_node_kva as usize + 8]
            .copy_from_slice(&head_kva.to_le_bytes());
        let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
        let kernel = make_kernel(&mem);

        let global = crate::monitor::scx_walker::walk_scx_tasks_global(
            &kernel,
            head_kva,
            tasks_node_off_in_task,
            tasks_node_off_in_see,
            flags_off_in_see,
        );

        let expected_task_kva = t1_node_kva.wrapping_sub(tasks_node_off_in_task as u64);
        assert_eq!(global.len(), 1);
        assert_eq!(
            global[0], expected_task_kva,
            "container_of must subtract task.scx + see.tasks_node, not just see.tasks_node"
        );
        // Sanity: a buggy walker that dropped task.scx would land
        // at t1_node_kva - tasks_node_off_in_see = 0x800 - 0x60 =
        // 0x7a0. Confirm the result is NOT that value.
        let buggy_value = t1_node_kva.wrapping_sub(tasks_node_off_in_see as u64);
        assert_ne!(
            global[0], buggy_value,
            "regression guard: recovered task_kva must NOT match the see-only-arithmetic result"
        );
    }

    /// Race window: a task on a per-CPU runnable_list but NOT yet
    /// on the global scx_tasks list. This is the inverse of the
    /// canonical case — `scx_init_task` adds to runnable_list before
    /// finishing the scx_tasks linkage (or scx_tasks was raced
    /// during teardown). The merged result must include the task
    /// with `is_runnable_in_scx: true` so the enrichment record
    /// surfaces.
    #[test]
    fn build_runnable_only_task_added_to_global() {
        let mut buf = vec![0u8; 0x1000];
        let head = 0x100usize;
        let n1 = 0x200usize;
        // Empty global list (head loops back to head).
        let global_head_kva = crate::monitor::symbols::START_KERNEL_MAP + 0x500;
        let global_head_pa = 0x500usize;
        buf[global_head_pa..global_head_pa + 8].copy_from_slice(&global_head_kva.to_le_bytes());
        // runnable_list: head -> n1 -> head (one task).
        buf[head..head + 8].copy_from_slice(&(n1 as u64).to_le_bytes());
        buf[n1..n1 + 8].copy_from_slice(&(head as u64).to_le_bytes());
        let mem = unsafe { GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
        let kernel = make_kernel(&mem);

        // Global walk on the empty list returns nothing.
        let global = crate::monitor::scx_walker::walk_scx_tasks_global(
            &kernel,
            global_head_kva,
            0x40,
            0x60,
            0x44,
        );
        assert!(global.is_empty());

        // Runnable_list walk surfaces n1 (task.scx + see.runnable_node = 0
        // here, so task_kva == node_kva).
        let runnable_kvas = walk_runnable_list(
            &mem,
            crate::monitor::reader::WalkContext::default(),
            head as u64,
            head as u64,
            0,
        );
        assert_eq!(runnable_kvas, vec![n1 as u64]);

        // Merge: task on runnable_list but not on global list must
        // surface, with is_runnable_in_scx=true.
        let runnable_per_cpu: Vec<(u64, u64)> =
            runnable_kvas.iter().map(|&k| (k, 0xc0deu64)).collect();
        let entries = merge_global_and_runnable(&global, &runnable_per_cpu);

        assert_eq!(entries.len(), 1, "race-window task must surface");
        assert_eq!(entries[0].task_kva, n1 as u64);
        assert!(entries[0].is_runnable_in_scx);
        assert_eq!(entries[0].running_pc, Some(0xc0de));
    }
}