//! Host-side BPF arena page enumeration.
//!
//! `BPF_MAP_TYPE_ARENA` (kernel uapi value [`BPF_MAP_TYPE_ARENA`]) is
//! a sparse, page-granular memory region shared between BPF programs
//! and userspace. The kernel allocates a 4 GiB-plus-guard
//! (`KERN_VM_SZ`) `vm_struct` and lazily maps order-0 pages into it
//! on demand (see `kernel/bpf/arena.c::arena_alloc_pages` and
//! `arena_vm_fault`); the user-visible window is at
//! `[arena.user_vm_start .. arena.user_vm_end)`, a 32-bit-addressable
//! range whose lower 32 bits the BPF JIT uses as the arena pointer
//! payload. Translation kernel-side is:
//!
//! ```text
//! kern_vm_start = arena->kern_vm->addr + GUARD_SZ/2
//! kaddr         = kern_vm_start + (u32)user_addr
//! page          = vmalloc_to_page(kaddr)   // PTE walk on init_mm
//! ```
//!
//! The host-side walker mirrors this: read the arena's `kern_vm`
//! pointer, dereference to get `vm_struct.addr`, add `GUARD_SZ/2`,
//! then for each pgoff in `0..max_entries` compute `kaddr` and run
//! [`GuestMem::translate_kva`] (the existing PTE walker against
//! `init_mm`'s page table). `max_entries` is the BPF map's declared
//! page capacity from `bpf_map_create()` — it is the source of truth
//! for "how many pages this arena could hold", regardless of whether
//! the scheduler exposes a userspace mmap (some don't, leaving
//! `user_vm_start == user_vm_end == 0`). Pages whose translate fails
//! are simply "not faulted in" — arena maps are sparse by design.
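//!
//! A worked example with illustrative values (not taken from any real
//! guest), assuming a 4 KiB guest page granule and
//! `user_vm_start = 0x7f00_0000_0000`:
//!
//! ```text
//! pgoff 3:
//!   user_addr = 0x7f00_0000_0000 + 3 * 4096 = 0x7f00_0000_3000
//!   uaddr32   = (u32)user_addr              = 0x0000_3000
//!   kaddr     = kern_vm_start + 0x3000
//!   translate_kva(kaddr) == Some(pa)  -> page is mapped, capture it
//!   translate_kva(kaddr) == None      -> not faulted in, skip
//! ```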
//!
//! The walker does NOT consult `arena->rt` (the range_tree of free
//! pgoffs) — `range_tree` polarity is "set = free" / "clear =
//! allocated", and reading it from a frozen snapshot would only tell
//! the host which pages the kernel *intended* to allocate, not which
//! are actually mapped. The PTE walk is the source of truth.
//!
//! [`BPF_MAP_TYPE_ARENA`]: BPF_MAP_TYPE_ARENA

use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use std::path::Path;

use btf_rs::Btf;

use super::Kva;
use super::bpf_map::{BPF_MAP_TYPE_ARENA, BpfMapInfo};
use super::btf_offsets::{find_struct, load_btf_from_path, member_byte_offset};
use super::guest::GuestKernel;

/// Page size used by the arena walker, derived from the GUEST
/// kernel's MMU configuration.
///
/// `arena_alloc_pages` and `arena_vm_fault` both call
/// `apply_to_page_range` on `PAGE_SIZE`-granular ranges where
/// `PAGE_SIZE` is the GUEST kernel's own MMU page size. The host's
/// page size is irrelevant — ktstr can run a 16 KiB-granule guest
/// on a 4 KiB-granule host (and vice versa), and the arena layout
/// must match the guest's view.
///
/// On x86_64 the guest page granule is fixed at 4 KiB. On aarch64
/// the granule is encoded in `TCR_EL1.TG1` (bits [31:30]):
///   - `0b10` → 4 KiB
///   - `0b01` → 16 KiB
///   - `0b11` → 64 KiB
///
/// Falls back to 4 KiB when the architecture branches reject the
/// register value (e.g. uninitialized `tcr_el1 == 0` on aarch64);
/// the fallback is conservative — at worst the walker overscans a
/// small arena and surfaces extra `pgoff` slots that translate to
/// `None`. A guest with non-4 KiB granule whose `tcr_el1` reads
/// zero would be a freeze-path bug elsewhere (the freeze
/// coordinator polls until `tcr_el1` populates before snapshotting).
fn guest_page_size(tcr_el1: u64) -> u64 {
    #[cfg(target_arch = "x86_64")]
    {
        let _ = tcr_el1;
        4096
    }
    #[cfg(target_arch = "aarch64")]
    {
        match (tcr_el1 >> 30) & 0x3 {
            0b10 => 4096,
            0b01 => 16384,
            0b11 => 65536,
            _ => 4096, // 0b00 reserved; conservative fallback
        }
    }
    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    {
        let _ = tcr_el1;
        4096
    }
}

/// `GUARD_SZ / 2` from `kernel/bpf/arena.c`.
///
/// Kernel formula:
///   `GUARD_SZ = round_up(1ull << sizeof_field(struct bpf_insn, off) * 8,
///                        PAGE_SIZE << 1)`
/// where `sizeof_field(struct bpf_insn, off) * 8 = 16` so the lower
/// term is `1 << 16 = 65536`. Result depends on the kernel's page
/// granule (`PAGE_SIZE << 1`):
///   - 4 KiB pages: `round_up(65536, 8192)` = 65536, GUARD_HALF = 32768.
///   - 16 KiB pages: `round_up(65536, 32768)` = 65536, GUARD_HALF = 32768.
///   - 64 KiB pages: `round_up(65536, 131072)` = 131072, GUARD_HALF = 65536.
///
/// `bpf_arena_get_kern_vm_start` returns `arena->kern_vm->addr +
/// GUARD_SZ/2`, so the kernel-side accessible region starts
/// `GUARD_HALF` past the raw `vm_struct.addr`. The walker must add
/// this offset when translating user-VA to kern-VA.
fn guard_half(page_size: u64) -> u64 {
    (1u64 << 16).next_multiple_of(page_size << 1) / 2
}

/// Maximum number of pages the walker translates sequentially per
/// arena.
///
/// `KERN_VM_SZ = SZ_4G + GUARD_SZ` is the kernel's vmalloc reservation
/// (~1M pages) but most arenas use a small fraction. Cap the
/// sequential walk at 4096 pages (16 MiB) to bound report size and
/// freeze-path latency (a full 1M-page walk at ~1 µs per
/// translate_kva would burn ~1 s on the freeze hot path); truncation
/// is surfaced via [`ArenaSnapshot::truncated`] and a sparse stride
/// sweep (see [`MAX_ARENA_STRIDE_PROBES`]) catches mapped pages
/// beyond this cap.
const MAX_ARENA_PAGES: u64 = 4096;

/// Number of evenly-spaced stride probes the walker performs across
/// pgoffs [`MAX_ARENA_PAGES`]..`declared_pages` when `declared_pages`
/// exceeds the sequential cap. Lets the walker surface mapped pages
/// in sparse arenas (e.g. a scheduler that allocated pages near the
/// 4 GiB end of its user_vm window) without paying the full 1M-page
/// translate_kva cost.
///
/// 256 probes × ~1 µs per translate ≈ 0.25 ms — negligible on the
/// freeze hot path. Each hit lands in [`ArenaSnapshot::pages`]
/// alongside the sequential prefix, so the consumer sees both.
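///
/// Worked derivation for the largest legitimate case (a full 4 GiB
/// arena with 4 KiB pages), the shape exercised by the
/// `arena_walk_plan_full_4gib` test:
///
/// ```text
/// declared_pages = 4 GiB / 4 KiB           = 1,048,576
/// tail_pages     = 1,048,576 - 4,096       = 1,044,480
/// stride         = ceil(1,044,480 / 256)   = 4,080
/// probes visited = ceil(1,044,480 / 4,080) = 256
/// ```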
const MAX_ARENA_STRIDE_PROBES: u64 = 256;

/// Defensive cap on the arena's address-range span, in bytes.
///
/// The walker computes its span from `info.max_entries * page_size`
/// (the BPF map's declared page capacity, see [`snapshot_arena`]).
/// `bpf_arena_init` allows at most 4 GiB worth of pages by design —
/// the BPF JIT addresses arena pointers via the low 32 bits of the
/// user address, so anything wider than `0x1_0000_0000` cannot be a
/// real arena layout (see `bpf_arena_alloc_pages` in
/// `kernel/bpf/arena.c`). A torn / corrupt `bpf_map.max_entries` or
/// a freeze-time race against `arena_init` could yield a wild value;
/// cap it here so the walker never multiplies a near-`u64::MAX` page
/// count by the page size (overflow) or attempts to walk billions of
/// pgoffs (live-lock on the freeze path).
const MAX_VM_RANGE_BYTES: u64 = 0x1_0000_0000;

/// Byte offsets within `struct bpf_arena` and `struct vm_struct`
/// needed for the host-side arena walker.
///
/// Resolved from BTF at startup so the walker doesn't hardcode kernel
/// layout. Mirrors the [`super::btf_offsets::BpfMapOffsets`] pattern.
#[derive(Debug, Clone)]
pub struct BpfArenaOffsets {
    /// Offset of `kern_vm` (`struct vm_struct *`) within `struct bpf_arena`.
    pub arena_kern_vm: usize,
    /// Offset of `user_vm_start` (u64) within `struct bpf_arena`.
    pub arena_user_vm_start: usize,
    /// Offset of `addr` (`void *`) within `struct vm_struct`.
    pub vm_struct_addr: usize,
}

impl BpfArenaOffsets {
    /// Parse BTF from a vmlinux ELF and resolve arena field offsets.
    ///
    /// Returns Err on kernels whose BTF lacks `bpf_arena` (i.e. arena
    /// support is not built in) — the caller can treat the absent
    /// offsets as a signal to skip arena enumeration.
    ///
    /// Production callers (the freeze coordinator) reach this code
    /// via [`Self::from_btf`] on a pre-parsed `&Btf` to amortize the
    /// ELF parse — `from_vmlinux` stays public as the convenience
    /// entry point for direct-from-vmlinux callers (CLI tools, unit
    /// tests against a vmlinux on disk).
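    ///
    /// A minimal usage sketch for both entry points (the vmlinux path
    /// below is a placeholder, and error handling is elided):
    ///
    /// ```ignore
    /// use std::path::Path;
    ///
    /// // One-shot: parse the vmlinux ELF and resolve offsets in one call.
    /// let offsets = BpfArenaOffsets::from_vmlinux(Path::new("/boot/vmlinux"))?;
    ///
    /// // Amortized: reuse a BTF object parsed once elsewhere.
    /// let btf = load_btf_from_path(Path::new("/boot/vmlinux"))?;
    /// let offsets = BpfArenaOffsets::from_btf(&btf)?;
    /// ```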
    #[allow(dead_code)]
    pub fn from_vmlinux(path: &Path) -> Result<Self> {
        let btf = load_btf_from_path(path).context("btf: open vmlinux")?;
        Self::from_btf(&btf)
    }

    /// Resolve arena struct offsets from a pre-loaded BTF object.
    pub fn from_btf(btf: &Btf) -> Result<Self> {
        let (bpf_arena, _) = find_struct(btf, "bpf_arena")
            .context("btf: struct bpf_arena not found (arena unsupported on this kernel?)")?;
        let arena_kern_vm = member_byte_offset(btf, &bpf_arena, "kern_vm")?;
        let arena_user_vm_start = member_byte_offset(btf, &bpf_arena, "user_vm_start")?;

        let (vm_struct, _) =
            find_struct(btf, "vm_struct").context("btf: struct vm_struct not found")?;
        let vm_struct_addr = member_byte_offset(btf, &vm_struct, "addr")?;

        Ok(Self {
            arena_kern_vm,
            arena_user_vm_start,
            vm_struct_addr,
        })
    }
}

/// One mapped arena page captured from guest memory.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub struct ArenaPage {
    /// User-side virtual address (32-bit window starting at
    /// `arena.user_vm_start`). Operators correlate this with the
    /// pointer values they see in BPF program output.
    pub user_addr: u64,
    /// One arena page's worth of bytes read from the guest. Length
    /// matches the guest kernel's MMU page size: 4 KiB on x86_64
    /// and on aarch64 with `TCR_EL1.TG1=0b10`; 16 KiB on aarch64
    /// 16 KiB-granule kernels (Apple Silicon style); 64 KiB on
    /// aarch64 64 KiB-granule kernels. The resolution lives in
    /// [`guest_page_size`] — the snapshot stamps every captured
    /// page at that size.
    pub bytes: Vec<u8>,
}

/// Snapshot of one arena map's mapped pages.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct ArenaSnapshot {
    /// Mapped pages, in pgoff order (unmapped pgoffs are skipped).
    /// Sequential prefix (pgoffs `0..MAX_ARENA_PAGES`) followed by any
    /// stride-probe hits in the sparse tail (pgoffs sampled across
    /// `MAX_ARENA_PAGES..declared_pages`).
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub pages: Vec<ArenaPage>,
    /// True when the walker stopped sequential enumeration at
    /// [`MAX_ARENA_PAGES`] before finishing the user_vm window. The
    /// stride sweep that follows samples the tail at coarse intervals,
    /// so a hit reaches `pages` even when this flag is set; pgoffs
    /// between sampled positions are still silently skipped.
    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
    pub truncated: bool,
    /// Total declared page count. Derived from
    /// `max_entries * page_size` (the BPF map's declared page
    /// capacity, with `page_size` resolved from the guest's
    /// TCR_EL1 via [`guest_page_size`]), not the user_vm window.
    /// Reflects any [`MAX_VM_RANGE_BYTES`] cap. Surfaced alongside
    /// `pages.len()` so consumers can see the
    /// allocated-vs-declared ratio.
    pub declared_pages: u64,
    /// True when `max_entries * page_size` exceeded
    /// [`MAX_VM_RANGE_BYTES`] (4 GiB) and the walker capped the span
    /// before computing `declared_pages`. Indicates a torn / corrupt
    /// `bpf_arena` struct or a freeze-time race against initialization;
    /// the rendered pages still come from valid translates, so the
    /// snapshot is usable.
    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
    pub span_capped: bool,
    /// Kernel-side base of the arena's user_vm window:
    /// `bpf_arena.kern_vm->addr + GUARD_HALF`. Surfaces here so
    /// downstream consumers (notably the [`super::sdt_alloc`] tree
    /// walker) can translate `__arena` pointers without re-reading
    /// `struct bpf_arena` themselves. `0` when the snapshot bailed
    /// before computing the value (kern_vm_kva NULL, vm_addr NULL,
    /// or any of the upstream translates failed).
    ///
    /// Always serialized — the zero value carries diagnostic
    /// information ("walker reached this point but couldn't compute
    /// the base"), so suppressing it would mask the failure. Mirrors
    /// the policy used for the sibling `declared_pages` field.
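    ///
    /// Sketch of the downstream translation (the same arithmetic the
    /// walker itself applies per pgoff):
    ///
    /// ```text
    /// kaddr = kern_vm_start + (arena_ptr & 0xFFFF_FFFF)
    /// ```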
    pub kern_vm_start: u64,
    /// User-side base of the arena window: the value of
    /// `bpf_arena.user_vm_start`, the address space the BPF program
    /// (and any captured `__arena` pointer) sees. `[user_vm_start ..
    /// user_vm_start + 4 GiB)` is the kernel-enforced upper bound
    /// (`bpf_arena_alloc_pages` clamps to `SZ_4G`). Consumers use it
    /// to classify a pointer as "lives in this arena" before chasing
    /// into [`Self::pages`].
    ///
    /// `0` when the snapshot bailed before reading
    /// `arena.user_vm_start` (e.g. `arena_pa` translate failed). On
    /// the syscall backend this comes from `bpf_map.map_extra` which
    /// the kernel pins at create time (`lib/arena_map.h` hardcodes
    /// `1<<44` on x86, `1<<32` on aarch64). On the guest-memory
    /// backend it's read directly from
    /// `bpf_arena.user_vm_start` via the resolved offset.
    ///
    /// Always serialized for the same diagnostic reason as
    /// [`Self::kern_vm_start`].
    pub user_vm_start: u64,
}

/// Walk the arena's mapped pages and return a snapshot.
///
/// Reads `kern_vm` from `struct bpf_arena` at `info.map_kva`,
/// dereferences to `vm_struct.addr`, computes
/// `kern_vm_start = addr + GUARD_HALF`, and for each pgoff selected
/// by the walk plan (sequential prefix plus sparse stride sweep)
/// translates `kern_vm_start + (u32)user_addr` via
/// [`GuestMem::translate_kva`]. Pages that fail to translate are
/// "not faulted in" and silently skipped.
///
/// The walker is best-effort: any read failure on `bpf_arena` /
/// `vm_struct` itself yields an empty snapshot rather than an error,
/// so a corrupt arena can't break the broader failure dump.
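///
/// A sketch of one way a consumer might use the result; the `kernel`,
/// `info`, and `offsets` bindings and the pointer value are
/// illustrative, not part of any fixed API surface:
///
/// ```ignore
/// let snapshot = snapshot_arena(&kernel, &info, &offsets);
/// // Hypothetical __arena pointer captured from BPF program output.
/// let ptr: u64 = 0x0000_0001_0000_2000;
/// // In-window check first, then locate the captured page holding it.
/// if ptr >= snapshot.user_vm_start
///     && ptr - snapshot.user_vm_start < MAX_VM_RANGE_BYTES
/// {
///     if let Some(page) = snapshot
///         .pages
///         .iter()
///         .find(|p| ptr >= p.user_addr && ptr - p.user_addr < p.bytes.len() as u64)
///     {
///         // page.bytes[(ptr - page.user_addr) as usize ..] holds the
///         // pointed-to data, if the page was faulted in at freeze time.
///     }
/// }
/// ```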
pub fn snapshot_arena(
    kernel: &GuestKernel<'_>,
    info: &BpfMapInfo,
    offsets: &BpfArenaOffsets,
) -> ArenaSnapshot {
    if info.map_type != BPF_MAP_TYPE_ARENA {
        return ArenaSnapshot::default();
    }

    let mem = kernel.mem();
    let walk = kernel.walk_context();
    let page_size = guest_page_size(walk.tcr_el1);
    let guard_half_bytes = guard_half(page_size);

    // bpf_arena embeds bpf_map at offset 0, so map_kva == arena_kva.
    let arena_kva = info.map_kva;
    // Translate the arena struct itself — it may be kmalloc'd
    // (direct map) or vmalloc'd (`bpf_map_area_alloc`).
    let Some(arena_pa) = super::idr::translate_any_kva(
        mem,
        walk.cr3_pa,
        walk.page_offset,
        arena_kva,
        walk.l5,
        walk.tcr_el1,
    ) else {
        return ArenaSnapshot::default();
    };

    let user_vm_start = mem.read_u64(arena_pa, offsets.arena_user_vm_start);
    let kern_vm_kva = mem.read_u64(arena_pa, offsets.arena_kern_vm);
    // Preserve `user_vm_start` even when the kern-side walk fails:
    // the `MemReader::is_arena_addr` consumer needs it to classify
    // an `__arena` pointer as in-window (vs. a kernel kptr) so the
    // Ptr-deref path returns `None` cleanly instead of falling
    // through to the kernel-kptr cpumask probe. Without the anchor,
    // an arena pointer would be misread as a slab address — at best
    // garbage hex, at worst a translate against an unmapped page.
    if kern_vm_kva == 0 {
        return ArenaSnapshot {
            user_vm_start,
            ..ArenaSnapshot::default()
        };
    }

    // vm_struct lives in the kernel's slab/kmalloc area; direct or
    // vmalloc, so use translate_any_kva.
    let Some(vm_struct_pa) = super::idr::translate_any_kva(
        mem,
        walk.cr3_pa,
        walk.page_offset,
        kern_vm_kva,
        walk.l5,
        walk.tcr_el1,
    ) else {
        return ArenaSnapshot {
            user_vm_start,
            ..ArenaSnapshot::default()
        };
    };
    let vm_addr = mem.read_u64(vm_struct_pa, offsets.vm_struct_addr);
    if vm_addr == 0 {
        return ArenaSnapshot {
            user_vm_start,
            ..ArenaSnapshot::default()
        };
    }
    let kern_vm_start = vm_addr.wrapping_add(guard_half_bytes);

    // max_entries is the create-time page capacity; user_vm_end may
    // be 0 for arenas without userspace mmap.
    let plan = ArenaWalkPlan::new((info.max_entries as u64) * page_size, page_size);

    let mut snapshot = ArenaSnapshot {
        pages: Vec::new(),
        truncated: plan.truncated,
        declared_pages: plan.declared_pages,
        span_capped: plan.span_capped,
        kern_vm_start,
        user_vm_start,
    };

    // Reusable scratch buffer for the per-page read. On a successful
    // capture the buffer is moved into `ArenaPage` (one allocation per
    // captured page is unavoidable since each page owns its bytes) and
    // the next capture re-allocates it via `resize`. The win is the
    // skip path: a translate-failure pgoff used to allocate and
    // discard a page-sized zero-initialised buffer, but now returns
    // before touching the scratch at all, and a short-read pgoff
    // reuses the scratch's existing capacity. On a sparse arena window
    // (most pgoffs unmapped) this collapses thousands of doomed
    // allocations into one; those allocations used to dominate
    // freeze-time wallclock in the freeze coordinator's dump pipeline
    // on arenas whose declared page count far exceeds the captured
    // count.
    let mut scratch: Vec<u8> = Vec::with_capacity(page_size as usize);

    // Closure: translate one pgoff to a page-content read; push
    // onto `snapshot.pages` if the translate + read succeed.
    // Captures `mem`, `walk`, `kern_vm_start`, `user_vm_start`,
    // `page_size`, and `scratch` (mutable — drained into the
    // captured page on success).
    let mut try_capture_page = |pgoff: u64, pages: &mut Vec<ArenaPage>| {
        // user_vm_start + pgoff*page_size is a 64-bit value, but the
        // kernel composes the kern-VA from the LOW 32 bits only —
        // `uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE)`
        // in arena_alloc_pages — since the user_vm window is capped
        // at SZ_4G and aligned so the low 32 bits cover the whole
        // span uniquely. Match the same truncation here.
        //
        // pgoff and page_size both originate from BPF map metadata
        // and the guest TCR_EL1; pgoff*page_size in u64 can overflow
        // when a corrupt map advertises a huge declared_pages count.
        // Skip the page on multiplication overflow — wrapping_add on
        // user_vm_start is intentional (matches kernel truncation),
        // but only when the multiplicand was correctly computed.
        let Some(byte_off) = pgoff.checked_mul(page_size) else {
            return;
        };
        let user_addr = user_vm_start.wrapping_add(byte_off);
        let kaddr = kern_vm_start.wrapping_add(user_addr & 0xFFFF_FFFF);
        let Some(pa) = mem.translate_kva(walk.cr3_pa, Kva(kaddr), walk.l5, walk.tcr_el1) else {
            return;
        };
        // Translate guarantees a page-aligned PA; bound-check
        // against guest DRAM size in case a corrupt PTE points
        // past end-of-DRAM.
        if pa + page_size > mem.size() {
            return;
        }
        // Resize the reusable scratch to `page_size` and zero-fill.
        // After a previous capture moved the Vec out via `mem::take`,
        // `scratch` is a fresh empty Vec, so this resize allocates the
        // next page's buffer; pgoffs that hit the early returns above
        // never reach this line, so their allocation is avoided
        // entirely.
        scratch.clear();
        scratch.resize(page_size as usize, 0);
        // `GuestMem::read_bytes` returns the actual byte count copied
        // (may be short when the PA crosses end-of-DRAM, even after
        // the bounds check above — DRAM can have non-contiguous
        // regions). Truncate the buffer to that count so consumers
        // never see the zero-init tail of an unwritten range as
        // legitimate page bytes.
        let n = mem.read_bytes(pa, &mut scratch);
        scratch.truncate(n);
        if scratch.is_empty() {
            return;
        }
        // Move the populated buffer into the captured page;
        // `mem::take` leaves `scratch` as a fresh empty Vec for the
        // next iteration (the next successful capture re-allocates it
        // via `resize`).
        pages.push(ArenaPage {
            user_addr,
            bytes: std::mem::take(&mut scratch),
        });
    };

    // Phase 1: sequential walk of the first MAX_ARENA_PAGES (16 MiB
    // window) — covers every scheduler today, where allocations cluster
    // near pgoff 0.
    for pgoff in 0..plan.sequential_to {
        try_capture_page(pgoff, &mut snapshot.pages);
    }

    // Phase 2: stride sweep over the sparse tail. Without this, a
    // scheduler that allocated even one page near the 4 GiB end of
    // its user_vm window would be invisible to the dump despite the
    // truncation flag. Mapped pages discovered here append to
    // `snapshot.pages` after the sequential prefix and are
    // discoverable by `user_addr` (the consumer correlates by user
    // pointer, not pgoff index, so out-of-order pgoffs are fine).
    if let Some(stride) = plan.stride {
        let mut pgoff = plan.sequential_to;
        while pgoff < plan.declared_pages {
            try_capture_page(pgoff, &mut snapshot.pages);
            // saturating_add is defensive: if the add wrapped, pgoff
            // could land back below `declared_pages` and the loop
            // would never terminate. With the span cap in place the
            // sum cannot actually overflow, but the guard is cheap.
            pgoff = pgoff.saturating_add(stride);
        }
    }

    snapshot
}

/// Pure computation that decides how many pgoffs the walker must
/// translate (sequential prefix + stride sweep). Extracted so the
/// span-cap, declared-page, and stride-derivation logic is unit-
/// testable without mocking a [`super::guest::GuestKernel`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct ArenaWalkPlan {
    /// Page count the snapshot reports as "declared". Reflects any
    /// [`MAX_VM_RANGE_BYTES`] cap.
    declared_pages: u64,
    /// True when [`MAX_VM_RANGE_BYTES`] capped the raw span.
    span_capped: bool,
    /// True when `declared_pages > MAX_ARENA_PAGES` and the walker
    /// will skip pgoffs in the sparse tail.
    truncated: bool,
    /// Sequential-walk endpoint: the walker enumerates
    /// `0..sequential_to` exhaustively.
    sequential_to: u64,
    /// Stride for the post-sequential sweep, or `None` when no tail
    /// remains. `Some(stride)` walks pgoffs
    /// `sequential_to, sequential_to + stride, ...` until
    /// `declared_pages`.
    stride: Option<u64>,
}

impl ArenaWalkPlan {
    fn new(raw_span: u64, page_size: u64) -> Self {
        let span_capped = raw_span > MAX_VM_RANGE_BYTES;
        let span = raw_span.min(MAX_VM_RANGE_BYTES);
        let declared_pages = span / page_size;
        let sequential_to = declared_pages.min(MAX_ARENA_PAGES);
        let truncated = declared_pages > sequential_to;
        let stride = if declared_pages > MAX_ARENA_PAGES {
            let tail_pages = declared_pages - MAX_ARENA_PAGES;
            // div_ceil so stride * MAX_ARENA_STRIDE_PROBES covers
            // the whole tail; .max(1) so a tail smaller than
            // MAX_ARENA_STRIDE_PROBES still walks every remaining
            // page sequentially.
            Some(tail_pages.div_ceil(MAX_ARENA_STRIDE_PROBES).max(1))
        } else {
            None
        };
        Self {
            declared_pages,
            span_capped,
            truncated,
            sequential_to,
            stride,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parse_arena_offsets_from_vmlinux() {
        let path = match crate::monitor::find_test_vmlinux() {
            Some(p) => p,
            None => return,
        };
        // Skip when find_test_vmlinux returns the raw BTF blob — the
        // vmlinux-ELF parse path inside `from_vmlinux` would fail on
        // it, but `from_btf` works directly. Tests in btf_offsets/tests.rs
        // skip the same way for the same reason.
        if path.starts_with("/sys/") {
            crate::report::test_skip("vmlinux is raw BTF (skipping ELF-only path)");
            return;
        }
        let offsets = match BpfArenaOffsets::from_vmlinux(&path) {
            Ok(o) => o,
            Err(e) => {
                // Older kernels without arena support: BTF lacks
                // `struct bpf_arena`. That's a valid configuration —
                // skip rather than fail.
                crate::report::test_skip(format!("arena BTF missing: {e}"));
                return;
            }
        };
        // bpf_arena starts with `struct bpf_map map`, so user_vm_start
        // comes AFTER the embedded bpf_map and must sit at a nonzero
        // offset; kern_vm follows it in the kernel layout and must be
        // distinct from it.
        assert!(
            offsets.arena_user_vm_start > 0,
            "user_vm_start follows embedded bpf_map"
        );
        assert_ne!(
            offsets.arena_kern_vm, offsets.arena_user_vm_start,
            "kern_vm distinct from user_vm_start"
        );
        // vm_struct.addr lives after the 8-byte union (next/llnode)
        // on 64-bit kernels.
        assert!(
            offsets.vm_struct_addr > 0,
            "vm_struct.addr follows the next/llnode union"
        );
    }

    // ---- ArenaWalkPlan: span cap + stride sweep -------------------
    //
    // The plan is a pure function of the raw user_vm span. Pin its
    // outputs against representative shapes so the snapshot_arena
    // call site stays tight against:
    //   - tiny arena (single page) — no truncation, no stride
    //   - mid arena (just under 16 MiB) — sequential only
    //   - large arena (declared > MAX_ARENA_PAGES) — sequential
    //     prefix + stride sweep
    //   - 4 GiB-cap (raw_span > MAX_VM_RANGE_BYTES) — span_capped flag,
    //     declared_pages clamped to MAX_VM_RANGE_BYTES / page_size
    //   - corrupt span (raw_span = u64::MAX) — capped, flag set,
    //     no overflow

    /// Page size used for ArenaWalkPlan unit tests. Production code
    /// resolves the page size from [`guest_page_size`] (which decodes
    /// the guest's `TCR_EL1.TG1`); the plan tests pin their math
    /// against an explicit 4 KiB so they exercise the same shapes
    /// regardless of the host the test runs on. Granule-specific
    /// shapes have their own dedicated tests
    /// (`arena_walk_plan_16k_granule_*`).
    const TEST_PAGE_SIZE: u64 = 4096;

    #[test]
    fn arena_walk_plan_constants_sane() {
        // The plan-derivation invariants depend on these constants.
        // Pin them so a future tightening surfaces here, not in
        // snapshot_arena's runtime behavior.
        assert_eq!(MAX_VM_RANGE_BYTES, 0x1_0000_0000);
        assert_eq!(MAX_ARENA_PAGES, 4096);
        assert_eq!(MAX_ARENA_STRIDE_PROBES, 256);
    }

    #[test]
    fn arena_walk_plan_single_page() {
        // Smallest non-empty arena: one page. Sequential walk covers
        // it; no stride needed; no truncation.
        let plan = ArenaWalkPlan::new(TEST_PAGE_SIZE, TEST_PAGE_SIZE);
        assert_eq!(plan.declared_pages, 1);
        assert!(!plan.span_capped);
        assert!(!plan.truncated);
        assert_eq!(plan.sequential_to, 1);
        assert_eq!(plan.stride, None);
    }

    #[test]
    fn arena_walk_plan_exactly_max_arena_pages() {
        // declared == MAX_ARENA_PAGES: still no stride, no truncation.
        // Boundary case: MAX_ARENA_PAGES walks sequentially.
        let plan = ArenaWalkPlan::new(MAX_ARENA_PAGES * TEST_PAGE_SIZE, TEST_PAGE_SIZE);
        assert_eq!(plan.declared_pages, MAX_ARENA_PAGES);
        assert!(!plan.truncated);
        assert_eq!(plan.sequential_to, MAX_ARENA_PAGES);
        assert_eq!(plan.stride, None);
    }

    #[test]
    fn arena_walk_plan_one_page_past_max() {
        // declared = MAX_ARENA_PAGES + 1: stride mode kicks in for
        // the single tail page; stride must be 1 (every page).
        let plan = ArenaWalkPlan::new((MAX_ARENA_PAGES + 1) * TEST_PAGE_SIZE, TEST_PAGE_SIZE);
        assert_eq!(plan.declared_pages, MAX_ARENA_PAGES + 1);
        assert!(plan.truncated);
        assert_eq!(plan.sequential_to, MAX_ARENA_PAGES);
        assert_eq!(plan.stride, Some(1));
    }

    #[test]
    fn arena_walk_plan_full_4gib() {
        // Largest legitimate arena: full 4 GiB user_vm window (1M pages).
        // Sequential covers first 16 MiB; stride sweeps the remaining
        // ~1M-4096 pages with 256 probes -> stride = ceil((1M - 4096) / 256).
        let raw = MAX_VM_RANGE_BYTES;
        let plan = ArenaWalkPlan::new(raw, TEST_PAGE_SIZE);
        assert_eq!(plan.declared_pages, raw / TEST_PAGE_SIZE);
        assert!(!plan.span_capped, "exactly 4 GiB is at the cap, not above");
        assert!(plan.truncated);
        assert_eq!(plan.sequential_to, MAX_ARENA_PAGES);
        let stride = plan.stride.expect("stride mode for >MAX_ARENA_PAGES");
        let tail = plan.declared_pages - MAX_ARENA_PAGES;
        // Verify stride covers the tail: stride * MAX_ARENA_STRIDE_PROBES
        // must reach `tail` with at most one slot of overshoot.
        assert!(stride * MAX_ARENA_STRIDE_PROBES >= tail);
        assert!((stride - 1) * MAX_ARENA_STRIDE_PROBES < tail);
    }

    #[test]
    fn arena_walk_plan_caps_at_4gib() {
        // Raw span 8 GiB (corrupt struct): span_capped flag set,
        // declared_pages clamped to MAX_VM_RANGE_BYTES / page_size.
        let plan = ArenaWalkPlan::new(2 * MAX_VM_RANGE_BYTES, TEST_PAGE_SIZE);
        assert!(plan.span_capped);
        assert_eq!(plan.declared_pages, MAX_VM_RANGE_BYTES / TEST_PAGE_SIZE);
        assert!(plan.truncated);
        assert!(plan.stride.is_some());
    }

    #[test]
    fn arena_walk_plan_caps_corrupt_u64_max_span() {
        // Pathological: raw_span = u64::MAX. The cap must apply
        // BEFORE the span-to-pages division; without the cap,
        // u64::MAX / page_size = ~4.5 quadrillion pages and the
        // pgoff loop would live-lock.
        let plan = ArenaWalkPlan::new(u64::MAX, TEST_PAGE_SIZE);
        assert!(plan.span_capped);
        assert_eq!(plan.declared_pages, MAX_VM_RANGE_BYTES / TEST_PAGE_SIZE);
        assert!(plan.truncated);
    }

    #[test]
    fn arena_walk_plan_zero_span() {
        // Edge: zero span. snapshot_arena can reach this with
        // max_entries=0; the plan must handle zero spans without
        // panicking or computing nonsense bounds.
        let plan = ArenaWalkPlan::new(0, TEST_PAGE_SIZE);
        assert_eq!(plan.declared_pages, 0);
        assert!(!plan.span_capped);
        assert!(!plan.truncated);
        assert_eq!(plan.sequential_to, 0);
        assert_eq!(plan.stride, None);
    }

    #[test]
    fn arena_walk_plan_stride_visits_every_pgoff_when_short_tail() {
        // tail < MAX_ARENA_STRIDE_PROBES: stride saturates at 1, so
        // the sweep walks every remaining page. Verify by simulating
        // the walk and counting positions.
        // declared = MAX_ARENA_PAGES + 50 -> tail = 50 -> stride = 1.
        let plan = ArenaWalkPlan::new((MAX_ARENA_PAGES + 50) * TEST_PAGE_SIZE, TEST_PAGE_SIZE);
        assert_eq!(plan.stride, Some(1));
        let mut pgoff = plan.sequential_to;
        let mut visited = 0u64;
        while pgoff < plan.declared_pages {
            visited += 1;
            pgoff = pgoff.saturating_add(plan.stride.unwrap());
        }
        assert_eq!(visited, 50, "every tail page should be visited");
    }

    #[test]
    fn arena_walk_plan_stride_distributes_probes_in_long_tail() {
        // tail >> MAX_ARENA_STRIDE_PROBES: stride > 1, fewer probes
        // than tail pages. Verify the sweep visits exactly
        // approximately MAX_ARENA_STRIDE_PROBES positions.
        let plan = ArenaWalkPlan::new(MAX_VM_RANGE_BYTES, TEST_PAGE_SIZE); // full 4 GiB
        let mut pgoff = plan.sequential_to;
        let mut visited = 0u64;
        while pgoff < plan.declared_pages {
            visited += 1;
            pgoff = pgoff.saturating_add(plan.stride.unwrap());
        }
        // The sweep visits ceil(tail / stride) positions; for the
        // 4 GiB case `stride * MAX_ARENA_STRIDE_PROBES >= tail` so
        // visited <= MAX_ARENA_STRIDE_PROBES, and `>= tail / stride`
        // ensures it's not zero.
        assert!(
            visited <= MAX_ARENA_STRIDE_PROBES + 1,
            "visited {visited}, expected ≤ {} probes",
            MAX_ARENA_STRIDE_PROBES + 1
        );
        assert!(
            visited >= MAX_ARENA_STRIDE_PROBES - 1,
            "visited {visited}, expected ≥ {}-ish probes",
            MAX_ARENA_STRIDE_PROBES - 1
        );
    }

    /// `guard_half` mirrors the kernel's `bpf_arena_get_kern_vm_start`
    /// `GUARD_SZ/2` formula. Pin the three legitimate page granules
    /// (4 KiB, 16 KiB, 64 KiB) against the hand-computed values from
    /// the doc comment so a regression in the
    /// `next_multiple_of(page_size << 1)` math surfaces here.
    #[test]
    fn guard_half_matches_kernel_formula() {
        // 4 KiB granule: round_up(65536, 8192) = 65536, /2 = 32768.
        assert_eq!(guard_half(4096), 32768);
        // 16 KiB granule: round_up(65536, 32768) = 65536, /2 = 32768.
        assert_eq!(guard_half(16384), 32768);
        // 64 KiB granule: round_up(65536, 131072) = 131072, /2 = 65536.
        assert_eq!(guard_half(65536), 65536);
    }

    /// `guest_page_size` decodes `TCR_EL1.TG1` (bits [31:30]) into
    /// the granule size on aarch64; on x86_64 it is fixed at 4 KiB
    /// regardless of the input. Pin the four encodings + the
    /// reserved fallback path so a regression in the bit math
    /// surfaces here.
    #[test]
    fn guest_page_size_decodes_tg1() {
        #[cfg(target_arch = "x86_64")]
        {
            // x86_64: page size is always 4 KiB, regardless of the
            // (ignored) `tcr_el1` argument.
            assert_eq!(guest_page_size(0), 4096);
            assert_eq!(guest_page_size(0b01u64 << 30), 4096);
            assert_eq!(guest_page_size(0b10u64 << 30), 4096);
            assert_eq!(guest_page_size(0b11u64 << 30), 4096);
        }
        #[cfg(target_arch = "aarch64")]
        {
            // TG1=0b10 → 4 KiB
            assert_eq!(guest_page_size(0b10u64 << 30), 4096);
            // TG1=0b01 → 16 KiB (Apple Silicon style)
            assert_eq!(guest_page_size(0b01u64 << 30), 16384);
            // TG1=0b11 → 64 KiB
            assert_eq!(guest_page_size(0b11u64 << 30), 65536);
            // TG1=0b00 (reserved) → conservative 4 KiB fallback
            assert_eq!(guest_page_size(0), 4096);
        }
    }

    /// 16 KiB-granule arena (Apple Silicon kernel build): a single
    /// declared page is 16 KiB. With raw_span = 16384 the plan must
    /// report `declared_pages = 1`, no stride. Pre-fix, `PAGE_SIZE`
    /// was hardcoded to 4096 so 16384 / 4096 = 4 pages — wrong.
    #[test]
    fn arena_walk_plan_16k_granule_single_page() {
        let plan = ArenaWalkPlan::new(16384, 16384);
        assert_eq!(plan.declared_pages, 1);
        assert!(!plan.span_capped);
        assert!(!plan.truncated);
        assert_eq!(plan.sequential_to, 1);
        assert_eq!(plan.stride, None);
    }

    /// 16 KiB-granule arena at the 4 GiB cap: `declared_pages` =
    /// 4 GiB / 16 KiB = 256 K. Pre-fix, the divisor was 4 KiB so
    /// the count would have been 4x too large.
    #[test]
    fn arena_walk_plan_16k_granule_full_cap() {
        let plan = ArenaWalkPlan::new(MAX_VM_RANGE_BYTES, 16384);
        assert_eq!(plan.declared_pages, MAX_VM_RANGE_BYTES / 16384);
        assert!(!plan.span_capped);
        assert!(plan.truncated);
        assert_eq!(plan.sequential_to, MAX_ARENA_PAGES);
    }
}