ktstr 0.17.0

Test harness for Linux process schedulers
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
//! Compute the minimum guest memory required to boot, extract the
//! initramfs, and run the post-boot test workload.
//!
//! Used by the deferred-memory path in [`KtstrVm`](super::KtstrVm) to
//! size guest memory from observed initramfs sizes instead of a static
//! caller estimate.

use anyhow::{Context, Result};
use std::path::Path;

/// Share of `totalram_pages` that the guest rootfs tmpfs may occupy while
/// the initramfs is being extracted.
///
/// By default the rootfs tmpfs is capped at half of RAM:
/// `shmem_default_max_blocks` (`mm/shmem.c`) returns
/// `totalram_pages() / 2`. The `initramfs_options=size=90%` cmdline token
/// (emitted unconditionally when `init_binary.is_some()`) lifts the cap
/// to 90%, but only on kernels carrying mainline commit 278033a225e1
/// ("fs: Add 'initramfs_options'") — first tagged v6.18-rc1 and
/// backported to the stable series (the per-series floors are listed on
/// [`Self::for_kernel_version`]). A kernel without the commit silently
/// ignores the token and keeps the 50% default.
///
/// A wrong guess is only dangerous in one direction. Sizing RAM for 50%
/// on a kernel that honors 90% merely wastes some RAM (the tmpfs ends up
/// larger than planned — safe). Sizing RAM for 90% on a kernel that only
/// grants 50% lets extraction overrun the tmpfs, and the guest panics.
/// Hence [`Self::Half`] is the answer under ANY uncertainty, and
/// [`Self::NinetyPercent`] is chosen only for a kernel POSITIVELY known
/// to honor the token (mainline >= 6.18, or a stable series at or above
/// its backport floor — see [`Self::for_kernel_version`]).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum TmpfsFraction {
    /// 50% of `totalram_pages`: the `shmem_default_max_blocks` default,
    /// and the floor that is safe on every kernel.
    Half,
    /// 90% of `totalram_pages`: the cap `initramfs_options=size=90%`
    /// installs on honoring kernels (mainline 6.18+ or a backported
    /// stable series — see [`Self::for_kernel_version`]). The RAM
    /// reclaimed scales with the uncompressed payload — roughly a third
    /// of the boot budget at the instrumented-payload shape (the
    /// `ninety_percent_fraction_sizes_less_ram_than_half` test measures
    /// the 3030 -> 1947 MiB drop).
    NinetyPercent,
}

impl TmpfsFraction {
    /// First mainline `(major, minor)` that honors the token at every
    /// patch level.
    const FIRST_MAINLINE: (u16, u16) = (6, 18);

    /// `(major, minor, first honoring patch)` for each stable series that
    /// received the `initramfs_options` backport. A series missing from
    /// this table (EOL / no backport) never selects
    /// [`Self::NinetyPercent`].
    const BACKPORT_FLOORS: [(u16, u16, u16); 7] = [
        (5, 4, 301),
        (5, 10, 246),
        (5, 15, 195),
        (6, 1, 157),
        (6, 6, 113),
        (6, 12, 54),
        (6, 17, 4),
    ];

    /// The fraction as a `(numerator, denominator)` pair, so the budget
    /// formula can stay in integer saturating arithmetic (no floats — the
    /// result must round identically on every host).
    /// `Half` -> `(1, 2)`, `NinetyPercent` -> `(9, 10)`.
    fn ratio(self) -> (u64, u64) {
        match self {
            Self::Half => (1, 2),
            Self::NinetyPercent => (9, 10),
        }
    }

    /// Choose the tmpfs fraction for a guest kernel identified by its
    /// `(major, minor, patch)` `version`.
    ///
    /// Yields [`Self::NinetyPercent`] iff the kernel is positively known
    /// to honor `initramfs_options=size=90%` (upstream commit
    /// 278033a225e1), i.e. either
    ///
    /// - mainline `(major, minor) >= (6, 18)` (first tagged v6.18-rc1;
    ///   the patch level is irrelevant), or
    /// - a stable series at or above the patch level where the backport
    ///   first shipped.
    ///
    /// The per-series backport floors — verified via
    /// `git tag --contains` of each series' backport commit against the
    /// linux-stable trees — are 5.4.301, 5.10.246, 5.15.195, 6.1.157,
    /// 6.6.113, 6.12.54, and 6.17.4.
    ///
    /// Everything else yields the conservative [`Self::Half`]: `None`
    /// (no version established), a series absent from the table (EOL /
    /// no backport, e.g. 6.7-6.11 or 6.13-6.16 — 6.12 IS in the table),
    /// or a version below its series floor. The table is a verified
    /// snapshot: a series that gains a backport later keeps falling to
    /// `Half` until it is added here — safe (no reclaim), never a panic.
    pub(crate) fn for_kernel_version(version: Option<(u16, u16, u16)>) -> Self {
        let honors_token = version.is_some_and(|(major, minor, patch)| {
            let series = (major, minor);
            series >= Self::FIRST_MAINLINE
                || Self::BACKPORT_FLOORS
                    .iter()
                    .any(|&(s_major, s_minor, floor)| (s_major, s_minor) == series && patch >= floor)
        });
        if honors_token {
            Self::NinetyPercent
        } else {
            Self::Half
        }
    }
}

/// Parameters for computing minimum guest memory; the input to
/// [`initramfs_min_memory_mib`]. All sizes are in bytes.
pub(crate) struct MemoryBudget {
    /// Uncompressed initramfs size (base + suffix cpio) in bytes. This is
    /// the payload the rootfs tmpfs must be able to hold during
    /// extraction.
    pub uncompressed_initramfs_bytes: u64,
    /// LZ4-compressed initrd size in bytes. The compressed initrd
    /// is memblock-reserved in guest physical memory from load until
    /// free_initrd_mem() releases it after extraction.
    pub compressed_initrd_bytes: u64,
    /// Kernel `init_size` from bzImage setup_header (offset 0x260).
    /// The kernel's declared contiguous memory requirement during
    /// boot decompression. Includes compressed payload, decompressed
    /// kernel, and decompression workspace. Overestimates resident
    /// kernel (init sections and workspace are freed post-boot),
    /// absorbing percpu and misc boot allocations.
    pub kernel_init_size: u64,
    /// `true` when the `/init` payload (the guest's PID 1) is built
    /// with `-C instrument-coverage`. An instrumented `/init` holds
    /// the LLVM profile-counter sections resident AND, at flush time,
    /// `__llvm_profile_write_buffer` serializes them into a heap
    /// buffer (`crate::test_support::try_flush_profraw`) — neither of
    /// which a non-instrumented payload pays for. When set,
    /// [`initramfs_min_memory_mib`] adds
    /// `instrumented_reserve_bytes` to the workload term so the
    /// instrumented `/init` does not OOM during boot.
    pub init_coverage_instrumented: bool,
    /// Extra resident bytes to reserve when
    /// `init_coverage_instrumented` is set: the summed sizes of the
    /// payload's `__llvm_prf_cnts` + `__llvm_prf_data` sections (the
    /// live coverage-counter and profile-metadata arrays the
    /// instrumented binary keeps resident). `0` when the payload is not
    /// instrumented or the sections are absent.
    ///
    /// This is a STEADY-STATE floor, not the flush-time peak: at flush
    /// `__llvm_profile_write_buffer` allocates a
    /// `__llvm_profile_get_size_for_buffer()` heap buffer (cnts + data +
    /// `__llvm_prf_names`) that briefly coexists with the resident
    /// sections, so the true peak is ~2x(cnts+data)+names. `WORKLOAD_MIB`
    /// slack absorbs the second copy at current binary sizes; a much
    /// larger instrumented `/init` may want a peak-aware (2x) reserve.
    pub instrumented_reserve_bytes: u64,
    /// Fraction of `totalram_pages` the guest rootfs tmpfs is sized for
    /// during initramfs extraction; see [`TmpfsFraction`]. A larger
    /// fraction sizes LESS total RAM for the same payload. Selected via
    /// [`TmpfsFraction::for_kernel_version`].
    pub tmpfs_fraction: TmpfsFraction,
}

/// Read the kernel's declared memory footprint from the image file.
///
/// x86_64 bzImage: reads the little-endian `u32` `init_size` field of the
/// setup_header at file offset 0x260 (setup_header starts at 0x1F1,
/// `init_size` is at byte 111 within it). This is the kernel's declared
/// contiguous memory requirement during boot decompression.
///
/// aarch64 Image: reads the little-endian `u64` `image_size` field of the
/// arm64 image header at offset 16 (after code0 + code1 + text_offset).
/// When the file starts with the gzip magic (`0x1f 0x8b`) it is treated
/// as a compressed vmlinuz: the first 24 bytes of the decompressed stream
/// are inflated and `image_size` is taken from bytes 16..24 of that
/// header.
///
/// # Errors
///
/// Fails if the file cannot be opened, or if seeking to / reading (or,
/// for a gzip vmlinuz, decompressing) the size field fails — e.g. the
/// file is too short to contain it.
pub(crate) fn read_kernel_init_size(kernel_path: &Path) -> Result<u64> {
    use std::io::{Read, Seek, SeekFrom};

    let mut image = std::fs::File::open(kernel_path)
        .with_context(|| format!("open kernel for init_size: {}", kernel_path.display()))?;

    #[cfg(target_arch = "x86_64")]
    {
        // 0x1F1 (setup_header start) + 111 (`init_size` within it).
        const INIT_SIZE_OFFSET: u64 = 0x260;

        let mut raw = [0u8; 4];
        image
            .seek(SeekFrom::Start(INIT_SIZE_OFFSET))
            .context("seek to init_size in bzImage")?;
        image
            .read_exact(&mut raw)
            .context("read init_size from bzImage")?;
        Ok(u64::from(u32::from_le_bytes(raw)))
    }

    #[cfg(target_arch = "aarch64")]
    {
        const GZIP_MAGIC: [u8; 2] = [0x1f, 0x8b];
        // `image_size` follows code0 (4) + code1 (4) + text_offset (8).
        const IMAGE_SIZE_OFFSET: usize = 16;

        let mut magic = [0u8; 2];
        image.read_exact(&mut magic).context("read kernel magic")?;

        let mut raw = [0u8; 8];
        if magic == GZIP_MAGIC {
            // Compressed vmlinuz: inflate just enough of the stream to
            // cover the header through `image_size`.
            image
                .seek(SeekFrom::Start(0))
                .context("seek vmlinuz to start")?;
            let mut header = [0u8; IMAGE_SIZE_OFFSET + 8];
            flate2::read::GzDecoder::new(&mut image)
                .read_exact(&mut header)
                .context("decompress arm64 vmlinuz header for image_size")?;
            raw.copy_from_slice(&header[IMAGE_SIZE_OFFSET..]);
        } else {
            // Uncompressed Image: read the field straight from the file.
            image
                .seek(SeekFrom::Start(IMAGE_SIZE_OFFSET as u64))
                .context("seek to image_size in arm64 Image")?;
            image
                .read_exact(&mut raw)
                .context("read image_size from arm64 Image")?;
        }
        Ok(u64::from_le_bytes(raw))
    }
}

/// Read the guest kernel's `(major, minor, patch)` version out of the
/// kernel image. The result feeds the `initramfs_options=size=90%`
/// honoring gate (see [`TmpfsFraction::for_kernel_version`]).
///
/// x86_64 bzImage: the setup_header `kernel_version` field (a `u16` at
/// file offset 0x20E) is "a pointer to a NUL-terminated version string,
/// less 0x200" (`Documentation/arch/x86/boot.rst`). When nonzero, the
/// string lives at file offset `0x200 + value` and begins with
/// `UTS_RELEASE` (`arch/x86/boot/version.c`), i.e.
/// `MAJOR.MINOR.PATCH[-extra]` (e.g. `6.18.0-rc1`). The leading
/// `MAJOR.MINOR.PATCH` of that string is parsed with
/// `parse_kernel_version`.
///
/// aarch64 Image: the arm64 image header carries no version string, so
/// this always returns `None` there. The caller `tmpfs_fraction` falls
/// back to the cache `metadata.json` sidecar
/// (`read_kernel_version_from_metadata_sidecar`) to recover the aarch64
/// version, so aarch64 is not unconditionally sized at 50%.
///
/// Returns `None` (-> conservative 50% sizing) on ANY uncertainty: an
/// unreadable image, a missing `HdrS` magic, a zero `kernel_version`
/// field (pre-2.00 protocol or stripped), a string offset out of range,
/// non-UTF-8 bytes, or a malformed leading `MAJOR.MINOR`. NEVER errors —
/// a missing version is a normal, safe outcome, not a boot-blocking
/// failure.
pub(crate) fn read_kernel_version(kernel_path: &Path) -> Option<(u16, u16, u16)> {
    #[cfg(target_arch = "x86_64")]
    {
        use std::io::{Read, Seek, SeekFrom};

        // setup_header "HdrS" magic: 4 bytes at file offset 0x202.
        const MAGIC_OFFSET: u64 = 0x202;
        // setup_header `kernel_version`: u16 at file offset 0x20E.
        const KERNEL_VERSION_OFFSET: u64 = 0x20E;
        // `kernel_version` is stored "less 0x200".
        const VERSION_STRING_BASE: u64 = 0x200;
        // Upper bound on how much of the version string is examined.
        const WINDOW_LEN: usize = 256;

        let mut image = std::fs::File::open(kernel_path).ok()?;

        // Confirm this really is a bzImage before trusting any header
        // field (boot protocol >= 2.00; the same check
        // `linux_loader::BzImage::load` makes). Otherwise the arbitrary
        // bytes a non-bzImage / pre-2.00 / corrupt image has at 0x20E
        // could parse as a honoring version and select the 90% tmpfs
        // fraction on a kernel that only grants 50% — the forbidden
        // panic direction. Any mismatch yields None (=> the safe Half):
        // 90% is taken only for a positively-confirmed bzImage.
        let mut magic = [0u8; 4];
        image.seek(SeekFrom::Start(MAGIC_OFFSET)).ok()?;
        image.read_exact(&mut magic).ok()?;
        if magic != *b"HdrS" {
            return None;
        }

        let mut field = [0u8; 2];
        image.seek(SeekFrom::Start(KERNEL_VERSION_OFFSET)).ok()?;
        image.read_exact(&mut field).ok()?;
        let string_offset = match u16::from_le_bytes(field) {
            // Zero => no version string (boot protocol < 2.00, or absent).
            0 => return None,
            ptr => VERSION_STRING_BASE + u64::from(ptr),
        };

        // The string is NUL-terminated; only a bounded window is read so
        // a corrupt pointer can't drive an unbounded read.
        let mut window = [0u8; WINDOW_LEN];
        image.seek(SeekFrom::Start(string_offset)).ok()?;
        let filled = image.read(&mut window).ok()?;

        // The string is `RELEASE (compile-by@host) ...`; the RELEASE
        // token ends at the first NUL or space (or at the end of the
        // window when neither occurs).
        let release_bytes = window[..filled]
            .split(|&b| b == 0 || b == b' ')
            .next()
            .unwrap_or(&[]);
        let release = std::str::from_utf8(release_bytes).ok()?;
        parse_kernel_version(release)
    }
    #[cfg(target_arch = "aarch64")]
    {
        // The arm64 Image header has no version string; the cache
        // metadata.json sidecar is the aarch64 version source (see
        // read_kernel_version_from_metadata_sidecar).
        let _ = kernel_path;
        None
    }
}

/// Parse the leading `MAJOR.MINOR[.PATCH]` from a kernel release string
/// such as `6.6.113`, `6.18.0-rc1`, or `7.1.0-rc7-gc80ba8d32ec3`.
///
/// - `MAJOR` must consist solely of ASCII digits. A sign is rejected:
///   a bare `str::parse::<u16>()` would accept `+6`, which no kernel
///   release string contains and which must not be read as a version.
/// - `MINOR` is the leading digit run of the second dot-separated
///   component, so a suffix that directly follows it (`6.18-rc1`) is
///   ignored.
/// - `PATCH` is the leading digit run of the third component. It is
///   optional — mainline rc tags (`6.18-rc1`) carry none, and mainline
///   >= 6.18 honors regardless of patch — so an absent, empty,
///   non-numeric, or out-of-range patch is reported as `0` (only the
///   stable-backport floors consult patch, and every stable release is
///   `MAJOR.MINOR.PATCH`).
///
/// Returns `None` if `MAJOR` or `MINOR` is absent, has no leading digits,
/// or does not fit in a `u16`.
///
/// Free fn so the host unit tests pin the parse against real
/// release-string shapes without constructing a bzImage.
fn parse_kernel_version(release: &str) -> Option<(u16, u16, u16)> {
    // Longest leading run of ASCII digits in `component` (possibly
    // empty). Cutting at the first non-digit byte always lands on a char
    // boundary: every byte before it is a single-byte ASCII digit.
    fn leading_digits(component: &str) -> &str {
        let end = component
            .bytes()
            .position(|b| !b.is_ascii_digit())
            .unwrap_or(component.len());
        &component[..end]
    }

    let mut components = release.split('.');

    // Digits only: keeps MAJOR as strict as MINOR/PATCH (no `+` sign).
    // An empty component passes this check but fails the parse below.
    let major_raw = components.next()?;
    if !major_raw.bytes().all(|b| b.is_ascii_digit()) {
        return None;
    }
    let major: u16 = major_raw.parse().ok()?;

    // An empty digit run fails to parse, which rejects a non-numeric
    // minor such as `6.x.1`.
    let minor: u16 = leading_digits(components.next()?).parse().ok()?;

    // Absent, empty, non-numeric, or overflowing => 0.
    let patch: u16 = components
        .next()
        .map(leading_digits)
        .and_then(|digits| digits.parse().ok())
        .unwrap_or(0);

    Some((major, minor, patch))
}

/// Recover the guest kernel `(major, minor, patch)` version from the
/// `metadata.json` file that sits next to the boot image.
///
/// The aarch64 `Image` embeds no version string (so
/// [`read_kernel_version`] returns `None` there), but a cached kernel
/// records its version in `metadata.json` alongside the image:
/// `crate::cache::CacheDir` stores the boot image at
/// `<entry>/<image_name>` and its metadata at `<entry>/metadata.json`,
/// which makes the sidecar the image's sibling. Reading it recovers the
/// 90% reclaim for cache-resident aarch64 kernels (whose version is
/// written into `metadata.json` by the acquisition: the kernel.org
/// tarball download, or — via the source-tree Makefile — a local source
/// tree or git clone).
///
/// Returns `None` — falling to the safe [`TmpfsFraction::Half`] —
/// whenever the version cannot be positively established:
///
/// - `kernel_path` has no parent directory, or there is no readable
///   sibling `metadata.json` (e.g. a raw `--kernel` path outside the
///   cache);
/// - `metadata.json` is not valid JSON for the probe;
/// - the acquisition recorded no version (`version` absent or `null` —
///   e.g. a source build whose `Makefile` was unparsable);
/// - the version string does not parse (see `parse_kernel_version`).
///
/// Only the `version` key is read: a minimal probe struct ignores the
/// rest of the schema, keeping this decoupled from
/// `crate::cache::KernelMetadata`'s other fields. The sidecar is
/// host-authored cache infrastructure, not guest input, so trusting its
/// version for the fraction decision is consistent with the threat model
/// (the guest never writes it).
pub(crate) fn read_kernel_version_from_metadata_sidecar(
    kernel_path: &Path,
) -> Option<(u16, u16, u16)> {
    /// The single field of the metadata schema this probe cares about.
    #[derive(serde::Deserialize)]
    struct VersionProbe {
        version: Option<String>,
    }

    let entry_dir = kernel_path.parent()?;
    let raw = std::fs::read_to_string(entry_dir.join("metadata.json")).ok()?;
    let recorded = serde_json::from_str::<VersionProbe>(&raw).ok()?.version?;
    parse_kernel_version(&recorded)
}

/// Minimum guest memory (in MiB) needed to boot, extract the initramfs,
/// and run the test workload.
///
/// ```text
/// total = computed_boot_requirement + WORKLOAD_MIB + shm
/// ```
///
/// ## Computed boot requirement
///
/// Every term is derived from values known at allocation time. The model
/// follows the kernel's boot memory layout.
///
/// **memblock-reserved regions** (excluded from `totalram_pages`):
///
/// - `kernel_init_size`: bzImage setup_header `init_size` field (offset
///   0x260) — the kernel's declared contiguous memory requirement during
///   boot decompression. Includes compressed payload, decompressed
///   vmlinux, and decompression workspace. Overestimates resident kernel
///   since init sections (`free_initmem`, `init/main.c`) and the
///   decompression workspace are freed post-boot. The slack absorbs
///   percpu allocations (`pcpu_embed_first_chunk` in `mm/percpu.c`
///   reserves `static_size + reserved_size + dyn_size` per CPU via
///   memblock, ~220KB/CPU with ktstr's kconfig which disables LOCKDEP)
///   and misc boot allocations (page tables, slab bootstrap, hash tables).
///
/// - `compressed_initrd`: memblock-reserved by `reserve_initrd_mem()`
///   (`init/initramfs.c:642`: `memblock_reserve(start, size)`) until
///   `free_initrd_mem()` after `unpack_to_rootfs` completes.
///
/// - struct page array: `P / 64` bytes. Each 4KB page requires a
///   `struct page` descriptor. Base size = 56 bytes (flags:8 + 5-word
///   union:40 + _mapcount:4 + _refcount:4), reaching 64 either by
///   `CONFIG_HAVE_ALIGNED_STRUCT_PAGE` 16-byte alignment padding
///   (`CONFIG_MEMCG=n`) or by the extra `memcg_data:8` field
///   (`CONFIG_MEMCG=y`) — same `/64` either way
///   (`include/linux/mm_types.h`). `CONFIG_KMSAN` (off here) would add
///   two pointers (→ 80 bytes); excluded.
///
/// **tmpfs constraint** (the binding limit for initramfs extraction):
///
/// The rootfs tmpfs is mounted by `init_mount_tree()` (`fs/namespace.c`)
/// via `vfs_kern_mount(&rootfs_fs_type, 0, ...)` — flags=0, NOT
/// `SB_KERNMOUNT`. `alloc_super` (`fs/super.c`) sets `s->s_flags = flags`,
/// so `SB_KERNMOUNT` is not set. In `shmem_fill_super` (`mm/shmem.c`),
/// the `!(sb->s_flags & SB_KERNMOUNT)` branch runs, and since no
/// `size=` mount option was parsed (`SHMEM_SEEN_BLOCKS` unset), it
/// falls through to `ctx->blocks = shmem_default_max_blocks()` =
/// `totalram_pages() / 2` (`mm/shmem.c:146`).
///
/// `initramfs_options=size=90%` on the cmdline is consumed by
/// `init_mount_tree()` (`fs/namespace.c`, via `initramfs_options_setup`)
/// when mounting the rootfs tmpfs — but only on kernels carrying mainline
/// commit 278033a225e1 ("fs: Add 'initramfs_options' to set initramfs
/// mount options"), first tagged v6.18-rc1. On kernels without it the
/// parameter is silently ignored ("Unknown kernel command line
/// parameters …, will be passed to user space") and the tmpfs uses its
/// 50% default. (The commit was also backported to the stable series —
/// v5.4.301, v5.10.246, v5.15.195, v6.1.157, v6.6.113, v6.12.54, v6.17.4
/// — which ktstr recognizes via `TmpfsFraction::for_kernel_version`.)
/// Initramfs unpacking then fails
/// with `write error` partway through if the uncompressed payload
/// exceeds the live tmpfs limit, leaving `/init` packed but its
/// dynamic-linker dep missing → `Failed to execute /init (error -2)`
/// → kernel panic.
///
/// The formula below sizes for `budget.tmpfs_fraction`: 90% only when
/// the guest kernel is positively known to honor the hint (mainline
/// 6.18+ or a stable series at or above its backport floor, via
/// `TmpfsFraction::for_kernel_version`), else the 50% default. Sizing
/// for 50% on a kernel that honors 90% is safe — the tmpfs is bigger
/// than we sized for; sizing for 90% on a kernel that only gives 50%
/// panics the guest mid-extraction, so the 90% path is taken ONLY on a
/// confirmed-honoring version (every uncertainty — unknown version, an
/// image lacking both an embedded version and a honoring sidecar, a
/// series absent from the table, or below its series floor — falls to
/// the safe 50%).
///
/// Note: `rootflags=size=90%` would set `root_mount_data` (assigned by
/// `root_data_setup` via `__setup("rootflags=", ...)` in
/// `init/do_mounts.c`), consumed only by `do_mount_root()` via
/// `prepare_namespace()`. With `rdinit=`, `kernel_init_freeable`
/// (`init/main.c`) skips `prepare_namespace()` when `init_eaccess`
/// succeeds, so `rootflags=` is never applied to the rootfs.
///
/// The `SB_KERNMOUNT` (unlimited) tmpfs is the separate `shm_mnt`
/// created by `shmem_init()` via `kern_mount()` — used for anonymous
/// shared memory (`shmem_file_setup`), not the rootfs.
///
/// ```text
/// totalram_pages(P) = (P - init_size - compressed - P/64) / 4096
/// tmpfs_max_pages = totalram_pages / 2
/// constraint: tmpfs_max_pages >= uncompressed / 4096
///
/// Solving for P:
/// (P - init_size - compressed - P/64) / 2 >= uncompressed
/// P * 63/64 >= 2 * uncompressed + init_size + compressed
/// P >= (2 * uncompressed + init_size + compressed) * 64/63
/// ```
///
/// ## Workload budget
///
/// 256 MiB for scheduler execution, test scenarios, and runtime
/// allocations (cgroup memory, BPF maps, process stacks, slab caches).
/// This is a deliberate budget for post-boot workload, not a guess at
/// kernel overhead.
///
/// Workload budget (MiB): scheduler execution, test scenarios, cgroup
/// memory, BPF maps, and runtime allocations.
///
/// ## Coverage-instrumented `/init` reserve
///
/// A `-C instrument-coverage` `/init` payload (the
/// [`MemoryBudget::init_coverage_instrumented`] flag) needs more than
/// the base workload budget: it holds the LLVM profile-counter
/// sections resident, and at flush time
/// `__llvm_profile_write_buffer`
/// (`crate::test_support::try_flush_profraw`) serializes them into a
/// heap buffer whose size `__llvm_profile_get_size_for_buffer`
/// reports. `WORKLOAD_MIB` is sized for a non-instrumented payload,
/// so the instrumented case adds
/// [`MemoryBudget::instrumented_reserve_bytes`] (the payload's
/// `__llvm_prf_cnts` + `__llvm_prf_data` section sizes, MiB-ceil) to
/// the workload term. Without it a ~600 MiB-stripped instrumented
/// `/init` OOMs during boot (the empirically-confirmed
/// `memory_deferred_min(4096)` workaround the reserve replaces with a
/// right-sized figure derived from the actual section sizes).
const WORKLOAD_MIB: u64 = 256;

/// Computes the minimum guest memory, in MiB, that `budget` needs.
///
/// The floor is the sum of three terms:
///
/// * **boot** — the uncompressed initramfs scaled by the inverse of
///   the tmpfs fraction, plus the kernel `init_size` and the compressed
///   initrd, with that total multiplied by 64/63;
/// * **workload** — the fixed `WORKLOAD_MIB` budget;
/// * **coverage reserve** — `budget.instrumented_reserve_bytes`,
///   counted only when `budget.init_coverage_instrumented` is set.
///
/// Byte inputs are rounded up to whole MiB before use and both
/// divisions round up, so rounding never under-sizes the result.
///
/// # Panics
///
/// Panics if the computed floor does not fit in a `u32`.
pub(crate) fn initramfs_min_memory_mib(budget: &MemoryBudget) -> u32 {
    const MIB: u64 = 1 << 20;

    // Whole-MiB ceiling of a byte count; the add saturates rather than
    // wrapping for inputs close to `u64::MAX`.
    fn mib_ceil(bytes: u64) -> u64 {
        bytes.saturating_add(MIB - 1) / MIB
    }

    // Boot requirement.
    //
    // The rootfs tmpfs may hold at most a fraction F = num/den of
    // totalram_pages: 50% by default, 90% only on a kernel that
    // positively honors `initramfs_options=size=90%` (see
    // `TmpfsFraction` / `TmpfsFraction::for_kernel_version`). A larger
    // F therefore sizes LESS RAM.
    //
    //   F * totalram_pages >= uncompressed_pages
    //   totalram_pages      = (P - reserved) / PAGE_SIZE
    //   reserved            = init_size + compressed + P/64   (struct page)
    //
    // which rearranges to
    //
    //   P * 63/64 >= (den/num) * uncompressed + init_size + compressed
    //
    // With F = 1/2 the scale factor is 2; with F = 9/10 it is 10/9.
    // The division rounds up: rounding down would under-size RAM and
    // risk a mid-boot tmpfs overrun.
    let (num, den) = budget.tmpfs_fraction.ratio();
    let scaled_uncompressed_mib = mib_ceil(budget.uncompressed_initramfs_bytes)
        .saturating_mul(den)
        .div_ceil(num);
    let content_mib = scaled_uncompressed_mib
        .saturating_add(mib_ceil(budget.kernel_init_size))
        .saturating_add(mib_ceil(budget.compressed_initrd_bytes));

    // The P/64 struct-page term sits on both sides of the inequality;
    // resolving that circular dependency gives P = content * 64/63.
    let boot_mib = content_mib.saturating_mul(64).div_ceil(63);

    // An instrumented `/init` also keeps its profile sections live, so
    // their size rides on top of the workload budget. A
    // non-instrumented payload contributes nothing here, whatever
    // `instrumented_reserve_bytes` holds.
    let coverage_reserve_mib = if budget.init_coverage_instrumented {
        mib_ceil(budget.instrumented_reserve_bytes)
    } else {
        0
    };

    // Because every step above saturates, a pathological input (a
    // corrupt kernel `init_size`, a malformed ELF section size) drives
    // the sum toward `u64::MAX` instead of wrapping to a small value.
    // The narrowing below then panics loudly rather than silently
    // truncating the floor and OOMing the guest mid-boot.
    let total_mib = boot_mib
        .saturating_add(WORKLOAD_MIB)
        .saturating_add(coverage_reserve_mib);
    match u32::try_from(total_mib) {
        Ok(floor_mib) => floor_mib,
        Err(_) => panic!(
            "initramfs_min_memory_mib: computed floor {total_mib}MiB exceeds u32 \
             (boot={boot_mib}MiB, workload={WORKLOAD_MIB}MiB, \
             coverage_reserve={coverage_reserve_mib}MiB)"
        ),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Guards the workload budget against silent edits. Every
    /// deferred-memory VM boot inherits `WORKLOAD_MIB` as part of its
    /// floor, so a new value has to be acknowledged here (and thus in
    /// review) before the suite passes again.
    #[test]
    fn workload_mib_is_256() {
        const PINNED_MIB: u64 = 256;
        assert_eq!(WORKLOAD_MIB, PINNED_MIB);
    }

    /// With no kernel and no initramfs (every size zero, coverage off)
    /// the boot term vanishes and the floor is exactly the workload
    /// budget — the lower bound the deferred-memory path always
    /// allocates.
    #[test]
    fn initramfs_min_memory_mib_zeros_returns_workload_budget() {
        let empty = MemoryBudget {
            tmpfs_fraction: TmpfsFraction::Half,
            init_coverage_instrumented: false,
            instrumented_reserve_bytes: 0,
            kernel_init_size: 0,
            compressed_initrd_bytes: 0,
            uncompressed_initramfs_bytes: 0,
        };

        let floor = initramfs_min_memory_mib(&empty);

        assert_eq!(floor, WORKLOAD_MIB as u32);
    }

    /// Hand-checked reference for the additive path: `kernel_init_size`
    /// and `compressed_initrd_bytes` are summed into `content_mib`
    /// next to the scaled uncompressed size, and the sum then takes the
    /// `*64/63` struct-page factor.
    ///
    /// Inputs: uncompressed=10 MiB, init_size=5 MiB, compressed=2 MiB,
    /// tmpfs fraction 1/2.
    ///   uncompressed_scaled = 10 * 2         = 20
    ///   content_mib         = 20 + 5 + 2     = 27
    ///   boot_mib            = ceil(27*64/63) = ceil(27.43) = 28
    ///   total               = 28 + 256 (WORKLOAD_MIB) = 284
    #[test]
    fn initramfs_min_memory_mib_known_input() {
        const MIB: u64 = 1 << 20;

        let floor = initramfs_min_memory_mib(&MemoryBudget {
            uncompressed_initramfs_bytes: 10 * MIB,
            kernel_init_size: 5 * MIB,
            compressed_initrd_bytes: 2 * MIB,
            init_coverage_instrumented: false,
            instrumented_reserve_bytes: 0,
            tmpfs_fraction: TmpfsFraction::Half,
        });

        assert_eq!(floor, 284);
    }

    /// An input smaller than 1 MiB is rounded UP to 1 MiB before it
    /// enters the math. A 1-byte initramfs (degenerate, but reachable
    /// when fixtures build empty payloads) must not collapse to zero
    /// and slip past the tmpfs-50% safety factor.
    ///
    /// Inputs: uncompressed=1 byte, init_size=0, compressed=0.
    ///   uncompressed_scaled = 1 * 2         = 2
    ///   content_mib         = 2 + 0 + 0     = 2
    ///   boot_mib            = ceil(2*64/63) = ceil(2.03) = 3
    ///   total               = 3 + 256       = 259
    #[test]
    fn initramfs_min_memory_mib_subbyte_uncompressed_rounds_up() {
        let one_byte_payload = MemoryBudget {
            uncompressed_initramfs_bytes: 1,
            kernel_init_size: 0,
            compressed_initrd_bytes: 0,
            init_coverage_instrumented: false,
            instrumented_reserve_bytes: 0,
            tmpfs_fraction: TmpfsFraction::Half,
        };

        let floor = initramfs_min_memory_mib(&one_byte_payload);

        assert_eq!(floor, 259);
    }

    /// Same arithmetic at integration-realistic scale (the production
    /// callers in vmm/mod.rs feed values of this order).
    ///
    /// Inputs: uncompressed=200 MiB, compressed=50 MiB, init_size=30 MiB,
    /// tmpfs fraction 1/2.
    ///   uncompressed_scaled = 200 * 2         = 400
    ///   content_mib         = 400 + 30 + 50   = 480
    ///   boot_mib            = ceil(480*64/63) = ceil(487.62) = 488
    ///   total               = 488 + 256       = 744
    #[test]
    fn initramfs_min_memory_mib_larger_input() {
        const MIB: u64 = 1 << 20;

        let floor = initramfs_min_memory_mib(&MemoryBudget {
            uncompressed_initramfs_bytes: 200 * MIB,
            kernel_init_size: 30 * MIB,
            compressed_initrd_bytes: 50 * MIB,
            init_coverage_instrumented: false,
            instrumented_reserve_bytes: 0,
            tmpfs_fraction: TmpfsFraction::Half,
        });

        assert_eq!(floor, 744);
    }

    /// Coverage-instrumented shape: the reserve is added to the floor
    /// only when `init_coverage_instrumented` is set, and for a
    /// GiB-scale instrumented `/init` the resulting floor clears the
    /// previously-used `memory_deferred_min(4096)` workaround.
    ///
    /// Both budgets share these inputs and differ only in the flag:
    ///   uncompressed=1200 MiB, compressed=300 MiB, init_size=30 MiB,
    ///   reserve=3500 MiB, tmpfs fraction 1/2.
    /// Trace:
    ///   uncompressed_scaled = 1200 * 2          = 2400
    ///   content_mib         = 2400 + 30 + 300   = 2730
    ///   boot_mib            = ceil(2730*64/63)  = ceil(2773.33) = 2774
    ///   flag off: total     = 2774 + 256        = 3030
    ///   flag on:  total     = 2774 + 256 + 3500 = 6530
    ///
    /// Besides the two exact values, the test states the contract
    /// directly: the instrumented floor is strictly above the
    /// non-instrumented one, and strictly above 4096.
    #[test]
    fn initramfs_min_memory_mib_instrumented_reserve_raises_floor() {
        const MIB: u64 = 1 << 20;

        // Identical payload every time; only the coverage flag varies.
        let floor_with_flag = |instrumented: bool| {
            initramfs_min_memory_mib(&MemoryBudget {
                uncompressed_initramfs_bytes: 1200 * MIB,
                compressed_initrd_bytes: 300 * MIB,
                kernel_init_size: 30 * MIB,
                init_coverage_instrumented: instrumented,
                instrumented_reserve_bytes: 3500 * MIB,
                tmpfs_fraction: TmpfsFraction::Half,
            })
        };

        let base_floor = floor_with_flag(false);
        let instrumented_floor = floor_with_flag(true);

        // Flag off: the reserve bytes are present but must be ignored.
        assert_eq!(
            base_floor, 3030,
            "non-instrumented floor must NOT include the reserve \
             (2774 boot + 256 workload)"
        );
        // Flag on: the reserve lands on top of the workload term.
        assert_eq!(
            instrumented_floor, 6530,
            "instrumented floor = 2774 boot + 256 workload + 3500 reserve"
        );
        assert!(
            instrumented_floor > base_floor,
            "instrumented reserve must raise the floor ({instrumented_floor} \
             vs {base_floor})"
        );
        assert!(
            instrumented_floor > 4096,
            "instrumented floor must clear the old memory_deferred_min(4096) \
             workaround (got {instrumented_floor})"
        );
    }

    /// `TmpfsFraction::for_kernel_version` selects `NinetyPercent` only
    /// for a positively-honoring kernel: mainline 6.18+ at any patch
    /// level, or a stable series at or above its backport floor.
    /// Anything else — below the floor, a series with no floor entry, or
    /// no version at all — gets the safe `Half`.
    #[test]
    fn tmpfs_fraction_gates_on_honoring_versions() {
        use TmpfsFraction::{Half, NinetyPercent};
        let frac = TmpfsFraction::for_kernel_version;

        // Mainline 6.18+ honors regardless of patch.
        for version in [(6, 18, 0), (6, 19, 0), (7, 0, 5)] {
            assert_eq!(
                frac(Some(version)),
                NinetyPercent,
                "mainline {version:?} must honor"
            );
        }
        // Mainline below 6.18 in a series with no backport at all (6.16
        // is absent from the floor table) -> Half. The 6.17 below-floor
        // case is part of the per-series loop that follows.
        assert_eq!(frac(Some((6, 16, 0))), Half);

        // Stable-backport series as (major, minor, floor patch): exactly
        // AT the floor -> NinetyPercent; one patch BELOW -> Half.
        // Floors verified via git tag --contains.
        let backport_floors = [
            (6, 17, 4),
            (6, 12, 54),
            (6, 6, 113),
            (6, 1, 157),
            (5, 15, 195),
            (5, 10, 246),
            (5, 4, 301),
        ];
        for (major, minor, floor) in backport_floors {
            assert_eq!(
                frac(Some((major, minor, floor))),
                NinetyPercent,
                "{major}.{minor}.{floor} is at its floor"
            );
            assert_eq!(
                frac(Some((major, minor, floor - 1))),
                Half,
                "{major}.{minor} one patch below floor {floor}"
            );
        }

        // A series absent from the table (EOL / no backport) -> Half,
        // even at a high patch level.
        for version in [(6, 9, 999), (6, 13, 999)] {
            assert_eq!(
                frac(Some(version)),
                Half,
                "series without a floor: {version:?}"
            );
        }
        // No version established -> Half.
        assert_eq!(frac(None), Half);
    }

    /// `parse_kernel_version` pulls the leading MAJOR.MINOR.PATCH out of
    /// real-world release strings. The patch component is optional
    /// (missing or non-numeric => 0); a missing or malformed major/minor
    /// yields `None`.
    #[test]
    fn parse_kernel_version_shapes() {
        // (release string, expected triple)
        let accepted = [
            ("6.18.0-rc1", (6, 18, 0)),
            ("7.1.0-rc7-gc80ba8d32ec3", (7, 1, 0)),
            ("6.12.54", (6, 12, 54)),
            ("6.6.113", (6, 6, 113)),
            // Minor-only rc tag: no patch component => patch 0.
            ("6.18-rc1", (6, 18, 0)),
            // Non-numeric patch => 0 (major.minor still parse).
            ("6.6.x", (6, 6, 0)),
        ];
        for (release, version) in accepted {
            assert_eq!(
                parse_kernel_version(release),
                Some(version),
                "release string {release:?}"
            );
        }

        // Empty, non-numeric, or major-only input has no usable
        // major.minor pair.
        for release in ["", "garbage", "6", "6.", "x.18"] {
            assert_eq!(
                parse_kernel_version(release),
                None,
                "release string {release:?}"
            );
        }
    }

    /// For one and the same payload the 90% fraction must size strictly
    /// LESS total RAM than the 50% fraction — that is the whole point of
    /// the reclaim. Inverting the constraint (more RAM at 90%) would
    /// silently lose the reclaim AND risk under-sizing.
    #[test]
    fn ninety_percent_fraction_sizes_less_ram_than_half() {
        const MIB: u64 = 1 << 20;

        // Fixed payload; only the tmpfs fraction differs between calls.
        let floor_for = |fraction: TmpfsFraction| {
            initramfs_min_memory_mib(&MemoryBudget {
                uncompressed_initramfs_bytes: 1200 * MIB,
                compressed_initrd_bytes: 300 * MIB,
                kernel_init_size: 30 * MIB,
                init_coverage_instrumented: false,
                instrumented_reserve_bytes: 0,
                tmpfs_fraction: fraction,
            })
        };
        let half = floor_for(TmpfsFraction::Half);
        let ninety = floor_for(TmpfsFraction::NinetyPercent);

        assert!(
            ninety < half,
            "90% tmpfs fraction must size less RAM than 50% \
             (ninety={ninety}MiB, half={half}MiB)"
        );
        // 50%: uncompressed_scaled = 1200*2 = 2400; content = 2730;
        //      boot = ceil(2730*64/63) = 2774; total = 2774 + 256 = 3030.
        assert_eq!(half, 3030, "50% floor: 2774 boot + 256 workload");
        // 90%: uncompressed_scaled = ceil(1200*10/9) = 1334;
        //      content = 1664; boot = ceil(1664*64/63) = 1691;
        //      total = 1691 + 256 = 1947.
        assert_eq!(ninety, 1947, "90% floor: 1691 boot + 256 workload");
    }

    /// On x86_64, `read_kernel_init_size` is expected to return the
    /// 4-byte little-endian field at file offset 0x260, widened to u64.
    /// The fixture is 0x260 zero bytes followed by a known value, which
    /// pins both the offset and the width against drift in the bzImage
    /// setup_header layout.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn read_kernel_init_size_x86_64_reads_offset_0x260() {
        use std::io::Write;

        const INIT_SIZE_OFFSET: usize = 0x260;
        // Non-zero in every byte, so a read landing in the zero padding
        // cannot reproduce it.
        const INIT_SIZE: u32 = 0x1234_5678;

        let mut image = tempfile::NamedTempFile::new().expect("tempfile");
        // Zero padding up to the field, then the field itself.
        image
            .write_all(&[0u8; INIT_SIZE_OFFSET])
            .expect("write pad");
        image
            .write_all(&INIT_SIZE.to_le_bytes())
            .expect("write init_size");
        image.flush().expect("flush");

        let got = read_kernel_init_size(image.path()).expect("read init_size");
        assert_eq!(got, u64::from(INIT_SIZE));
    }

    /// A file that ends before byte 0x264 (the end of the x86_64
    /// init_size field) must produce an error, never a silent 0. This
    /// pins the failure shape so a later "graceful-fallback" refactor
    /// that swallows truncated-bzImage errors cannot slip past review.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn read_kernel_init_size_x86_64_short_file_errors() {
        use std::io::Write;

        // 0x100 bytes — well short of the 0x264 required.
        const TRUNCATED_LEN: usize = 0x100;

        let mut image = tempfile::NamedTempFile::new().expect("tempfile");
        image
            .write_all(&[0u8; TRUNCATED_LEN])
            .expect("write truncated");
        image.flush().expect("flush");

        let result = read_kernel_init_size(image.path());
        assert!(result.is_err(), "truncated file must fail; got: {result:?}",);
    }

    /// On aarch64, `read_kernel_init_size` is expected to return the
    /// 8-byte little-endian `image_size` at file offset 16 of a raw
    /// Image (after `code0` at 0 and `text_offset` at 8). The fixture
    /// does not start with the gzip magic (0x1f 0x8b), so the raw-Image
    /// branch is the one under test. Pins the offset and width against
    /// drift in the arm64 image header layout
    /// (Documentation/arch/arm64/booting.rst, struct arm64_image_header
    /// in arch/arm64/include/asm/image.h).
    #[cfg(target_arch = "aarch64")]
    #[test]
    fn read_kernel_init_size_aarch64_reads_offset_16() {
        use std::io::Write;

        const IMAGE_SIZE_OFFSET: usize = 16;
        // Non-zero in every byte, so a read landing in the zero prefix
        // cannot reproduce it.
        const IMAGE_SIZE: u64 = 0x1234_5678_9abc_def0;

        let mut image = tempfile::NamedTempFile::new().expect("tempfile");
        // code0 + text_offset stand-ins: all zero, hence not gzip magic.
        image
            .write_all(&[0u8; IMAGE_SIZE_OFFSET])
            .expect("write prefix");
        image
            .write_all(&IMAGE_SIZE.to_le_bytes())
            .expect("write image_size");
        image.flush().expect("flush");

        let got = read_kernel_init_size(image.path()).expect("read image_size");
        assert_eq!(got, IMAGE_SIZE);
    }

    /// aarch64 mirror of the x86_64 short-file test: a file shorter than
    /// 24 bytes (offset 16 plus the 8-byte image_size) must produce an
    /// error, never a silent 0, so a later "graceful-fallback" refactor
    /// that swallows truncated-Image errors cannot slip past review.
    #[cfg(target_arch = "aarch64")]
    #[test]
    fn read_kernel_init_size_aarch64_short_file_errors() {
        use std::io::Write;

        // 8 zero bytes: well short of the 24 required, and not the gzip
        // magic, so the raw-Image branch is the one exercised.
        const TRUNCATED_LEN: usize = 8;

        let mut image = tempfile::NamedTempFile::new().expect("tempfile");
        image
            .write_all(&[0u8; TRUNCATED_LEN])
            .expect("write truncated");
        image.flush().expect("flush");

        let result = read_kernel_init_size(image.path());
        assert!(result.is_err(), "truncated file must fail; got: {result:?}",);
    }

    /// On x86_64, `read_kernel_version` is expected to walk the bzImage
    /// setup_header: check the "HdrS" magic at 0x202, read the u16
    /// kernel_version pointer at 0x20E, and parse the release string at
    /// 0x200 + pointer. Two fixtures that differ only in the magic pin
    /// both halves: the offset chase, and the gate that makes a
    /// non-bzImage return `None` (the safe-50% direction).
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn read_kernel_version_x86_64_offset_chase_and_magic_gate() {
        use std::io::{Seek, SeekFrom, Write};

        const HDRS_MAGIC_OFF: u64 = 0x202;
        const KERNEL_VERSION_OFF: u64 = 0x20E;
        // Pointer value stored at 0x20E; the string therefore sits at
        // file offset 0x200 + 0x100 = 0x300.
        const VER_PTR: u16 = 0x0100;
        const STRING_OFF: u64 = 0x200 + VER_PTR as u64;

        // Builds a zero-filled image carrying `magic`, the version
        // pointer, and a "6.18.0-rc1" release string.
        fn write_image(magic: &[u8; 4]) -> tempfile::NamedTempFile {
            let mut f = tempfile::NamedTempFile::new().expect("tempfile");
            // Zero fill that extends past the version-string region.
            f.write_all(&vec![0u8; (STRING_OFF as usize) + 64])
                .expect("pad");
            // setup_header magic slot.
            f.seek(SeekFrom::Start(HDRS_MAGIC_OFF)).expect("seek magic");
            f.write_all(magic).expect("write magic");
            // kernel_version pointer slot.
            f.seek(SeekFrom::Start(KERNEL_VERSION_OFF))
                .expect("seek ver_ptr");
            f.write_all(&VER_PTR.to_le_bytes()).expect("write ver_ptr");
            // "RELEASE (builder@host)" + NUL, as the kernel writes it (a
            // space bounds the RELEASE token).
            f.seek(SeekFrom::Start(STRING_OFF)).expect("seek string");
            f.write_all(b"6.18.0-rc1 (builder@host)\0")
                .expect("write string");
            f.flush().expect("flush");
            f
        }

        // Correct magic: the chase reaches the string and parses
        // (6, 18, 0) out of "6.18.0-rc1".
        let good = write_image(b"HdrS");
        assert_eq!(
            read_kernel_version(good.path()),
            Some((6, 18, 0)),
            "valid bzImage offset-chase must parse (6, 18, 0)",
        );

        // Wrong magic (not a bzImage / corrupt): None, even though the
        // 0x20E/0x300 bytes would parse as 6.18. An unvalidated image
        // must NOT select the 90% fraction.
        let bad = write_image(b"XXXX");
        assert_eq!(
            read_kernel_version(bad.path()),
            None,
            "wrong HdrS magic must return None (the safe-50% direction)",
        );
    }

    /// `read_kernel_version_from_metadata_sidecar` takes the `version`
    /// field of a `metadata.json` next to the image path and parses its
    /// MAJOR.MINOR.PATCH — the aarch64 reclaim source. Pinned here: a
    /// "6.18.2" version parses, with unrelated keys ignored; an absent
    /// `version` key, an explicit `null`, an unparsable string,
    /// malformed JSON, a missing sidecar, and a parent-less path all
    /// give `None` (the safe-50% direction).
    #[test]
    fn read_kernel_version_from_metadata_sidecar_parses_and_guards() {
        use std::io::Write;

        let dir = tempfile::tempdir().expect("tempdir");
        // Only the image path's parent matters to the reader; the image
        // file itself is never created in this test.
        let image = dir.path().join("Image");
        let sidecar = dir.path().join("metadata.json");

        // Overwrite the sidecar with `json`, then run the reader.
        let probe_with = |json: &str| {
            let mut f = std::fs::File::create(&sidecar).expect("create sidecar");
            f.write_all(json.as_bytes()).expect("write sidecar");
            f.flush().expect("flush");
            read_kernel_version_from_metadata_sidecar(&image)
        };

        // Honoring version plus an unrelated key -> (6, 18, 2); the
        // extra schema field is ignored.
        assert_eq!(
            probe_with(r#"{"version":"6.18.2","arch":"aarch64"}"#),
            Some((6, 18, 2)),
            "sidecar version must parse to (6, 18, 2)",
        );

        // No version key (an acquisition that recorded no version, e.g.
        // a source build whose Makefile was unparsable) -> None.
        assert_eq!(
            probe_with(r#"{"arch":"aarch64"}"#),
            None,
            "absent version key must return None",
        );

        // Explicit null version -> None.
        assert_eq!(probe_with(r#"{"version":null}"#), None);

        // Unparsable version string -> None.
        assert_eq!(probe_with(r#"{"version":"not-a-version"}"#), None);

        // Malformed JSON -> None.
        assert_eq!(probe_with("{not json"), None);

        // Sidecar removed entirely (raw --kernel path) -> None.
        std::fs::remove_file(&sidecar).expect("remove sidecar");
        assert_eq!(
            read_kernel_version_from_metadata_sidecar(&image),
            None,
            "missing sidecar must return None (raw --kernel path)",
        );

        // "/" has no parent, so this exercises the
        // `kernel_path.parent()?` guard: None before any file read, with
        // no dependence on the current directory.
        let root = std::path::Path::new("/");
        assert_eq!(
            read_kernel_version_from_metadata_sidecar(root),
            None,
            "root path (no parent) must return None",
        );
    }
}