supermachine 0.7.70

Run any OCI/Docker image as a hardware-isolated microVM on macOS HVF (Linux KVM and Windows WHP in progress). Single library API, zero flags for the common case, sub-100 ms cold-restore from snapshot.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
//! 64-bit Linux boot protocol (the `arch/x86_64` deliverable).
//!
//! Builds everything a freshly-created vCPU needs to enter a bzImage's 64-bit
//! entry point directly — no real-mode/protected-mode trampoline, no firmware:
//!
//!   * load the protected-mode kernel image at 1 MiB,
//!   * build `struct boot_params` (the "zero page"): copy the bzImage setup
//!     header, set the loader id / cmdline pointer / ramdisk / e820 map,
//!   * identity-map the low 1 GiB (PML4 → PDPT → PD, 2 MiB pages),
//!   * lay down a GDT with flat 64-bit `__BOOT_CS`(0x10) / `__BOOT_DS`(0x18),
//!   * return the register state (CR0/CR3/CR4/EFER, segments, RIP/RSI) that
//!     puts the vCPU in long mode at `kernel_load + 0x200` with RSI pointing at
//!     `boot_params`.
//!
//! This module is pure logic over a guest-RAM byte slice (gpa 0 == `mem[0]`):
//! it has no KVM/hypervisor dependency, so it is unit-testable on its own. The
//! KVM backend calls [`setup_boot`], copies the returned [`BootRegs`] onto the
//! vCPU's SREGS/REGS, and runs. See the validated end-to-end path in
//! `spikes/kvm-boot` (this is that boot setup, productionized).
//!
//! Reference: `Documentation/arch/x86/boot.rst` (the Linux boot protocol).

/// Fixed guest-physical addresses used by the boot builder. Apart from the
/// kernel image and the initramfs, every structure sits below 1 MiB, in the
/// conventional low-memory scratch area.
///
/// The initramfs deliberately has NO constant here. The kernel decompresses
/// and runs in `[KERNEL_LOAD, KERNEL_LOAD + init_size)`; a fixed low address
/// (e.g. 64 MiB) silently falls inside that window and gets overwritten while
/// the kernel decompresses — confirmed guest-RAM corruption. Its address is
/// chosen top-down instead, clear of the kernel footprint — see
/// [`initrd_load_addr`].
pub mod layout {
    /// Page-map level 4 — root of the 4-level page-table walk (page 1).
    pub const PML4: u64 = 1 << 12;
    /// Page-directory pointer table (page 2).
    pub const PDPT: u64 = 2 << 12;
    /// Page directory: 512 × 2 MiB entries, identity-mapping the low 1 GiB (page 3).
    pub const PD: u64 = 3 << 12;
    /// Global descriptor table: null, null, `__BOOT_CS`, `__BOOT_DS` (page 4).
    pub const GDT: u64 = 4 << 12;
    /// `struct boot_params` (the "zero page"); the kernel finds it through RSI (64 KiB).
    pub const BOOT_PARAMS: u64 = 64 << 10;
    /// NUL-terminated ASCII kernel command line (128 KiB).
    pub const CMDLINE: u64 = 128 << 10;
    /// Where the protected-mode kernel is loaded (1 MiB); the bzImage 64-bit
    /// entry point is at this address + 0x200.
    pub const KERNEL_LOAD: u64 = 1 << 20;
}

// `boot_params` field offsets (within the zero page). All are relative to
// `layout::BOOT_PARAMS`.
// Single-byte count of valid entries in the e820 table.
const BP_E820_ENTRIES: u64 = 0x1e8;
const BP_SETUP_HEADER: u64 = 0x1f1; // start of the embedded setup_header
// First e820 entry; entries are 20 bytes each (u64 addr, u64 size, u32 type).
const BP_E820_TABLE: u64 = 0x2d0;

// setup_header field offsets (these are absolute zero-page offsets; the header
// is copied verbatim from the bzImage starting at BP_SETUP_HEADER, so the same
// offsets index both the bzImage and the zero page).
// `usize` because it is only used to index the bzImage byte slice directly.
const HDR_SETUP_SECTS: usize = 0x1f1;
// Loader id byte; the boot builder stores 0xff here.
const HDR_TYPE_OF_LOADER: u64 = 0x210;
// Load flags byte; bit 0 is LOADED_HIGH.
const HDR_LOADFLAGS: u64 = 0x211;
// 32-bit guest-physical address of the initramfs (0 when there is none).
const HDR_RAMDISK_IMAGE: u64 = 0x218;
// 32-bit initramfs size in bytes (0 when there is none).
const HDR_RAMDISK_SIZE: u64 = 0x21c;
// 32-bit guest-physical address of the NUL-terminated command line.
const HDR_CMD_LINE_PTR: u64 = 0x228;
/// `init_size`: the total memory (from the load base) the kernel needs to
/// decompress and run. The kernel owns [KERNEL_LOAD, KERNEL_LOAD + init_size).
/// Read as a little-endian `u32`.
const HDR_INIT_SIZE: u64 = 0x260;

/// The setup_header struct spans [0x1f1, 0x268) in the bzImage / zero page.
/// A bzImage shorter than this is rejected as truncated.
const SETUP_HEADER_END: u64 = 0x268;

// e820 memory-map entry types.
// Type 1 = usable RAM; the only type this module emits.
const E820_RAM: u32 = 1;

// Long-mode control-register / EFER bits applied on entry.
const CR0_PE_PG: u64 = 0x8000_0001; // PE (protected) | PG (paging)
const CR4_PAE: u64 = 0x0000_0020; // PAE (required for long mode)
const EFER_LME_LMA: u64 = 0x0000_0500; // LME (long-mode enable) | LMA (active)

// Flat 64-bit GDT descriptors (raw 8-byte descriptor values written into GDT
// slots 2 and 3).
const GDT_CODE64: u64 = 0x00af_9a00_0000_ffff; // G,L, present, exec/read
const GDT_DATA: u64 = 0x00cf_9200_0000_ffff; // G,DB, present, read/write

/// `__BOOT_CS` — selector of GDT entry 2 (index 2 << 3).
pub const BOOT_CS: u16 = 0x10;
/// `__BOOT_DS` — selector of GDT entry 3 (index 3 << 3).
pub const BOOT_DS: u16 = 0x18;

/// Inputs to [`setup_boot`].
///
/// Every field is a plain value or a shared borrow, so the config is cheap to
/// copy and can be printed with `{:?}` when diagnosing a failed boot.
#[derive(Clone, Copy, Debug)]
pub struct BootConfig<'a> {
    /// Total guest RAM size in bytes (also the length of the `mem` slice).
    pub mem_size: usize,
    /// Kernel command line (without the trailing NUL — added here).
    pub cmdline: &'a str,
    /// The bzImage bytes (real-mode setup sectors + protected-mode kernel).
    pub bzimage: &'a [u8],
    /// Optional initramfs (cpio) the kernel unpacks as its initial rootfs.
    pub initrd: Option<&'a [u8]>,
}

/// A segment register's worth of state — a portable mirror of the fields a
/// hypervisor needs to load (KVM's `kvm_segment`, HVF's segment MSRs). The
/// backend copies these onto its own segment-descriptor type.
///
/// The single-bit descriptor attributes are stored as `u8` values holding
/// 0 or 1.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Segment {
    /// Segment selector loaded into the register (e.g. [`BOOT_CS`], [`BOOT_DS`]).
    pub selector: u16,
    /// Segment base address ([`setup_boot`] uses 0 for its flat segments).
    pub base: u64,
    /// Segment limit ([`setup_boot`] uses `0xffff_ffff`).
    pub limit: u32,
    /// Segment type nibble (e.g. `0b1011` = code, exec/read/accessed).
    pub type_: u8,
    /// Descriptor-privilege level.
    pub dpl: u8,
    /// 1 = code/data segment (vs system).
    pub s: u8,
    /// 1 = segment present.
    pub present: u8,
    /// 1 = 64-bit code segment (code only).
    pub l: u8,
    /// Default operation size (1 = 32-bit; ignored when `l` is set for code).
    pub db: u8,
    /// Granularity (1 = limit in 4 KiB pages).
    pub g: u8,
}

/// The vCPU register state that puts the core in long mode at the kernel entry.
/// Returned by [`setup_boot`]; the hypervisor backend applies it to SREGS/REGS.
#[derive(Clone, Copy, Debug)]
pub struct BootRegs {
    /// CR0 with protected mode (PE) and paging (PG) enabled.
    pub cr0: u64,
    /// Page-table root: the guest-physical address of the PML4.
    pub cr3: u64,
    /// CR4 with PAE enabled.
    pub cr4: u64,
    /// EFER with long mode enabled and active (LME | LMA).
    pub efer: u64,
    /// Guest-physical address of the GDT.
    pub gdt_base: u64,
    /// GDT limit in bytes minus one (four 8-byte descriptors → 31).
    pub gdt_limit: u16,
    /// 64-bit code segment (`__BOOT_CS`).
    pub cs: Segment,
    /// All data segments (DS/ES/SS/FS/GS) share this descriptor.
    pub ds: Segment,
    /// Entry point: the kernel load address + 0x200.
    pub rip: u64,
    /// Guest-physical address of `boot_params` (the zero page).
    pub rsi: u64,
    /// Initial RFLAGS; only the always-set reserved bit 1 is on.
    pub rflags: u64,
}

/// Reasons [`setup_boot`] can refuse to build a boot environment.
///
/// Each variant describes a problem with what the caller supplied (a truncated
/// bzImage, RAM too small for the kernel/initrd, an over-long cmdline) — never
/// a broken internal invariant. The boot builder never panics on bad input.
#[derive(Debug, PartialEq, Eq)]
pub enum BootError {
    /// The bzImage ends before the setup header does (`len` bytes supplied,
    /// `need` required).
    BzImageTooSmall { len: usize, need: usize },
    /// The setup-sector count puts the protected-mode kernel at `pm_offset`,
    /// which is at or past the end of the `len`-byte bzImage.
    SetupSectorsOutOfRange { pm_offset: usize, len: usize },
    /// Guest RAM (`have` bytes) cannot hold the kernel/initrd/boot structures,
    /// which extend to `need`.
    MemTooSmall { need: u64, have: usize },
    /// Two regions (kernel, initrd, cmdline) would overlap; the payload names them.
    RegionOverlap(&'static str),
    /// The command line plus its NUL terminator exceeds the `max` bytes
    /// available below the kernel load address.
    CmdlineTooLong { len: usize, max: u64 },
}

impl std::fmt::Display for BootError {
    /// Renders a one-line, human-readable description of the failure.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Every payload is `Copy`, so match by value to get plain bindings.
        match *self {
            Self::CmdlineTooLong { len, max } => {
                f.write_fmt(format_args!("cmdline too long: {len} bytes, max {max}"))
            }
            Self::RegionOverlap(what) => {
                f.write_fmt(format_args!("boot region overlap: {what}"))
            }
            Self::MemTooSmall { need, have } => f.write_fmt(format_args!(
                "guest RAM too small: need 0x{need:x} bytes, have 0x{have:x}"
            )),
            Self::SetupSectorsOutOfRange { pm_offset, len } => f.write_fmt(format_args!(
                "setup sectors out of range: pm-kernel at +0x{pm_offset:x} past end ({len})"
            )),
            Self::BzImageTooSmall { len, need } => f.write_fmt(format_args!(
                "bzImage too small: {len} bytes, need >= {need}"
            )),
        }
    }
}

impl std::error::Error for BootError {}

/// Bounds-checked little-endian writer over the guest-RAM slice (gpa 0 ==
/// `mem[0]`). Every write validates against the slice length, so a bad gpa
/// returns `MemTooSmall` instead of panicking — including a gpa so large that
/// `gpa + len` would overflow, or one that does not fit the host's `usize`.
struct GuestWriter<'a> {
    mem: &'a mut [u8],
}

impl GuestWriter<'_> {
    /// Copy `bytes` into guest RAM starting at guest-physical address `gpa`.
    ///
    /// # Errors
    /// Returns [`BootError::MemTooSmall`] when any part of `[gpa, gpa + len)`
    /// lies outside the slice; nothing is written in that case. `need` is the
    /// end address the write required, saturated at `u64::MAX`.
    fn write(&mut self, gpa: u64, bytes: &[u8]) -> Result<(), BootError> {
        let have = self.mem.len();
        // Built lazily and with saturating arithmetic: the error value must
        // never itself overflow, and must cost nothing on the success path.
        let too_small = || BootError::MemTooSmall {
            need: gpa.saturating_add(bytes.len() as u64),
            have,
        };
        // A gpa that does not fit in `usize` cannot be inside the slice; do
        // not let an `as` cast wrap it to a small, in-bounds offset.
        let start = usize::try_from(gpa).map_err(|_| too_small())?;
        let end = start
            .checked_add(bytes.len())
            .filter(|&e| e <= have)
            .ok_or_else(too_small)?;
        self.mem[start..end].copy_from_slice(bytes);
        Ok(())
    }
    /// Write a single byte at `gpa`.
    fn write_u8(&mut self, gpa: u64, v: u8) -> Result<(), BootError> {
        self.write(gpa, &[v])
    }
    /// Write `v` at `gpa` as 4 little-endian bytes.
    fn write_u32(&mut self, gpa: u64, v: u32) -> Result<(), BootError> {
        self.write(gpa, &v.to_le_bytes())
    }
    /// Write `v` at `gpa` as 8 little-endian bytes.
    fn write_u64(&mut self, gpa: u64, v: u64) -> Result<(), BootError> {
        self.write(gpa, &v.to_le_bytes())
    }
}

/// Pick where the initramfs goes: the highest page-aligned base at which the
/// whole ramdisk still fits below `mem_size`, provided that base is at or
/// above the end of the kernel's footprint.
///
/// The kernel decompresses and runs in `[KERNEL_LOAD, KERNEL_LOAD +
/// kernel_footprint)` (where `kernel_footprint` is the larger of the loaded
/// image and the setup header's `init_size`). A ramdisk inside that window is
/// silently overwritten during decompression — a fixed 64 MiB address
/// corrupted any initramfs that reached ~64 MiB. Top-down placement keeps the
/// two apart regardless of kernel or ramdisk size.
///
/// # Errors
/// Returns [`BootError::MemTooSmall`] when the ramdisk is larger than
/// `mem_size` or its page-aligned base would fall below the (page-rounded)
/// end of the kernel footprint.
fn initrd_load_addr(
    mem_size: u64,
    kernel_footprint: u64,
    initrd_len: u64,
) -> Result<u64, BootError> {
    const PAGE: u64 = 0x1000;
    // First page boundary at or past the end of the kernel's window.
    let lowest_ok = (layout::KERNEL_LOAD + kernel_footprint).next_multiple_of(PAGE);
    // Highest base that leaves room for the ramdisk, rounded down to a page.
    let candidate = mem_size
        .checked_sub(initrd_len)
        .map(|top| top - top % PAGE);
    if let Some(base) = candidate.filter(|&base| base >= lowest_ok) {
        return Ok(base);
    }
    // Need at least the kernel footprint plus the ramdisk to coexist.
    Err(BootError::MemTooSmall {
        need: lowest_ok + initrd_len,
        have: mem_size as usize,
    })
}

/// Build the boot environment in `mem` (the guest RAM, `mem.len() ==
/// cfg.mem_size`) and return the register state to enter the kernel.
///
/// On success, `mem` contains the page tables, GDT, `boot_params`, command
/// line, kernel image, and (if any) initramfs; applying the returned
/// [`BootRegs`] to a vCPU and running it boots Linux.
///
/// The initramfs is placed top-down but always below 4 GiB: its address and
/// size are handed to the kernel through the 32-bit `ramdisk_image` /
/// `ramdisk_size` header fields, so a higher placement would be truncated and
/// the kernel would look for the ramdisk at the wrong address.
///
/// # Errors
/// * [`BootError::BzImageTooSmall`] — the bzImage ends before its setup header.
/// * [`BootError::SetupSectorsOutOfRange`] — the setup-sector count leaves no
///   protected-mode kernel bytes.
/// * [`BootError::MemTooSmall`] — guest RAM cannot hold the kernel image, or
///   the initramfs does not fit above the kernel footprint (within the
///   below-4 GiB window), or `mem` is shorter than a required write.
/// * [`BootError::CmdlineTooLong`] — the command line plus NUL does not fit
///   between `CMDLINE` and `KERNEL_LOAD`.
pub fn setup_boot(mem: &mut [u8], cfg: &BootConfig) -> Result<BootRegs, BootError> {
    use layout::*;

    /// Exclusive upper bound for initramfs placement. Keeping the ramdisk's
    /// end at or below `u32::MAX` guarantees both its page-aligned base and
    /// its length fit the 32-bit header fields they are written to.
    const RAMDISK_TOP_LIMIT: u64 = u32::MAX as u64;

    debug_assert_eq!(
        mem.len(),
        cfg.mem_size,
        "mem slice must be exactly mem_size"
    );

    let bz = cfg.bzimage;
    // The setup header must be fully present to copy it into the zero page.
    if (bz.len() as u64) < SETUP_HEADER_END {
        return Err(BootError::BzImageTooSmall {
            len: bz.len(),
            need: SETUP_HEADER_END as usize,
        });
    }

    // Protected-mode kernel begins after the setup sectors: a setup_sects of 0
    // is historically interpreted as 4.
    let setup_sects = match bz[HDR_SETUP_SECTS] {
        0 => 4u8,
        n => n,
    };
    let pm_offset = (usize::from(setup_sects) + 1) * 512;
    if pm_offset >= bz.len() {
        return Err(BootError::SetupSectorsOutOfRange {
            pm_offset,
            len: bz.len(),
        });
    }
    let pm_kernel = &bz[pm_offset..];

    // Validate the guest-RAM footprint up front so every later write is in
    // bounds. The largest required address is the kernel image end or, if an
    // initramfs is present, the initramfs end.
    let kernel_end = KERNEL_LOAD + pm_kernel.len() as u64;
    // The kernel decompresses/runs in [KERNEL_LOAD, KERNEL_LOAD + footprint);
    // `init_size` is its full runtime footprint (>= the loaded image). The
    // header-length check above guarantees these four bytes exist.
    let init_size_off = HDR_INIT_SIZE as usize;
    let init_size = u64::from(u32::from_le_bytes([
        bz[init_size_off],
        bz[init_size_off + 1],
        bz[init_size_off + 2],
        bz[init_size_off + 3],
    ]));
    let kernel_footprint = (pm_kernel.len() as u64).max(init_size);
    let mut high_water = kernel_end;
    // Place the initramfs top-down, clear of the kernel footprint, so the
    // kernel's decompression can never clobber it (see `initrd_load_addr`).
    // The placement window is capped below 4 GiB; on failure the reported
    // `have` is therefore the size of that window, not necessarily all of RAM.
    let initrd = match cfg.initrd {
        Some(rd) => {
            let window_top = (cfg.mem_size as u64).min(RAMDISK_TOP_LIMIT);
            let addr = initrd_load_addr(window_top, kernel_footprint, rd.len() as u64)?;
            high_water = high_water.max(addr + rd.len() as u64);
            Some((addr, rd))
        }
        None => None,
    };
    if high_water > cfg.mem_size as u64 {
        return Err(BootError::MemTooSmall {
            need: high_water,
            have: cfg.mem_size,
        });
    }

    // The cmdline (plus its NUL) must fit between CMDLINE and the kernel load.
    let cmdline_max = KERNEL_LOAD - CMDLINE;
    if cfg.cmdline.len() as u64 + 1 > cmdline_max {
        return Err(BootError::CmdlineTooLong {
            len: cfg.cmdline.len(),
            max: cmdline_max,
        });
    }

    let mut w = GuestWriter { mem };

    // 1. Identity-map the low 1 GiB (PML4 -> PDPT -> PD, 2 MiB pages).
    w.write_u64(PML4, PDPT | 0x3)?; // present | rw
    w.write_u64(PDPT, PD | 0x3)?;
    for i in 0..512u64 {
        w.write_u64(PD + i * 8, (i * 0x20_0000) | 0x83)?; // present | rw | ps(2MiB)
    }

    // 2. GDT: [0]=null, [1]=null, [2]=__BOOT_CS, [3]=__BOOT_DS.
    w.write_u64(GDT, 0)?;
    w.write_u64(GDT + 8, 0)?;
    w.write_u64(GDT + 16, GDT_CODE64)?;
    w.write_u64(GDT + 24, GDT_DATA)?;

    // 3. boot_params (zero page): copy the setup header verbatim, then patch
    //    the loader/cmdline/ramdisk fields.
    w.write(
        BOOT_PARAMS + BP_SETUP_HEADER,
        &bz[BP_SETUP_HEADER as usize..SETUP_HEADER_END as usize],
    )?;
    w.write_u8(BOOT_PARAMS + HDR_TYPE_OF_LOADER, 0xff)?; // "undefined" bootloader
    // Force LOADED_HIGH (bit 0) on; the remaining loadflags bits come from the
    // image unchanged.
    let loadflags = bz[HDR_LOADFLAGS as usize] | 0x01;
    w.write_u8(BOOT_PARAMS + HDR_LOADFLAGS, loadflags)?;
    // Both casts are lossless: placement succeeded inside a window capped at
    // `RAMDISK_TOP_LIMIT`, so `addr + rd.len() <= u32::MAX`.
    let (ramdisk_image, ramdisk_size) = match initrd {
        Some((addr, rd)) => (addr as u32, rd.len() as u32),
        None => (0, 0),
    };
    w.write_u32(BOOT_PARAMS + HDR_RAMDISK_IMAGE, ramdisk_image)?;
    w.write_u32(BOOT_PARAMS + HDR_RAMDISK_SIZE, ramdisk_size)?;
    w.write_u32(BOOT_PARAMS + HDR_CMD_LINE_PTR, CMDLINE as u32)?;

    // 4. Command line (NUL-terminated).
    w.write(CMDLINE, cfg.cmdline.as_bytes())?;
    w.write_u8(CMDLINE + cfg.cmdline.len() as u64, 0)?;

    // 5. e820 map: usable low memory (< 640 KiB), then 1 MiB..end of RAM.
    //    Each entry is 20 bytes: u64 address, u64 size, u32 type.
    let mut e820 = |slot: u64, addr: u64, size: u64, typ: u32| -> Result<(), BootError> {
        let base = BOOT_PARAMS + BP_E820_TABLE + slot * 20;
        w.write_u64(base, addr)?;
        w.write_u64(base + 8, size)?;
        w.write_u32(base + 16, typ)?;
        Ok(())
    };
    e820(0, 0x0, 0x9_fc00, E820_RAM)?;
    // Cannot underflow: the high-water check above already rejected any
    // mem_size that does not reach past KERNEL_LOAD.
    e820(1, KERNEL_LOAD, cfg.mem_size as u64 - KERNEL_LOAD, E820_RAM)?;
    w.write_u8(BOOT_PARAMS + BP_E820_ENTRIES, 2)?;

    // 6. Load the protected-mode kernel and the initramfs.
    w.write(KERNEL_LOAD, pm_kernel)?;
    if let Some((addr, rd)) = initrd {
        w.write(addr, rd)?;
    }

    // 7. Register state: long mode, flat segments, entry at load+0x200.
    let cs = Segment {
        selector: BOOT_CS,
        base: 0,
        limit: 0xffff_ffff,
        type_: 0b1011, // code: exec/read/accessed
        dpl: 0,
        s: 1,
        present: 1,
        l: 1,
        db: 0,
        g: 1,
    };
    let ds = Segment {
        selector: BOOT_DS,
        base: 0,
        limit: 0xffff_ffff,
        type_: 0b0011, // data: read/write/accessed
        dpl: 0,
        s: 1,
        present: 1,
        l: 0,
        db: 1,
        g: 1,
    };
    Ok(BootRegs {
        cr0: CR0_PE_PG,
        cr3: PML4,
        cr4: CR4_PAE,
        efer: EFER_LME_LMA,
        gdt_base: GDT,
        gdt_limit: 4 * 8 - 1,
        cs,
        ds,
        rip: KERNEL_LOAD + 0x200,
        rsi: BOOT_PARAMS,
        rflags: 0x2, // reserved bit 1 always set
    })
}

// Unit tests for the boot builder. They run against plain `Vec<u8>` guest RAM
// and a synthetic bzImage, so no hypervisor is needed.
#[cfg(test)]
mod tests {
    use super::layout::*;
    use super::*;

    /// A synthetic bzImage just real enough for the boot builder: a valid
    /// setup-header region with `setup_sects`, then `pm_len` bytes of "kernel".
    fn fake_bzimage(setup_sects: u8, pm_len: usize) -> Vec<u8> {
        let pm_offset = (setup_sects as usize + 1) * 512;
        let mut bz = vec![0u8; pm_offset + pm_len];
        bz[HDR_SETUP_SECTS] = setup_sects;
        // loadflags: set an unrelated high bit (0x80) and leave LOADED_HIGH
        // (bit 0) clear, so we can prove setup_boot ORs in 0x01 and preserves
        // the rest.
        bz[HDR_LOADFLAGS as usize] = 0x80;
        // Mark the pm-kernel region so we can assert it was copied to 1 MiB.
        for (i, b) in bz[pm_offset..].iter_mut().enumerate() {
            *b = (i as u8) ^ 0xa5;
        }
        bz
    }

    /// Read a little-endian `u32` from guest RAM at `gpa`.
    fn rd_u32(mem: &[u8], gpa: u64) -> u32 {
        let s = gpa as usize;
        u32::from_le_bytes(mem[s..s + 4].try_into().unwrap())
    }
    /// Read a little-endian `u64` from guest RAM at `gpa`.
    fn rd_u64(mem: &[u8], gpa: u64) -> u64 {
        let s = gpa as usize;
        u64::from_le_bytes(mem[s..s + 8].try_into().unwrap())
    }

    /// The returned register state describes long mode entered at the kernel's
    /// 64-bit entry point with RSI pointing at the zero page.
    #[test]
    fn builds_long_mode_entry_state() {
        let bz = fake_bzimage(4, 4096);
        let mem_size = 512 * 1024 * 1024;
        let mut mem = vec![0u8; mem_size];
        let cfg = BootConfig {
            mem_size,
            cmdline: "console=ttyS0 root=/dev/vda",
            bzimage: &bz,
            initrd: None,
        };
        let regs = setup_boot(&mut mem, &cfg).expect("setup_boot");

        // Long-mode control regs + entry point.
        assert_eq!(regs.cr0, 0x8000_0001);
        assert_eq!(regs.cr4, 0x20);
        assert_eq!(regs.efer, 0x500);
        assert_eq!(regs.cr3, PML4);
        assert_eq!(regs.rip, KERNEL_LOAD + 0x200);
        assert_eq!(regs.rsi, BOOT_PARAMS);
        assert_eq!(regs.gdt_base, GDT);
        assert_eq!(regs.gdt_limit, 31);
        assert_eq!(regs.cs.selector, BOOT_CS);
        assert_eq!(regs.cs.l, 1, "CS must be a 64-bit code segment");
        assert_eq!(regs.ds.selector, BOOT_DS);
        assert_eq!(regs.ds.db, 1);
    }

    /// Every low-memory structure (page tables, GDT, zero page, e820 map,
    /// command line) is written with the expected contents.
    #[test]
    fn writes_page_tables_gdt_and_zero_page() {
        let bz = fake_bzimage(4, 8192);
        let mem_size = 256 * 1024 * 1024;
        let mut mem = vec![0u8; mem_size];
        let cfg = BootConfig {
            mem_size,
            cmdline: "console=ttyS0",
            bzimage: &bz,
            initrd: None,
        };
        setup_boot(&mut mem, &cfg).unwrap();

        // Page-table roots.
        assert_eq!(rd_u64(&mem, PML4), PDPT | 0x3);
        assert_eq!(rd_u64(&mem, PDPT), PD | 0x3);
        // First and last PD entries (identity 2 MiB pages, present|rw|ps).
        assert_eq!(rd_u64(&mem, PD), 0x83);
        assert_eq!(rd_u64(&mem, PD + 511 * 8), (511 * 0x20_0000) | 0x83);

        // GDT flat code/data descriptors.
        assert_eq!(rd_u64(&mem, GDT + 16), 0x00af_9a00_0000_ffff);
        assert_eq!(rd_u64(&mem, GDT + 24), 0x00cf_9200_0000_ffff);

        // Zero-page loader fields.
        assert_eq!(mem[(BOOT_PARAMS + 0x210) as usize], 0xff); // type_of_loader
        assert_eq!(mem[(BOOT_PARAMS + 0x211) as usize], 0x81); // loadflags |= 0x01
        assert_eq!(rd_u32(&mem, BOOT_PARAMS + 0x228), CMDLINE as u32); // cmd_line_ptr
        assert_eq!(mem[(BOOT_PARAMS + 0x1e8) as usize], 2); // e820 entries

        // e820[0] = usable low, e820[1] = usable from 1 MiB.
        assert_eq!(rd_u64(&mem, BOOT_PARAMS + 0x2d0), 0);
        assert_eq!(rd_u64(&mem, BOOT_PARAMS + 0x2d0 + 8), 0x9_fc00);
        assert_eq!(rd_u32(&mem, BOOT_PARAMS + 0x2d0 + 16), 1);
        assert_eq!(rd_u64(&mem, BOOT_PARAMS + 0x2d0 + 20), KERNEL_LOAD);
        assert_eq!(
            rd_u64(&mem, BOOT_PARAMS + 0x2d0 + 28),
            mem_size as u64 - KERNEL_LOAD
        );

        // No initramfs → ramdisk fields zeroed.
        assert_eq!(rd_u32(&mem, BOOT_PARAMS + 0x218), 0);
        assert_eq!(rd_u32(&mem, BOOT_PARAMS + 0x21c), 0);

        // Command line copied + NUL-terminated.
        let c = b"console=ttyS0";
        assert_eq!(&mem[CMDLINE as usize..CMDLINE as usize + c.len()], c);
        assert_eq!(mem[CMDLINE as usize + c.len()], 0);
    }

    /// The bytes after the setup sectors are copied verbatim to `KERNEL_LOAD`.
    #[test]
    fn copies_pm_kernel_to_one_mib() {
        let bz = fake_bzimage(2, 1024);
        let pm_offset = (2 + 1) * 512;
        let mem_size = 64 * 1024 * 1024;
        let mut mem = vec![0u8; mem_size];
        let cfg = BootConfig {
            mem_size,
            cmdline: "x",
            bzimage: &bz,
            initrd: None,
        };
        setup_boot(&mut mem, &cfg).unwrap();
        // The pm-kernel bytes (marked i^0xa5) land at KERNEL_LOAD.
        assert_eq!(
            &mem[KERNEL_LOAD as usize..KERNEL_LOAD as usize + 1024],
            &bz[pm_offset..pm_offset + 1024]
        );
    }

    /// With an initramfs supplied, its bytes are copied to the address that
    /// the zero page's ramdisk fields advertise.
    #[test]
    fn places_initramfs_and_sets_ramdisk_fields() {
        let bz = fake_bzimage(4, 4096);
        let initrd = vec![0x5au8; 4096];
        let mem_size = 128 * 1024 * 1024;
        let mut mem = vec![0u8; mem_size];
        let cfg = BootConfig {
            mem_size,
            cmdline: "console=ttyS0",
            bzimage: &bz,
            initrd: Some(&initrd),
        };
        setup_boot(&mut mem, &cfg).unwrap();
        // The ramdisk address is computed (top-down), not fixed. Read it back
        // from boot_params and verify the bytes landed exactly there.
        let addr = rd_u32(&mem, BOOT_PARAMS + 0x218) as usize;
        assert_eq!(rd_u32(&mem, BOOT_PARAMS + 0x21c), initrd.len() as u32);
        assert_eq!(&mem[addr..addr + initrd.len()], &initrd[..]);
        // It must sit above the kernel image and be page-aligned — clear of the
        // decompression zone that previously corrupted it.
        assert!(addr as u64 > KERNEL_LOAD);
        assert_eq!(addr & 0xfff, 0);
        // And it must fit inside RAM.
        assert!(addr + initrd.len() <= mem_size);
    }

    /// The initramfs must clear `init_size`, not merely the loaded image.
    #[test]
    fn initrd_placed_clear_of_kernel_footprint() {
        // A kernel whose decompression footprint (init_size) is much larger
        // than the loaded image must still not collide with the ramdisk: the
        // initramfs goes ABOVE init_size, not just above the loaded bytes.
        let mut bz = fake_bzimage(4, 4096);
        let init_size: u32 = 200 * 1024 * 1024; // 200 MiB footprint
        bz[HDR_INIT_SIZE as usize..HDR_INIT_SIZE as usize + 4]
            .copy_from_slice(&init_size.to_le_bytes());
        let initrd = vec![0x5au8; 4096];
        let mem_size = 512 * 1024 * 1024;
        let mut mem = vec![0u8; mem_size];
        let cfg = BootConfig {
            mem_size,
            cmdline: "console=ttyS0",
            bzimage: &bz,
            initrd: Some(&initrd),
        };
        setup_boot(&mut mem, &cfg).unwrap();
        let addr = rd_u32(&mem, BOOT_PARAMS + 0x218) as u64;
        assert!(
            addr >= KERNEL_LOAD + init_size as u64,
            "initrd at {addr:#x} must clear the {init_size:#x}-byte kernel footprint"
        );
    }

    /// A bzImage shorter than the setup header is rejected, not indexed past.
    #[test]
    fn rejects_truncated_bzimage() {
        let bz = vec![0u8; 0x100]; // shorter than the setup header
        let mut mem = vec![0u8; 64 * 1024 * 1024];
        let cfg = BootConfig {
            mem_size: mem.len(),
            cmdline: "x",
            bzimage: &bz,
            initrd: None,
        };
        assert!(matches!(
            setup_boot(&mut mem, &cfg),
            Err(BootError::BzImageTooSmall { .. })
        ));
    }

    /// A setup-sector count that points beyond the file is rejected.
    #[test]
    fn rejects_setup_sects_past_end() {
        // Header present, but setup_sects claims more sectors than the file has.
        let mut bz = vec![0u8; 0x268];
        bz[HDR_SETUP_SECTS] = 200; // (200+1)*512 = 102912 >> file len
        let mut mem = vec![0u8; 64 * 1024 * 1024];
        let cfg = BootConfig {
            mem_size: mem.len(),
            cmdline: "x",
            bzimage: &bz,
            initrd: None,
        };
        assert!(matches!(
            setup_boot(&mut mem, &cfg),
            Err(BootError::SetupSectorsOutOfRange { .. })
        ));
    }

    /// A kernel image that does not fit in guest RAM yields `MemTooSmall`.
    #[test]
    fn rejects_ram_too_small_for_kernel() {
        // pm-kernel larger than the tiny guest RAM.
        let bz = fake_bzimage(4, 8 * 1024 * 1024);
        let mem_size = 4 * 1024 * 1024; // smaller than KERNEL_LOAD + pm
        let mut mem = vec![0u8; mem_size];
        let cfg = BootConfig {
            mem_size,
            cmdline: "x",
            bzimage: &bz,
            initrd: None,
        };
        assert!(matches!(
            setup_boot(&mut mem, &cfg),
            Err(BootError::MemTooSmall { .. })
        ));
    }

    /// An initramfs that cannot fit above the kernel yields `MemTooSmall`.
    #[test]
    fn rejects_initramfs_past_end_of_ram() {
        let bz = fake_bzimage(4, 4096);
        // An initramfs as large as RAM itself can't be placed above the kernel.
        let mem_size = 8 * 1024 * 1024;
        let initrd = vec![0u8; mem_size];
        let mut mem = vec![0u8; mem_size];
        let cfg = BootConfig {
            mem_size,
            cmdline: "x",
            bzimage: &bz,
            initrd: Some(&initrd),
        };
        assert!(matches!(
            setup_boot(&mut mem, &cfg),
            Err(BootError::MemTooSmall { .. })
        ));
    }

    /// A command line that leaves no room for its NUL terminator below
    /// `KERNEL_LOAD` is rejected.
    #[test]
    fn rejects_overlong_cmdline() {
        let bz = fake_bzimage(4, 4096);
        let mem_size = 64 * 1024 * 1024;
        let mut mem = vec![0u8; mem_size];
        // Exactly the window size: the text fits, the trailing NUL does not.
        let huge = "a".repeat((KERNEL_LOAD - CMDLINE) as usize);
        let cfg = BootConfig {
            mem_size,
            cmdline: &huge,
            bzimage: &bz,
            initrd: None,
        };
        assert!(matches!(
            setup_boot(&mut mem, &cfg),
            Err(BootError::CmdlineTooLong { .. })
        ));
    }

    use proptest::prelude::*;

    proptest! {
        #![proptest_config(ProptestConfig::with_cases(4096))]

        /// Property-test (4096 randomized cases) the invariant whose violation
        /// silently corrupted guest RAM: a placed initramfs must NEVER land
        /// inside the kernel's decompression footprint
        /// `[KERNEL_LOAD, KERNEL_LOAD + footprint)`.
        /// Whenever placement succeeds it must clear that window, be
        /// page-aligned, and fit in RAM; it may only be rejected when the
        /// ramdisk genuinely cannot fit above the footprint.
        #[test]
        fn initrd_never_overlaps_kernel_footprint(
            mem_size in 0u64..=64 * 1024 * 1024 * 1024,
            footprint in 0u64..=8 * 1024 * 1024 * 1024,
            initrd_len in 0u64..=8 * 1024 * 1024 * 1024,
        ) {
            // Same page-rounded floor the implementation computes.
            let floor = (KERNEL_LOAD + footprint).next_multiple_of(0x1000);
            match initrd_load_addr(mem_size, footprint, initrd_len) {
                Ok(addr) => {
                    prop_assert_eq!(addr & 0xfff, 0, "page-aligned");
                    prop_assert!(addr >= floor, "clears the kernel footprint");
                    prop_assert!(addr + initrd_len <= mem_size, "fits in RAM");
                }
                Err(_) => {
                    // Rejection is allowed only when there is genuinely no room
                    // for the ramdisk above the kernel footprint.
                    prop_assert!(floor + initrd_len > mem_size);
                }
            }
        }
    }
}