supermachine 0.7.70

Run any OCI/Docker image as a hardware-isolated microVM on macOS HVF (Linux KVM and Windows WHP in progress). Single library API, zero flags for the common case, sub-100 ms cold-restore from snapshot.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
//! Portable snapshot-container substrate shared by both backends' snapshot
//! pipelines (`vmm::snapshot` on HVF, `kvm::run` on KVM).
//!
//! The two pipelines emit different *container* layouts — their device sections
//! mirror two different device models (HVF: GICv3 blob + virtio-mmio list; KVM:
//! Com1 + PIT + irqchips + clock + disk/volume/vsock/virtiofs) — but they share
//! the security-critical framing primitives:
//!
//!   * little-endian field readers that fail (not panic) on a short slice,
//!   * the `Vec::with_capacity` count cap that stops a hostile header from
//!     OOM-aborting the host before the read loop's bounds checks run,
//!   * the "RAM region lies within the file" check that prevents a corrupt
//!     `ram_offset`/`mem_size` from `mmap`-ing past EOF (→ guest SIGBUS on first
//!     touch = host crash),
//!   * the page-aligned `MAP_PRIVATE` copy-on-write RAM restore.
//!
//! A bug in any of these is a host SIGBUS/OOB, so they live in ONE audited place
//! rather than being re-implemented per backend. This module is pure framing: it
//! names no backend type and compiles on every target.

/// Decode a little-endian `u16` from the leading 2 bytes of `bytes`.
///
/// Returns `None` when fewer than 2 bytes are available, leaving it to the
/// caller to translate that into its truncated-file error. Bytes past the
/// second are ignored, and a short slice never panics.
#[inline]
pub fn le_u16(bytes: &[u8]) -> Option<u16> {
    bytes
        .get(..2)
        .and_then(|head| <[u8; 2]>::try_from(head).ok())
        .map(u16::from_le_bytes)
}

/// Decode a little-endian `u32` from the leading 4 bytes of `bytes`.
///
/// Returns `None` (never panics) when the slice holds fewer than 4 bytes;
/// any bytes after the fourth are ignored.
#[inline]
pub fn le_u32(bytes: &[u8]) -> Option<u32> {
    bytes
        .get(..4)
        .and_then(|head| <[u8; 4]>::try_from(head).ok())
        .map(u32::from_le_bytes)
}

/// Decode a little-endian `u64` from the leading 8 bytes of `bytes`.
///
/// Returns `None` (never panics) when the slice holds fewer than 8 bytes;
/// any bytes after the eighth are ignored.
#[inline]
pub fn le_u64(bytes: &[u8]) -> Option<u64> {
    bytes
        .get(..8)
        .and_then(|head| <[u8; 8]>::try_from(head).ok())
        .map(u64::from_le_bytes)
}

/// Decode a little-endian `u128` from the leading 16 bytes of `bytes`.
///
/// Returns `None` (never panics) when the slice holds fewer than 16 bytes;
/// any bytes after the sixteenth are ignored.
#[inline]
pub fn le_u128(bytes: &[u8]) -> Option<u128> {
    bytes
        .get(..16)
        .and_then(|head| <[u8; 16]>::try_from(head).ok())
        .map(u128::from_le_bytes)
}

/// Clamp a header-supplied record `count` to what `region` bytes could
/// actually contain, for use as a `Vec::with_capacity` hint.
///
/// Every record occupies at least `min_elem` bytes on disk, so at most
/// `region / min_elem` of them fit; a larger claimed count can only come from
/// a corrupt or hostile header. Trusting it would let a claimed count of
/// `0xFFFF_FFFF` request a multi-GiB allocation and OOM-abort the host before
/// the read loop's per-record bounds checks get a chance to run. The loop
/// itself still fails cleanly via `read_exact`; this only keeps the up-front
/// allocation proportional to what the file can justify.
///
/// A `min_elem` of 0 is treated as 1 so the division is always defined.
#[inline]
pub fn cap_count(count: usize, min_elem: usize, region: usize) -> usize {
    let record_bytes = if min_elem == 0 { 1 } else { min_elem };
    let ceiling = region / record_bytes;
    if count > ceiling {
        ceiling
    } else {
        count
    }
}

/// Report whether the RAM section `[ram_offset, ram_offset + memory_bytes)`
/// fits entirely inside a file that is `file_len` bytes long.
///
/// The check never overflows: the space left after `ram_offset` is obtained
/// with a checked subtraction, and an offset beyond EOF simply yields `false`.
/// A `false` result means the snapshot's `ram_offset`/`mem_size` are corrupt
/// or hostile and the region must NOT be `mmap`'d — doing so SIGBUSes the
/// host on first touch of an unbacked page.
#[inline]
pub fn ram_region_within(file_len: u64, ram_offset: u64, memory_bytes: u64) -> bool {
    match file_len.checked_sub(ram_offset) {
        Some(room_after_offset) => memory_bytes <= room_after_offset,
        None => false,
    }
}

/// `mmap` a snapshot file's RAM section copy-on-write: `MAP_PRIVATE` so guest
/// writes go to private anon pages and the snapshot file on disk is never
/// written, and restore is O(pages-touched) rather than a full copy (pages fault
/// in lazily from the page cache). `extra_flags` is OR'd into the mmap flags
/// (callers pass e.g. `libc::MAP_NORESERVE`); pass `0` for none.
///
/// `ram_offset` must be page-aligned (snapshot writers pad to a page boundary
/// before the RAM section) — `mmap` requires the file offset to be a multiple of
/// the page size. Madvise hints (hugepage / mergeable / sequential) are the
/// caller's concern, layered on the returned pointer.
///
/// Returns the mapping base; the caller owns it and must `munmap(ptr,
/// memory_bytes)` when done.
///
/// # Errors
///
/// * `InvalidData` if the RAM region does not lie within the file
///   ([`ram_region_within`]), so a corrupt offset/size is rejected instead of
///   being mapped past EOF.
/// * `InvalidData` if `ram_offset` cannot be represented as the platform's
///   `off_t` (it would otherwise be silently truncated or turned negative by
///   a plain cast and map the wrong part of the file).
/// * The `metadata()` error, or the OS error reported by `mmap` itself (for
///   example an unaligned offset).
pub fn cow_map_ram(
    file: &std::fs::File,
    ram_offset: u64,
    memory_bytes: usize,
    extra_flags: libc::c_int,
) -> std::io::Result<*mut u8> {
    use std::os::fd::AsRawFd;

    let file_len = file.metadata()?.len();
    if !ram_region_within(file_len, ram_offset, memory_bytes as u64) {
        return Err(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            format!(
                "snapshot RAM region [{ram_offset}, {ram_offset}+{memory_bytes}) \
                 exceeds file length {file_len}"
            ),
        ));
    }
    // The offset comes from an untrusted header; convert it checked rather
    // than with `as`, which would wrap on targets whose `off_t` is narrower
    // than (or, being signed, cannot hold every value of) `u64`.
    let offset = libc::off_t::try_from(ram_offset).map_err(|_| {
        std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            format!("snapshot RAM offset {ram_offset} does not fit the platform file offset type"),
        )
    })?;
    // SAFETY: the kernel allocates the pages; the region is proven in-bounds
    // above and MAP_PRIVATE means guest writes never reach the file.
    let ptr = unsafe {
        libc::mmap(
            std::ptr::null_mut(),
            memory_bytes,
            libc::PROT_READ | libc::PROT_WRITE,
            libc::MAP_PRIVATE | extra_flags,
            file.as_raw_fd(),
            offset,
        )
    };
    if ptr == libc::MAP_FAILED {
        return Err(std::io::Error::last_os_error());
    }
    Ok(ptr as *mut u8)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Each reader returns `None` one byte short of its width and a value at
    /// exactly its width.
    #[test]
    fn le_readers_reject_short_slices() {
        assert_eq!(le_u16(&[1]), None);
        assert_eq!(le_u16(&[1, 2]), Some(0x0201));

        assert_eq!(le_u32(&[1, 0, 0]), None);
        assert_eq!(le_u32(&[1, 0, 0, 0]), Some(1));

        assert_eq!(le_u64(&[0; 7]), None);
        assert_eq!(le_u64(&[0xff, 0, 0, 0, 0, 0, 0, 0]), Some(0xff));

        assert_eq!(le_u128(&[7u8; 15]), None);
        assert!(le_u128(&[7u8; 16]).is_some());
    }

    /// Bytes beyond the reader's width do not influence the decoded value.
    #[test]
    fn le_readers_read_only_the_leading_bytes() {
        let with_trailing = [1, 0, 0, 0, 0xaa, 0xbb];
        assert_eq!(le_u32(&with_trailing), Some(1));
    }

    #[test]
    fn cap_count_caps_to_what_the_region_can_hold() {
        // (count, min_elem, region, expected)
        let cases: [(usize, usize, usize, usize); 4] = [
            // A hostile huge count is clamped to region/min_elem.
            (0xFFFF_FFFF, 12, 1200, 100),
            // An honest small count passes through.
            (5, 12, 1200, 5),
            // min_elem of 0 must not divide-by-zero (treated as 1).
            (7, 0, 1200, 7),
            // Zero region caps everything to zero.
            (7, 12, 0, 0),
        ];
        for (count, min_elem, region, expected) in cases {
            assert_eq!(cap_count(count, min_elem, region), expected);
        }
    }

    #[test]
    fn ram_region_bounds_are_overflow_safe() {
        // (file_len, ram_offset, memory_bytes, expected)
        let cases: [(u64, u64, u64, bool); 6] = [
            (100, 10, 90, true),
            // Empty region at EOF.
            (100, 100, 0, true),
            // 1 byte past EOF.
            (100, 10, 91, false),
            // Offset past EOF.
            (100, 101, 0, false),
            // No overflow even with extreme values.
            (100, u64::MAX, 1, false),
            (100, 0, u64::MAX, false),
        ];
        for (file_len, ram_offset, memory_bytes, expected) in cases {
            assert_eq!(
                ram_region_within(file_len, ram_offset, memory_bytes),
                expected
            );
        }
    }

    #[test]
    fn cow_map_ram_rejects_region_past_eof() {
        const PAGE: usize = 4096;

        // One page of zeros in a per-process temp file.
        let path = std::env::temp_dir().join(format!("smframe_cow_{}.bin", std::process::id()));
        std::fs::write(&path, [0u8; PAGE]).unwrap();
        let file = std::fs::File::open(&path).unwrap();

        // Claim a RAM region one page past the 4096-byte file.
        let past_eof = cow_map_ram(&file, PAGE as u64, PAGE, 0);
        assert_eq!(
            past_eof.unwrap_err().kind(),
            std::io::ErrorKind::InvalidData
        );

        // A valid in-bounds region maps.
        let base = cow_map_ram(&file, 0, PAGE, 0).unwrap();
        assert!(!base.is_null());
        // SAFETY: we just mapped exactly 4096 bytes at `base`.
        unsafe {
            assert_eq!(*base, 0);
            libc::munmap(base as *mut libc::c_void, PAGE);
        }
        let _ = std::fs::remove_file(&path);
    }
}

// ── Unified device record (7c holistic snapshot container) ──────────────────

use crate::devices::virtio::mmio::MmioSnapshot;

/// The kind of a virtio device in the unified snapshot device section
/// (`SMSNAP\x0a`, 7c holistic container). Ordering is significant on restore —
/// it implies each device's MMIO base / IRQ (see `DeviceAttachPlan`).
///
/// The explicit `u8` discriminants are the on-disk encoding:
/// `DeviceRecord::write_to` emits `kind as u8` as the first byte of a record
/// and `DeviceKind::from_u8` decodes it, so renumbering a variant changes the
/// wire format.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[repr(u8)]
pub enum DeviceKind {
    /// Root virtio-blk (the rootfs disk).
    Blk = 0,
    /// virtio-vsock (host↔guest sockets); rebuilt fresh on restore.
    Vsock = 1,
    /// A data volume (vdb, vdc, …).
    Volume = 2,
    /// A virtio-fs mount.
    VirtioFs = 3,
}

impl DeviceKind {
    /// Translate a virtio device-type id (see `devices::virtio::VIRTIO_ID_*`)
    /// into a snapshot device kind.
    ///
    /// Only vsock (19) and virtio-fs (26) are recognised specially; every
    /// other id — VIRTIO_ID_BLOCK and any future block-like device — becomes
    /// `Blk`. This serves the HVF capture path, whose restore is positional
    /// (it does not consult the kind), so a data volume (also virtio-blk)
    /// being labelled `Blk` is harmless; KVM tags `Volume` explicitly.
    pub fn from_virtio_id(id: u32) -> Self {
        const VSOCK_ID: u32 = 19;
        const VIRTIO_FS_ID: u32 = 26;

        if id == VSOCK_ID {
            Self::Vsock
        } else if id == VIRTIO_FS_ID {
            Self::VirtioFs
        } else {
            Self::Blk
        }
    }

    /// Decode the on-disk kind byte; `None` for a value that is not the
    /// discriminant of any variant.
    fn from_u8(v: u8) -> Option<Self> {
        [Self::Blk, Self::Vsock, Self::Volume, Self::VirtioFs]
            .iter()
            .copied()
            .find(|kind| *kind as u8 == v)
    }
}

/// A device's host-side backing, carried INLINE so a snapshot is self-describing
/// (the unified philosophy — no external metadata.json/sidecar chase on restore).
/// `None` = a device with no host backing to re-open (e.g. vsock, or an HVF
/// device whose backing is implied by the Image). The string fields are host
/// paths / guest mount points / virtio-fs tags.
///
/// On the wire (`DeviceRecord::write_to`) a backing is one tag byte
/// (0 = `None`, 1 = `Disk`, 2 = `Volume`, 3 = `VirtioFs`) followed by the
/// variant's fields in declaration order: strings and byte blobs carry a
/// `u32` little-endian length prefix, integers are `u64` little-endian.
#[derive(Clone, Debug, PartialEq, Eq, Default)]
pub enum DeviceBacking {
    /// No host backing to re-open; encoded as the bare tag byte `0`.
    #[default]
    None,
    /// A disk image on the host (wire tag `1`).
    Disk {
        /// Host path of the backing file.
        path: String,
        /// Recorded size of the backing — presumably in bytes; the codec
        /// only round-trips the value (TODO confirm unit against callers).
        size: u64,
    },
    /// A data volume (wire tag `2`).
    Volume {
        /// Host path of the backing file.
        path: String,
        /// Recorded size of the backing — presumably in bytes; the codec
        /// only round-trips the value (TODO confirm unit against callers).
        size: u64,
        /// Guest mount point for the volume.
        mount: String,
    },
    /// A virtio-fs share (wire tag `3`).
    VirtioFs {
        /// virtio-fs tag identifying the share.
        tag: String,
        /// Guest mount point for the share.
        mount: String,
        /// Host directory being shared.
        host_path: String,
        /// DAX window: guest-physical base + length the host file ranges are
        /// mmapped into. Re-bound eagerly on restore (content stays in the host
        /// dir at `host_path` — never copied into the snapshot).
        dax_gpa: u64,
        dax_window_len: u64,
        /// The FUSE backend's snapshot blob (open-fd table etc. → lazy reopen)
        /// and the DAX slot-table blob. O(table-size) metadata, NOT content.
        /// Both are opaque to this codec, which rejects either one on read if
        /// its length prefix exceeds 16 MiB.
        backend_state: Vec<u8>,
        dax_state: Vec<u8>,
    },
}

/// One device in the unified snapshot: its kind, MMIO/queue transport state
/// (the brick-1 canonical [`MmioSnapshot`] codec), and inline host backing.
///
/// Serialized by `DeviceRecord::write_to` as `kind` byte, then the
/// `MmioSnapshot` encoding, then the backing — the same order as the fields
/// below.
#[derive(Clone, Debug)]
pub struct DeviceRecord {
    /// Which device this is; written as a single `u8` discriminant.
    pub kind: DeviceKind,
    /// Transport state, encoded and decoded by `MmioSnapshot`'s own codec.
    pub mmio: MmioSnapshot,
    /// Host-side resources backing the device (`DeviceBacking::None` if none).
    /// NOTE(review): the codec does not check that the backing variant is
    /// consistent with `kind`; any pairing round-trips.
    pub backing: DeviceBacking,
}

impl DeviceRecord {
    /// Canonical codec for one device record. Layout (little-endian):
    /// `kind u8 · MmioSnapshot (brick-1 codec) · backing`, where backing is a
    /// tag byte (0=None,1=Disk,2=Volume,3=VirtioFs) followed by its
    /// length-prefixed string + u64 fields.
    pub fn write_to<W: std::io::Write>(&self, w: &mut W) -> std::io::Result<()> {
        w.write_all(&[self.kind as u8])?;
        self.mmio.write_to(w)?;
        fn s<W: std::io::Write>(w: &mut W, s: &str) -> std::io::Result<()> {
            w.write_all(&(s.len() as u32).to_le_bytes())?;
            w.write_all(s.as_bytes())
        }
        fn blob<W: std::io::Write>(w: &mut W, b: &[u8]) -> std::io::Result<()> {
            w.write_all(&(b.len() as u32).to_le_bytes())?;
            w.write_all(b)
        }
        match &self.backing {
            DeviceBacking::None => w.write_all(&[0u8])?,
            DeviceBacking::Disk { path, size } => {
                w.write_all(&[1u8])?;
                s(w, path)?;
                w.write_all(&size.to_le_bytes())?;
            }
            DeviceBacking::Volume { path, size, mount } => {
                w.write_all(&[2u8])?;
                s(w, path)?;
                w.write_all(&size.to_le_bytes())?;
                s(w, mount)?;
            }
            DeviceBacking::VirtioFs {
                tag,
                mount,
                host_path,
                dax_gpa,
                dax_window_len,
                backend_state,
                dax_state,
            } => {
                w.write_all(&[3u8])?;
                s(w, tag)?;
                s(w, mount)?;
                s(w, host_path)?;
                w.write_all(&dax_gpa.to_le_bytes())?;
                w.write_all(&dax_window_len.to_le_bytes())?;
                blob(w, backend_state)?;
                blob(w, dax_state)?;
            }
        }
        Ok(())
    }

    /// Inverse of [`write_to`](Self::write_to).
    pub fn read_from<R: std::io::Read>(r: &mut R) -> std::io::Result<DeviceRecord> {
        fn b<const N: usize, R: std::io::Read>(r: &mut R) -> std::io::Result<[u8; N]> {
            let mut x = [0u8; N];
            r.read_exact(&mut x)?;
            Ok(x)
        }
        fn err(m: &str) -> std::io::Error {
            std::io::Error::new(std::io::ErrorKind::InvalidData, m.to_string())
        }
        fn s<R: std::io::Read>(r: &mut R) -> std::io::Result<String> {
            let n = u32::from_le_bytes(b::<4, _>(r)?) as usize;
            if n > 1 << 16 {
                return Err(err("device backing string too long"));
            }
            let mut buf = vec![0u8; n];
            r.read_exact(&mut buf)?;
            String::from_utf8(buf).map_err(|_| err("device backing string not utf8"))
        }
        // Length-prefixed byte blob (DAX/backend state). Capped to guard against
        // a corrupt/hostile length triggering a huge allocation; these blobs are
        // O(table-size) metadata (KiB), so 16 MiB is a generous ceiling.
        fn blob<R: std::io::Read>(r: &mut R) -> std::io::Result<Vec<u8>> {
            let n = u32::from_le_bytes(b::<4, _>(r)?) as usize;
            if n > 16 << 20 {
                return Err(err("device backing blob too large"));
            }
            let mut buf = vec![0u8; n];
            r.read_exact(&mut buf)?;
            Ok(buf)
        }
        let kind = DeviceKind::from_u8(b::<1, _>(r)?[0]).ok_or_else(|| err("bad device kind"))?;
        let mmio = MmioSnapshot::read_from(r)?;
        let backing = match b::<1, _>(r)?[0] {
            0 => DeviceBacking::None,
            1 => DeviceBacking::Disk {
                path: s(r)?,
                size: u64::from_le_bytes(b::<8, _>(r)?),
            },
            2 => DeviceBacking::Volume {
                path: s(r)?,
                size: u64::from_le_bytes(b::<8, _>(r)?),
                mount: s(r)?,
            },
            3 => DeviceBacking::VirtioFs {
                tag: s(r)?,
                mount: s(r)?,
                host_path: s(r)?,
                dax_gpa: u64::from_le_bytes(b::<8, _>(r)?),
                dax_window_len: u64::from_le_bytes(b::<8, _>(r)?),
                backend_state: blob(r)?,
                dax_state: blob(r)?,
            },
            _ => return Err(err("bad device backing tag")),
        };
        Ok(DeviceRecord {
            kind,
            mmio,
            backing,
        })
    }
}

// ── Portable snapshot container body (7c step 4: the one assembler) ──────────

use crate::devices::virtio::vsock::muxer::TsiListenerSnapshot;

/// The backend-neutral body of a snapshot container — everything in the file
/// AFTER the magic and the `ram_offset` header and BEFORE the RAM section. Both
/// pipelines (`kvm::run` on KVM, `vmm::snapshot` on HVF) assemble and parse the
/// container through [`write_container`](Self::write_container) /
/// [`read_container`](Self::read_container) so the framing exists in ONE audited
/// place — only the *contents* of the opaque backend blobs differ:
///
///   * [`intc_blob`](Self::intc_blob) — the VM-global interrupt-controller +
///     timer state from [`HypervisorVm::capture_intc`]. KVM: PIT + PIC/IOAPIC +
///     kvmclock; HVF: GICv3 distributor. The assembler never interprets it.
///   * [`vcpu_blobs`](Self::vcpu_blobs) — one opaque per-vCPU register-file blob
///     from [`HypervisorVcpu::write_snapshot_state`], length-prefixed here so the
///     assembler frames each without relying on the backend reader consuming an
///     exact count (a self-delimiting reader bug stays contained to one vCPU's
///     bytes instead of desyncing the whole stream).
///   * [`clock_host_ticks`](Self::clock_host_ticks) /
///     [`clock_ref`](Self::clock_ref) — the host-monotonic reading at capture
///     and the backend clock reference ([`HypervisorVcpu::capture_clock_ref`]),
///     so the guest timebase re-anchors on restore instead of freezing.
///   * [`com1`](Self::com1) — 16550 UART latch/control regs (KVM serial). HVF
///     has no 8250 COM1, so it writes zeros.
///
/// `ram_offset` is NOT here: it is a file-layout concern (it depends on padding
/// the body to a page boundary) owned by each backend's save wrapper, which
/// writes `magic · ram_offset · <this body> · pad · RAM`.
///
/// [`HypervisorVm`]: crate::hypervisor::HypervisorVm
/// [`HypervisorVm::capture_intc`]: crate::hypervisor::HypervisorVm::capture_intc
/// [`HypervisorVcpu::write_snapshot_state`]: crate::hypervisor::HypervisorVcpu::write_snapshot_state
/// [`HypervisorVcpu::capture_clock_ref`]: crate::hypervisor::HypervisorVcpu::capture_clock_ref
#[derive(Clone, Debug)]
pub struct ContainerMeta {
    /// vCPU count recorded in the header (one byte on the wire).
    /// NOTE(review): `read_container` reads the number of `vcpu_blobs` from a
    /// separate `u32` and does not cross-check it against this field.
    pub num_cpus: u8,
    /// Guest memory size recorded in the header — presumably bytes, matching
    /// the RAM section length; the codec only round-trips the value (TODO
    /// confirm against the backend save wrappers).
    pub mem_size: u64,
    /// 16550 UART regs `[ier, lcr, mcr, scr, dll, dlm]`. HVF writes zeros.
    pub com1: [u8; 6],
    /// Host monotonic timebase reading at capture (backend's native unit).
    pub clock_host_ticks: u64,
    /// Backend clock reference at capture (for restore re-anchoring).
    pub clock_ref: u64,
    /// Opaque VM-global interrupt-controller + timer blob (`capture_intc`).
    pub intc_blob: Vec<u8>,
    /// One opaque register-file blob per vCPU (`write_snapshot_state`).
    pub vcpu_blobs: Vec<Vec<u8>>,
    /// Every virtio device, kind-tagged, with inline host backing.
    pub devices: Vec<DeviceRecord>,
    /// vsock TSI control-channel auth token the guest was booted with.
    /// Encoded as a presence byte followed by the 32 token bytes when `Some`.
    pub tsi_token: Option<[u8; 32]>,
    /// Guest TSI listener routes (host port-forwards) live at capture.
    pub vsock_listeners: Vec<TsiListenerSnapshot>,
}

/// Per-blob length ceiling (bytes). The interrupt-controller blob and each
/// vCPU register file are KiB-scale; 16 MiB is a generous, allocation-bounding
/// cap so a corrupt/hostile length can't drive a multi-GiB `vec![0; n]` before
/// `read_exact` would fail anyway.
const CONTAINER_BLOB_MAX: usize = 16 << 20;
/// Pre-allocation cap for the vCPU count (the loop still fails cleanly via
/// `read_exact`; this only right-sizes the up-front `with_capacity`).
const MAX_VCPUS_HINT: usize = 1024;
/// Pre-allocation cap for the device count. Like `MAX_VCPUS_HINT` this only
/// bounds the `with_capacity` hint; it is not a limit on how many records the
/// read loop will accept.
const MAX_DEVS_HINT: usize = 4096;
/// Pre-allocation cap for the vsock listener count (same role as
/// `MAX_DEVS_HINT`: a capacity hint, not a record limit).
const MAX_LISTENERS_HINT: usize = 1 << 16;

impl ContainerMeta {
    /// Serialize the container body. Layout (little-endian):
    ///
    /// ```text
    ///   num_cpus u8 · mem_size u64 · com1[6]
    ///   clock_host_ticks u64 · clock_ref u64
    ///   intc_blob:   len u32 · bytes
    ///   n_vcpus u32 · per vcpu: len u32 · bytes
    ///   n_devices u32 · per device: DeviceRecord codec
    ///   tsi_token: present u8 · (32 bytes if present)
    ///   n_listeners u32 · per listener: TsiListenerSnapshot codec (WIRE_LEN)
    /// ```
    pub fn write_container<W: std::io::Write>(&self, w: &mut W) -> std::io::Result<()> {
        fn blob<W: std::io::Write>(w: &mut W, b: &[u8]) -> std::io::Result<()> {
            w.write_all(&(b.len() as u32).to_le_bytes())?;
            w.write_all(b)
        }
        w.write_all(&[self.num_cpus])?;
        w.write_all(&self.mem_size.to_le_bytes())?;
        w.write_all(&self.com1)?;
        w.write_all(&self.clock_host_ticks.to_le_bytes())?;
        w.write_all(&self.clock_ref.to_le_bytes())?;

        blob(w, &self.intc_blob)?;

        w.write_all(&(self.vcpu_blobs.len() as u32).to_le_bytes())?;
        for v in &self.vcpu_blobs {
            blob(w, v)?;
        }

        w.write_all(&(self.devices.len() as u32).to_le_bytes())?;
        for d in &self.devices {
            d.write_to(w)?;
        }

        match &self.tsi_token {
            Some(t) => {
                w.write_all(&[1u8])?;
                w.write_all(t)?;
            }
            None => w.write_all(&[0u8])?,
        }

        w.write_all(&(self.vsock_listeners.len() as u32).to_le_bytes())?;
        for l in &self.vsock_listeners {
            l.write_to(w)?;
        }
        Ok(())
    }

    /// Inverse of [`write_container`](Self::write_container). Every count is
    /// bounded before pre-allocation and every blob length is capped, so a
    /// corrupt header can't OOM-abort the host before the read loops' per-record
    /// bounds checks run; the loops themselves fail cleanly via `read_exact`.
    pub fn read_container<R: std::io::Read>(r: &mut R) -> std::io::Result<ContainerMeta> {
        fn b<const N: usize, R: std::io::Read>(r: &mut R) -> std::io::Result<[u8; N]> {
            let mut x = [0u8; N];
            r.read_exact(&mut x)?;
            Ok(x)
        }
        fn err(m: &str) -> std::io::Error {
            std::io::Error::new(std::io::ErrorKind::InvalidData, m.to_string())
        }
        fn blob<R: std::io::Read>(r: &mut R) -> std::io::Result<Vec<u8>> {
            let n = u32::from_le_bytes(b::<4, _>(r)?) as usize;
            if n > CONTAINER_BLOB_MAX {
                return Err(err("container blob too large"));
            }
            let mut buf = vec![0u8; n];
            r.read_exact(&mut buf)?;
            Ok(buf)
        }
        let num_cpus = b::<1, _>(r)?[0];
        let mem_size = u64::from_le_bytes(b::<8, _>(r)?);
        let com1 = b::<6, _>(r)?;
        let clock_host_ticks = u64::from_le_bytes(b::<8, _>(r)?);
        let clock_ref = u64::from_le_bytes(b::<8, _>(r)?);

        let intc_blob = blob(r)?;

        let n_vcpus = u32::from_le_bytes(b::<4, _>(r)?) as usize;
        let mut vcpu_blobs = Vec::with_capacity(n_vcpus.min(MAX_VCPUS_HINT));
        for _ in 0..n_vcpus {
            vcpu_blobs.push(blob(r)?);
        }

        let n_devices = u32::from_le_bytes(b::<4, _>(r)?) as usize;
        let mut devices = Vec::with_capacity(n_devices.min(MAX_DEVS_HINT));
        for _ in 0..n_devices {
            devices.push(DeviceRecord::read_from(r)?);
        }

        let tsi_token = match b::<1, _>(r)?[0] {
            0 => None,
            1 => Some(b::<32, _>(r)?),
            _ => return Err(err("bad tsi_token presence byte")),
        };

        let n_listeners = u32::from_le_bytes(b::<4, _>(r)?) as usize;
        let mut vsock_listeners = Vec::with_capacity(n_listeners.min(MAX_LISTENERS_HINT));
        for _ in 0..n_listeners {
            vsock_listeners.push(TsiListenerSnapshot::read_from(r)?);
        }

        Ok(ContainerMeta {
            num_cpus,
            mem_size,
            com1,
            clock_host_ticks,
            clock_ref,
            intc_blob,
            vcpu_blobs,
            devices,
            tsi_token,
            vsock_listeners,
        })
    }
}

#[cfg(test)]
mod container_tests {
    //! Round-trip and hostile-input tests for the `ContainerMeta` codec.

    use super::*;
    use crate::devices::virtio::mmio::{MmioSnapshot, QueueSnapshot};

    /// Build an `MmioSnapshot` fixture with `nq` queues whose fields all vary
    /// with the queue index, so a field swapped or dropped by the codec shows
    /// up as a byte mismatch.
    fn mmio(features: u32, nq: usize) -> MmioSnapshot {
        MmioSnapshot {
            driver_features: [features, features ^ 0xdead_beef],
            status: 0xf,
            interrupt_status: 1,
            queues: (0..nq)
                .map(|i| QueueSnapshot {
                    size: 256,
                    ready: i % 2 == 0,
                    desc_table: 0x1000 * (i as u64 + 1),
                    avail_ring: 0x2000 * (i as u64 + 1),
                    used_ring: 0x3000 * (i as u64 + 1),
                    last_avail_idx: i as u16,
                    next_used_idx: (i as u16) + 7,
                })
                .collect(),
        }
    }

    /// Build a `ContainerMeta` fixture around the given `devices`, with
    /// `listeners` vsock listeners, an optional TSI token, and `vcpus` vCPU
    /// blobs of distinct lengths (64 + index) and contents.
    fn sample(
        devices: Vec<DeviceRecord>,
        listeners: usize,
        token: bool,
        vcpus: usize,
    ) -> ContainerMeta {
        ContainerMeta {
            num_cpus: vcpus as u8,
            mem_size: 512 << 20,
            com1: [1, 2, 3, 4, 5, 6],
            clock_host_ticks: 0x1122_3344_5566_7788,
            clock_ref: 0x99aa_bbcc_ddee_ff00,
            intc_blob: (0..333u32).map(|i| i as u8).collect(),
            vcpu_blobs: (0..vcpus).map(|i| vec![i as u8; 64 + i]).collect(),
            devices,
            tsi_token: token.then_some([0x5a; 32]),
            // Alternate listeners with and without an inet port so both
            // encodings of the optional field are exercised.
            vsock_listeners: (0..listeners)
                .map(|i| TsiListenerSnapshot {
                    cid: 3,
                    peer_port: 1000 + i as u32,
                    vm_port: 80,
                    family: 2,
                    socktype: 1,
                    inet_port: if i % 2 == 0 {
                        Some(8080 + i as u16)
                    } else {
                        None
                    },
                })
                .collect(),
        }
    }

    /// Field-by-field equality for two containers. Written out by hand
    /// because `ContainerMeta` does not derive `PartialEq`.
    fn assert_eq_meta(a: &ContainerMeta, b: &ContainerMeta) {
        assert_eq!(a.num_cpus, b.num_cpus);
        assert_eq!(a.mem_size, b.mem_size);
        assert_eq!(a.com1, b.com1);
        assert_eq!(a.clock_host_ticks, b.clock_host_ticks);
        assert_eq!(a.clock_ref, b.clock_ref);
        assert_eq!(a.intc_blob, b.intc_blob);
        assert_eq!(a.vcpu_blobs, b.vcpu_blobs);
        assert_eq!(a.tsi_token, b.tsi_token);
        // Lengths are compared first because `zip` below stops at the
        // shorter of the two sequences.
        assert_eq!(a.devices.len(), b.devices.len());
        for (x, y) in a.devices.iter().zip(&b.devices) {
            assert_eq!(x.kind, y.kind);
            assert_eq!(x.backing, y.backing);
            // MmioSnapshot has no PartialEq; compare via its own codec bytes.
            let (mut xb, mut yb) = (Vec::new(), Vec::new());
            x.mmio.write_to(&mut xb).unwrap();
            y.mmio.write_to(&mut yb).unwrap();
            assert_eq!(xb, yb);
        }
        assert_eq!(a.vsock_listeners.len(), b.vsock_listeners.len());
        for (x, y) in a.vsock_listeners.iter().zip(&b.vsock_listeners) {
            assert_eq!(x.cid, y.cid);
            assert_eq!(x.peer_port, y.peer_port);
            assert_eq!(x.vm_port, y.vm_port);
            assert_eq!(x.family, y.family);
            assert_eq!(x.socktype, y.socktype);
            assert_eq!(x.inet_port, y.inet_port);
        }
    }

    /// Serialize `m`, parse it back, and check both that the reader consumed
    /// every byte and that the parsed value equals the original.
    fn round_trip(m: &ContainerMeta) {
        let mut buf = Vec::new();
        m.write_container(&mut buf).unwrap();
        let mut cur = std::io::Cursor::new(&buf);
        let back = ContainerMeta::read_container(&mut cur).unwrap();
        // Reader consumed exactly the bytes the writer produced — no trailing
        // slack (would desync a container with a RAM section right after).
        assert_eq!(cur.position() as usize, buf.len());
        assert_eq_meta(m, &back);
    }

    /// Minimal container: no devices, no listeners, no token, one vCPU.
    #[test]
    fn empty_container_round_trips() {
        round_trip(&sample(vec![], 0, false, 1));
    }

    /// One device of every kind / backing variant, plus listeners, a token,
    /// and several vCPUs.
    #[test]
    fn full_container_round_trips() {
        let devices = vec![
            DeviceRecord {
                kind: DeviceKind::Blk,
                mmio: mmio(0x1, 2),
                backing: DeviceBacking::Disk {
                    path: "/var/lib/sm/root.img".into(),
                    size: 1 << 30,
                },
            },
            DeviceRecord {
                kind: DeviceKind::Vsock,
                mmio: mmio(0x2, 3),
                backing: DeviceBacking::None,
            },
            DeviceRecord {
                kind: DeviceKind::Volume,
                mmio: mmio(0x3, 1),
                backing: DeviceBacking::Volume {
                    path: "/var/lib/sm/vol-b.img".into(),
                    size: 4 << 20,
                    mount: "/data".into(),
                },
            },
            DeviceRecord {
                kind: DeviceKind::VirtioFs,
                mmio: mmio(0x4, 2),
                backing: DeviceBacking::VirtioFs {
                    tag: "shared".into(),
                    mount: "/mnt/shared".into(),
                    host_path: "/srv/share".into(),
                    dax_gpa: 0x4000_0000,
                    dax_window_len: 0x800_0000,
                    backend_state: vec![0xab; 200],
                    dax_state: vec![0xcd; 80],
                },
            },
        ];
        round_trip(&sample(devices, 3, true, 4));
    }

    /// Cutting a valid stream at any point must surface as an `Err`, not a
    /// panic and not a silently short container.
    #[test]
    fn truncated_stream_errors_not_panics() {
        let mut buf = Vec::new();
        sample(
            vec![DeviceRecord {
                kind: DeviceKind::Blk,
                mmio: mmio(1, 1),
                backing: DeviceBacking::None,
            }],
            2,
            true,
            2,
        )
        .write_container(&mut buf)
        .unwrap();
        // Every prefix shorter than the whole must error (never panic).
        for cut in 0..buf.len() {
            let mut cur = std::io::Cursor::new(&buf[..cut]);
            assert!(
                ContainerMeta::read_container(&mut cur).is_err(),
                "prefix {cut}"
            );
        }
    }

    /// A hand-built header whose first blob claims `u32::MAX` bytes is
    /// rejected as `InvalidData` by the length cap.
    #[test]
    fn hostile_blob_length_is_rejected() {
        // A container whose intc_blob length claims > CONTAINER_BLOB_MAX must be
        // rejected without attempting the allocation.
        let mut buf = Vec::new();
        buf.push(1u8); // num_cpus
        buf.extend_from_slice(&(0u64).to_le_bytes()); // mem_size
        buf.extend_from_slice(&[0u8; 6]); // com1
        buf.extend_from_slice(&(0u64).to_le_bytes()); // clock_host_ticks
        buf.extend_from_slice(&(0u64).to_le_bytes()); // clock_ref
        buf.extend_from_slice(&u32::MAX.to_le_bytes()); // intc_blob len = 4 GiB
        let mut cur = std::io::Cursor::new(&buf);
        let err = ContainerMeta::read_container(&mut cur).unwrap_err();
        assert_eq!(err.kind(), std::io::ErrorKind::InvalidData);
    }
}