supermachine 0.7.70

Run any OCI/Docker image as a hardware-isolated microVM on macOS HVF (Linux KVM and Windows WHP in progress). Single library API, zero flags for the common case, sub-100 ms cold-restore from snapshot.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
// Status: compact port. Descriptor chains are walked directly against a
// raw guest-memory pointer. There is no `vm-memory` dependency — guest RAM
// is treated as one flat (host_ram_base, ram_gpa, ram_size) window and
// guest physical addresses are translated into it.

#![allow(dead_code)]

use std::sync::atomic::Ordering;
use std::sync::Arc;

/// virtq descriptor flag: the chain continues. `Queue::pop_chain` follows
/// `Desc::next` only while this bit is set in `Desc::flags`; a descriptor
/// without it ends the chain.
pub const VRING_DESC_F_NEXT: u16 = 1;
/// virtq descriptor flag: the buffer is one the device writes into.
/// `write_writable` only touches descriptors WITH this bit;
/// `read_readable_capped` only reads descriptors WITHOUT it.
pub const VRING_DESC_F_WRITE: u16 = 2;
/// virtq descriptor flag: indirect descriptor (in the virtio spec, the
/// buffer holds a table of further descriptors).
/// NOTE(review): nothing in this file tests this bit — `pop_chain` treats
/// such a descriptor like any other. Confirm no device negotiates the
/// indirect-descriptor feature.
pub const VRING_DESC_F_INDIRECT: u16 = 4;

/// Memoized answer to "is virtqueue tracing on?".
///
/// The first call asks `crate::trace::enabled("vq")` and stores the result
/// in a process-wide atomic; later calls are a single relaxed load. This
/// keeps the (presumably environment-backed — e.g. `$SUPERMACHINE_VQ_TRACE`,
/// TODO confirm against `crate::trace`) lookup off the per-descriptor
/// pop / push hot path, the same rationale as `vsock_trace_enabled` in
/// muxer.rs.
///
/// Concurrent first calls may each perform the lookup; they store the same
/// kind of value, so the race is benign.
#[inline]
fn vq_trace_enabled() -> bool {
    use std::sync::atomic::{AtomicU8, Ordering::Relaxed};

    // Tri-state cache: 0 = not resolved yet, 1 = tracing off, 2 = tracing on.
    const UNRESOLVED: u8 = 0;
    const OFF: u8 = 1;
    const ON: u8 = 2;
    static STATE: AtomicU8 = AtomicU8::new(UNRESOLVED);

    match STATE.load(Relaxed) {
        UNRESOLVED => {
            let enabled = crate::trace::enabled("vq");
            STATE.store(if enabled { ON } else { OFF }, Relaxed);
            enabled
        }
        cached => cached == ON,
    }
}

/// Single virtq descriptor as seen in guest memory.
///
/// `Queue::pop_chain` builds one of these from each 16-byte entry of the
/// descriptor table, reading `addr` at byte offset 0, `len` at 8, `flags`
/// at 12 and `next` at 14. Every field is guest-controlled and must be
/// treated as untrusted.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct Desc {
    /// Guest physical address of the buffer; handed to the `GuestMem`
    /// accessors, which bounds-check it.
    pub addr: u64,
    /// Buffer length in bytes as claimed by the guest.
    pub len: u32,
    /// Bitwise OR of the `VRING_DESC_F_*` flags.
    pub flags: u16,
    /// Index of the next descriptor in the chain; only followed by
    /// `pop_chain` when `VRING_DESC_F_NEXT` is set in `flags`.
    pub next: u16,
}

/// Per-queue state: descriptor table / avail ring / used ring
/// addresses, queue size, indices.
///
/// All fields are public so the transport layer can program them directly;
/// nothing here validates them at assignment time. `pop_chain` and
/// `add_used` defend against `size == 0`, and every memory access goes
/// through the bounds-checked `GuestMem`.
#[derive(Clone)]
pub struct Queue {
    /// Number of entries in each ring; used as the modulus for ring slots
    /// and as the upper bound on descriptor indices. `Queue::new` sets 256.
    pub size: u16,
    /// Readiness flag. Not read by any code in this file — presumably
    /// consulted by the transport/device layer (verify against callers).
    pub ready: bool,
    /// Guest physical address of the descriptor table (16 bytes per entry).
    pub desc_table: u64,
    /// Guest physical address of the avail ring (`idx` at +2, entries from +4).
    pub avail_ring: u64,
    /// Guest physical address of the used ring (`idx` at +2, entries from +4).
    pub used_ring: u64,
    /// Free-running index of the next avail-ring slot to consume; compared
    /// against the guest's `avail.idx` and advanced with wrapping arithmetic.
    pub last_avail_idx: u16,
    /// Free-running index of the next used-ring slot to fill; published to
    /// the guest as `used.idx` by `add_used`.
    pub next_used_idx: u16,
    /// Handle to guest memory through which all ring accesses are made.
    pub mem: GuestMem,
}

impl Queue {
    /// Create an unconfigured queue over `mem`: size 256, not ready, all
    /// ring addresses and both ring indices zero. The caller is expected to
    /// fill in the public fields before popping chains.
    pub fn new(mem: GuestMem) -> Self {
        Self {
            size: 256,
            ready: false,
            desc_table: 0,
            avail_ring: 0,
            used_ring: 0,
            last_avail_idx: 0,
            next_used_idx: 0,
            mem,
        }
    }

    /// Read the avail-ring `idx` field (= total descriptors the
    /// driver has made available). LE u16 at offset 2 of avail ring.
    ///
    /// Returns 0 if the field lies outside guest RAM (`GuestMem` zero-fills
    /// out-of-bounds reads).
    pub fn avail_idx(&self) -> u16 {
        // `wrapping_add`, not `+`: the ring base is an unvalidated public
        // field (presumably programmed from guest MMIO writes, like
        // `size`). A base near `u64::MAX` must not trip an overflow panic
        // in overflow-checked builds; the wrapped address is still
        // bounds-checked by `GuestMem`.
        self.mem.read_u16(self.avail_ring.wrapping_add(2))
    }

    /// Pop the next available descriptor chain head, if any.
    /// Returns (head_index, descriptor_chain_vec).
    ///
    /// Returns `None` when the queue has size 0 or the guest has published
    /// nothing new. Otherwise the avail slot is consumed and the chain is
    /// walked with two termination guards: at most `size` descriptors are
    /// collected, and an index `>= size` (including a bogus head) ends the
    /// walk. The returned chain can therefore be EMPTY; callers must not
    /// index `chain[0]` unconditionally.
    pub fn pop_chain(&mut self) -> Option<(u16, Vec<Desc>)> {
        // A size-0 queue is unconfigured — or a hostile guest wrote
        // QueueNum=0 (the MMIO shell stores the size verbatim) before
        // flipping QueueReady/DRIVER_OK. Treat it as empty: the
        // `% self.size` below would otherwise divide by zero and panic
        // the worker (a guest-triggerable host DoS via a crafted MMIO
        // register sequence). virtio queue sizes are always a power of
        // two ≥ 1 for a legitimately-configured queue.
        if self.size == 0 {
            return None;
        }
        let avail = self.avail_idx();
        if avail == self.last_avail_idx {
            return None;
        }
        // ACQUIRE barrier — the missing half of the virtio memory
        // cadence. The guest driver writes the descriptor table entry
        // (addr/len/flags/next) and the avail-ring head index, then
        // does a release-store to `avail.idx` (Linux: `virtio_wmb()`
        // followed by the idx write). We've just observed a bumped
        // `avail.idx` above via a plain (relaxed) load; without an
        // acquire fence here, on a weakly-ordered CPU (aarch64 / Apple
        // Silicon) the descriptor-table and avail-ring reads below may
        // be reordered ahead of — or simply not yet observe — the
        // guest's pre-idx-bump writes. The result is a STALE descriptor
        // read (the slot's previous addr/len), so we DMA the packet
        // payload into the wrong / already-recycled guest buffer:
        // silent, size-preserving data corruption that only manifests
        // under concurrency (the guest reposting RX buffers while we
        // drain) and only on weak-ordered hardware (x86-TSO hides it).
        // `add_used` already issues the matching release fence for the
        // used ring; this is the acquire counterpart for the avail ring.
        std::sync::atomic::fence(Ordering::Acquire);

        // Load the head index from avail.ring[last_avail_idx % size].
        // The slot offset is at most 4 + 65535 * 2, so only the addition
        // to the untrusted base can overflow — hence `wrapping_add`.
        let slot = self.last_avail_idx % self.size;
        let off = self.avail_ring.wrapping_add(4 + u64::from(slot) * 2);
        let head = self.mem.read_u16(off);
        if vq_trace_enabled() {
            eprintln!("[vq desc=0x{:x}] pop_chain: avail.idx={avail} last_avail_idx={} slot={} head={head}",
                self.desc_table, self.last_avail_idx, slot);
        }
        self.last_avail_idx = self.last_avail_idx.wrapping_add(1);

        let mut chain = Vec::new();
        let mut idx = head;
        // Two independent guards on the guest-controlled walk:
        //   * `0..self.size` caps the chain length, so a `next` cycle
        //     cannot loop forever (a legitimate chain can't exceed the
        //     table size without revisiting a descriptor);
        //   * an index outside the `self.size`-entry descriptor table is
        //     undefined behavior in the virtio spec — treat it as
        //     end-of-chain instead of reading past the table.
        for _ in 0..self.size {
            if idx >= self.size {
                // OOB index — terminate the chain. Whatever we've already
                // collected is consumed; the guest sees the partial chain
                // ack'd.
                break;
            }
            // idx < size <= 65535, so `idx * 16` cannot overflow; the
            // additions to the untrusted table base wrap rather than panic
            // and are bounds-checked inside `GuestMem`.
            let d_addr = self.desc_table.wrapping_add(u64::from(idx) * 16);
            let desc = Desc {
                addr: self.mem.read_u64(d_addr),
                len: self.mem.read_u32(d_addr.wrapping_add(8)),
                flags: self.mem.read_u16(d_addr.wrapping_add(12)),
                next: self.mem.read_u16(d_addr.wrapping_add(14)),
            };
            chain.push(desc);
            if desc.flags & VRING_DESC_F_NEXT == 0 {
                break;
            }
            idx = desc.next;
        }
        Some((head, chain))
    }

    /// Push (id, used_len) onto the used ring + bump used.idx.
    ///
    /// `head` is the chain head previously returned by `pop_chain`;
    /// `used_len` is the byte count reported back to the guest. A no-op on
    /// a size-0 queue. Writes that fall outside guest RAM are dropped by
    /// `GuestMem`.
    pub fn add_used(&mut self, head: u16, used_len: u32) {
        // Same size-0 divide-by-zero guard as `pop_chain` (defense in
        // depth — add_used is only reached via a Some() from pop_chain,
        // which already bails on size 0, but a 0 divisor must never reach
        // the `% self.size` below regardless of caller).
        if self.size == 0 {
            return;
        }
        // used.ring[next_used_idx % size] = { id: u32, len: u32 }
        let slot = self.next_used_idx % self.size;
        let entry_off = self.used_ring.wrapping_add(4 + u64::from(slot) * 8);
        self.mem.write_u32(entry_off, u32::from(head));
        self.mem.write_u32(entry_off.wrapping_add(4), used_len);

        // The index is free-running and wraps at u16::MAX by design. Compute
        // the successor once, with wrapping arithmetic, so the trace line
        // below cannot overflow-panic at the wrap point.
        let next_idx = self.next_used_idx.wrapping_add(1);
        if vq_trace_enabled() {
            eprintln!("[vq desc=0x{:x}] add_used: slot={} head={head} len={used_len} next_used_idx_after={}",
                self.desc_table, slot, next_idx);
        }
        self.next_used_idx = next_idx;
        // Release fence then publish the new idx, so the guest can never
        // observe the bumped `used.idx` before the entry written above.
        std::sync::atomic::fence(Ordering::Release);
        self.mem
            .write_u16(self.used_ring.wrapping_add(2), self.next_used_idx);
    }
}

// ── Validated descriptor-chain access ──────────────────────────────────
//
// THE single chokepoint for moving bytes between a popped descriptor
// chain and host memory. Every device that consumes a guest-provided
// chain should route through these instead of hand-rolling the
// `chain[0]` / `desc.len` / cap / write-flag logic — that ad-hoc
// per-device handling is exactly where the guest→host bug class lived
// (unbounded allocation from a claimed length, an empty-chain index
// panic, a write past a short descriptor). Both helpers are
// empty-chain-safe and bounds-safe by construction.

/// Gather every device-READABLE descriptor of `chain` (those WITHOUT
/// `VRING_DESC_F_WRITE`) into a single owned buffer, in chain order.
///
/// * `chain` — descriptors as returned by `Queue::pop_chain`; may be empty.
/// * `mem` — guest memory the descriptor addresses refer to.
/// * `max_bytes` — hard ceiling on the total number of bytes gathered.
///
/// Returns `None` as soon as adding the next readable descriptor would
/// push the running total past `max_bytes`. The check happens BEFORE the
/// buffer grows, so a hostile guest advertising gigabyte-sized lengths
/// cannot force an unbounded host allocation; the caller should reject
/// the request. A chain with no readable descriptors yields
/// `Some(vec![])`, never an index panic. Each copy goes through
/// `GuestMem::read_slice`, which zero-fills the destination when the
/// descriptor's range is outside guest RAM, so a bogus `addr` can't read
/// host memory.
pub fn read_readable_capped(chain: &[Desc], mem: &GuestMem, max_bytes: usize) -> Option<Vec<u8>> {
    let mut buf = Vec::<u8>::new();
    for desc in chain {
        if desc.flags & VRING_DESC_F_WRITE != 0 {
            // Device-writable buffer: not part of the request payload.
            continue;
        }
        let start = buf.len();
        let chunk = desc.len as usize;
        if start.saturating_add(chunk) > max_bytes {
            return None;
        }
        // Grow by exactly this descriptor's length, then fill the new tail.
        buf.resize(start + chunk, 0);
        mem.read_slice(desc.addr, &mut buf[start..]);
    }
    Some(buf)
}

/// Scatter `bytes` over the device-WRITABLE descriptors of `chain` (those
/// WITH `VRING_DESC_F_WRITE`), in chain order, until either the payload or
/// the writable descriptors run out.
///
/// Returns how many payload bytes were consumed. This is never more than
/// the writable descriptors claim to hold, so a short or empty chain
/// yields a truncated count (possibly 0) rather than an out-of-bounds
/// write or a panic. Use the return value as the used-ring length.
///
/// Note: a descriptor whose range lies outside guest RAM is still counted.
/// `GuestMem::write_slice` drops such a write (with a log line) and reports
/// nothing back, so the returned count is "bytes consumed", not a
/// guarantee that every byte landed in guest memory.
pub fn write_writable(chain: &[Desc], mem: &GuestMem, bytes: &[u8]) -> usize {
    // `rest` is the still-unwritten tail of the payload.
    let mut rest = bytes;
    for desc in chain {
        if desc.flags & VRING_DESC_F_WRITE == 0 {
            // Device-readable buffer: must never be written.
            continue;
        }
        if rest.is_empty() {
            break;
        }
        let n = (desc.len as usize).min(rest.len());
        let (now, later) = rest.split_at(n);
        mem.write_slice(desc.addr, now);
        rest = later;
    }
    bytes.len() - rest.len()
}

/// Cheaply-cloneable handle to guest physical memory. Backed by
/// the mmap region the VMM created. Reads/writes are unsynchronized
/// raw pointer accesses — the guest uses the same memory with a
/// virtio rmb/wmb cadence that we pair with explicit fences at the
/// ring boundaries: a release fence before publishing `used.idx`
/// (`add_used`) and an acquire fence after observing `avail.idx`
/// (`pop_chain`). Those two fences are what make the otherwise-raw
/// pointer accesses safe against the guest on weakly-ordered CPUs.
///
/// Cloning only bumps an `Arc` refcount; all clones view the same window.
/// Every accessor bounds-checks the requested range against that window:
/// out-of-bounds reads yield zeros, out-of-bounds writes are dropped.
#[derive(Clone)]
pub struct GuestMem {
    inner: Arc<GuestMemInner>,
}

/// The shared description of the guest RAM window.
struct GuestMemInner {
    /// Host virtual address at which the window starts.
    host: *mut u8,
    /// Guest physical address that `host` corresponds to.
    base_gpa: u64,
    /// Length of the window in bytes.
    len: usize,
}

// SAFETY: mmap pages are stable for the VM lifetime; raw pointer
// access is intentional (the device side races the guest via virtio
// memory ordering, not Rust ownership).
unsafe impl Send for GuestMemInner {}
unsafe impl Sync for GuestMemInner {}

impl GuestMem {
    /// Wrap the host mapping `[host, host + len)` as the guest physical
    /// range `[base_gpa, base_gpa + len)`.
    ///
    /// Caller contract (not enforceable here, since this is a safe fn over
    /// a raw pointer): `host` must point to `len` bytes that stay mapped,
    /// readable and writable for as long as any clone of the returned
    /// handle is alive. Every accessor below relies on it.
    pub fn new(host: *mut u8, base_gpa: u64, len: usize) -> Self {
        Self {
            inner: Arc::new(GuestMemInner {
                host,
                base_gpa,
                len,
            }),
        }
    }

    /// Resolve `(gpa, n)` to a host pointer with **release-mode**
    /// bounds checking. Returns `None` when the range falls outside
    /// the VM's RAM window — typically because a hostile or buggy
    /// guest put an OOB address into a virtqueue descriptor.
    ///
    /// SECURITY: this MUST stay enabled in release. Replacing it with
    /// `debug_assert!` (as it was prior to 0.5.2) lets a guest read
    /// or write arbitrary host memory by crafting descriptors with
    /// `addr = base_gpa - K`, since `(gpa - base_gpa) as usize`
    /// underflows to a huge offset and `host.add(off)` produces a
    /// pointer well outside our mmap. The bounds check is single-
    /// digit nanoseconds (a few compares + branches) on the hot virtio
    /// path; the perf cost is unmeasurable.
    fn translate(&self, gpa: u64, n: usize) -> Option<*mut u8> {
        let delta = gpa.checked_sub(self.inner.base_gpa)?;
        // Compare in u64 BEFORE narrowing to usize. Casting first would
        // truncate on a 32-bit host, letting an offset that is a multiple
        // of 2^32 alias a valid in-window offset. (`len as u64` is a
        // lossless widening on every supported target.)
        if delta > self.inner.len as u64 {
            return None;
        }
        let off = delta as usize; // lossless: delta <= len <= usize::MAX
        // off + n <= len, computed without overflow.
        if n > self.inner.len - off {
            return None;
        }
        // SAFETY: off <= len, so the result is inside (or one past the end
        // of) the mapping the caller of `new` vouched for; off + n <= len
        // was checked above, so `n` bytes from it are in bounds.
        Some(unsafe { self.inner.host.add(off) })
    }

    /// Like `translate`, but collapses the out-of-bounds case to a null
    /// pointer after logging it. Used by the scalar readers, which turn a
    /// null result into a zero value.
    fn translate_or_zero(&self, gpa: u64, n: usize) -> *mut u8 {
        match self.translate(gpa, n) {
            Some(p) => p,
            None => {
                eprintln!(
                    "[guest-mem] OOB access: gpa={gpa:#x} len={n} (base={:#x} len={:#x}); zero-filling",
                    self.inner.base_gpa, self.inner.len
                );
                std::ptr::null_mut()
            }
        }
    }

    /// Read a native-endian `u16` at `gpa` (no alignment requirement).
    /// Returns 0 and logs if the 2-byte range is outside guest RAM.
    pub fn read_u16(&self, gpa: u64) -> u16 {
        let p = self.translate_or_zero(gpa, 2);
        if p.is_null() {
            return 0;
        }
        // SAFETY: non-null means `translate` proved 2 bytes at `p` are
        // inside the mapping; `read_unaligned` imposes no alignment.
        unsafe { std::ptr::read_unaligned(p as *const u16) }
    }

    /// Read a native-endian `u32` at `gpa` (no alignment requirement).
    /// Returns 0 and logs if the 4-byte range is outside guest RAM.
    pub fn read_u32(&self, gpa: u64) -> u32 {
        let p = self.translate_or_zero(gpa, 4);
        if p.is_null() {
            return 0;
        }
        // SAFETY: non-null means `translate` proved 4 bytes at `p` are
        // inside the mapping; `read_unaligned` imposes no alignment.
        unsafe { std::ptr::read_unaligned(p as *const u32) }
    }

    /// Read a native-endian `u64` at `gpa` (no alignment requirement).
    /// Returns 0 and logs if the 8-byte range is outside guest RAM.
    pub fn read_u64(&self, gpa: u64) -> u64 {
        let p = self.translate_or_zero(gpa, 8);
        if p.is_null() {
            return 0;
        }
        // SAFETY: non-null means `translate` proved 8 bytes at `p` are
        // inside the mapping; `read_unaligned` imposes no alignment.
        unsafe { std::ptr::read_unaligned(p as *const u64) }
    }

    /// Write a native-endian `u16` at `gpa`. An out-of-bounds write is
    /// dropped silently (no log line, unlike `write_slice`).
    pub fn write_u16(&self, gpa: u64, v: u16) {
        if let Some(p) = self.translate(gpa, 2) {
            // SAFETY: `translate` proved 2 bytes at `p` are inside the
            // mapping; `write_unaligned` imposes no alignment.
            unsafe { std::ptr::write_unaligned(p as *mut u16, v) }
        }
    }

    /// Write a native-endian `u32` at `gpa`. An out-of-bounds write is
    /// dropped silently (no log line, unlike `write_slice`).
    pub fn write_u32(&self, gpa: u64, v: u32) {
        if let Some(p) = self.translate(gpa, 4) {
            // SAFETY: `translate` proved 4 bytes at `p` are inside the
            // mapping; `write_unaligned` imposes no alignment.
            unsafe { std::ptr::write_unaligned(p as *mut u32, v) }
        }
    }

    /// Write a native-endian `u64` at `gpa`. An out-of-bounds write is
    /// dropped silently (no log line, unlike `write_slice`).
    pub fn write_u64(&self, gpa: u64, v: u64) {
        if let Some(p) = self.translate(gpa, 8) {
            // SAFETY: `translate` proved 8 bytes at `p` are inside the
            // mapping; `write_unaligned` imposes no alignment.
            unsafe { std::ptr::write_unaligned(p as *mut u64, v) }
        }
    }

    /// Read a byte slice from guest memory. On OOB the destination is
    /// filled with zeros and a log line emitted; the worker keeps
    /// running so the guest sees deterministic (if useless) data
    /// rather than crashing the whole worker for one bad descriptor.
    ///
    /// The check is all-or-nothing: a range that only partly overlaps
    /// guest RAM zero-fills the WHOLE destination.
    pub fn read_slice(&self, gpa: u64, dst: &mut [u8]) {
        match self.translate(gpa, dst.len()) {
            // SAFETY: `translate` proved `dst.len()` bytes at `p` are
            // inside the mapping; `dst` is a live exclusive borrow of host
            // memory, which this function assumes is disjoint from the
            // guest mapping.
            Some(p) => unsafe {
                std::ptr::copy_nonoverlapping(p as *const u8, dst.as_mut_ptr(), dst.len())
            },
            None => {
                eprintln!(
                    "[guest-mem] OOB read_slice: gpa={gpa:#x} len={} (base={:#x} len={:#x}); zero-filling",
                    dst.len(), self.inner.base_gpa, self.inner.len
                );
                dst.fill(0);
            }
        }
    }

    /// Write a byte slice into guest memory. OOB writes are dropped
    /// with a log line.
    ///
    /// The check is all-or-nothing: a range that only partly overlaps
    /// guest RAM writes nothing. No status is returned, so callers cannot
    /// tell a dropped write from a successful one.
    pub fn write_slice(&self, gpa: u64, src: &[u8]) {
        match self.translate(gpa, src.len()) {
            // SAFETY: `translate` proved `src.len()` bytes at `p` are
            // inside the mapping; `src` is a live shared borrow of host
            // memory, which this function assumes is disjoint from the
            // guest mapping.
            Some(p) => unsafe { std::ptr::copy_nonoverlapping(src.as_ptr(), p, src.len()) },
            None => {
                eprintln!(
                    "[guest-mem] OOB write_slice: gpa={gpa:#x} len={} (base={:#x} len={:#x}); dropping",
                    src.len(), self.inner.base_gpa, self.inner.len
                );
            }
        }
    }

    /// Direct host pointer for a `(gpa, len)` range. Use when a
    /// device wants to recv/read straight into guest memory.
    /// Returns null on OOB — callers MUST check.
    ///
    /// A non-null result is valid for exactly `len` bytes.
    pub fn host_ptr(&self, gpa: u64, len: usize) -> *mut u8 {
        self.translate(gpa, len).unwrap_or(std::ptr::null_mut())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Guest physical address at which the test window is mapped.
    const BASE: u64 = 0x10_0000;

    /// Allocate a known-good 4 KiB host buffer + wrap it. Tests can
    /// then issue GPA reads/writes inside [0x10_0000, 0x10_1000).
    ///
    /// The returned `Vec` owns the backing storage and must be kept alive
    /// for as long as the `GuestMem` is used.
    fn make_mem() -> (GuestMem, Vec<u8>) {
        let mut buf = vec![0u8; 4096];
        // Pre-fill with a sentinel so reads can tell zero-on-OOB
        // (the security path) from zero-because-empty (data).
        for (i, b) in buf.iter_mut().enumerate() {
            *b = (i & 0xff) as u8;
        }
        let mem = GuestMem::new(buf.as_mut_ptr(), BASE, buf.len());
        (mem, buf)
    }

    /// Build a standalone descriptor that is either device-writable or
    /// device-readable, with no `next` link.
    fn desc(addr: u64, len: u32, writable: bool) -> Desc {
        Desc {
            addr,
            len,
            flags: if writable { VRING_DESC_F_WRITE } else { 0 },
            next: 0,
        }
    }

    /// Build a size-4 queue whose avail ring publishes exactly one entry,
    /// `head`, in slot 0. Layout inside the 4 KiB window: descriptor table
    /// at +0x000, avail ring at +0x400, used ring at +0x600.
    fn one_entry_queue(mem: GuestMem, head: u16) -> Queue {
        let avail_ring = BASE + 0x400;
        mem.write_u16(avail_ring + 4, head); // avail.ring[0]
        mem.write_u16(avail_ring + 2, 1); // avail.idx
        let mut q = Queue::new(mem);
        q.size = 4;
        q.ready = true;
        q.desc_table = BASE;
        q.avail_ring = avail_ring;
        q.used_ring = BASE + 0x600;
        q
    }

    #[test]
    fn read_readable_capped_concatenates_only_readable() {
        let (mem, _buf) = make_mem();
        mem.write_slice(BASE + 0x100, b"AAAA");
        mem.write_slice(BASE + 0x200, b"BBBB");
        // [readable 4][writable 4 — skipped][readable 4]
        let chain = [
            desc(BASE + 0x100, 4, false),
            desc(BASE + 0x300, 4, true),
            desc(BASE + 0x200, 4, false),
        ];
        let got = read_readable_capped(&chain, &mem, 1024).unwrap();
        assert_eq!(got, b"AAAABBBB", "writable descriptor excluded");
    }

    #[test]
    fn read_readable_capped_empty_and_overcap() {
        let (mem, _buf) = make_mem();
        // Empty chain → empty buffer, never a panic.
        assert_eq!(read_readable_capped(&[], &mem, 1024), Some(Vec::new()));
        // A readable descriptor over the cap → None (no unbounded alloc).
        let chain = [desc(BASE, 4096, false)];
        assert_eq!(read_readable_capped(&chain, &mem, 64), None);
        // Exactly at the cap is accepted.
        assert!(read_readable_capped(&chain, &mem, 4096).is_some());
    }

    #[test]
    fn write_writable_bounded_and_skips_readable() {
        let (mem, _buf) = make_mem();
        // [writable 4][readable 4 — skipped][writable 4]; payload 8 bytes
        // must land in the two writable descriptors only.
        let chain = [
            desc(BASE + 0x100, 4, true),
            desc(BASE + 0x200, 4, false),
            desc(BASE + 0x300, 4, true),
        ];
        let n = write_writable(&chain, &mem, b"12345678");
        assert_eq!(n, 8);
        let mut a = [0u8; 4];
        let mut b = [0u8; 4];
        mem.read_slice(BASE + 0x100, &mut a);
        mem.read_slice(BASE + 0x300, &mut b);
        assert_eq!(&a, b"1234");
        assert_eq!(&b, b"5678");
        // The readable descriptor's region was NOT written: it still holds
        // the make_mem() sentinel for offsets 0x200..0x204.
        let mut skipped = [0xffu8; 4];
        mem.read_slice(BASE + 0x200, &mut skipped);
        assert_eq!(skipped, [0u8, 1, 2, 3], "readable region left untouched");
    }

    #[test]
    fn write_writable_truncates_to_descriptor_capacity() {
        let (mem, _buf) = make_mem();
        // Only 4 writable bytes available; an 8-byte payload writes 4.
        let chain = [desc(BASE + 0x100, 4, true)];
        assert_eq!(write_writable(&chain, &mem, b"12345678"), 4);
        // Empty chain → 0, no panic.
        assert_eq!(write_writable(&[], &mem, b"x"), 0);
        // Payload shorter than the descriptor → only payload bytes.
        assert_eq!(write_writable(&chain, &mem, b"hi"), 2);
    }

    #[test]
    fn size_zero_queue_does_not_divide_by_zero() {
        // A hostile guest can write QueueNum=0 via MMIO (the shell stores
        // the size verbatim). pop_chain / add_used do `idx % size`; size 0
        // must be treated as an empty/unconfigured queue, NOT panic the
        // worker. Pre-fix this divided by zero in `% self.size`.
        let (mem, _buf) = make_mem();
        // avail.idx != last_avail_idx so a pre-fix pop_chain would reach
        // the `% size` site rather than the early `avail == last` return.
        let avail_ring = BASE + 0x100;
        mem.write_u16(avail_ring + 2, 1);
        let mut q = Queue::new(mem);
        q.size = 0;
        q.ready = true;
        q.avail_ring = avail_ring;
        q.used_ring = BASE + 0x200;
        assert!(q.pop_chain().is_none(), "size-0 queue yields no chains");
        q.add_used(0, 0); // must not panic either
    }

    #[test]
    fn in_bounds_read_returns_real_data() {
        let (mem, _buf) = make_mem();
        // offset 0 in our mmap → sentinel byte 0
        assert_eq!(mem.read_u32(BASE), 0x03020100);
        // last 4 bytes
        assert_eq!(mem.read_u32(BASE + 4096 - 4), {
            // bytes 4092..4096 = 0xfc, 0xfd, 0xfe, 0xff
            u32::from_le_bytes([0xfc, 0xfd, 0xfe, 0xff])
        });
    }

    #[test]
    fn underflow_is_caught_not_uaf() {
        // gpa < base → underflow. Pre-0.5.2 this would compute a
        // huge usize offset and host.add(off) would produce an
        // arbitrary pointer, then read_unaligned would dereference.
        // Now translate returns None and we return 0.
        let (mem, _buf) = make_mem();
        assert_eq!(mem.read_u32(BASE - 1), 0);
        assert_eq!(mem.read_u32(0x0), 0);
        assert_eq!(mem.read_u64(u64::MAX - 100), 0);
    }

    #[test]
    fn overflow_past_end_is_caught() {
        let (mem, _buf) = make_mem();
        // Last legitimate u32 read returns the real sentinel bytes.
        assert_eq!(
            mem.read_u32(BASE + 4096 - 4),
            u32::from_le_bytes([0xfc, 0xfd, 0xfe, 0xff])
        );
        // Shifted by one byte, the read straddles the end → zero.
        assert_eq!(mem.read_u32(BASE + 4096 - 3), 0);
        // A slice read that starts in bounds but runs 4 bytes past the end
        // is rejected as a whole: the ENTIRE destination is zero-filled,
        // not just the out-of-range tail. Start from a non-zero pattern so
        // the zero-fill is actually observable.
        let mut dst = [0xaau8; 8];
        mem.read_slice(BASE + 4096 - 4, &mut dst[..]);
        assert_eq!(dst, [0u8; 8], "partially-OOB read zero-fills everything");
    }

    #[test]
    fn oob_write_is_silently_dropped() {
        // Hostile guest tells us to write to addresses outside our mmap
        // (before it, far beyond it, and straddling its end). Must NOT
        // corrupt anything; just log + drop.
        let (mem, _buf) = make_mem();
        mem.write_u32(0x0, 0xdeadbeef);
        mem.write_slice(u64::MAX - 100, &[0xff; 100]);
        mem.write_slice(BASE + 4096 - 4, &[0xff; 8]);
        mem.write_u64(BASE + 4096 - 7, u64::MAX);
        // The whole window must still hold the make_mem() sentinel.
        let mut after = vec![0u8; 4096];
        mem.read_slice(BASE, &mut after);
        for (i, b) in after.iter().enumerate() {
            assert_eq!(*b, (i & 0xff) as u8, "byte {i} was modified by an OOB write");
        }
    }

    #[test]
    fn u64_read_straddling_end_is_zeroed() {
        let (mem, _buf) = make_mem();
        // Read at exactly the boundary — last legal 8 bytes.
        assert_eq!(
            mem.read_u64(BASE + 4096 - 8),
            u64::from_le_bytes([0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff])
        );
        // Read past the boundary by 1 byte.
        assert_eq!(mem.read_u64(BASE + 4096 - 7), 0);
    }

    #[test]
    fn descriptor_with_oob_next_terminates_chain() {
        // Pre-0.5.2, a descriptor.next > queue.size produced an
        // arbitrary read (off the desc table). Now we treat OOB
        // next as end-of-chain.
        let (mem, _buf) = make_mem();
        // desc[0] claims to continue (F_NEXT) at index 9, which is outside
        // the size-4 descriptor table.
        mem.write_u64(BASE, BASE + 0x800);
        mem.write_u32(BASE + 8, 16);
        mem.write_u16(BASE + 12, VRING_DESC_F_NEXT);
        mem.write_u16(BASE + 14, 9);
        let mut q = one_entry_queue(mem, 0);

        let (head, chain) = q.pop_chain().expect("one chain is available");
        assert_eq!(head, 0);
        assert_eq!(chain.len(), 1, "walk stops at the out-of-range `next`");
        assert_eq!(chain[0].addr, BASE + 0x800);
        assert_eq!(chain[0].len, 16);
        assert_eq!(chain[0].next, 9);
        assert!(q.pop_chain().is_none(), "ring is drained after one pop");
    }

    #[test]
    fn oob_head_index_yields_empty_chain() {
        // The head index itself is guest-controlled. A head outside the
        // descriptor table consumes the avail slot but produces an EMPTY
        // chain — callers must therefore never index chain[0] blindly.
        let (mem, _buf) = make_mem();
        let mut q = one_entry_queue(mem, 200);
        let (head, chain) = q.pop_chain().expect("slot is consumed even for a bogus head");
        assert_eq!(head, 200);
        assert!(chain.is_empty());
        assert!(q.pop_chain().is_none());
    }

    // ── Property-based hardening of the descriptor-chain walk ──────
    // `pop_chain` walks a guest-controlled descriptor table following
    // `next` links. A hostile or buggy guest can put ANY bytes there.
    // The invariants that MUST hold for every possible guest memory:
    //   * it never panics (no OOB deref, no integer-overflow panic),
    //   * it always terminates (no infinite `next`-cycle),
    //   * the returned chain is bounded by the queue size (a longer
    //     chain would mean a revisited / runaway descriptor).
    // These are exactly the guarantees the `0..self.size` cap and the
    // `idx >= self.size` end-of-chain break (and translate()'s bounds
    // check) provide; this test locks them against regression across
    // arbitrary inputs, not just the hand-picked cases above.
    use proptest::prelude::*;

    const PT_BASE: u64 = 0x10_0000;
    const PT_LEN: usize = 8192;

    proptest! {
        #![proptest_config(ProptestConfig::with_cases(512))]

        #[test]
        fn pop_chain_is_panic_free_terminates_and_bounded(
            seed in any::<u64>(),
            size_log in 0u32..=8,        // queue size 1..=256 (power of two)
            last_avail in any::<u16>(),
            used_head in any::<u16>(),
            used_len in any::<u32>(),
        ) {
            // Arbitrary guest memory: fill the whole window from a seed
            // (xorshift; `| 1` keeps the state non-zero).
            let mut buf = vec![0u8; PT_LEN];
            let mut x = seed | 1;
            for b in buf.iter_mut() {
                x ^= x << 13; x ^= x >> 7; x ^= x << 17;
                *b = (x & 0xff) as u8;
            }
            let mem = GuestMem::new(buf.as_mut_ptr(), PT_BASE, PT_LEN);

            let size: u16 = 1 << size_log;
            // Lay the three rings out non-overlapping inside the window.
            // Worst case (size=256): desc 4096B, avail 516B, used 2052B —
            // all fit in 8 KiB.
            let desc_table = PT_BASE;             // [0, size*16)
            let avail_ring = PT_BASE + 4096;      // [4096, +4+size*2)
            let used_ring  = PT_BASE + 6000;      // [6000, +4+size*8)

            // Force entry into the walk: make avail.idx differ from
            // last_avail_idx so pop_chain actually exercises the chain
            // logic (not the early `avail == last_avail_idx` return).
            mem.write_u16(avail_ring + 2, last_avail.wrapping_add(1));

            let mut q = Queue::new(mem);
            q.size = size;
            q.ready = true;
            q.desc_table = desc_table;
            q.avail_ring = avail_ring;
            q.used_ring = used_ring;
            q.last_avail_idx = last_avail;

            // Must not panic; must terminate; chain length bounded by size.
            if let Some((_head, chain)) = q.pop_chain() {
                prop_assert!(
                    chain.len() <= size as usize,
                    "chain len {} exceeded queue size {}",
                    chain.len(), size,
                );
            }

            // add_used over arbitrary head/len must also stay panic-free
            // and in-bounds (writes are translate()-checked).
            q.add_used(used_head, used_len);
        }
    }
}