supermachine 0.7.72

Run any OCI/Docker image as a hardware-isolated microVM on macOS HVF (Linux KVM and Windows WHP in progress). Single library API, zero flags for the common case, sub-100 ms cold-restore from snapshot.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
//! HVF backend: per-vCPU register-file snapshot (capture/restore).
//!
//! This is the HVF implementation of the seam's
//! [`HypervisorVcpu::capture_snapshot`](crate::hypervisor::HypervisorVcpu::capture_snapshot)
//! / `restore_snapshot` — the aarch64 GP/SIMD/sysreg + GICv3 ICC/ICH/redistributor
//! register file. It lives in the backend module (not in the cross-backend
//! `vmm::snapshot` pipeline) because it is intrinsically HVF + aarch64 (it pokes
//! `applevisor_sys` register enums and `hv_vcpu_*` accessors). `vmm::snapshot`
//! consumes [`PerVcpuState`] as the active backend's opaque snapshot state and
//! drives capture/restore through the seam.

use applevisor_sys as av;

use crate::hvf::Vcpu;

/// One vCPU's restorable register state for a snapshot.
///
/// Each `(u32, value)` pair stores a register identifier next to the value
/// read for it; the per-field docs name the `applevisor_sys` register enum
/// whose `as u32` cast the identifier comes from.
///
/// `Debug`, `PartialEq` and `Eq` are derived so a state can be logged and two
/// states (e.g. before/after a serialization round trip) compared as a whole.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct PerVcpuState {
    /// General-purpose registers, keyed by `hv_reg_t as u32`.
    pub gp_regs: Vec<(u32, u64)>,
    /// SIMD/FP registers, keyed by `hv_simd_fp_reg_t as u32`.
    pub simd_regs: Vec<(u32, u128)>,
    /// System registers, keyed by `hv_sys_reg_t as u32`.
    pub sys_regs: Vec<(u32, u64)>,
    /// GIC CPU-interface (ICC) registers, keyed by `hv_gic_icc_reg_t as u32`.
    pub icc_regs: Vec<(u32, u64)>,
    /// GIC redistributor registers, keyed by GICR offset.
    pub redist_regs: Vec<(u32, u64)>,
    /// Virtual-timer offset read from the vCPU at capture time.
    pub vtimer_offset: u64,
    /// Per-PE GIC EL2 ICH (interrupt-controller hypervisor)
    /// registers: List Registers (LR0_EL2..LR15_EL2), HCR_EL2, VMCR_EL2,
    /// AP0R0_EL2, AP1R0_EL2. Apple's `hv_gic_state_*` blob does NOT
    /// cover these — they live per-vCPU and are accessed via
    /// `hv_gic_get_ich_reg` / `hv_gic_set_ich_reg`. Stored as
    /// `(hv_gic_ich_reg_t as u32, value)`. Empty on snapshots
    /// loaded from v8 or earlier files; the capture path always emits them.
    pub ich_regs: Vec<(u32, u64)>,
}

impl PerVcpuState {
    /// Returns the program counter recorded in this snapshot.
    ///
    /// Looks up the first `gp_regs` entry whose id equals `hv_reg_t::PC as u32`
    /// and yields its value, or `None` when no such entry exists. Lets callers
    /// (e.g. the coordinator's trace logging) read PC without knowing the
    /// backend's register-id encoding.
    pub fn pc(&self) -> Option<u64> {
        let wanted = av::hv_reg_t::PC as u32;
        self.gp_regs
            .iter()
            .find_map(|&(reg_id, value)| (reg_id == wanted).then_some(value))
    }
}

// ----- Register enumerations -----

/// Lists the general-purpose registers handled per vCPU, in capture order:
/// X0..X30 followed by FP, LR, PC, CPSR, FPCR and FPSR (37 entries).
///
/// NOTE(review): in Apple's Hypervisor headers FP and LR are aliases of X29
/// and X30; if `applevisor_sys` mirrors that, those two entries repeat ids
/// already in the list — confirm before relying on the ids being unique.
fn gp_reg_enum() -> Vec<av::hv_reg_t> {
    let base = av::hv_reg_t::X0 as u32;
    let mut regs: Vec<av::hv_reg_t> = Vec::with_capacity(37);
    regs.extend((0..=30u32).map(|idx| {
        // SAFETY: X0..X30 are 31 contiguous variants starting at X0.
        unsafe { std::mem::transmute::<u32, av::hv_reg_t>(base + idx) }
    }));
    regs.extend([
        av::hv_reg_t::FP,
        av::hv_reg_t::LR,
        av::hv_reg_t::PC,
        av::hv_reg_t::CPSR,
        av::hv_reg_t::FPCR,
        av::hv_reg_t::FPSR,
    ]);
    regs
}

/// Lists the 32 SIMD/FP registers Q0..Q31 in ascending order.
fn simd_reg_enum() -> Vec<av::hv_simd_fp_reg_t> {
    let base = av::hv_simd_fp_reg_t::Q0 as u32;
    let mut regs = Vec::with_capacity(32);
    for idx in 0..32u32 {
        // SAFETY: Q0..Q31 are 32 contiguous variants starting at Q0.
        regs.push(unsafe { std::mem::transmute::<u32, av::hv_simd_fp_reg_t>(base + idx) });
    }
    regs
}

fn sys_reg_enum() -> Vec<av::hv_sys_reg_t> {
    use av::hv_sys_reg_t::*;
    vec![
        MPIDR_EL1,
        SCTLR_EL1,
        CPACR_EL1,
        TCR_EL1,
        TTBR0_EL1,
        TTBR1_EL1,
        MAIR_EL1,
        AMAIR_EL1,
        VBAR_EL1,
        CONTEXTIDR_EL1,
        TPIDR_EL1,
        SPSR_EL1,
        ELR_EL1,
        SP_EL0,
        SP_EL1,
        ESR_EL1,
        FAR_EL1,
        PAR_EL1,
        TPIDR_EL0,
        TPIDRRO_EL0,
        CNTKCTL_EL1,
        CSSELR_EL1,
        MDSCR_EL1,
        // Pointer-auth keys (CONFIG_ARM64_PTR_AUTH guests OOPS without these).
        APIAKEYLO_EL1,
        APIAKEYHI_EL1,
        APIBKEYLO_EL1,
        APIBKEYHI_EL1,
        APDAKEYLO_EL1,
        APDAKEYHI_EL1,
        APDBKEYLO_EL1,
        APDBKEYHI_EL1,
        APGAKEYLO_EL1,
        APGAKEYHI_EL1,
        // Vtimer CTL + deadline (HVF DOES accept these despite earlier rumours).
        CNTV_CTL_EL0,
        CNTV_CVAL_EL0,
        CNTP_CTL_EL0,
        CNTP_CVAL_EL0,
        // CNTVOFF_EL2 is captured via hv_vcpu_get_vtimer_offset, not as a sysreg.
    ]
}

fn icc_reg_enum() -> Vec<av::hv_gic_icc_reg_t> {
    use av::hv_gic_icc_reg_t::*;
    vec![
        PMR_EL1,
        BPR0_EL1,
        BPR1_EL1,
        AP0R0_EL1,
        AP1R0_EL1,
        RPR_EL1,
        CTLR_EL1,
        SRE_EL1,
        IGRPEN0_EL1,
        IGRPEN1_EL1,
    ]
}

/// Per-PE GIC EL2 ICH registers — the LRs (16 of them) plus the
/// vCPU's hypervisor-side IRQ state (HCR_EL2, VMCR_EL2, AP*0R0_EL2).
/// Apple's `hv_gic_state_get_data()` opaque blob explicitly excludes
/// these per its header doc; without capturing them, a secondary
/// vCPU snapshotted with pending virtual IRQs in its LRs comes back
/// from restore with empty LRs and either drops the IRQs (kernel
/// hangs waiting for completion) or — worse — hits stale state in
/// VMCR/HCR that doesn't match the rest of GIC, panicking on the
/// first IRQ delivery after resume.
///
/// VTR_EL2 / MISR_EL2 / EISR_EL2 / ELRSR_EL2 are read-only status
/// registers (Architecturally derived from LR contents); we don't
/// capture them — restore would fail on write anyway.
fn ich_reg_enum() -> Vec<av::hv_gic_ich_reg_t> {
    use av::hv_gic_ich_reg_t::*;
    vec![
        AP0R0_EL2, AP1R0_EL2, HCR_EL2, VMCR_EL2, LR0_EL2, LR1_EL2, LR2_EL2, LR3_EL2, LR4_EL2,
        LR5_EL2, LR6_EL2, LR7_EL2, LR8_EL2, LR9_EL2, LR10_EL2, LR11_EL2, LR12_EL2, LR13_EL2,
        LR14_EL2, LR15_EL2,
    ]
}

/// Per-vCPU redistributor offsets QEMU v11 captures explicitly because
/// the opaque blob doesn't cover them. From hw/intc/arm_gicv3_hvf.c.
///
/// Returns 13 offsets: five fixed registers followed by
/// GICR_IPRIORITYR0..7 (eight consecutive 32-bit registers). The vector's
/// capacity is derived from those two counts so it always matches the number
/// of entries actually pushed.
fn redist_reg_offsets() -> Vec<u32> {
    /// Fixed registers, in capture order.
    const FIXED: [u32; 5] = [
        0x10080, // GICR_IGROUPR0
        0x10100, // GICR_ISENABLER0
        0x10C04, // GICR_ICFGR1
        0x10200, // GICR_ISPENDR0  (vtimer PPI 27 lives here)
        0x10300, // GICR_ISACTIVER0
    ];
    /// Offset of GICR_IPRIORITYR0; registers 1..7 follow at 4-byte strides.
    const IPRIORITYR_BASE: u32 = 0x10400;
    const IPRIORITYR_COUNT: u32 = 8;

    let mut offsets = Vec::with_capacity(FIXED.len() + IPRIORITYR_COUNT as usize);
    offsets.extend_from_slice(&FIXED);
    offsets.extend((0..IPRIORITYR_COUNT).map(|n| IPRIORITYR_BASE + 4 * n));
    offsets
}

// ----- Capture -----

/// Reads `vcpu`'s register file into a fresh [`PerVcpuState`].
///
/// Registers are read group by group in this order: GP, SIMD/FP, sysregs,
/// ICC, redistributor, ICH, and finally the vtimer offset.
///
/// # Errors
///
/// A failed GP read, SIMD/FP read or vtimer-offset read aborts the capture
/// and is returned to the caller. Sysreg, ICC, redistributor and ICH reads
/// are best-effort: a register whose read fails is simply left out of the
/// resulting state.
pub fn capture_vcpu_state(vcpu: &Vcpu) -> crate::hvf::Result<PerVcpuState> {
    // Mandatory groups: the first failing read short-circuits the collect.
    let gp_regs = gp_reg_enum()
        .into_iter()
        .map(|reg| vcpu.get_reg(reg).map(|val| (reg as u32, val)))
        .collect::<Result<Vec<_>, _>>()?;
    let simd_regs = simd_reg_enum()
        .into_iter()
        .map(|reg| vcpu.get_simd_fp_reg(reg).map(|val| (reg as u32, val)))
        .collect::<Result<Vec<_>, _>>()?;

    // Best-effort groups: some registers may be unreadable in certain
    // states; drop those instead of failing the whole capture.
    let sys_regs: Vec<_> = sys_reg_enum()
        .into_iter()
        .filter_map(|reg| vcpu.get_sys_reg(reg).ok().map(|val| (reg as u32, val)))
        .collect();
    let icc_regs: Vec<_> = icc_reg_enum()
        .into_iter()
        .filter_map(|reg| vcpu.get_icc_reg(reg).ok().map(|val| (reg as u32, val)))
        .collect();
    let redist_regs: Vec<_> = redist_reg_offsets()
        .into_iter()
        .filter_map(|off| {
            // SAFETY: offsets are valid GICR register enum variants.
            let reg: av::hv_gic_redistributor_reg_t = unsafe { std::mem::transmute(off) };
            vcpu.get_redist_reg(reg).ok().map(|val| (off, val))
        })
        .collect();

    // ICH (EL2) registers hold the per-PE GIC virtual-interrupt state that
    // `hv_gic_state_get_data`'s opaque blob does NOT cover. Capturing them is
    // the root-cause fix for multi-vCPU snapshot intermittency: secondaries
    // captured mid-flight have pending interrupts in their LRs and
    // active-priority bits in AP*R0, and without capture+restore the kernel
    // sees phantom IRQ state at resume and panics or hangs in interrupt
    // context.
    let ich_regs: Vec<_> = ich_reg_enum()
        .into_iter()
        .filter_map(|reg| vcpu.get_ich_reg(reg).ok().map(|val| (reg as u32, val)))
        .collect();

    let vtimer_offset = vcpu.get_vtimer_offset()?;

    Ok(PerVcpuState {
        gp_regs,
        simd_regs,
        sys_regs,
        icc_regs,
        redist_regs,
        vtimer_offset,
        ich_regs,
    })
}

// ----- Restore -----

/// Writes a captured [`PerVcpuState`] back into `vcpu`.
///
/// `st.vtimer_offset` is NOT applied here: the vtimer offset is the caller's
/// responsibility (one coherent value across all vCPUs).
///
/// Write order: sysregs, ICC, SIMD/FP, redistributor, ICH, vtimer unmask,
/// optional vtimer force-fire, and GP registers last.
///
/// # Errors
///
/// Returns the underlying error when an MMU-critical sysreg write, a SIMD/FP
/// write, a redistributor write, the force-fire `CNTV_CVAL_EL0` write or a GP
/// write fails. All other writes are best-effort and their failures ignored.
///
/// NOTE(review): register ids in `st` are transmuted straight into the
/// `applevisor_sys` enums. That is only sound for ids that were produced by
/// casting such an enum (as capture does); a state deserialized from a
/// corrupt file could carry other values — confirm ids are validated before
/// they reach this function.
pub fn restore_vcpu_state(vcpu: &Vcpu, st: &PerVcpuState) -> crate::hvf::Result<()> {
    use av::hv_gic_icc_reg_t as I;
    use av::hv_gic_ich_reg_t as H;
    use av::hv_reg_t as R;
    use av::hv_simd_fp_reg_t as Q;
    use av::hv_sys_reg_t as S;

    // GICR offsets touched below.
    const GICR_IGROUPR0: u32 = 0x10080;
    const GICR_ISENABLER0: u32 = 0x10100;
    const GICR_ICENABLER0: u32 = 0x10180;
    const GICR_ISPENDR0: u32 = 0x10200;
    const GICR_ICPENDR0: u32 = 0x10280;
    const GICR_ISACTIVER0: u32 = 0x10300;
    const GICR_ICACTIVER0: u32 = 0x10380;
    const GICR_IPRIORITYR0: u32 = 0x10400;
    const GICR_ICFGR1: u32 = 0x10C04;
    /// Write-1-to-clear mask covering every bit of a 32-bit GICR register.
    const CLEAR_ALL: u64 = 0xFFFF_FFFF;
    /// Bit of the vtimer PPI (27) within GICR_ISPENDR0.
    const VTIMER_PPI_BIT: u64 = 1u64 << 27;
    /// CNTV_CTL_EL0 bit 0 (value 1), read below as "timer enabled".
    const CNTV_CTL_ENABLE: u64 = 1;
    /// CNTV_CTL_EL0 bit 1 (value 2), read below as "timer interrupt masked".
    const CNTV_CTL_IMASK: u64 = 2;

    // 1. Sysregs first (MMU, exception state, pointer-auth, vtimer). Some
    //    may be read-only, so only MMU-critical write failures are fatal.
    let mmu_critical_ids = [
        S::SCTLR_EL1 as u32,
        S::TCR_EL1 as u32,
        S::TTBR0_EL1 as u32,
        S::TTBR1_EL1 as u32,
        S::MAIR_EL1 as u32,
        S::VBAR_EL1 as u32,
    ];
    for &(raw_id, value) in &st.sys_regs {
        // SAFETY: raw_id originated from a sys_reg_enum() variant.
        let reg: S = unsafe { std::mem::transmute(raw_id) };
        match vcpu.set_sys_reg(reg, value) {
            Err(err) if mmu_critical_ids.contains(&raw_id) => return Err(err),
            _ => {}
        }
    }

    // 2. ICC: SRE_EL1 first (it gates subsequent ICC writes), then
    //    everything except IGRPEN0/1, and IGRPEN0/1 last so delivery is
    //    unmasked only once PMR/BPR/AP state is in place.
    let captured_icc = |want: I| -> Option<u64> {
        let want_id = want as u32;
        st.icc_regs
            .iter()
            .find(|(raw_id, _)| *raw_id == want_id)
            .map(|(_, value)| *value)
    };
    if let Some(value) = captured_icc(I::SRE_EL1) {
        let _ = vcpu.set_icc_reg(I::SRE_EL1, value);
    }
    for &(raw_id, value) in &st.icc_regs {
        // SAFETY: raw_id originated from an icc_reg_enum() variant.
        let reg: I = unsafe { std::mem::transmute(raw_id) };
        if matches!(reg, I::SRE_EL1 | I::IGRPEN0_EL1 | I::IGRPEN1_EL1) {
            continue;
        }
        let _ = vcpu.set_icc_reg(reg, value);
    }
    for group_enable in [I::IGRPEN0_EL1, I::IGRPEN1_EL1] {
        if let Some(value) = captured_icc(group_enable) {
            let _ = vcpu.set_icc_reg(group_enable, value);
        }
    }

    // 3. SIMD/FP registers.
    for &(raw_id, value) in &st.simd_regs {
        // SAFETY: raw_id originated from a simd_reg_enum() variant.
        let reg: Q = unsafe { std::mem::transmute(raw_id) };
        vcpu.set_simd_fp_reg(reg, value)?;
    }

    // 4. Per-vCPU redistributor registers. Order follows QEMU v11
    //    arm_gicv3_hvf.c: group/config/priority first, then each
    //    enable/pending/active mask is CLEARED before it is SET (otherwise
    //    the result would be the OR of default and captured bits).
    //    A register missing from the captured state is written as 0.
    let captured_gicr = |off: u32| -> u64 {
        st.redist_regs
            .iter()
            .find(|(o, _)| *o == off)
            .map_or(0, |(_, value)| *value)
    };
    let write_gicr = |off: u32, val: u64| -> crate::hvf::Result<()> {
        // SAFETY: off is one of the GICR offsets named above; transmute matches repr.
        let reg: av::hv_gic_redistributor_reg_t = unsafe { std::mem::transmute(off) };
        vcpu.set_redist_reg(reg, val)
    };
    for off in [GICR_IGROUPR0, GICR_ICFGR1] {
        write_gicr(off, captured_gicr(off))?;
    }
    for n in 0..8u32 {
        let off = GICR_IPRIORITYR0 + 4 * n;
        write_gicr(off, captured_gicr(off))?;
    }
    for (clear_off, set_off) in [
        (GICR_ICENABLER0, GICR_ISENABLER0),
        (GICR_ICPENDR0, GICR_ISPENDR0),
        (GICR_ICACTIVER0, GICR_ISACTIVER0),
    ] {
        write_gicr(clear_off, CLEAR_ALL)?;
        write_gicr(set_off, captured_gicr(set_off))?;
    }

    // 4b. ICH (EL2) registers go last among GIC state so they sit on top of
    //     a fully restored redistributor. AP0R0_EL2 / AP1R0_EL2 carry the
    //     active-priority bitmap that has to match the ISACTIVER bits just
    //     written. Entries are written in stored order, which for a captured
    //     state puts the List Registers after AP*/HCR/VMCR, so the next
    //     hv_vcpu_run sees whatever virtual IRQs were in flight at capture
    //     time and the guest's interrupt handling resumes mid-flight without
    //     dropping or duplicating IRQs. Best-effort per register: an ICH
    //     write can fail on HVF (e.g. reserved bits), and such a failure is
    //     skipped rather than aborting the whole restore.
    for &(raw_id, value) in &st.ich_regs {
        // SAFETY: raw_id originated from an ich_reg_enum() variant.
        let reg: H = unsafe { std::mem::transmute(raw_id) };
        let _ = vcpu.set_ich_reg(reg, value);
    }

    // 5. Force the vtimer mask off so HVF re-evaluates it on the next run.
    let _ = vcpu.set_vtimer_mask(false);

    // 6. Vtimer force-fire: if the captured vtimer was enabled and not
    //    masked, set CVAL=0 and force-pend its PPI so the guest wakes
    //    immediately instead of waiting on a stale deadline. A state with no
    //    CNTV_CTL_EL0 entry is treated as CTL == 0 (timer disabled).
    let cntv_ctl_id = S::CNTV_CTL_EL0 as u32;
    let cntv_ctl = st
        .sys_regs
        .iter()
        .find(|(raw_id, _)| *raw_id == cntv_ctl_id)
        .map_or(0, |(_, value)| *value);
    if (cntv_ctl & (CNTV_CTL_ENABLE | CNTV_CTL_IMASK)) == CNTV_CTL_ENABLE {
        vcpu.set_sys_reg(S::CNTV_CVAL_EL0, 0)?;
        write_gicr(GICR_ISPENDR0, VTIMER_PPI_BIT)?;
    }

    // 7. GP registers LAST (PC/CPSR finalize the vCPU).
    for &(raw_id, value) in &st.gp_regs {
        // SAFETY: raw_id originated from a gp_reg_enum() variant.
        let reg: R = unsafe { std::mem::transmute(raw_id) };
        vcpu.set_reg(reg, value)?;
    }
    Ok(())
}

#[cfg(test)]
mod tests {
    use super::PerVcpuState;
    use crate::hypervisor::HypervisorVcpu;

    /// The backend vCPU type whose seam implementation is under test.
    type HvfVcpu = crate::hvf::Vcpu;

    /// Round-trips a hand-built register file through the seam's
    /// `write_snapshot_state` / `read_snapshot_state` (Phase 3
    /// snapshot-pipeline groundwork): every field must come back unchanged
    /// and the reader must consume exactly the bytes the writer produced.
    /// Pure logic — no VM needed.
    #[test]
    fn snapshot_state_seam_round_trips() {
        let original = PerVcpuState {
            gp_regs: vec![(0, 0x42), (31, 0xdead_beef)],
            simd_regs: vec![(0, 0x1122_3344_5566_7788_99aa_bbcc_ddee_ff00u128)],
            sys_regs: vec![(7, 0x1234), (9, 0)],
            icc_regs: vec![(1, 0x5)],
            redist_regs: vec![(0x10080, 0xff)],
            vtimer_offset: 0xcafe_f00d,
            ich_regs: vec![(2, 0x9)],
        };

        let mut encoded = Vec::new();
        <HvfVcpu as HypervisorVcpu>::write_snapshot_state(&original, &mut encoded).unwrap();

        let mut reader = std::io::Cursor::new(&encoded);
        let decoded = <HvfVcpu as HypervisorVcpu>::read_snapshot_state(&mut reader).unwrap();

        assert_eq!(decoded.gp_regs, original.gp_regs);
        assert_eq!(decoded.simd_regs, original.simd_regs);
        assert_eq!(decoded.sys_regs, original.sys_regs);
        assert_eq!(decoded.icc_regs, original.icc_regs);
        assert_eq!(decoded.redist_regs, original.redist_regs);
        assert_eq!(decoded.vtimer_offset, original.vtimer_offset);
        assert_eq!(decoded.ich_regs, original.ich_regs);
        assert_eq!(reader.position() as usize, encoded.len(), "consumed exactly");
    }

    /// A corrupt/huge leading count must NOT drive a multi-GB
    /// `Vec::with_capacity` (OOM); the reader should fail fast on the
    /// truncated input. Without a pre-allocation cap this test would abort
    /// the process instead of observing an `Err`.
    #[test]
    fn read_snapshot_state_rejects_huge_count_without_oom() {
        // v9 layout: 8-byte vtimer offset, then the gp_regs count. Here the
        // count claims ~4 billion entries and is immediately followed by EOF.
        let mut input = 0u64.to_le_bytes().to_vec(); // vtimer_offset
        input.extend_from_slice(&u32::MAX.to_le_bytes()); // gp_regs count = 4 billion

        let mut reader = std::io::Cursor::new(input.as_slice());
        let outcome = <HvfVcpu as HypervisorVcpu>::read_snapshot_state(&mut reader);
        assert!(outcome.is_err(), "truncated huge-count input must error, not OOM");
    }
}