supermachine 0.7.69

Run any OCI/Docker image as a hardware-isolated microVM on macOS HVF (Linux KVM and Windows WHP in progress). Single library API, zero flags for the common case, sub-100 ms cold-restore from snapshot.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
//! KVM (Linux/x86_64) implementation of the portable [`crate::hypervisor`]
//! backend contract — the sibling of `crate::hvf`. Increment 1: VM + in-kernel
//! irqchip lifecycle, guest-memory mapping, vCPU creation + CPUID, and the
//! run/exit primitive (`step()` → [`VcpuExit`]).
//!
//! The seam's register model is aarch64-architectural; on x86 those methods
//! (`CoreReg`/`SysReg`) are vestigial — the x86 orchestration sets RIP/CR*/EFER
//! on the concrete [`KvmVcpu`] via [`KvmVcpu::enter_long_mode`] and friends, and
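//! drives the loop through `step()`. Snapshot (`capture_snapshot` /
//! `restore_snapshot`) and cross-thread force-exit ([`KvmVcpuHandle`]) were
//! planned as later increments (6 and 5) and are now implemented in this
//! module as well.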

use std::cell::RefCell;
use std::collections::HashMap;
use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, AtomicUsize, Ordering};
use std::sync::{Arc, Mutex, Once, OnceLock};

use kvm_bindings::{
    kvm_clock_data, kvm_debugregs, kvm_fpu, kvm_irqchip, kvm_lapic_state, kvm_mp_state,
    kvm_msr_entry, kvm_pit_state2, kvm_regs, kvm_segment, kvm_sregs, kvm_userspace_memory_region,
    kvm_vcpu_events, kvm_xcrs, CpuId, Msrs, KVM_MAX_CPUID_ENTRIES, KVM_MEM_READONLY,
    KVM_MP_STATE_UNINITIALIZED,
};
use kvm_ioctls::{IoEventAddress, Kvm, NoDatamatch, VcpuExit as KvmExit, VcpuFd, VmFd};
use vmm_sys_util::eventfd::EventFd;

use crate::hypervisor::{CoreReg, HypervisorVcpu, HypervisorVm, SysReg, VcpuExit, VcpuHandle};

pub mod run;

/// Backend error: a flat message wrapping the underlying KVM errno (the
/// orchestration only ever surfaces these as strings).
///
/// The single field is the fully formatted message. The `From` impls in this
/// module prefix it with the error's origin (`kvm:`, `kvm io:`, `kvm boot:`),
/// and `Display` prints it verbatim.
#[derive(Debug)]
pub struct KvmError(String);

impl KvmError {
    fn unsupported(what: &str) -> Self {
        KvmError(format!("kvm: unsupported ({what})"))
    }
}

impl From<kvm_ioctls::Error> for KvmError {
    fn from(e: kvm_ioctls::Error) -> Self {
        KvmError(format!("kvm: {e}"))
    }
}

impl From<std::io::Error> for KvmError {
    fn from(e: std::io::Error) -> Self {
        KvmError(format!("kvm io: {e}"))
    }
}

impl From<crate::arch::x86_64::boot::BootError> for KvmError {
    fn from(e: crate::arch::x86_64::boot::BootError) -> Self {
        KvmError(format!("kvm boot: {e}"))
    }
}

impl std::fmt::Display for KvmError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.0)
    }
}

// Marker impl: the derived `Debug` plus the `Display` impl supply everything
// `Error` requires. No `source()` override — the cause has already been
// flattened into the message string.
impl std::error::Error for KvmError {}

impl crate::hypervisor::BackendError for KvmError {
    /// Build a backend error from an arbitrary message; the text is stored
    /// as-is, without any `kvm:` prefix.
    fn other(msg: &str) -> Self {
        Self(msg.to_owned())
    }
}

/// A VM + its in-kernel interrupt controller. Owns the `/dev/kvm` handle and
/// the VM fd; tracks memory slots so a region can be unmapped by gpa.
pub struct KvmVm {
    /// The `/dev/kvm` handle the VM was created from. Not referenced again in
    /// the code visible here (hence the underscore); stored so the VM owns it.
    _kvm: Kvm,
    // `Arc` so the seam's `irq_line()` can hand a cloneable, 'static IRQ-raise
    // closure to device threads (they call `set_irq_line` on this fd). All
    // `self.vm.*` calls deref through the Arc unchanged.
    vm: Arc<VmFd>,
    /// Next memory-slot number handed out by `map_ram` (incremented on every
    /// call, never decremented).
    next_slot: AtomicU32,
    /// Next per-VM vCPU index handed out by `create_vcpu`.
    next_vcpu: AtomicU64,
    /// Host-supported CPUID with the CET bits masked off in `create()`;
    /// applied to every vCPU in `create_vcpu`.
    supported_cpuid: CpuId,
    /// gpa → slot, so `unmap_ram` can find the slot to zero.
    slots: Mutex<HashMap<u64, u32>>,
}

impl HypervisorVm for KvmVm {
    type Error = KvmError;
    type Vcpu = KvmVcpu;

    fn create() -> Result<Self, KvmError> {
        let kvm = Kvm::new()?;
        let vm = kvm.create_vm()?;
        // x86 prerequisites for the in-kernel irqchip: a TSS region and the
        // identity-map page must be set before KVM_CREATE_IRQCHIP.
        vm.set_tss_address(0xfffb_d000)?;
        vm.create_irq_chip()?;
        let mut supported_cpuid = kvm.get_supported_cpuid(KVM_MAX_CPUID_ENTRIES)?;
        // Mask CET in leaf 7/0: ECX[7]=SHSTK (user shadow stack), EDX[20]=IBT
        // (indirect-branch tracking). We don't set up CET state, so a kernel
        // that enables it would fault all userspace immediately (no-endbr64
        // entry → #CP, RET → shadow-stack mismatch). Proven necessary by the
        // kvm-boot spike.
        for e in supported_cpuid.as_mut_slice() {
            if e.function == 7 && e.index == 0 {
                e.ecx &= !(1 << 7);
                e.edx &= !(1 << 20);
            }
        }
        Ok(KvmVm {
            _kvm: kvm,
            vm: Arc::new(vm),
            next_slot: AtomicU32::new(0),
            next_vcpu: AtomicU64::new(0),
            supported_cpuid,
            slots: Mutex::new(HashMap::new()),
        })
    }

    /// Map `len` bytes of host memory at `host_ptr` into the guest at `gpa`,
    /// using a freshly allocated memory slot.
    ///
    /// The slot is read-only unless `prot` contains
    /// `crate::hypervisor::prot::WRITE`. On success the `gpa → slot` pair is
    /// recorded so `unmap_ram` can delete the slot later.
    ///
    /// # Safety
    /// The caller must keep the host range valid for as long as it stays
    /// mapped into the VM.
    ///
    /// # Errors
    /// Returns the failure of the memory-region ioctl; nothing is recorded in
    /// the slot table in that case (the slot number is still consumed).
    unsafe fn map_ram(
        &self,
        host_ptr: *mut u8,
        gpa: u64,
        len: usize,
        prot: u64,
    ) -> Result<(), KvmError> {
        let slot = self.next_slot.fetch_add(1, Ordering::Relaxed);
        let writable = (prot & crate::hypervisor::prot::WRITE) != 0;
        let flags = if writable { 0 } else { KVM_MEM_READONLY };
        let region = kvm_userspace_memory_region {
            slot,
            guest_phys_addr: gpa,
            memory_size: len as u64,
            userspace_addr: host_ptr as u64,
            flags,
        };
        // SAFETY: caller guarantees host_ptr stays valid for the VM's lifetime.
        let outcome = unsafe { self.vm.set_user_memory_region(region) };
        outcome?;
        self.slots.lock().unwrap().insert(gpa, slot);
        Ok(())
    }

    /// Remove the memory slot that was mapped at `gpa`, if there is one.
    ///
    /// An unknown `gpa` is a successful no-op. `_len` is unused: the slot is
    /// identified by `gpa` alone.
    ///
    /// # Safety
    /// The caller must ensure no vCPU is accessing the range being removed.
    ///
    /// # Errors
    /// Returns the failure of the memory-region ioctl; the `gpa` has already
    /// been dropped from the slot table by then.
    unsafe fn unmap_ram(&self, gpa: u64, _len: usize) -> Result<(), KvmError> {
        // The guard lives until return, so the table stays locked across the
        // ioctl below.
        let mut slot_table = self.slots.lock().unwrap();
        let Some(slot) = slot_table.remove(&gpa) else {
            return Ok(());
        };
        // A region with memory_size = 0 deletes the slot.
        let deletion = kvm_userspace_memory_region {
            slot,
            guest_phys_addr: gpa,
            memory_size: 0,
            userspace_addr: 0,
            flags: 0,
        };
        // SAFETY: caller guarantees no vCPU is accessing this range.
        let outcome = unsafe { self.vm.set_user_memory_region(deletion) };
        outcome?;
        Ok(())
    }

    /// Create the next vCPU of this VM and register its cross-thread control
    /// block.
    ///
    /// Steps, in order: allocate the per-VM index, create the vCPU fd, apply
    /// the CPUID cached on this VM, make sure the SIGUSR1 handler exists, then
    /// publish a [`VcpuReg`] in the global registry under a fresh
    /// process-global id.
    ///
    /// # Errors
    /// A failure of vCPU creation or of `set_cpuid2` is returned as a
    /// [`KvmError`]; nothing has been registered at that point.
    fn create_vcpu(&self) -> Result<KvmVcpu, KvmError> {
        // Per-VM KVM vCPU index (the APIC id) — must be 0..N within this VM.
        let kvm_index = self.next_vcpu.fetch_add(1, Ordering::Relaxed);
        let mut vcpu = self.vm.create_vcpu(kvm_index)?;
        vcpu.set_cpuid2(&self.supported_cpuid)?;
        // Register the cross-thread control block under a process-global id +
        // ensure the SIGUSR1 handler is installed before this vCPU can run.
        install_force_exit_signal();
        let id = NEXT_REG_ID.fetch_add(1, Ordering::Relaxed);
        // Publish the immediate_exit flag pointer NOW (main thread, before the
        // run thread is spawned). The kvm_run page is an mmap on the vCPU fd
        // with a process-stable address, so a pointer captured here is valid for
        // the lifetime of the fd regardless of which thread later runs it. This
        // closes the start()/teardown race: a force_exit issued before the run
        // thread binds itself (tid==0) still gates guest entry via this flag.
        // The address is stored as a `usize` so it can live in an atomic.
        let immediate_exit_ptr = std::ptr::addr_of_mut!(vcpu.get_kvm_run().immediate_exit) as usize;
        let reg = Arc::new(VcpuReg {
            // 0 = no thread bound yet.
            tid: AtomicU64::new(0),
            exit: AtomicBool::new(false),
            immediate_exit_ptr: AtomicUsize::new(immediate_exit_ptr),
        });
        // The registry and the vCPU share the same control block (Arc clone).
        registry().lock().unwrap().insert(id, reg.clone());
        Ok(KvmVcpu {
            vcpu: RefCell::new(vcpu),
            id,
            reg,
            bound: AtomicBool::new(false),
        })
    }

    fn set_irq(&self, intid: u32, level: bool) -> Result<(), KvmError> {
        // The in-kernel irqchip (IOAPIC/PIC) is per-VM on KVM; drive the GSI line.
        self.vm.set_irq_line(intid, level)?;
        Ok(())
    }

    /// Return a cloneable, thread-safe closure that raises or lowers an
    /// interrupt line on this VM's in-kernel irqchip.
    ///
    /// The closure keeps its own reference to the VM fd, so device threads can
    /// keep calling it after this borrow of `self` ends. A failing ioctl is
    /// deliberately ignored (best-effort delivery).
    fn irq_line(&self) -> Arc<dyn Fn(u32, bool) + Send + Sync> {
        let vm_fd = Arc::clone(&self.vm);
        let drive_line = move |intid: u32, level: bool| {
            let _ = vm_fd.set_irq_line(intid, level);
        };
        Arc::new(drive_line)
    }

    fn capture_intc(&self) -> Result<Vec<u8>, KvmError> {
        // Serialize PIT + 3 irqchips (PIC master/slave + IOAPIC) + kvmclock as
        // length-prefixed POD blobs. (The full snapshot path in `kvm::run` has
        // its own inline serialization for SMSNAP05; this standalone blob is the
        // seam form the backend-agnostic snapshot pipeline consumes.)
        let s = self.capture_devices()?;
        let mut out = Vec::new();
        push_pod(&mut out, &s.pit);
        for chip in &s.irqchips {
            push_pod(&mut out, chip);
        }
        push_pod(&mut out, &s.clock);
        Ok(out)
    }

    fn restore_intc(&self, blob: &[u8]) -> Result<(), KvmError> {
        let mut p = 0usize;
        let pit = read_pod(blob, &mut p)?;
        let irqchips = [
            read_pod(blob, &mut p)?,
            read_pod(blob, &mut p)?,
            read_pod(blob, &mut p)?,
        ];
        let clock = read_pod(blob, &mut p)?;
        self.restore_devices(&KvmDeviceState {
            pit,
            irqchips,
            clock,
        })
    }

    /// Build the DAX mapper for this VM. The mapper gets its own shared
    /// reference to the VM, which it keeps for memslot map/unmap.
    fn dax_mapper(self: &Arc<Self>) -> Arc<dyn crate::fuse::HvfMapper> {
        let shared_vm = Arc::clone(self);
        crate::kvm::run::kvm_dax_mapper(shared_vm)
    }

    /// Current `CLOCK_MONOTONIC` reading in nanoseconds (wrapping arithmetic).
    ///
    /// Currently informational on KVM: the guest clock travels in the
    /// VM-global device blob (kvm_clock_data via capture_intc/restore_intc) and
    /// KVM re-anchors the TSC on KVM_SET_CLOCK, so capture_clock_ref /
    /// restore_clock ignore this value. Kept correct for when vmm::snapshot
    /// drives both backends through one container.
    fn host_monotonic_ticks() -> u64 {
        const NANOS_PER_SEC: u64 = 1_000_000_000;

        let mut now = libc::timespec {
            tv_sec: 0,
            tv_nsec: 0,
        };
        // SAFETY: writes into a local timespec; CLOCK_MONOTONIC is always present.
        unsafe { libc::clock_gettime(libc::CLOCK_MONOTONIC, &mut now) };

        let whole_seconds = now.tv_sec as u64;
        let sub_second_nanos = now.tv_nsec as u64;
        whole_seconds
            .wrapping_mul(NANOS_PER_SEC)
            .wrapping_add(sub_second_nanos)
    }

    /// Lay out a Linux boot image in guest RAM and point the BSP at its entry.
    ///
    /// x86: `setup_boot` writes kernel + initrd + boot_params + GDT into `mem`
    /// and computes the long-mode entry registers, which are then applied to
    /// `vcpu`. The FDT field of `cfg` is unused (x86 has no device tree; the
    /// cmdline carries the virtio-mmio device list). SMP (MP table) is the
    /// caller's concern.
    ///
    /// # Errors
    /// Returns a boot-setup failure (converted to [`KvmError`]) or the failure
    /// of applying the registers.
    fn boot_linux(
        &self,
        vcpu: &KvmVcpu,
        mem: &mut [u8],
        cfg: &crate::hypervisor::LinuxBootConfig,
    ) -> Result<(), KvmError> {
        use crate::arch::x86_64::boot::{setup_boot, BootConfig};

        let boot_cfg = BootConfig {
            mem_size: cfg.ram_size,
            cmdline: cfg.cmdline,
            bzimage: cfg.kernel,
            initrd: cfg.initrd,
        };
        let entry_regs = setup_boot(mem, &boot_cfg)?;
        vcpu.apply_boot_regs(&entry_regs)
    }
}

/// Append `v` to `out` as a length-prefixed POD value: a little-endian `u32`
/// holding `size_of::<T>()`, followed by the value's raw in-memory bytes.
/// Used by [`KvmVm::capture_intc`].
fn push_pod<T>(out: &mut Vec<u8>, v: &T) {
    let size = std::mem::size_of::<T>();
    let first_byte = (v as *const T).cast::<u8>();
    // SAFETY: `v` is a live reference, so `size_of::<T>()` bytes starting at it
    // are readable. Callers pass kvm-bindings device structs, which are
    // `repr(C)` POD, so viewing them as bytes copies no padding-sensitive
    // invariants.
    let raw = unsafe { std::slice::from_raw_parts(first_byte, size) };
    let prefix = (raw.len() as u32).to_le_bytes();
    out.extend_from_slice(&prefix);
    out.extend_from_slice(raw);
}

/// Decode one length-prefixed POD value written by [`push_pod`], starting at
/// byte offset `*p` of `b` and advancing `*p` past what was consumed.
///
/// The stored length must equal `size_of::<T>()` and the payload must lie
/// fully inside `b`; otherwise the blob is rejected. If the prefix itself is
/// truncated, `*p` is left untouched; on a size mismatch `*p` has already
/// moved past the 4-byte prefix.
fn read_pod<T: Copy>(b: &[u8], p: &mut usize) -> Result<T, KvmError> {
    let prefix_end = *p + 4;
    if prefix_end > b.len() {
        return Err(KvmError("intc blob truncated (length prefix)".to_string()));
    }
    let mut prefix = [0u8; 4];
    prefix.copy_from_slice(&b[*p..prefix_end]);
    let stored_len = u32::from_le_bytes(prefix) as usize;
    *p = prefix_end;

    let expected_len = std::mem::size_of::<T>();
    if stored_len != expected_len || *p + stored_len > b.len() {
        return Err(KvmError("intc blob field size mismatch".to_string()));
    }

    let payload = b[*p..].as_ptr().cast::<T>();
    // SAFETY: `stored_len == size_of::<T>()` and `*p + stored_len <= b.len()`
    // were checked above, so the whole value is readable from `payload`. The
    // read is unaligned-safe, and T is POD (`Copy`), so an arbitrary byte
    // pattern is a valid value.
    let value = unsafe { std::ptr::read_unaligned(payload) };
    *p += stored_len;
    Ok(value)
}

/// Stream form of [`push_pod`]: write `v` to `w` as a little-endian `u32`
/// length (`size_of::<T>()`) followed by the value's raw bytes. Used by
/// `HypervisorVcpu::write_snapshot_state` (byte-identical to the SMSNAP
/// per-vCPU blob encoding).
///
/// # Errors
/// Returns the first write error; the prefix may already have been written.
fn write_pod_to<T>(w: &mut dyn std::io::Write, v: &T) -> std::io::Result<()> {
    let size = std::mem::size_of::<T>();
    let first_byte = (v as *const T).cast::<u8>();
    // SAFETY: `v` is a live reference, so `size_of::<T>()` bytes starting at it
    // are readable; callers pass kvm-bindings POD structs.
    let raw = unsafe { std::slice::from_raw_parts(first_byte, size) };
    let prefix = (raw.len() as u32).to_le_bytes();
    w.write_all(&prefix)?;
    w.write_all(raw)
}

/// Stream form of [`read_pod`]: read a length-prefixed POD value from a `Read`,
/// validating the stored length matches `size_of::<T>()`.
///
/// The payload is read into an initialised byte buffer and only then
/// reinterpreted as `T`, so no reference to uninitialised memory is ever
/// handed to the reader.
///
/// # Errors
/// * `InvalidData` if the stored length differs from `size_of::<T>()` (the
///   payload is not consumed in that case).
/// * Any error from the reader, including `UnexpectedEof` on a short stream.
fn read_pod_from<T: Copy>(r: &mut dyn std::io::Read) -> std::io::Result<T> {
    let mut prefix = [0u8; 4];
    r.read_exact(&mut prefix)?;
    let len = u32::from_le_bytes(prefix) as usize;
    if len != std::mem::size_of::<T>() {
        return Err(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            "snapshot-state field size mismatch",
        ));
    }
    // `len` equals size_of::<T>() here, so this allocation is small and bounded
    // by the type, not by untrusted input.
    let mut payload = vec![0u8; len];
    r.read_exact(&mut payload)?;
    // SAFETY: `payload` holds exactly size_of::<T>() initialised bytes; the
    // read is unaligned-safe; T is POD (`Copy`), so any byte pattern is a
    // valid value.
    Ok(unsafe { std::ptr::read_unaligned(payload.as_ptr().cast::<T>()) })
}

impl KvmVm {
    /// In-kernel PIT (8254) — gives the kernel IRQ0 timekeeping so it gets past
    /// timer calibration. Call once after `create()` for the boot path. (Kept
    /// separate from `create` because the seam's `HypervisorVm::create` is the
    /// portable contract; the PIT is an x86-boot specific.)
    pub fn create_pit(&self) -> Result<(), KvmError> {
        self.vm
            .create_pit2(kvm_bindings::kvm_pit_config::default())?;
        Ok(())
    }

    /// Drive a legacy interrupt line on the in-kernel irqchip. The run loop
    /// mirrors a device's `irq_line()` level here after each access to deliver
    /// serial RX/THRE (IRQ4) and virtio used-buffer (IRQ5) interrupts.
    pub fn set_irq_line(&self, irq: u32, level: bool) -> Result<(), KvmError> {
        self.vm.set_irq_line(irq, level)?;
        Ok(())
    }

    /// Register an MMIO doorbell (ioeventfd): a guest write to `addr` signals
    /// `fd` *in the kernel* instead of exiting to userspace. A device thread
    /// waits on `fd` and services the queue — removing the per-notification vCPU
    /// round-trip (the dominant virtio cost). `NoDatamatch` triggers on any
    /// write to the address.
    pub fn register_mmio_ioevent(&self, fd: &EventFd, addr: u64) -> Result<(), KvmError> {
        self.vm
            .register_ioevent(fd, &IoEventAddress::Mmio(addr), NoDatamatch)?;
        Ok(())
    }

    /// Register an irqfd: writing `fd` injects `gsi` directly via the in-kernel
    /// irqchip, with no `set_irq_line` ioctl. The device thread uses this to
    /// raise the virtio used-buffer interrupt off the vCPU path.
    pub fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> Result<(), KvmError> {
        self.vm.register_irqfd(fd, gsi)?;
        Ok(())
    }

    /// Capture the in-kernel device state for a snapshot: the PIT (8254), the
    /// interrupt controllers (PIC master/slave + IOAPIC), and the KVM clock.
    /// The per-vCPU LAPIC is captured with each vCPU (see [`KvmSnapshotState`]).
    pub fn capture_devices(&self) -> Result<KvmDeviceState, KvmError> {
        let pit = self.vm.get_pit2()?;
        let mut irqchips = [
            kvm_irqchip::default(),
            kvm_irqchip::default(),
            kvm_irqchip::default(),
        ];
        for (i, chip) in irqchips.iter_mut().enumerate() {
            chip.chip_id = i as u32; // 0=PIC master, 1=PIC slave, 2=IOAPIC
            self.vm.get_irqchip(chip)?;
        }
        let clock = self.vm.get_clock()?;
        Ok(KvmDeviceState {
            pit,
            irqchips,
            clock,
        })
    }

    /// Restore the in-kernel device state captured by [`capture_devices`].
    pub fn restore_devices(&self, s: &KvmDeviceState) -> Result<(), KvmError> {
        self.vm.set_pit2(&s.pit)?;
        for chip in &s.irqchips {
            self.vm.set_irqchip(chip)?;
        }
        // Reset the clock flags so KVM takes our value as the new base rather
        // than expecting realtime/host-TSC-stable semantics across the restore.
        let mut clock = s.clock;
        clock.flags = 0;
        self.vm.set_clock(&clock)?;
        Ok(())
    }
}

/// In-kernel device state for a VM snapshot (PIT + interrupt controllers +
/// clock). Captured/restored via [`KvmVm::capture_devices`]/[`restore_devices`].
/// `Clone` so the in-place reset path can cache the snapshot's intc/timer
/// baseline and re-apply it each reset.
#[derive(Clone)]
pub struct KvmDeviceState {
    /// In-kernel PIT (8254) state.
    pit: kvm_pit_state2,
    /// Interrupt controllers, indexed by KVM chip id:
    /// 0 = PIC master, 1 = PIC slave, 2 = IOAPIC.
    irqchips: [kvm_irqchip; 3],
    /// KVM clock reading; `restore_devices` zeroes `flags` on a copy before
    /// re-applying it.
    clock: kvm_clock_data,
}

/// Per-vCPU cross-thread control block, shared between the vCPU's own thread
/// (which runs it) and any thread holding a [`KvmVcpuHandle`] for it. Lives in
/// the global [`registry`] keyed by vCPU id.
///
/// All fields are atomics, so the block can be shared through an `Arc`
/// without an extra lock.
struct VcpuReg {
    /// The OS thread currently running this vCPU (`pthread_t`, 0 = not yet
    /// bound). Stored by the vCPU thread on its first `step()`; read by
    /// `force_exit` to decide whether there is a thread to signal.
    tid: AtomicU64,
    /// Set by `force_exit`; observed by the run loop via [`KvmVcpu::should_exit`]
    /// so the stop intent is never lost even if the SIGUSR1 lands in the window
    /// between two `KVM_RUN`s (a spinning guest is broken out by the signal;
    /// this flag covers the rest).
    exit: AtomicBool,
    /// Address of this vCPU's `kvm_run.immediate_exit` byte (0 = not yet
    /// published). Published at vCPU *creation* (on the main thread, before the
    /// run thread is spawned) — NOT in `bind_thread` — so `force_exit` can gate
    /// guest re-entry even in the window before the run thread has bound itself
    /// (the `tid == 0` window). Writing 1 here makes the vCPU's *next* `KVM_RUN`
    /// return `EINTR` at guest entry without executing an instruction; combined
    /// with the SIGUSR1 (which breaks a vCPU already blocked *inside* KVM_RUN),
    /// a single `force_exit` is race-free in every thread state — no re-kicking.
    /// Stored as a `usize` rather than a raw pointer so it fits in an atomic.
    immediate_exit_ptr: AtomicUsize,
}

/// Globally-unique vCPU identity for the registry/handle. Distinct from the
/// per-VM KVM vCPU index (the APIC id, 0..N within one VM) — that index is NOT
/// unique across VMs, so it can't key a process-global map (two VMs would both
/// claim id 0 and force-exit each other's thread).
///
/// `create_vcpu` takes the next value with a relaxed `fetch_add`; the counter
/// only ever increases.
static NEXT_REG_ID: AtomicU64 = AtomicU64::new(0);

/// Process-wide table mapping a global vCPU id to its control block.
///
/// A [`KvmVcpuHandle`] is `Copy` (just an id), so the shared state it needs
/// (thread + exit flag) is looked up here instead of being stored in the
/// handle. The table is created empty on first use.
fn registry() -> &'static Mutex<HashMap<u64, Arc<VcpuReg>>> {
    type Table = Mutex<HashMap<u64, Arc<VcpuReg>>>;
    static TABLE: OnceLock<Table> = OnceLock::new();
    TABLE.get_or_init(Table::default)
}

/// No-op SIGUSR1 handler. Its only purpose is to exist (so the default
/// terminate action doesn't fire) without `SA_RESTART`, so a SIGUSR1 delivered
/// during `KVM_RUN` makes the ioctl return `EINTR` (→ `VcpuExit::Intr`).
///
/// The body is intentionally empty; the signal number is ignored.
extern "C" fn sigusr1_noop(_sig: libc::c_int) {}

/// Install the no-op SIGUSR1 handler. Only the first call in the process does
/// any work; later calls return immediately.
fn install_force_exit_signal() {
    static INSTALLED: Once = Once::new();
    INSTALLED.call_once(|| {
        // SAFETY: the `sigaction` struct is zero-initialised and filled in
        // before being handed to libc, and the handler is an `extern "C"` fn
        // taking the signal number.
        unsafe {
            let mut action: libc::sigaction = std::mem::zeroed();
            libc::sigemptyset(&mut action.sa_mask);
            action.sa_sigaction = sigusr1_noop as usize;
            // Deliberately NOT SA_RESTART → KVM_RUN returns EINTR.
            action.sa_flags = 0;
            libc::sigaction(libc::SIGUSR1, &action, std::ptr::null_mut());
        }
    });
}

/// Cross-thread force-exit token: kicks a vCPU out of `KVM_RUN` from another
/// thread. The signal (SIGUSR1, no `SA_RESTART`) makes a *blocked* `KVM_RUN`
/// return `EINTR`; the registry's `exit` flag carries the intent so it survives
/// the gap between two runs. Foundational for quiesce/snapshot, the multi-vCPU
/// coordinator, and an interruptible run loop.
#[derive(Clone, Copy)]
pub struct KvmVcpuHandle {
    /// Process-global registry id of the target vCPU (the key into
    /// [`registry`]), not the per-VM KVM vCPU index.
    vcpu_id: u64,
}

impl VcpuHandle for KvmVcpuHandle {
    /// Ask every vCPU in `handles` to leave (or not re-enter) the guest.
    ///
    /// For each handle still present in the registry this (1) sets the `exit`
    /// flag, (2) writes 1 to the vCPU's `immediate_exit` byte, and (3) sends
    /// SIGUSR1 to the bound thread, if any. Handles whose vCPU has already been
    /// dropped are skipped. The registry lock is held for the whole loop, so a
    /// vCPU cannot be unregistered while it is being kicked.
    fn force_exit(handles: &[Self]) {
        // Make sure the handler exists before any SIGUSR1 is sent.
        install_force_exit_signal();
        let reg = registry().lock().unwrap();
        for h in handles {
            if let Some(e) = reg.get(&h.vcpu_id) {
                // Record the intent first so the run loop sees it whichever
                // wake-up path fires.
                e.exit.store(true, Ordering::SeqCst);
                // Gate guest re-entry FIRST (before the signal): the vCPU's next
                // KVM_RUN returns EINTR at entry without running an instruction.
                // This covers the not-yet-bound (tid==0) and between-runs states.
                let imm = e.immediate_exit_ptr.load(Ordering::SeqCst);
                if imm != 0 {
                    // SAFETY: `immediate_exit_ptr` was published at vCPU creation
                    // and points into the still-mapped kvm_run page for this
                    // fd's life.
                    unsafe { std::ptr::write_volatile(imm as *mut u8, 1u8) };
                }
                let tid = e.tid.load(Ordering::SeqCst);
                // Break a vCPU already blocked INSIDE KVM_RUN (idle HLT with the
                // in-kernel irqchip): immediate_exit isn't re-checked while
                // blocked, so the signal is what wakes it. Harmless if it's not
                // in KVM_RUN — SIGUSR1 is thread-blocked there and stays pending.
                if tid != 0 {
                    // SAFETY: signalling a live thread with an installed
                    // handler; a stale tid would target a since-exited thread,
                    // but a vCPU's registry entry is removed on its Drop before
                    // the thread can be reused, so tid refers to this vCPU.
                    unsafe {
                        libc::pthread_kill(tid as libc::pthread_t, libc::SIGUSR1);
                    }
                }
            }
        }
    }
}

/// The MSRs to snapshot — the resume-critical ones that are NOT already
/// covered by REGS/SREGS (EFER, APIC_BASE, FS/GS_BASE live in sregs). These
/// are the syscall/sysenter fast-path bases + TSC; all get/set cleanly (unlike
/// the full KVM_GET_MSR_INDEX_LIST, which contains entries that error on set).
///
/// Each entry is an architectural MSR index, named in the trailing comment.
const SNAPSHOT_MSRS: &[u32] = &[
    0x0000_0010, // IA32_TSC
    0x0000_0174, // IA32_SYSENTER_CS
    0x0000_0175, // IA32_SYSENTER_ESP
    0x0000_0176, // IA32_SYSENTER_EIP
    0xc000_0081, // STAR
    0xc000_0082, // LSTAR
    0xc000_0083, // CSTAR
    0xc000_0084, // SYSCALL_MASK (SFMASK)
    0xc000_0102, // KERNEL_GS_BASE
    0xc000_0103, // TSC_AUX
];

/// Captured vCPU state for snapshot/restore. The full per-vCPU CPU state KVM
/// exposes: GP regs, system regs, FP/SSE (fxsave via FPU — kvm-ioctls 0.23 has
/// no SET_XSAVE so AVX state isn't restored), extended control regs, the local
/// APIC, pending events, debug regs, run state, and the resume-critical MSRs.
/// (Guest RAM + device state are captured separately by the run layer.)
///
/// `Clone` so the in-place reset path can hand each vCPU thread an owned copy of
/// its snapshot-baseline state to re-apply on reset (POD register blobs + a
/// `Msrs` FAM wrapper, all cloneable).
///
/// `write_snapshot_state` serializes the fields in declaration order.
#[derive(Clone)]
pub struct KvmSnapshotState {
    /// General-purpose registers.
    regs: kvm_regs,
    /// System registers.
    sregs: kvm_sregs,
    /// FP/SSE state.
    fpu: kvm_fpu,
    /// Extended control registers.
    xcrs: kvm_xcrs,
    /// Pending events.
    events: kvm_vcpu_events,
    /// Run state.
    mp_state: kvm_mp_state,
    /// Debug registers.
    debug_regs: kvm_debugregs,
    /// Local APIC state.
    lapic: kvm_lapic_state,
    /// Resume-critical MSRs — presumably the `SNAPSHOT_MSRS` set; the capture
    /// code is not visible here, confirm there.
    msrs: Msrs,
}

/// A single vCPU. `RefCell` because `VcpuFd::run` needs `&mut` (it returns an
/// exit borrowing the shared `kvm_run` page) while the seam's `step` is `&self`;
/// vCPUs are thread-bound, so this is never contended.
pub struct KvmVcpu {
    /// The KVM vCPU fd, behind a `RefCell` for `&self` access (see above).
    vcpu: RefCell<VcpuFd>,
    /// Process-global registry id (allocated from `NEXT_REG_ID`), not the
    /// per-VM KVM vCPU index. Removed from the registry on drop.
    id: u64,
    /// Cross-thread control block; the same `Arc` is stored in the registry
    /// under `id`.
    reg: Arc<VcpuReg>,
    /// Set once `bind_thread` has run (idempotent guard).
    bound: AtomicBool,
}

impl Drop for KvmVcpu {
    /// Unregister this vCPU from the global registry.
    ///
    /// Removing the entry means a recycled tid can't be force-exited as if it
    /// were still this vCPU.
    fn drop(&mut self) {
        // A poisoned registry mutex must not panic here: a panic inside `drop`
        // while another panic is unwinding aborts the process, and bailing out
        // would leave a stale entry behind. The map stays structurally valid
        // after a poisoning panic, so recover the guard and remove the entry
        // anyway.
        let mut table = registry()
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner());
        table.remove(&self.id);
    }
}

impl HypervisorVcpu for KvmVcpu {
    type Error = KvmError;
    type Handle = KvmVcpuHandle;
    type SnapshotState = KvmSnapshotState;

    /// Hand out a handle that names this vCPU by its id.
    fn exit_token(&self) -> KvmVcpuHandle {
        let vcpu_id = self.id;
        KvmVcpuHandle { vcpu_id }
    }

    /// Capture this vCPU's state through a shared borrow of its fd.
    fn capture_snapshot(&self) -> Result<KvmSnapshotState, KvmError> {
        let fd = self.vcpu.borrow();
        self.capture_snapshot_locked(&fd)
    }

    /// Re-apply the captured state `s` through a shared borrow of the vCPU fd.
    fn restore_snapshot(&self, s: &KvmSnapshotState) -> Result<(), KvmError> {
        let fd = self.vcpu.borrow();
        self.restore_snapshot_locked(&fd, s)
    }

    /// Per-vCPU clock reference to persist with a snapshot; always the
    /// sentinel `0` on KVM.
    fn capture_clock_ref(_state: &Self::SnapshotState, _host_now: u64) -> u64 {
        // The guest clock on KVM lives in the VM-global device blob
        // (kvm_clock_data, captured via capture_intc), and KVM_SET_CLOCK
        // re-anchors the guest TSC at restore time. Nothing per-vCPU needs to
        // be saved, so a sentinel is returned; restore_clock is the matching
        // no-op.
        const NO_PER_VCPU_CLOCK_REF: u64 = 0;
        NO_PER_VCPU_CLOCK_REF
    }

    /// Counterpart of `capture_clock_ref`; does nothing on KVM and returns 0.
    fn restore_clock(&self, _captured_ref: u64, _host_now: u64) -> Result<u64, KvmError> {
        // The guest clock is re-anchored when the device blob is restored
        // (restore_intc -> KVM_SET_CLOCK), not per vCPU, so there is nothing to
        // do here. See capture_clock_ref.
        let sentinel: u64 = 0;
        Ok(sentinel)
    }

    /// Serialize `s` to `w`: eight length-prefixed POD blobs in field order,
    /// then a little-endian `u32` MSR count followed by one
    /// (`u32` index, `u64` data) little-endian pair per MSR.
    fn write_snapshot_state(
        s: &KvmSnapshotState,
        w: &mut dyn std::io::Write,
    ) -> std::io::Result<()> {
        // This is the same byte layout as the inline per-vCPU encoding the
        // SMSNAP container already uses, so sending the snapshot pipeline
        // through this seam method keeps the on-disk format unchanged.
        write_pod_to(w, &s.regs)?;
        write_pod_to(w, &s.sregs)?;
        write_pod_to(w, &s.fpu)?;
        write_pod_to(w, &s.xcrs)?;
        write_pod_to(w, &s.events)?;
        write_pod_to(w, &s.mp_state)?;
        write_pod_to(w, &s.debug_regs)?;
        write_pod_to(w, &s.lapic)?;

        let msr_entries = s.msrs.as_slice();
        let msr_count = msr_entries.len() as u32;
        w.write_all(&msr_count.to_le_bytes())?;
        msr_entries.iter().try_for_each(|entry| {
            w.write_all(&entry.index.to_le_bytes())?;
            w.write_all(&entry.data.to_le_bytes())
        })
    }

    fn read_snapshot_state(r: &mut dyn std::io::Read) -> std::io::Result<KvmSnapshotState> {
        let regs = read_pod_from(r)?;
        let sregs = read_pod_from(r)?;
        let fpu = read_pod_from(r)?;
        let xcrs = read_pod_from(r)?;
        let events = read_pod_from(r)?;
        let mp_state = read_pod_from(r)?;
        let debug_regs = read_pod_from(r)?;
        let lapic = read_pod_from(r)?;
        let mut lb = [0u8; 4];
        r.read_exact(&mut lb)?;
        let nmsr = u32::from_le_bytes(lb);
        // Cap the pre-allocation: a corrupt/huge count must not OOM here. A bogus
        // count then fails fast on the first truncated `read_exact` below.
        let mut entries = Vec::with_capacity((nmsr as usize).min(4096));
        for _ in 0..nmsr {
            let mut ib = [0u8; 4];
            r.read_exact(&mut ib)?;
            let mut db = [0u8; 8];
            r.read_exact(&mut db)?;
            entries.push(kvm_msr_entry {
                index: u32::from_le_bytes(ib),
                data: u64::from_le_bytes(db),
                ..Default::default()
            });
        }
        let msrs = Msrs::from_entries(&entries).map_err(|e| {
            std::io::Error::new(std::io::ErrorKind::InvalidData, format!("msrs: {e:?}"))
        })?;
        Ok(KvmSnapshotState {
            regs,
            sregs,
            fpu,
            xcrs,
            events,
            mp_state,
            debug_regs,
            lapic,
            msrs,
        })
    }

    // The CoreReg/SysReg accessors belong to the aarch64 register model. x86
    // orchestration does not use them (it goes through the inherent x86
    // register methods below), so all four report "unsupported".
    fn get_core(&self, _reg: CoreReg) -> Result<u64, KvmError> {
        let reason = "aarch64 CoreReg on x86";
        Err(KvmError::unsupported(reason))
    }
    fn set_core(&self, _reg: CoreReg, _value: u64) -> Result<(), KvmError> {
        let reason = "aarch64 CoreReg on x86";
        Err(KvmError::unsupported(reason))
    }
    fn get_sys(&self, _reg: SysReg) -> Result<u64, KvmError> {
        let reason = "aarch64 SysReg on x86";
        Err(KvmError::unsupported(reason))
    }
    fn set_sys(&self, _reg: SysReg, _value: u64) -> Result<(), KvmError> {
        let reason = "aarch64 SysReg on x86";
        Err(KvmError::unsupported(reason))
    }

    /// Run the vCPU until its next exit and translate that exit into the
    /// portable [`VcpuExit`].
    ///
    /// Reads (`IoIn` / `MmioRead`) are reported with `data: 0`; writes carry
    /// the written bytes packed little-endian. Any exit not modeled here is
    /// reported as `VcpuExit::Unknown(0)`.
    fn step(&self) -> Result<VcpuExit, KvmError> {
        // Binding is idempotent and has to happen on the thread that calls
        // run(), so that a handle can force-exit this vCPU race-free.
        self.bind_thread()?;

        let mut fd = self.vcpu.borrow_mut();
        let reason = match fd.run() {
            Ok(reason) => reason,
            Err(err) => {
                // A force-exit SIGUSR1 shows up as KVM_RUN failing with EINTR
                // rather than as a successful KVM_EXIT_INTR; both are reported
                // as Canceled.
                return if err.errno() == libc::EINTR {
                    Ok(VcpuExit::Canceled)
                } else {
                    Err(err.into())
                };
            }
        };

        let translated = match reason {
            KvmExit::IoOut(port, bytes) => VcpuExit::Io {
                port,
                write: true,
                size: bytes.len() as u8,
                data: pack_u32(bytes),
            },
            KvmExit::IoIn(port, bytes) => VcpuExit::Io {
                port,
                write: false,
                size: bytes.len() as u8,
                data: 0,
            },
            KvmExit::MmioWrite(addr, bytes) => VcpuExit::Mmio {
                phys_addr: addr,
                write: true,
                len: bytes.len() as u8,
                data: pack_u64(bytes),
            },
            KvmExit::MmioRead(addr, bytes) => VcpuExit::Mmio {
                phys_addr: addr,
                write: false,
                len: bytes.len() as u8,
                data: 0,
            },
            KvmExit::Hlt | KvmExit::Shutdown => VcpuExit::Halt,
            KvmExit::Intr => VcpuExit::Canceled,
            // Everything else (debug, internal-error, …) is unmodeled. The
            // kvm-ioctls enum does not expose the raw discriminant, so 0 is
            // used as the "unknown" sentinel.
            _ => VcpuExit::Unknown(0),
        };
        Ok(translated)
    }
}

impl KvmVcpu {
    /// Reports whether a [`KvmVcpuHandle::force_exit`] has requested that this
    /// vCPU stop. The run loop consults it after an `Intr` exit (and may also
    /// do so between iterations), so a stop request is honored even when the
    /// signal raced the run.
    pub fn should_exit(&self) -> bool {
        let exit_flag = &self.reg.exit;
        exit_flag.load(Ordering::SeqCst)
    }

    /// Drop any pending force-exit request, e.g. before re-running a vCPU that
    /// was stopped for a snapshot.
    pub fn clear_exit(&self) {
        let exit_flag = &self.reg.exit;
        exit_flag.store(false, Ordering::SeqCst);
    }

    /// Re-apply a captured [`KvmSnapshotState`] using an already-held vcpu fd
    /// borrow — the restore analog of [`Self::capture_snapshot_locked`]. The
    /// in-place reset path (`run_vcpu`) holds `self.vcpu.borrow_mut()` across
    /// the whole run loop, so it cannot call `restore_snapshot` (which takes
    /// its own borrow → RefCell double-borrow panic); it re-applies the vCPU's
    /// baseline registers through this on its own owning thread.
    ///
    /// # Errors
    /// Returns the first failing KVM call, or an error when KVM accepted fewer
    /// MSR entries than the snapshot carries.
    pub(crate) fn restore_snapshot_locked(
        &self,
        v: &VcpuFd,
        s: &KvmSnapshotState,
    ) -> Result<(), KvmError> {
        // SREGS before REGS (paging/segments must be in place); the rest are
        // order-independent.
        v.set_sregs(&s.sregs)?;
        v.set_regs(&s.regs)?;
        v.set_fpu(&s.fpu)?;
        v.set_xcrs(&s.xcrs)?;
        v.set_lapic(&s.lapic)?;
        // Validate against the number of entries this snapshot actually holds,
        // not the compile-time capture list: a deserialized snapshot may carry
        // a different count, and comparing with the wrong total would either
        // reject a fully applied list or accept a partially applied one.
        let expected = s.msrs.as_slice().len();
        let set = v.set_msrs(&s.msrs)?;
        if set != expected {
            return Err(KvmError(format!(
                "kvm: set_msrs wrote {set}/{expected} entries"
            )));
        }
        v.set_vcpu_events(&s.events)?;
        v.set_debug_regs(&s.debug_regs)?;
        v.set_mp_state(s.mp_state)?;
        Ok(())
    }

    /// As the `HypervisorVcpu::capture_snapshot` trait method, but using an
    /// already-borrowed `VcpuFd` — for callers that hold the vCPU's `RefCell`
    /// borrow (the live-snapshot pause inside the run loop, where a second
    /// `self.vcpu.borrow()` would panic against the active `borrow_mut`).
    ///
    /// # Errors
    /// Returns the first failing KVM call, or an error when KVM returned fewer
    /// MSRs than the `SNAPSHOT_MSRS` list requests.
    pub(crate) fn capture_snapshot_locked(&self, v: &VcpuFd) -> Result<KvmSnapshotState, KvmError> {
        // Request exactly the indices in SNAPSHOT_MSRS; KVM fills in `data`.
        let entries: Vec<kvm_msr_entry> = SNAPSHOT_MSRS
            .iter()
            .map(|&index| kvm_msr_entry {
                index,
                data: 0,
                ..Default::default()
            })
            .collect();
        let mut msrs =
            Msrs::from_entries(&entries).map_err(|e| KvmError(format!("kvm: msrs fam: {e:?}")))?;
        let got = v.get_msrs(&mut msrs)?;
        if got != SNAPSHOT_MSRS.len() {
            return Err(KvmError(format!(
                "kvm: get_msrs read {got}/{} entries",
                SNAPSHOT_MSRS.len()
            )));
        }
        Ok(KvmSnapshotState {
            regs: v.get_regs()?,
            sregs: v.get_sregs()?,
            fpu: v.get_fpu()?,
            xcrs: v.get_xcrs()?,
            events: v.get_vcpu_events()?,
            mp_state: v.get_mp_state()?,
            debug_regs: v.get_debug_regs()?,
            lapic: v.get_lapic()?,
            msrs,
        })
    }

    /// Bind this vCPU to the *calling* OS thread for race-free force-exit.
    /// Idempotent; must be called on the thread that will drive `run()`/`step()`
    /// (step() calls it automatically; the KVM-native run loop calls it once up
    /// front). Two things happen:
    ///   1. SIGUSR1 is blocked at the thread level, so a force-exit signal sent
    ///      while we are *not* in KVM_RUN stays pending (instead of running the
    ///      no-op handler and being lost).
    ///   2. KVM is told to unblock all signals during guest execution
    ///      (KVM_SET_SIGNAL_MASK, empty set), so that pending SIGUSR1 fires the
    ///      instant we enter the guest → KVM_RUN returns EINTR. Together these
    ///      close the race where the signal lands between two runs.
    pub fn bind_thread(&self) -> Result<(), KvmError> {
        if self.bound.swap(true, Ordering::SeqCst) {
            return Ok(());
        }
        install_force_exit_signal();
        // 1. Block SIGUSR1 on this thread.
        // SAFETY: standard pthread signal-mask manipulation on the current thread.
        unsafe {
            let mut set: libc::sigset_t = std::mem::zeroed();
            libc::sigemptyset(&mut set);
            libc::sigaddset(&mut set, libc::SIGUSR1);
            libc::pthread_sigmask(libc::SIG_BLOCK, &set, std::ptr::null_mut());
        }
        // 2. KVM_SET_SIGNAL_MASK with an empty sigset → no signals blocked
        //    during guest execution, so the pending SIGUSR1 is delivered on
        //    guest entry.
        self.set_kvm_signal_mask_empty()?;
        // 3. Record the thread so a handle can target it.
        let tid = unsafe { libc::pthread_self() } as u64;
        self.reg.tid.store(tid, Ordering::SeqCst);
        Ok(())
    }

    /// Issue `KVM_SET_SIGNAL_MASK` with an empty signal set on this vCPU fd.
    /// kvm-ioctls 0.23 has no wrapper, so this is the raw ioctl. The argument
    /// is `struct kvm_signal_mask { __u32 len; __u8 sigset[len]; }`; an 8-byte
    /// (64-signal) all-zero sigset means "block nothing during guest mode".
    fn set_kvm_signal_mask_empty(&self) -> Result<(), KvmError> {
        use std::os::unix::io::AsRawFd;
        // _IOW(KVMIO=0xAE, 0x8b, struct kvm_signal_mask) with size field = 4.
        const KVM_SET_SIGNAL_MASK: libc::c_ulong = 0x4004_ae8b;
        let mut buf = [0u8; 4 + 8];
        buf[0..4].copy_from_slice(&8u32.to_le_bytes()); // len = 8 sigset bytes
        let fd = self.vcpu.borrow().as_raw_fd();
        // SAFETY: `fd` is this vCPU's KVM fd; `buf` is a valid kvm_signal_mask.
        let r = unsafe { libc::ioctl(fd, KVM_SET_SIGNAL_MASK, buf.as_ptr()) };
        if r != 0 {
            return Err(KvmError::from(std::io::Error::last_os_error()));
        }
        Ok(())
    }

    /// Put the vCPU into 64-bit long mode: flat 64-bit code and data segments,
    /// paging enabled with `cr3` as the (identity-mapped) page-table root, and
    /// `rip` set to `entry`. The page tables must already be in guest RAM.
    pub fn enter_long_mode(&self, entry: u64, cr3: u64) -> Result<(), KvmError> {
        const CR0_PG_PE: u64 = 0x8000_0001; // PG | PE
        const CR4_PAE: u64 = 0x0000_0020; // PAE
        const EFER_LME_LMA: u64 = 0x0000_0500; // LME | LMA
        const RFLAGS_RESERVED_ONE: u64 = 0x2; // reserved-1 bit

        // Everything the code and data descriptors have in common.
        let flat = kvm_segment {
            base: 0,
            limit: 0xffff_ffff,
            present: 1,
            dpl: 0,
            s: 1,
            g: 1,
            ..Default::default()
        };
        let code_seg = kvm_segment {
            selector: 0x08,
            type_: 0b1011, // code: execute/read/accessed
            l: 1,          // 64-bit
            db: 0,
            ..flat
        };
        let data_seg = kvm_segment {
            selector: 0x10,
            type_: 0b0011, // data: read/write/accessed
            l: 0,
            db: 1,
            ..flat
        };

        let fd = self.vcpu.borrow();

        let mut sregs = fd.get_sregs()?;
        sregs.cs = code_seg;
        for seg in [
            &mut sregs.ds,
            &mut sregs.es,
            &mut sregs.fs,
            &mut sregs.gs,
            &mut sregs.ss,
        ] {
            *seg = data_seg;
        }
        sregs.cr0 = CR0_PG_PE;
        sregs.cr3 = cr3;
        sregs.cr4 = CR4_PAE;
        sregs.efer = EFER_LME_LMA;
        fd.set_sregs(&sregs)?;

        let mut regs = fd.get_regs()?;
        regs.rip = entry;
        regs.rflags = RFLAGS_RESERVED_ONE;
        fd.set_regs(&regs)?;
        Ok(())
    }

    /// Leave a secondary CPU (AP) waiting for SIPI. The BSP (vCPU 0) is the
    /// only one that boots from the kernel entry; APs stay `UNINITIALIZED`
    /// until the kernel sends INIT-SIPI-SIPI through the in-kernel LAPIC, which
    /// KVM services.
    pub fn park_for_sipi(&self) -> Result<(), KvmError> {
        let fd = self.vcpu.borrow();
        fd.set_mp_state(kvm_mp_state {
            mp_state: KVM_MP_STATE_UNINITIALIZED,
        })?;
        Ok(())
    }

    /// Load this vCPU with the long-mode register state produced by the x86
    /// boot protocol ([`crate::arch::x86_64::boot::setup_boot`]): the flat
    /// `__BOOT_CS`/`__BOOT_DS` segments, the GDT pointer, CR0/CR3/CR4/EFER, and
    /// RIP/RSI/RFLAGS (RSI = `boot_params`). `setup_boot` must already have
    /// been run over this VM's guest RAM, since it writes the page tables, GDT
    /// and zero page these registers point at.
    pub fn apply_boot_regs(&self, b: &crate::arch::x86_64::boot::BootRegs) -> Result<(), KvmError> {
        let fd = self.vcpu.borrow();

        // System registers: segments, GDT pointer, control registers.
        let mut sregs = fd.get_sregs()?;
        sregs.cs = seg_to_kvm(&b.cs);
        let boot_ds = seg_to_kvm(&b.ds);
        for seg in [
            &mut sregs.ds,
            &mut sregs.es,
            &mut sregs.ss,
            &mut sregs.fs,
            &mut sregs.gs,
        ] {
            *seg = boot_ds;
        }
        sregs.gdt.base = b.gdt_base;
        sregs.gdt.limit = b.gdt_limit;
        sregs.cr0 = b.cr0;
        sregs.cr3 = b.cr3;
        sregs.cr4 = b.cr4;
        sregs.efer = b.efer;
        fd.set_sregs(&sregs)?;

        // General-purpose registers: entry point, boot_params pointer, flags.
        let mut gprs = fd.get_regs()?;
        gprs.rip = b.rip;
        gprs.rsi = b.rsi;
        gprs.rflags = b.rflags;
        fd.set_regs(&gprs)?;
        Ok(())
    }
}

/// Convert a portable boot-protocol
/// [`Segment`](crate::arch::x86_64::boot::Segment) into the `kvm_segment` KVM
/// expects. Fields that `Segment` does not carry keep their default value.
fn seg_to_kvm(s: &crate::arch::x86_64::boot::Segment) -> kvm_segment {
    kvm_segment {
        // Addressing.
        selector: s.selector,
        base: s.base,
        limit: s.limit,
        // Descriptor attributes.
        type_: s.type_,
        s: s.s,
        dpl: s.dpl,
        present: s.present,
        l: s.l,
        db: s.db,
        g: s.g,
        ..Default::default()
    }
}

/// Pack up to the first four bytes of `data` into a `u32`, little-endian.
/// Missing high bytes are zero; bytes beyond the fourth are ignored.
fn pack_u32(data: &[u8]) -> u32 {
    data.iter()
        .take(4)
        .enumerate()
        .fold(0u32, |acc, (i, &byte)| acc | (u32::from(byte) << (8 * i)))
}

/// Pack up to the first eight bytes of `data` into a `u64`, little-endian.
/// Missing high bytes are zero; bytes beyond the eighth are ignored.
fn pack_u64(data: &[u8]) -> u64 {
    data.iter()
        .take(8)
        .enumerate()
        .fold(0u64, |acc, (i, &byte)| acc | (u64::from(byte) << (8 * i)))
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::hypervisor::prot;

    /// `capture_intc` / `restore_intc` rely on length-prefixed POD
    /// (de)serialization; it has to round-trip and to refuse corrupt or
    /// truncated blobs. Pure logic — no `/dev/kvm` needed.
    #[test]
    fn pod_blob_round_trips_and_rejects_corruption() {
        const WORD: u64 = 0x1122_3344_5566_7788;
        const FILL: [u8; 8] = [0xAA; 8];

        let mut blob = Vec::new();
        push_pod(&mut blob, &WORD);
        push_pod(&mut blob, &FILL);

        // Both values come back, in order, and nothing is left over.
        let mut cursor = 0;
        let word: u64 = read_pod(&blob, &mut cursor).unwrap();
        let fill: [u8; 8] = read_pod(&blob, &mut cursor).unwrap();
        assert_eq!(word, WORD);
        assert_eq!(fill, FILL);
        assert_eq!(cursor, blob.len(), "consumed the whole blob");

        // Only two bytes available: the length prefix itself is truncated.
        let mut cursor = 0;
        assert!(read_pod::<u64>(&blob[..2], &mut cursor).is_err());

        // The stored length is 8 (the u64's), which is not size_of::<u32>().
        let mut cursor = 0;
        assert!(read_pod::<u32>(&blob, &mut cursor).is_err());

        // The prefix promises 8 payload bytes, yet only 4 are present.
        let mut short_payload = Vec::new();
        short_payload.extend_from_slice(&8u32.to_le_bytes());
        short_payload.extend_from_slice(&[1, 2, 3, 4]);
        let mut cursor = 0;
        assert!(read_pod::<u64>(&short_payload, &mut cursor).is_err());
    }

    /// Increment 1, through the seam: create a KVM VM + irqchip, map RAM, bring
    /// a vCPU up in long mode, run a stub that `out`s 'K' to 0x3f8, and catch it
    /// as `VcpuExit::Io`. The standalone `spikes/kvm-floor` proved the raw
    /// mechanics; this proves the `HypervisorVm`/`HypervisorVcpu` binding.
    #[test]
    fn floor_long_mode_io_exit_through_seam() {
        // Guest RAM size: 2 MiB, i.e. exactly the one large page mapped below.
        const MEM: usize = 0x20_0000;
        let vm = KvmVm::create().expect("create VM");

        // SAFETY: anonymous private mapping with no fixed address; the result
        // is checked against MAP_FAILED before it is used.
        let host = unsafe {
            libc::mmap(
                std::ptr::null_mut(),
                MEM,
                libc::PROT_READ | libc::PROT_WRITE,
                libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_NORESERVE,
                -1,
                0,
            )
        };
        assert!(host != libc::MAP_FAILED, "mmap");
        let host = host as *mut u8;

        // Write a little-endian u64 at guest-physical address `gpa`.
        // SAFETY (closure body): every `gpa` used below is 8-byte aligned and
        // well inside the MEM-byte mapping.
        let put_u64 = |gpa: u64, v: u64| unsafe {
            std::ptr::write(host.add(gpa as usize) as *mut u64, v.to_le());
        };
        // PML4[0]->PDPT, PDPT[0]->PD, PD[0]->2MiB page at 0 (present|rw|ps).
        put_u64(0x1000, 0x2000 | 0x3);
        put_u64(0x2000, 0x3000 | 0x3);
        put_u64(0x3000, 0x83);
        // mov dx,0x3f8 ; mov al,'K' ; out dx,al ; jmp $
        let stub = [0x66u8, 0xba, 0xf8, 0x03, 0xb0, b'K', 0xee, 0xeb, 0xfe];
        // SAFETY: the stub is copied to the start of the mapping, which is far
        // larger than the stub; source and destination do not overlap.
        unsafe { std::ptr::copy_nonoverlapping(stub.as_ptr(), host, stub.len()) };

        // SAFETY: `host` points at MEM mapped bytes that stay mapped until the
        // munmap at the end of the test.
        unsafe { vm.map_ram(host, 0, MEM, prot::RWX).expect("map_ram") };
        let vcpu = vm.create_vcpu().expect("create_vcpu");
        // Entry point 0 (the stub), page-table root at 0x1000 (the PML4).
        vcpu.enter_long_mode(0x0, 0x1000).expect("enter_long_mode");

        // The first exit must be the OUT to the serial port.
        match vcpu.step().expect("step") {
            VcpuExit::Io {
                port, write, data, ..
            } => {
                assert_eq!(port, 0x3f8, "serial port");
                assert!(write, "OUT direction");
                assert_eq!(data & 0xff, u32::from(b'K'), "serial byte");
            }
            other => panic!("unexpected exit: {other:?}"),
        }
        // SAFETY: unmaps exactly the region mapped above; `host` is not used
        // afterwards. (Not reached if an assertion above panics.)
        unsafe { libc::munmap(host as *mut libc::c_void, MEM) };
    }

    /// The boot-protocol glue: build `BootRegs` from the x86 boot setup and
    /// apply them to a real KVM vCPU, then read the SREGS/REGS back and confirm
    /// the core is configured for long-mode kernel entry (paging on, EFER.LMA,
    /// flat __BOOT_CS, GDT pointer, RIP=load+0x200, RSI=boot_params).
    #[test]
    fn apply_boot_regs_configures_long_mode_entry() {
        use crate::arch::x86_64::boot::{setup_boot, BootConfig};

        // No guest RAM is mapped into the VM: the vCPU is never run, only its
        // registers are written and read back.
        let vm = KvmVm::create().expect("create VM");
        let vcpu = vm.create_vcpu().expect("create_vcpu");

        // Minimal fake bzImage: valid setup header span + a little pm-kernel.
        let setup_sects = 4u8;
        // Offset of the protected-mode part: (setup_sects + 1) 512-byte sectors.
        let pm_off = (setup_sects as usize + 1) * 512;
        let mut bz = vec![0u8; pm_off + 512];
        // 0x1f1 is where this test stores the setup-sector count.
        bz[0x1f1] = setup_sects;

        // `setup_boot` works on a plain host buffer standing in for guest RAM.
        let mem_size = 2 * 1024 * 1024;
        let mut mem = vec![0u8; mem_size];
        let cfg = BootConfig {
            mem_size,
            cmdline: "console=ttyS0",
            bzimage: &bz,
            initrd: None,
        };
        let regs = setup_boot(&mut mem, &cfg).expect("setup_boot");
        vcpu.apply_boot_regs(&regs).expect("apply_boot_regs");

        // System registers as KVM reports them after the apply.
        let sregs = vcpu.vcpu.borrow().get_sregs().expect("get_sregs");
        assert_eq!(sregs.cr0, 0x8000_0001, "PG|PE");
        assert_eq!(sregs.cr4, 0x20, "PAE");
        assert_eq!(sregs.efer & 0x500, 0x500, "LME|LMA active");
        assert_eq!(sregs.cr3, 0x1000, "PML4 root");
        assert_eq!(sregs.cs.selector, 0x10, "__BOOT_CS");
        assert_eq!(sregs.cs.l, 1, "64-bit code segment");
        assert_eq!(sregs.ds.selector, 0x18, "__BOOT_DS");
        assert_eq!(sregs.gdt.base, 0x4000, "GDT base");
        assert_eq!(sregs.gdt.limit, 31, "GDT limit");

        // General-purpose registers: entry point and boot_params pointer.
        let r = vcpu.vcpu.borrow().get_regs().expect("get_regs");
        assert_eq!(r.rip, 0x10_0200, "64-bit entry = load+0x200");
        assert_eq!(r.rsi, 0x1_0000, "RSI = boot_params");
    }

    /// Increment 5: cross-thread force-exit. A vCPU runs a tight infinite loop
    /// (`jmp $`) on its own thread — it never VM-exits on its own. From the main
    /// thread, `force_exit` must break it out of `KVM_RUN` (SIGUSR1 → EINTR →
    /// Canceled). The channel + timeout proves it stopped rather than hanging.
    #[test]
    fn force_exit_breaks_a_spinning_vcpu() {
        use std::sync::mpsc;
        use std::time::Duration;

        // Guest RAM size: 2 MiB, i.e. exactly the one large page mapped below.
        const MEM: usize = 0x20_0000;
        let vm = KvmVm::create().expect("create VM");

        // SAFETY: anonymous private mapping with no fixed address; the result
        // is checked against MAP_FAILED before it is used.
        let host = unsafe {
            libc::mmap(
                std::ptr::null_mut(),
                MEM,
                libc::PROT_READ | libc::PROT_WRITE,
                libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_NORESERVE,
                -1,
                0,
            )
        };
        assert!(host != libc::MAP_FAILED, "mmap");
        let host = host as *mut u8;
        // Write a little-endian u64 at guest-physical address `gpa`.
        // SAFETY (closure body): every `gpa` used below is 8-byte aligned and
        // well inside the MEM-byte mapping.
        let put_u64 = |gpa: u64, v: u64| unsafe {
            std::ptr::write(host.add(gpa as usize) as *mut u64, v.to_le());
        };
        // PML4[0]->PDPT, PDPT[0]->PD, PD[0]->2MiB page at 0 (present|rw|ps).
        put_u64(0x1000, 0x2000 | 0x3);
        put_u64(0x2000, 0x3000 | 0x3);
        put_u64(0x3000, 0x83);
        // jmp $ (EB FE) — spin forever, never exit.
        let stub = [0xebu8, 0xfe];
        // SAFETY: two bytes copied to the start of a MEM-byte mapping; source
        // and destination do not overlap.
        unsafe { std::ptr::copy_nonoverlapping(stub.as_ptr(), host, stub.len()) };

        // SAFETY: `host` points at MEM mapped bytes that stay mapped until the
        // munmap at the end of the test.
        unsafe { vm.map_ram(host, 0, MEM, prot::RWX).expect("map_ram") };
        let vcpu = vm.create_vcpu().expect("create_vcpu");
        vcpu.enter_long_mode(0x0, 0x1000).expect("enter_long_mode");
        // Taken before the vCPU is moved into the runner thread.
        let handle = vcpu.exit_token();

        // `ready`: runner -> main, "bound"; `tx`: runner -> main, outcome.
        let (ready_tx, ready_rx) = mpsc::channel::<()>();
        let (tx, rx) = mpsc::channel::<&'static str>();
        let runner = std::thread::spawn(move || {
            // Bind on this thread (sets the SIGUSR1 mask + tid) BEFORE signalling
            // ready, so force_exit can never race the binding.
            vcpu.bind_thread().expect("bind_thread");
            let _ = ready_tx.send(());
            // Keep stepping until the vCPU is canceled or errors out; any other
            // exit just re-enters the guest.
            loop {
                match vcpu.step() {
                    Ok(VcpuExit::Canceled) => {
                        let _ = tx.send("canceled");
                        break;
                    }
                    Ok(_) => continue,
                    Err(_) => {
                        let _ = tx.send("err");
                        break;
                    }
                }
            }
            // vcpu drops here, removing its registry entry.
        });

        // Once bound, force_exit is race-free: a SIGUSR1 sent before the vCPU
        // enters KVM_RUN stays pending (blocked) and fires on guest entry.
        ready_rx
            .recv_timeout(Duration::from_secs(2))
            .expect("vcpu ready");
        KvmVcpuHandle::force_exit(&[handle]);

        // A timeout here means the vCPU is still spinning in the guest.
        let got = rx.recv_timeout(Duration::from_secs(5));
        assert_eq!(
            got.ok(),
            Some("canceled"),
            "force_exit did not break the spinning vCPU out of KVM_RUN"
        );
        runner.join().expect("join runner");
        // SAFETY: unmaps exactly the region mapped above; `host` is not used
        // afterwards. (Not reached if an assertion above panics.)
        unsafe { libc::munmap(host as *mut libc::c_void, MEM) };
    }

    /// Regression: a `force_exit` issued in the window BEFORE the vCPU thread has
    /// bound itself — `tid == 0`, so NO SIGUSR1 can be delivered — must still
    /// stop the vCPU. `immediate_exit` is published at vCPU *creation* (not in
    /// `bind_thread`), so it gates the very first `KVM_RUN` entry → EINTR →
    /// Canceled, with no signal involved. This is exactly the start()/teardown
    /// race that used to need a 20–40 ms re-kick poll (the vCPU would otherwise
    /// spin forever on the never-exiting guest below).
    #[test]
    fn force_exit_before_bind_still_stops_vcpu() {
        use std::sync::mpsc;
        use std::time::Duration;

        // Guest RAM size: 2 MiB, i.e. exactly the one large page mapped below.
        const MEM: usize = 0x20_0000;
        let vm = KvmVm::create().expect("create VM");
        // SAFETY: anonymous private mapping with no fixed address; the result
        // is checked against MAP_FAILED before it is used.
        let host = unsafe {
            libc::mmap(
                std::ptr::null_mut(),
                MEM,
                libc::PROT_READ | libc::PROT_WRITE,
                libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_NORESERVE,
                -1,
                0,
            )
        };
        assert!(host != libc::MAP_FAILED, "mmap");
        let host = host as *mut u8;
        // Write a little-endian u64 at guest-physical address `gpa`.
        // SAFETY (closure body): every `gpa` used below is 8-byte aligned and
        // well inside the MEM-byte mapping.
        let put_u64 = |gpa: u64, v: u64| unsafe {
            std::ptr::write(host.add(gpa as usize) as *mut u64, v.to_le());
        };
        // PML4[0]->PDPT, PDPT[0]->PD, PD[0]->2MiB page at 0 (present|rw|ps).
        put_u64(0x1000, 0x2000 | 0x3);
        put_u64(0x2000, 0x3000 | 0x3);
        put_u64(0x3000, 0x83);
        // jmp $ (EB FE) — spin forever, never exit on its own.
        let stub = [0xebu8, 0xfe];
        // SAFETY: two bytes copied to the start of a MEM-byte mapping; source
        // and destination do not overlap.
        unsafe { std::ptr::copy_nonoverlapping(stub.as_ptr(), host, stub.len()) };

        // SAFETY: `host` points at MEM mapped bytes that stay mapped until the
        // munmap at the end of the test.
        unsafe { vm.map_ram(host, 0, MEM, prot::RWX).expect("map_ram") };
        let vcpu = vm.create_vcpu().expect("create_vcpu");
        vcpu.enter_long_mode(0x0, 0x1000).expect("enter_long_mode");
        let handle = vcpu.exit_token();

        // Force-exit NOW — before any thread runs `bind_thread`, so the registry
        // tid is still 0 and `force_exit` sends no signal. Only the immediate_exit
        // gate (published at creation) can stop the vCPU here.
        KvmVcpuHandle::force_exit(&[handle]);

        let (tx, rx) = mpsc::channel::<&'static str>();
        let runner = std::thread::spawn(move || {
            // step() binds (stores tid) then enters KVM_RUN. The spinning guest
            // would run forever unless immediate_exit breaks the first entry.
            let r = match vcpu.step() {
                Ok(VcpuExit::Canceled) => "canceled",
                Ok(_) => "other",
                Err(_) => "err",
            };
            let _ = tx.send(r);
        });

        // A timeout here means the single step() never returned.
        let got = rx.recv_timeout(Duration::from_secs(5));
        assert_eq!(
            got.ok(),
            Some("canceled"),
            "immediate_exit did not gate the first KVM_RUN entry for a pre-bind force_exit"
        );
        runner.join().expect("join runner");
        // SAFETY: unmaps exactly the region mapped above; `host` is not used
        // afterwards. (Not reached if an assertion above panics.)
        unsafe { libc::munmap(host as *mut libc::c_void, MEM) };
    }

    /// A single `force_exit` over a slice of handles must stop EVERY vCPU — the
    /// multi-vCPU teardown path (`stop`/`Drop` force-exit all handles at once).
    /// Both spin forever on their own thread until the one kick cancels them.
    #[test]
    fn force_exit_stops_all_vcpus() {
        use std::sync::mpsc;
        use std::time::Duration;

        // Guest RAM size: 2 MiB, i.e. exactly the one large page mapped below.
        const MEM: usize = 0x20_0000;
        let vm = KvmVm::create().expect("create VM");
        // SAFETY: anonymous private mapping with no fixed address; the result
        // is checked against MAP_FAILED before it is used.
        let host = unsafe {
            libc::mmap(
                std::ptr::null_mut(),
                MEM,
                libc::PROT_READ | libc::PROT_WRITE,
                libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_NORESERVE,
                -1,
                0,
            )
        };
        assert!(host != libc::MAP_FAILED, "mmap");
        let host = host as *mut u8;
        // Write a little-endian u64 at guest-physical address `gpa`.
        // SAFETY (closure body): every `gpa` used below is 8-byte aligned and
        // well inside the MEM-byte mapping.
        let put_u64 = |gpa: u64, v: u64| unsafe {
            std::ptr::write(host.add(gpa as usize) as *mut u64, v.to_le());
        };
        // PML4[0]->PDPT, PDPT[0]->PD, PD[0]->2MiB page at 0 (present|rw|ps).
        put_u64(0x1000, 0x2000 | 0x3);
        put_u64(0x2000, 0x3000 | 0x3);
        put_u64(0x3000, 0x83);
        let stub = [0xebu8, 0xfe]; // jmp $
        // SAFETY: two bytes copied to the start of a MEM-byte mapping; source
        // and destination do not overlap.
        unsafe { std::ptr::copy_nonoverlapping(stub.as_ptr(), host, stub.len()) };
        // SAFETY: `host` points at MEM mapped bytes that stay mapped until the
        // munmap at the end of the test.
        unsafe { vm.map_ram(host, 0, MEM, prot::RWX).expect("map_ram") };

        // `ready`: each runner reports "bound"; `done`: each runner's outcome.
        let (ready_tx, ready_rx) = mpsc::channel::<()>();
        let (done_tx, done_rx) = mpsc::channel::<&'static str>();
        let mut handles = Vec::new();
        let mut runners = Vec::new();
        // Two vCPUs on the same VM, both starting at the same spinning stub.
        for _ in 0..2 {
            let vcpu = vm.create_vcpu().expect("create_vcpu");
            vcpu.enter_long_mode(0x0, 0x1000).expect("enter_long_mode");
            handles.push(vcpu.exit_token());
            let ready = ready_tx.clone();
            let done = done_tx.clone();
            runners.push(std::thread::spawn(move || {
                // Bind before signalling ready so the kick can never race binding.
                vcpu.bind_thread().expect("bind_thread");
                let _ = ready.send(());
                // Keep stepping until canceled or errored; any other exit just
                // re-enters the guest.
                loop {
                    match vcpu.step() {
                        Ok(VcpuExit::Canceled) => {
                            let _ = done.send("canceled");
                            break;
                        }
                        Ok(_) => continue,
                        Err(_) => {
                            let _ = done.send("err");
                            break;
                        }
                    }
                }
            }));
        }

        // Both bound → one force_exit over both handles stops both.
        ready_rx
            .recv_timeout(Duration::from_secs(2))
            .expect("vcpu0");
        ready_rx
            .recv_timeout(Duration::from_secs(2))
            .expect("vcpu1");
        KvmVcpuHandle::force_exit(&handles);

        // One outcome per vCPU; a timeout means that vCPU is still spinning.
        for _ in 0..2 {
            assert_eq!(
                done_rx.recv_timeout(Duration::from_secs(5)).ok(),
                Some("canceled"),
                "a vCPU was not stopped by the shared force_exit"
            );
        }
        for r in runners {
            r.join().expect("join runner");
        }
        // SAFETY: unmaps exactly the region mapped above; `host` is not used
        // afterwards. (Not reached if an assertion above panics.)
        unsafe { libc::munmap(host as *mut libc::c_void, MEM) };
    }

    /// Snapshot increment 7a: capture the full per-vCPU state, clobber the
    /// registers, restore, and confirm the captured values come back. Exercises
    /// every get/set on real /dev/kvm (regs/sregs/fpu/xcrs/lapic/msrs/events/
    /// debugregs/mp_state) — the main risk is any one erroring on the kernel.
    ///
    /// Only `rax`, `rbx` and `rip` are compared afterwards; the remaining state
    /// is covered only in the sense that its get/set calls must succeed. The
    /// vCPU is never run.
    #[test]
    fn snapshot_round_trips_vcpu_state() {
        // Guest RAM size: 2 MiB, i.e. exactly the one large page mapped below.
        const MEM: usize = 0x20_0000;
        let vm = KvmVm::create().expect("create VM");
        // SAFETY: anonymous private mapping with no fixed address; the result
        // is checked against MAP_FAILED before it is used.
        let host = unsafe {
            libc::mmap(
                std::ptr::null_mut(),
                MEM,
                libc::PROT_READ | libc::PROT_WRITE,
                libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_NORESERVE,
                -1,
                0,
            )
        };
        assert!(host != libc::MAP_FAILED, "mmap");
        let host = host as *mut u8;
        // Write a little-endian u64 at guest-physical address `gpa`.
        // SAFETY (closure body): every `gpa` used below is 8-byte aligned and
        // well inside the MEM-byte mapping.
        let put_u64 = |gpa: u64, v: u64| unsafe {
            std::ptr::write(host.add(gpa as usize) as *mut u64, v.to_le());
        };
        // PML4[0]->PDPT, PDPT[0]->PD, PD[0]->2MiB page at 0 (present|rw|ps).
        put_u64(0x1000, 0x2000 | 0x3);
        put_u64(0x2000, 0x3000 | 0x3);
        put_u64(0x3000, 0x83);
        // SAFETY: `host` points at MEM mapped bytes that stay mapped until the
        // munmap at the end of the test.
        unsafe { vm.map_ram(host, 0, MEM, prot::RWX).expect("map_ram") };
        let vcpu = vm.create_vcpu().expect("create_vcpu");
        vcpu.enter_long_mode(0x0, 0x1000).expect("enter_long_mode");

        // Set known register values to snapshot.
        {
            let v = vcpu.vcpu.borrow();
            let mut r = v.get_regs().expect("get_regs");
            r.rax = 0x1234_5678_9abc_def0;
            r.rbx = 0x00ca_feba_be00_1357;
            r.rip = 0x0;
            v.set_regs(&r).expect("set_regs");
        }

        // The shared borrow above is released at the end of its scope, so the
        // trait method can take its own.
        let snap = vcpu.capture_snapshot().expect("capture_snapshot");

        // Clobber everything we'll check.
        {
            let v = vcpu.vcpu.borrow();
            let mut r = v.get_regs().expect("get_regs");
            r.rax = 0;
            r.rbx = 0;
            r.rip = 0x1000;
            v.set_regs(&r).expect("set_regs");
        }

        vcpu.restore_snapshot(&snap).expect("restore_snapshot");

        // The snapshot values must have replaced the clobbered ones.
        let r = vcpu.vcpu.borrow().get_regs().expect("get_regs");
        assert_eq!(r.rax, 0x1234_5678_9abc_def0, "rax restored");
        assert_eq!(r.rbx, 0x00ca_feba_be00_1357, "rbx restored");
        assert_eq!(r.rip, 0x0, "rip restored");

        // SAFETY: unmaps exactly the region mapped above; `host` is not used
        // afterwards. (Not reached if an assertion above panics.)
        unsafe { libc::munmap(host as *mut libc::c_void, MEM) };
    }

    /// Snapshot increment 7b: capture a *running* guest's evolving vCPU state +
    /// RAM, restore into a brand-new VM, and confirm it RESUMES (not restarts).
    ///
    /// The guest is a tight `inc rax; jmp` loop (no IO), so it never exits on its
    /// own — we pause it at a clean instruction boundary with `force_exit` (the
    /// production quiesce path; snapshotting at an IO exit would lose KVM's
    /// pending-IO-completion state and re-execute the instruction). We snapshot
    /// the paused vCPU, tear the source VM down (VM handle dropped, then its RAM
    /// unmapped), restore into a fresh VM with a copy of the RAM, and check:
    /// (a) rax equals the snapshot value (running state transferred), then
    /// (b) after running again it has grown (resumed, not reset to 0).
    #[test]
    fn snapshot_resumes_running_guest_in_fresh_vm() {
        use std::time::Duration;

        const MEM: usize = 0x20_0000;
        // inc rax ; jmp -5 (back to offset 0) — a tight no-exit loop.
        let stub = [0x48u8, 0xff, 0xc0, 0xeb, 0xfb];

        // Build a VM backed by a fresh MEM-byte anonymous mapping that holds
        // the table entries at 0x1000/0x2000/0x3000 and the stub at offset 0.
        // Returns the VM together with the host pointer so the caller can copy
        // and later unmap the RAM.
        let build = || -> (KvmVm, *mut u8) {
            let vm = KvmVm::create().expect("create VM");
            // SAFETY: anonymous, private mapping at a kernel-chosen address;
            // checked against MAP_FAILED right below, before any use.
            let host = unsafe {
                libc::mmap(
                    std::ptr::null_mut(),
                    MEM,
                    libc::PROT_READ | libc::PROT_WRITE,
                    libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_NORESERVE,
                    -1,
                    0,
                )
            } as *mut u8;
            assert!(host as *mut libc::c_void != libc::MAP_FAILED, "mmap");
            // SAFETY (for every call below): each `gpa` is 8-byte aligned and
            // inside the page-aligned, writable MEM-byte mapping.
            let put_u64 = |gpa: u64, v: u64| unsafe {
                std::ptr::write(host.add(gpa as usize) as *mut u64, v.to_le());
            };
            put_u64(0x1000, 0x2000 | 0x3);
            put_u64(0x2000, 0x3000 | 0x3);
            put_u64(0x3000, 0x83);
            // SAFETY: `stub` is 5 bytes, the destination mapping is MEM bytes,
            // and a stack array cannot overlap a fresh anonymous mapping.
            unsafe { std::ptr::copy_nonoverlapping(stub.as_ptr(), host, stub.len()) };
            // SAFETY: `host` points at MEM mapped bytes; callers keep the
            // mapping alive until after the returned VM has been dropped.
            unsafe { vm.map_ram(host, 0, MEM, prot::RWX).expect("map_ram") };
            (vm, host)
        };

        // Run a vCPU's loop on its own thread until force-exited, then capture +
        // return its snapshot and the rax it reached.
        let run_until_kicked = move |vcpu: KvmVcpu| -> (KvmSnapshotState, u64) {
            vcpu.bind_thread().expect("bind_thread");
            loop {
                match vcpu.step() {
                    Ok(VcpuExit::Canceled) => break,
                    Ok(_) => continue,
                    Err(e) => panic!("step: {e}"),
                }
            }
            let rax = vcpu.vcpu.borrow().get_regs().expect("get_regs").rax;
            (vcpu.capture_snapshot().expect("capture"), rax)
        };

        // --- Source: let the loop spin, then pause + snapshot. ---
        let (vm1, host1) = build();
        let vcpu1 = vm1.create_vcpu().expect("create_vcpu");
        vcpu1.enter_long_mode(0x0, 0x1000).expect("enter_long_mode");
        let h1 = vcpu1.exit_token();
        let t1 = std::thread::spawn(move || run_until_kicked(vcpu1));
        // Timing-based: give the runner thread time to start and the guest
        // loop time to increment rax before kicking it out.
        std::thread::sleep(Duration::from_millis(50));
        KvmVcpuHandle::force_exit(&[h1]);
        let (snap, rax_at_snap) = t1.join().expect("join t1");
        assert!(rax_at_snap > 0, "guest should have incremented rax");

        let mut saved_ram = vec![0u8; MEM];
        // SAFETY: `host1` is still mapped for MEM bytes, `saved_ram` is MEM
        // bytes long, and heap and mmap regions do not overlap.
        unsafe { std::ptr::copy_nonoverlapping(host1, saved_ram.as_mut_ptr(), MEM) };
        // Actually tear the source VM down: the vCPU was consumed by the
        // runner thread, so dropping the VM handle here means the RAM is only
        // unmapped once no source-side handle can reference it any more.
        drop(vm1);
        // SAFETY: `host1`/MEM is exactly the range mmap returned in `build`,
        // and `host1` is not used again.
        let rc = unsafe { libc::munmap(host1 as *mut libc::c_void, MEM) };
        assert_eq!(rc, 0, "munmap source RAM");

        // --- Destination: fresh VM, load RAM, restore, verify resume. ---
        let (vm2, host2) = build();
        // SAFETY: `saved_ram` holds MEM bytes, `host2` is a live MEM-byte
        // mapping, and the two regions do not overlap.
        unsafe { std::ptr::copy_nonoverlapping(saved_ram.as_ptr(), host2, MEM) };
        let vcpu2 = vm2.create_vcpu().expect("create_vcpu");
        vcpu2.restore_snapshot(&snap).expect("restore_snapshot");

        // (a) Running state transferred: rax matches the snapshot exactly.
        let rax_restored = vcpu2.vcpu.borrow().get_regs().expect("get_regs").rax;
        assert_eq!(rax_restored, rax_at_snap, "rax not transferred to fresh VM");

        // (b) Resumes (keeps counting from there, not from 0).
        let h2 = vcpu2.exit_token();
        let t2 = std::thread::spawn(move || run_until_kicked(vcpu2));
        std::thread::sleep(Duration::from_millis(50));
        KvmVcpuHandle::force_exit(&[h2]);
        let (_snap2, rax_after) = t2.join().expect("join t2");
        assert!(
            rax_after > rax_at_snap,
            "restored guest did not resume: rax {rax_after} <= snapshot {rax_at_snap}"
        );

        // Same ordering as for the source: VM handle first, then its RAM.
        drop(vm2);
        // SAFETY: `host2`/MEM is exactly the range mmap returned in `build`,
        // and `host2` is not used again.
        let rc = unsafe { libc::munmap(host2 as *mut libc::c_void, MEM) };
        assert_eq!(rc, 0, "munmap destination RAM");
    }

    /// Snapshot increment 7c: round-trip the in-kernel device state (PIT, PIC
    /// master/slave, IOAPIC and clock) on real /dev/kvm, i.e. get/set_pit2,
    /// get/set_irqchip for all three chips, and get/set_clock. A restored
    /// Linux guest needs these to keep timekeeping and interrupt routing.
    ///
    /// The test only checks that each step succeeds; it does not compare the
    /// captured contents.
    #[test]
    fn device_state_captures_and_restores() {
        let machine = KvmVm::create().expect("create VM");
        machine.create_pit().expect("create_pit");

        // Capture once and feed the result straight back into the same VM.
        let captured = machine.capture_devices().expect("capture_devices");
        machine.restore_devices(&captured).expect("restore_devices");

        // A second capture after the restore must still go through, which
        // shows the restore left the devices in a readable state.
        let _recaptured = machine.capture_devices().expect("recapture_devices");
    }
}